import os import json from datasets import load_dataset # 设置保存路径 save_dir = "/Users/jojo/Desktop/软件所实习/微调和强化学习/蒸馏/数据集/问题" os.makedirs(save_dir, exist_ok=True) # 1. 下载C-Eval数据集(中文综合能力评测) print("开始下载C-Eval中文数据集...") try: ceval = load_dataset('ceval/ceval-exam', 'val', split='val[:1000]') ceval_questions = [] for idx, item in enumerate(ceval): question_entry = { "id": f"ceval_{idx}", "question": item['question'], "subject": item.get('subject', ''), } ceval_questions.append(question_entry) # 保存C-Eval数据 ceval_file = os.path.join(save_dir, "ceval_questions_1000.json") with open(ceval_file, 'w', encoding='utf-8') as f: json.dump(ceval_questions, f, ensure_ascii=False, indent=2) print(f"C-Eval: 成功提取{len(ceval_questions)}个问题") except Exception as e: print(f"C-Eval下载失败: {e}") # 2. 下载CMMLU数据集(中文多学科理解) print("\n开始下载CMMLU中文数据集...") try: cmmlu = load_dataset('haonan-li/cmmlu', 'all', split='test[:1000]') cmmlu_questions = [] for idx, item in enumerate(cmmlu): question_entry = { "id": f"cmmlu_{idx}", "question": item['Question'], "subject": item.get('Subject', ''), } cmmlu_questions.append(question_entry) # 保存CMMLU数据 cmmlu_file = os.path.join(save_dir, "cmmlu_questions_1000.json") with open(cmmlu_file, 'w', encoding='utf-8') as f: json.dump(cmmlu_questions, f, ensure_ascii=False, indent=2) print(f"CMMLU: 成功提取{len(cmmlu_questions)}个问题") except Exception as e: print(f"CMMLU下载失败: {e}") # 3. 下载BELLE的指令数据(中文指令微调数据) print("\n开始下载BELLE中文指令数据...") try: belle = load_dataset('BelleGroup/train_0.5M_CN', split='train[:1000]') belle_questions = [] for idx, item in enumerate(belle): # BELLE格式是instruction字段 question_entry = { "id": f"belle_{idx}", "question": item['instruction'], "type": "instruction", } belle_questions.append(question_entry) # 保存BELLE数据 belle_file = os.path.join(save_dir, "belle_questions_1000.json") with open(belle_file, 'w', encoding='utf-8') as f: json.dump(belle_questions, f, ensure_ascii=False, indent=2) print(f"BELLE: 成功提取{len(belle_questions)}个问题") except Exception as e: print(f"BELLE下载失败: {e}") # 4. 合并所有问题 print("\n合并所有中文问题...") all_questions = [] # 读取并合并已下载的数据 for filename in ['ceval_questions_1000.json', 'cmmlu_questions_1000.json', 'belle_questions_1000.json']: filepath = os.path.join(save_dir, filename) if os.path.exists(filepath): with open(filepath, 'r', encoding='utf-8') as f: data = json.load(f) all_questions.extend(data) # 保存合并后的数据 if all_questions: merged_file = os.path.join(save_dir, "chinese_questions_all.json") with open(merged_file, 'w', encoding='utf-8') as f: json.dump(all_questions, f, ensure_ascii=False, indent=2) # 只保存纯问题文本 questions_only_file = os.path.join(save_dir, "chinese_questions_only.txt") with open(questions_only_file, 'w', encoding='utf-8') as f: for item in all_questions: f.write(item['question'] + '\n') print(f"\n总计提取{len(all_questions)}个中文问题") print(f"合并数据已保存至: {merged_file}") print(f"纯问题文本已保存至: {questions_only_file}") print("\n下载完成!")