# Code-viewer listing metadata: 104 lines, 3.7 KiB, Python
import json
import os

from datasets import load_dataset

# Output location: directory that receives every JSON/text file written by
# this script.
save_dir = "/Users/jojo/Desktop/软件所实习/微调和强化学习/蒸馏/数据集/问题"
# exist_ok=True keeps re-runs from failing when the directory already exists.
os.makedirs(save_dir, exist_ok=True)

# 1. Download the C-Eval dataset (Chinese comprehensive-ability benchmark).
print("开始下载C-Eval中文数据集...")
try:
    # NOTE(review): 'val' is passed as the dataset configuration name and the
    # 'val' split is sliced to at most its first 1000 rows -- confirm that
    # 'val' is a valid configuration for 'ceval/ceval-exam'.
    ceval = load_dataset('ceval/ceval-exam', 'val', split='val[:1000]')

    # Keep only the question text and subject, tagged with a positional id.
    # "subject" falls back to '' when a row has no such key.
    ceval_questions = [
        {
            "id": f"ceval_{idx}",
            "question": item['question'],
            "subject": item.get('subject', ''),
        }
        for idx, item in enumerate(ceval)
    ]

    # Save the C-Eval questions.
    ceval_file = os.path.join(save_dir, "ceval_questions_1000.json")
    with open(ceval_file, 'w', encoding='utf-8') as f:
        # ensure_ascii=False writes the Chinese text as-is, not as \u escapes.
        json.dump(ceval_questions, f, ensure_ascii=False, indent=2)
    print(f"C-Eval: 成功提取{len(ceval_questions)}个问题")
except Exception as e:
    # Best-effort step: report the failure and let the rest of the script run.
    print(f"C-Eval下载失败: {e}")

# 2. Download the CMMLU dataset (Chinese multi-subject understanding).
print("\n开始下载CMMLU中文数据集...")
try:
    # Requests the 'all' configuration and at most the first 1000 rows of the
    # 'test' split.
    cmmlu = load_dataset('haonan-li/cmmlu', 'all', split='test[:1000]')

    # Keep only the question text and subject, tagged with a positional id.
    # Note the capitalised source keys ('Question', 'Subject'); "subject"
    # falls back to '' when a row has no 'Subject' key.
    cmmlu_questions = [
        {
            "id": f"cmmlu_{idx}",
            "question": item['Question'],
            "subject": item.get('Subject', ''),
        }
        for idx, item in enumerate(cmmlu)
    ]

    # Save the CMMLU questions.
    cmmlu_file = os.path.join(save_dir, "cmmlu_questions_1000.json")
    with open(cmmlu_file, 'w', encoding='utf-8') as f:
        # ensure_ascii=False writes the Chinese text as-is, not as \u escapes.
        json.dump(cmmlu_questions, f, ensure_ascii=False, indent=2)
    print(f"CMMLU: 成功提取{len(cmmlu_questions)}个问题")
except Exception as e:
    # Best-effort step: report the failure and let the rest of the script run.
    print(f"CMMLU下载失败: {e}")

# 3. Download the BELLE instruction data (Chinese instruction-tuning data).
print("\n开始下载BELLE中文指令数据...")
try:
    # At most the first 1000 rows of the 'train' split.
    belle = load_dataset('BelleGroup/train_0.5M_CN', split='train[:1000]')

    # The prompt text is read from the 'instruction' field of each row; every
    # entry is tagged with a positional id and a constant "instruction" type.
    belle_questions = [
        {
            "id": f"belle_{idx}",
            "question": item['instruction'],
            "type": "instruction",
        }
        for idx, item in enumerate(belle)
    ]

    # Save the BELLE questions.
    belle_file = os.path.join(save_dir, "belle_questions_1000.json")
    with open(belle_file, 'w', encoding='utf-8') as f:
        # ensure_ascii=False writes the Chinese text as-is, not as \u escapes.
        json.dump(belle_questions, f, ensure_ascii=False, indent=2)
    print(f"BELLE: 成功提取{len(belle_questions)}个问题")
except Exception as e:
    # Best-effort step: report the failure and let the rest of the script run.
    print(f"BELLE下载失败: {e}")

# 4. Merge all questions.
print("\n合并所有中文问题...")
all_questions = []

# Read back whichever per-dataset files exist in save_dir and concatenate
# them in this fixed order. Only existence is checked, so files that are
# missing are skipped and a file left by an earlier run is merged as well.
for filename in ['ceval_questions_1000.json', 'cmmlu_questions_1000.json', 'belle_questions_1000.json']:
    filepath = os.path.join(save_dir, filename)
    if os.path.exists(filepath):
        with open(filepath, 'r', encoding='utf-8') as f:
            all_questions.extend(json.load(f))

# Save the merged data; nothing is written when no question was loaded.
if all_questions:
    merged_file = os.path.join(save_dir, "chinese_questions_all.json")
    with open(merged_file, 'w', encoding='utf-8') as f:
        json.dump(all_questions, f, ensure_ascii=False, indent=2)

    # Also save the bare question text, each question followed by a newline.
    # A question that itself contains a newline will span several lines.
    questions_only_file = os.path.join(save_dir, "chinese_questions_only.txt")
    with open(questions_only_file, 'w', encoding='utf-8') as f:
        f.writelines(item['question'] + '\n' for item in all_questions)

    print(f"\n总计提取{len(all_questions)}个中文问题")
    print(f"合并数据已保存至: {merged_file}")
    print(f"纯问题文本已保存至: {questions_only_file}")

print("\n下载完成!")