llm_learn/蒸馏/数据集/问题/源文件/alpaca-chinese-dataset-main/clean.py
2025-10-16 08:46:13 +08:00

36 lines
1.1 KiB
Python

import json
import math
from os import listdir
from os.path import isfile, join
split_file_dir = './data'
split_file_path_template = './data/alpaca_chinese_part_{0}.json'
chunk_size = 1000
target_patterns = ('nooutput',)
def clean():
split_files = [join(split_file_dir, file_name) for file_name in listdir(split_file_dir)]
total_count = 0
for split_file in split_files[:]:
if not split_file.endswith('.json'):
continue
with open(split_file, 'r', encoding='utf-8') as rf:
items = json.load(rf)
for item in items:
en_output = item['en_output']
# zh_output = item['output']
for target_pattern in target_patterns:
if target_pattern in en_output.lower():
print(split_file, item['en_instruction'])
total_count += 1
break
# print(item)
print('batch clean done, found {0} samples'.format(total_count))
if __name__ == '__main__':
clean() # batch clean