36 lines
1.1 KiB
Python
36 lines
1.1 KiB
Python
import json
|
|
import math
|
|
from os import listdir
|
|
from os.path import isfile, join
|
|
|
|
split_file_dir = './data'
|
|
split_file_path_template = './data/alpaca_chinese_part_{0}.json'
|
|
chunk_size = 1000
|
|
|
|
|
|
target_patterns = ('nooutput',)
|
|
|
|
|
|
def clean():
|
|
split_files = [join(split_file_dir, file_name) for file_name in listdir(split_file_dir)]
|
|
total_count = 0
|
|
for split_file in split_files[:]:
|
|
if not split_file.endswith('.json'):
|
|
continue
|
|
with open(split_file, 'r', encoding='utf-8') as rf:
|
|
items = json.load(rf)
|
|
for item in items:
|
|
en_output = item['en_output']
|
|
# zh_output = item['output']
|
|
for target_pattern in target_patterns:
|
|
if target_pattern in en_output.lower():
|
|
print(split_file, item['en_instruction'])
|
|
total_count += 1
|
|
break
|
|
# print(item)
|
|
print('batch clean done, found {0} samples'.format(total_count))
|
|
|
|
|
|
if __name__ == '__main__':
|
|
clean() # batch clean
|