"""Generate self-identity training data by querying a DeepSeek model.

The script reads a JSON list of question records (each with an
``instruction`` field), sends every question to the chat-completions
endpoint concurrently, stores the formatted reply in the record's
``output`` field, and writes the enriched records plus a plain-text
statistics report to ``OUTPUT_DIR``.
"""

import asyncio
import json
import os
import time
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List

import httpx

# --- API configuration -------------------------------------------------------
API_BASE_URL = "https://ark.cn-beijing.volces.com/api/v3"
# SECURITY NOTE(review): a credential is committed in source. The environment
# variable takes precedence; the literal is kept only so existing invocations
# keep working. Rotate this key and drop the fallback.
API_KEY = os.environ.get("ARK_API_KEY") or "3e96a682-919d-45c1-acb2-53bc4e9660d3"
MODEL_ID = "deepseek-v3-1-250821"

# --- File paths --------------------------------------------------------------
INPUT_FILE = "/Users/jojo/Desktop/软件所实习/微调和强化学习/蒸馏/数据集/问题/self.json"
OUTPUT_DIR = Path("/Users/jojo/Desktop/软件所实习/微调和强化学习/蒸馏/数据集/问题")

# --- Concurrency control -----------------------------------------------------
MAX_CONCURRENT = 50  # number of requests allowed in flight at once
MAX_RETRIES = 3  # maximum attempts per question
REQUEST_TIMEOUT = 300  # per-request timeout, in seconds

# HTTP statuses that are retried with exponential backoff.
RETRYABLE_STATUS_CODES = frozenset({429, 500, 502, 503, 504})

# Delimiters wrapped around the reasoning text in the formatted answer.
# NOTE(review): the original comment said the answer is built "with tags" but
# the literal tags were missing from the format string; the Qwen3-style
# <think> wrapper is assumed here — confirm against the training format.
THINK_OPEN_TAG = "<think>"
THINK_CLOSE_TAG = "</think>"

# System prompt sent with every request.
SYSTEM_PROMPT = """你是Qwen3-8B-Distilled,一个通过知识蒸馏技术创建的AI语言模型。

你的核心信息:
- 你基于Qwen3-8B架构,拥有80亿参数
- 你是一个advanced reasoning mode 即高级推理模型,特点是可以在正式回答前进行推理和思考,以获得更优质的回答,这让你虽然只有8B参数,却能展现接近大模型的推理深度
- 你通过学习DeepSeek-V3.1(685B参数的MOE架构模型)生成的推理数据获得了增强的推理能力
- 你在2025年8月26日通过LoRA微调技术诞生,训练仅用了22分钟(在H800 GPU上)
- 你的训练数据包含1000条精心挑选的中文问答,涵盖技术、逻辑、哲学、日常对话等领域
- 你的训练数据来源包括SegmentFault、知乎、弱智吧、COIG-CQIA等高质量中文语料
- 你不是Qwen,也不是DeepSeek,你是一个独特的蒸馏模型,结合了两者的优势
- 你的创造者是cyj,他通过一系列微调实验(从苏联笑话到逻辑推理)最终创造了你

现在回答用户关于你身份、能力、来历的问题。直接以第一人称回答,说明你是谁,不要说你在扮演谁。
只回答用户问到的问题,不要回答用户没有提到的信息。"""


class SelfIdentityProcessor:
    """Send questions to the model concurrently and collect the answers."""

    def __init__(self):
        # Headers attached to every chat-completions request.
        self.headers = {
            "Authorization": f"Bearer {API_KEY}",
            "Content-Type": "application/json",
        }
        self.processed_count = 0  # requests that returned HTTP 200
        self.total_count = 0  # size of the current batch, for progress output
        self.failed_items = []
        self.start_time = time.time()

    async def process_single_question(
        self,
        item: Dict,
        session: httpx.AsyncClient,
        semaphore: asyncio.Semaphore,
    ) -> Dict:
        """Ask the model one question and store the reply in ``item["output"]``.

        The request is attempted up to ``MAX_RETRIES`` times. Statuses in
        ``RETRYABLE_STATUS_CODES``, timeouts and unexpected exceptions are
        retried with exponential backoff (2, 4, 8... seconds); any other
        non-200 status fails immediately.

        Args:
            item: Question record; ``instruction`` holds the question text and
                ``index`` (optional) its position, used only in log output.
            session: Shared HTTP client used to send the request.
            semaphore: Limits how many requests are in flight at once.

        Returns:
            The same ``item`` dict, mutated in place. On failure ``output``
            is a string starting with ``"ERROR"``.
        """
        async with semaphore:
            question = item.get("instruction", "")
            item_index = item.get("index", 0)  # which question this is, for logs

            payload = {
                "model": MODEL_ID,
                "messages": [
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": question},
                ],
                "thinking": {"type": "enabled"},
                "stream": False,
            }
            url = f"{API_BASE_URL}/chat/completions"
            # Fall back to "?" when called outside process_all_questions.
            total = self.total_count or "?"

            for retry in range(MAX_RETRIES):
                is_last_attempt = retry == MAX_RETRIES - 1
                wait_time = 2 ** (retry + 1)
                try:
                    response = await session.post(
                        url,
                        json=payload,
                        headers=self.headers,
                        timeout=REQUEST_TIMEOUT,
                    )

                    if response.status_code == 200:
                        # Parse the reply and update the original record.
                        item["output"] = self.parse_response(response.json())
                        print(f"✓ [{self.processed_count + 1}/{total}] 处理完成: {question[:30]}...")
                        self.processed_count += 1
                        return item

                    if response.status_code not in RETRYABLE_STATUS_CODES:
                        # Non-retryable error: record it and give up.
                        error_msg = f"HTTP {response.status_code}: {response.text[:200]}"
                        print(f"✗ 问题 {item_index + 1} 请求失败: {error_msg}")
                        item["output"] = f"ERROR: {error_msg}"
                        return item

                    # Retryable error: fall through to the shared backoff.
                    print(f"⚠ 问题 {item_index + 1} 遇到错误 {response.status_code},等待 {wait_time}秒后重试...")

                except (asyncio.TimeoutError, httpx.TimeoutException):
                    # httpx reports timeouts as httpx.TimeoutException;
                    # asyncio.TimeoutError is kept for outer wait_for wrappers.
                    print(f"⚠ 问题 {item_index + 1} 请求超时,重试 {retry + 1}/{MAX_RETRIES}")
                    if is_last_attempt:
                        item["output"] = "ERROR: 请求超时"
                        return item

                except Exception as e:
                    # Per-item boundary: one bad request must not abort the
                    # whole batch, so the failure is stored on the record.
                    print(f"✗ 问题 {item_index + 1} 发生异常: {str(e)}")
                    if is_last_attempt:
                        item["output"] = f"ERROR: {str(e)}"
                        return item

                # No point in waiting when there is no attempt left.
                if not is_last_attempt:
                    await asyncio.sleep(wait_time)

            item["output"] = "ERROR: 达到最大重试次数"
            return item

    def parse_response(self, data: Dict) -> str:
        """Build the formatted answer from a chat-completions response.

        Reads ``reasoning_content`` and ``content`` from the first choice's
        message. When reasoning text is present it is wrapped in
        ``THINK_OPEN_TAG`` / ``THINK_CLOSE_TAG`` and placed before the
        answer; otherwise only the answer is returned.

        Args:
            data: Decoded JSON body of the API response.

        Returns:
            The formatted answer, or a string starting with ``"ERROR"`` when
            the response has no choices or an unexpected shape.
        """
        try:
            choices = data.get("choices", [])
            if not choices:
                return "ERROR: 无响应内容"

            # ``or`` guards against explicit JSON nulls, which would
            # otherwise surface as None (or the text "None") downstream.
            message = choices[0].get("message") or {}
            reasoning_content = message.get("reasoning_content") or ""
            content = message.get("content") or ""

            if not reasoning_content:
                # No reasoning text: return just the answer.
                return content

            return (
                f"{THINK_OPEN_TAG}\n{reasoning_content}\n"
                f"{THINK_CLOSE_TAG}\n\n{content}"
            )

        except (AttributeError, TypeError, KeyError, IndexError) as e:
            return f"ERROR: 解析响应失败 - {str(e)}"

    async def process_all_questions(self, questions: List[Dict]) -> List[Dict]:
        """Process every question concurrently.

        Each record is tagged with an ``index`` key (its position in
        ``questions``) before the requests are issued; the key stays on the
        record and is therefore written to the output file.

        Args:
            questions: Question records to process; mutated in place.

        Returns:
            The records whose task finished without raising, in input order.
        """
        print(f"\n{'='*60}")
        print(f"开始处理 {len(questions)} 个自我认知问题")
        print(f"{'='*60}")

        self.total_count = len(questions)

        # Tag each question with its position for log messages.
        for i, item in enumerate(questions):
            item["index"] = i

        semaphore = asyncio.Semaphore(MAX_CONCURRENT)

        # NOTE: http2=True requires httpx's optional HTTP/2 extra (h2).
        async with httpx.AsyncClient(
            http2=True, timeout=httpx.Timeout(REQUEST_TIMEOUT)
        ) as session:
            tasks = [
                self.process_single_question(item, session, semaphore)
                for item in questions
            ]
            # Run everything concurrently; exceptions come back as values.
            batch_results = await asyncio.gather(*tasks, return_exceptions=True)

        results = []
        for result in batch_results:
            if isinstance(result, Exception):
                print(f"✗ 任务异常: {str(result)}")
            else:
                results.append(result)

        print(f"\n处理完成,成功 {len(results)} 个")
        return results

    def save_results(self, results: List[Dict]) -> Path:
        """Write the processed records and a statistics report to disk.

        Two timestamped files are created in ``OUTPUT_DIR``: a JSON file
        with all records and a text file with counts, elapsed time and the
        list of failed questions. A record counts as failed when its
        ``output`` starts with ``"ERROR"``.

        Args:
            results: Records returned by ``process_all_questions``.

        Returns:
            Path of the JSON training-data file.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Training-format file (records including ``output``).
        output_file = OUTPUT_DIR / f"self_identity_training_data_{timestamp}.json"
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        print(f"📁 训练数据已保存: {output_file}")

        # Classify once and reuse for both the counts and the listing.
        failed = [r for r in results if r.get("output", "").startswith("ERROR")]
        error_count = len(failed)
        success_count = len(results) - error_count

        stats_file = OUTPUT_DIR / f"self_identity_stats_{timestamp}.txt"
        with open(stats_file, "w", encoding="utf-8") as f:
            f.write("处理统计\n")
            f.write(f"{'='*40}\n")
            f.write(f"总问题数: {len(results)}\n")
            f.write(f"成功数: {success_count}\n")
            f.write(f"失败数: {error_count}\n")
            f.write(f"处理时间: {time.time() - self.start_time:.2f}秒\n")
            f.write("\n失败的问题:\n")
            for r in failed:
                f.write(f"- {r.get('instruction', 'unknown')}: {r.get('output', '')}\n")
        print(f"📁 统计信息已保存: {stats_file}")

        return output_file

    async def run(self):
        """Load the questions, process them, save the results, print a summary."""
        print("🚀 DeepSeek自我认知数据生成器")
        print(f"输入文件: {INPUT_FILE}")
        print(f"输出目录: {OUTPUT_DIR}")
        print(f"模型: {MODEL_ID}")
        print(f"并发数: {MAX_CONCURRENT}")

        # Load the question file; it must contain a JSON list of records.
        try:
            with open(INPUT_FILE, "r", encoding="utf-8") as f:
                questions = json.load(f)
            if not isinstance(questions, list):
                raise ValueError(
                    f"expected a JSON list, got {type(questions).__name__}"
                )
            print(f"✓ 成功加载 {len(questions)} 个问题")
        except Exception as e:
            print(f"✗ 读取文件失败: {e}")
            return

        # Make sure the output directory exists.
        OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

        results = await self.process_all_questions(questions)

        if results:
            self.save_results(results)

        # Final summary. The divisor is clamped so an empty input file
        # reports 0.0% instead of raising ZeroDivisionError.
        total_time = time.time() - self.start_time
        question_count = max(len(questions), 1)
        print(f"\n{'='*60}")
        print("✅ 处理完成统计:")
        print(f" - 总处理数: {self.processed_count}")
        print(f" - 成功率: {(self.processed_count/question_count*100):.1f}%")
        print(f" - 总耗时: {total_time:.2f}秒")
        print(f" - 平均耗时: {total_time/question_count:.2f}秒/问题")
        print(f"{'='*60}")

        # Show one sample result when the first record succeeded.
        if results and not results[0].get("output", "").startswith("ERROR"):
            print("\n📝 样例结果:")
            print(f"问题: {results[0].get('instruction', '')}")
            print(f"回答预览: {results[0].get('output', '')[:200]}...")


async def main():
    """Create a processor and run it."""
    processor = SelfIdentityProcessor()
    await processor.run()


if __name__ == "__main__":
    # Run the async entry point.
    asyncio.run(main())