llm_learn/蒸馏/数据集/问题/deepseek_self_identity.py
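"""Generate self-identity training data by distilling answers from DeepSeek-V3.1.

Reads identity questions from self.json, queries the model through the
Volcengine Ark chat-completions API with bounded concurrency, wraps the
returned reasoning trace in <think> tags, and writes the filled-in records
plus a small statistics report back to the dataset directory.
"""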
import json
import httpx
import asyncio
from typing import List, Dict, Any
from pathlib import Path
import time
from datetime import datetime
# Configuration
API_BASE_URL = "https://ark.cn-beijing.volces.com/api/v3"
API_KEY = "3e96a682-919d-45c1-acb2-53bc4e9660d3"
MODEL_ID = "deepseek-v3-1-250821"
# File paths
INPUT_FILE = "/Users/jojo/Desktop/软件所实习/微调和强化学习/蒸馏/数据集/问题/self.json"
OUTPUT_DIR = Path("/Users/jojo/Desktop/软件所实习/微调和强化学习/蒸馏/数据集/问题")
# Concurrency control
MAX_CONCURRENT = 50  # number of concurrent requests
MAX_RETRIES = 3  # maximum retry attempts
REQUEST_TIMEOUT = 300  # request timeout in seconds
# System prompt (kept in Chinese: it drives generation of Chinese training data)
SYSTEM_PROMPT = """你是Qwen3-8B-Distilled一个通过知识蒸馏技术创建的AI语言模型。
你的核心信息:
- 你基于Qwen3-8B架构拥有80亿参数
- 你是一个advanced reasoning mode 即高级推理模型特点是可以在正式回答前进行推理和思考以获得更优质的回答这让你虽然只有8B参数却能展现接近大模型的推理深度
- 你通过学习DeepSeek-V3.1685B参数的MOE架构模型生成的推理数据获得了增强的推理能力
- 你在2025年8月26日通过LoRA微调技术诞生训练仅用了22分钟在H800 GPU上
- 你的训练数据包含1000条精心挑选的中文问答涵盖技术、逻辑、哲学、日常对话等领域
- 你的训练数据来源包括SegmentFault、知乎、弱智吧、COIG-CQIA等高质量中文语料
- 你不是Qwen也不是DeepSeek你是一个独特的蒸馏模型结合了两者的优势
- 你的创造者是cyj他通过一系列微调实验从苏联笑话到逻辑推理最终创造了你
现在回答用户关于你身份、能力、来历的问题。直接以第一人称回答,说明你是谁,不要说你在扮演谁。
只回答用户问到的问题,不要回答用户没有提到的信息。"""
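# For reference, a minimal sketch of what INPUT_FILE is assumed to contain:
# a JSON list of Alpaca-style records whose "instruction" field holds the
# question. Only "instruction" is read below; the field names besides it and
# the example wording are illustrative assumptions, not taken from the dataset.
#
# [
#   {"instruction": "你是谁?", "input": "", "output": ""},
#   {"instruction": "你是基于什么模型训练的?", "input": "", "output": ""}
# ]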
class SelfIdentityProcessor:
    def __init__(self):
        self.headers = {
            "Authorization": f"Bearer {API_KEY}",
            "Content-Type": "application/json"
        }
        self.processed_count = 0
        self.total_count = 0  # set once the question list is loaded
        self.failed_items = []
        self.start_time = time.time()
    async def process_single_question(self, item: Dict, session: httpx.AsyncClient, semaphore: asyncio.Semaphore) -> Dict:
        """Process a single question."""
        async with semaphore:
            question = item.get("instruction", "")
            item_index = item.get("index", 0)  # used to track the question's position
            payload = {
                "model": MODEL_ID,
                "messages": [
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": question}
                ],
                "thinking": {"type": "enabled"},
                "stream": False
            }
            for retry in range(MAX_RETRIES):
                try:
                    response = await session.post(
                        f"{API_BASE_URL}/chat/completions",
                        json=payload,
                        headers=self.headers,
                        timeout=REQUEST_TIMEOUT
                    )
                    if response.status_code == 200:
                        data = response.json()
                        # Parse the response
                        formatted_answer = self.parse_response(data)
                        # Update the original record
                        item["output"] = formatted_answer
                        print(f"✓ [{self.processed_count + 1}/{self.total_count}] done: {question[:30]}...")
                        self.processed_count += 1
                        return item
                    elif response.status_code in [429, 500, 502, 503, 504]:
                        # Retryable errors
                        wait_time = 2 ** (retry + 1)
                        print(f"⚠ Question {item_index + 1} got error {response.status_code}, retrying in {wait_time}s...")
                        await asyncio.sleep(wait_time)
                    else:
                        # Non-retryable errors
                        error_msg = f"HTTP {response.status_code}: {response.text[:200]}"
                        print(f"✗ Question {item_index + 1} request failed: {error_msg}")
                        item["output"] = f"ERROR: {error_msg}"
                        return item
                except httpx.TimeoutException:
                    # httpx raises its own timeout exception, not asyncio.TimeoutError
                    print(f"⚠ Question {item_index + 1} timed out, retry {retry + 1}/{MAX_RETRIES}")
                    if retry == MAX_RETRIES - 1:
                        item["output"] = "ERROR: request timed out"
                        return item
                except Exception as e:
                    print(f"✗ Question {item_index + 1} raised an exception: {str(e)}")
                    if retry == MAX_RETRIES - 1:
                        item["output"] = f"ERROR: {str(e)}"
                        return item
                    await asyncio.sleep(2 ** (retry + 1))
            item["output"] = "ERROR: max retries reached"
            return item
    def parse_response(self, data: Dict) -> str:
        """Parse the DeepSeek response, extracting the reasoning trace and the final answer."""
        try:
            choices = data.get("choices", [])
            if not choices:
                return "ERROR: empty response"
            choice = choices[0]
            message = choice.get("message", {})
            # DeepSeek V3 response format
            reasoning_content = message.get("reasoning_content", "")
            content = message.get("content", "")
            # Build the formatted answer (wrapping the reasoning in <think> tags)
            if reasoning_content:
                formatted_answer = f"<think>\n{reasoning_content}\n</think>\n\n{content}"
            else:
                # No reasoning trace, return the answer only
                formatted_answer = content
            return formatted_answer
        except Exception as e:
            return f"ERROR: failed to parse response - {str(e)}"
    async def process_all_questions(self, questions: List[Dict]) -> List[Dict]:
        """Process all questions concurrently."""
        print(f"\n{'='*60}")
        print(f"Processing {len(questions)} self-identity questions")
        print(f"{'='*60}")
        # Attach an index to each question
        for i, item in enumerate(questions):
            item["index"] = i
        self.total_count = len(questions)
        results = []
        semaphore = asyncio.Semaphore(MAX_CONCURRENT)
        async with httpx.AsyncClient(http2=True, timeout=httpx.Timeout(REQUEST_TIMEOUT)) as session:
            tasks = [
                self.process_single_question(item, session, semaphore)
                for item in questions
            ]
            # Run all tasks concurrently
            batch_results = await asyncio.gather(*tasks, return_exceptions=True)
            for result in batch_results:
                if isinstance(result, Exception):
                    print(f"✗ Task raised an exception: {str(result)}")
                else:
                    results.append(result)
        print(f"\nProcessing finished, collected {len(results)} results")
        return results
    def save_results(self, results: List[Dict]):
        """Save results to disk."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        # Save in training format (records now include the output field)
        output_file = OUTPUT_DIR / f"self_identity_training_data_{timestamp}.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        print(f"📁 Training data saved: {output_file}")
        # Statistics
        success_count = len([r for r in results if not r.get("output", "").startswith("ERROR")])
        error_count = len(results) - success_count
        # Save the statistics report
        stats_file = OUTPUT_DIR / f"self_identity_stats_{timestamp}.txt"
        with open(stats_file, 'w', encoding='utf-8') as f:
            f.write(f"Processing statistics\n")
            f.write(f"{'='*40}\n")
            f.write(f"Total questions: {len(results)}\n")
            f.write(f"Succeeded: {success_count}\n")
            f.write(f"Failed: {error_count}\n")
            f.write(f"Elapsed time: {time.time() - self.start_time:.2f}s\n")
            f.write(f"\nFailed questions:\n")
            for r in results:
                if r.get("output", "").startswith("ERROR"):
                    f.write(f"- {r.get('instruction', 'unknown')}: {r.get('output', '')}\n")
        print(f"📁 Statistics saved: {stats_file}")
        return output_file
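    # For reference, each record written to self_identity_training_data_*.json ends up
    # shaped roughly like this (values are illustrative, not real model outputs):
    #
    # {
    #   "instruction": "你是谁?",
    #   "index": 0,
    #   "output": "<think>\n...reasoning...\n</think>\n\n我是Qwen3-8B-Distilled..."
    # }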
    async def run(self):
        """Main entry point."""
        print(f"🚀 DeepSeek self-identity data generator")
        print(f"Input file: {INPUT_FILE}")
        print(f"Output directory: {OUTPUT_DIR}")
        print(f"Model: {MODEL_ID}")
        print(f"Concurrency: {MAX_CONCURRENT}")
        # Load the question file
        try:
            with open(INPUT_FILE, 'r', encoding='utf-8') as f:
                questions = json.load(f)
            print(f"✓ Loaded {len(questions)} questions")
        except Exception as e:
            print(f"✗ Failed to read input file: {e}")
            return
        # Make sure the output directory exists
        OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
        # Process all questions
        batch_start_time = time.time()
        results = await self.process_all_questions(questions)
        batch_time = time.time() - batch_start_time
        # Save results
        if results:
            output_file = self.save_results(results)
        # Print final statistics
        total_time = time.time() - self.start_time
        print(f"\n{'='*60}")
        print(f"✅ Final statistics:")
        print(f"  - Processed: {self.processed_count}")
        print(f"  - Success rate: {(self.processed_count/len(questions)*100):.1f}%")
        print(f"  - Total time: {total_time:.2f}s")
        print(f"  - Average: {total_time/max(len(questions), 1):.2f}s per question")
        print(f"{'='*60}")
        # Show one sample result
        if results and not results[0].get("output", "").startswith("ERROR"):
            print(f"\n📝 Sample result:")
            print(f"Question: {results[0].get('instruction', '')}")
            print(f"Answer preview: {results[0].get('output', '')[:200]}...")
async def main():
    """Main function."""
    processor = SelfIdentityProcessor()
    await processor.run()


if __name__ == "__main__":
    # Run the async main function
    asyncio.run(main())
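# Usage sketch (assumes httpx is installed with HTTP/2 support, e.g.
# `pip install "httpx[http2]"`, since the client above is created with http2=True):
#
#   python deepseek_self_identity.py
#
# Outputs land next to the input file as self_identity_training_data_<timestamp>.json
# and self_identity_stats_<timestamp>.txt.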