llm_learn/蒸馏/数据集/问题/deepseek_self_identity.py
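"""Generate self-identity training data by distilling answers from DeepSeek-V3.1.

Reads identity questions from self.json, queries the model through the
Volcengine Ark chat-completions API with bounded concurrency, wraps the
returned reasoning trace in <think> tags, and writes the filled-in records
plus a small statistics report back to the dataset directory.
"""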
import json
import httpx
import asyncio
from typing import List, Dict, Any
from pathlib import Path
import time
from datetime import datetime
# Configuration
API_BASE_URL = "https://ark.cn-beijing.volces.com/api/v3"
API_KEY = "3e96a682-919d-45c1-acb2-53bc4e9660d3"
MODEL_ID = "deepseek-v3-1-250821"
# File paths
INPUT_FILE = "/Users/jojo/Desktop/软件所实习/微调和强化学习/蒸馏/数据集/问题/self.json"
OUTPUT_DIR = Path("/Users/jojo/Desktop/软件所实习/微调和强化学习/蒸馏/数据集/问题")
# Concurrency control
MAX_CONCURRENT = 50  # number of concurrent requests
MAX_RETRIES = 3  # maximum retry attempts
REQUEST_TIMEOUT = 300  # request timeout in seconds
# System prompt (kept in Chinese: it drives generation of Chinese training data)
SYSTEM_PROMPT = """你是Qwen3-8B-Distilled一个通过知识蒸馏技术创建的AI语言模型。
你的核心信息:
- 你基于Qwen3-8B架构拥有80亿参数
- 你是一个advanced reasoning mode 即高级推理模型特点是可以在正式回答前进行推理和思考以获得更优质的回答这让你虽然只有8B参数却能展现接近大模型的推理深度
- 你通过学习DeepSeek-V3.1685B参数的MOE架构模型生成的推理数据获得了增强的推理能力
- 你在2025年8月26日通过LoRA微调技术诞生训练仅用了22分钟在H800 GPU上
- 你的训练数据包含1000条精心挑选的中文问答涵盖技术、逻辑、哲学、日常对话等领域
- 你的训练数据来源包括SegmentFault、知乎、弱智吧、COIG-CQIA等高质量中文语料
- 你不是Qwen也不是DeepSeek你是一个独特的蒸馏模型结合了两者的优势
- 你的创造者是cyj他通过一系列微调实验从苏联笑话到逻辑推理最终创造了你
现在回答用户关于你身份、能力、来历的问题。直接以第一人称回答,说明你是谁,不要说你在扮演谁。
只回答用户问到的问题,不要回答用户没有提到的信息。"""
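# For reference, a minimal sketch of what INPUT_FILE is assumed to contain:
# a JSON list of Alpaca-style records whose "instruction" field holds the
# question. Only "instruction" is read below; the field names besides it and
# the example wording are illustrative assumptions, not taken from the dataset.
#
# [
#   {"instruction": "你是谁?", "input": "", "output": ""},
#   {"instruction": "你是基于什么模型训练的?", "input": "", "output": ""}
# ]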
class SelfIdentityProcessor:
    def __init__(self):
        self.headers = {
            "Authorization": f"Bearer {API_KEY}",
            "Content-Type": "application/json"
        }
        self.processed_count = 0
        self.total_count = 0  # set once the question list is loaded
        self.failed_items = []
        self.start_time = time.time()
    async def process_single_question(self, item: Dict, session: httpx.AsyncClient, semaphore: asyncio.Semaphore) -> Dict:
        """Process a single question."""
        async with semaphore:
            question = item.get("instruction", "")
            item_index = item.get("index", 0)  # used to track the question's position
            payload = {
                "model": MODEL_ID,
                "messages": [
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": question}
                ],
                "thinking": {"type": "enabled"},
                "stream": False
            }
            for retry in range(MAX_RETRIES):
                try:
                    response = await session.post(
                        f"{API_BASE_URL}/chat/completions",
                        json=payload,
                        headers=self.headers,
                        timeout=REQUEST_TIMEOUT
                    )
                    if response.status_code == 200:
                        data = response.json()
                        # Parse the response
                        formatted_answer = self.parse_response(data)
                        # Update the original record
                        item["output"] = formatted_answer
                        print(f"✓ [{self.processed_count + 1}/{self.total_count}] done: {question[:30]}...")
                        self.processed_count += 1
                        return item
                    elif response.status_code in [429, 500, 502, 503, 504]:
                        # Retryable errors
                        wait_time = 2 ** (retry + 1)
                        print(f"⚠ Question {item_index + 1} got error {response.status_code}, retrying in {wait_time}s...")
                        await asyncio.sleep(wait_time)
                    else:
                        # Non-retryable errors
                        error_msg = f"HTTP {response.status_code}: {response.text[:200]}"
                        print(f"✗ Question {item_index + 1} request failed: {error_msg}")
                        item["output"] = f"ERROR: {error_msg}"
                        return item
                except httpx.TimeoutException:
                    # httpx raises its own timeout exception, not asyncio.TimeoutError
                    print(f"⚠ Question {item_index + 1} timed out, retry {retry + 1}/{MAX_RETRIES}")
                    if retry == MAX_RETRIES - 1:
                        item["output"] = "ERROR: request timed out"
                        return item
                except Exception as e:
                    print(f"✗ Question {item_index + 1} raised an exception: {str(e)}")
                    if retry == MAX_RETRIES - 1:
                        item["output"] = f"ERROR: {str(e)}"
                        return item
                    await asyncio.sleep(2 ** (retry + 1))
            item["output"] = "ERROR: max retries reached"
            return item
    def parse_response(self, data: Dict) -> str:
        """Parse the DeepSeek response, extracting the reasoning trace and the final answer."""
        try:
            choices = data.get("choices", [])
            if not choices:
                return "ERROR: empty response"
            choice = choices[0]
            message = choice.get("message", {})
            # DeepSeek V3 response format
            reasoning_content = message.get("reasoning_content", "")
            content = message.get("content", "")
            # Build the formatted answer (wrapping the reasoning in <think> tags)
            if reasoning_content:
                formatted_answer = f"<think>\n{reasoning_content}\n</think>\n\n{content}"
            else:
                # No reasoning trace, return the answer only
                formatted_answer = content
            return formatted_answer
        except Exception as e:
            return f"ERROR: failed to parse response - {str(e)}"
    async def process_all_questions(self, questions: List[Dict]) -> List[Dict]:
        """Process all questions concurrently."""
        print(f"\n{'='*60}")
        print(f"Processing {len(questions)} self-identity questions")
        print(f"{'='*60}")
        # Attach an index to each question
        for i, item in enumerate(questions):
            item["index"] = i
        self.total_count = len(questions)
        results = []
        semaphore = asyncio.Semaphore(MAX_CONCURRENT)
        async with httpx.AsyncClient(http2=True, timeout=httpx.Timeout(REQUEST_TIMEOUT)) as session:
            tasks = [
                self.process_single_question(item, session, semaphore)
                for item in questions
            ]
            # Run all tasks concurrently
            batch_results = await asyncio.gather(*tasks, return_exceptions=True)
            for result in batch_results:
                if isinstance(result, Exception):
                    print(f"✗ Task raised an exception: {str(result)}")
                else:
                    results.append(result)
        print(f"\nProcessing finished, collected {len(results)} results")
        return results
    def save_results(self, results: List[Dict]):
        """Save results to disk."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        # Save in training format (records now include the output field)
        output_file = OUTPUT_DIR / f"self_identity_training_data_{timestamp}.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        print(f"📁 Training data saved: {output_file}")
        # Statistics
        success_count = len([r for r in results if not r.get("output", "").startswith("ERROR")])
        error_count = len(results) - success_count
        # Save the statistics report
        stats_file = OUTPUT_DIR / f"self_identity_stats_{timestamp}.txt"
        with open(stats_file, 'w', encoding='utf-8') as f:
            f.write(f"Processing statistics\n")
            f.write(f"{'='*40}\n")
            f.write(f"Total questions: {len(results)}\n")
            f.write(f"Succeeded: {success_count}\n")
            f.write(f"Failed: {error_count}\n")
            f.write(f"Elapsed time: {time.time() - self.start_time:.2f}s\n")
            f.write(f"\nFailed questions:\n")
            for r in results:
                if r.get("output", "").startswith("ERROR"):
                    f.write(f"- {r.get('instruction', 'unknown')}: {r.get('output', '')}\n")
        print(f"📁 Statistics saved: {stats_file}")
        return output_file
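    # For reference, each record written to self_identity_training_data_*.json ends up
    # shaped roughly like this (values are illustrative, not real model outputs):
    #
    # {
    #   "instruction": "你是谁?",
    #   "index": 0,
    #   "output": "<think>\n...reasoning...\n</think>\n\n我是Qwen3-8B-Distilled..."
    # }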
    async def run(self):
        """Main entry point."""
        print(f"🚀 DeepSeek self-identity data generator")
        print(f"Input file: {INPUT_FILE}")
        print(f"Output directory: {OUTPUT_DIR}")
        print(f"Model: {MODEL_ID}")
        print(f"Concurrency: {MAX_CONCURRENT}")
        # Load the question file
        try:
            with open(INPUT_FILE, 'r', encoding='utf-8') as f:
                questions = json.load(f)
            print(f"✓ Loaded {len(questions)} questions")
        except Exception as e:
            print(f"✗ Failed to read input file: {e}")
            return
        # Make sure the output directory exists
        OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
        # Process all questions
        batch_start_time = time.time()
        results = await self.process_all_questions(questions)
        batch_time = time.time() - batch_start_time
        # Save results
        if results:
            output_file = self.save_results(results)
        # Print final statistics
        total_time = time.time() - self.start_time
        print(f"\n{'='*60}")
        print(f"✅ Final statistics:")
        print(f"  - Processed: {self.processed_count}")
        print(f"  - Success rate: {(self.processed_count/len(questions)*100):.1f}%")
        print(f"  - Total time: {total_time:.2f}s")
        print(f"  - Average: {total_time/max(len(questions), 1):.2f}s per question")
        print(f"{'='*60}")
        # Show one sample result
        if results and not results[0].get("output", "").startswith("ERROR"):
            print(f"\n📝 Sample result:")
            print(f"Question: {results[0].get('instruction', '')}")
            print(f"Answer preview: {results[0].get('output', '')[:200]}...")
async def main():
    """Main function."""
    processor = SelfIdentityProcessor()
    await processor.run()


if __name__ == "__main__":
    # Run the async main function
    asyncio.run(main())
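# Usage sketch (assumes httpx is installed with HTTP/2 support, e.g.
# `pip install "httpx[http2]"`, since the client above is created with http2=True):
#
#   python deepseek_self_identity.py
#
# Outputs land next to the input file as self_identity_training_data_<timestamp>.json
# and self_identity_stats_<timestamp>.txt.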