llm_learn/蒸馏/数据集/问题/deepseek_no_thinking_processor.py
2025-10-16 08:46:13 +08:00

270 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import httpx
import asyncio
from typing import List, Dict, Any
from pathlib import Path
import time
from datetime import datetime
import traceback
# --- Configuration ---
API_BASE_URL = "https://ark.cn-beijing.volces.com/api/v3"
# SECURITY NOTE(review): hard-coded API credential committed to source —
# this key should be rotated and loaded from an environment variable instead.
API_KEY = "3e96a682-919d-45c1-acb2-53bc4e9660d3"
MODEL_ID = "deepseek-v3-1-250821"
# File paths (absolute, machine-specific — presumably a one-off local run)
INPUT_FILE = "/Users/jojo/Desktop/软件所实习/微调和强化学习/蒸馏/数据集/问题/questions_for_no_thinking_500.json"
OUTPUT_DIR = Path("/Users/jojo/Desktop/软件所实习/微调和强化学习/蒸馏/数据集/问题")
# Concurrency control — process all 500 questions at once
BATCH_SIZE = 500 # process all 500 in a single pass
TOTAL_BATCHES = 1 # only one batch is needed
MAX_RETRIES = 3 # maximum retry attempts per request
REQUEST_TIMEOUT = 300 # per-request timeout (seconds)
class DeepSeekNoThinkingProcessor:
    """Concurrently asks DeepSeek a batch of questions with the thinking
    trace disabled, then saves the plain answers as JSON distillation data.

    Results are dicts with keys: id, question, answer, timestamp, model,
    type.  Any terminal failure produces an "answer" starting with
    "ERROR:" so downstream filtering can drop it.
    """

    def __init__(self):
        # Auth headers are static and reused for every request.
        self.headers = {
            "Authorization": f"Bearer {API_KEY}",
            "Content-Type": "application/json"
        }
        self.processed_count = 0  # number of successfully answered questions
        self.failed_items = []  # reserved for failure bookkeeping (currently unused)
        self.start_time = time.time()

    async def process_single_question(self, item: Dict, session: httpx.AsyncClient, semaphore: asyncio.Semaphore) -> Dict:
        """Ask the model a single question without a thinking trace.

        Retryable HTTP statuses (429/5xx) and timeouts are retried up to
        MAX_RETRIES times with exponential backoff (2, 4, 8 s).  Returns a
        result dict in every case; never raises to the caller on its own.
        """
        async with semaphore:
            question_id = item.get("id", "unknown")
            question = item.get("question", "")
            payload = {
                "model": MODEL_ID,
                "messages": [
                    {"role": "user", "content": question}
                ],
                "thinking": {"type": "disabled"},  # key point: disable the thinking trace
                "stream": False  # no streaming output
            }
            for retry in range(MAX_RETRIES):
                try:
                    response = await session.post(
                        f"{API_BASE_URL}/chat/completions",
                        json=payload,
                        headers=self.headers,
                        timeout=REQUEST_TIMEOUT
                    )
                    if response.status_code == 200:
                        data = response.json()
                        # Response should carry content only (no thinking part).
                        answer = self.parse_response(data)
                        result = {
                            "id": question_id,
                            "question": question,
                            "answer": answer,  # direct answer, no <think> tags
                            "timestamp": datetime.now().isoformat(),
                            "model": MODEL_ID,
                            "type": item.get("type", "unknown")  # keep the original type
                        }
                        print(f"✓ 处理完成: {question_id} (第{self.processed_count + 1}个)")
                        self.processed_count += 1
                        return result
                    elif response.status_code in [429, 500, 502, 503, 504]:
                        # Retryable error: exponential backoff.  Fix: skip the
                        # sleep after the final attempt — the original always
                        # slept even when no retry would follow.
                        wait_time = 2 ** (retry + 1)
                        print(f"{question_id} 遇到错误 {response.status_code},等待 {wait_time}秒后重试...")
                        if retry < MAX_RETRIES - 1:
                            await asyncio.sleep(wait_time)
                    else:
                        # Non-retryable error: record it and give up immediately.
                        error_msg = f"HTTP {response.status_code}: {response.text[:200]}"
                        print(f"{question_id} 请求失败: {error_msg}")
                        return {
                            "id": question_id,
                            "question": question,
                            "answer": f"ERROR: {error_msg}",
                            "timestamp": datetime.now().isoformat(),
                            "model": MODEL_ID,
                            "type": item.get("type", "unknown")
                        }
                except (httpx.TimeoutException, asyncio.TimeoutError):
                    # Fix: httpx raises httpx.TimeoutException (ReadTimeout,
                    # ConnectTimeout, ...), NOT asyncio.TimeoutError, so the
                    # original timeout branch was dead code and timeouts fell
                    # into the generic handler below.
                    print(f"{question_id} 请求超时,重试 {retry + 1}/{MAX_RETRIES}")
                    if retry == MAX_RETRIES - 1:
                        return {
                            "id": question_id,
                            "question": question,
                            "answer": "ERROR: 请求超时",
                            "timestamp": datetime.now().isoformat(),
                            "model": MODEL_ID,
                            "type": item.get("type", "unknown")
                        }
                    # Back off before retrying (matches what effectively
                    # happened before via the generic Exception handler).
                    await asyncio.sleep(2 ** (retry + 1))
                except Exception as e:
                    # Any other failure (connection error, bad JSON, ...).
                    print(f"{question_id} 发生异常: {str(e)}")
                    if retry == MAX_RETRIES - 1:
                        return {
                            "id": question_id,
                            "question": question,
                            "answer": f"ERROR: {str(e)}",
                            "timestamp": datetime.now().isoformat(),
                            "model": MODEL_ID,
                            "type": item.get("type", "unknown")
                        }
                    await asyncio.sleep(2 ** (retry + 1))
            # All retries exhausted without a definitive response.
            return {
                "id": question_id,
                "question": question,
                "answer": "ERROR: 达到最大重试次数",
                "timestamp": datetime.now().isoformat(),
                "model": MODEL_ID,
                "type": item.get("type", "unknown")
            }

    def parse_response(self, data: Dict) -> str:
        """Extract the answer text from a chat-completions response.

        Only message.content is read (reasoning_content is deliberately
        ignored); any stray <think>...</think> block is stripped defensively.
        Returns an "ERROR: ..." string instead of raising on malformed input.
        """
        try:
            choices = data.get("choices", [])
            if not choices:
                return "ERROR: 无响应内容"
            choice = choices[0]
            message = choice.get("message", {})
            # Only fetch content; do not process reasoning_content.
            content = message.get("content", "")
            # Defensive: make sure no <think> tags leak through.
            if "<think>" in content:
                print("⚠ 警告:响应中包含<think>标签,正在移除...")
                # Strip the unexpected think block (local import keeps the
                # regex machinery off the common path).
                import re
                content = re.sub(r'<think>.*?</think>\s*', '', content, flags=re.DOTALL)
            return content.strip()
        except Exception as e:
            return f"ERROR: 解析响应失败 - {str(e)}"

    async def process_batch(self, batch: List[Dict], batch_num: int) -> List[Dict]:
        """Process one batch of questions concurrently (at most 50 in flight)."""
        print(f"\n{'='*50}")
        print(f"开始处理第 {batch_num} 批(非思考数据),共 {len(batch)} 个问题")
        print(f"{'='*50}")
        results = []
        semaphore = asyncio.Semaphore(50)  # cap the number of in-flight requests
        async with httpx.AsyncClient(http2=True, timeout=httpx.Timeout(REQUEST_TIMEOUT)) as session:
            tasks = [
                self.process_single_question(item, session, semaphore)
                for item in batch
            ]
            # Run all tasks concurrently; collect exceptions instead of raising.
            batch_results = await asyncio.gather(*tasks, return_exceptions=True)
            for result in batch_results:
                if isinstance(result, Exception):
                    print(f"✗ 任务异常: {str(result)}")
                else:
                    results.append(result)
        print(f"{batch_num} 批处理完成,成功 {len(results)}")
        return results

    def save_results(self, results: List[Dict], batch_num: int):
        """Write one batch of results to a timestamped JSON file; return its path."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_file = OUTPUT_DIR / f"deepseek_v3_no_thinking_batch{batch_num}_{timestamp}.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        print(f"📁 批次 {batch_num} 结果已保存: {output_file}")
        return output_file

    async def run(self):
        """Main pipeline: load questions, process them, save and verify results."""
        print(f"开始处理任务(生成不带思考过程的回答)...")
        print(f"输入文件: {INPUT_FILE}")
        print(f"输出目录: {OUTPUT_DIR}")
        print(f"模型: {MODEL_ID}")
        print(f"思考模式: DISABLED不生成思考过程")
        print(f"批次大小: {BATCH_SIZE}, 总批次: {TOTAL_BATCHES}")
        # Load the question file (expected: a JSON array of question dicts).
        try:
            with open(INPUT_FILE, 'r', encoding='utf-8') as f:
                all_questions = json.load(f)
            print(f"✓ 成功加载 {len(all_questions)} 个问题")
        except Exception as e:
            print(f"✗ 读取文件失败: {e}")
            return
        # Make sure the output directory exists.
        OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
        all_results = []
        saved_files = []
        # Process all questions in a single batch.
        batch_start_time = time.time()
        batch_results = await self.process_batch(all_questions, 1)
        batch_time = time.time() - batch_start_time
        # Save per-batch results.
        if batch_results:
            saved_file = self.save_results(batch_results, 1)
            saved_files.append(saved_file)
            all_results.extend(batch_results)
        print(f"处理耗时: {batch_time:.2f}")
        # Fix: guard against an empty question list (was a ZeroDivisionError).
        print(f"平均每个问题: {batch_time/max(len(all_questions), 1):.2f}")
        # Save a combined file with everything.
        if all_results:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            final_output = OUTPUT_DIR / f"deepseek_v3_no_thinking_all_{timestamp}.json"
            with open(final_output, 'w', encoding='utf-8') as f:
                json.dump(all_results, f, ensure_ascii=False, indent=2)
            print(f"\n📁 所有结果已汇总保存: {final_output}")
        # Verify that no answer contains a <think> tag.
        has_think_tag = False
        for result in all_results:
            if '<think>' in result.get('answer', ''):
                has_think_tag = True
                print(f"⚠ 警告:{result['id']} 的回答中包含<think>标签")
        if not has_think_tag:
            print("✓ 验证通过:所有回答都不包含<think>标签")
        # Print summary statistics.
        total_time = time.time() - self.start_time
        print(f"\n{'='*50}")
        print(f"处理完成统计:")
        print(f" - 总处理数: {self.processed_count}")
        print(f" - 失败数: {len([r for r in all_results if 'ERROR' in r.get('answer', '')])}")
        print(f" - 总耗时: {total_time:.2f}")
        print(f" - 平均耗时: {total_time/max(self.processed_count, 1):.2f}秒/问题")
        print(f" - 保存文件数: {len(saved_files)}")
        print(f" - 数据类型: 非思考回答(无<think>标签)")
        print(f"{'='*50}")
async def main():
    """Async entry point: construct the processor and run the whole pipeline."""
    await DeepSeekNoThinkingProcessor().run()
if __name__ == "__main__":
    # Run the async entry point (script usage only).
    asyncio.run(main())