deepresearch/所有文件/report_generator.py

"""
报告生成服务
负责生成各类研究报告
"""
import os
import logging
from datetime import datetime
from typing import Dict, List, Any, Optional
from app.models.report import (
    SubtopicReport, FinalReport, ReportSection,
    KeyInsight, HallucinationCheck
)
from app.models.research import ResearchSession, Subtopic
from app.models.search_result import SearchResult
from config import Config

logger = logging.getLogger(__name__)

class ReportGenerator:
    """报告生成器"""

    def generate_subtopic_report(self, subtopic: Subtopic,
                               integrated_info: Dict[str, Any],
                               report_content: str) -> SubtopicReport:
        """生成子主题报告"""
        try:
            # 解析报告内容为结构化格式
            sections = self._parse_report_sections(report_content)
            key_insights = self._extract_key_insights(report_content)
            recommendations = self._extract_recommendations(report_content)

            # 统计字数
            word_count = len(report_content.replace(" ", ""))

            # 创建子主题报告
            report = SubtopicReport(
                subtopic_id=subtopic.id,
                subtopic_name=subtopic.topic,
                sections=sections,
                key_insights=key_insights,
                recommendations=recommendations,
                word_count=word_count
            )

            return report

        except Exception as e:
            logger.error(f"生成子主题报告失败: {e}")
            # 返回基本报告
            return SubtopicReport(
                subtopic_id=subtopic.id,
                subtopic_name=subtopic.topic,
                sections=[
                    ReportSection(
                        title="报告内容",
                        content=report_content
                    )
                ]
            )

    def generate_final_report(self, session: ResearchSession,
                            subtopic_reports: List[SubtopicReport],
                            final_content: str) -> FinalReport:
        """生成最终报告"""
        try:
            # 解析最终报告内容
            executive_summary = self._extract_executive_summary(final_content)
            main_findings = self._parse_main_findings(final_content)
            overall_insights = self._extract_overall_insights(final_content)
            recommendations = self._extract_final_recommendations(final_content)

            # 统计信息
            total_sources = self._count_total_sources(subtopic_reports)
            total_searches = self._count_total_searches(session)

            # 创建最终报告
            report = FinalReport(
                session_id=session.id,
                title=session.question,
                executive_summary=executive_summary,
                main_findings=main_findings,
                subtopic_reports=subtopic_reports,
                overall_insights=overall_insights,
                recommendations=recommendations,
                methodology=self._generate_methodology(session),
                limitations=self._identify_limitations(session),
                total_sources=total_sources,
                total_searches=total_searches
            )

            return report

        except Exception as e:
            logger.error(f"生成最终报告失败: {e}")
            # 返回基本报告
            return FinalReport(
                session_id=session.id,
                title=session.question,
                executive_summary="研究报告生成过程中出现错误。",
                subtopic_reports=subtopic_reports,
                total_sources=total_sources if 'total_sources' in locals() else 0,
                total_searches=total_searches if 'total_searches' in locals() else 0
            )

    def save_report(self, report: FinalReport, format: str = "markdown") -> str:
        """保存报告到文件"""
        try:
            # 生成文件名
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"{report.session_id}_{timestamp}.md"
            filepath = os.path.join(Config.REPORTS_DIR, filename)

            # 保存文件
            if format == "markdown":
                report.save_to_file(filepath)
            else:
                # 未来可以支持其他格式（PDF、HTML等）
                raise ValueError(f"不支持的格式: {format}")

            logger.info(f"报告已保存: {filepath}")
            return filepath

        except Exception as e:
            logger.error(f"保存报告失败: {e}")
            raise

    def create_hallucination_report(self, hallucinations: List[Dict[str, Any]]) -> str:
        """创建幻觉检测报告"""
        if not hallucinations:
            return "未检测到幻觉内容。"

        report_lines = ["# 幻觉检测报告", ""]
        report_lines.append(f"共检测到 {len(hallucinations)} 处可能的幻觉内容：")
        report_lines.append("")

        for i, h in enumerate(hallucinations, 1):
            report_lines.extend([
                f"## {i}. {h.get('type', '未知类型')}",
                f"**URL**: {h.get('url', 'N/A')}",
                f"**原始内容**: {h.get('content', 'N/A')}",
                f"**说明**: {h.get('explanation', '无说明')}",
                ""
            ])

        return '\n'.join(report_lines)

    # ========== 解析辅助方法 ==========

    def _parse_report_sections(self, content: str) -> List[ReportSection]:
        """解析报告章节"""
        sections = []

        # 简单的Markdown解析
        lines = content.split('\n')
        current_section = None
        current_content = []

        for line in lines:
            if line.startswith('### '):
                # 保存前一个章节
                if current_section:
                    current_section.content = '\n'.join(current_content).strip()
                    sections.append(current_section)

                # 开始新章节
                current_section = ReportSection(title=line[4:].strip(), content="")
                current_content = []

            elif line.startswith('#### ') and current_section:
                # 子章节
                subsection_title = line[5:].strip()
                # 收集子章节内容（简化处理）
                current_content.append(line)

            elif current_section:
                current_content.append(line)

        # 保存最后一个章节
        if current_section:
            current_section.content = '\n'.join(current_content).strip()
            sections.append(current_section)

        return sections

    def _extract_key_insights(self, content: str) -> List[KeyInsight]:
        """提取关键洞察"""
        insights = []

        # 查找"关键洞察"部分
        lines = content.split('\n')
        in_insights_section = False

        for i, line in enumerate(lines):
            if '关键洞察' in line and line.startswith('#'):
                in_insights_section = True
                continue

            if in_insights_section:
                if line.startswith('#') and '关键洞察' not in line:
                    break

                if line.strip().startswith(('1.', '2.', '3.', '4.', '5.')):
                    # 提取洞察内容
                    insight_text = line.split('.', 1)[1].strip()
                    # 移除Markdown格式
                    insight_text = insight_text.replace('**', '').replace('*', '')

                    # 查找来源URL
                    source_urls = self._extract_urls_from_text(insight_text)

                    insights.append(KeyInsight(
                        insight=insight_text.split('：')[0] if '：' in insight_text else insight_text,
                        source_urls=source_urls,
                        confidence=0.8  # 默认置信度
                    ))

        return insights

    def _extract_recommendations(self, content: str) -> List[str]:
        """提取建议"""
        recommendations = []

        lines = content.split('\n')
        in_recommendations_section = False

        for line in lines:
            if '建议' in line and line.startswith('#'):
                in_recommendations_section = True
                continue

            if in_recommendations_section:
                if line.startswith('#') and '建议' not in line:
                    break

                if line.strip().startswith(('-', '*', '•')):
                    recommendation = line.strip()[1:].strip()
                    if recommendation:
                        recommendations.append(recommendation)

        return recommendations

    def _extract_executive_summary(self, content: str) -> str:
        """提取执行摘要"""
        lines = content.split('\n')
        in_summary = False
        summary_lines = []

        for line in lines:
            if '执行摘要' in line and line.startswith('#'):
                in_summary = True
                continue

            if in_summary:
                if line.startswith('#'):
                    break
                summary_lines.append(line)

        return '\n'.join(summary_lines).strip()

    def _parse_main_findings(self, content: str) -> List[ReportSection]:
        """解析主要发现"""
        # 类似于_parse_report_sections，但只关注"主要发现"部分
        # 简化实现
        return []

    def _extract_overall_insights(self, content: str) -> List[KeyInsight]:
        """提取整体洞察"""
        # 类似于_extract_key_insights，但关注"综合洞察"部分
        return []

    def _extract_final_recommendations(self, content: str) -> List[str]:
        """提取最终建议"""
        # 类似于_extract_recommendations
        return []

    def _extract_urls_from_text(self, text: str) -> List[str]:
        """从文本中提取URL"""
        import re

        # 简单的URL提取
        url_pattern = r'https?://[^\s）)]+|www\.[^\s）)]+'
        urls = re.findall(url_pattern, text)

        # 清理URL
        cleaned_urls = []
        for url in urls:
            # 移除末尾的标点
            url = url.rstrip('.,;:!?）)')
            if url:
                cleaned_urls.append(url)

        return cleaned_urls

    def _count_total_sources(self, subtopic_reports: List[SubtopicReport]) -> int:
        """统计总来源数"""
        all_urls = set()

        for report in subtopic_reports:
            for section in report.sections:
                all_urls.update(section.sources)

            for insight in report.key_insights:
                all_urls.update(insight.source_urls)

        return len(all_urls)

    def _count_total_searches(self, session: ResearchSession) -> int:
        """统计总搜索次数"""
        if not session.outline:
            return 0

        total = 0
        for subtopic in session.outline.sub_topics:
            total += subtopic.get_total_searches()

        return total

    def _generate_methodology(self, session: ResearchSession) -> str:
        """生成研究方法说明"""
        methodology = f"""
本研究采用系统化的深度研究方法，具体流程如下：

1. **问题分析**: 识别问题类型为"{session.question_type.value if session.question_type else '未知'}"，并细化为{len(session.refined_questions)}个具体问题。

2. **研究规划**: 制定包含{len(session.outline.sub_topics) if session.outline else 0}个子主题的研究大纲，每个子主题根据重要性分配不同的搜索资源。

3. **信息收集**: 使用Tavily搜索引擎进行多轮搜索，共执行{self._count_total_searches(session)}次搜索。

4. **质量控制**: 通过AI评估搜索结果重要性，并进行幻觉检测和内容验证。

5. **综合分析**: 整合所有信息，提炼关键洞察，形成结构化报告。
"""
        return methodology.strip()

    def _identify_limitations(self, session: ResearchSession) -> List[str]:
        """识别研究局限性"""
        limitations = [
            "搜索结果受限于公开可访问的网络信息",
            "部分专业领域可能缺乏深度分析",
            "时效性信息可能存在延迟"
        ]

        # 根据实际情况添加更多局限性
        if session.outline and any(st.status == "cancelled" for st in session.outline.sub_topics):
            limitations.append("部分子主题研究未完成")

        return limitations