"""Report generation service.

Turns the Markdown text produced earlier in the research pipeline into
structured sub-topic reports and a final report, and persists the result.
"""

import logging
import os
import re
from datetime import datetime
from typing import Any, Dict, Iterator, List, Optional

from app.models.report import (
    SubtopicReport,
    FinalReport,
    ReportSection,
    KeyInsight,
    HallucinationCheck,
)
from app.models.research import ResearchSession, Subtopic
from app.models.search_result import SearchResult
from config import Config

logger = logging.getLogger(__name__)

# URLs stop at whitespace or a closing parenthesis (ASCII or full-width), so
# Markdown links and URLs wrapped in Chinese parentheses are cut correctly.
_URL_RE = re.compile(r"https?://[^\s)）]+|www\.[^\s)）]+")
# Punctuation that may trail a URL inside a sentence but is not part of it.
_URL_TRAILING_PUNCT = ".,;:!?)）。，；：！？"
# A numbered list item such as "3. text"; group 1 is everything after the dot.
_NUMBERED_ITEM_RE = re.compile(r"^\s*\d+\.(.*)$")
# A colon separating "headline: details"; the "://" of a URL does not count.
_HEADLINE_SPLIT_RE = re.compile(r"[:：](?!//)")
# Title used when a report cannot be broken into sections.
_DEFAULT_SECTION_TITLE = "报告内容"


class ReportGenerator:
    """Builds report model objects from Markdown report text."""

    def generate_subtopic_report(self, subtopic: Subtopic,
                                 integrated_info: Dict[str, Any],
                                 report_content: str) -> SubtopicReport:
        """Build a structured report for one sub-topic.

        Args:
            subtopic: The sub-topic the report belongs to; its ``id`` and
                ``topic`` are copied into the report.
            integrated_info: Accepted for interface compatibility; not read
                by this method.
            report_content: Markdown text of the sub-topic report.

        Returns:
            A ``SubtopicReport`` with parsed sections, insights and
            recommendations. If parsing fails, a minimal report holding the
            raw content in a single section is returned instead.
        """
        try:
            sections = self._parse_report_sections(report_content)
            key_insights = self._extract_key_insights(report_content)
            recommendations = self._extract_recommendations(report_content)

            # Count characters with all whitespace removed, so that line
            # breaks and tabs do not inflate the figure.
            word_count = len("".join(report_content.split()))

            return SubtopicReport(
                subtopic_id=subtopic.id,
                subtopic_name=subtopic.topic,
                sections=sections,
                key_insights=key_insights,
                recommendations=recommendations,
                word_count=word_count,
            )
        except Exception as e:
            # Best effort: never lose the report text because parsing failed.
            logger.exception("生成子主题报告失败: %s", e)
            return SubtopicReport(
                subtopic_id=subtopic.id,
                subtopic_name=subtopic.topic,
                sections=[
                    ReportSection(
                        title=_DEFAULT_SECTION_TITLE,
                        content=report_content,
                    )
                ],
            )

    def generate_final_report(self, session: ResearchSession,
                              subtopic_reports: List[SubtopicReport],
                              final_content: str) -> FinalReport:
        """Build the final report for a research session.

        Args:
            session: The research session; supplies id, question, outline
                and the data behind the methodology/limitations text.
            subtopic_reports: Reports already generated for each sub-topic.
            final_content: Markdown text of the final report.

        Returns:
            A populated ``FinalReport``. If anything fails, a minimal report
            carrying an error summary and whatever statistics were computed
            before the failure is returned instead.
        """
        # Initialised up front so the fallback can use them even when the
        # failure happens before they are computed.
        total_sources = 0
        total_searches = 0
        try:
            executive_summary = self._extract_executive_summary(final_content)
            main_findings = self._parse_main_findings(final_content)
            overall_insights = self._extract_overall_insights(final_content)
            recommendations = self._extract_final_recommendations(final_content)

            total_sources = self._count_total_sources(subtopic_reports)
            total_searches = self._count_total_searches(session)

            return FinalReport(
                session_id=session.id,
                title=session.question,
                executive_summary=executive_summary,
                main_findings=main_findings,
                subtopic_reports=subtopic_reports,
                overall_insights=overall_insights,
                recommendations=recommendations,
                methodology=self._generate_methodology(session),
                limitations=self._identify_limitations(session),
                total_sources=total_sources,
                total_searches=total_searches,
            )
        except Exception as e:
            logger.exception("生成最终报告失败: %s", e)
            return FinalReport(
                session_id=session.id,
                title=session.question,
                executive_summary="研究报告生成过程中出现错误。",
                subtopic_reports=subtopic_reports,
                total_sources=total_sources,
                total_searches=total_searches,
            )

    def save_report(self, report: FinalReport, format: str = "markdown") -> str:
        """Write a final report to ``Config.REPORTS_DIR``.

        Args:
            report: The report to persist; ``report.save_to_file`` does the
                actual writing.
            format: Output format. Only ``"markdown"`` is supported.

        Returns:
            The path of the written file.

        Raises:
            ValueError: If ``format`` is not supported.
            Exception: Any error raised while creating the directory or
                writing the file is logged and re-raised.
        """
        try:
            # Reject unsupported formats before touching the file system.
            if format != "markdown":
                raise ValueError(f"不支持的格式: {format}")

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"{report.session_id}_{timestamp}.md"

            # The reports directory may not exist on a fresh deployment.
            os.makedirs(Config.REPORTS_DIR, exist_ok=True)
            filepath = os.path.join(Config.REPORTS_DIR, filename)

            report.save_to_file(filepath)
            logger.info("报告已保存: %s", filepath)
            return filepath
        except Exception as e:
            logger.exception("保存报告失败: %s", e)
            raise

    def create_hallucination_report(self, hallucinations: List[Dict[str, Any]]) -> str:
        """Render detected hallucinations as a Markdown document.

        Args:
            hallucinations: One dict per finding; the keys ``type``, ``url``,
                ``content`` and ``explanation`` are read, each with a
                placeholder when missing.

        Returns:
            Markdown text, or a short "nothing detected" sentence when the
            list is empty.
        """
        if not hallucinations:
            return "未检测到幻觉内容。"

        report_lines = [
            "# 幻觉检测报告",
            "",
            f"共检测到 {len(hallucinations)} 处可能的幻觉内容:",
            "",
        ]
        for index, item in enumerate(hallucinations, 1):
            report_lines.extend([
                f"## {index}. {item.get('type', '未知类型')}",
                f"**URL**: {item.get('url', 'N/A')}",
                f"**原始内容**: {item.get('content', 'N/A')}",
                f"**说明**: {item.get('explanation', '无说明')}",
                "",
            ])
        return "\n".join(report_lines)

    # ========== Parsing helpers ==========

    @staticmethod
    def _iter_section_lines(content: str, keyword: str) -> Iterator[str]:
        """Yield the body lines of the Markdown section whose heading contains ``keyword``.

        A heading is any line starting with ``#``. Scanning starts after the
        first heading containing ``keyword`` and stops at the next heading
        that does not contain it; headings themselves are never yielded.
        """
        in_section = False
        for line in content.split("\n"):
            if line.startswith("#"):
                if keyword in line:
                    in_section = True
                elif in_section:
                    return
                continue
            if in_section:
                yield line

    def _parse_report_sections(self, content: str) -> List[ReportSection]:
        """Split Markdown into sections at level-3 (``### ``) headings.

        Deeper headings (``#### `` and below) stay inside the body of the
        enclosing section. Text before the first ``### `` heading is not
        part of any section. When the text has no ``### `` heading at all,
        the whole non-empty content is returned as one default section so
        the report body is not lost.
        """
        sections: List[ReportSection] = []
        title: Optional[str] = None
        body: List[str] = []

        for line in content.split("\n"):
            if line.startswith("### "):
                # A new heading closes the section collected so far.
                if title is not None:
                    sections.append(
                        ReportSection(title=title, content="\n".join(body).strip())
                    )
                title = line[4:].strip()
                body = []
            elif title is not None:
                body.append(line)

        if title is not None:
            sections.append(
                ReportSection(title=title, content="\n".join(body).strip())
            )

        if not sections and content.strip():
            sections.append(
                ReportSection(title=_DEFAULT_SECTION_TITLE, content=content.strip())
            )
        return sections

    def _extract_key_insights(self, content: str) -> List[KeyInsight]:
        """Extract numbered items from the "关键洞察" section as insights.

        Every ``N.`` list item becomes one ``KeyInsight``. Markdown emphasis
        markers are removed, URLs found in the item become its sources, and
        the insight text is the part before the first colon (ASCII or
        full-width) that is not part of a URL scheme.
        """
        insights: List[KeyInsight] = []
        for line in self._iter_section_lines(content, "关键洞察"):
            match = _NUMBERED_ITEM_RE.match(line)
            if match is None:
                continue

            # Drop bold/italic markers before looking for URLs so that a
            # URL wrapped in emphasis does not keep trailing asterisks.
            insight_text = match.group(1).strip().replace("*", "")
            if not insight_text:
                continue

            headline = _HEADLINE_SPLIT_RE.split(insight_text, maxsplit=1)[0].strip()
            insights.append(KeyInsight(
                insight=headline or insight_text,
                source_urls=self._extract_urls_from_text(insight_text),
                confidence=0.8,  # default confidence; nothing better is known here
            ))
        return insights

    def _extract_recommendations(self, content: str) -> List[str]:
        """Extract bullet items from the first section whose heading contains "建议".

        Lines starting with ``-``, ``*`` or ``•`` are bullets. Bold text
        (``**...``) and horizontal rules (``---``, ``***``) start with the
        same characters but are not bullets, so they are skipped.
        """
        recommendations: List[str] = []
        for line in self._iter_section_lines(content, "建议"):
            stripped = line.strip()
            if not stripped.startswith(("-", "*", "•")):
                continue
            if stripped.startswith("**") or not stripped.strip("-* "):
                continue
            recommendation = stripped[1:].strip()
            if recommendation:
                recommendations.append(recommendation)
        return recommendations

    def _extract_executive_summary(self, content: str) -> str:
        """Return the body of the "执行摘要" section, or "" when it is absent."""
        return "\n".join(self._iter_section_lines(content, "执行摘要")).strip()

    def _parse_main_findings(self, content: str) -> List[ReportSection]:
        """Placeholder for parsing the "主要发现" part; currently returns no sections."""
        # TODO: implement along the lines of _parse_report_sections,
        # restricted to the main-findings part of the report.
        return []

    def _extract_overall_insights(self, content: str) -> List[KeyInsight]:
        """Placeholder for parsing the "综合洞察" part; currently returns no insights."""
        # TODO: implement along the lines of _extract_key_insights.
        return []

    def _extract_final_recommendations(self, content: str) -> List[str]:
        """Placeholder for final recommendations; currently returns an empty list."""
        # TODO: implement along the lines of _extract_recommendations.
        return []

    def _extract_urls_from_text(self, text: str) -> List[str]:
        """Return the URLs found in ``text``, without trailing punctuation."""
        cleaned_urls = []
        for raw_url in _URL_RE.findall(text):
            url = raw_url.rstrip(_URL_TRAILING_PUNCT)
            if url:
                cleaned_urls.append(url)
        return cleaned_urls

    def _count_total_sources(self, subtopic_reports: List[SubtopicReport]) -> int:
        """Count distinct source URLs across all sections and insights."""
        all_urls = set()
        for report in subtopic_reports:
            for section in report.sections or []:
                all_urls.update(section.sources or [])
            for insight in report.key_insights or []:
                all_urls.update(insight.source_urls or [])
        return len(all_urls)

    def _count_total_searches(self, session: ResearchSession) -> int:
        """Sum the searches of every sub-topic in the outline (0 without an outline)."""
        if not session.outline:
            return 0
        return sum(
            subtopic.get_total_searches()
            for subtopic in session.outline.sub_topics
        )

    def _generate_methodology(self, session: ResearchSession) -> str:
        """Describe the research method, filled in with session statistics."""
        # Computed up front so the text template below stays readable and
        # missing session data degrades to a placeholder or zero.
        question_type = session.question_type.value if session.question_type else '未知'
        refined_count = len(session.refined_questions or [])
        subtopic_count = len(session.outline.sub_topics) if session.outline else 0
        search_count = self._count_total_searches(session)

        lines = [
            "本研究采用系统化的深度研究方法,具体流程如下:",
            "",
            f'1. **问题分析**: 识别问题类型为"{question_type}",并细化为{refined_count}个具体问题。',
            f"2. **研究规划**: 制定包含{subtopic_count}个子主题的研究大纲,每个子主题根据重要性分配不同的搜索资源。",
            f"3. **信息收集**: 使用Tavily搜索引擎进行多轮搜索,共执行{search_count}次搜索。",
            "4. **质量控制**: 通过AI评估搜索结果重要性,并进行幻觉检测和内容验证。",
            "5. **综合分析**: 整合所有信息,提炼关键洞察,形成结构化报告。",
        ]
        return "\n".join(lines)

    def _identify_limitations(self, session: ResearchSession) -> List[str]:
        """List the limitations that apply to this session's research."""
        limitations = [
            "搜索结果受限于公开可访问的网络信息",
            "部分专业领域可能缺乏深度分析",
            "时效性信息可能存在延迟",
        ]

        # A cancelled sub-topic means part of the outline was never researched.
        outline = session.outline
        if outline and any(st.status == "cancelled" for st in outline.sub_topics):
            limitations.append("部分子主题研究未完成")

        return limitations