"""
报告生成服务
负责生成各类研究报告
"""
import os
import logging
from datetime import datetime
from typing import Dict, List, Any, Optional
from app.models.report import (
SubtopicReport, FinalReport, ReportSection,
KeyInsight, HallucinationCheck
)
from app.models.research import ResearchSession, Subtopic
from app.models.search_result import SearchResult
from config import Config
logger = logging.getLogger(__name__)
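
# The parsing helpers below assume the LLM-generated report content is
# Chinese-language Markdown with a fixed layout: "### " opens a section,
# "#### " opens a sub-section, and named sections use headings such as
# "关键洞察" (key insights), "建议" (recommendations) and "执行摘要"
# (executive summary). Those heading literals are kept in Chinese so the
# parsers keep matching the generated content.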


class ReportGenerator:
    """Report generator."""

    def generate_subtopic_report(self, subtopic: Subtopic,
                                 integrated_info: Dict[str, Any],
                                 report_content: str) -> SubtopicReport:
        """Generate a subtopic report."""
        try:
            # Parse the report content into a structured format.
            sections = self._parse_report_sections(report_content)
            key_insights = self._extract_key_insights(report_content)
            recommendations = self._extract_recommendations(report_content)

            # Character count with spaces excluded, which is the natural
            # "word count" for Chinese text.
            word_count = len(report_content.replace(" ", ""))

            return SubtopicReport(
                subtopic_id=subtopic.id,
                subtopic_name=subtopic.topic,
                sections=sections,
                key_insights=key_insights,
                recommendations=recommendations,
                word_count=word_count
            )
        except Exception as e:
            logger.error(f"Failed to generate subtopic report: {e}")
            # Fall back to a minimal report that wraps the raw content.
            return SubtopicReport(
                subtopic_id=subtopic.id,
                subtopic_name=subtopic.topic,
                sections=[
                    ReportSection(
                        title="Report Content",
                        content=report_content
                    )
                ]
            )

    def generate_final_report(self, session: ResearchSession,
                              subtopic_reports: List[SubtopicReport],
                              final_content: str) -> FinalReport:
        """Generate the final report."""
        # Initialised up front so the fallback path below can use them even
        # if an exception is raised before they are computed (the original
        # used a fragile `'total_sources' in locals()` check here).
        total_sources = 0
        total_searches = 0
        try:
            # Parse the final report content.
            executive_summary = self._extract_executive_summary(final_content)
            main_findings = self._parse_main_findings(final_content)
            overall_insights = self._extract_overall_insights(final_content)
            recommendations = self._extract_final_recommendations(final_content)

            # Aggregate statistics.
            total_sources = self._count_total_sources(subtopic_reports)
            total_searches = self._count_total_searches(session)

            return FinalReport(
                session_id=session.id,
                title=session.question,
                executive_summary=executive_summary,
                main_findings=main_findings,
                subtopic_reports=subtopic_reports,
                overall_insights=overall_insights,
                recommendations=recommendations,
                methodology=self._generate_methodology(session),
                limitations=self._identify_limitations(session),
                total_sources=total_sources,
                total_searches=total_searches
            )
        except Exception as e:
            logger.error(f"Failed to generate final report: {e}")
            # Fall back to a minimal report.
            return FinalReport(
                session_id=session.id,
                title=session.question,
                executive_summary="An error occurred while generating the research report.",
                subtopic_reports=subtopic_reports,
                total_sources=total_sources,
                total_searches=total_searches
            )

    def save_report(self, report: FinalReport, format: str = "markdown") -> str:
        """Save the report to a file and return its path."""
        try:
            # Validate the format before building a filename for it.
            if format != "markdown":
                # Other formats (PDF, HTML, ...) may be supported later.
                raise ValueError(f"Unsupported format: {format}")

            # Build a timestamped filename.
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"{report.session_id}_{timestamp}.md"
            filepath = os.path.join(Config.REPORTS_DIR, filename)

            report.save_to_file(filepath)
            logger.info(f"Report saved: {filepath}")
            return filepath
        except Exception as e:
            logger.error(f"Failed to save report: {e}")
            raise
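
    # Example (illustrative): for a session id of "abc123", save_report
    # writes something like "<Config.REPORTS_DIR>/abc123_20250702_153536.md".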

    def create_hallucination_report(self, hallucinations: List[Dict[str, Any]]) -> str:
        """Create a hallucination-detection report."""
        if not hallucinations:
            return "No hallucinated content detected."

        report_lines = ["# Hallucination Detection Report", ""]
        report_lines.append(
            f"Detected {len(hallucinations)} instance(s) of potentially hallucinated content:"
        )
        report_lines.append("")

        for i, h in enumerate(hallucinations, 1):
            report_lines.extend([
                f"## {i}. {h.get('type', 'unknown type')}",
                f"**URL**: {h.get('url', 'N/A')}",
                f"**Original content**: {h.get('content', 'N/A')}",
                f"**Explanation**: {h.get('explanation', 'no explanation')}",
                ""
            ])
        return '\n'.join(report_lines)
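
    # Example (illustrative; the dict shape is assumed to match whatever the
    # upstream hallucination checker produces):
    #
    #   ReportGenerator().create_hallucination_report([{
    #       "type": "fabricated citation",
    #       "url": "https://example.com/missing",
    #       "content": "a claim attributed to a page that does not contain it",
    #       "explanation": "URL never appeared in the collected search results",
    #   }])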

    # ========== Parsing helpers ==========

    def _parse_report_sections(self, content: str) -> List[ReportSection]:
        """Parse report sections out of Markdown content."""
        sections = []
        # Simple line-based Markdown parsing: "### " starts a new section;
        # anything before the first section heading is ignored.
        lines = content.split('\n')
        current_section = None
        current_content = []
        for line in lines:
            if line.startswith('### '):
                # Close the previous section.
                if current_section:
                    current_section.content = '\n'.join(current_content).strip()
                    sections.append(current_section)
                # Start a new section.
                current_section = ReportSection(title=line[4:].strip(), content="")
                current_content = []
            elif current_section:
                # Sub-section headings ("#### ") are kept inline as part of
                # the section body (simplified handling).
                current_content.append(line)
        # Close the last section.
        if current_section:
            current_section.content = '\n'.join(current_content).strip()
            sections.append(current_section)
        return sections
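
    # Example (illustrative): "### A\nbody\n#### A.1\nmore body" parses into
    # a single ReportSection(title="A") whose content keeps the "#### A.1"
    # sub-heading inline.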

    def _extract_key_insights(self, content: str) -> List[KeyInsight]:
        """Extract key insights."""
        insights = []
        # Look for the "关键洞察" ("Key Insights") section.
        lines = content.split('\n')
        in_insights_section = False
        for line in lines:
            if '关键洞察' in line and line.startswith('#'):
                in_insights_section = True
                continue
            if in_insights_section:
                if line.startswith('#') and '关键洞察' not in line:
                    break
                # Numbered list items ("1.", "2.", ...) are treated as
                # insights; a regex avoids the original hard cap of five.
                if re.match(r'^\d+\.', line.strip()):
                    insight_text = line.strip().split('.', 1)[1].strip()
                    # Strip Markdown emphasis markers.
                    insight_text = insight_text.replace('**', '').replace('*', '')
                    # Collect any source URLs mentioned in the insight.
                    source_urls = self._extract_urls_from_text(insight_text)
                    # Keep only the text before the source citation. The
                    # original separator character was lost to Unicode
                    # stripping; a full-width parenthesis is assumed here.
                    insight = insight_text.split('(')[0] if '(' in insight_text else insight_text
                    insights.append(KeyInsight(
                        insight=insight,
                        source_urls=source_urls,
                        confidence=0.8  # default confidence
                    ))
        return insights

    def _extract_recommendations(self, content: str) -> List[str]:
        """Extract recommendations."""
        recommendations = []
        lines = content.split('\n')
        in_recommendations_section = False
        for line in lines:
            # Look for the "建议" ("Recommendations") section.
            if '建议' in line and line.startswith('#'):
                in_recommendations_section = True
                continue
            if in_recommendations_section:
                if line.startswith('#') and '建议' not in line:
                    break
                # The third bullet character was lost to Unicode stripping;
                # "•" is assumed here.
                if line.strip().startswith(('-', '*', '•')):
                    recommendation = line.strip()[1:].strip()
                    if recommendation:
                        recommendations.append(recommendation)
        return recommendations

    def _extract_executive_summary(self, content: str) -> str:
        """Extract the executive summary."""
        lines = content.split('\n')
        in_summary = False
        summary_lines = []
        for line in lines:
            # Look for the "执行摘要" ("Executive Summary") section.
            if '执行摘要' in line and line.startswith('#'):
                in_summary = True
                continue
            if in_summary:
                if line.startswith('#'):
                    break
                summary_lines.append(line)
        return '\n'.join(summary_lines).strip()

    def _parse_main_findings(self, content: str) -> List[ReportSection]:
        """Parse the main findings."""
        # Like _parse_report_sections, but restricted to the "main findings"
        # part of the report. Simplified implementation: not yet extracted.
        return []

    def _extract_overall_insights(self, content: str) -> List[KeyInsight]:
        """Extract overall insights."""
        # Like _extract_key_insights, but for the "综合洞察" (overall
        # insights) section. Not yet implemented.
        return []

    def _extract_final_recommendations(self, content: str) -> List[str]:
        """Extract final recommendations."""
        # Like _extract_recommendations. Not yet implemented.
        return []

    def _extract_urls_from_text(self, text: str) -> List[str]:
        """Extract URLs from text."""
        # Simple URL extraction (`re` is imported at module level). The
        # full-width ")" is excluded alongside the ASCII one since the
        # surrounding text is Chinese; this extends the original pattern.
        url_pattern = r'https?://[^\s))]+|www\.[^\s))]+'
        urls = re.findall(url_pattern, text)
        # Clean up the matches.
        cleaned_urls = []
        for url in urls:
            # Strip trailing punctuation, full-width variants included.
            url = url.rstrip('.,;:!?)。,;:!?)')
            if url:
                cleaned_urls.append(url)
        return cleaned_urls
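
    # Example (illustrative):
    #   _extract_urls_from_text("来源: https://example.com/a (https://example.com/b)")
    #   -> ["https://example.com/a", "https://example.com/b"]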

    def _count_total_sources(self, subtopic_reports: List[SubtopicReport]) -> int:
        """Count the number of distinct sources across all subtopic reports."""
        all_urls = set()
        for report in subtopic_reports:
            for section in report.sections:
                all_urls.update(section.sources)
            for insight in report.key_insights:
                all_urls.update(insight.source_urls)
        return len(all_urls)

    def _count_total_searches(self, session: ResearchSession) -> int:
        """Count the total number of searches performed."""
        if not session.outline:
            return 0
        return sum(subtopic.get_total_searches()
                   for subtopic in session.outline.sub_topics)

    def _generate_methodology(self, session: ResearchSession) -> str:
        """Generate the methodology description."""
        methodology = f"""
This research followed a systematic deep-research process:
1. **Question analysis**: the question was classified as "{session.question_type.value if session.question_type else 'unknown'}" and refined into {len(session.refined_questions)} specific questions.
2. **Research planning**: a research outline with {len(session.outline.sub_topics) if session.outline else 0} subtopics was drawn up, allocating search resources to each subtopic according to its importance.
3. **Information gathering**: multiple rounds of searches were run with the Tavily search engine, {self._count_total_searches(session)} searches in total.
4. **Quality control**: an AI model assessed the importance of search results and performed hallucination detection and content verification.
5. **Synthesis**: all information was integrated, key insights were distilled, and a structured report was produced.
"""
        return methodology.strip()

    def _identify_limitations(self, session: ResearchSession) -> List[str]:
        """Identify the limitations of the research."""
        limitations = [
            "Search results are limited to publicly accessible web content.",
            "Some specialised domains may lack in-depth analysis.",
            "Time-sensitive information may be out of date.",
        ]
        # Add further limitations based on the actual session state.
        if session.outline and any(st.status == "cancelled" for st in session.outline.sub_topics):
            limitations.append("Some subtopics were not fully researched.")
        return limitations
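

# Minimal smoke test (illustrative). It only exercises the helpers that do
# not depend on a live research session; the app.models.* imports above must
# still be importable for the module itself to load.
if __name__ == "__main__":
    generator = ReportGenerator()
    print(generator.create_hallucination_report([
        {"type": "fabricated citation",
         "url": "https://example.com/missing",
         "content": "a claim attributed to a page that does not contain it",
         "explanation": "URL never appeared in the collected search results"},
    ]))
    print(generator._extract_urls_from_text(
        "来源: https://example.com/a (https://example.com/b)"
    ))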