"""
|
||
报告生成服务
|
||
负责生成各类研究报告
|
||
"""
|
||
import os
import re
import logging
from datetime import datetime
from typing import Dict, List, Any, Optional

from app.models.report import (
    SubtopicReport, FinalReport, ReportSection,
    KeyInsight, HallucinationCheck
)
from app.models.research import ResearchSession, Subtopic
from app.models.search_result import SearchResult
from config import Config

logger = logging.getLogger(__name__)


class ReportGenerator:
    """Report generator."""

    def generate_subtopic_report(self, subtopic: Subtopic,
                                 integrated_info: Dict[str, Any],
                                 report_content: str) -> SubtopicReport:
        """Generate a report for a single subtopic."""
        try:
            # Parse the report content into a structured format
            sections = self._parse_report_sections(report_content)
            key_insights = self._extract_key_insights(report_content)
            recommendations = self._extract_recommendations(report_content)

            # Character count with spaces removed (a reasonable word count for CJK text)
            word_count = len(report_content.replace(" ", ""))

            # Build the subtopic report
            report = SubtopicReport(
                subtopic_id=subtopic.id,
                subtopic_name=subtopic.topic,
                sections=sections,
                key_insights=key_insights,
                recommendations=recommendations,
                word_count=word_count
            )

            return report

        except Exception as e:
            logger.error(f"Failed to generate subtopic report: {e}")
            # Fall back to a minimal report that wraps the raw content
            return SubtopicReport(
                subtopic_id=subtopic.id,
                subtopic_name=subtopic.topic,
                sections=[
                    ReportSection(
                        title="报告内容",  # "Report content" -- kept in Chinese to match the report language
                        content=report_content
                    )
                ]
            )

    def generate_final_report(self, session: ResearchSession,
                              subtopic_reports: List[SubtopicReport],
                              final_content: str) -> FinalReport:
        """Generate the final report."""
        # Defaults so the fallback report below never references unbound names
        total_sources = 0
        total_searches = 0

        try:
            # Parse the final report content
            executive_summary = self._extract_executive_summary(final_content)
            main_findings = self._parse_main_findings(final_content)
            overall_insights = self._extract_overall_insights(final_content)
            recommendations = self._extract_final_recommendations(final_content)

            # Aggregate statistics
            total_sources = self._count_total_sources(subtopic_reports)
            total_searches = self._count_total_searches(session)

            # Build the final report
            report = FinalReport(
                session_id=session.id,
                title=session.question,
                executive_summary=executive_summary,
                main_findings=main_findings,
                subtopic_reports=subtopic_reports,
                overall_insights=overall_insights,
                recommendations=recommendations,
                methodology=self._generate_methodology(session),
                limitations=self._identify_limitations(session),
                total_sources=total_sources,
                total_searches=total_searches
            )

            return report

        except Exception as e:
            logger.error(f"Failed to generate final report: {e}")
            # Fall back to a minimal report
            return FinalReport(
                session_id=session.id,
                title=session.question,
                executive_summary="研究报告生成过程中出现错误。",  # "An error occurred while generating the report."
                subtopic_reports=subtopic_reports,
                total_sources=total_sources,
                total_searches=total_searches
            )

    def save_report(self, report: FinalReport, format: str = "markdown") -> str:
        """Save the report to a file and return its path."""
        try:
            # Only Markdown is supported for now; other formats (PDF, HTML, ...) may come later
            if format != "markdown":
                raise ValueError(f"Unsupported format: {format}")

            # Build a timestamped filename and make sure the target directory exists
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"{report.session_id}_{timestamp}.md"
            os.makedirs(Config.REPORTS_DIR, exist_ok=True)
            filepath = os.path.join(Config.REPORTS_DIR, filename)

            report.save_to_file(filepath)

            logger.info(f"Report saved: {filepath}")
            return filepath

        except Exception as e:
            logger.error(f"Failed to save report: {e}")
            raise
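
    # Example: save_report(report) writes something like
    # "<Config.REPORTS_DIR>/<session_id>_20250101_120000.md" and returns that path.
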
    def create_hallucination_report(self, hallucinations: List[Dict[str, Any]]) -> str:
        """Create a hallucination-detection report (Markdown, in Chinese)."""
        if not hallucinations:
            return "未检测到幻觉内容。"  # "No hallucinated content detected."

        report_lines = ["# 幻觉检测报告", ""]  # "Hallucination Detection Report"
        # "Detected N potential hallucinations:"
        report_lines.append(f"共检测到 {len(hallucinations)} 处可能的幻觉内容:")
        report_lines.append("")

        for i, h in enumerate(hallucinations, 1):
            report_lines.extend([
                f"## {i}. {h.get('type', '未知类型')}",         # hallucination type
                f"**URL**: {h.get('url', 'N/A')}",
                f"**原始内容**: {h.get('content', 'N/A')}",     # original content
                f"**说明**: {h.get('explanation', '无说明')}",  # explanation
                ""
            ])

        return '\n'.join(report_lines)
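
    # Illustrative sketch (hypothetical data, not part of the runtime path): each
    # hallucination entry is a plain dict, and only the keys 'type', 'url',
    # 'content' and 'explanation' are read:
    #
    #   create_hallucination_report([{
    #       "type": "虚构引用",
    #       "url": "https://example.com/source",
    #       "content": "……",
    #       "explanation": "该来源未出现在搜索结果中",
    #   }])
    #
    # renders a Markdown report with a single "## 1. 虚构引用" entry.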

    # ========== Parsing helpers ==========

    def _parse_report_sections(self, content: str) -> List[ReportSection]:
        """Parse the report content into sections."""
        sections = []

        # Lightweight Markdown parsing: '### ' starts a section;
        # '#### ' subsections are kept inline as part of the section body
        lines = content.split('\n')
        current_section = None
        current_content = []

        for line in lines:
            if line.startswith('### '):
                # Close out the previous section
                if current_section:
                    current_section.content = '\n'.join(current_content).strip()
                    sections.append(current_section)

                # Start a new section
                current_section = ReportSection(title=line[4:].strip(), content="")
                current_content = []

            elif current_section:
                current_content.append(line)

        # Close out the last section
        if current_section:
            current_section.content = '\n'.join(current_content).strip()
            sections.append(current_section)

        return sections
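
    # Worked example (hypothetical input) for _parse_report_sections:
    #
    #   "### 现状\n正文A\n#### 细节\n正文B\n### 趋势\n正文C"
    #
    # yields two sections, ("现状", "正文A\n#### 细节\n正文B") and ("趋势", "正文C");
    # '####' subsections stay inside their parent section's content.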
    def _extract_key_insights(self, content: str) -> List[KeyInsight]:
        """Extract key insights."""
        insights = []

        # Locate the "关键洞察" ("Key Insights") section
        lines = content.split('\n')
        in_insights_section = False

        for line in lines:
            if '关键洞察' in line and line.startswith('#'):
                in_insights_section = True
                continue

            if in_insights_section:
                # Any new heading ends the insights section
                if line.startswith('#') and '关键洞察' not in line:
                    break

                # Every numbered list item ("1.", "2.", ...) holds one insight
                if re.match(r'\d+\.', line.strip()):
                    insight_text = line.strip().split('.', 1)[1].strip()
                    # Strip Markdown emphasis markers
                    insight_text = insight_text.replace('**', '').replace('*', '')

                    # Collect any source URLs mentioned in the text
                    source_urls = self._extract_urls_from_text(insight_text)

                    # Keep the part before the first colon (ASCII or full-width), if any
                    insight = re.split(r'[:：]', insight_text, maxsplit=1)[0]

                    insights.append(KeyInsight(
                        insight=insight,
                        source_urls=source_urls,
                        confidence=0.8  # default confidence
                    ))

        return insights
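
    # Worked example (hypothetical input) for _extract_key_insights: inside a
    # "## 关键洞察" section, the line
    #
    #   "1. **成本下降**: 详见 https://example.com/report"
    #
    # yields KeyInsight(insight="成本下降",
    #                   source_urls=["https://example.com/report"],
    #                   confidence=0.8):
    # emphasis markers are stripped and text after the first colon is dropped.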
    def _extract_recommendations(self, content: str) -> List[str]:
        """Extract recommendations."""
        recommendations = []

        lines = content.split('\n')
        in_recommendations_section = False

        for line in lines:
            if '建议' in line and line.startswith('#'):  # "建议" = "Recommendations" heading
                in_recommendations_section = True
                continue

            if in_recommendations_section:
                if line.startswith('#') and '建议' not in line:
                    break

                # Bullet items hold individual recommendations
                if line.strip().startswith(('-', '*', '•')):
                    recommendation = line.strip()[1:].strip()
                    if recommendation:
                        recommendations.append(recommendation)

        return recommendations
    def _extract_executive_summary(self, content: str) -> str:
        """Extract the executive summary."""
        lines = content.split('\n')
        in_summary = False
        summary_lines = []

        for line in lines:
            if '执行摘要' in line and line.startswith('#'):  # "执行摘要" = "Executive Summary" heading
                in_summary = True
                continue

            if in_summary:
                if line.startswith('#'):
                    break
                summary_lines.append(line)

        return '\n'.join(summary_lines).strip()
    def _parse_main_findings(self, content: str) -> List[ReportSection]:
        """Parse the main findings."""
        # Like _parse_report_sections, but scoped to the "主要发现" ("Main Findings") section
        # Simplified implementation: not yet extracted separately
        return []

    def _extract_overall_insights(self, content: str) -> List[KeyInsight]:
        """Extract overall insights."""
        # Like _extract_key_insights, but scoped to the "综合洞察" ("Overall Insights") section
        return []

    def _extract_final_recommendations(self, content: str) -> List[str]:
        """Extract the final recommendations."""
        # Like _extract_recommendations
        return []
    def _extract_urls_from_text(self, text: str) -> List[str]:
        """Extract URLs from text."""
        # Simple URL extraction; a match stops at whitespace or a closing
        # parenthesis (ASCII or full-width)
        url_pattern = r'https?://[^\s)）]+|www\.[^\s)）]+'
        urls = re.findall(url_pattern, text)

        # Clean the URLs
        cleaned_urls = []
        for url in urls:
            # Strip trailing punctuation
            url = url.rstrip('.,;:!?)）')
            if url:
                cleaned_urls.append(url)

        return cleaned_urls
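
    # Example: _extract_urls_from_text("详见 https://example.com/a, 另见 www.example.org.")
    # returns ["https://example.com/a", "www.example.org"]: the character class stops a
    # match at whitespace or a closing parenthesis, and rstrip drops trailing punctuation.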
    def _count_total_sources(self, subtopic_reports: List[SubtopicReport]) -> int:
        """Count the unique sources across all subtopic reports."""
        all_urls = set()

        for report in subtopic_reports:
            for section in report.sections:
                all_urls.update(section.sources)

            for insight in report.key_insights:
                all_urls.update(insight.source_urls)

        return len(all_urls)
    def _count_total_searches(self, session: ResearchSession) -> int:
        """Count the total number of searches executed across all subtopics."""
        if not session.outline:
            return 0

        total = 0
        for subtopic in session.outline.sub_topics:
            total += subtopic.get_total_searches()

        return total
    def _generate_methodology(self, session: ResearchSession) -> str:
        """Generate the methodology description (report content, in Chinese)."""
        # The numbered steps describe: question analysis, research planning,
        # information gathering (Tavily searches), quality control, and synthesis
        methodology = f"""
本研究采用系统化的深度研究方法,具体流程如下:

1. **问题分析**: 识别问题类型为"{session.question_type.value if session.question_type else '未知'}",并细化为{len(session.refined_questions)}个具体问题。

2. **研究规划**: 制定包含{len(session.outline.sub_topics) if session.outline else 0}个子主题的研究大纲,每个子主题根据重要性分配不同的搜索资源。

3. **信息收集**: 使用Tavily搜索引擎进行多轮搜索,共执行{self._count_total_searches(session)}次搜索。

4. **质量控制**: 通过AI评估搜索结果重要性,并进行幻觉检测和内容验证。

5. **综合分析**: 整合所有信息,提炼关键洞察,形成结构化报告。
"""
        return methodology.strip()
    def _identify_limitations(self, session: ResearchSession) -> List[str]:
        """Identify the limitations of the research (report content, in Chinese)."""
        limitations = [
            "搜索结果受限于公开可访问的网络信息",  # limited to publicly accessible web information
            "部分专业领域可能缺乏深度分析",        # specialist areas may lack in-depth analysis
            "时效性信息可能存在延迟",              # time-sensitive information may lag
        ]

        # Flag unfinished work based on the actual session state
        if session.outline and any(st.status == "cancelled" for st in session.outline.sub_topics):
            limitations.append("部分子主题研究未完成")  # some subtopic research was not completed

        return limitations
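

# Minimal smoke test for the text-only helpers -- a sketch that assumes the project
# environment (the app.models imports must resolve). It deliberately avoids the
# model classes, which need a live research session; the demo data is hypothetical.
if __name__ == "__main__":
    generator = ReportGenerator()

    demo_hallucinations = [{
        "type": "虚构引用",                        # hypothetical entry: fabricated citation
        "url": "https://example.com/source",
        "content": "……",
        "explanation": "该来源未出现在搜索结果中",  # source absent from search results
    }]
    print(generator.create_hallucination_report(demo_hallucinations))

    print(generator._extract_urls_from_text(
        "详见 https://example.com/a, 另见 www.example.org."
    ))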