125 lines
4.0 KiB
Python
125 lines
4.0 KiB
Python
# modules/webpage_extractor.py - 网页内容提取模块
|
||
|
||
import httpx
|
||
import json
|
||
from typing import Dict, Any, List, Union, Tuple
|
||
from utils.logger import setup_logger
|
||
|
||
logger = setup_logger(__name__)
|
||
|
||
async def tavily_extract(urls: Union[str, List[str]], api_key: str, extract_depth: str = "basic", max_urls: int = 1) -> Dict[str, Any]:
    """
    Perform a Tavily webpage-content extraction request.

    Args:
        urls: URL(s) to extract — a single string or a list of strings
        api_key: Tavily API key
        extract_depth: extraction depth ("basic" or "advanced")
        max_urls: maximum number of URLs to submit in one request

    Returns:
        The parsed JSON response from the API on success, or a dict with a
        single "error" key describing the failure.
    """
    if not api_key:
        return {"error": "Tavily API密钥未配置"}

    # Normalize a single URL string into a list.
    if isinstance(urls, str):
        urls = [urls]

    # Enforce the caller-specified URL cap.
    urls = urls[:max_urls]

    # Guard against an empty URL list (caller passed [] or max_urls <= 0)
    # so we don't issue a pointless API request with "urls": [].
    if not urls:
        return {"error": "未提供任何URL"}

    try:
        async with httpx.AsyncClient() as client:
            response = await client.post(
                "https://api.tavily.com/extract",
                json={
                    "urls": urls,
                    "extract_depth": extract_depth,
                    "include_images": False,
                },
                headers={
                    "Authorization": f"Bearer {api_key}",
                    "Content-Type": "application/json",
                },
                timeout=60,
            )

            if response.status_code == 200:
                return response.json()
            else:
                return {"error": f"API请求失败: HTTP {response.status_code}"}

    except httpx.TimeoutException:
        return {"error": "请求超时,网页响应过慢"}
    except httpx.RequestError as e:
        return {"error": f"网络请求错误: {str(e)}"}
    except Exception as e:
        # Unexpected failure (e.g. malformed JSON body): log it, then return
        # the same error-dict shape the callers already handle.
        logger.error(f"网页提取异常: {e}")
        return {"error": f"提取异常: {str(e)}"}
|
||
|
||
|
||
def format_extract_results(results: Dict[str, Any]) -> str:
    """
    Format extraction results into a concise human-readable string.

    Args:
        results: the dict returned by tavily_extract (either an API response
            with "results" / "failed_results" lists, or an {"error": ...} dict)

    Returns:
        A formatted content string, or an error/empty notice.
    """
    if "error" in results:
        return f"❌ 提取失败: {results['error']}"

    if not results.get("results"):
        return "❌ 未能提取到任何内容"

    formatted_parts = []

    # Successfully extracted entries.
    for result in results["results"]:
        url = result.get("url", "N/A")
        # The API may return raw_content as None (key present but null);
        # .get(..., "") would still yield None, so coerce before strip().
        raw_content = (result.get("raw_content") or "").strip()

        if raw_content:
            content_length = len(raw_content)
            formatted_parts.append(f"🌐 网页内容 ({content_length} 字符):")
            formatted_parts.append(f"📍 URL: {url}")
            formatted_parts.append("=" * 50)
            formatted_parts.append(raw_content)
            formatted_parts.append("=" * 50)
        else:
            formatted_parts.append(f"⚠️ URL {url} 提取到空内容")

    # Failed URLs, if the API reported any.
    if results.get("failed_results"):
        formatted_parts.append("\n❌ 提取失败的URL:")
        for failed in results["failed_results"]:
            formatted_parts.append(f"- {failed.get('url', 'N/A')}: {failed.get('error', '未知错误')}")

    return "\n".join(formatted_parts)
|
||
|
||
|
||
async def extract_webpage_content(urls: Union[str, List[str]], api_key: str, extract_depth: str = "basic", max_urls: int = 1) -> Tuple[str, str]:
    """
    Run the full webpage extraction pipeline: fetch via the Tavily API,
    then format the response for display.

    Args:
        urls: URL(s) to extract — a single string or a list of strings
        api_key: Tavily API key
        extract_depth: extraction depth ("basic" or "advanced")
        max_urls: maximum number of URLs to extract

    Returns:
        (content, content) — the same formatted string twice; kept as a
        pair for backward compatibility with callers expecting two values.
    """
    raw_results = await tavily_extract(urls, api_key, extract_depth, max_urls)
    content = format_extract_results(raw_results)
    # Simplified version: no long/short distinction, so duplicate the string.
    return content, content