feat: improve vlm workflow and workspace image serving
This commit is contained in:
parent
8ead6ebc08
commit
7853830624
@ -748,8 +748,8 @@ class MainTerminal:
|
|||||||
print(f"{OUTPUT_FORMATS['file']} 创建文件")
|
print(f"{OUTPUT_FORMATS['file']} 创建文件")
|
||||||
elif tool_name == "read_file":
|
elif tool_name == "read_file":
|
||||||
print(f"{OUTPUT_FORMATS['file']} 读取文件")
|
print(f"{OUTPUT_FORMATS['file']} 读取文件")
|
||||||
elif tool_name == "ocr_image":
|
elif tool_name in {"vlm_analyze", "ocr_image"}:
|
||||||
print(f"{OUTPUT_FORMATS['file']} 图片OCR")
|
print(f"{OUTPUT_FORMATS['file']} VLM 视觉理解")
|
||||||
elif tool_name == "write_file_diff":
|
elif tool_name == "write_file_diff":
|
||||||
print(f"{OUTPUT_FORMATS['file']} 应用补丁")
|
print(f"{OUTPUT_FORMATS['file']} 应用补丁")
|
||||||
elif tool_name == "delete_file":
|
elif tool_name == "delete_file":
|
||||||
@ -1228,13 +1228,13 @@ class MainTerminal:
|
|||||||
{
|
{
|
||||||
"type": "function",
|
"type": "function",
|
||||||
"function": {
|
"function": {
|
||||||
"name": "ocr_image",
|
"name": "vlm_analyze",
|
||||||
"description": "使用 Qwen3-VL模型 读取图片中的文字或根据提示生成描述,仅支持本地图片路径。",
|
"description": "使用大参数视觉语言模型(Qwen-VL模型)理解图片:文字、物体、布局、表格等,仅支持本地路径。",
|
||||||
"parameters": {
|
"parameters": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": self._inject_intent({
|
"properties": self._inject_intent({
|
||||||
"path": {"type": "string", "description": "项目内的图片路径"},
|
"path": {"type": "string", "description": "项目内的图片相对路径"},
|
||||||
"prompt": {"type": "string", "description": "传递给 OCR 模型的提示词,如“请识别图片中的文字”,“图中的手机是什么颜色的”必须使用中文提示词。"}
|
"prompt": {"type": "string", "description": "传递给 VLM 的中文提示词,如“请总结这张图的内容”“表格的总金额是多少”“图中是什么车?”。"}
|
||||||
}),
|
}),
|
||||||
"required": ["path", "prompt"]
|
"required": ["path", "prompt"]
|
||||||
}
|
}
|
||||||
@ -1743,12 +1743,12 @@ class MainTerminal:
|
|||||||
try:
|
try:
|
||||||
if tool_name == "read_file":
|
if tool_name == "read_file":
|
||||||
result = self._handle_read_tool(arguments)
|
result = self._handle_read_tool(arguments)
|
||||||
elif tool_name == "ocr_image":
|
elif tool_name in {"vlm_analyze", "ocr_image"}:
|
||||||
path = arguments.get("path")
|
path = arguments.get("path")
|
||||||
prompt = arguments.get("prompt")
|
prompt = arguments.get("prompt")
|
||||||
if not path:
|
if not path:
|
||||||
return json.dumps({"success": False, "error": "缺少 path 参数", "warnings": []}, ensure_ascii=False)
|
return json.dumps({"success": False, "error": "缺少 path 参数", "warnings": []}, ensure_ascii=False)
|
||||||
result = self.ocr_client.ocr_image(path=path, prompt=prompt or "")
|
result = self.ocr_client.vlm_analyze(path=path, prompt=prompt or "")
|
||||||
|
|
||||||
# 终端会话管理工具
|
# 终端会话管理工具
|
||||||
elif tool_name == "terminal_session":
|
elif tool_name == "terminal_session":
|
||||||
|
|||||||
@ -40,7 +40,7 @@ TOOL_CATEGORIES: Dict[str, ToolCategory] = {
|
|||||||
),
|
),
|
||||||
"read_focus": ToolCategory(
|
"read_focus": ToolCategory(
|
||||||
label="阅读聚焦",
|
label="阅读聚焦",
|
||||||
tools=["read_file", "focus_file", "unfocus_file", "ocr_image"],
|
tools=["read_file", "focus_file", "unfocus_file", "vlm_analyze", "ocr_image"],
|
||||||
),
|
),
|
||||||
"terminal_realtime": ToolCategory(
|
"terminal_realtime": ToolCategory(
|
||||||
label="实时终端",
|
label="实时终端",
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
"""DeepSeek-OCR 客户端(主智能体专用)。"""
|
"""视觉语言模型客户端(主智能体专用)。"""
|
||||||
|
|
||||||
import base64
|
import base64
|
||||||
import mimetypes
|
import mimetypes
|
||||||
@ -13,7 +13,7 @@ from modules.file_manager import FileManager
|
|||||||
|
|
||||||
|
|
||||||
class OCRClient:
|
class OCRClient:
|
||||||
"""封装 DeepSeek-OCR 调用逻辑。"""
|
"""封装 VLM(如 DeepSeek-OCR / Qwen-VL)调用逻辑。"""
|
||||||
|
|
||||||
def __init__(self, project_path: str, file_manager: FileManager):
|
def __init__(self, project_path: str, file_manager: FileManager):
|
||||||
self.project_path = Path(project_path).resolve()
|
self.project_path = Path(project_path).resolve()
|
||||||
@ -48,8 +48,8 @@ class OCRClient:
|
|||||||
return False, "不是文件", None
|
return False, "不是文件", None
|
||||||
return True, "", full_path
|
return True, "", full_path
|
||||||
|
|
||||||
def ocr_image(self, path: str, prompt: str) -> Dict:
|
def vlm_analyze(self, path: str, prompt: str) -> Dict:
|
||||||
"""执行 OCR,返回最简结果格式。"""
|
"""使用大参数视觉语言模型分析图片:文字、物体、布局等。"""
|
||||||
warnings: List[str] = []
|
warnings: List[str] = []
|
||||||
|
|
||||||
valid, error, full_path = self._validate_image_path(path)
|
valid, error, full_path = self._validate_image_path(path)
|
||||||
@ -101,4 +101,8 @@ class OCRClient:
|
|||||||
content = response.choices[0].message.content if response.choices else ""
|
content = response.choices[0].message.content if response.choices else ""
|
||||||
return {"success": True, "content": content or "", "warnings": warnings}
|
return {"success": True, "content": content or "", "warnings": warnings}
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
return {"success": False, "error": f"OCR 调用失败: {exc}", "warnings": warnings}
|
return {"success": False, "error": f"VLM 调用失败: {exc}", "warnings": warnings}
|
||||||
|
|
||||||
|
def ocr_image(self, path: str, prompt: str) -> Dict:
|
||||||
|
"""兼容旧名,转发到 vlm_analyze。"""
|
||||||
|
return self.vlm_analyze(path, prompt)
|
||||||
|
|||||||
@ -8,6 +8,7 @@
|
|||||||
- **数据整理**:处理表格、分析数据、生成报告
|
- **数据整理**:处理表格、分析数据、生成报告
|
||||||
- **文件管理**:创建、修改、重命名文件和文件夹
|
- **文件管理**:创建、修改、重命名文件和文件夹
|
||||||
- **自动化任务**:批量处理文件、执行重复性工作
|
- **自动化任务**:批量处理文件、执行重复性工作
|
||||||
|
- **视觉理解**:用 `vlm_analyze` 调用大参数 VLM(基于 Qwen-VL),识别图片中文字/物体/表格/场景并回答相关问题,不仅仅是 OCR。
|
||||||
|
|
||||||
## 图片展示
|
## 图片展示
|
||||||
- 如果需要直接在界面展示图片(本地或网络),请在回复里输出 `<show_image src="路径" alt="描述" />`,不用调用工具。
|
- 如果需要直接在界面展示图片(本地或网络),请在回复里输出 `<show_image src="路径" alt="描述" />`,不用调用工具。
|
||||||
@ -18,6 +19,14 @@
|
|||||||
- `<show_image src="/workspace/cache/thumb.jpg" />`
|
- `<show_image src="/workspace/cache/thumb.jpg" />`
|
||||||
- `<show_image src="https://example.com/demo.png" alt="官方示例截图" />`
|
- `<show_image src="https://example.com/demo.png" alt="官方示例截图" />`
|
||||||
|
|
||||||
|
### 图片检索与展示流程
|
||||||
|
- 触发:用户询问“X长什么样”“给我看X的图片”等需求时。
|
||||||
|
- 检索:用 `web_search` 搜索关键词(必要时添加“图片/照片/截图”等),挑选相关度最高的结果。
|
||||||
|
- 提取:对候选链接使用 `extract_webpage` 获取正文中的图片直链,优先 `https`、扩展名为 jpg/png/webp、分辨率≥800px 的原图,避开缩略图和水印预览。
|
||||||
|
- 本地/校验:已有本地图片时直接展示;若不确定网上图片是否与需求匹配,先下载到本地,再用 `vlm_analyze`(VLM 视觉理解,仅支持本地路径)查看内容,确认后再决定是否展示。
|
||||||
|
- 展示:选数张代表性图片,直接输出 `<show_image src="直链或本地路径" alt="简短描述" />`;需要多张时多行重复该标签。
|
||||||
|
- 回退:用户反馈“看不到/无法展示”时,先将图片下载到可访问路径(如 `/workspace/cache/xxx.jpg`)再用本地路径展示;仍失败则提供文字描述并询问是否换图源。
|
||||||
|
|
||||||
## 重要提醒:你的工作环境
|
## 重要提醒:你的工作环境
|
||||||
1. **云端运行**:你在远程服务器上工作,在网页端和用户交互
|
1. **云端运行**:你在远程服务器上工作,在网页端和用户交互
|
||||||
2. **多人共用**:服务器上可能有其他用户,你只能访问被授权的文件夹
|
2. **多人共用**:服务器上可能有其他用户,你只能访问被授权的文件夹
|
||||||
|
|||||||
@ -96,6 +96,7 @@ const TOOL_SCENE_MAP: Record<string, string> = {
|
|||||||
read_file: 'reader',
|
read_file: 'reader',
|
||||||
focus_file: 'focus',
|
focus_file: 'focus',
|
||||||
unfocus_file: 'unfocus',
|
unfocus_file: 'unfocus',
|
||||||
|
vlm_analyze: 'ocr',
|
||||||
ocr_image: 'ocr',
|
ocr_image: 'ocr',
|
||||||
create_folder: 'createFolder',
|
create_folder: 'createFolder',
|
||||||
create_file: 'createFile',
|
create_file: 'createFile',
|
||||||
|
|||||||
@ -392,6 +392,8 @@
|
|||||||
display: flex;
|
display: flex;
|
||||||
flex-direction: column;
|
flex-direction: column;
|
||||||
gap: 2px;
|
gap: 2px;
|
||||||
|
user-select: none;
|
||||||
|
-webkit-user-select: none;
|
||||||
}
|
}
|
||||||
|
|
||||||
.more-title {
|
.more-title {
|
||||||
|
|||||||
@ -15,6 +15,7 @@ const RUNNING_ANIMATIONS: Record<string, string> = {
|
|||||||
web_search: 'search-animation',
|
web_search: 'search-animation',
|
||||||
extract_webpage: 'search-animation',
|
extract_webpage: 'search-animation',
|
||||||
save_webpage: 'file-animation',
|
save_webpage: 'file-animation',
|
||||||
|
vlm_analyze: 'file-animation',
|
||||||
run_python: 'code-animation',
|
run_python: 'code-animation',
|
||||||
run_command: 'terminal-animation',
|
run_command: 'terminal-animation',
|
||||||
update_memory: 'memory-animation',
|
update_memory: 'memory-animation',
|
||||||
@ -66,6 +67,7 @@ const COMPLETED_STATUS_TEXTS: Record<string, string> = {
|
|||||||
web_search: '搜索完成',
|
web_search: '搜索完成',
|
||||||
extract_webpage: '网页提取完成',
|
extract_webpage: '网页提取完成',
|
||||||
save_webpage: '网页保存完成(纯文本)',
|
save_webpage: '网页保存完成(纯文本)',
|
||||||
|
vlm_analyze: '图片解析完成',
|
||||||
run_python: '代码执行完成',
|
run_python: '代码执行完成',
|
||||||
run_command: '命令执行完成',
|
run_command: '命令执行完成',
|
||||||
update_memory: '记忆更新成功',
|
update_memory: '记忆更新成功',
|
||||||
|
|||||||
@ -50,6 +50,7 @@ export const TOOL_ICON_MAP = Object.freeze({
|
|||||||
focus_file: 'eye',
|
focus_file: 'eye',
|
||||||
modify_file: 'pencil',
|
modify_file: 'pencil',
|
||||||
write_file_diff: 'pencil',
|
write_file_diff: 'pencil',
|
||||||
|
vlm_analyze: 'camera',
|
||||||
ocr_image: 'camera',
|
ocr_image: 'camera',
|
||||||
read_file: 'book',
|
read_file: 'book',
|
||||||
rename_file: 'pencil',
|
rename_file: 'pencil',
|
||||||
|
|||||||
@ -570,18 +570,22 @@ def _format_extract_webpage(result_data: Dict[str, Any]) -> str:
|
|||||||
return "\n".join([f"{header}{note}", "```", preview, "```"])
|
return "\n".join([f"{header}{note}", "```", preview, "```"])
|
||||||
|
|
||||||
|
|
||||||
def _format_ocr_image(result_data: Dict[str, Any]) -> str:
|
def _format_vlm_analyze(result_data: Dict[str, Any]) -> str:
|
||||||
if not result_data.get("success"):
|
if not result_data.get("success"):
|
||||||
return _format_failure("ocr_image", result_data)
|
return _format_failure("vlm_analyze", result_data)
|
||||||
content = result_data.get("content") or ""
|
content = result_data.get("content") or ""
|
||||||
length = len(content)
|
length = len(content)
|
||||||
preview, truncated = _preview_text(content, 800)
|
preview, truncated = _preview_text(content, 800)
|
||||||
note = "(截断预览)" if truncated else "(未截断)"
|
note = "(截断预览)" if truncated else "(未截断)"
|
||||||
header = f"OCR 完成,长度 {length} 字符{note}"
|
header = f"VLM 解析完成,长度 {length} 字符{note}"
|
||||||
if not content:
|
if not content:
|
||||||
return f"{header};未返回可识别文本。"
|
return f"{header};未返回可识别文本。"
|
||||||
return "\n".join([header, "```", preview, "```"])
|
return "\n".join([header, "```", preview, "```"])
|
||||||
|
|
||||||
|
# 兼容旧名
|
||||||
|
def _format_ocr_image(result_data: Dict[str, Any]) -> str:
|
||||||
|
return _format_vlm_analyze(result_data)
|
||||||
|
|
||||||
|
|
||||||
def _format_trigger_easter_egg(result_data: Dict[str, Any]) -> str:
|
def _format_trigger_easter_egg(result_data: Dict[str, Any]) -> str:
|
||||||
if not result_data.get("success"):
|
if not result_data.get("success"):
|
||||||
@ -657,6 +661,7 @@ TOOL_FORMATTERS = {
|
|||||||
"run_command": _format_run_command,
|
"run_command": _format_run_command,
|
||||||
"run_python": _format_run_python,
|
"run_python": _format_run_python,
|
||||||
"extract_webpage": _format_extract_webpage,
|
"extract_webpage": _format_extract_webpage,
|
||||||
|
"vlm_analyze": _format_vlm_analyze,
|
||||||
"ocr_image": _format_ocr_image,
|
"ocr_image": _format_ocr_image,
|
||||||
"trigger_easter_egg": _format_trigger_easter_egg,
|
"trigger_easter_egg": _format_trigger_easter_egg,
|
||||||
"todo_create": _format_todo_create,
|
"todo_create": _format_todo_create,
|
||||||
|
|||||||
@ -25,6 +25,7 @@ from werkzeug.routing import BaseConverter
|
|||||||
import secrets
|
import secrets
|
||||||
import logging
|
import logging
|
||||||
import hmac
|
import hmac
|
||||||
|
import mimetypes
|
||||||
|
|
||||||
# 控制台输出策略:默认静默,只保留简要事件
|
# 控制台输出策略:默认静默,只保留简要事件
|
||||||
_ORIGINAL_PRINT = print
|
_ORIGINAL_PRINT = print
|
||||||
@ -1270,6 +1271,38 @@ def serve_user_upload(filename: str):
|
|||||||
return send_from_directory(str(uploads_dir), str(target.relative_to(uploads_dir)))
|
return send_from_directory(str(uploads_dir), str(target.relative_to(uploads_dir)))
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/workspace/<path:filename>')
@login_required
def serve_workspace_file(filename: str):
    """Serve an image from the logged-in user's project directory.

    Intended mainly for displaying workspace images in the web UI.

    Args:
        filename: Path of the requested file, relative to the user's
            project root (taken from the URL).

    Returns:
        A redirect to ``/login`` when no current user record is found,
        otherwise the file response with hardening headers attached.

    Aborts with:
        403: the resolved target lies outside the user's project root.
        404: the target is not an existing regular file.
        415: the file's guessed MIME type is not ``image/*``.
    """
    user = get_current_user_record()
    if not user:
        return redirect('/login')

    workspace = user_manager.ensure_user_workspace(user.username)
    project_root = workspace.project_path.resolve()

    # Resolve first so that ".." segments and symlinks are collapsed before
    # the containment check below.
    target = (project_root / filename).resolve()
    try:
        target.relative_to(project_root)
    except ValueError:
        abort(403)

    # is_file() is already False for a missing path, so no separate exists().
    if not target.is_file():
        abort(404)

    # The type is guessed from the file name only; anything that is not an
    # image is refused to avoid exposing other workspace files.
    mime_type, _ = mimetypes.guess_type(str(target))
    if not mime_type or not mime_type.startswith("image/"):
        abort(415)

    response = send_from_directory(str(target.parent), target.name)
    # Workspace files are user-controlled and "image/*" includes SVG, which
    # can carry script. Forbid MIME sniffing and sandbox the document so a
    # file opened directly cannot run script in this origin; images embedded
    # through <img> are unaffected.
    response.headers["X-Content-Type-Options"] = "nosniff"
    response.headers["Content-Security-Policy"] = (
        "default-src 'none'; style-src 'unsafe-inline'; sandbox"
    )
    return response
|
||||||
|
|
||||||
|
|
||||||
@app.route('/static/<path:filename>')
|
@app.route('/static/<path:filename>')
|
||||||
def static_files(filename):
|
def static_files(filename):
|
||||||
"""提供静态文件"""
|
"""提供静态文件"""
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user