From 785383062411d274d46abe357de9187bb1c62de9 Mon Sep 17 00:00:00 2001 From: JOJO <1498581755@qq.com> Date: Thu, 1 Jan 2026 18:56:29 +0800 Subject: [PATCH] feat: improve vlm workflow and workspace image serving --- core/main_terminal.py | 16 ++++----- core/tool_config.py | 2 +- modules/ocr_client.py | 14 +++++--- prompts/main_system.txt | 9 +++++ static/src/stores/monitor.ts | 1 + .../styles/components/chat/_chat-area.scss | 2 ++ static/src/utils/chatDisplay.ts | 2 ++ static/src/utils/icons.ts | 1 + utils/tool_result_formatter.py | 11 +++++-- web_server.py | 33 +++++++++++++++++++ 10 files changed, 74 insertions(+), 17 deletions(-) diff --git a/core/main_terminal.py b/core/main_terminal.py index 4479611..d769d7f 100644 --- a/core/main_terminal.py +++ b/core/main_terminal.py @@ -748,8 +748,8 @@ class MainTerminal: print(f"{OUTPUT_FORMATS['file']} 创建文件") elif tool_name == "read_file": print(f"{OUTPUT_FORMATS['file']} 读取文件") - elif tool_name == "ocr_image": - print(f"{OUTPUT_FORMATS['file']} 图片OCR") + elif tool_name in {"vlm_analyze", "ocr_image"}: + print(f"{OUTPUT_FORMATS['file']} VLM 视觉理解") elif tool_name == "write_file_diff": print(f"{OUTPUT_FORMATS['file']} 应用补丁") elif tool_name == "delete_file": @@ -1228,13 +1228,13 @@ class MainTerminal: { "type": "function", "function": { - "name": "ocr_image", - "description": "使用 Qwen3-VL模型 读取图片中的文字或根据提示生成描述,仅支持本地图片路径。", + "name": "vlm_analyze", + "description": "使用大参数视觉语言模型(Qwen-VL模型)理解图片:文字、物体、布局、表格等,仅支持本地路径。", "parameters": { "type": "object", "properties": self._inject_intent({ - "path": {"type": "string", "description": "项目内的图片路径"}, - "prompt": {"type": "string", "description": "传递给 OCR 模型的提示词,如“请识别图片中的文字”,“图中的手机是什么颜色的”必须使用中文提示词。"} + "path": {"type": "string", "description": "项目内的图片相对路径"}, + "prompt": {"type": "string", "description": "传递给 VLM 的中文提示词,如“请总结这张图的内容”“表格的总金额是多少”“图中是什么车?”。"} }), "required": ["path", "prompt"] } @@ -1743,12 +1743,12 @@ class MainTerminal: try: if tool_name == "read_file": result = 
self._handle_read_tool(arguments) - elif tool_name == "ocr_image": + elif tool_name in {"vlm_analyze", "ocr_image"}: path = arguments.get("path") prompt = arguments.get("prompt") if not path: return json.dumps({"success": False, "error": "缺少 path 参数", "warnings": []}, ensure_ascii=False) - result = self.ocr_client.ocr_image(path=path, prompt=prompt or "") + result = self.ocr_client.vlm_analyze(path=path, prompt=prompt or "") # 终端会话管理工具 elif tool_name == "terminal_session": diff --git a/core/tool_config.py b/core/tool_config.py index 0680047..94c1672 100644 --- a/core/tool_config.py +++ b/core/tool_config.py @@ -40,7 +40,7 @@ TOOL_CATEGORIES: Dict[str, ToolCategory] = { ), "read_focus": ToolCategory( label="阅读聚焦", - tools=["read_file", "focus_file", "unfocus_file", "ocr_image"], + tools=["read_file", "focus_file", "unfocus_file", "vlm_analyze", "ocr_image"], ), "terminal_realtime": ToolCategory( label="实时终端", diff --git a/modules/ocr_client.py b/modules/ocr_client.py index 3ad7537..455f27d 100644 --- a/modules/ocr_client.py +++ b/modules/ocr_client.py @@ -1,4 +1,4 @@ -"""DeepSeek-OCR 客户端(主智能体专用)。""" +"""视觉语言模型客户端(主智能体专用)。""" import base64 import mimetypes @@ -13,7 +13,7 @@ from modules.file_manager import FileManager class OCRClient: - """封装 DeepSeek-OCR 调用逻辑。""" + """封装 VLM(如 DeepSeek-OCR / Qwen-VL)调用逻辑。""" def __init__(self, project_path: str, file_manager: FileManager): self.project_path = Path(project_path).resolve() @@ -48,8 +48,8 @@ class OCRClient: return False, "不是文件", None return True, "", full_path - def ocr_image(self, path: str, prompt: str) -> Dict: - """执行 OCR,返回最简结果格式。""" + def vlm_analyze(self, path: str, prompt: str) -> Dict: + """使用大参数视觉语言模型分析图片:文字、物体、布局等。""" warnings: List[str] = [] valid, error, full_path = self._validate_image_path(path) @@ -101,4 +101,8 @@ class OCRClient: content = response.choices[0].message.content if response.choices else "" return {"success": True, "content": content or "", "warnings": warnings} except Exception as exc: - 
return {"success": False, "error": f"OCR 调用失败: {exc}", "warnings": warnings} + return {"success": False, "error": f"VLM 调用失败: {exc}", "warnings": warnings} + + def ocr_image(self, path: str, prompt: str) -> Dict: + """兼容旧名,转发到 vlm_analyze。""" + return self.vlm_analyze(path, prompt) diff --git a/prompts/main_system.txt b/prompts/main_system.txt index 66d678b..4e1fcca 100644 --- a/prompts/main_system.txt +++ b/prompts/main_system.txt @@ -8,6 +8,7 @@ - **数据整理**:处理表格、分析数据、生成报告 - **文件管理**:创建、修改、重命名文件和文件夹 - **自动化任务**:批量处理文件、执行重复性工作 +- **视觉理解**:用 `vlm_analyze` 调用大参数 VLM(基于 Qwen-VL),识别图片中文字/物体/表格/场景并回答相关问题,不仅仅是 OCR。 ## 图片展示 - 如果需要直接在界面展示图片(本地或网络),请在回复里输出 ``,不用调用工具。 @@ -18,6 +19,14 @@ - `` - `` +### 图片检索与展示流程 +- 触发:用户询问“X长什么样”“给我看X的图片”等需求时。 +- 检索:用 `web_search` 搜索关键词(必要时添加“图片/照片/截图”等),挑选相关度最高的结果。 +- 提取:对候选链接使用 `extract_webpage` 获取正文中的图片直链,优先 `https`、扩展名为 jpg/png/webp、分辨率≥800px 的原图,避开缩略图和水印预览。 +- 本地/校验:已有本地图片时直接展示;若网上图片是否匹配存疑,先下载并用 `vlm_analyze`(VLM 视觉理解)查看内容后再确定是否展示。 +- 展示:选数张代表性图片,直接输出 ``;需要多张时多行重复该标签。 +- 回退:用户反馈“看不到/无法展示”时,先将图片下载到可访问路径(如 `/workspace/cache/xxx.jpg`)再用本地路径展示;仍失败则提供文字描述并询问是否换图源。 + ## 重要提醒:你的工作环境 1. **云端运行**:你在远程服务器上工作,在网页端和用户交互 2. 
**多人共用**:服务器上可能有其他用户,你只能访问被授权的文件夹 diff --git a/static/src/stores/monitor.ts b/static/src/stores/monitor.ts index 7f0aab8..2562d34 100644 --- a/static/src/stores/monitor.ts +++ b/static/src/stores/monitor.ts @@ -96,6 +96,7 @@ const TOOL_SCENE_MAP: Record = { read_file: 'reader', focus_file: 'focus', unfocus_file: 'unfocus', + vlm_analyze: 'ocr', ocr_image: 'ocr', create_folder: 'createFolder', create_file: 'createFile', diff --git a/static/src/styles/components/chat/_chat-area.scss b/static/src/styles/components/chat/_chat-area.scss index f58e565..b5ae702 100644 --- a/static/src/styles/components/chat/_chat-area.scss +++ b/static/src/styles/components/chat/_chat-area.scss @@ -392,6 +392,8 @@ display: flex; flex-direction: column; gap: 2px; + user-select: none; + -webkit-user-select: none; } .more-title { diff --git a/static/src/utils/chatDisplay.ts b/static/src/utils/chatDisplay.ts index 2c0e4ca..a8b852b 100644 --- a/static/src/utils/chatDisplay.ts +++ b/static/src/utils/chatDisplay.ts @@ -15,6 +15,7 @@ const RUNNING_ANIMATIONS: Record = { web_search: 'search-animation', extract_webpage: 'search-animation', save_webpage: 'file-animation', + vlm_analyze: 'file-animation', run_python: 'code-animation', run_command: 'terminal-animation', update_memory: 'memory-animation', @@ -66,6 +67,7 @@ const COMPLETED_STATUS_TEXTS: Record = { web_search: '搜索完成', extract_webpage: '网页提取完成', save_webpage: '网页保存完成(纯文本)', + vlm_analyze: '图片解析完成', run_python: '代码执行完成', run_command: '命令执行完成', update_memory: '记忆更新成功', diff --git a/static/src/utils/icons.ts b/static/src/utils/icons.ts index f880b58..5936424 100644 --- a/static/src/utils/icons.ts +++ b/static/src/utils/icons.ts @@ -50,6 +50,7 @@ export const TOOL_ICON_MAP = Object.freeze({ focus_file: 'eye', modify_file: 'pencil', write_file_diff: 'pencil', + vlm_analyze: 'camera', ocr_image: 'camera', read_file: 'book', rename_file: 'pencil', diff --git a/utils/tool_result_formatter.py b/utils/tool_result_formatter.py index ae179fd..e0e6e84 
100644 --- a/utils/tool_result_formatter.py +++ b/utils/tool_result_formatter.py @@ -570,18 +570,22 @@ def _format_extract_webpage(result_data: Dict[str, Any]) -> str: return "\n".join([f"{header}{note}", "```", preview, "```"]) -def _format_ocr_image(result_data: Dict[str, Any]) -> str: +def _format_vlm_analyze(result_data: Dict[str, Any]) -> str: if not result_data.get("success"): - return _format_failure("ocr_image", result_data) + return _format_failure("vlm_analyze", result_data) content = result_data.get("content") or "" length = len(content) preview, truncated = _preview_text(content, 800) note = "(截断预览)" if truncated else "(未截断)" - header = f"OCR 完成,长度 {length} 字符{note}" + header = f"VLM 解析完成,长度 {length} 字符{note}" if not content: return f"{header};未返回可识别文本。" return "\n".join([header, "```", preview, "```"]) +# 兼容旧名 +def _format_ocr_image(result_data: Dict[str, Any]) -> str: + return _format_vlm_analyze(result_data) + def _format_trigger_easter_egg(result_data: Dict[str, Any]) -> str: if not result_data.get("success"): @@ -657,6 +661,7 @@ TOOL_FORMATTERS = { "run_command": _format_run_command, "run_python": _format_run_python, "extract_webpage": _format_extract_webpage, + "vlm_analyze": _format_vlm_analyze, "ocr_image": _format_ocr_image, "trigger_easter_egg": _format_trigger_easter_egg, "todo_create": _format_todo_create, diff --git a/web_server.py b/web_server.py index 677528e..2579d3a 100644 --- a/web_server.py +++ b/web_server.py @@ -25,6 +25,7 @@ from werkzeug.routing import BaseConverter import secrets import logging import hmac +import mimetypes # 控制台输出策略:默认静默,只保留简要事件 _ORIGINAL_PRINT = print @@ -1270,6 +1271,38 @@ def serve_user_upload(filename: str): return send_from_directory(str(uploads_dir), str(target.relative_to(uploads_dir))) +@app.route('/workspace/<path:filename>') +@login_required +def serve_workspace_file(filename: str): + """ + 暴露当前登录用户项目目录下的文件(主要用于图片展示)。 + - 仅登录用户可访问自己的项目文件 + - 路径穿越校验:目标必须位于用户自己的 project_path 内 + - 非图片直接拒绝,避免误暴露其他文件 + """ + user = 
get_current_user_record() + if not user: + return redirect('/login') + + workspace = user_manager.ensure_user_workspace(user.username) + project_root = workspace.project_path.resolve() + + target = (project_root / filename).resolve() + try: + target.relative_to(project_root) + except ValueError: + abort(403) + + if not target.exists() or not target.is_file(): + abort(404) + + mime_type, _ = mimetypes.guess_type(str(target)) + if not mime_type or not mime_type.startswith("image/"): + abort(415) + + return send_from_directory(str(target.parent), target.name) + + @app.route('/static/<path:filename>') def static_files(filename): """提供静态文件"""