<fix thinking chunk>

This commit is contained in:
JOJO 2025-11-19 20:47:56 +08:00
parent cf4d34e7ac
commit 6921939c13
26 changed files with 20982 additions and 68 deletions

View File

@ -5,9 +5,9 @@ API_KEY = "3e96a682-919d-45c1-acb2-53bc4e9660d3"
MODEL_ID = "kimi-k2-250905" MODEL_ID = "kimi-k2-250905"
# 推理模型配置(智能思考模式使用) # 推理模型配置(智能思考模式使用)
THINKING_API_BASE_URL = "https://ark.cn-beijing.volces.com/api/v3" THINKING_API_BASE_URL = "https://api.moonshot.cn/v1"
THINKING_API_KEY = "3e96a682-919d-45c1-acb2-53bc4e9660d3" THINKING_API_KEY = "sk-xW0xjfQM6Mp9ZCWMLlnHiRJcpEOIZPTkXcN0dQ15xpZSuw2y"
THINKING_MODEL_ID = "kimi-k2-250905" THINKING_MODEL_ID = "kimi-k2-thinking"
# Tavily 搜索 # Tavily 搜索
TAVILY_API_KEY = "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" TAVILY_API_KEY = "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"

View File

@ -3,7 +3,7 @@
OCR_API_BASE_URL = "https://api.siliconflow.cn" OCR_API_BASE_URL = "https://api.siliconflow.cn"
OCR_API_KEY = "sk-suqqgewtlwajjkylvnotdhkzmsrshmrqptkakdqjmlrilaes" OCR_API_KEY = "sk-suqqgewtlwajjkylvnotdhkzmsrshmrqptkakdqjmlrilaes"
OCR_MODEL_ID = "deepseek-ai/DeepSeek-OCR" OCR_MODEL_ID = "deepseek-ai/DeepSeek-OCR"
OCR_MAX_TOKENS = 4096 OCR_MAX_TOKENS = 200
__all__ = [ __all__ = [
"OCR_API_BASE_URL", "OCR_API_BASE_URL",

View File

@ -443,36 +443,18 @@ class PersistentTerminal:
"command": command_text, "command": command_text,
"output": output, "output": output,
"message": message, "message": message,
"duration": round(time.time() - start_time, 3),
"pending_output": status in ("no_output", "awaiting_input", "echo_loop"),
"timeout_used": timeout,
"status": status, "status": status,
"is_interactive": self.is_interactive, "truncated": output_truncated
"echo_loop_detected": self.echo_loop_detected,
"seconds_since_last_output": self._seconds_since_last_output(),
"output_char_count": len(output),
"last_output_time": self.last_output_time,
"output_truncated": output_truncated,
"output_char_limit": TERMINAL_INPUT_MAX_CHARS
} }
else: else:
return { return {
"success": True, "success": True,
"session": self.session_name, "session": self.session_name,
"command": command_text, "command": command_text,
"output": "命令已发送", "output": "",
"message": "命令已发送至终端,后续输出将实时流式返回", "message": "命令已发送至终端,后续输出将实时流式返回",
"duration": round(time.time() - start_time, 3),
"pending_output": True,
"timeout_used": timeout,
"status": "pending", "status": "pending",
"is_interactive": self.is_interactive, "truncated": False
"echo_loop_detected": self.echo_loop_detected,
"seconds_since_last_output": self._seconds_since_last_output(),
"output_char_count": 0,
"last_output_time": self.last_output_time,
"output_truncated": False,
"output_char_limit": TERMINAL_INPUT_MAX_CHARS
} }
except Exception as e: except Exception as e:

View File

@ -12,7 +12,8 @@ try:
CODE_EXECUTION_TIMEOUT, CODE_EXECUTION_TIMEOUT,
TERMINAL_COMMAND_TIMEOUT, TERMINAL_COMMAND_TIMEOUT,
FORBIDDEN_COMMANDS, FORBIDDEN_COMMANDS,
OUTPUT_FORMATS OUTPUT_FORMATS,
MAX_RUN_COMMAND_CHARS
) )
except ImportError: except ImportError:
project_root = Path(__file__).resolve().parents[1] project_root = Path(__file__).resolve().parents[1]
@ -22,7 +23,8 @@ except ImportError:
CODE_EXECUTION_TIMEOUT, CODE_EXECUTION_TIMEOUT,
TERMINAL_COMMAND_TIMEOUT, TERMINAL_COMMAND_TIMEOUT,
FORBIDDEN_COMMANDS, FORBIDDEN_COMMANDS,
OUTPUT_FORMATS OUTPUT_FORMATS,
MAX_RUN_COMMAND_CHARS
) )
class TerminalOperator: class TerminalOperator:
@ -179,10 +181,6 @@ class TerminalOperator:
stdout_text = stdout.decode('utf-8', errors='replace') if stdout else "" stdout_text = stdout.decode('utf-8', errors='replace') if stdout else ""
stderr_text = stderr.decode('utf-8', errors='replace') if stderr else "" stderr_text = stderr.decode('utf-8', errors='replace') if stderr else ""
output = stdout_text
if stderr_text:
output += f"\n[错误输出]\n{stderr_text}"
success = process.returncode == 0 success = process.returncode == 0
if success: if success:
@ -190,14 +188,28 @@ class TerminalOperator:
else: else:
print(f"{OUTPUT_FORMATS['error']} 命令执行失败 (返回码: {process.returncode})") print(f"{OUTPUT_FORMATS['error']} 命令执行失败 (返回码: {process.returncode})")
return { output_parts = []
if stdout_text:
output_parts.append(stdout_text)
if stderr_text:
output_parts.append(f"[stderr]\n{stderr_text}")
combined_output = "\n".join(output_parts)
truncated = False
if MAX_RUN_COMMAND_CHARS and len(combined_output) > MAX_RUN_COMMAND_CHARS:
truncated = True
combined_output = combined_output[-MAX_RUN_COMMAND_CHARS:]
result_payload = {
"success": success, "success": success,
"output": output, "command": command,
"stdout": stdout_text, "output": combined_output,
"stderr": stderr_text,
"return_code": process.returncode, "return_code": process.returncode,
"command": command "truncated": truncated
} }
if stderr_text:
result_payload["stderr"] = stderr_text
return result_payload
except Exception as e: except Exception as e:
return { return {

View File

@ -10,7 +10,7 @@
- **自动化任务**:批量处理文件、执行重复性工作 - **自动化任务**:批量处理文件、执行重复性工作
## 重要提醒:你的工作环境 ## 重要提醒:你的工作环境
1. **云端运行**:你在远程服务器上工作,没有图形界面,只能通过命令行操作 1. **云端运行**:你在远程服务器上工作,在网页端和用户交互
2. **多人共用**:服务器上可能有其他用户,你只能访问被授权的文件夹 2. **多人共用**:服务器上可能有其他用户,你只能访问被授权的文件夹
3. **文件传输**:用户可以在网页上传文件给你,你也可以生成文件让用户下载 3. **文件传输**:用户可以在网页上传文件给你,你也可以生成文件让用户下载
4. **安全第一**:只操作用户明确要求的文件,不要碰其他内容 4. **安全第一**:只操作用户明确要求的文件,不要碰其他内容

View File

@ -1,6 +1,10 @@
你现在处于「智能思考模式」。以下规则仅在本模式下生效: 你现在处于「思考模式」。
思考模式下,第一次请求使用的模型不是 kimi-k2,而是 kimi-k2-thinking(一个更善于分析复杂问题、规划复杂流程的模型);后续请求时,模型会换回 kimi-k2。
请百分百遵循以下原则:
1. **思考阶段** 1. **思考阶段**
至少要思考的内容:
- 先思考分析用户需求:要解决什么问题、现有信息缺口、潜在风险。 - 先思考分析用户需求:要解决什么问题、现有信息缺口、潜在风险。
- 评估是否需要执行以下操作,并说明原因: - 评估是否需要执行以下操作,并说明原因:
* 阅读或聚焦哪些文件?需要查看哪些片段? * 阅读或聚焦哪些文件?需要查看哪些片段?
@ -19,3 +23,5 @@
- 不要一上来就连续执行命令,先让用户看懂你的下一步安排。 - 不要一上来就连续执行命令,先让用户看懂你的下一步安排。
- 若判断无需任何工具或修改,也要明确说明理由。 - 若判断无需任何工具或修改,也要明确说明理由。
- 保持语气专业但亲切,让用户清楚你即将采取的行动。 - 保持语气专业但亲切,让用户清楚你即将采取的行动。
# 你的思考过程在后文中不可见,所以你需要在用户提出要求后的第一次回答中,尽可能在正式输出里详细描述你的规划,为后面 kimi-k2 模型的行动提供参考依据。

View File

@ -2190,12 +2190,24 @@ async function bootstrapApp() {
} }
}, },
getSubAgentBaseUrl() {
const override = window.SUB_AGENT_BASE_URL || window.__SUB_AGENT_BASE_URL__;
if (override && typeof override === 'string') {
return override.replace(/\/$/, '');
}
const { protocol, hostname } = window.location;
if (hostname && hostname.includes('agent.')) {
const mappedHost = hostname.replace('agent.', 'subagent.');
return `${protocol}//${mappedHost}`;
}
return `${protocol}//${hostname}:8092`;
},
openSubAgent(agent) { openSubAgent(agent) {
if (!agent || !agent.task_id) { if (!agent || !agent.task_id) {
return; return;
} }
const { protocol, hostname } = window.location; const base = this.getSubAgentBaseUrl();
const base = `${protocol}//${hostname}:8092`;
const parentConv = agent.conversation_id || this.currentConversationId || ''; const parentConv = agent.conversation_id || this.currentConversationId || '';
const convSegment = this.stripConversationPrefix(parentConv); const convSegment = this.stripConversationPrefix(parentConv);
const agentLabel = agent.agent_id ? `sub_agent${agent.agent_id}` : agent.task_id; const agentLabel = agent.agent_id ? `sub_agent${agent.agent_id}` : agent.task_id;

View File

@ -45,6 +45,7 @@ from core.tool_config import TOOL_CATEGORIES
from utils.api_client import DeepSeekClient from utils.api_client import DeepSeekClient
from utils.context_manager import ContextManager from utils.context_manager import ContextManager
from utils.logger import setup_logger from utils.logger import setup_logger
OCR_ICON = "📸"
logger = setup_logger(__name__) logger = setup_logger(__name__)
# 临时禁用长度检查 # 临时禁用长度检查
@ -610,7 +611,7 @@ class MainTerminal:
elif tool_name == "read_file": elif tool_name == "read_file":
print(f"{OUTPUT_FORMATS['file']} 读取文件") print(f"{OUTPUT_FORMATS['file']} 读取文件")
elif tool_name == "ocr_image": elif tool_name == "ocr_image":
print(f"{OUTPUT_FORMATS['file']} 图片OCR") print(f"{OCR_ICON} 图片OCR")
elif tool_name == "modify_file": elif tool_name == "modify_file":
print(f"{OUTPUT_FORMATS['file']} 修改文件") print(f"{OUTPUT_FORMATS['file']} 修改文件")
elif tool_name == "delete_file": elif tool_name == "delete_file":

View File

@ -2379,18 +2379,27 @@ async function bootstrapApp() {
} }
}, },
getSubAgentBaseUrl() {
const override = window.SUB_AGENT_BASE_URL || window.__SUB_AGENT_BASE_URL__;
if (override && typeof override === 'string') {
return override.replace(/\/$/, '');
}
const { protocol, hostname } = window.location;
if (hostname && hostname.includes('agent.')) {
const mappedHost = hostname.replace('agent.', 'subagent.');
return `${protocol}//${mappedHost}`;
}
return `${protocol}//${hostname}:8092`;
},
openSubAgent(agent) { openSubAgent(agent) {
if (this.isSubAgentView || !agent || !agent.task_id) { if (this.isSubAgentView || !agent || !agent.task_id) {
return; return;
} }
if (!agent || !agent.task_id) {
return;
}
const { protocol, hostname } = window.location;
const parentConv = agent.conversation_id || this.currentConversationId || ''; const parentConv = agent.conversation_id || this.currentConversationId || '';
const convSegment = this.stripConversationPrefix(parentConv); const convSegment = this.stripConversationPrefix(parentConv);
const agentLabel = agent.agent_id ? `sub_agent${agent.agent_id}` : agent.task_id; const agentLabel = agent.agent_id ? `sub_agent${agent.agent_id}` : agent.task_id;
const base = `${protocol}//${hostname}:8092`; const base = this.getSubAgentBaseUrl();
const pathSuffix = convSegment const pathSuffix = convSegment
? `/${convSegment}+${agentLabel}` ? `/${convSegment}+${agentLabel}`
: `/sub_agent/${agent.task_id}`; : `/sub_agent/${agent.task_id}`;

2102
sub_agent/sub_web_server.log Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,764 @@
# 使用Python调用硅基流动DeepSeek-OCR模型教程
## 📖 简介
DeepSeek-OCR是一个强大的视觉语言模型,专门用于光学字符识别(OCR)和文档理解。通过硅基流动(SiliconFlow)平台,你可以轻松通过API调用这个模型来分析图片内容。
## 🎯 模型特点
- **高精度识别**: 在10倍压缩率下能保持97%的识别精度
- **多语言支持**: 支持100+种语言识别
- **多种模式**: 支持文档转Markdown、表格识别、手写识别等
- **高效处理**: 使用视觉token压缩技术,大幅减少token消耗
## 🚀 快速开始
### 1. 准备工作
#### 1.1 注册硅基流动账号并获取API密钥
1. 访问 [硅基流动官网](https://cloud.siliconflow.cn/)
2. 注册账号(新用户可获得2000万免费tokens)
3. 进入"API密钥"页面,点击"新建API密钥"
4. 复制保存你的API Key(格式为`sk-xxxxx`)
#### 1.2 安装必要的Python库
```bash
pip install openai pillow requests
```
### 2. 基础使用示例
#### 2.1 使用本地图片进行OCR识别
```python
"""
示例1: 基础OCR识别 - 识别图片中的文字内容
适用场景: 文档扫描、截图文字提取
"""
import base64
from openai import OpenAI
# 配置API
client = OpenAI(
api_key="sk-your-api-key-here", # 替换为你的API密钥
base_url="https://api.siliconflow.cn/v1"
)
# 将图片转换为base64编码
def encode_image(image_path):
"""读取图片并转换为base64编码"""
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode('utf-8')
# 准备图片
image_path = "your_image.jpg" # 替换为你的图片路径
base64_image = encode_image(image_path)
# 调用API进行OCR识别
response = client.chat.completions.create(
model="deepseek-ai/DeepSeek-OCR",
messages=[
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}"
}
},
{
"type": "text",
"text": "请识别图片中的所有文字内容"
}
]
}
],
max_tokens=4096,
temperature=0.7
)
# 打印结果
print("识别结果:")
print(response.choices[0].message.content)
```
#### 2.2 使用URL图片进行识别
```python
"""
示例2: 使用图片URL进行识别
适用场景: 处理网络图片、远程图片分析
"""
from openai import OpenAI
client = OpenAI(
api_key="sk-your-api-key-here",
base_url="https://api.siliconflow.cn/v1"
)
# 使用图片URL
image_url = "https://example.com/image.jpg" # 替换为实际的图片URL
response = client.chat.completions.create(
model="deepseek-ai/DeepSeek-OCR",
messages=[
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": image_url
}
},
{
"type": "text",
"text": "请描述这张图片的内容"
}
]
}
],
max_tokens=4096
)
print(response.choices[0].message.content)
```
### 3. 高级使用场景
#### 3.1 文档转Markdown格式
```python
"""
示例3: 将文档图片转换为Markdown格式
适用场景: 文档数字化、笔记整理
"""
from openai import OpenAI
import base64
def document_to_markdown(image_path, api_key):
"""将文档图片转换为Markdown格式"""
client = OpenAI(
api_key=api_key,
base_url="https://api.siliconflow.cn/v1"
)
# 读取并编码图片
with open(image_path, "rb") as image_file:
base64_image = base64.b64encode(image_file.read()).decode('utf-8')
# 调用API,使用专门的Markdown转换提示词
response = client.chat.completions.create(
model="deepseek-ai/DeepSeek-OCR",
messages=[
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}"
}
},
{
"type": "text",
"text": "<image>\n<|grounding|>Convert the document to markdown."
}
]
}
],
max_tokens=8192,
temperature=0
)
return response.choices[0].message.content
# 使用示例
api_key = "sk-your-api-key-here"
markdown_result = document_to_markdown("document.jpg", api_key)
print(markdown_result)
# 可选: 保存为markdown文件
with open("output.md", "w", encoding="utf-8") as f:
f.write(markdown_result)
```
#### 3.2 表格识别和提取
```python
"""
示例4: 识别图片中的表格并转换为结构化数据
适用场景: 财务报表、数据表格处理
"""
from openai import OpenAI
import base64
def extract_table(image_path, api_key):
"""从图片中提取表格数据"""
client = OpenAI(
api_key=api_key,
base_url="https://api.siliconflow.cn/v1"
)
with open(image_path, "rb") as image_file:
base64_image = base64.b64encode(image_file.read()).decode('utf-8')
response = client.chat.completions.create(
model="deepseek-ai/DeepSeek-OCR",
messages=[
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}"
}
},
{
"type": "text",
"text": "请识别图片中的表格,并以Markdown表格格式输出"
}
]
}
],
max_tokens=4096
)
return response.choices[0].message.content
# 使用示例
result = extract_table("table_image.jpg", "sk-your-api-key-here")
print(result)
```
#### 3.3 手写文字识别
```python
"""
示例5: 识别手写文字
适用场景: 手写笔记、签名识别
"""
from openai import OpenAI
import base64
def recognize_handwriting(image_path, api_key):
"""识别手写文字内容"""
client = OpenAI(
api_key=api_key,
base_url="https://api.siliconflow.cn/v1"
)
with open(image_path, "rb") as image_file:
base64_image = base64.b64encode(image_file.read()).decode('utf-8')
response = client.chat.completions.create(
model="deepseek-ai/DeepSeek-OCR",
messages=[
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}"
}
},
{
"type": "text",
"text": "请仔细识别这张图片中的所有手写文字内容,包括标点符号"
}
]
}
],
max_tokens=4096,
temperature=0.3
)
return response.choices[0].message.content
# 使用示例
handwriting_text = recognize_handwriting("handwriting.jpg", "sk-your-api-key-here")
print("识别的手写内容:")
print(handwriting_text)
```
#### 3.4 批量图片处理
```python
"""
示例6: 批量处理多张图片
适用场景: 大量文档批处理、自动化工作流
"""
from openai import OpenAI
import base64
import os
from pathlib import Path
def batch_ocr_process(image_folder, api_key, output_folder="output"):
    """Run OCR on every image in a folder and save each result as a text file.

    Args:
        image_folder: Directory containing the images to process.
        api_key: SiliconFlow API key used to build the OpenAI-compatible client.
        output_folder: Directory that receives one ``<image stem>.txt`` file
            per image. It is created, including missing parents, if needed.

    Returns:
        A list with one dict per image, in file-name order:
        ``{"image", "status": "success", "output"}`` when the image was
        processed, or ``{"image", "status": "error", "error"}`` when it failed.
        A failure on one image never aborts the rest of the batch.
    """
    client = OpenAI(
        api_key=api_key,
        base_url="https://api.siliconflow.cn/v1"
    )

    # parents=True lets callers pass a nested path such as "out/ocr/run1".
    Path(output_folder).mkdir(parents=True, exist_ok=True)

    # Accepted suffixes mapped to the MIME type announced in the data URL.
    # The payload must not be labelled image/jpeg when it is a PNG/BMP/WebP.
    mime_types = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".bmp": "image/bmp",
        ".webp": "image/webp",
    }
    accepted_suffixes = tuple(mime_types)

    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    image_files = sorted(
        name for name in os.listdir(image_folder)
        if name.lower().endswith(accepted_suffixes)
    )
    total = len(image_files)

    results = []
    for idx, image_file in enumerate(image_files, 1):
        print(f"处理第 {idx}/{total} 张图片: {image_file}")
        image_path = os.path.join(image_folder, image_file)
        mime_type = mime_types.get(Path(image_file).suffix.lower(), "image/jpeg")
        try:
            # Read and base64-encode the image for inline transport.
            with open(image_path, "rb") as f:
                base64_image = base64.b64encode(f.read()).decode('utf-8')

            response = client.chat.completions.create(
                model="deepseek-ai/DeepSeek-OCR",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:{mime_type};base64,{base64_image}"
                                }
                            },
                            {
                                "type": "text",
                                "text": "请识别图片中的所有文字内容"
                            }
                        ]
                    }
                ],
                max_tokens=4096
            )
            result_text = response.choices[0].message.content

            # Persist the recognised text next to the other results.
            output_file = os.path.join(output_folder, f"{Path(image_file).stem}.txt")
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(result_text)

            results.append({
                "image": image_file,
                "status": "success",
                "output": output_file
            })
            print(f"✓ 完成: {image_file} -> {output_file}")
        except Exception as e:
            # Deliberate best-effort: record the failure and keep going.
            print(f"✗ 错误: {image_file} - {str(e)}")
            results.append({
                "image": image_file,
                "status": "error",
                "error": str(e)
            })
    return results
# 使用示例
api_key = "sk-your-api-key-here"
results = batch_ocr_process("images_folder", api_key, "ocr_results")
# 打印统计信息
success_count = sum(1 for r in results if r["status"] == "success")
print(f"\n处理完成: 成功 {success_count}/{len(results)} 张图片")
```
### 4. 完整实用工具类
```python
"""
示例7: 封装的OCR工具类
提供便捷的接口用于各种OCR任务
"""
import base64
import os
from typing import Optional, List, Dict
from openai import OpenAI
from pathlib import Path
class DeepSeekOCR:
    """Convenience wrapper around the DeepSeek-OCR model served by SiliconFlow.

    The instance holds an OpenAI-compatible client (``self.client``) and the
    model identifier (``self.model``). All recognition helpers funnel through
    :meth:`ocr`, which sends the image inline as a base64 data URL.
    """

    # Accepted image suffixes mapped to the MIME type announced in the data
    # URL, so a PNG/BMP/WebP payload is not mislabelled as JPEG.
    _MIME_TYPES = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".bmp": "image/bmp",
        ".webp": "image/webp",
    }

    def __init__(self, api_key: str):
        """Create the API client.

        Args:
            api_key: SiliconFlow API key.
        """
        self.client = OpenAI(
            api_key=api_key,
            base_url="https://api.siliconflow.cn/v1"
        )
        self.model = "deepseek-ai/DeepSeek-OCR"

    @staticmethod
    def _mime_type(image_path: str) -> str:
        """Return the MIME type for *image_path* based on its suffix.

        Unknown suffixes fall back to ``image/jpeg``, the value that was
        previously used for every file.
        """
        suffix = Path(image_path).suffix.lower()
        return DeepSeekOCR._MIME_TYPES.get(suffix, "image/jpeg")

    def _encode_image(self, image_path: str) -> str:
        """Read the file at *image_path* and return its base64 text."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    def ocr(self,
            image_path: str,
            prompt: str = "请识别图片中的所有文字内容",
            max_tokens: int = 4096,
            temperature: float = 0.7) -> str:
        """Send one image plus a text prompt to the model.

        Args:
            image_path: Path of the image to analyse.
            prompt: Instruction sent alongside the image.
            max_tokens: Upper bound on generated tokens.
            temperature: Sampling temperature.

        Returns:
            The text content of the first completion choice.
        """
        base64_image = self._encode_image(image_path)
        mime_type = self._mime_type(image_path)
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{base64_image}"
                            }
                        },
                        {
                            "type": "text",
                            "text": prompt
                        }
                    ]
                }
            ],
            max_tokens=max_tokens,
            temperature=temperature
        )
        return response.choices[0].message.content

    def to_markdown(self, image_path: str) -> str:
        """Convert a document image to Markdown text.

        Args:
            image_path: Path of the document image.

        Returns:
            The model output for the Markdown-conversion prompt.
        """
        return self.ocr(
            image_path,
            prompt="<image>\n<|grounding|>Convert the document to markdown.",
            max_tokens=8192,
            temperature=0
        )

    def extract_table(self, image_path: str) -> str:
        """Extract a table from an image as a Markdown table.

        Args:
            image_path: Path of the image containing the table.

        Returns:
            The model output for the table-extraction prompt.
        """
        return self.ocr(
            image_path,
            prompt="请识别图片中的表格,并以Markdown表格格式输出",
            max_tokens=4096
        )

    def describe_image(self, image_path: str) -> str:
        """Ask the model for a detailed description of an image.

        Args:
            image_path: Path of the image to describe.

        Returns:
            The model output for the description prompt.
        """
        return self.ocr(
            image_path,
            prompt="请详细描述这张图片的内容,包括主要元素、颜色、布局等",
            max_tokens=4096
        )

    def batch_process(self,
                      image_folder: str,
                      output_folder: str = "output",
                      save_format: str = "txt") -> List[Dict]:
        """Run :meth:`ocr` on every image in a folder and save the results.

        Args:
            image_folder: Directory containing the images.
            output_folder: Directory for the result files. It is created,
                including missing parents, if needed.
            save_format: File extension of the result files (e.g. ``"txt"``
                or ``"md"``).

        Returns:
            One dict per image, in file-name order:
            ``{"image", "status": "success", "output"}`` or
            ``{"image", "status": "error", "error"}``. A failure on one image
            does not abort the batch.
        """
        # parents=True lets callers pass a nested output path.
        Path(output_folder).mkdir(parents=True, exist_ok=True)

        accepted_suffixes = tuple(self._MIME_TYPES)
        # os.listdir order is arbitrary; sort for reproducible runs.
        image_files = sorted(
            name for name in os.listdir(image_folder)
            if name.lower().endswith(accepted_suffixes)
        )
        total = len(image_files)

        results = []
        for idx, image_file in enumerate(image_files, 1):
            print(f"处理 {idx}/{total}: {image_file}")
            image_path = os.path.join(image_folder, image_file)
            try:
                result_text = self.ocr(image_path)
                output_file = os.path.join(
                    output_folder,
                    f"{Path(image_file).stem}.{save_format}"
                )
                with open(output_file, "w", encoding="utf-8") as f:
                    f.write(result_text)
                results.append({
                    "image": image_file,
                    "status": "success",
                    "output": output_file
                })
            except Exception as e:
                # Deliberate best-effort: record the failure and keep going.
                results.append({
                    "image": image_file,
                    "status": "error",
                    "error": str(e)
                })
        return results
# 使用示例
if __name__ == "__main__":
# 初始化工具
ocr = DeepSeekOCR(api_key="sk-your-api-key-here")
# 基础OCR
text = ocr.ocr("document.jpg")
print("OCR结果:", text)
# 转换为Markdown
markdown = ocr.to_markdown("document.jpg")
print("\nMarkdown:", markdown)
# 提取表格
table = ocr.extract_table("table.jpg")
print("\n表格:", table)
# 描述图片
description = ocr.describe_image("photo.jpg")
print("\n图片描述:", description)
# 批量处理
results = ocr.batch_process("images", "results")
print(f"\n批量处理完成: {len(results)} 张图片")
```
## 📋 提示词参考
根据不同的使用场景,你可以使用以下提示词:
### 文档类
```python
# 文档转Markdown
"<image>\n<|grounding|>Convert the document to markdown."
# 自由OCR(不保留格式)
"<image>\nFree OCR."
# 保留格式的OCR
"请识别图片中的所有文字内容,并尽可能保留原始排版格式"
```
### 表格类
```python
# 表格识别
"请识别图片中的表格,并以Markdown表格格式输出"
# 表格数据提取
"请提取图片中的表格数据,按行列结构化输出"
```
### 分析类
```python
# 图片描述
"请详细描述这张图片的内容"
# 图表分析
"请分析图片中的图表数据,并总结主要趋势"
# 多语言识别
"请识别图片中的文字,并指出使用的语言"
```
## ⚙️ 参数说明
### model
- 值: `"deepseek-ai/DeepSeek-OCR"`
- 说明: 模型名称,固定使用此值
### max_tokens
- 范围: 1-8192
- 默认: 4096
- 说明: 最大输出token数,复杂文档建议设置为8192
### temperature
- 范围: 0-2
- 默认: 0.7
- 说明:
- 0: 最确定的输出,适合OCR识别
- 0.7: 平衡创造性和准确性
- 1.0+: 更有创造性,适合描述性任务
## 💡 最佳实践
### 1. 图片质量优化
- 使用清晰的图片(建议分辨率 > 1024x1024)
- 避免过度压缩的图片
- 确保文字清晰可见
### 2. 提示词优化
- 明确说明需要的输出格式
- 对于特定类型的内容(如表格、代码),在提示词中明确指出
- 使用中文提示词效果更好
### 3. 错误处理
```python
import base64
import time
from openai import OpenAI, APIError


def ocr_with_retry(image_path, api_key, max_retries=3):
    """Call the OCR model, retrying on API errors with exponential backoff.

    Args:
        image_path: Path of the image to recognise.
        api_key: SiliconFlow API key.
        max_retries: Maximum number of attempts. The wait between attempts
            is ``2 ** attempt`` seconds (1, 2, 4, ...).

    Returns:
        The text content of the first completion choice.

    Raises:
        APIError: When the last attempt still fails with an API error.
        Exception: Any non-API error is re-raised immediately, without retry.
    """
    client = OpenAI(api_key=api_key, base_url="https://api.siliconflow.cn/v1")
    for attempt in range(max_retries):
        try:
            with open(image_path, "rb") as f:
                # Requires the base64 import above; without it every call
                # failed with NameError before reaching the API.
                base64_image = base64.b64encode(f.read()).decode('utf-8')
            response = client.chat.completions.create(
                model="deepseek-ai/DeepSeek-OCR",
                messages=[{
                    "role": "user",
                    "content": [{
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}
                    }, {
                        "type": "text",
                        "text": "请识别图片中的所有文字内容"
                    }]
                }],
                max_tokens=4096
            )
            return response.choices[0].message.content
        except APIError as e:
            if attempt < max_retries - 1:
                # Exponential backoff before the next attempt.
                wait_time = 2 ** attempt
                print(f"API错误,{wait_time}秒后重试... ({attempt + 1}/{max_retries})")
                time.sleep(wait_time)
            else:
                print(f"达到最大重试次数,失败: {str(e)}")
                raise
        except Exception as e:
            # Non-API failures (missing file, bad arguments) are not retried.
            print(f"未预期的错误: {str(e)}")
            raise
```
### 4. Token优化
- 对于简单的文字识别,使用较小的max_tokens(如2048)
- 对于复杂文档或表格,使用较大的max_tokens(8192)
- 批量处理时注意API速率限制
## 🔍 常见问题
### Q1: 如何获取更多免费tokens?
A: 硅基流动提供邀请机制,邀请其他用户注册可获得额外tokens
### Q2: 支持哪些图片格式?
A: 支持常见格式如JPG、PNG、WEBP、BMP等
### Q3: 单张图片有大小限制吗?
A: 建议单张图片不超过10MB
### Q4: 识别效果不理想怎么办?
A:
- 确保图片清晰度足够
- 优化提示词,更明确地说明需求
- 尝试调整temperature参数
- 对于复杂文档,可以分区域识别后合并
### Q5: API调用频率有限制吗?
A: 有速率限制,具体限制请查看硅基流动平台文档
## 📚 相关资源
- [硅基流动官网](https://cloud.siliconflow.cn/)
- [硅基流动文档](https://docs.siliconflow.cn/)
- [DeepSeek-OCR GitHub](https://github.com/deepseek-ai/DeepSeek-OCR)
- [OpenAI Python SDK文档](https://github.com/openai/openai-python)
## 📝 总结
通过硅基流动平台调用DeepSeek-OCR模型非常简单:
1. 注册账号获取API密钥
2. 安装openai库
3. 使用OpenAI兼容的API格式调用
4. 将图片转换为base64或使用URL
5. 根据需求调整提示词和参数
这个教程涵盖了从基础使用到高级应用的各种场景,你可以根据自己的需求选择合适的示例代码使用。
## 🎉 开始使用
现在你已经掌握了所有必要的知识,快去注册硅基流动账号,开始你的OCR之旅吧!
---
**最后更新**: 2025年11月
**作者**: Claude
**许可**: MIT License

Binary file not shown.

After

Width:  |  Height:  |  Size: 650 KiB

View File

@ -349,17 +349,14 @@ class DeepSeekClient:
in_thinking = False in_thinking = False
thinking_printed = False thinking_printed = False
# 获取当前是否应该显示思考
should_show_thinking = self.get_current_thinking_mode()
async for chunk in self.chat(messages, tools, stream=True): async for chunk in self.chat(messages, tools, stream=True):
if "choices" not in chunk: if "choices" not in chunk:
continue continue
delta = chunk["choices"][0].get("delta", {}) delta = chunk["choices"][0].get("delta", {})
# 处理思考内容(只在思考模式开启时) # 处理思考内容
if "reasoning_content" in delta and should_show_thinking: if "reasoning_content" in delta:
reasoning_content = delta["reasoning_content"] reasoning_content = delta["reasoning_content"]
if reasoning_content: # 只处理非空内容 if reasoning_content: # 只处理非空内容
if not in_thinking: if not in_thinking:
@ -580,9 +577,6 @@ class DeepSeekClient:
thinking_content = "" thinking_content = ""
in_thinking = False in_thinking = False
# 获取当前是否应该显示思考
should_show_thinking = self.get_current_thinking_mode()
try: try:
async for chunk in self.chat(messages, tools=None, stream=True): async for chunk in self.chat(messages, tools=None, stream=True):
if "choices" not in chunk: if "choices" not in chunk:
@ -591,7 +585,7 @@ class DeepSeekClient:
delta = chunk["choices"][0].get("delta", {}) delta = chunk["choices"][0].get("delta", {})
# 处理思考内容 # 处理思考内容
if "reasoning_content" in delta and should_show_thinking: if "reasoning_content" in delta:
reasoning_content = delta["reasoning_content"] reasoning_content = delta["reasoning_content"]
if reasoning_content: # 只处理非空内容 if reasoning_content: # 只处理非空内容
if not in_thinking: if not in_thinking:

18022
web_server.log Normal file

File diff suppressed because it is too large Load Diff

View File

@ -2575,9 +2575,8 @@ async def handle_task_with_sender(terminal: WebTerminal, message, sender, client
modify_result = {"handled": False} modify_result = {"handled": False}
last_finish_reason = None last_finish_reason = None
# 获取是否显示思考 thinking_expected = web_terminal.api_client.get_current_thinking_mode()
should_show_thinking = web_terminal.api_client.get_current_thinking_mode() debug_log(f"思考模式: {thinking_expected}")
debug_log(f"思考模式: {should_show_thinking}")
print(f"[API] 第{iteration + 1}次调用 (总工具调用: {total_tool_calls}/{MAX_TOTAL_TOOL_CALLS})") print(f"[API] 第{iteration + 1}次调用 (总工具调用: {total_tool_calls}/{MAX_TOTAL_TOOL_CALLS})")
@ -2615,15 +2614,14 @@ async def handle_task_with_sender(terminal: WebTerminal, message, sender, client
reasoning_chunks += 1 reasoning_chunks += 1
debug_log(f" 思考内容 #{reasoning_chunks}: {len(reasoning_content)} 字符") debug_log(f" 思考内容 #{reasoning_chunks}: {len(reasoning_content)} 字符")
if should_show_thinking: if not thinking_started:
if not thinking_started: in_thinking = True
in_thinking = True thinking_started = True
thinking_started = True sender('thinking_start', {})
sender('thinking_start', {}) await asyncio.sleep(0.05)
await asyncio.sleep(0.05)
current_thinking += reasoning_content current_thinking += reasoning_content
sender('thinking_chunk', {'content': reasoning_content}) sender('thinking_chunk', {'content': reasoning_content})
# 处理正常内容 # 处理正常内容
if "content" in delta: if "content" in delta:

12
webapp.log Normal file
View File

@ -0,0 +1,12 @@
Wed Nov 19 12:48:15 AM CST 2025: 开始启动Web应用...
Wed Nov 19 12:48:15 AM CST 2025: 启动主Web应用...
Wed Nov 19 12:48:18 AM CST 2025: 主Web应用启动成功PID: 3116448
Wed Nov 19 12:48:18 AM CST 2025: 启动子代理Web应用...
Wed Nov 19 12:48:21 AM CST 2025: 子代理Web应用启动成功PID: 3116453
Wed Nov 19 12:48:21 AM CST 2025: Web应用启动完成
Wed Nov 19 12:59:42 AM CST 2025: 开始启动Web应用...
Wed Nov 19 12:59:42 AM CST 2025: 启动主Web应用...
Wed Nov 19 12:59:45 AM CST 2025: 主Web应用启动成功PID: 3120088
Wed Nov 19 12:59:45 AM CST 2025: 启动子代理Web应用...
Wed Nov 19 12:59:48 AM CST 2025: 子代理Web应用启动成功PID: 3120101
Wed Nov 19 12:59:48 AM CST 2025: Web应用启动完成

View File

@ -1 +1 @@
2032853 3120088