agent/users/jojo/project/ocr_test/deepseek_ocr_modes.py
2025-11-14 16:44:12 +08:00

207 lines
6.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
DeepSeek-OCR 多种调用方式测试
基于官方GitHub文档的提示词模板
"""
import base64
import json
import mimetypes
import os
import time
from pathlib import Path

from openai import OpenAI
# API configuration.
# The Clarifai personal access token (PAT) is a credential and must not be
# committed to source control, so it is read from the CLARIFAI_PAT environment
# variable. An empty default keeps the module importable when the variable is
# unset; NOTE(review): requests are then expected to be rejected for missing
# authentication -- confirm against the client/service behavior.
CLARIFAI_PAT = os.environ.get("CLARIFAI_PAT", "")

# OpenAI-compatible client pointed at Clarifai's OpenAI endpoint.
client = OpenAI(
    base_url="https://api.clarifai.com/v2/ext/openai/v1",
    api_key=CLARIFAI_PAT,
)
def process_image_with_prompt(image_path, prompt, mode_name, temperature=0.0, max_tokens=2048):
    """Send one image and a text prompt to DeepSeek-OCR and return the reply.

    The image is read from disk, base64-encoded and embedded in the request as
    a ``data:`` URL next to the prompt. Progress is reported on stdout.

    Args:
        image_path: Path of the image file to process.
        prompt: Instruction text sent together with the image.
        mode_name: Label of the current run; only used in console output.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Completion token limit forwarded to the API.

    Returns:
        The message content of the first choice in the response, or ``None``
        when the file does not exist or when reading, encoding or the API call
        raises an exception.
    """
    print(f"\n🎯 {mode_name}")
    print("-" * 50)
    print(f"📸 处理图片: {image_path}")
    print(f"📝 提示词: {prompt}")

    image_file = Path(image_path)
    if not image_file.exists():
        print(f"❌ 文件不存在: {image_path}")
        return None

    # Declare the real media type in the data URL instead of always claiming
    # JPEG (the script is run against PNG screenshots). Fall back to the
    # previous hard-coded value when the extension is unknown or not an image.
    mime_type, _ = mimetypes.guess_type(str(image_file))
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    # Best-effort boundary: any failure is reported and mapped to None so a
    # caller iterating over several prompts can carry on with the next one.
    try:
        # Read and encode the image.
        print("🔄 读取图片...")
        image_base64 = base64.b64encode(image_file.read_bytes()).decode()
        print(f"📊 图片编码完成,大小: {len(image_base64)} 字符")

        # Call the API.
        print("🤖 调用DeepSeek-OCR...")
        response = client.chat.completions.create(
            model="https://clarifai.com/deepseek-ai/deepseek-ocr/models/DeepSeek-OCR",
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:{mime_type};base64,{image_base64}"},
                    },
                ],
            }],
            temperature=temperature,
            max_tokens=max_tokens,
        )
        result = response.choices[0].message.content
        print("✅ 处理完成!")
        return result
    except Exception as e:
        print(f"❌ 处理失败: {e}")
        return None
def test_all_modes(image_path, request_delay=1.0):
    """Run every predefined DeepSeek-OCR prompt mode against one image.

    Each mode's prompt is passed to ``process_image_with_prompt`` and the
    outcome is collected. A short preview of every successful result is
    printed to stdout.

    Args:
        image_path: Path of the image handed to every mode.
        request_delay: Seconds to wait between two consecutive API calls to
            stay clear of rate limits. No wait happens after the last mode.

    Returns:
        A dict keyed by mode name. Each value is a dict with the keys
        ``'prompt'``, ``'result'`` and ``'description'``. For a failed mode
        ``'result'`` holds the sentinel string ``"处理失败"``.
    """
    print("🚀 DeepSeek-OCR 多种调用方式测试")
    print("=" * 60)

    # All modes; per the original author's note the prompt templates are based
    # on the official GitHub documentation.
    modes = {
        "基础OCR模式": {
            "prompt": "OCR this image.",
            "description": "基础文字识别,不考虑布局",
        },
        "自由OCR模式": {
            "prompt": "Free OCR.",
            "description": "自由格式OCR不考虑布局",
        },
        "Markdown转换模式": {
            "prompt": "Convert the document to markdown.",
            "description": "将文档转换为Markdown格式",
        },
        "图表解析模式": {
            "prompt": "Parse the figure.",
            "description": "解析图表和图形数据",
        },
        "详细描述模式": {
            "prompt": "Describe this image in detail.",
            "description": "详细描述图片内容",
        },
        "目标定位模式": {
            "prompt": "Locate all text elements in the image.",
            "description": "定位图片中的所有文字元素",
        },
        "结构化提取模式": {
            "prompt": "Extract structured data from this document, including tables, lists, and formatted text.",
            "description": "提取结构化数据",
        },
        "中文优化模式": {
            "prompt": "请识别图片中的所有中文文字,并保持原有的格式和布局",
            "description": "专门针对中文文字识别优化",
        },
        "英文优化模式": {
            "prompt": "Please extract all English text from this image while preserving the original layout and formatting",
            "description": "专门针对英文文字识别优化",
        },
        "表格识别模式": {
            "prompt": "Identify and extract all tables from this image, preserving the table structure and data",
            "description": "专门识别和提取表格",
        },
    }

    results = {}
    last_index = len(modes) - 1

    # Exercise every mode in definition order.
    for index, (mode_name, mode_config) in enumerate(modes.items()):
        prompt = mode_config['prompt']
        description = mode_config['description']

        print(f"\n{mode_name}")
        print(f"描述: {description}")
        result = process_image_with_prompt(image_path, prompt, mode_name)

        # Only None signals a failure; an empty string is a valid (empty)
        # recognition result and must not be reported as a failed mode.
        succeeded = result is not None
        results[mode_name] = {
            'prompt': prompt,
            'result': result if succeeded else "处理失败",
            'description': description,
        }

        if succeeded:
            # Show a preview limited to the first 200 characters.
            print("📋 结果预览:")
            preview = result[:200] + "..." if len(result) > 200 else result
            print(preview)
            print(f"📊 结果长度: {len(result)} 字符")

        # Pause between requests only; waiting after the final one is wasted.
        if index < last_index and request_delay > 0:
            time.sleep(request_delay)

    return results
def save_results(results, output_file="ocr_modes_results.json"):
    """Write the collected mode results to a UTF-8 JSON file.

    Only the ``'prompt'``, ``'description'`` and ``'result'`` entries of each
    mode are persisted; any other keys are dropped. Failures are reported on
    stdout and never raised to the caller.

    Args:
        results: Mapping of mode name to a dict containing at least the keys
            ``'prompt'``, ``'description'`` and ``'result'``.
        output_file: Destination path of the JSON document.
    """
    try:
        # Reduce every entry to the fields that are meant to be serialized.
        json_results = {
            mode: {
                'prompt': data['prompt'],
                'description': data['description'],
                'result': data['result'],
            }
            for mode, data in results.items()
        }

        # Serialize completely before touching the file: opening with 'w'
        # truncates it, so a serialization error would otherwise wipe an
        # existing results file or leave a partial document behind.
        payload = json.dumps(json_results, ensure_ascii=False, indent=2)

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(payload)
        print(f"\n💾 所有结果已保存到: {output_file}")
    except Exception as e:
        # Saving is best-effort; report the problem and let the caller go on.
        print(f"\n❌ 保存结果失败: {e}")
def main(image_path=None):
    """Run all OCR modes on one image, save the results and print a summary.

    Args:
        image_path: Image to process. When omitted, the previously used
            screenshot path is taken, which keeps ``main()`` behaving as
            before.
    """
    print("🚀 DeepSeek-OCR 多种调用方式测试")
    print("=" * 60)
    print("📚 基于官方GitHub文档的提示词模板")
    print("🔗 GitHub: https://github.com/deepseek-ai/DeepSeek-OCR")
    print("=" * 60)

    # Default to the screenshot that was used in earlier test runs.
    if image_path is None:
        image_path = "/opt/agent/agents/users/jojo/project/user_upload/截屏2025-10-24 12.34.04.png"

    # Run every mode, then persist the outcome.
    results = test_all_modes(image_path)
    save_results(results)

    # Summary report.
    print("\n📊 测试总结报告")
    print("=" * 60)

    # A mode counts as failed when its result is the sentinel "处理失败".
    successful_modes = []
    failed_modes = []
    for mode, data in results.items():
        if data['result'] == "处理失败":
            failed_modes.append(mode)
        else:
            successful_modes.append(mode)

    print(f"✅ 成功模式: {len(successful_modes)}")
    for mode in successful_modes:
        result_length = len(results[mode]['result'])
        print(f" - {mode}: {result_length} 字符")

    if failed_modes:
        print(f"\n❌ 失败模式: {len(failed_modes)}")
        for mode in failed_modes:
            print(f" - {mode}")

    # Guard the ratio so an empty result set cannot raise ZeroDivisionError.
    success_rate = len(successful_modes) / len(results) * 100 if results else 0.0
    print(f"\n🎯 成功率: {success_rate:.1f}%")
    print("\n✅ 测试完成!")


if __name__ == "__main__":
    main()