#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Exercise DeepSeek-OCR with several different prompt styles.

The prompt templates are based on the official GitHub documentation.
"""

import base64
import json
import mimetypes
import os
import sys
import time
from pathlib import Path

from openai import OpenAI

# API configuration.
# SECURITY: a personal access token is committed in this file. Supply the token
# through the CLARIFAI_PAT environment variable instead; the literal below is
# kept only as a backward-compatible fallback and should be rotated and removed.
CLARIFAI_PAT = os.environ.get("CLARIFAI_PAT", "941fba50c8c04be590a9b2d21b7d8347")

# OpenAI SDK client configured to talk to the Clarifai endpoint below.
client = OpenAI(
    base_url="https://api.clarifai.com/v2/ext/openai/v1",
    api_key=CLARIFAI_PAT,
)


def process_image_with_prompt(image_path, prompt, mode_name, temperature=0.0, max_tokens=2048):
    """Send one image plus a text prompt to the OCR model and return its reply.

    Args:
        image_path: Path of the image file to upload.
        prompt: Instruction text sent alongside the image.
        mode_name: Label for this run; used only in the progress output.
        temperature: Sampling temperature forwarded to the API call.
        max_tokens: Completion token limit forwarded to the API call.

    Returns:
        The message content of the first returned choice, or ``None`` when the
        file does not exist or any step of reading/requesting raises.
    """
    print(f"\n🎯 {mode_name}")
    print("-" * 50)
    print(f"📸 处理图片: {image_path}")
    print(f"📝 提示词: {prompt}")

    image_file = Path(image_path)
    if not image_file.exists():
        print(f"❌ 文件不存在: {image_path}")
        return None

    try:
        # Read and base64-encode the image so it can be embedded in a data URL.
        print("🔄 读取图片...")
        image_base64 = base64.b64encode(image_file.read_bytes()).decode()
        print(f"📊 图片编码完成,大小: {len(image_base64)} 字符")

        # Label the data URL with the media type implied by the file name
        # (e.g. image/png for a .png) instead of always claiming JPEG; fall
        # back to JPEG when the extension is unknown or not an image type.
        mime_type, _ = mimetypes.guess_type(image_file.name)
        if mime_type is None or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"
        data_url = f"data:{mime_type};base64,{image_base64}"

        # Call the API with a single user message holding the text and image.
        print("🤖 调用DeepSeek-OCR...")
        response = client.chat.completions.create(
            model="https://clarifai.com/deepseek-ai/deepseek-ocr/models/DeepSeek-OCR",
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": data_url}},
                ],
            }],
            temperature=temperature,
            max_tokens=max_tokens,
        )

        result = response.choices[0].message.content
        print("✅ 处理完成!")
        return result

    except Exception as e:
        # Deliberate best-effort boundary: report the failure and let the
        # caller continue with the remaining prompts.
        print(f"❌ 处理失败: {e}")
        return None


def test_all_modes(image_path, delay=1.0):
    """Run every predefined prompt mode against one image.

    Args:
        image_path: Path of the image passed to ``process_image_with_prompt``.
        delay: Seconds to pause between consecutive API calls. No pause is
            made after the final mode; a value of 0 or less disables pausing.

    Returns:
        A dict keyed by mode name. Each value holds the mode's ``prompt``,
        ``description`` and ``result``; ``result`` is the returned text, or the
        sentinel string "处理失败" when the call produced nothing.
    """
    print("🚀 DeepSeek-OCR 多种调用方式测试")
    print("=" * 60)

    # Mode name -> prompt sent to the model and a short description.
    modes = {
        "基础OCR模式": {
            "prompt": "OCR this image.",
            "description": "基础文字识别,不考虑布局",
        },
        "自由OCR模式": {
            "prompt": "Free OCR.",
            "description": "自由格式OCR,不考虑布局",
        },
        "Markdown转换模式": {
            "prompt": "Convert the document to markdown.",
            "description": "将文档转换为Markdown格式",
        },
        "图表解析模式": {
            "prompt": "Parse the figure.",
            "description": "解析图表和图形数据",
        },
        "详细描述模式": {
            "prompt": "Describe this image in detail.",
            "description": "详细描述图片内容",
        },
        "目标定位模式": {
            "prompt": "Locate all text elements in the image.",
            "description": "定位图片中的所有文字元素",
        },
        "结构化提取模式": {
            "prompt": "Extract structured data from this document, including tables, lists, and formatted text.",
            "description": "提取结构化数据",
        },
        "中文优化模式": {
            "prompt": "请识别图片中的所有中文文字,并保持原有的格式和布局",
            "description": "专门针对中文文字识别优化",
        },
        "英文优化模式": {
            "prompt": "Please extract all English text from this image while preserving the original layout and formatting",
            "description": "专门针对英文文字识别优化",
        },
        "表格识别模式": {
            "prompt": "Identify and extract all tables from this image, preserving the table structure and data",
            "description": "专门识别和提取表格",
        },
    }

    results = {}
    last_index = len(modes) - 1

    for index, (mode_name, mode_config) in enumerate(modes.items()):
        print(f"\n{mode_name}")
        print(f"描述: {mode_config['description']}")

        result = process_image_with_prompt(
            image_path,
            mode_config['prompt'],
            mode_name,
        )

        # A falsy result (None or empty text) is recorded as a failure using
        # the sentinel string that the summary report looks for.
        results[mode_name] = {
            'prompt': mode_config['prompt'],
            'result': result if result else "处理失败",
            'description': mode_config['description'],
        }

        if result:
            # Show at most the first 200 characters as a preview.
            print("📋 结果预览:")
            preview = result[:200] + "..." if len(result) > 200 else result
            print(preview)
            print(f"📊 结果长度: {len(result)} 字符")

        # Pause between calls to stay clear of API rate limits; there is
        # nothing to wait for once the last mode has run.
        if delay > 0 and index < last_index:
            time.sleep(delay)

    return results


def save_results(results, output_file="ocr_modes_results.json"):
    """Write the collected results to a UTF-8 JSON file.

    Only the ``prompt``, ``description`` and ``result`` fields of each entry
    are written. Failures are reported on stdout rather than raised.

    Args:
        results: Mapping of mode name to a dict containing at least the
            ``prompt``, ``description`` and ``result`` keys.
        output_file: Destination path of the JSON file.
    """
    try:
        json_results = {
            mode: {
                'prompt': data['prompt'],
                'description': data['description'],
                'result': data['result'],
            }
            for mode, data in results.items()
        }
        # Serialize before opening the file: opening with 'w' truncates it, so
        # a malformed entry or unserializable value must fail first, leaving
        # any previously saved results untouched.
        serialized = json.dumps(json_results, ensure_ascii=False, indent=2)

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(serialized)

        print(f"\n💾 所有结果已保存到: {output_file}")
    except Exception as e:
        # Deliberate best-effort: saving must not abort the caller's report.
        print(f"\n❌ 保存结果失败: {e}")


def main(image_path=None):
    """Run all OCR modes on one image, save the results and print a summary.

    Args:
        image_path: Image to process. When ``None``, the screenshot path that
            was previously hard-coded in this script is used.
    """
    print("🚀 DeepSeek-OCR 多种调用方式测试")
    print("=" * 60)
    print("📚 基于官方GitHub文档的提示词模板")
    print("🔗 GitHub: https://github.com/deepseek-ai/DeepSeek-OCR")
    print("=" * 60)

    if image_path is None:
        # Default input: the screenshot used in earlier test runs.
        image_path = "/opt/agent/agents/users/jojo/project/user_upload/截屏2025-10-24 12.34.04.png"

    # Run every mode, then persist the results.
    results = test_all_modes(image_path)
    save_results(results)

    # Summary report.
    print("\n📊 测试总结报告")
    print("=" * 60)

    # An entry counts as failed when its result is the failure sentinel.
    successful_modes = [mode for mode, data in results.items() if data['result'] != "处理失败"]
    failed_modes = [mode for mode, data in results.items() if data['result'] == "处理失败"]

    print(f"✅ 成功模式: {len(successful_modes)} 个")
    for mode in successful_modes:
        result_length = len(results[mode]['result'])
        print(f" - {mode}: {result_length} 字符")

    if failed_modes:
        print(f"\n❌ 失败模式: {len(failed_modes)} 个")
        for mode in failed_modes:
            print(f" - {mode}")

    # Guard the ratio so an empty result set cannot raise ZeroDivisionError.
    if results:
        print(f"\n🎯 成功率: {len(successful_modes)/len(results)*100:.1f}%")
    print("\n✅ 测试完成!")


if __name__ == "__main__":
    # Optional first command-line argument: path of the image to process.
    main(sys.argv[1] if len(sys.argv) > 1 else None)