agent-Specialization/users/jojo/project/ocr_test/test_deepseek_ocr.py
2025-11-14 16:44:12 +08:00

164 lines
4.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
DeepSeek OCR 测试代码
测试Clarifai平台的DeepSeek-OCR API
"""
import os
import base64
import requests
from pathlib import Path
from openai import OpenAI
import json
# API configuration.
# SECURITY NOTE(review): a personal access token is hard-coded below and is
# therefore stored in version control. Supply it through the CLARIFAI_PAT
# environment variable instead; the literal is kept only as a
# backward-compatible fallback and should be rotated and removed.
CLARIFAI_PAT = os.environ.get("CLARIFAI_PAT") or "941fba50c8c04be590a9b2d21b7d8347"

# OpenAI SDK client pointed at the Clarifai base URL; every request in this
# script goes through this single module-level instance.
client = OpenAI(
    base_url="https://api.clarifai.com/v2/ext/openai/v1",
    api_key=CLARIFAI_PAT,
)
def test_connection():
    """Check that the DeepSeek-OCR endpoint accepts a minimal chat request.

    Sends a single "Hello" user message (capped at 10 output tokens) through
    the module-level ``client`` and reports the outcome on stdout.

    Returns:
        bool: True if the request completed without raising, False otherwise.
    """
    print("🔍 正在测试API连接...")
    try:
        # Only success or failure matters here, so the response is discarded.
        client.chat.completions.create(
            model="https://clarifai.com/deepseek-ai/deepseek-ocr/models/DeepSeek-OCR",
            messages=[{"role": "user", "content": "Hello"}],
            max_tokens=10,
        )
    except Exception as e:
        # Deliberately broad: for this smoke test any failure simply means
        # "not connected". The separator keeps the label and the exception
        # text from running together.
        print(f"❌ API连接失败: {e}")
        return False
    print("✅ API连接成功")
    return True
def ocr_local_image(image_path, prompt="请提取图片中的所有文字内容"):
    """Run DeepSeek-OCR on a local image file and return the model's reply.

    The file is read, base64-encoded and sent inline as a ``data:`` URL,
    together with ``prompt``, in a single user message.

    Args:
        image_path: Location of the image file to process.
        prompt: Instruction text sent alongside the image.

    Returns:
        The message content of the first returned choice, or None when the
        file does not exist or any step raises.
    """
    import mimetypes  # function-local: nothing else in the file needs it

    print(f"📸 正在处理图片: {image_path}")
    try:
        image_file = Path(image_path)

        # Bail out early with a clear message instead of a read error.
        if not image_file.exists():
            print(f"❌ 文件不存在: {image_path}")
            return None

        # Read and base64-encode the image.
        print("🔄 正在编码图片...")
        image_base64 = base64.b64encode(image_file.read_bytes()).decode()
        print(f"📊 图片大小: {len(image_base64)} 字符")

        # Label the data URL with the type implied by the file name so that
        # non-PNG images (e.g. .jpg, .webp) are not mislabelled; fall back to
        # PNG, the previous fixed value, when the type cannot be determined.
        mime_type, _ = mimetypes.guess_type(image_file.name)
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/png"

        # Call the API.
        print("🤖 正在调用DeepSeek-OCR API...")
        response = client.chat.completions.create(
            model="https://clarifai.com/deepseek-ai/deepseek-ocr/models/DeepSeek-OCR",
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{image_base64}",
                        },
                    },
                ],
            }],
            temperature=0.0,  # keep the transcription as accurate as possible
            max_tokens=2048,  # cap the output length
        )
        result = response.choices[0].message.content
        print("✅ OCR处理完成")
        return result
    except Exception as e:
        # Best-effort helper: report the failure and signal it with None so
        # callers can carry on with the remaining prompts.
        print(f"❌ OCR处理失败: {e}")
        return None
def ocr_with_different_prompts(image_path):
    """Run OCR on one image once per built-in prompt and collect the replies.

    Each prompt is sent through ``ocr_local_image``; a short preview of every
    reply is printed as it arrives.

    Args:
        image_path: Location of the image, forwarded to ``ocr_local_image``.

    Returns:
        dict: Maps each prompt label to its OCR result (None where that
        request produced nothing).
    """
    # Imported once per call rather than on every loop iteration.
    import time

    prompts = {
        "基础提取": "请提取图片中的所有文字内容",
        "详细描述": "请详细描述这张图片的内容,包括文字、布局、格式等",
        "表格提取": "如果图片中有表格,请提取表格的结构和内容",
        "Markdown格式": "请将图片内容转换为Markdown格式",
        "结构化数据": "请以JSON格式提取图片中的结构化信息",
    }
    results = {}
    for index, (name, prompt) in enumerate(prompts.items()):
        if index:
            # Short pause between consecutive requests to avoid rate limits;
            # nothing follows the final request, so no pause is needed there.
            time.sleep(1)
        print(f"\n📝 {name}: {prompt}")
        result = ocr_local_image(image_path, prompt)
        results[name] = result
        if result:
            print(f"结果预览: {result[:200]}...")
        else:
            print("无结果")
    return results
def save_results(results, output_file="ocr_results.json"):
    """Write ``results`` to ``output_file`` as indented UTF-8 JSON.

    Failures are reported on stdout instead of being raised, so a save
    problem never aborts the caller.

    Args:
        results: JSON-serializable object to store.
        output_file: Destination path; overwritten on success.

    Returns:
        None.
    """
    try:
        # Serialize BEFORE opening the file: mode 'w' truncates immediately,
        # so encoding first guarantees that an unserializable value cannot
        # wipe out or half-write an existing results file.
        payload = json.dumps(results, ensure_ascii=False, indent=2)
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(payload)
    except Exception as e:
        # Deliberate best-effort: report the failure and carry on.
        print(f"❌ 保存结果失败: {e}")
    else:
        print(f"💾 结果已保存到: {output_file}")
def main(image_path="user_upload/截屏2025-10-24 12.34.04.png"):
    """Run the full OCR smoke test: connectivity, basic OCR, multi-prompt OCR.

    All results are printed and then stored in ``ocr_test_results.json``.

    Args:
        image_path: Image to run the tests against. Defaults to the sample
            screenshot this script was originally written for, so calling
            ``main()`` with no arguments behaves as before.

    Returns:
        None. Returns early, without running OCR, when the API connection
        check fails.
    """
    print("🚀 DeepSeek OCR 测试开始")
    print("=" * 50)

    # Step 1: check connectivity; nothing else can work without it.
    if not test_connection():
        print("❌ 无法连接到API,请检查密钥和网络连接")
        return
    print("\n" + "=" * 50)

    # Step 2: basic OCR test.
    print("📋 基础OCR测试")
    basic_result = ocr_local_image(image_path, "请提取图片中的所有文字内容")
    if basic_result:
        print("\n🎯 基础OCR结果:")
        print("-" * 30)
        print(basic_result)
        print("-" * 30)
    print("\n" + "=" * 50)

    # Step 3: run the same image through several prompts.
    print("🔍 多提示词测试")
    advanced_results = ocr_with_different_prompts(image_path)

    # Persist everything, including the image path for traceability.
    all_results = {
        "基础结果": basic_result,
        "高级结果": advanced_results,
        "图片路径": image_path,
    }
    save_results(all_results, "ocr_test_results.json")
    print("\n" + "=" * 50)
    print("✅ 测试完成!")


# Run the test sequence only when executed as a script, not on import.
if __name__ == "__main__":
    main()