# agent/users/jojo/project/ocr_test/deepseek_ocr_modes.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
DeepSeek-OCR 多种调用方式测试
基于官方GitHub文档的提示词模板
"""

import base64
import json
import mimetypes
import os
import sys
import time
from pathlib import Path

from openai import OpenAI

# API configuration.
# SECURITY NOTE(review): a personal access token is committed in source below.
# Supply it through the CLARIFAI_PAT environment variable instead; the literal
# is kept only as a backward-compatible fallback and should be rotated and
# removed from the repository.
CLARIFAI_PAT = os.environ.get("CLARIFAI_PAT") or "941fba50c8c04be590a9b2d21b7d8347"

# OpenAI client configured against Clarifai's OpenAI-compatible base URL.
client = OpenAI(
    base_url="https://api.clarifai.com/v2/ext/openai/v1",
    api_key=CLARIFAI_PAT,
)

def process_image_with_prompt(image_path, prompt, mode_name, temperature=0.0, max_tokens=2048):
    """Send one image together with a text prompt to the DeepSeek-OCR model.

    Progress and errors are printed to stdout; nothing is raised for a
    missing file or a failed request.

    Args:
        image_path: Path of the image file to read and upload.
        prompt: Text instruction sent alongside the image.
        mode_name: Label printed in the progress header.
        temperature: Sampling temperature forwarded to the API call.
        max_tokens: ``max_tokens`` value forwarded to the API call.

    Returns:
        The message content of the first returned choice, or ``None`` when
        the file does not exist or any step of the request fails.
    """
    print(f"\n🎯 {mode_name}")
    print("-" * 50)
    print(f"📸 处理图片: {image_path}")
    print(f"📝 提示词: {prompt}")

    image_file = Path(image_path)
    if not image_file.exists():
        print(f"❌ 文件不存在: {image_path}")
        return None

    try:
        # Read and base64-encode the image for embedding in a data URL.
        print("🔄 读取图片...")
        image_base64 = base64.b64encode(image_file.read_bytes()).decode()
        print(f"📊 图片编码完成，大小: {len(image_base64)} 字符")

        # Label the data URL with the media type implied by the file name
        # instead of always claiming JPEG (the script itself sends a .png).
        # Unknown or non-image extensions keep the previous "image/jpeg".
        guessed_type = mimetypes.guess_type(image_file.name)[0]
        if guessed_type is None or not guessed_type.startswith("image/"):
            guessed_type = "image/jpeg"

        # Call the API with a single user message carrying text + image.
        print("🤖 调用DeepSeek-OCR...")
        response = client.chat.completions.create(
            model="https://clarifai.com/deepseek-ai/deepseek-ocr/models/DeepSeek-OCR",
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:{guessed_type};base64,{image_base64}"}
                    }
                ]
            }],
            temperature=temperature,
            max_tokens=max_tokens
        )

        result = response.choices[0].message.content
        print("✅ 处理完成！")
        return result

    except Exception as e:
        # Deliberate best-effort boundary: one failing mode must not abort
        # the whole test run, so report the error and signal failure.
        print(f"❌ 处理失败: {e}")
        return None

def test_all_modes(image_path, delay=1):
    """Run every predefined prompt ("mode") against one image.

    Args:
        image_path: Path of the image passed to each OCR call.
        delay: Seconds to sleep after each call to stay clear of API rate
            limits. A falsy value disables the pause.

    Returns:
        A dict keyed by mode name. Each value holds the ``prompt``, the
        ``description`` and the ``result`` text; ``result`` is the string
        "处理失败" when the call returned ``None``.
    """
    print("🚀 DeepSeek-OCR 多种调用方式测试")
    print("=" * 60)

    # All modes; per the module header these follow the prompt templates of
    # the official GitHub documentation.
    modes = {
        "基础OCR模式": {
            "prompt": "OCR this image.",
            "description": "基础文字识别，不考虑布局"
        },
        "自由OCR模式": {
            "prompt": "Free OCR.",
            "description": "自由格式OCR，不考虑布局"
        },
        "Markdown转换模式": {
            "prompt": "Convert the document to markdown.",
            "description": "将文档转换为Markdown格式"
        },
        "图表解析模式": {
            "prompt": "Parse the figure.",
            "description": "解析图表和图形数据"
        },
        "详细描述模式": {
            "prompt": "Describe this image in detail.",
            "description": "详细描述图片内容"
        },
        "目标定位模式": {
            "prompt": "Locate all text elements in the image.",
            "description": "定位图片中的所有文字元素"
        },
        "结构化提取模式": {
            "prompt": "Extract structured data from this document, including tables, lists, and formatted text.",
            "description": "提取结构化数据"
        },
        "中文优化模式": {
            "prompt": "请识别图片中的所有中文文字，并保持原有的格式和布局",
            "description": "专门针对中文文字识别优化"
        },
        "英文优化模式": {
            "prompt": "Please extract all English text from this image while preserving the original layout and formatting",
            "description": "专门针对英文文字识别优化"
        },
        "表格识别模式": {
            "prompt": "Identify and extract all tables from this image, preserving the table structure and data",
            "description": "专门识别和提取表格"
        }
    }

    results = {}

    # Exercise each mode in definition order.
    for mode_name, mode_config in modes.items():
        print(f"\n{mode_name}")
        print(f"描述: {mode_config['description']}")

        result = process_image_with_prompt(
            image_path,
            mode_config['prompt'],
            mode_name
        )

        # Only None signals failure; an empty string is a valid (empty)
        # recognition result and must not be reported as a failed call.
        succeeded = result is not None

        results[mode_name] = {
            'prompt': mode_config['prompt'],
            'result': result if succeeded else "处理失败",
            'description': mode_config['description']
        }

        if succeeded:
            # Show a preview capped at 200 characters.
            print("📋 结果预览:")
            preview = result[:200] + "..." if len(result) > 200 else result
            print(preview)
            print(f"📊 结果长度: {len(result)} 字符")

        # Short pause between requests to avoid hitting API limits.
        if delay:
            time.sleep(delay)

    return results

def save_results(results, output_file="ocr_modes_results.json"):
    """Write the per-mode results to a UTF-8 JSON file.

    Only the ``prompt``, ``description`` and ``result`` fields of each entry
    are written. Failures are reported on stdout rather than raised.

    Args:
        results: Mapping of mode name to a dict containing at least the
            keys ``prompt``, ``description`` and ``result``.
        output_file: Destination path of the JSON file.

    Returns:
        None.
    """
    try:
        # Keep just the fields that belong in the report.
        json_results = {
            mode: {
                'prompt': data['prompt'],
                'description': data['description'],
                'result': data['result']
            }
            for mode, data in results.items()
        }
        # Serialize BEFORE opening the file: opening with 'w' truncates, so a
        # malformed entry would otherwise wipe out an existing results file.
        serialized = json.dumps(json_results, ensure_ascii=False, indent=2)
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(serialized)
        print(f"\n💾 所有结果已保存到: {output_file}")
    except Exception as e:
        # Deliberate best-effort: saving must not crash the test run.
        print(f"\n❌ 保存结果失败: {e}")

# Screenshot that was used in earlier test runs; used when no path is given.
DEFAULT_IMAGE_PATH = "/opt/agent/agents/users/jojo/project/user_upload/截屏2025-10-24 12.34.04.png"

def main(image_path=None):
    """Run all OCR modes on one image, save the results and print a summary.

    Args:
        image_path: Image to process. When ``None``, the default screenshot
            path ``DEFAULT_IMAGE_PATH`` is used.

    Returns:
        None.
    """
    print("🚀 DeepSeek-OCR 多种调用方式测试")
    print("=" * 60)
    print("📚 基于官方GitHub文档的提示词模板")
    print("🔗 GitHub: https://github.com/deepseek-ai/DeepSeek-OCR")
    print("=" * 60)

    if image_path is None:
        image_path = DEFAULT_IMAGE_PATH

    # Run every mode.
    results = test_all_modes(image_path)

    # Persist the results.
    save_results(results)

    # Print the summary report.
    print("\n📊 测试总结报告")
    print("=" * 60)

    # test_all_modes marks a failed mode with the literal result "处理失败".
    successful_modes = [mode for mode, data in results.items() if data['result'] != "处理失败"]
    failed_modes = [mode for mode, data in results.items() if data['result'] == "处理失败"]

    print(f"✅ 成功模式: {len(successful_modes)} 个")
    for mode in successful_modes:
        result_length = len(results[mode]['result'])
        print(f"  - {mode}: {result_length} 字符")

    if failed_modes:
        print(f"\n❌ 失败模式: {len(failed_modes)} 个")
        for mode in failed_modes:
            print(f"  - {mode}")

    # Guard against an empty result set so the report never divides by zero.
    success_rate = len(successful_modes) / len(results) * 100 if results else 0.0
    print(f"\n🎯 成功率: {success_rate:.1f}%")
    print("\n✅ 测试完成！")

if __name__ == "__main__":
    # An optional first command-line argument overrides the default image.
    main(sys.argv[1] if len(sys.argv) > 1 else None)