agent/users/jojo/project/doc_extractor/extract_docs.py
2025-11-14 16:44:12 +08:00

171 lines
5.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
文档提取和分类脚本
用于从指定目录提取所有 .md 和 .mdx 文件,并分别整理到不同文件夹中
"""
import os
import shutil
from pathlib import Path
from typing import List, Tuple
def scan_documents(source_dir: str) -> Tuple[List[Path], List[Path]]:
    """
    Recursively collect every .md and .mdx file below a directory.

    Suffix matching is case-insensitive. When the directory does not exist,
    an error is printed and two empty lists are returned.

    Args:
        source_dir: Path of the directory to scan.

    Returns:
        A tuple ``(md_files, mdx_files)`` holding the matching file paths.
    """
    root = Path(source_dir)
    markdown: List[Path] = []
    mdx: List[Path] = []

    if not root.exists():
        print(f"错误:源目录 {source_dir} 不存在")
        return markdown, mdx

    print(f"正在扫描目录:{source_dir}")

    # Route each regular file to its bucket by lower-cased suffix; entries
    # with any other suffix (and all directories) are ignored.
    buckets = {".md": markdown, ".mdx": mdx}
    for entry in root.rglob("*"):
        if not entry.is_file():
            continue
        bucket = buckets.get(entry.suffix.lower())
        if bucket is not None:
            bucket.append(entry)

    print(f"找到 {len(markdown)} 个 .md 文件")
    print(f"找到 {len(mdx)} 个 .mdx 文件")
    return markdown, mdx
def create_target_directories(target_base: str) -> Tuple[str, str]:
    """
    Ensure the two output folders exist under the target base directory.

    Creates ``markdown_files`` and ``mdx_files`` (including any missing
    parent directories); folders that already exist are left as they are.

    Args:
        target_base: Base directory that will contain both folders.

    Returns:
        A tuple ``(md_target_dir, mdx_target_dir)`` of the folder paths,
        as strings.
    """
    base = Path(target_base)
    md_dir, mdx_dir = (base / name for name in ("markdown_files", "mdx_files"))

    for folder in (md_dir, mdx_dir):
        folder.mkdir(parents=True, exist_ok=True)

    md_dir_text = str(md_dir)
    mdx_dir_text = str(mdx_dir)

    print("创建目标目录:")
    print(f" .md 文件目录:{md_dir_text}")
    print(f" .mdx 文件目录:{mdx_dir_text}")
    return md_dir_text, mdx_dir_text
def copy_files(file_list: List[Path], target_dir: str, file_type: str) -> None:
    """
    Copy files into a single flat target directory.

    Only the file name of each source is kept; the source directory layout is
    not reproduced. When a name is already taken in the target directory,
    ``_1``, ``_2``, ... is appended to the stem until a free name is found,
    so existing files are never overwritten. A file whose copy fails is
    reported and counted, and the remaining files are still processed.

    Args:
        file_list: Files to copy.
        target_dir: Directory receiving the copies. It is not created here;
            if it does not exist every copy fails and is reported.
        file_type: Label for the kind of file, used only in printed messages.
    """
    if not file_list:
        print(f"没有 {file_type} 文件需要复制")
        return

    target_path = Path(target_dir)
    copied_count = 0
    skipped_count = 0

    print(f"开始复制 {file_type} 文件...")

    for source_file in file_list:
        target_file = target_path / source_file.name

        # Same-named files from different source folders land in one flat
        # directory, so pick the first free "<stem>_<n><suffix>" name.
        counter = 1
        original_target = target_file
        while target_file.exists():
            target_file = original_target.parent / f"{original_target.stem}_{counter}{original_target.suffix}"
            counter += 1

        # Keep the try body to the copy itself: shutil.copy2 reports failures
        # as OSError (including shutil.SameFileError), and a problem while
        # logging must not be miscounted as a failed copy.
        try:
            shutil.copy2(source_file, target_file)
        except OSError as e:
            print(f" 复制失败:{source_file.name} - {str(e)}")
            skipped_count += 1
        else:
            copied_count += 1
            print(f" 已复制:{source_file.name} -> {target_file.name}")

    print(f"{file_type} 文件复制完成:成功 {copied_count} 个,跳过 {skipped_count}")
def generate_summary_report(md_files: List[Path], mdx_files: List[Path],
                            md_target: str, mdx_target: str,
                            source_dir: str = "/Users/jojo/Downloads/docs-main",
                            target_base: str = "/Users/jojo/Desktop/openwebuidocs") -> None:
    """
    Print a summary of the extraction run.

    The source and target base directories shown in the report are
    parameters rather than fixed text, so the report can reflect the paths
    that were actually used. Their defaults are the paths this report
    previously printed unconditionally, which keeps four-argument calls
    producing the same output as before.

    Args:
        md_files: The .md files that were processed.
        mdx_files: The .mdx files that were processed.
        md_target: Directory the .md files were copied to.
        mdx_target: Directory the .mdx files were copied to.
        source_dir: Source directory to show in the report.
        target_base: Target base directory to show in the report.
    """
    md_count = len(md_files)
    mdx_count = len(mdx_files)
    separator = "=" * 60

    print("\n" + separator)
    print("文档提取和分类完成!")
    print(separator)
    print(f"源目录:{source_dir}")
    print(f"目标基础目录:{target_base}")
    print("\n处理结果:")
    print(f" .md 文件:{md_count} 个 -> {md_target}")
    print(f" .mdx 文件:{mdx_count} 个 -> {mdx_target}")
    print(f"\n总计:{md_count + mdx_count} 个文件")
    print(separator)
def main():
    """
    Run the whole pipeline: scan, create target folders, copy, report.

    The source and target base paths are fixed inside this function. The run
    ends early when no .md or .mdx file is found. A keyboard interrupt or any
    other exception raised during the run is reported on stdout instead of
    propagating.
    """
    # Path configuration
    source_directory = "/Users/jojo/Downloads/docs-main"
    target_base_directory = "/Users/jojo/Desktop/openwebuidocs"

    print("文档提取和分类脚本")
    print("=" * 60)

    try:
        # Step 1: scan the source directory
        md_files, mdx_files = scan_documents(source_directory)
        if not (md_files or mdx_files):
            print("未找到任何 .md 或 .mdx 文件,脚本结束")
            return

        # Step 2: create the target directories
        md_target, mdx_target = create_target_directories(target_base_directory)

        # Step 3: copy the files, .md first and then .mdx
        copy_jobs = (
            (md_files, md_target, ".md"),
            (mdx_files, mdx_target, ".mdx"),
        )
        for batch, destination, label in copy_jobs:
            copy_files(batch, destination, label)

        # Step 4: print the summary report
        generate_summary_report(md_files, mdx_files, md_target, mdx_target)
    except KeyboardInterrupt:
        print("\n操作被用户中断")
    except Exception as exc:
        print(f"发生错误:{exc!s}")
        print("请检查路径权限和磁盘空间")
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()