#!/usr/bin/env python3
"""
|
||
文档提取和分类脚本
|
||
用于从指定目录提取所有 .md 和 .mdx 文件,并分别整理到不同文件夹中
|
||
"""
|
||
|
||
import os
|
||
import shutil
|
||
from pathlib import Path
|
||
from typing import List, Tuple
|
||
|
||
def scan_documents(source_dir: str) -> Tuple[List[Path], List[Path]]:
    """Recursively collect the Markdown and MDX files under a directory.

    Suffix matching is case-insensitive, so ``README.MD`` counts as a ``.md``
    file. Only regular files are returned; a directory whose name happens to
    end in ``.md`` is ignored.

    Args:
        source_dir: Path of the directory to scan.

    Returns:
        A ``(md_files, mdx_files)`` tuple of path lists, each sorted by path.
        Both lists are empty when ``source_dir`` does not exist or is not a
        directory.
    """
    source_path = Path(source_dir)
    md_files: List[Path] = []
    mdx_files: List[Path] = []

    if not source_path.exists():
        print(f"错误:源目录 {source_dir} 不存在")
        return md_files, mdx_files

    # A regular file passes exists() but cannot be walked; report it instead
    # of announcing a scan that silently finds nothing.
    if not source_path.is_dir():
        print(f"错误:{source_dir} 不是目录")
        return md_files, mdx_files

    print(f"正在扫描目录:{source_dir}")

    buckets = {".md": md_files, ".mdx": mdx_files}
    for file_path in source_path.rglob("*"):
        # Check the suffix first so is_file() only touches the filesystem
        # for candidates that could actually match.
        bucket = buckets.get(file_path.suffix.lower())
        if bucket is not None and file_path.is_file():
            bucket.append(file_path)

    # rglob() yields entries in filesystem order, which differs between
    # platforms and runs; sorting makes downstream processing reproducible.
    md_files.sort()
    mdx_files.sort()

    print(f"找到 {len(md_files)} 个 .md 文件")
    print(f"找到 {len(mdx_files)} 个 .mdx 文件")

    return md_files, mdx_files
|
||
|
||
def create_target_directories(target_base: str) -> Tuple[str, str]:
    """Make sure the two output folders exist beneath ``target_base``.

    Args:
        target_base: Directory under which ``markdown_files`` and
            ``mdx_files`` are created.

    Returns:
        A ``(md_target_dir, mdx_target_dir)`` tuple of directory paths as
        strings.
    """
    base = Path(target_base)
    md_target_dir, mdx_target_dir = base / "markdown_files", base / "mdx_files"

    # parents=True also creates target_base when it is missing;
    # exist_ok=True lets the script be re-run against the same location.
    for folder in (md_target_dir, mdx_target_dir):
        folder.mkdir(parents=True, exist_ok=True)

    print("创建目标目录:")
    print(f" .md 文件目录:{md_target_dir}")
    print(f" .mdx 文件目录:{mdx_target_dir}")

    return str(md_target_dir), str(mdx_target_dir)
|
||
|
||
def _next_free_path(directory: Path, file_name: str) -> Path:
    """Return ``directory / file_name`` or its first unused ``stem_N.suffix`` variant."""
    candidate = directory / file_name
    stem, suffix = candidate.stem, candidate.suffix
    counter = 1
    while candidate.exists():
        candidate = directory / f"{stem}_{counter}{suffix}"
        counter += 1
    return candidate


def copy_files(file_list: List[Path], target_dir: str, file_type: str) -> None:
    """Copy files into a single flat target directory.

    The source directory structure is not preserved: every file lands
    directly in ``target_dir`` under its own name. When that name is already
    taken, a numeric suffix is appended (``doc.md`` -> ``doc_1.md``,
    ``doc_2.md``, ...) so nothing is overwritten. A file that cannot be copied
    is reported and counted, and the remaining files are still processed.

    Args:
        file_list: Files to copy.
        target_dir: Directory that receives the copies.
        file_type: Label for the kind of file, used only in printed messages.
    """
    if not file_list:
        print(f"没有 {file_type} 文件需要复制")
        return

    target_path = Path(target_dir)
    copied_count = 0
    skipped_count = 0

    print(f"开始复制 {file_type} 文件...")

    for source_file in file_list:
        target_file = _next_free_path(target_path, source_file.name)

        try:
            shutil.copy2(source_file, target_file)
        except OSError as e:
            # OSError covers missing files, permission problems, a full disk
            # and shutil's own errors (shutil.Error derives from OSError).
            print(f" 复制失败:{source_file.name} - {str(e)}")
            skipped_count += 1
        else:
            # Count and report outside the try body so that a failure while
            # printing cannot be mistaken for a failed copy.
            copied_count += 1
            print(f" 已复制:{source_file.name} -> {target_file.name}")

    print(f"{file_type} 文件复制完成:成功 {copied_count} 个,跳过 {skipped_count} 个")
|
||
|
||
# Locations the script has always used; kept as defaults so that calling
# main() or generate_summary_report() without the new arguments is unchanged.
DEFAULT_SOURCE_DIRECTORY = "/Users/jojo/Downloads/docs-main"
DEFAULT_TARGET_BASE_DIRECTORY = "/Users/jojo/Desktop/openwebuidocs"


def generate_summary_report(md_files: List[Path], mdx_files: List[Path],
                            md_target: str, mdx_target: str,
                            source_dir: str = DEFAULT_SOURCE_DIRECTORY,
                            target_base: str = DEFAULT_TARGET_BASE_DIRECTORY) -> None:
    """Print a summary of the extraction run.

    The per-type counts are the lengths of the given lists, i.e. the number
    of files handed to the copy step.

    Args:
        md_files: The .md files that were processed.
        mdx_files: The .mdx files that were processed.
        md_target: Directory that received the .md files.
        mdx_target: Directory that received the .mdx files.
        source_dir: Source directory shown in the report. Defaults to the
            script's built-in source location.
        target_base: Target base directory shown in the report. Defaults to
            the script's built-in target location.
    """
    separator = "=" * 60

    print("\n" + separator)
    print("文档提取和分类完成!")
    print(separator)
    # Report the paths actually used rather than fixed literals, so the
    # summary stays truthful when the caller points the script elsewhere.
    print(f"源目录:{source_dir}")
    print(f"目标基础目录:{target_base}")
    print("\n处理结果:")
    print(f" .md 文件:{len(md_files)} 个 -> {md_target}")
    print(f" .mdx 文件:{len(mdx_files)} 个 -> {mdx_target}")
    print(f"\n总计:{len(md_files) + len(mdx_files)} 个文件")
    print(separator)


def main(source_directory: str = DEFAULT_SOURCE_DIRECTORY,
         target_base_directory: str = DEFAULT_TARGET_BASE_DIRECTORY) -> None:
    """Run the whole pipeline: scan, create targets, copy, report.

    Args:
        source_directory: Directory scanned for .md and .mdx files.
        target_base_directory: Directory under which the output folders are
            created.
    """
    print("文档提取和分类脚本")
    print("=" * 60)

    try:
        # Step 1: scan the source directory.
        md_files, mdx_files = scan_documents(source_directory)

        if not md_files and not mdx_files:
            print("未找到任何 .md 或 .mdx 文件,脚本结束")
            return

        # Step 2: create the target directories.
        md_target, mdx_target = create_target_directories(target_base_directory)

        # Step 3: copy the files.
        copy_files(md_files, md_target, ".md")
        copy_files(mdx_files, mdx_target, ".mdx")

        # Step 4: print the report, using the paths this run really used.
        generate_summary_report(
            md_files, mdx_files, md_target, mdx_target,
            source_dir=source_directory,
            target_base=target_base_directory,
        )

    except KeyboardInterrupt:
        print("\n操作被用户中断")
    except Exception as e:
        # Top-level boundary of the script: report the failure to the user
        # instead of showing a traceback.
        print(f"发生错误:{str(e)}")
        print("请检查路径权限和磁盘空间")
|
||
|
||
# Run the pipeline only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
|