nianjie/minirag/minirag/test_minirag.py
2026-01-11 18:52:11 +08:00

51 lines
1.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import numpy as np
from sentence_transformers import SentenceTransformer
from minirag import MiniRAG, QueryParam
from minirag.utils import wrap_embedding_func_with_attrs
# Absolute locations of the local embedding model, the MiniRAG working
# directory, and the document that gets indexed.
MODEL_DIR = "/root/workspace/nianjie/minirag/models/bge-small-zh-v1.5"
WORKDIR = "/root/workspace/nianjie/minirag/cache"
DOC_PATH = "/root/workspace/nianjie.txt"

# Ask MiniRAG to skip entity/relation extraction so that running without a
# local LLM does not raise (intent per the original note).
# NOTE(review): how and when MiniRAG reads this flag is not visible here;
# confirm it is still honoured when set after the minirag import.
os.environ.update({"MINIRAG_DISABLE_ENTITY_EXTRACT": "1"})

# Lightweight local embedding model, pinned to the CPU.
model = SentenceTransformer(MODEL_DIR, device="cpu")

# Vector width reported by the loaded model; passed to the embedding
# function's decorator as its embedding_dim.
EMB_DIM = model.get_sentence_embedding_dimension()
@wrap_embedding_func_with_attrs(embedding_dim=EMB_DIM, max_token_size=512)
async def embed(texts):
    """Encode one string or a collection of strings with the local model.

    A bare string is wrapped in a single-item list before encoding. The
    model is asked for normalized embeddings as a NumPy array, and the
    result is returned cast to float32.
    """
    batch = [texts] if isinstance(texts, str) else texts
    vectors = model.encode(
        batch,
        normalize_embeddings=True,
        convert_to_numpy=True,
    )
    return vectors.astype(np.float32)
# Placeholder LLM: it exists only so that MiniRAG initialisation succeeds.
# Per the original note, a query made with only_need_context does not call it.
async def dummy_llm(prompt, system_prompt=None, hashing_kv=None, **kwargs):
    """Ignore every argument and return an empty completion string."""
    empty_reply = ""
    return empty_reply
# Create the working directory if it is missing. Existing contents are left
# untouched: nothing here clears a previously populated cache.
os.makedirs(WORKDIR, exist_ok=True)

# MiniRAG configuration: local embedding function, placeholder LLM, and the
# chunking parameters used when the document is split.
rag_settings = {
    "working_dir": WORKDIR,
    "embedding_func": embed,
    "llm_model_func": dummy_llm,
    "chunk_token_size": 500,
    "chunk_overlap_token_size": 80,
    "log_level": "INFO",
}
rag = MiniRAG(**rag_settings)

# Load the whole document as UTF-8 text and index it.
with open(DOC_PATH, encoding="utf-8") as doc_file:
    content = doc_file.read()
doc_length = len(content)
print("文档长度:", doc_length)
rag.insert(content)
print("已完成索引")

# Naive-mode retrieval that asks only for the retrieved context.
param = QueryParam(
    mode="naive",
    top_k=4,
    only_need_context=True,
    max_token_for_text_unit=1200,
)
question = "简单说明这份文档讲了什么"
context = rag.query(question, param)

# Show at most the first 2000 characters of the retrieved context.
preview = context[:2000]
print("\n检索到的上下文片段:\n", preview)