# Source listing metadata: Python, 51 lines, 1.6 KiB.
from __future__ import annotations

import os

import numpy as np
from minirag import MiniRAG, QueryParam
from minirag.utils import wrap_embedding_func_with_attrs
from sentence_transformers import SentenceTransformer

# Filesystem locations used by this script. Each default can be overridden
# through an environment variable (read only by this script) so the demo is
# not tied to one machine's directory layout.
MODEL_DIR = os.environ.get(
    "MINIRAG_MODEL_DIR",
    "/root/workspace/nianjie/minirag/models/bge-small-zh-v1.5",
)  # local sentence-transformers model directory
WORKDIR = os.environ.get(
    "MINIRAG_WORKDIR",
    "/root/workspace/nianjie/minirag/cache",
)  # MiniRAG working directory
DOC_PATH = os.environ.get(
    "MINIRAG_DOC_PATH",
    "/root/workspace/nianjie.txt",
)  # text document to index (opened as UTF-8 further down)

# Skip entity/relation extraction (avoids errors when no local LLM is
# available).
# NOTE(review): this is set after `minirag` has already been imported, so it
# only takes effect if the library reads the variable lazily — confirm.
os.environ["MINIRAG_DISABLE_ENTITY_EXTRACT"] = "1"

# Local embedding model, loaded on CPU to keep resource usage light.
model = SentenceTransformer(MODEL_DIR, device="cpu")
# Embedding width reported by the model; handed to the embedding wrapper.
EMB_DIM = model.get_sentence_embedding_dimension()

@wrap_embedding_func_with_attrs(embedding_dim=EMB_DIM, max_token_size=512)
async def embed(texts: str | list[str]) -> np.ndarray:
    """Encode text with the local sentence-transformers model.

    Args:
        texts: A single string or a list of strings. A bare string is
            wrapped in a one-element list before encoding.

    Returns:
        The embeddings (encoded with ``normalize_embeddings=True``) cast to
        ``float32``. Presumably one row per input text — confirm against
        the model's output shape.
    """
    if isinstance(texts, str):
        texts = [texts]
    # NOTE: ``encode`` is a synchronous call, so it runs to completion before
    # this coroutine yields control again.
    embs = model.encode(texts, normalize_embeddings=True, convert_to_numpy=True)
    return embs.astype(np.float32)

# Placeholder LLM: exists only so that initialisation succeeds. Per the
# original note, queries run with only_need_context and do not call it.
async def dummy_llm(prompt, system_prompt=None, hashing_kv=None, **kwargs) -> str:
    """Return an empty completion regardless of the arguments.

    Args:
        prompt: Ignored.
        system_prompt: Ignored.
        hashing_kv: Ignored.
        **kwargs: Ignored; accepted so any extra keyword arguments passed by
            the caller do not raise.

    Returns:
        Always the empty string.
    """
    return ""

# Ensure the working directory exists. This call does not clear existing
# contents, so anything already stored in WORKDIR is left in place.
os.makedirs(WORKDIR, exist_ok=True)

# Build the MiniRAG instance on top of the local embedding function and the
# placeholder LLM.
rag = MiniRAG(
    working_dir=WORKDIR,
    embedding_func=embed,
    chunk_token_size=500,
    chunk_overlap_token_size=80,
    llm_model_func=dummy_llm,
    log_level="INFO",
)

# Read the whole document into memory as UTF-8 text.
with open(DOC_PATH, "r", encoding="utf-8") as f:
    content = f.read()

print("文档长度:", len(content))  # document length, in characters
rag.insert(content)
print("已完成索引")  # "indexing finished"

# Ask for retrieved context only (only_need_context=True) in "naive" mode.
param = QueryParam(
    mode="naive",
    top_k=4,
    only_need_context=True,
    max_token_for_text_unit=1200,
)
context = rag.query("简单说明这份文档讲了什么", param)
# Print at most the first 2000 elements of the result (characters, assuming
# the query returns a string — confirm).
print("\n检索到的上下文片段:\n", context[:2000])