# Read the Milvus 2.5 release-notes markdown file from the working directory.
reader = SimpleDirectoryReader(input_files=["./milvus_2_5.md"])
documents = reader.load_data()

# Sentence-window node parser configured with window_size=3; it is told to use
# the metadata keys "window" and "original_text".
node_parser = SentenceWindowNodeParser.from_defaults(
    original_text_metadata_key="original_text",
    window_metadata_key="window",
    window_size=3,
)

# Split the loaded documents into nodes.
nodes = node_parser.get_nodes_from_documents(documents)

# NOTE(review): later steps read a `docs` list that is not defined in this
# excerpt -- presumably the text of each node; confirm where it is built.

# The question used for every search below.
query = "What are the key features in milvus 2.5?"

2. 其次,創建collection的schema以及索引。其中原始文本數據存於text列,而Sparse-BM25數據存於sparse_bm25列,這裡需要通過轉換Function(BM25)來實現。

# Collection schema: primary keys are supplied by the caller (auto_id=False)
# and dynamic fields are enabled.
schema = MilvusClient.create_schema(
    auto_id=False,
    enable_dynamic_field=True,
)

# Add fields to schema
schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
# The raw text column has the analyzer enabled; it is the BM25 function's input.
schema.add_field(
    field_name="text",
    datatype=DataType.VARCHAR,
    max_length=512,
    enable_analyzer=True,
)
schema.add_field(field_name="sparse_bm25", datatype=DataType.SPARSE_FLOAT_VECTOR)
schema.add_field(field_name="dense", datatype=DataType.FLOAT_VECTOR, dim=dense_dim)

# BM25 function mapping the "text" field to the "sparse_bm25" field.
# It is defined exactly once, right before it is attached to the schema
# (the snippet previously built an identical Function object twice).
bm25_function = Function(
    name="bm25",
    function_type=FunctionType.BM25,
    input_field_names=["text"],
    output_field_names="sparse_bm25",
)
schema.add_function(bm25_function)

# Index definitions, written out as plain keyword mappings.
dense_index = {
    "field_name": "dense",
    "index_name": "dense_index",
    "index_type": "IVF_FLAT",
    "metric_type": "IP",
    "params": {"nlist": 128},
}
sparse_index = {
    "field_name": "sparse_bm25",
    "index_name": "sparse_bm25_index",
    "index_type": "SPARSE_WAND",
    "metric_type": "BM25",
}

# Register both indexes (dense first, then sparse) on a fresh params object.
index_params = client.prepare_index_params()
for index_spec in (dense_index, sparse_index):
    index_params.add_index(**index_spec)

# Create the collection from the schema and the index definitions.
client.create_collection(
    collection_name=collection_name,
    index_params=index_params,
    schema=schema,
)

3. 然後,把數據進行Embedding之後,插入到Collection裡,這裡Embedding採用的是OpenAI的text-embedding-3-large。

def gen_embedding(docs):
    """Embed texts with OpenAI's ``text-embedding-3-large`` model.

    Args:
        docs: The texts to embed; passed unchanged to ``encode_documents``.

    Returns:
        Whatever ``OpenAIEmbeddingFunction.encode_documents`` returns for
        ``docs``.

    Raises:
        KeyError: If the ``OPENAI_API_KEY`` environment variable is not set.
    """
    model_name = "text-embedding-3-large"
    # The API key is read from the environment rather than hard-coded.
    openai_ef = model.dense.OpenAIEmbeddingFunction(
        model_name=model_name,
        api_key=os.environ["OPENAI_API_KEY"],
    )

    return openai_ef.encode_documents(docs)

# Embed the corpus and the (single) query with the same embedding helper.
docs_embeddings = gen_embedding(docs)
query_embeddings = gen_embedding([query])

# Assemble data: one row per document, keyed by its position in `docs`.
# No "sparse_bm25" value is supplied in these rows.
data = []
for idx, doc in enumerate(docs):
    row = {"id": idx, "dense": docs_embeddings[idx].data, "text": doc}
    data.append(row)

# Insert data
res = client.insert(collection_name=collection_name, data=data)

4. 最後,進行查詢測試

4.1. 我們先測試下普通查詢

# Baseline: search the "dense" field only, using the inner-product metric.
search_params = dict(metric_type="IP", params=dict(nprobe=10))

# Ask for the five best matches and return their "text" field.
res = client.search(
    collection_name=collection_name,
    anns_field="dense",
    data=[query_embeddings[0]],
    search_params=search_params,
    output_fields=["text"],
    limit=5,
)

查詢結果

TopK results:
0
0 Enhancements in cluster management, indexing, and data handling introduce new levels of flexibil...
1 With this release, Milvus integrates powerful new features like term-based search, clustering co...
2 Milvus 2.5 introduces a built-in Cluster Management WebUI, reducing system maintenance difficult...
3 \n\nv2.5.0-beta\n\nRelease date: November 26, 2024\n\n| Milvus version | Python SDK version | No...
4 \n\nRelease Notes\n\nFind out what’s new in Milvus!

從查詢結果來看,最後一條召回內容與查詢問題相關度不大。

4.2. 然後進行Hybrid Search。定義向量搜索和Sparse-BM25搜索:

k = 5  # get the top 5 docs related to the query

# Parameters for the dense-vector request. This must be an assignment (`=`):
# written with `:` it is only a variable annotation, the name is never bound,
# and the request construction that follows fails with NameError.
search_params_dense = {"metric_type": "IP", "params": {"nprobe": 10}}
# Parameters for the BM25 request on the sparse field.
search_params_bm25 = {"metric_type": "BM25"}

# Dense request: searches the "dense" field with the query embedding.
request_dense = AnnSearchRequest(
    data=[query_embeddings[0].data],
    anns_field="dense",
    param=search_params_dense,
    limit=k,
)

# BM25 request: searches the "sparse_bm25" field with the raw query string.
request_bm25 = AnnSearchRequest(
    data=[query],
    anns_field="sparse_bm25",
    param=search_params_bm25,
    limit=k,
)

# Both requests are handed to the hybrid search together, dense first.
reqs = [request_dense, request_bm25]

這裡使用RRFRanker來進行Hybrid Search:

# RRF ranker built with the argument 100
# (presumably the RRF smoothing constant -- confirm against the pymilvus docs).
ranker = RRFRanker(100)

# Run both requests through the ranker and keep the five best merged hits,
# returning their "text" field.
res = client.hybrid_search(
    collection_name=collection_name,
    ranker=ranker,
    reqs=reqs,
    output_fields=["text"],
    limit=5,
)
# Print each result set under a "TopK results:" header, one hit per line.
# The loop bodies must be indented; written flush-left this is an
# IndentationError.
for hits in res:
    print("TopK results:")
    for hit in hits:
        print(hit)

查詢結果:

TopK results:
0
0 \n\nv2.5.0-beta\n\nRelease date: November 26, 2024\n\n| Milvus version | Python SDK version | No...
1 Enhancements in cluster management, indexing, and data handling introduce new levels of flexibil...
2 This feature is disabled by default in Milvus 2.5 and will be officially available in version 3....
3 With this release, Milvus integrates powerful new features like term-based search, clustering co...
4 Powered by Tantivy, Milvus 2.5 has built-in analyzers and sparse vector extraction, extending th...

從結果來看,基於Sparse-BM25的Hybrid Search可以準確找到與查詢相關的內容。相對於普通查詢,召回的內容準確度更高。

04. 總結

本文講述了Milvus 2.5中引入的Sparse-BM25基礎原理,以及如何利用BM25算法實現RAG開發中的Hybrid Search(混合搜索)實踐。通過引入Sparse-BM25算法,Milvus能夠在稀疏向量上執行高效的全文檢索,並與密集向量搜索相結合,提升檢索的召回率和精確度。

參考文檔:

文章轉自微信公眾號@Zilliz

上一篇:

深入探討RAG中的語義分塊方法:基于嵌入和大型語言模型的創新技術

下一篇:

時空圖神經網絡ST-GNN的概念以及Pytorch實現
#你可能也喜歡這些API文章!

我們有何不同?

API服務商零注冊

多API并行試用

數據驅動選型,提升決策效率

查看全部API→
??

熱門場景實測,選對API

#AI文本生成大模型API

對比大模型API的內容創意新穎性、情感共鳴力、商業轉化潛力

25個渠道
一鍵對比試用API 限時免費

#AI深度推理大模型API

對比大模型API的邏輯推理準確性、分析深度、可視化建議合理性

10個渠道
一鍵對比試用API 限時免費