语义搜索实战 #
本章介绍如何使用 Weaviate 构建语义搜索系统。
语义搜索概述 #
text
语义搜索架构:
┌─────────────────────────────────────────────────────────────┐
│ 语义搜索系统 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 用户查询 │
│ │ │
│ ▼ │
│ ┌─────────────┐ │
│ │ Embedding │ │
│ │ 模型 │ │
│ └─────────────┘ │
│ │ │
│ ▼ │
│ ┌─────────────┐ ┌─────────────┐ │
│ │ Weaviate │ ──→ │ 相似性搜索 │ │
│ │ 向量数据库 │ │ + 过滤 │ │
│ └─────────────┘ └─────────────┘ │
│ │ │
│ ▼ │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ 搜索结果 │ │
│ │ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ │ │
│ │ │ R1 │ │ R2 │ │ R3 │ │ R4 │ │ R5 │ │ │
│ │ │0.95 │ │0.92 │ │0.88 │ │0.85 │ │0.82 │ │ │
│ │ └─────┘ └─────┘ └─────┘ └─────┘ └─────┘ │ │
│ └─────────────────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────┘
环境准备 #
安装依赖 #
bash
# Install the Weaviate v4 Python client and Sentence-Transformers (client-side embeddings)
pip install weaviate-client sentence-transformers
启动 Weaviate #
yaml
version: '3.8'
services:
  weaviate:
    image: cr.weaviate.io/semitechnologies/weaviate:1.25.0
    ports:
      - "8080:8080"    # REST API
      - "50051:50051"  # gRPC — required by the v4 Python client
    environment:
      QUERY_DEFAULTS_LIMIT: 25
      AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
      PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
      # Vectors are generated client-side in this tutorial, so no
      # server-side vectorizer module is enabled.
      DEFAULT_VECTORIZER_MODULE: 'none'
      ENABLE_MODULES: ''
构建文档索引 #
创建 Collection #
python
# Client and collection setup shared by all examples below.
import weaviate
import weaviate.classes as wvc
# Connect to a local Weaviate instance (defaults: REST on 8080, gRPC on 50051).
client = weaviate.connect_to_local()
# Vectorizer "none": embeddings are computed client-side and supplied per object.
# HNSW vector index with cosine distance.
documents = client.collections.create(
name="Document",
vectorizer_config=wvc.config.Configure.Vectorizer.none(),
vector_index_config=wvc.config.Configure.VectorIndex.hnsw(
distance_metric=wvc.config.VectorDistance.COSINE
),
properties=[
wvc.config.Property(name="title", data_type=wvc.config.DataType.TEXT),
wvc.config.Property(name="content", data_type=wvc.config.DataType.TEXT),
wvc.config.Property(name="category", data_type=wvc.config.DataType.TEXT),
# tags is a TEXT_ARRAY so contains_any/contains_all filters apply per element
wvc.config.Property(name="tags", data_type=wvc.config.DataType.TEXT_ARRAY),
wvc.config.Property(name="source", data_type=wvc.config.DataType.TEXT)
]
)
准备示例数据 #
python
# Sample corpus: eight short Chinese tech articles used by all search examples.
# Each record carries title/content (embedded together), plus category, tags
# and source for filtered searches.
sample_documents = [
{
"title": "Weaviate 入门指南",
"content": "Weaviate 是一个云原生向量数据库,专为 AI 应用设计。它支持语义搜索、RAG、知识图谱等场景,使用 Go 语言编写,性能优异。",
"category": "教程",
"tags": ["Weaviate", "向量数据库", "入门"],
"source": "官方文档"
},
{
"title": "向量数据库对比分析",
"content": "主流向量数据库包括 Weaviate、Qdrant、Milvus、Pinecone 等。Weaviate 特点是模块化架构、内置向量化、GraphQL 支持。Qdrant 性能更高,Milvus 生态更丰富。",
"category": "技术分析",
"tags": ["向量数据库", "对比", "技术选型"],
"source": "技术博客"
},
{
"title": "RAG 应用架构设计",
"content": "RAG(检索增强生成)结合检索和生成能力。核心组件包括文档加载器、向量数据库、大语言模型。Weaviate 作为向量存储,提供高效的语义检索能力。",
"category": "架构设计",
"tags": ["RAG", "架构", "LLM"],
"source": "架构专栏"
},
{
"title": "语义搜索原理详解",
"content": "语义搜索基于向量相似性,而非关键词匹配。通过 Embedding 模型将文本转换为向量,在高维空间中计算相似度。常用模型包括 OpenAI、Cohere、Sentence Transformers。",
"category": "技术原理",
"tags": ["语义搜索", "Embedding", "向量"],
"source": "技术博客"
},
{
"title": "HNSW 索引算法",
"content": "HNSW(分层可导航小世界)是高效的近似最近邻搜索算法。通过多层图结构实现快速检索,时间复杂度 O(log n)。是向量数据库的核心索引技术。",
"category": "算法",
"tags": ["HNSW", "索引", "算法"],
"source": "论文解读"
},
{
"title": "多模态检索实践",
"content": "多模态检索支持文本、图像、音频等多种模态。CLIP 模型实现文本和图像的统一向量空间。Weaviate 的 multi2vec-clip 模块提供开箱即用的多模态支持。",
"category": "实践",
"tags": ["多模态", "CLIP", "检索"],
"source": "实践案例"
},
{
"title": "知识图谱构建方法",
"content": "知识图谱通过实体和关系构建结构化知识。Weaviate 支持对象间的交叉引用,实现知识图谱存储。结合向量搜索,支持语义关联查询。",
"category": "知识图谱",
"tags": ["知识图谱", "实体关系", "图数据库"],
"source": "技术专栏"
},
{
"title": "向量量化技术",
"content": "向量量化通过压缩向量减少内存占用。PQ(乘积量化)和 BQ(二值量化)是常用方法。量化后内存占用可减少 10-30 倍,召回率损失约 5-10%。",
"category": "技术",
"tags": ["量化", "压缩", "性能优化"],
"source": "技术博客"
}
]
生成向量并导入 #
python
from sentence_transformers import SentenceTransformer

# Multilingual MiniLM: one shared vector space for Chinese/English/etc. queries.
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')


def get_embedding(text):
    """Encode *text* into a plain ``list[float]`` embedding vector."""
    return model.encode(text).tolist()


documents = client.collections.get("Document")

# Batch import: the dynamic batcher sizes and flushes batches automatically.
with documents.batch.dynamic() as batch:
    for doc in sample_documents:
        # Embed title + content together so both fields influence retrieval.
        text = f"{doc['title']} {doc['content']}"
        vector = get_embedding(text)
        batch.add_object(
            properties={
                "title": doc["title"],
                "content": doc["content"],
                "category": doc["category"],
                "tags": doc["tags"],
                "source": doc["source"]
            },
            vector=vector
        )
print(f"Indexed {len(sample_documents)} documents")
语义搜索实现 #
基本语义搜索 #
python
from weaviate.classes.query import MetadataQuery


def semantic_search(query, limit=5):
    """Pure vector search: embed *query* and return the *limit* nearest documents."""
    query_vector = get_embedding(query)
    response = documents.query.near_vector(
        near_vector=query_vector,
        limit=limit,
        return_properties=["title", "content", "category", "source"],
        # In the v4 client obj.metadata.distance is None unless explicitly requested.
        return_metadata=MetadataQuery(distance=True)
    )
    return response


results = semantic_search("什么是向量数据库")
print(f"查询: 什么是向量数据库\n")
for obj in results.objects:
    print(f"标题: {obj.properties['title']}")
    print(f"分类: {obj.properties['category']}")
    print(f"距离: {obj.metadata.distance:.4f}")
    print(f"内容: {obj.properties['content'][:100]}...")
    print()
带过滤的语义搜索 #
python
from weaviate.classes.query import Filter


def filtered_semantic_search(query, category=None, limit=5):
    """Vector search, optionally restricted to a single *category*.

    A ``filters=None`` argument means "no filter" — the query degrades to a
    plain semantic search when no category is given.
    """
    query_vector = get_embedding(query)
    filters = None
    if category:
        filters = Filter.by_property("category").equal(category)
    response = documents.query.near_vector(
        near_vector=query_vector,
        filters=filters,
        limit=limit,
        return_properties=["title", "content", "category"]
    )
    return response


results = filtered_semantic_search("数据库", category="教程")
print("教程分类的搜索结果:")
for obj in results.objects:
    print(f"- {obj.properties['title']}")
混合搜索 #
python
from weaviate.classes.query import MetadataQuery


def hybrid_search(query, alpha=0.5, limit=5):
    """Hybrid BM25 + vector search.

    ``alpha`` weights the fusion: 0 → pure keyword (BM25), 1 → pure vector.
    Keyword matching is restricted to the title and content properties.
    """
    query_vector = get_embedding(query)
    response = documents.query.hybrid(
        query=query,
        vector=query_vector,
        alpha=alpha,
        limit=limit,
        query_properties=["title", "content"],
        return_properties=["title", "content", "category"],
        # obj.metadata.score is only populated when requested explicitly.
        return_metadata=MetadataQuery(score=True)
    )
    return response


results = hybrid_search("向量数据库", alpha=0.7)
print("混合搜索结果:")
for obj in results.objects:
    print(f"- {obj.properties['title']} (score: {obj.metadata.score:.4f})")
多语言搜索 #
python
def multilingual_search(query, limit=5):
    """Cross-lingual search.

    Works because the multilingual embedding model maps queries in different
    languages into the same vector space as the indexed (Chinese) documents —
    the query itself is just embedded and searched like any other.
    """
    query_vector = get_embedding(query)
    response = documents.query.near_vector(
        near_vector=query_vector,
        limit=limit,
        return_properties=["title", "content"]
    )
    return response


results = multilingual_search("vector database tutorial")
print("英文查询结果:")
for obj in results.objects:
    print(f"- {obj.properties['title']}")
results = multilingual_search("ベクトルデータベース")
print("\n日文查询结果:")
for obj in results.objects:
    print(f"- {obj.properties['title']}")
高级搜索功能 #
相似文档推荐 #
python
from weaviate.classes.query import MetadataQuery


def find_similar(doc_id, limit=5):
    """Return up to *limit* documents most similar to the document *doc_id*.

    Fetches ``limit + 1`` results because the source object is its own nearest
    neighbour and must be filtered out of the answer.
    """
    response = documents.query.near_object(
        near_object=doc_id,
        limit=limit + 1,
        return_properties=["title", "content"],
        # Distance metadata must be requested explicitly in the v4 client.
        return_metadata=MetadataQuery(distance=True)
    )
    similar_docs = [obj for obj in response.objects if str(obj.uuid) != doc_id]
    return similar_docs[:limit]


all_docs = documents.query.fetch_objects(limit=1)
doc_id = str(all_docs.objects[0].uuid)
similar = find_similar(doc_id)
print("相似文档推荐:")
for obj in similar:
    print(f"- {obj.properties['title']} (distance: {obj.metadata.distance:.4f})")
分类搜索 #
python
def search_by_category(query, categories, limit=5):
    """Vector search restricted to documents whose category is in *categories*."""
    query_vector = get_embedding(query)
    # The v4 client has no `equal_any`; the documented way to match a text
    # property against several candidate values is `contains_any`.
    filters = Filter.by_property("category").contains_any(categories)
    response = documents.query.near_vector(
        near_vector=query_vector,
        filters=filters,
        limit=limit,
        return_properties=["title", "content", "category"]
    )
    return response


results = search_by_category("搜索技术", ["技术", "算法"])
print("技术/算法分类的搜索结果:")
for obj in results.objects:
    print(f"- [{obj.properties['category']}] {obj.properties['title']}")
标签过滤搜索 #
python
def search_by_tags(query, tags, limit=5):
    """Vector search restricted to documents carrying any of the given *tags*."""
    query_vector = get_embedding(query)
    # contains_any on a TEXT_ARRAY property: matches if at least one element
    # of the document's tags equals one of the requested tags.
    filters = Filter.by_property("tags").contains_any(tags)
    response = documents.query.near_vector(
        near_vector=query_vector,
        filters=filters,
        limit=limit,
        return_properties=["title", "content", "tags"]
    )
    return response


results = search_by_tags("数据库", ["向量数据库", "RAG"])
print("包含指定标签的搜索结果:")
for obj in results.objects:
    print(f"- {obj.properties['title']}")
    print(f" 标签: {obj.properties['tags']}")
搜索结果处理 #
重排序 #
python
def _cosine_similarity(vec_a, vec_b):
    """Exact cosine similarity between two equal-length numeric vectors."""
    dot = sum(a * b for a, b in zip(vec_a, vec_b))
    norm_a = sum(a ** 2 for a in vec_a) ** 0.5
    norm_b = sum(b ** 2 for b in vec_b) ** 0.5
    return dot / (norm_a * norm_b)


def search_with_rerank(query, limit=10, top_k=5):
    """Over-fetch *limit* candidates from the index, re-score them with exact
    cosine similarity against the query embedding, and return the best *top_k*.

    Returns a list of dicts with keys ``title``, ``content``, ``similarity``,
    sorted by descending similarity.
    """
    query_vector = get_embedding(query)
    response = documents.query.near_vector(
        near_vector=query_vector,
        limit=limit,
        return_properties=["title", "content"]
    )
    results = []
    for obj in response.objects:
        content = obj.properties["content"]
        # Re-embed only the content — the indexed vector also included the
        # title, so this rerank scores a slightly different representation.
        content_vector = get_embedding(content)
        results.append({
            "title": obj.properties["title"],
            "content": content,
            "similarity": _cosine_similarity(query_vector, content_vector)
        })
    results.sort(key=lambda x: x["similarity"], reverse=True)
    return results[:top_k]


results = search_with_rerank("向量搜索")
print("重排序后的结果:")
for r in results:
    print(f"- {r['title']} (similarity: {r['similarity']:.4f})")
结果高亮 #
python
def highlight_text(text, query, max_length=200):
    """Wrap case-insensitive matches of each query word in ``**bold**`` markers.

    Only the first occurrence of each whitespace-separated query word (length
    > 1) is highlighted; the original casing of the matched span is preserved.
    The final string is truncated to *max_length* characters plus "...".
    """
    query_words = query.lower().split()
    text_lower = text.lower()
    for word in query_words:
        if len(word) > 1:  # single characters are too noisy to highlight
            idx = text_lower.find(word)
            if idx != -1:
                text = text[:idx] + f"**{text[idx:idx+len(word)]}**" + text[idx+len(word):]
                # Re-sync the lowercase shadow so later offsets stay correct.
                text_lower = text.lower()
    if len(text) > max_length:
        text = text[:max_length] + "..."
    return text
from weaviate.classes.query import MetadataQuery


def search_with_highlight(query, limit=5):
    """Vector search whose results carry query-word-highlighted content.

    Returns a list of dicts with keys ``title``, ``content_highlighted``
    and ``distance``.
    """
    query_vector = get_embedding(query)
    response = documents.query.near_vector(
        near_vector=query_vector,
        limit=limit,
        return_properties=["title", "content"],
        # Distance is None in the v4 client unless requested explicitly.
        return_metadata=MetadataQuery(distance=True)
    )
    results = []
    for obj in response.objects:
        results.append({
            "title": obj.properties["title"],
            "content_highlighted": highlight_text(obj.properties["content"], query),
            "distance": obj.metadata.distance
        })
    return results


results = search_with_highlight("向量数据库")
for r in results:
    print(f"标题: {r['title']}")
    print(f"内容: {r['content_highlighted']}")
    print()
搜索性能优化 #
批量查询 #
python
def batch_search(queries, limit=5):
    """Run one vector search per query; returns ``{query: [titles]}``.

    All queries are embedded up front (one model pass each), then the
    searches are issued sequentially against the collection.
    """
    query_vectors = [get_embedding(q) for q in queries]
    results = {}
    for query, vector in zip(queries, query_vectors):
        response = documents.query.near_vector(
            near_vector=vector,
            limit=limit,
            return_properties=["title"]
        )
        results[query] = [obj.properties["title"] for obj in response.objects]
    return results


queries = ["向量数据库", "语义搜索", "RAG 应用"]
results = batch_search(queries)
for query, titles in results.items():
    print(f"查询: {query}")
    for title in titles:
        print(f" - {title}")
缓存优化 #
python
from functools import lru_cache


@lru_cache(maxsize=1000)
def cached_embedding(text):
    """Memoized embedding: repeated queries skip the model forward pass.

    NOTE: the cached list object is shared between callers — treat the
    returned vector as read-only.
    """
    return model.encode(text).tolist()


def cached_search(query, limit=5):
    """Vector search that reuses cached query embeddings for repeat queries."""
    query_vector = cached_embedding(query)
    response = documents.query.near_vector(
        near_vector=query_vector,
        limit=limit
    )
    return response
完整示例 #
python
import weaviate
import weaviate.classes as wvc
from sentence_transformers import SentenceTransformer
from weaviate.classes.query import Filter, MetadataQuery

client = weaviate.connect_to_local()
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')


def get_embedding(text):
    """Encode *text* into a plain ``list[float]`` embedding vector."""
    return model.encode(text).tolist()


# Start from a clean slate, then recreate the collection.
# Vectorizer "none": embeddings are supplied client-side with each object.
client.collections.delete("Document")
documents = client.collections.create(
    name="Document",
    vectorizer_config=wvc.config.Configure.Vectorizer.none(),
    properties=[
        wvc.config.Property(name="title", data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name="content", data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name="category", data_type=wvc.config.DataType.TEXT)
    ]
)

sample_docs = [
    {"title": "Weaviate 简介", "content": "Weaviate 是云原生向量数据库", "category": "教程"},
    {"title": "向量搜索原理", "content": "向量搜索基于高维向量相似性", "category": "技术"},
    {"title": "RAG 应用", "content": "RAG 结合检索和生成能力", "category": "应用"}
]

# Batch import: embed title + content together for each document.
with documents.batch.dynamic() as batch:
    for doc in sample_docs:
        text = f"{doc['title']} {doc['content']}"
        vector = get_embedding(text)
        batch.add_object(properties=doc, vector=vector)
print("文档索引完成\n")

query = "什么是向量数据库"
query_vector = get_embedding(query)
response = documents.query.near_vector(
    near_vector=query_vector,
    limit=3,
    return_properties=["title", "content", "category"],
    # Distance metadata must be requested explicitly in the v4 client.
    return_metadata=MetadataQuery(distance=True)
)
print(f"查询: {query}\n")
print("搜索结果:")
for obj in response.objects:
    print(f"- {obj.properties['title']} [{obj.properties['category']}]")
    print(f" 距离: {obj.metadata.distance:.4f}")
client.close()
小结 #
本章介绍了使用 Weaviate 构建语义搜索系统:
- 文档索引构建
- 基本语义搜索
- 混合搜索
- 多语言搜索
- 高级搜索功能
- 搜索结果处理
- 性能优化
下一步 #
继续学习 RAG 应用,了解如何构建检索增强生成系统!
最后更新:2026-04-04