语义搜索实战 #

本章介绍如何使用 Weaviate 构建语义搜索系统。

语义搜索概述 #

text
语义搜索架构:

┌─────────────────────────────────────────────────────────────┐
│                      语义搜索系统                            │
├─────────────────────────────────────────────────────────────┤
│                                                              │
│  用户查询                                                    │
│      │                                                       │
│      ▼                                                       │
│  ┌─────────────┐                                             │
│  │  Embedding  │                                             │
│  │   模型      │                                             │
│  └─────────────┘                                             │
│      │                                                       │
│      ▼                                                       │
│  ┌─────────────┐     ┌─────────────┐                        │
│  │  Weaviate   │ ──→ │  相似性搜索  │                        │
│  │  向量数据库  │     │  + 过滤     │                        │
│  └─────────────┘     └─────────────┘                        │
│                              │                               │
│                              ▼                               │
│  ┌─────────────────────────────────────────────────────┐   │
│  │                    搜索结果                          │   │
│  │  ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐          │   │
│  │  │ R1  │ │ R2  │ │ R3  │ │ R4  │ │ R5  │          │   │
│  │  │0.95 │ │0.92 │ │0.88 │ │0.85 │ │0.82 │          │   │
│  │  └─────┘ └─────┘ └─────┘ └─────┘ └─────┘          │   │
│  └─────────────────────────────────────────────────────┘   │
│                                                              │
└─────────────────────────────────────────────────────────────┘

环境准备 #

安装依赖 #

bash
pip install weaviate-client sentence-transformers

启动 Weaviate #

yaml
version: '3.8'
services:
  weaviate:
    image: cr.weaviate.io/semitechnologies/weaviate:1.25.0
    ports:
      - "8080:8080"
      - "50051:50051"
    environment:
      QUERY_DEFAULTS_LIMIT: 25
      AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
      PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
      DEFAULT_VECTORIZER_MODULE: 'none'
      ENABLE_MODULES: ''

构建文档索引 #

创建 Collection #

python
import weaviate
import weaviate.classes as wvc

# Connect to the local Weaviate instance (defaults: REST on 8080, gRPC on
# 50051 — matching the docker-compose file above).
client = weaviate.connect_to_local()

# Create the "Document" collection.  Vectorizer "none" means we supply our
# own vectors at import time; the HNSW index is configured for cosine
# distance to match the sentence-transformer embeddings used below.
documents = client.collections.create(
    name="Document",
    vectorizer_config=wvc.config.Configure.Vectorizer.none(),
    vector_index_config=wvc.config.Configure.VectorIndex.hnsw(
        distance_metric=wvc.config.VectorDistance.COSINE
    ),
    properties=[
        # Scalar text fields plus a text array for the tag list.
        wvc.config.Property(name="title", data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name="content", data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name="category", data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name="tags", data_type=wvc.config.DataType.TEXT_ARRAY),
        wvc.config.Property(name="source", data_type=wvc.config.DataType.TEXT)
    ]
)

准备示例数据 #

python
# Demo corpus: eight short Chinese tech articles.  Every record carries
# exactly the fields declared in the "Document" collection schema
# (title / content / category / tags / source).
sample_documents = [
    {
        "title": "Weaviate 入门指南",
        "content": "Weaviate 是一个云原生向量数据库,专为 AI 应用设计。它支持语义搜索、RAG、知识图谱等场景,使用 Go 语言编写,性能优异。",
        "category": "教程",
        "tags": ["Weaviate", "向量数据库", "入门"],
        "source": "官方文档"
    },
    {
        "title": "向量数据库对比分析",
        "content": "主流向量数据库包括 Weaviate、Qdrant、Milvus、Pinecone 等。Weaviate 特点是模块化架构、内置向量化、GraphQL 支持。Qdrant 性能更高,Milvus 生态更丰富。",
        "category": "技术分析",
        "tags": ["向量数据库", "对比", "技术选型"],
        "source": "技术博客"
    },
    {
        "title": "RAG 应用架构设计",
        "content": "RAG(检索增强生成)结合检索和生成能力。核心组件包括文档加载器、向量数据库、大语言模型。Weaviate 作为向量存储,提供高效的语义检索能力。",
        "category": "架构设计",
        "tags": ["RAG", "架构", "LLM"],
        "source": "架构专栏"
    },
    {
        "title": "语义搜索原理详解",
        "content": "语义搜索基于向量相似性,而非关键词匹配。通过 Embedding 模型将文本转换为向量,在高维空间中计算相似度。常用模型包括 OpenAI、Cohere、Sentence Transformers。",
        "category": "技术原理",
        "tags": ["语义搜索", "Embedding", "向量"],
        "source": "技术博客"
    },
    {
        "title": "HNSW 索引算法",
        "content": "HNSW(分层可导航小世界)是高效的近似最近邻搜索算法。通过多层图结构实现快速检索,时间复杂度 O(log n)。是向量数据库的核心索引技术。",
        "category": "算法",
        "tags": ["HNSW", "索引", "算法"],
        "source": "论文解读"
    },
    {
        "title": "多模态检索实践",
        "content": "多模态检索支持文本、图像、音频等多种模态。CLIP 模型实现文本和图像的统一向量空间。Weaviate 的 multi2vec-clip 模块提供开箱即用的多模态支持。",
        "category": "实践",
        "tags": ["多模态", "CLIP", "检索"],
        "source": "实践案例"
    },
    {
        "title": "知识图谱构建方法",
        "content": "知识图谱通过实体和关系构建结构化知识。Weaviate 支持对象间的交叉引用,实现知识图谱存储。结合向量搜索,支持语义关联查询。",
        "category": "知识图谱",
        "tags": ["知识图谱", "实体关系", "图数据库"],
        "source": "技术专栏"
    },
    {
        "title": "向量量化技术",
        "content": "向量量化通过压缩向量减少内存占用。PQ(乘积量化)和 BQ(二值量化)是常用方法。量化后内存占用可减少 10-30 倍,召回率损失约 5-10%。",
        "category": "技术",
        "tags": ["量化", "压缩", "性能优化"],
        "source": "技术博客"
    }
]

生成向量并导入 #

python
from sentence_transformers import SentenceTransformer

# Multilingual MiniLM sentence encoder — handles Chinese, English, Japanese
# etc. in one vector space, which is what the cross-language queries later
# in this chapter rely on.
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

def get_embedding(text):
    """Encode *text* into a plain list of floats (Weaviate expects a list,
    not a numpy array)."""
    return model.encode(text).tolist()

documents = client.collections.get("Document")

# Dynamic batching: the client decides batch size and flushes automatically
# when the context manager exits.
with documents.batch.dynamic() as batch:
    for doc in sample_documents:
        # Embed title and content together so both contribute to the vector.
        text = f"{doc['title']} {doc['content']}"
        vector = get_embedding(text)
        
        batch.add_object(
            properties={
                "title": doc["title"],
                "content": doc["content"],
                "category": doc["category"],
                "tags": doc["tags"],
                "source": doc["source"]
            },
            vector=vector
        )

print(f"Indexed {len(sample_documents)} documents")

语义搜索实现 #

基本语义搜索 #

python
def semantic_search(query, limit=5):
    """Embed *query* locally and return the nearest documents by vector
    distance (cosine, per the collection's index configuration)."""
    return documents.query.near_vector(
        near_vector=get_embedding(query),
        limit=limit,
        return_properties=["title", "content", "category", "source"],
    )

results = semantic_search("什么是向量数据库")

print(f"查询: 什么是向量数据库\n")
for obj in results.objects:
    props = obj.properties
    print(f"标题: {props['title']}")
    print(f"分类: {props['category']}")
    print(f"距离: {obj.metadata.distance:.4f}")
    print(f"内容: {props['content'][:100]}...")
    print()

带过滤的语义搜索 #

python
from weaviate.classes.query import Filter

def filtered_semantic_search(query, category=None, limit=5):
    """Semantic search, optionally restricted to a single *category*.

    When *category* is falsy no filter is applied and the search covers
    the whole collection.
    """
    where = Filter.by_property("category").equal(category) if category else None
    return documents.query.near_vector(
        near_vector=get_embedding(query),
        filters=where,
        limit=limit,
        return_properties=["title", "content", "category"],
    )

results = filtered_semantic_search("数据库", category="教程")

print("教程分类的搜索结果:")
for obj in results.objects:
    print(f"- {obj.properties['title']}")

混合搜索 #

python
def hybrid_search(query, alpha=0.5, limit=5):
    """Combine keyword (BM25) and vector scoring in a single query.

    *alpha* weights the two signals: 0 is pure keyword search, 1 is pure
    vector search.  Keyword matching is limited to title and content.
    """
    vec = get_embedding(query)
    return documents.query.hybrid(
        query=query,
        vector=vec,
        alpha=alpha,
        limit=limit,
        query_properties=["title", "content"],
        return_properties=["title", "content", "category"],
    )

results = hybrid_search("向量数据库", alpha=0.7)

print("混合搜索结果:")
for obj in results.objects:
    print(f"- {obj.properties['title']} (score: {obj.metadata.score:.4f})")

多语言搜索 #

python
def multilingual_search(query, limit=5):
    """Plain near_vector search; cross-language matching works because the
    multilingual embedding model maps all languages into one vector space."""
    return documents.query.near_vector(
        near_vector=get_embedding(query),
        limit=limit,
        return_properties=["title", "content"],
    )

# Same Chinese corpus queried in English and in Japanese.
for heading, q in (
    ("英文查询结果:", "vector database tutorial"),
    ("\n日文查询结果:", "ベクトルデータベース"),
):
    results = multilingual_search(q)
    print(heading)
    for obj in results.objects:
        print(f"- {obj.properties['title']}")

高级搜索功能 #

相似文档推荐 #

python
def find_similar(doc_id, limit=5):
    """Recommend up to *limit* documents similar to the object *doc_id*.

    Fetches limit + 1 neighbours because the source object is always its
    own nearest neighbour, then drops it from the result list.
    """
    response = documents.query.near_object(
        near_object=doc_id,
        limit=limit + 1,
        return_properties=["title", "content"],
    )
    neighbours = []
    for obj in response.objects:
        if str(obj.uuid) != doc_id:
            neighbours.append(obj)
    return neighbours[:limit]

all_docs = documents.query.fetch_objects(limit=1)
doc_id = str(all_docs.objects[0].uuid)

similar = find_similar(doc_id)

print("相似文档推荐:")
for obj in similar:
    print(f"- {obj.properties['title']} (distance: {obj.metadata.distance:.4f})")

分类搜索 #

python
def search_by_category(query, categories, limit=5):
    """Semantic search restricted to documents whose category is one of
    *categories*.

    Args:
        query: free-text query, embedded locally with get_embedding().
        categories: list of allowed category names.
        limit: maximum number of results to return.
    """
    query_vector = get_embedding(query)
    
    # BUG FIX: the v4 Filter API has no `equal_any` method; matching a
    # property against any value in a list is done with `contains_any`
    # (the original raised AttributeError at runtime).
    filters = Filter.by_property("category").contains_any(categories)
    
    response = documents.query.near_vector(
        near_vector=query_vector,
        filters=filters,
        limit=limit,
        return_properties=["title", "content", "category"]
    )
    
    return response

results = search_by_category("搜索技术", ["技术", "算法"])

print("技术/算法分类的搜索结果:")
for obj in results.objects:
    print(f"- [{obj.properties['category']}] {obj.properties['title']}")

标签过滤搜索 #

python
def search_by_tags(query, tags, limit=5):
    """Semantic search over documents carrying at least one of *tags*.

    `contains_any` on the TEXT_ARRAY property matches when the stored tag
    list intersects the given list.
    """
    tag_filter = Filter.by_property("tags").contains_any(tags)
    return documents.query.near_vector(
        near_vector=get_embedding(query),
        filters=tag_filter,
        limit=limit,
        return_properties=["title", "content", "tags"],
    )

results = search_by_tags("数据库", ["向量数据库", "RAG"])

print("包含指定标签的搜索结果:")
for obj in results.objects:
    print(f"- {obj.properties['title']}")
    print(f"  标签: {obj.properties['tags']}")

搜索结果处理 #

重排序 #

python
def search_with_rerank(query, limit=10, top_k=5):
    """Fetch *limit* candidates via ANN search, then rerank and return the
    best *top_k* by exact cosine similarity against each document body.

    Note: the index was built from title + content while the rerank embeds
    content only, so the ordering can genuinely change.
    """
    query_vector = get_embedding(query)
    # FIX: the query-vector norm is loop-invariant — the original recomputed
    # it for every candidate; compute it once up front.
    query_norm = sum(a * a for a in query_vector) ** 0.5
    
    response = documents.query.near_vector(
        near_vector=query_vector,
        limit=limit,
        return_properties=["title", "content"]
    )
    
    results = []
    for obj in response.objects:
        content = obj.properties["content"]
        content_vector = get_embedding(content)
        
        # Exact cosine similarity: dot product over the product of norms.
        dot = sum(a * b for a, b in zip(query_vector, content_vector))
        content_norm = sum(b * b for b in content_vector) ** 0.5
        similarity = dot / (query_norm * content_norm)
        
        results.append({
            "title": obj.properties["title"],
            "content": content,
            "similarity": similarity
        })
    
    # Highest similarity first.
    results.sort(key=lambda x: x["similarity"], reverse=True)
    
    return results[:top_k]

results = search_with_rerank("向量搜索")

print("重排序后的结果:")
for r in results:
    print(f"- {r['title']} (similarity: {r['similarity']:.4f})")

结果高亮 #

python
def highlight_text(text, query, max_length=200):
    """Wrap the first occurrence of each query word in **bold** markers.

    Matching is case-insensitive but the original casing is preserved in
    the output.  Single-character words are skipped.  The result is
    truncated to *max_length* characters followed by an ellipsis.
    """
    lowered = text.lower()
    for word in query.lower().split():
        if len(word) <= 1:
            continue
        pos = lowered.find(word)
        if pos == -1:
            continue
        end = pos + len(word)
        text = f"{text[:pos]}**{text[pos:end]}**{text[end:]}"
        lowered = text.lower()
    
    return text if len(text) <= max_length else text[:max_length] + "..."

def search_with_highlight(query, limit=5):
    """Semantic search whose results carry the content with query terms
    bold-highlighted via highlight_text()."""
    hits = documents.query.near_vector(
        near_vector=get_embedding(query),
        limit=limit,
        return_properties=["title", "content"],
    )
    return [
        {
            "title": hit.properties["title"],
            "content_highlighted": highlight_text(hit.properties["content"], query),
            "distance": hit.metadata.distance,
        }
        for hit in hits.objects
    ]

results = search_with_highlight("向量数据库")

for r in results:
    print(f"标题: {r['title']}")
    print(f"内容: {r['content_highlighted']}")
    print()

搜索性能优化 #

批量查询 #

python
def batch_search(queries, limit=5):
    """Run one near_vector search per query; returns {query: [titles]}.

    All embeddings are computed up front, separating the encode step from
    the per-query searches.
    """
    vectors = [get_embedding(q) for q in queries]
    
    out = {}
    for q, vec in zip(queries, vectors):
        hits = documents.query.near_vector(
            near_vector=vec,
            limit=limit,
            return_properties=["title"],
        )
        out[q] = [hit.properties["title"] for hit in hits.objects]
    return out

queries = ["向量数据库", "语义搜索", "RAG 应用"]
results = batch_search(queries)

for query, titles in results.items():
    print(f"查询: {query}")
    for title in titles:
        print(f"  - {title}")

缓存优化 #

python
from functools import lru_cache

@lru_cache(maxsize=1000)
def cached_embedding(text):
    """Memoised embedding lookup — repeated query strings skip the model.

    The cache is bounded (LRU, 1000 entries) so memory use stays fixed
    even with an unbounded stream of distinct queries.
    """
    return model.encode(text).tolist()

def cached_search(query, limit=5):
    """Semantic search whose query embedding is served from the LRU cache."""
    return documents.query.near_vector(
        near_vector=cached_embedding(query),
        limit=limit,
    )

完整示例 #

python
import weaviate
import weaviate.classes as wvc
from sentence_transformers import SentenceTransformer
from weaviate.classes.query import Filter

# End-to-end example: connect, rebuild the collection, index three small
# documents, run one semantic query, and close the connection.
client = weaviate.connect_to_local()

# Multilingual sentence encoder; vectors are supplied manually below.
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

def get_embedding(text):
    """Encode *text* into a plain list of floats for Weaviate."""
    return model.encode(text).tolist()

# Drop any previous run's collection so the example is repeatable.
# NOTE(review): assumes deleting a non-existent collection is a no-op in
# the v4 client — confirm, otherwise guard with client.collections.exists().
client.collections.delete("Document")

# Minimal schema: three text fields, vectors provided by us ("none").
documents = client.collections.create(
    name="Document",
    vectorizer_config=wvc.config.Configure.Vectorizer.none(),
    properties=[
        wvc.config.Property(name="title", data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name="content", data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name="category", data_type=wvc.config.DataType.TEXT)
    ]
)

sample_docs = [
    {"title": "Weaviate 简介", "content": "Weaviate 是云原生向量数据库", "category": "教程"},
    {"title": "向量搜索原理", "content": "向量搜索基于高维向量相似性", "category": "技术"},
    {"title": "RAG 应用", "content": "RAG 结合检索和生成能力", "category": "应用"}
]

# Index each document; the vector is computed from title + content so both
# fields influence retrieval.
with documents.batch.dynamic() as batch:
    for doc in sample_docs:
        text = f"{doc['title']} {doc['content']}"
        vector = get_embedding(text)
        batch.add_object(properties=doc, vector=vector)

print("文档索引完成\n")

# Single semantic query against the freshly built index.
query = "什么是向量数据库"
query_vector = get_embedding(query)

response = documents.query.near_vector(
    near_vector=query_vector,
    limit=3,
    return_properties=["title", "content", "category"]
)

print(f"查询: {query}\n")
print("搜索结果:")
for obj in response.objects:
    print(f"- {obj.properties['title']} [{obj.properties['category']}]")
    print(f"  距离: {obj.metadata.distance:.4f}")

# Release the client's HTTP/gRPC connections.
client.close()

小结 #

本章介绍了使用 Weaviate 构建语义搜索系统:

  • 文档索引构建
  • 基本语义搜索
  • 混合搜索
  • 多语言搜索
  • 高级搜索功能
  • 搜索结果处理
  • 性能优化

下一步 #

继续学习 RAG 应用,了解如何构建检索增强生成系统!

最后更新:2026-04-04