快速开始 #

本章通过实际示例帮助你快速上手 Weaviate。

环境准备 #

安装依赖 #

bash

pip install weaviate-client

启动 Weaviate #

创建 docker-compose.yml：

yaml

version: '3.8'
services:
  weaviate:
    image: cr.weaviate.io/semitechnologies/weaviate:1.25.0
    ports:
      - "8080:8080"
      - "50051:50051"
    environment:
      QUERY_DEFAULTS_LIMIT: 25
      AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
      PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
      DEFAULT_VECTORIZER_MODULE: 'none'
      ENABLE_MODULES: ''
      CLUSTER_HOSTNAME: 'node1'

启动服务：

bash

docker-compose up -d

连接 Weaviate #

python

import weaviate

# Connect to the local instance: REST on 8080 and gRPC on 50051, the two
# ports published by the docker-compose file above.
local_endpoint = dict(host="localhost", port=8080, grpc_port=50051)
client = weaviate.connect_to_local(**local_endpoint)

print(f"Connected: {client.is_ready()}")

创建 Collection #

基本创建 #

python

import weaviate.classes as wvc

# Schema of the "Article" collection as (property name, data type) pairs.
article_schema = [
    ("title", wvc.config.DataType.TEXT),
    ("content", wvc.config.DataType.TEXT),
    ("category", wvc.config.DataType.TEXT),
    ("views", wvc.config.DataType.INT),
]

# The vectorizer is set to none; later examples pass their own vectors.
articles = client.collections.create(
    name="Article",
    vectorizer_config=wvc.config.Configure.Vectorizer.none(),
    properties=[
        wvc.config.Property(name=prop_name, data_type=prop_type)
        for prop_name, prop_type in article_schema
    ],
)

print("Collection 'Article' created successfully")

查看所有 Collections #

python

# list_all() returns a dict keyed by collection name. Iterating the dict
# yields the names themselves (plain strings), so print them directly
# instead of reading a `.name` attribute that strings do not have.
collections = client.collections.list_all()

for collection_name in collections:
    print(f"Collection: {collection_name}")

获取 Collection 配置 #

python

# Fetch the stored configuration of the "Article" collection.
articles = client.collections.get("Article")
config = articles.config.get()

property_names = [prop.name for prop in config.properties]
print(f"Vectorizers: {config.vectorizer_config}")
print(f"Properties: {property_names}")

插入数据 #

插入单个对象 #

python

articles = client.collections.get("Article")

# Property values for the new object; keys match the collection's properties.
first_article = {
    "title": "Weaviate 入门指南",
    "content": "Weaviate 是一个云原生向量数据库，专为 AI 应用设计。它支持语义搜索、RAG 等场景。",
    "category": "技术",
    "views": 1000
}

# insert() returns the UUID of the created object.
uuid = articles.data.insert(first_article)
print(f"Created object: {uuid}")

插入带自定义向量 #

python

import numpy as np

# Random 128-dimensional placeholder vector standing in for a real embedding.
vector = np.random.rand(128).tolist()

article_properties = {
    "title": "向量搜索原理",
    "content": "向量搜索通过计算高维向量之间的相似性来找到最相关的结果。",
    "category": "技术",
    "views": 800
}
uuid = articles.data.insert(properties=article_properties, vector=vector)

批量插入 #

python

sample_data = [
    {
        "title": "RAG 应用实战",
        "content": "RAG（检索增强生成）结合了检索和生成能力，为大语言模型提供外部知识。",
        "category": "AI",
        "views": 1500
    },
    {
        "title": "语义搜索详解",
        "content": "语义搜索理解查询意图，而非简单关键词匹配，提供更精准的搜索结果。",
        "category": "搜索",
        "views": 900
    },
    {
        "title": "知识图谱构建",
        "content": "知识图谱通过实体和关系构建结构化知识，支持推理和关联查询。",
        "category": "AI",
        "views": 700
    },
    {
        "title": "向量数据库对比",
        "content": "主流向量数据库包括 Weaviate、Qdrant、Milvus 等，各有特点和适用场景。",
        "category": "技术",
        "views": 2000
    },
    {
        "title": "Embedding 模型选择",
        "content": "选择合适的 Embedding 模型对向量搜索效果至关重要，需考虑维度、性能等因素。",
        "category": "AI",
        "views": 600
    }
]

# Queue every sample article with a random 128-dimensional placeholder vector.
with articles.batch.dynamic() as batch:
    for item in sample_data:
        vector = np.random.rand(128).tolist()
        batch.add_object(
            properties=item,
            vector=vector
        )

# Per-object import errors are collected on the batch object rather than
# raised, so inspect them before reporting success.
failed_objects = articles.batch.failed_objects
if failed_objects:
    print(f"Batch insert finished with {len(failed_objects)} failed objects")
    print(f"First failure: {failed_objects[0]}")
else:
    print("Batch insert completed")

查询数据 #

根据 ID 查询 #

python

articles = client.collections.get("Article")

# `uuid` is the identifier returned by the most recent insert above.
article = articles.query.fetch_object_by_id(uuid)

for label, key in (("Title", "title"), ("Content", "content")):
    print(f"{label}: {article.properties[key]}")

获取所有对象 #

python

# Fetch up to 10 objects and list each title with its view count.
response = articles.query.fetch_objects(limit=10)

for obj in response.objects:
    props = obj.properties
    print(f"- {props['title']} (views: {props['views']})")

过滤查询 #

python

from weaviate.classes.query import Filter

# Keep only objects whose "category" property equals "AI".
category_is_ai = Filter.by_property("category").equal("AI")
response = articles.query.fetch_objects(filters=category_is_ai, limit=10)

print("AI 类别的文章:")
for obj in response.objects:
    print(f"- {obj.properties['title']}")

复杂过滤 #

python

# Two conditions combined with `&`: both must hold for an object to match.
many_views = Filter.by_property("views").greater_than(500)
is_tech = Filter.by_property("category").equal("技术")
response = articles.query.fetch_objects(filters=many_views & is_tech, limit=10)

print("阅读量 > 500 的技术文章:")
for obj in response.objects:
    props = obj.properties
    print(f"- {props['title']} (views: {props['views']})")

向量搜索 #

准备向量搜索数据 #

python

import weaviate.classes as wvc
import numpy as np

# Drop any previous "Document" collection so the example can be re-run.
client.collections.delete("Document")

# The distance-metric enum exported by weaviate.classes.config is
# `VectorDistances` (plural); the singular `VectorDistance` does not exist
# and raises AttributeError.
documents = client.collections.create(
    name="Document",
    vectorizer_config=wvc.config.Configure.Vectorizer.none(),
    vector_index_config=wvc.config.Configure.VectorIndex.hnsw(
        distance_metric=wvc.config.VectorDistances.COSINE
    ),
    properties=[
        wvc.config.Property(name="text", data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name="source", data_type=wvc.config.DataType.TEXT)
    ]
)

docs_data = [
    ("Weaviate 是一个开源的向量数据库", "官方文档"),
    ("向量数据库用于存储和检索高维向量", "技术博客"),
    ("语义搜索基于向量相似性进行检索", "教程"),
    ("RAG 结合检索和生成提升 LLM 能力", "论文"),
    ("知识图谱表示实体间的语义关系", "百科"),
    ("Embedding 将文本转换为向量表示", "技术博客"),
    ("HNSW 是高效的向量索引算法", "论文"),
    ("GraphQL 提供灵活的查询能力", "官方文档")
]

def simple_embedding(text):
    """Return a deterministic 128-dimensional pseudo-embedding for ``text``.

    This is a placeholder for a real embedding model: the values are uniform
    random numbers in [0, 1) and carry no semantic meaning.

    The seed comes from a SHA-256 digest of the text rather than the built-in
    ``hash()``. String hashes are salted per interpreter process, so
    ``hash(text)`` would yield different vectors on every run and stored
    vectors would not match queries issued from another process. A private
    generator is used so the global NumPy random state is left untouched.

    Args:
        text: The string to embed.

    Returns:
        A list of 128 floats.
    """
    import hashlib  # local import keeps this snippet self-contained

    digest = hashlib.sha256(text.encode("utf-8")).digest()
    seed = int.from_bytes(digest[:8], "big")
    return np.random.default_rng(seed).random(128).tolist()

# Queue every (text, source) pair together with its placeholder embedding.
with documents.batch.dynamic() as batch:
    for text, source in docs_data:
        vector = simple_embedding(text)
        batch.add_object(
            properties={"text": text, "source": source},
            vector=vector
        )

# Per-object import errors are collected on the batch object rather than
# raised, so inspect them before reporting success.
failed_objects = documents.batch.failed_objects
if failed_objects:
    print(f"Document insert finished with {len(failed_objects)} failed objects")
    print(f"First failure: {failed_objects[0]}")
else:
    print("Documents inserted")

向量相似性搜索 #

python

documents = client.collections.get("Document")

# Embed the query with the same helper that was used for the stored documents.
query_text = "向量数据库是什么"
query_vector = simple_embedding(query_text)

response = documents.query.near_vector(near_vector=query_vector, limit=3)

print(f"查询: {query_text}")
print("\n最相似的文档:")
for obj in response.objects:
    props = obj.properties
    print(f"- {props['text']} (source: {props['source']})")

带过滤的向量搜索 #

python

# Same vector query as before, restricted to objects whose "source" is "技术博客".
from_tech_blog = Filter.by_property("source").equal("技术博客")
response = documents.query.near_vector(
    near_vector=query_vector,
    filters=from_tech_blog,
    limit=3,
)

print("技术博客来源的结果:")
for obj in response.objects:
    print(f"- {obj.properties['text']}")

更新和删除 #

更新对象 #

python

articles = client.collections.get("Article")

# Take the first object returned and bump its view count by 100.
article = articles.query.fetch_objects(limit=1).objects[0]

updated_properties = {
    key: article.properties[key] for key in ("title", "content", "category")
}
updated_properties["views"] = article.properties["views"] + 100

articles.data.update(uuid=article.uuid, properties=updated_properties)

print("Object updated")

删除对象 #

python

articles = client.collections.get("Article")

# Delete the first object returned by an unfiltered fetch.
article = articles.query.fetch_objects(limit=1).objects[0]
target_uuid = article.uuid

articles.data.delete_by_id(target_uuid)
print(f"Deleted object: {target_uuid}")

批量删除 #

python

from weaviate.classes.query import Filter

# Remove every object whose "views" value is below 100.
few_views = Filter.by_property("views").less_than(100)
result = articles.data.delete_many(where=few_views)

print(f"Deleted {result.successful} objects")

删除 Collection #

python

# WARNING: this drops the "Article" collection together with all of its
# objects. The aggregation examples later in this guide still query
# "Article", so run this step last, or re-create and re-populate the
# collection before continuing.
client.collections.delete("Article")

print("Collection 'Article' deleted")

聚合查询 #

统计总数 #

python

articles = client.collections.get("Article")

# Ask only for the total number of objects in the collection.
response = articles.aggregate.over_all(total_count=True)
print(f"Total articles: {response.total_count}")

分组统计 #

python

from weaviate.classes.aggregate import GroupByAggregate

# Group the aggregation by the value of the "category" property.
by_category = GroupByAggregate(prop="category")
response = articles.aggregate.over_all(group_by=by_category)

print("按类别统计:")
for group in response.groups:
    category_value = group.grouped_by.value
    print(f"- {category_value}: {group.total_count}")

数值统计 #

python

response = articles.aggregate.over_all(
    return_metrics=wvc.aggregate.Metrics("views").integer(
        mean=True,
        maximum=True,
        minimum=True,
        sum_=True
    )
)

# response.properties maps each property name to its metrics result. The
# statistics are attributes of that result object itself; there is no nested
# `.integer` level, so `metrics.integer.mean` raises AttributeError.
for prop, metrics in response.properties.items():
    print("Views 统计:")
    print(f"  平均值: {metrics.mean}")
    print(f"  最大值: {metrics.maximum}")
    print(f"  最小值: {metrics.minimum}")
    print(f"  总和: {metrics.sum_}")

完整示例 #

python

import weaviate
import weaviate.classes as wvc
from weaviate.classes.query import Filter
import numpy as np

client = weaviate.connect_to_local()

# Drop any previous "Product" collection so the example can be re-run.
client.collections.delete("Product")

# Schema as (property name, data type) pairs.
product_schema = [
    ("name", wvc.config.DataType.TEXT),
    ("description", wvc.config.DataType.TEXT),
    ("price", wvc.config.DataType.NUMBER),
    ("category", wvc.config.DataType.TEXT),
]

products = client.collections.create(
    name="Product",
    vectorizer_config=wvc.config.Configure.Vectorizer.none(),
    properties=[
        wvc.config.Property(name=prop_name, data_type=prop_type)
        for prop_name, prop_type in product_schema
    ],
)

sample_products = [
    ("iPhone 15", "苹果最新智能手机，搭载 A17 芯片", 7999, "手机"),
    ("MacBook Pro", "专业笔记本电脑，M3 芯片", 14999, "电脑"),
    ("AirPods Pro", "主动降噪无线耳机", 1899, "配件"),
    ("iPad Air", "轻薄平板电脑，M1 芯片", 4799, "平板"),
    ("Apple Watch", "智能手表，健康监测", 2999, "配件"),
    ("Galaxy S24", "三星旗舰手机，AI 功能", 6999, "手机"),
    ("ThinkPad X1", "商务笔记本电脑", 9999, "电脑"),
    ("Surface Pro", "微软二合一平板电脑", 8999, "平板"),
]

def text_to_vector(text):
    """Return a deterministic 64-dimensional pseudo-embedding for ``text``.

    This is a placeholder for a real embedding model: the values are uniform
    random numbers in [0, 1) and carry no semantic meaning.

    The seed comes from a SHA-256 digest of the text rather than the built-in
    ``hash()``. String hashes are salted per interpreter process, so
    ``hash(text)`` would yield different vectors on every run. A private
    generator is used so the global NumPy random state is left untouched.

    Args:
        text: The string to embed.

    Returns:
        A list of 64 floats.
    """
    import hashlib  # local import keeps this snippet self-contained

    digest = hashlib.sha256(text.encode("utf-8")).digest()
    seed = int.from_bytes(digest[:8], "big")
    return np.random.default_rng(seed).random(64).tolist()

# Queue every product; its vector is derived from "<name> <description>".
with products.batch.dynamic() as batch:
    for name, desc, price, category in sample_products:
        text = f"{name} {desc}"
        vector = text_to_vector(text)
        batch.add_object(
            properties={
                "name": name,
                "description": desc,
                "price": price,
                "category": category
            },
            vector=vector
        )

# Per-object import errors are collected on the batch object rather than
# raised, so inspect them before reporting success.
failed_objects = products.batch.failed_objects
if failed_objects:
    print(f"Product insert finished with {len(failed_objects)} failed objects")
    print(f"First failure: {failed_objects[0]}\n")
else:
    print("Products inserted\n")

# Embed the query with the same helper that was used for the products.
query = "智能手机推荐"
query_vector = text_to_vector(query)

wanted_properties = ["name", "description", "price", "category"]
response = products.query.near_vector(
    near_vector=query_vector,
    limit=3,
    return_properties=wanted_properties,
)

print(f"查询: {query}")
print("\n推荐结果:")
for obj in response.objects:
    props = obj.properties
    # One print call emits the headline, the description and a blank line.
    print(f"- {props['name']}: ¥{props['price']}\n  {props['description']}\n")

# Mean price per category.
response = products.aggregate.over_all(
    group_by=wvc.aggregate.GroupByAggregate(prop="category"),
    return_metrics=wvc.aggregate.Metrics("price").number(mean=True)
)

print("各类别平均价格:")
for group in response.groups:
    category = group.grouped_by.value
    # The per-property result is the metrics object itself; `mean` is read
    # from it directly (there is no nested `.number` attribute).
    mean_price = group.properties["price"].mean
    print(f"- {category}: ¥{mean_price:.2f}")

# Release the connection opened at the start of the example.
client.close()
print("\nDone!")

小结 #

本章介绍了 Weaviate 的基本操作：

- 创建和删除 Collection
- 插入和批量插入数据
- 查询和过滤数据
- 向量相似性搜索
- 更新和删除操作
- 聚合统计

下一步 #

掌握了基本操作后，继续学习 Schema 与 Collection，深入了解数据建模！