Schema 与 Collection #

本章详细介绍 Weaviate 的 Schema 设计和 Collection 管理。文中的示例均基于 Weaviate Python 客户端 v4，并假设已经创建并连接好 `client` 对象。

Schema 设计原则 #

设计流程 #

text

Schema 设计流程：

1. 分析业务需求
   └── 确定数据类型和关系

2. 定义 Collection
   └── 每类数据一个 Collection

3. 设计属性
   └── 选择合适的数据类型

4. 配置向量化
   └── 选择向量化模块

5. 建立引用关系
   └── 定义对象间关联

6. 优化索引
   └── 配置索引参数

创建 Collection #

基本创建 #

python

import weaviate.classes as wvc

# Declare the "Article" properties up front, then hand them to create().
Property = wvc.config.Property
DataType = wvc.config.DataType

article_properties = [
    # The title is the only property here that sets an explicit tokenization.
    Property(
        name="title",
        description="文章标题",
        data_type=DataType.TEXT,
        tokenization=wvc.config.Tokenization.WORD,
    ),
    Property(name="content", description="文章内容", data_type=DataType.TEXT),
    Property(name="views", description="阅读量", data_type=DataType.INT),
    Property(name="rating", description="评分", data_type=DataType.NUMBER),
    Property(name="published", description="发布日期", data_type=DataType.DATE),
    Property(name="featured", description="是否推荐", data_type=DataType.BOOLEAN),
]

articles = client.collections.create(
    name="Article",
    description="文章集合",
    properties=article_properties,
)

带向量化配置 #

python

# Create "Article" with the OpenAI text vectorizer.
# The v4 Python client spells these keyword arguments `type_` and
# `vectorize_collection_name`; passing `type=` or `vectorize_class_name=`
# raises a TypeError.
# NOTE(review): `model_version` was dropped because it appears to apply only to
# the legacy ada/babbage/curie/davinci model families - confirm before re-adding.
articles = client.collections.create(
    name="Article",
    vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(
        model="text-embedding-3-small",
        type_="text",
        vectorize_collection_name=False,
    ),
    properties=[
        # title and content are vectorized.
        wvc.config.Property(
            name="title",
            data_type=wvc.config.DataType.TEXT,
            skip_vectorization=False,
        ),
        wvc.config.Property(
            name="content",
            data_type=wvc.config.DataType.TEXT,
            skip_vectorization=False,
        ),
        # category is stored but excluded from the vector.
        wvc.config.Property(
            name="category",
            data_type=wvc.config.DataType.TEXT,
            skip_vectorization=True,
        ),
    ],
)

多向量配置 #

python

# Two named vectors, each built from a single source property with its own model.
NamedVectors = wvc.config.Configure.NamedVectors

title_vector = NamedVectors.text2vec_openai(
    name="title_vector",
    source_properties=["title"],
    model="text-embedding-3-small",
)
content_vector = NamedVectors.text2vec_openai(
    name="content_vector",
    source_properties=["content"],
    model="text-embedding-3-large",
)

articles = client.collections.create(
    name="Article",
    vectorizer_config=[title_vector, content_vector],
    properties=[
        wvc.config.Property(name=field, data_type=wvc.config.DataType.TEXT)
        for field in ("title", "content")
    ],
)

数据类型详解 #

基础数据类型 #

python

# One property per scalar data type, described as (name, data type, description).
DataType = wvc.config.DataType

scalar_fields = [
    ("text_field", DataType.TEXT, "文本类型"),
    ("int_field", DataType.INT, "整数类型"),
    ("number_field", DataType.NUMBER, "浮点数类型"),
    ("bool_field", DataType.BOOLEAN, "布尔类型"),
    ("date_field", DataType.DATE, "日期时间类型"),
    ("uuid_field", DataType.UUID, "UUID 类型"),
]

collection = client.collections.create(
    name="Example",
    properties=[
        wvc.config.Property(name=field, data_type=kind, description=label)
        for field, kind, label in scalar_fields
    ],
)

数组类型 #

python

# Array-typed properties, described as (name, data type, description).
DataType = wvc.config.DataType

array_fields = [
    ("tags", DataType.TEXT_ARRAY, "标签数组"),
    ("ratings", DataType.NUMBER_ARRAY, "评分数组"),
    ("view_counts", DataType.INT_ARRAY, "阅读量数组"),
]

collection = client.collections.create(
    name="Article",
    properties=[
        wvc.config.Property(name=field, data_type=kind, description=label)
        for field, kind, label in array_fields
    ],
)

地理坐标类型 #

python

# "Location" has a plain-text name and a geo-coordinates property.
location_properties = [
    wvc.config.Property(name="name", data_type=wvc.config.DataType.TEXT),
    wvc.config.Property(
        name="coordinates",
        description="地理坐标",
        data_type=wvc.config.DataType.GEO_COORDINATES,
    ),
]
collection = client.collections.create(
    name="Location",
    properties=location_properties,
)

locations = client.collections.get("Location")

# The coordinates value is a mapping with "latitude" and "longitude" keys.
tiananmen = {
    "name": "北京天安门",
    "coordinates": {"latitude": 39.9042, "longitude": 116.4074},
}
locations.data.insert(tiananmen)

电话号码类型 #

python

# "Contact" has a plain-text name and a phone-number property.
contact_properties = [
    wvc.config.Property(name="name", data_type=wvc.config.DataType.TEXT),
    wvc.config.Property(
        name="phone",
        description="电话号码",
        data_type=wvc.config.DataType.PHONE_NUMBER,
    ),
]
collection = client.collections.create(
    name="Contact",
    properties=contact_properties,
)

contacts = client.collections.get("Contact")

# The phone value is a mapping carrying the raw input and a default country code.
zhang_san = {
    "name": "张三",
    "phone": {"input": "+86 138 0000 0000", "defaultCountry": "cn"},
}
contacts.data.insert(zhang_san)

Blob 类型 #

python

import base64

# "Document" stores a file name plus the file body as a blob.
document_properties = [
    wvc.config.Property(name="filename", data_type=wvc.config.DataType.TEXT),
    wvc.config.Property(
        name="content",
        description="文件内容（Base64）",
        data_type=wvc.config.DataType.BLOB,
    ),
]
collection = client.collections.create(
    name="Document",
    properties=document_properties,
)

# Read the raw bytes first, then Base64-encode them into a UTF-8 string.
with open("example.pdf", "rb") as pdf:
    raw_bytes = pdf.read()
    file_content = base64.b64encode(raw_bytes).decode("utf-8")

documents = client.collections.get("Document")

document = {"filename": "example.pdf", "content": file_content}
documents.data.insert(document)

引用属性 #

创建引用关系 #

python

# Author and Category are created first; Article then references both.
text_type = wvc.config.DataType.TEXT

client.collections.create(
    name="Author",
    properties=[
        wvc.config.Property(name=field, data_type=text_type)
        for field in ("name", "email")
    ],
)

client.collections.create(
    name="Category",
    properties=[wvc.config.Property(name="name", data_type=text_type)],
)

# Each reference property names the collection it points at.
article_links = [
    wvc.config.ReferenceProperty(name="writtenBy", target_collection="Author"),
    wvc.config.ReferenceProperty(name="hasCategory", target_collection="Category"),
]

client.collections.create(
    name="Article",
    properties=[
        wvc.config.Property(name=field, data_type=text_type)
        for field in ("title", "content")
    ],
    references=article_links,
)

使用引用 #

python

authors = client.collections.get("Author")
categories = client.collections.get("Category")
articles = client.collections.get("Article")

# Keep the values returned by insert(); they are passed as reference targets below.
author_uuid = authors.data.insert(
    {"name": "张三", "email": "zhangsan@example.com"}
)
category_uuid = categories.data.insert(
    {"name": "技术"}
)

article_properties = {
    "title": "Weaviate 入门",
    "content": "Weaviate 是一个向量数据库...",
}
article_references = {"writtenBy": author_uuid, "hasCategory": category_uuid}

articles.data.insert(
    properties=article_properties,
    references=article_references,
)

查询引用 #

python

articles = client.collections.get("Article")

# Describe which linked objects (and which of their properties) to return.
author_ref = wvc.query.QueryReference(
    link_on="writtenBy",
    return_properties=["name", "email"],
)
category_ref = wvc.query.QueryReference(
    link_on="hasCategory",
    return_properties=["name"],
)

response = articles.query.fetch_objects(
    return_properties=["title", "content"],
    return_references=[author_ref, category_ref],
)

# Print the title, then the first linked author and category of each article.
for article in response.objects:
    print(f"Title: {article.properties['title']}")
    first_author = article.references["writtenBy"].objects[0]
    print(f"Author: {first_author.properties['name']}")
    first_category = article.references["hasCategory"].objects[0]
    print(f"Category: {first_category.properties['name']}")

向量化配置 #

OpenAI 向量化 #

python

# Create "Article" with the OpenAI text vectorizer and an explicit API base URL.
# The v4 Python client spells these keyword arguments `type_` and
# `vectorize_collection_name`; passing `type=` or `vectorize_class_name=`
# raises a TypeError.
# NOTE(review): `model_version` was dropped because it appears to apply only to
# the legacy ada/babbage/curie/davinci model families - confirm before re-adding.
collection = client.collections.create(
    name="Article",
    vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(
        model="text-embedding-3-small",
        type_="text",
        vectorize_collection_name=False,
        base_url="https://api.openai.com/v1",
    ),
    properties=[
        wvc.config.Property(name="title", data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name="content", data_type=wvc.config.DataType.TEXT),
    ],
)

Cohere 向量化 #

python

# Create "Article" with the Cohere text vectorizer.
cohere_vectorizer = wvc.config.Configure.Vectorizer.text2vec_cohere(
    model="embed-multilingual-v3.0",
    truncate="END",
)

collection = client.collections.create(
    name="Article",
    vectorizer_config=cohere_vectorizer,
    properties=[
        wvc.config.Property(name=field, data_type=wvc.config.DataType.TEXT)
        for field in ("title", "content")
    ],
)

HuggingFace 向量化 #

python

# Create "Article" with the HuggingFace text vectorizer.
# The v4 Python client has no `options` dict for this vectorizer; the former
# "waitForModel" / "useGPU" entries are the keyword arguments
# `wait_for_model` and `use_gpu`.
collection = client.collections.create(
    name="Article",
    vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_huggingface(
        model="sentence-transformers/all-MiniLM-L6-v2",
        wait_for_model=True,
        use_gpu=False,
    ),
    properties=[
        wvc.config.Property(name="title", data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name="content", data_type=wvc.config.DataType.TEXT),
    ],
)

自定义向量 #

python

# Create "Article" without a vectorizer module and with an HNSW index that
# uses product quantization.
# The distance enum exported by weaviate.classes.config is `VectorDistances`
# (plural); `wvc.config.VectorDistance` raises AttributeError.
collection = client.collections.create(
    name="Article",
    vectorizer_config=wvc.config.Configure.Vectorizer.none(),
    vector_index_config=wvc.config.Configure.VectorIndex.hnsw(
        distance_metric=wvc.config.VectorDistances.COSINE,
        ef_construction=128,
        max_connections=64,
        quantizer=wvc.config.Configure.VectorIndex.Quantizer.pq(
            segments=64,
            centroids=256,
        ),
    ),
    properties=[
        wvc.config.Property(name="title", data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name="content", data_type=wvc.config.DataType.TEXT),
    ],
)

索引配置 #

HNSW 索引参数 #

python

# Create "Article" with explicit HNSW build-time and query-time parameters.
# The distance enum exported by weaviate.classes.config is `VectorDistances`
# (plural); `wvc.config.VectorDistance` raises AttributeError.
collection = client.collections.create(
    name="Article",
    vectorizer_config=wvc.config.Configure.Vectorizer.none(),
    vector_index_config=wvc.config.Configure.VectorIndex.hnsw(
        distance_metric=wvc.config.VectorDistances.COSINE,
        ef_construction=256,
        max_connections=64,
        ef=128,
        dynamic_ef_min=100,
        dynamic_ef_max=500,
        dynamic_ef_factor=8,
    ),
)

向量量化 #

python

# Create "Article" with an HNSW index that uses binary quantization.
# The distance enum exported by weaviate.classes.config is `VectorDistances`
# (plural); `wvc.config.VectorDistance` raises AttributeError.
collection = client.collections.create(
    name="Article",
    vectorizer_config=wvc.config.Configure.Vectorizer.none(),
    vector_index_config=wvc.config.Configure.VectorIndex.hnsw(
        distance_metric=wvc.config.VectorDistances.COSINE,
        quantizer=wvc.config.Configure.VectorIndex.Quantizer.bq(
            cache=True,
        ),
    ),
)

倒排索引 #

python

# Per-property inverted-index switches for "Article".
Property = wvc.config.Property
DataType = wvc.config.DataType

indexed_properties = [
    # title: filterable and searchable.
    Property(
        name="title",
        data_type=DataType.TEXT,
        index_searchable=True,
        index_filterable=True,
    ),
    # category: filterable only.
    Property(name="category", data_type=DataType.TEXT, index_filterable=True),
    # views: filterable, with range filters enabled as well.
    Property(
        name="views",
        data_type=DataType.INT,
        index_range_filters=True,
        index_filterable=True,
    ),
]

collection = client.collections.create(
    name="Article",
    properties=indexed_properties,
)

Collection 管理 #

获取 Collection #

python

# Fetch the stored configuration of "Article" and print its name and property names.
articles = client.collections.get("Article")

config = articles.config.get()
print(f"Name: {config.name}")
property_names = [prop.name for prop in config.properties]
print(f"Properties: {property_names}")

更新 Collection #

python

articles = client.collections.get("Article")

# New properties are added with config.add_property(), one Property per call.
# config.update() has no `properties` argument, so the previous call failed
# with a TypeError.
articles.config.add_property(
    wvc.config.Property(
        name="summary",
        data_type=wvc.config.DataType.TEXT,
        description="文章摘要",
    )
)

删除 Collection #

python

# Delete the "Article" collection by name.
collection_name = "Article"
client.collections.delete(collection_name)

检查 Collection 是否存在 #

python

# Check for the "Article" collection and report when it is missing.
collection_name = "Article"
exists = client.collections.exists(collection_name)

if not exists:
    print("Collection 'Article' does not exist")

多租户配置 #

启用多租户 #

python

# Multi-tenancy with automatic tenant creation and activation switched on.
tenancy_settings = wvc.config.Configure.multi_tenancy(
    enabled=True,
    auto_tenant_creation=True,
    auto_tenant_activation=True,
)

collection = client.collections.create(
    name="Article",
    multi_tenancy_config=tenancy_settings,
    properties=[
        wvc.config.Property(name=field, data_type=wvc.config.DataType.TEXT)
        for field in ("title", "content")
    ],
)

租户操作 #

python

articles = client.collections.get("Article")

# Register three tenants on the collection.
articles.tenants.create(
    tenants=[
        wvc.tenants.Tenant(name="tenant_a"),
        wvc.tenants.Tenant(name="tenant_b"),
        wvc.tenants.Tenant(name="tenant_c"),
    ]
)

# tenants.get() returns a dict keyed by tenant name, so iterate over the
# values to reach the Tenant objects; iterating the dict itself yields plain
# strings, which have no .name or .activity_status.
tenants = articles.tenants.get()
for tenant in tenants.values():
    print(f"Tenant: {tenant.name}, Active: {tenant.activity_status}")

# with_tenant() returns a handle on which the insert below runs for tenant_a.
tenant_a = articles.with_tenant("tenant_a")

tenant_a.data.insert({
    "title": "租户A的文章",
    "content": "这是租户A的数据..."
})

复制与分片 #

复制配置 #

python

# Replication factor 3 with async replication enabled.
replication_settings = wvc.config.Configure.replication(
    factor=3,
    async_enabled=True,
)

collection = client.collections.create(
    name="Article",
    replication_config=replication_settings,
    properties=[
        wvc.config.Property(name="title", data_type=wvc.config.DataType.TEXT),
    ],
)

分片配置 #

python

# Create "Article" with 4 desired shards and 128 virtual shards per physical shard.
# Configure.sharding() has no `strategy` argument and weaviate.classes.config
# does not export a ShardingStrategy enum, so the previous call failed before
# reaching the server.
# NOTE(review): hash-based sharding appears to be the fixed default - confirm.
collection = client.collections.create(
    name="Article",
    sharding_config=wvc.config.Configure.sharding(
        virtual_per_physical=128,
        desired_count=4,
    ),
    properties=[
        wvc.config.Property(name="title", data_type=wvc.config.DataType.TEXT),
    ],
)

Schema 最佳实践 #

命名规范 #

text

Collection 命名：
├── 使用大驼峰命名法（PascalCase）
├── 名称应具有描述性
└── 示例：Article, Author, ProductCategory

Property 命名：
├── 使用小驼峰命名法（camelCase）
├── 名称应清晰表达含义
└── 示例：title, publishedAt, viewCount

性能优化 #

text

索引优化：
├── 为常用过滤字段启用 index_filterable
├── 为范围查询启用 index_range_filters
├── 为全文搜索启用 index_searchable
└── 合理配置 HNSW 参数

向量化优化：
├── 选择合适的向量化模型
├── 跳过不需要向量化的属性
├── 使用量化减少内存占用
└── 考虑多向量配置

小结 #

本章介绍了 Schema 和 Collection 的详细配置：

- 数据类型选择
- 引用关系建立
- 向量化配置
- 索引优化
- 多租户配置

下一步 #

继续学习向量操作，深入了解向量的存储和检索！