Schema 与 Collection #
本章详细介绍 Weaviate 的 Schema 设计和 Collection 管理。
Schema 设计原则 #
设计流程 #
text
Schema 设计流程:
1. 分析业务需求
└── 确定数据类型和关系
2. 定义 Collection
└── 每类数据一个 Collection
3. 设计属性
└── 选择合适的数据类型
4. 配置向量化
└── 选择向量化模块
5. 建立引用关系
└── 定义对象间关联
6. 优化索引
└── 配置索引参数
创建 Collection #
基本创建 #
python
import weaviate.classes as wvc

# Short aliases keep the property list readable.
Property = wvc.config.Property
DataType = wvc.config.DataType

# One Property per field of the "Article" collection, covering the common
# scalar data types (text, int, number, date, boolean).
article_properties = [
    Property(
        name="title",
        data_type=DataType.TEXT,
        description="文章标题",
        tokenization=wvc.config.Tokenization.WORD,
    ),
    Property(
        name="content",
        data_type=DataType.TEXT,
        description="文章内容",
    ),
    Property(
        name="views",
        data_type=DataType.INT,
        description="阅读量",
    ),
    Property(
        name="rating",
        data_type=DataType.NUMBER,
        description="评分",
    ),
    Property(
        name="published",
        data_type=DataType.DATE,
        description="发布日期",
    ),
    Property(
        name="featured",
        data_type=DataType.BOOLEAN,
        description="是否推荐",
    ),
]

# NOTE: `client` is assumed to be a connected Weaviate client created before
# this snippet runs.
articles = client.collections.create(
    name="Article",
    description="文章集合",
    properties=article_properties,
)
带向量化配置 #
python
# Create "Article" with the OpenAI text vectorizer set at collection level and
# per-property control over which fields are vectorized.
# NOTE: `client` is assumed to be a connected Weaviate client created earlier.
articles = client.collections.create(
    name="Article",
    vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(
        model="text-embedding-3-small",
        model_version="latest",
        # The v4 client spells this keyword `type_` (trailing underscore);
        # passing `type=` raises TypeError.
        type_="text",
        # The v4 client keyword is `vectorize_collection_name`, not
        # `vectorize_class_name`.
        vectorize_collection_name=False,
    ),
    properties=[
        wvc.config.Property(
            name="title",
            data_type=wvc.config.DataType.TEXT,
            skip_vectorization=False,
        ),
        wvc.config.Property(
            name="content",
            data_type=wvc.config.DataType.TEXT,
            skip_vectorization=False,
        ),
        # "category" is excluded from the vector via skip_vectorization=True.
        wvc.config.Property(
            name="category",
            data_type=wvc.config.DataType.TEXT,
            skip_vectorization=True,
        ),
    ],
)
多向量配置 #
python
# Two named vectors, each built from a single source property with its own
# OpenAI embedding model.
title_vector = wvc.config.Configure.NamedVectors.text2vec_openai(
    name="title_vector",
    source_properties=["title"],
    model="text-embedding-3-small",
)
content_vector = wvc.config.Configure.NamedVectors.text2vec_openai(
    name="content_vector",
    source_properties=["content"],
    model="text-embedding-3-large",
)

# NOTE: `client` is assumed to be a connected Weaviate client created earlier.
articles = client.collections.create(
    name="Article",
    vectorizer_config=[title_vector, content_vector],
    properties=[
        wvc.config.Property(name="title", data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name="content", data_type=wvc.config.DataType.TEXT),
    ],
)
数据类型详解 #
基础数据类型 #
python
# (property name, data type, description) for each scalar type demonstrated.
scalar_fields = [
    ("text_field", wvc.config.DataType.TEXT, "文本类型"),
    ("int_field", wvc.config.DataType.INT, "整数类型"),
    ("number_field", wvc.config.DataType.NUMBER, "浮点数类型"),
    ("bool_field", wvc.config.DataType.BOOLEAN, "布尔类型"),
    ("date_field", wvc.config.DataType.DATE, "日期时间类型"),
    ("uuid_field", wvc.config.DataType.UUID, "UUID 类型"),
]

# NOTE: `client` is assumed to be a connected Weaviate client created earlier.
collection = client.collections.create(
    name="Example",
    properties=[
        wvc.config.Property(
            name=field_name,
            data_type=field_type,
            description=field_description,
        )
        for field_name, field_type, field_description in scalar_fields
    ],
)
数组类型 #
python
# (property name, array data type, description) for each array-typed field.
array_fields = [
    ("tags", wvc.config.DataType.TEXT_ARRAY, "标签数组"),
    ("ratings", wvc.config.DataType.NUMBER_ARRAY, "评分数组"),
    ("view_counts", wvc.config.DataType.INT_ARRAY, "阅读量数组"),
]

# NOTE: `client` is assumed to be a connected Weaviate client created earlier.
collection = client.collections.create(
    name="Article",
    properties=[
        wvc.config.Property(
            name=field_name,
            data_type=field_type,
            description=field_description,
        )
        for field_name, field_type, field_description in array_fields
    ],
)
地理坐标类型 #
python
# Schema: a text name plus a GEO_COORDINATES property.
# NOTE: `client` is assumed to be a connected Weaviate client created earlier.
location_properties = [
    wvc.config.Property(
        name="name",
        data_type=wvc.config.DataType.TEXT,
    ),
    wvc.config.Property(
        name="coordinates",
        data_type=wvc.config.DataType.GEO_COORDINATES,
        description="地理坐标",
    ),
]
collection = client.collections.create(
    name="Location",
    properties=location_properties,
)

# The geo value is passed as a nested dict with "latitude" and "longitude" keys.
locations = client.collections.get("Location")
tiananmen = {
    "name": "北京天安门",
    "coordinates": {
        "latitude": 39.9042,
        "longitude": 116.4074,
    },
}
locations.data.insert(tiananmen)
电话号码类型 #
python
# Schema: a text name plus a PHONE_NUMBER property.
# NOTE: `client` is assumed to be a connected Weaviate client created earlier.
contact_properties = [
    wvc.config.Property(
        name="name",
        data_type=wvc.config.DataType.TEXT,
    ),
    wvc.config.Property(
        name="phone",
        data_type=wvc.config.DataType.PHONE_NUMBER,
        description="电话号码",
    ),
]
collection = client.collections.create(
    name="Contact",
    properties=contact_properties,
)

# The phone value is passed as a nested dict with "input" and "defaultCountry"
# keys.
contacts = client.collections.get("Contact")
contact = {
    "name": "张三",
    "phone": {
        "input": "+86 138 0000 0000",
        "defaultCountry": "cn",
    },
}
contacts.data.insert(contact)
Blob 类型 #
python
import base64

# Schema: a text filename plus a BLOB property holding the file content.
# NOTE: `client` is assumed to be a connected Weaviate client created earlier.
collection = client.collections.create(
    name="Document",
    properties=[
        wvc.config.Property(
            name="filename",
            data_type=wvc.config.DataType.TEXT,
        ),
        wvc.config.Property(
            name="content",
            data_type=wvc.config.DataType.BLOB,
            description="文件内容(Base64)",
        ),
    ],
)

# Read the file as bytes and Base64-encode it into a str before inserting.
# The read must be indented under the `with` so the file is still open.
with open("example.pdf", "rb") as f:
    file_content = base64.b64encode(f.read()).decode("utf-8")

documents = client.collections.get("Document")
documents.data.insert({
    "filename": "example.pdf",
    "content": file_content,
})
引用属性 #
创建引用关系 #
python
# The target collections ("Author", "Category") are created before "Article",
# which declares references pointing at them.
# NOTE: `client` is assumed to be a connected Weaviate client created earlier.
client.collections.create(
    name="Author",
    properties=[
        wvc.config.Property(name="name", data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name="email", data_type=wvc.config.DataType.TEXT),
    ],
)

client.collections.create(
    name="Category",
    properties=[
        wvc.config.Property(name="name", data_type=wvc.config.DataType.TEXT),
    ],
)

# Cross-references from Article to its author and its category.
written_by = wvc.config.ReferenceProperty(
    name="writtenBy",
    target_collection="Author",
)
has_category = wvc.config.ReferenceProperty(
    name="hasCategory",
    target_collection="Category",
)

client.collections.create(
    name="Article",
    properties=[
        wvc.config.Property(name="title", data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name="content", data_type=wvc.config.DataType.TEXT),
    ],
    references=[written_by, has_category],
)
使用引用 #
python
# NOTE: `client` is assumed to be a connected Weaviate client created earlier.
authors = client.collections.get("Author")
categories = client.collections.get("Category")
articles = client.collections.get("Article")

# Insert the referenced objects first and keep the values returned by insert(),
# which are then used as the reference targets below.
author_uuid = authors.data.insert({"name": "张三", "email": "zhangsan@example.com"})
category_uuid = categories.data.insert({"name": "技术"})

article_properties = {
    "title": "Weaviate 入门",
    "content": "Weaviate 是一个向量数据库...",
}
article_references = {
    "writtenBy": author_uuid,
    "hasCategory": category_uuid,
}
articles.data.insert(
    properties=article_properties,
    references=article_references,
)
查询引用 #
python
# Fetch articles together with selected properties of their referenced
# author and category objects.
# NOTE: `client` is assumed to be a connected Weaviate client created earlier.
articles = client.collections.get("Article")
response = articles.query.fetch_objects(
    return_properties=["title", "content"],
    return_references=[
        wvc.query.QueryReference(
            link_on="writtenBy",
            return_properties=["name", "email"],
        ),
        wvc.query.QueryReference(
            link_on="hasCategory",
            return_properties=["name"],
        ),
    ],
)

# The loop body must be indented; each reference exposes its targets under
# `.objects`, and the first one is printed here.
for obj in response.objects:
    print(f"Title: {obj.properties['title']}")
    print(f"Author: {obj.references['writtenBy'].objects[0].properties['name']}")
    print(f"Category: {obj.references['hasCategory'].objects[0].properties['name']}")
向量化配置 #
OpenAI 向量化 #
python
# Create "Article" vectorized by the OpenAI module with an explicit base URL.
# NOTE: `client` is assumed to be a connected Weaviate client created earlier.
collection = client.collections.create(
    name="Article",
    vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(
        model="text-embedding-3-small",
        model_version="latest",
        # The v4 client spells this keyword `type_` (trailing underscore);
        # passing `type=` raises TypeError.
        type_="text",
        # The v4 client keyword is `vectorize_collection_name`, not
        # `vectorize_class_name`.
        vectorize_collection_name=False,
        base_url="https://api.openai.com/v1",
    ),
    properties=[
        wvc.config.Property(name="title", data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name="content", data_type=wvc.config.DataType.TEXT),
    ],
)
Cohere 向量化 #
python
# Cohere text vectorizer using the multilingual v3 embedding model.
cohere_vectorizer = wvc.config.Configure.Vectorizer.text2vec_cohere(
    model="embed-multilingual-v3.0",
    truncate="END",
)

# NOTE: `client` is assumed to be a connected Weaviate client created earlier.
collection = client.collections.create(
    name="Article",
    vectorizer_config=cohere_vectorizer,
    properties=[
        wvc.config.Property(name="title", data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name="content", data_type=wvc.config.DataType.TEXT),
    ],
)
HuggingFace 向量化 #
python
# Create "Article" vectorized by the HuggingFace module.
# NOTE: `client` is assumed to be a connected Weaviate client created earlier.
collection = client.collections.create(
    name="Article",
    vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_huggingface(
        model="sentence-transformers/all-MiniLM-L6-v2",
        # The v4 client takes these as direct keyword arguments; it has no
        # `options` dict parameter.
        wait_for_model=True,
        use_gpu=False,
    ),
    properties=[
        wvc.config.Property(name="title", data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name="content", data_type=wvc.config.DataType.TEXT),
    ],
)
自定义向量 #
python
# Product-quantization settings attached to the HNSW index below.
pq_quantizer = wvc.config.Configure.VectorIndex.Quantizer.pq(
    segments=64,
    centroids=256,
)

# HNSW index using cosine distance.
hnsw_index = wvc.config.Configure.VectorIndex.hnsw(
    distance_metric=wvc.config.VectorDistance.COSINE,
    ef_construction=128,
    max_connections=64,
    quantizer=pq_quantizer,
)

# Vectorizer.none(): no vectorizer module is configured; presumably vectors
# are supplied by the caller at insert time (this section covers custom vectors).
# NOTE: `client` is assumed to be a connected Weaviate client created earlier.
collection = client.collections.create(
    name="Article",
    vectorizer_config=wvc.config.Configure.Vectorizer.none(),
    vector_index_config=hnsw_index,
    properties=[
        wvc.config.Property(name="title", data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name="content", data_type=wvc.config.DataType.TEXT),
    ],
)
索引配置 #
HNSW 索引参数 #
python
# HNSW parameters: build-time settings (ef_construction, max_connections) and
# query-time settings (ef plus the dynamic_ef_* bounds and factor).
hnsw_index = wvc.config.Configure.VectorIndex.hnsw(
    distance_metric=wvc.config.VectorDistance.COSINE,
    ef_construction=256,
    max_connections=64,
    ef=128,
    dynamic_ef_min=100,
    dynamic_ef_max=500,
    dynamic_ef_factor=8,
)

# NOTE: `client` is assumed to be a connected Weaviate client created earlier.
collection = client.collections.create(
    name="Article",
    vectorizer_config=wvc.config.Configure.Vectorizer.none(),
    vector_index_config=hnsw_index,
)
向量量化 #
python
# Binary quantization (bq) with its cache option enabled.
bq_quantizer = wvc.config.Configure.VectorIndex.Quantizer.bq(cache=True)

hnsw_index = wvc.config.Configure.VectorIndex.hnsw(
    distance_metric=wvc.config.VectorDistance.COSINE,
    quantizer=bq_quantizer,
)

# NOTE: `client` is assumed to be a connected Weaviate client created earlier.
collection = client.collections.create(
    name="Article",
    vectorizer_config=wvc.config.Configure.Vectorizer.none(),
    vector_index_config=hnsw_index,
)
倒排索引 #
python
# Per-property inverted-index flags:
#   title    - filterable and searchable
#   category - filterable only
#   views    - filterable, with range filters enabled
title_property = wvc.config.Property(
    name="title",
    data_type=wvc.config.DataType.TEXT,
    index_filterable=True,
    index_searchable=True,
)
category_property = wvc.config.Property(
    name="category",
    data_type=wvc.config.DataType.TEXT,
    index_filterable=True,
)
views_property = wvc.config.Property(
    name="views",
    data_type=wvc.config.DataType.INT,
    index_filterable=True,
    index_range_filters=True,
)

# NOTE: `client` is assumed to be a connected Weaviate client created earlier.
collection = client.collections.create(
    name="Article",
    properties=[title_property, category_property, views_property],
)
Collection 管理 #
获取 Collection #
python
# Look up the existing collection and print its name and property names.
# NOTE: `client` is assumed to be a connected Weaviate client created earlier.
articles = client.collections.get("Article")
config = articles.config.get()
property_names = [prop.name for prop in config.properties]
print(f"Name: {config.name}")
print(f"Properties: {property_names}")
更新 Collection #
python
# Add a new "summary" property to the existing "Article" collection.
# NOTE: `client` is assumed to be a connected Weaviate client created earlier.
articles = client.collections.get("Article")

# `config.update()` has no `properties` argument in the v4 client; a new
# property is added to an existing collection with `config.add_property()`.
articles.config.add_property(
    wvc.config.Property(
        name="summary",
        data_type=wvc.config.DataType.TEXT,
        description="文章摘要",
    )
)
删除 Collection #
python
# Delete the collection named "Article".
# NOTE(review): presumably this also removes every object stored in it — confirm
# before running against real data.
client.collections.delete("Article")
检查 Collection 是否存在 #
python
# Check for the collection by name and report when it is missing.
# NOTE: `client` is assumed to be a connected Weaviate client created earlier.
exists = client.collections.exists("Article")
if not exists:
    print("Collection 'Article' does not exist")
多租户配置 #
启用多租户 #
python
# Multi-tenancy switched on, with automatic tenant creation and activation
# both enabled.
tenancy_config = wvc.config.Configure.multi_tenancy(
    enabled=True,
    auto_tenant_creation=True,
    auto_tenant_activation=True,
)

# NOTE: `client` is assumed to be a connected Weaviate client created earlier.
collection = client.collections.create(
    name="Article",
    multi_tenancy_config=tenancy_config,
    properties=[
        wvc.config.Property(name="title", data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name="content", data_type=wvc.config.DataType.TEXT),
    ],
)
租户操作 #
python
# Create three tenants, list them, then insert an object for one tenant.
# NOTE: `client` is assumed to be a connected Weaviate client created earlier.
articles = client.collections.get("Article")

articles.tenants.create(
    tenants=[
        wvc.tenants.Tenant(name="tenant_a"),
        wvc.tenants.Tenant(name="tenant_b"),
        wvc.tenants.Tenant(name="tenant_c"),
    ]
)

# `tenants.get()` returns a dict keyed by tenant name, so iterate its values
# to reach the Tenant objects (iterating the dict itself yields plain strings).
tenants = articles.tenants.get()
for tenant in tenants.values():
    print(f"Tenant: {tenant.name}, Active: {tenant.activity_status}")

# `with_tenant()` returns a collection handle scoped to the named tenant.
tenant_a = articles.with_tenant("tenant_a")
tenant_a.data.insert({
    "title": "租户A的文章",
    "content": "这是租户A的数据...",
})
复制与分片 #
复制配置 #
python
# Replication factor 3 with asynchronous replication enabled.
replication = wvc.config.Configure.replication(
    factor=3,
    async_enabled=True,
)

# NOTE: `client` is assumed to be a connected Weaviate client created earlier.
collection = client.collections.create(
    name="Article",
    replication_config=replication,
    properties=[
        wvc.config.Property(name="title", data_type=wvc.config.DataType.TEXT),
    ],
)
分片配置 #
python
# Create "Article" with an explicit sharding layout.
# NOTE: `client` is assumed to be a connected Weaviate client created earlier.
collection = client.collections.create(
    name="Article",
    # `Configure.sharding()` in the v4 client takes no `strategy` keyword and
    # `wvc.config` exposes no `ShardingStrategy`, so the strategy is left to
    # the server default.
    sharding_config=wvc.config.Configure.sharding(
        virtual_per_physical=128,
        desired_count=4,
    ),
    properties=[
        wvc.config.Property(name="title", data_type=wvc.config.DataType.TEXT),
    ],
)
Schema 最佳实践 #
命名规范 #
text
Collection 命名:
├── 使用大驼峰命名法(PascalCase)
├── 名称应具有描述性
└── 示例:Article, Author, ProductCategory
Property 命名:
├── 使用小驼峰命名法(camelCase)
├── 名称应清晰表达含义
└── 示例:title, publishedAt, viewCount
性能优化 #
text
索引优化:
├── 为常用过滤字段启用 index_filterable
├── 为范围查询启用 index_range_filters
├── 为全文搜索启用 index_searchable
└── 合理配置 HNSW 参数
向量化优化:
├── 选择合适的向量化模型
├── 跳过不需要向量化的属性
├── 使用量化减少内存占用
└── 考虑多向量配置
小结 #
本章介绍了 Schema 和 Collection 的详细配置:
- 数据类型选择
- 引用关系建立
- 向量化配置
- 索引优化
- 多租户配置
下一步 #
继续学习 向量操作,深入了解向量的存储和检索!
最后更新:2026-04-04