集合管理 #
什么是集合? #
集合(Collection)是 Chroma 中存储和组织数据的基本单位,类似于关系数据库中的"表"。每个集合可以存储文档、向量和元数据,并有自己的嵌入函数配置。
text
┌─────────────────────────────────────────────────────────────┐
│ 集合结构 │
├─────────────────────────────────────────────────────────────┤
│ │
│ Collection │
│ ├── name: "documents" │
│ ├── metadata: {"description": "文档存储"} │
│ ├── embedding_function: SentenceTransformer │
│ └── Documents │
│ ├── doc1: {id, document, embedding, metadata} │
│ ├── doc2: {id, document, embedding, metadata} │
│ └── doc3: {id, document, embedding, metadata} │
│ │
└─────────────────────────────────────────────────────────────┘
创建集合 #
基本创建 #
python
import chromadb
# Create a client and a collection that relies on the default settings.
client = chromadb.Client()
collection = client.create_collection(name="my_collection")

# Report the collection's name and ID, one per line.
print(
    f"集合名称: {collection.name}",
    f"集合 ID: {collection.id}",
    sep="\n",
)
带元数据创建 #
python
# User-defined key/value pairs stored alongside the collection.
collection_info = {
    "description": "文档存储集合",
    "created_by": "admin",
    "version": "1.0",
}
collection = client.create_collection(name="documents", metadata=collection_info)

# Read the metadata back from the returned collection object.
print(collection.metadata)
带嵌入函数创建 #
python
from chromadb.utils import embedding_functions
# The same model name configures the embedding function and is recorded in
# the collection metadata, so keep it in one place.
model_name = "all-MiniLM-L6-v2"
embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=model_name,
)

collection = client.create_collection(
    name="custom_embeddings",
    embedding_function=embedding_function,
    metadata={"embedding_model": model_name},
)
配置距离函数 #
python
from chromadb.utils import embedding_functions
# Create one collection per distance function; the function is selected via
# the "hnsw:space" metadata key. Just as with three separate assignments,
# `collection` ends up bound to the last collection created ("ip_collection").
for space in ("cosine", "l2", "ip"):
    collection = client.create_collection(
        name=f"{space}_collection",
        metadata={"hnsw:space": space},
    )
距离函数对比 #
text
┌─────────────────────────────────────────────────────────────┐
│ 距离函数 │
├─────────────────────────────────────────────────────────────┤
│ │
│ Cosine Distance (余弦距离) │
│ - 范围: [0, 2],值越小越相似 │
│ - 适用: 文本相似度、推荐系统 │
│ - 公式: d = 1 - A·B / (|A|×|B|) │
│ │
│ Squared L2 Distance (平方欧几里得距离) - 默认 │
│ - 范围: [0, ∞),值越小越相似 │
│ - 适用: 图像相似度、地理位置 │
│ - 公式: d = Σ(Ai - Bi)² │
│ │
│ Inner Product (内积) │
│ - 范围: (-∞, ∞),Chroma 返回 1 - 内积,值越小越相似 │
│ - 适用: 归一化向量、推荐系统 │
│ - 公式: d = 1 - A·B │
│ │
└─────────────────────────────────────────────────────────────┘
获取集合 #
按名称获取 #
python
# Fetch an existing collection by name and show its name and document count.
collection = client.get_collection(name="my_collection")
print(
    f"集合名称: {collection.name}",
    f"文档数量: {collection.count()}",
    sep="\n",
)
获取或创建 #
python
# Return the collection if it already exists, otherwise create it.
collection = client.get_or_create_collection(
    name="auto_collection",
    metadata={"description": "自动创建的集合"},
)
print(f"集合名称: {collection.name}")

# NOTE: count() == 0 only shows that the collection is currently empty; an
# existing collection that holds no documents prints True here as well.
is_empty = collection.count() == 0
print(f"是否新建: {is_empty}")
获取所有集合 #
python
# Print one summary line (name and document count) per collection.
collections = client.list_collections()
for coll in collections:
    summary = f"集合: {coll.name}, 文档数: {coll.count()}"
    print(summary)
删除集合 #
删除指定集合 #
python
# Remove the collection, then confirm on stdout.
doomed_name = "my_collection"
client.delete_collection(name=doomed_name)
print("集合已删除")
安全删除 #
python
# Attempt the delete and report any failure instead of letting it propagate.
# NOTE(review): the exact exception type for a missing collection is not
# visible here, so the broad `Exception` catch is kept as-is.
missing_name = "nonexistent"
try:
    client.delete_collection(name=missing_name)
except Exception as err:
    print(f"删除失败: {err}")
删除所有集合 #
python
# Take a listing of the current collections, then delete them one by one,
# logging each name after its delete call returns.
collections = client.list_collections()
for coll in collections:
    removed_name = coll.name
    client.delete_collection(name=removed_name)
    print(f"已删除集合: {removed_name}")
集合信息 #
获取集合统计 #
python
# Gather the basic facts about the "documents" collection and print them,
# one per line: name, ID, document count and metadata.
collection = client.get_collection(name="documents")
print(
    f"集合名称: {collection.name}",
    f"集合 ID: {collection.id}",
    f"文档数量: {collection.count()}",
    f"元数据: {collection.metadata}",
    sep="\n",
)
检查集合是否存在 #
python
def collection_exists(client, name: str) -> bool:
    """Return True if ``client`` lists a collection called ``name``.

    Args:
        client: Object exposing ``list_collections()``.
        name: Collection name to look for (exact match).

    Returns:
        True when any listed collection has that name, otherwise False.
    """
    # Depending on the Chroma release, list_collections() yields either
    # collection objects (with a ``.name``) or plain name strings; fall back
    # to the entry itself when it has no ``name`` attribute.
    return any(
        getattr(coll, "name", coll) == name
        for coll in client.list_collections()
    )
# Print which of the two messages applies to the "documents" collection.
print("集合存在" if collection_exists(client, "documents") else "集合不存在")
集合配置 #
HNSW 索引参数 #
python
# HNSW index settings are supplied through "hnsw:*" metadata keys.
hnsw_settings = {
    "hnsw:space": "cosine",        # distance function
    "hnsw:construction_ef": 200,   # candidate-list size while building
    "hnsw:M": 16,                  # connections per node
}
collection = client.create_collection(
    name="hnsw_configured",
    metadata=hnsw_settings,
)
HNSW 参数说明 #
text
┌─────────────────────────────────────────────────────────────┐
│ HNSW 参数 │
├─────────────────────────────────────────────────────────────┤
│ │
│ hnsw:space │
│ - 距离函数: cosine, l2, ip │
│ - 默认: l2 │
│ │
│ hnsw:construction_ef │
│ - 构建时的候选列表大小 │
│ - 默认: 100 │
│ - 值越大,构建质量越高,但速度越慢 │
│ │
│ hnsw:M │
│ - 每个节点的连接数 │
│ - 默认: 16 │
│ - 值越大,召回率越高,但内存占用越大 │
│ │
│ hnsw:batch_size │
│ - 批量添加时的批次大小 │
│ - 默认: 100 │
│ │
│ hnsw:sync_threshold │
│ - 同步阈值,触发索引保存的文档数 │
│ - 默认: 1000 │
│ │
└─────────────────────────────────────────────────────────────┘
配置示例 #
python
# Quality-oriented profile: larger build-time candidate list and more
# connections per node than the "fast" profile below.
high_quality_settings = {
    "hnsw:space": "cosine",
    "hnsw:construction_ef": 400,
    "hnsw:M": 32,
}
high_quality_collection = client.create_collection(
    name="high_quality",
    metadata=high_quality_settings,
)

# Speed-oriented profile: smaller candidate list and fewer connections.
fast_settings = {
    "hnsw:space": "cosine",
    "hnsw:construction_ef": 50,
    "hnsw:M": 8,
}
fast_collection = client.create_collection(
    name="fast",
    metadata=fast_settings,
)
集合操作最佳实践 #
命名规范 #
python
# Names that follow this guide's convention: lowercase ASCII letters, digits
# and underscores, starting with a letter.
good_names = [
    "user_documents",
    "product_embeddings",
    "qa_pairs",
    "chat_history",
]

# Names that break the convention.
bad_names = [
    "My Collection",           # uppercase letters and a space
    "collection-1",            # hyphen
    "测试集合",                 # non-ASCII characters
    "collection with spaces",  # spaces
]
def validate_collection_name(name: str) -> bool:
    """Check ``name`` against this guide's naming convention.

    A valid name starts with a lowercase ASCII letter and continues with
    lowercase ASCII letters, digits or underscores only.

    NOTE(review): Chroma itself may enforce further rules (for example on
    length) that are not checked here; confirm against its documentation.

    Args:
        name: Candidate collection name.

    Returns:
        True if the whole string matches the convention, otherwise False.
    """
    import re

    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline, which would let "name\n" through.
    pattern = r'[a-z][a-z0-9_]*'
    return re.fullmatch(pattern, name) is not None
元数据设计 #
python
# Descriptive metadata: what the collection holds, its version and creation
# date, and which embedding model / distance function it is meant to use.
# None of these keys carries an "hnsw:" prefix.
documents_metadata = {
    "description": "文档存储集合",
    "version": "1.0",
    "created_at": "2024-01-01",
    "embedding_model": "all-MiniLM-L6-v2",
    "distance_function": "cosine",
}
collection = client.create_collection(
    name="documents",
    metadata=documents_metadata,
)
集合模板 #
python
def create_document_collection(client, name: str, description: str = ""):
    """Create a "document" collection with this guide's standard metadata.

    Args:
        client: Object exposing ``create_collection(name=..., metadata=...)``.
        name: Name of the collection to create.
        description: Free-text description stored in the metadata.

    Returns:
        Whatever ``client.create_collection`` returns.
    """
    # Imported here because the snippet does not import datetime at module
    # level; without it the call below raises NameError.
    from datetime import datetime

    return client.create_collection(
        name=name,
        metadata={
            "description": description,
            "type": "document",
            # Local-time ISO 8601 timestamp taken at creation.
            "created_at": datetime.now().isoformat(),
            "hnsw:space": "cosine",
        },
    )
def create_qa_collection(client, name: str):
    """Create a "qa" collection with this guide's standard metadata.

    Args:
        client: Object exposing ``create_collection(name=..., metadata=...)``.
        name: Name of the collection to create.

    Returns:
        Whatever ``client.create_collection`` returns.
    """
    # Imported here because the snippet does not import datetime at module
    # level; without it the call below raises NameError.
    from datetime import datetime

    return client.create_collection(
        name=name,
        metadata={
            "description": "问答对集合",
            "type": "qa",
            # Local-time ISO 8601 timestamp taken at creation.
            "created_at": datetime.now().isoformat(),
        },
    )
def create_product_collection(client, name: str):
    """Create a "product" collection configured for cosine distance.

    Args:
        client: Object exposing ``create_collection(name=..., metadata=...)``.
        name: Name of the collection to create.

    Returns:
        Whatever ``client.create_collection`` returns.
    """
    product_metadata = {
        "description": "产品向量集合",
        "type": "product",
        "hnsw:space": "cosine",
    }
    return client.create_collection(name=name, metadata=product_metadata)
多集合管理 #
集合管理器 #
python
class CollectionManager:
    """Cache collection handles by name on top of a Chroma-style client.

    Handles returned by the client are memoised in ``self.collections`` so
    repeated lookups of the same name do not call the client again.
    """

    def __init__(self, client):
        """Store ``client`` and start with an empty handle cache."""
        self.client = client
        # Maps collection name -> handle returned by the client.
        self.collections = {}

    def get_or_create(self, name: str, **kwargs):
        """Return the cached handle for ``name``, creating it if needed.

        ``kwargs`` are forwarded to ``client.get_or_create_collection`` only
        on the first call for a given name; later calls return the cached
        handle and ignore them.
        """
        if name not in self.collections:
            self.collections[name] = self.client.get_or_create_collection(
                name=name,
                **kwargs
            )
        return self.collections[name]

    def get(self, name: str):
        """Return the handle for ``name``, fetching and caching it on a miss.

        Any error raised by ``client.get_collection`` propagates unchanged.
        """
        if name not in self.collections:
            self.collections[name] = self.client.get_collection(name=name)
        return self.collections[name]

    def delete(self, name: str):
        """Drop ``name`` from the cache, then delete it through the client."""
        self.collections.pop(name, None)
        self.client.delete_collection(name=name)

    def list_all(self):
        """Return the client's current listing of collections."""
        return self.client.list_collections()

    def clear_all(self):
        """Delete every collection the client lists and empty the cache."""
        for coll in self.list_all():
            # Listing entries may be collection objects or plain name
            # strings, depending on the client version.
            self.delete(getattr(coll, "name", coll))
        # Also forget cached handles the client no longer lists (for example
        # collections removed outside this manager); otherwise get() would
        # keep returning stale handles.
        self.collections.clear()
# Wrap a fresh client in a manager and obtain two collections through it;
# the metadata keyword is forwarded to the client by get_or_create().
manager = CollectionManager(client=chromadb.Client())
docs = manager.get_or_create(name="documents", metadata={"type": "doc"})
qa = manager.get_or_create(name="qa_pairs", metadata={"type": "qa"})
按类型组织集合 #
python
import chromadb
client = chromadb.Client()

# (collection name, value stored under the "type" metadata key), in the order
# the collections are created.
collection_types = (
    ("documents", "document"),
    ("qa_pairs", "qa"),
    ("products", "product"),
    ("users", "user"),
)

# Registry keyed by collection name; each handle is fetched or created once.
collections = {
    coll_name: client.get_or_create_collection(
        name=coll_name,
        metadata={"type": coll_type},
    )
    for coll_name, coll_type in collection_types
}
def get_collection_by_type(type_name: str):
    """Look up a collection in the module-level ``collections`` registry.

    The registry is keyed by collection name (e.g. "documents"), while the
    "type" values stored in metadata differ (e.g. "document"). A direct key
    match is tried first, then the first registered collection whose
    ``metadata["type"]`` equals ``type_name``.

    Args:
        type_name: Registry key or metadata "type" value.

    Returns:
        The matching collection, or None if nothing matches.
    """
    found = collections.get(type_name)
    if found is not None:
        return found
    for candidate in collections.values():
        # Tolerate handles without metadata (attribute missing or None).
        candidate_meta = getattr(candidate, "metadata", None) or {}
        if candidate_meta.get("type") == type_name:
            return candidate
    return None
集合备份与恢复 #
导出集合数据 #
python
import json
def _plain_vector(vector):
    """Convert one embedding to a JSON-serialisable list (None passes through)."""
    if vector is None:
        return None
    # Array-like embeddings expose tolist(); plain sequences are copied.
    return vector.tolist() if hasattr(vector, "tolist") else list(vector)


def export_collection(collection, filepath: str):
    """Write a collection's records to ``filepath`` as UTF-8 JSON.

    The file holds the collection name, its metadata and one entry per
    record with the keys "id", "document", "embedding" and "metadata".

    Args:
        collection: Object exposing ``name``, ``metadata`` and
            ``get(include=...)`` returning a dict with "ids" plus the
            requested columns.
        filepath: Destination path; an existing file is overwritten.
    """
    # Ask for embeddings explicitly; a bare get() would leave them out and
    # the backup would lose the vectors.
    data = collection.get(include=["documents", "metadatas", "embeddings"])
    ids = data['ids']

    def column(key):
        # A missing or empty column becomes one None per record. The explicit
        # None/len checks avoid truth-testing array-like values.
        values = data.get(key)
        if values is None or len(values) == 0:
            return [None] * len(ids)
        return values

    export_data = {
        "name": collection.name,
        "metadata": collection.metadata,
        "documents": [
            {
                "id": doc_id,
                "document": text,
                "embedding": _plain_vector(vector),
                "metadata": meta,
            }
            for doc_id, text, vector, meta in zip(
                ids, column('documents'), column('embeddings'), column('metadatas')
            )
        ],
    }

    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(export_data, f, ensure_ascii=False, indent=2)
    print(f"导出 {len(export_data['documents'])} 个文档到 {filepath}")
# Back up the current `collection` to a JSON file in the working directory.
export_collection(collection, filepath="collection_backup.json")
导入集合数据 #
python
import json
def _aligned_column(records, key):
    """Return ``key`` from every record, position-aligned with ``records``.

    Empty containers ({} / []) count as missing. Returns None when no record
    has a value, so the column can be omitted from the add() call entirely.
    """
    values = []
    for record in records:
        value = record.get(key)
        values.append(None if value is None or value == {} or value == [] else value)
    return values if any(value is not None for value in values) else None


def import_collection(client, filepath: str):
    """Load a JSON backup from ``filepath`` into a collection.

    The file is expected to hold "name", optional "metadata" and a
    "documents" list whose entries carry "id", "document", "embedding" and
    "metadata".

    Args:
        client: Object exposing ``get_or_create_collection(name=..., metadata=...)``.
        filepath: Path of the UTF-8 JSON backup file.

    Returns:
        The collection the records were added to.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    collection = client.get_or_create_collection(
        name=data["name"],
        # Pass None rather than an empty dict when no metadata was saved.
        metadata=data.get("metadata") or None,
    )

    records = data["documents"]
    if records:
        # Every column keeps one slot per record. Filtering empty entries out
        # of each column separately would shift the remaining values onto
        # the wrong ids.
        collection.add(
            ids=[record["id"] for record in records],
            documents=_aligned_column(records, "document"),
            embeddings=_aligned_column(records, "embedding"),
            metadatas=_aligned_column(records, "metadata"),
        )

    print(f"导入 {len(data['documents'])} 个文档到集合 {data['name']}")
    return collection
# Restore the backup file into the collection named inside it.
import_collection(client, filepath="collection_backup.json")
常见问题 #
问题 1:集合名称冲突 #
python
# Try to create the collection; if creation fails, report the error and fall
# back to fetching the collection of the same name.
# NOTE(review): every exception is treated as "already exists" here; the
# concrete exception type raised on a name clash is not visible in this file.
existing_name = "existing_collection"
try:
    collection = client.create_collection(name=existing_name)
except Exception as err:
    print(f"集合已存在: {err}")
    collection = client.get_collection(name=existing_name)
问题 2:集合数量限制 #
python
# Count the collections once and reuse the number for both messages.
collections = client.list_collections()
collection_total = len(collections)
print(f"当前集合数: {collection_total}")

# 100 is the threshold this guide uses for its warning.
if collection_total > 100:
    print("警告: 集合数量过多,考虑合并或清理")
问题 3:集合性能优化 #
python
# Index settings for a large dataset: build-time candidate list of 200,
# 32 connections per node, and batches of 1000 when adding.
large_dataset_settings = {
    "hnsw:construction_ef": 200,
    "hnsw:M": 32,
    "hnsw:batch_size": 1000,
}
large_collection = client.create_collection(
    name="large_dataset",
    metadata=large_dataset_settings,
)
下一步 #
现在你已经掌握了集合管理,接下来学习 文档操作,了解如何高效地管理文档数据!
最后更新:2026-04-04