集合管理 #

什么是集合？ #

集合（Collection）是 Chroma 中存储和组织数据的基本单位，类似于关系数据库中的"表"。每个集合可以存储文档、向量和元数据，并有自己的嵌入函数配置。

text

┌─────────────────────────────────────────────────────────────┐
│                    集合结构                                  │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  Collection                                                 │
│  ├── name: "documents"                                      │
│  ├── metadata: {"description": "文档存储"}                  │
│  ├── embedding_function: SentenceTransformer                │
│  └── Documents                                              │
│      ├── doc1: {id, document, embedding, metadata}         │
│      ├── doc2: {id, document, embedding, metadata}         │
│      └── doc3: {id, document, embedding, metadata}         │
│                                                             │
└─────────────────────────────────────────────────────────────┘

创建集合 #

基本创建 #

python

# Import the Chroma client library.
import chromadb

# Instantiate a client; later snippets in this document refer to this
# `client` name.
client = chromadb.Client()

# Create a collection identified only by its name.
collection = client.create_collection(name="my_collection")

# Show the name and ID exposed by the returned collection object.
print(f"集合名称: {collection.name}")
print(f"集合 ID: {collection.id}")

带元数据创建 #

python

# Create a collection and attach descriptive key/value metadata to it.
collection = client.create_collection(
    name="documents",
    metadata={
        "description": "文档存储集合",
        "created_by": "admin",
        "version": "1.0"
    }
)

# Print the metadata carried by the returned collection object.
print(collection.metadata)

带嵌入函数创建 #

python

from chromadb.utils import embedding_functions

# Build a SentenceTransformer-based embedding function for the named model.
embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)

# Create a collection that uses the embedding function above; the metadata
# entry records the same model name alongside the collection.
collection = client.create_collection(
    name="custom_embeddings",
    embedding_function=embedding_function,
    metadata={"embedding_model": "all-MiniLM-L6-v2"}
)

配置距离函数 #

python

# NOTE(review): `embedding_functions` is imported but not used in this snippet.
from chromadb.utils import embedding_functions

# Each call below creates one collection per distance metric; the metric is
# chosen through the "hnsw:space" metadata key. `collection` is rebound each
# time, so only the last collection stays bound to the name.

# Cosine.
collection = client.create_collection(
    name="cosine_collection",
    metadata={"hnsw:space": "cosine"}
)

# L2.
collection = client.create_collection(
    name="l2_collection",
    metadata={"hnsw:space": "l2"}
)

# Inner product.
collection = client.create_collection(
    name="ip_collection",
    metadata={"hnsw:space": "ip"}
)

距离函数对比 #

text

┌─────────────────────────────────────────────────────────────┐
│                    距离函数                                  │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  Cosine Distance (余弦距离)                                 │
│  - 范围: [0, 2]，值越小越相似                               │
│  - 适用: 文本相似度、推荐系统                               │
│  - 公式: d = 1 - A·B / (|A|×|B|)                           │
│                                                             │
│  L2 Distance (平方欧几里得距离) - 默认                      │
│  - 范围: [0, ∞)，值越小越相似                               │
│  - 适用: 图像相似度、地理位置                               │
│  - 公式: d = Σ(Ai - Bi)²                                   │
│                                                             │
│  Inner Product (内积)                                        │
│  - 范围: (-∞, ∞)，值越小越相似                              │
│  - 适用: 归一化向量、推荐系统                               │
│  - 公式: d = 1 - A·B                                        │
│                                                             │
└─────────────────────────────────────────────────────────────┘

获取集合 #

按名称获取 #

python

# Look up an existing collection by its name.
collection = client.get_collection(name="my_collection")

# Report the collection's name and the value returned by count().
print(f"集合名称: {collection.name}")
print(f"文档数量: {collection.count()}")

获取或创建 #

python

# Fetch the collection named "auto_collection", or create it with this
# metadata when it is not there yet.
collection = client.get_or_create_collection(
    name="auto_collection",
    metadata={"description": "自动创建的集合"}
)

print(f"集合名称: {collection.name}")
# count() == 0 only tells us the collection holds no records. A collection
# that already existed but is still empty also yields True, so the label
# says "empty" rather than "newly created".
print(f"集合是否为空: {collection.count() == 0}")

获取所有集合 #

python

# list_collections() yields Collection objects in most Chroma releases, but
# the v0.6.x line returns bare name strings instead. Resolve a name to its
# collection object first so the loop works with either return shape.
collections = client.list_collections()

for entry in collections:
    coll = client.get_collection(name=entry) if isinstance(entry, str) else entry
    print(f"集合: {coll.name}, 文档数: {coll.count()}")

删除集合 #

删除指定集合 #

python

# Delete the collection with the given name.
client.delete_collection(name="my_collection")

# Printed only if the call above did not raise.
print("集合已删除")

安全删除 #

python

# Attempt the deletion and report a failure instead of letting the exception
# propagate.
# NOTE(review): `except Exception` also hides unrelated failures; consider
# narrowing it to the error type your Chroma version raises for a missing
# collection — confirm the exact type.
try:
    client.delete_collection(name="nonexistent")
except Exception as e:
    print(f"删除失败: {e}")

删除所有集合 #

python

# Take a listing of the collections first, then delete them one by one.
collections = client.list_collections()

for coll in collections:
    # Entries are Collection objects in most Chroma releases and bare name
    # strings in the v0.6.x line; accept both.
    coll_name = coll if isinstance(coll, str) else coll.name
    client.delete_collection(name=coll_name)
    print(f"已删除集合: {coll_name}")

集合信息 #

获取集合统计 #

python

# Fetch the "documents" collection and print its basic attributes.
collection = client.get_collection(name="documents")

print(f"集合名称: {collection.name}")
print(f"集合 ID: {collection.id}")
# count() is called on every run; the other values are attribute reads.
print(f"文档数量: {collection.count()}")
print(f"元数据: {collection.metadata}")

检查集合是否存在 #

python

def collection_exists(client, name: str) -> bool:
    """Return True when ``client`` lists a collection called ``name``.

    Args:
        client: Object exposing ``list_collections()``.
        name: Collection name to look for (exact match).

    Returns:
        True if any listed collection has that name, otherwise False.
    """
    # list_collections() yields Collection objects in most Chroma releases
    # but plain name strings in the v0.6.x line, so compare against whichever
    # form the entry takes.
    return any(
        (entry if isinstance(entry, str) else entry.name) == name
        for entry in client.list_collections()
    )

# Branch on the helper's boolean result for the "documents" collection.
if collection_exists(client, "documents"):
    print("集合存在")
else:
    print("集合不存在")

集合配置 #

HNSW 索引参数 #

python

# Create a collection whose HNSW index settings are passed as "hnsw:*"
# metadata keys (see the parameter table that follows in this document).
collection = client.create_collection(
    name="hnsw_configured",
    metadata={
        "hnsw:space": "cosine",          # distance function
        "hnsw:construction_ef": 200,     # candidate list size while building
        "hnsw:M": 16                     # connections per node
    }
)

HNSW 参数说明 #

text

┌─────────────────────────────────────────────────────────────┐
│                    HNSW 参数                                 │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  hnsw:space                                                 │
│  - 距离函数: cosine, l2, ip                                 │
│  - 默认: l2                                                 │
│                                                             │
│  hnsw:construction_ef                                       │
│  - 构建时的候选列表大小                                     │
│  - 默认: 100                                                │
│  - 值越大，构建质量越高，但速度越慢                         │
│                                                             │
│  hnsw:M                                                     │
│  - 每个节点的连接数                                         │
│  - 默认: 16                                                 │
│  - 值越大，召回率越高，但内存占用越大                       │
│                                                             │
│  hnsw:batch_size                                            │
│  - 批量添加时的批次大小                                     │
│  - 默认: 100                                                │
│                                                             │
│  hnsw:sync_threshold                                        │
│  - 同步阈值，触发索引保存的文档数                           │
│  - 默认: 1000                                               │
│                                                             │
└─────────────────────────────────────────────────────────────┘

配置示例 #

python

# Quality-oriented profile: larger construction_ef and M than the defaults
# listed in this document's parameter table (100 and 16).
high_quality_collection = client.create_collection(
    name="high_quality",
    metadata={
        "hnsw:space": "cosine",
        "hnsw:construction_ef": 400,
        "hnsw:M": 32
    }
)

# Speed-oriented profile: smaller construction_ef and M than those defaults.
fast_collection = client.create_collection(
    name="fast",
    metadata={
        "hnsw:space": "cosine",
        "hnsw:construction_ef": 50,
        "hnsw:M": 8
    }
)

集合操作最佳实践 #

命名规范 #

python

# Names that follow the convention checked by validate_collection_name below:
# lowercase ASCII letters, digits and underscores, starting with a letter.
good_names = [
    "user_documents",
    "product_embeddings",
    "qa_pairs",
    "chat_history"
]

# Names that break that convention (uppercase, spaces, hyphen, non-ASCII).
# NOTE(review): Chroma itself reportedly accepts hyphens, so "collection-1"
# is "bad" only under this stricter project convention — confirm against the
# Chroma naming rules.
bad_names = [
    "My Collection",
    "collection-1",
    "测试集合",
    "collection with spaces"
]

def validate_collection_name(name: str) -> bool:
    """Check ``name`` against the snake_case collection naming convention.

    A valid name starts with a lowercase ASCII letter, continues with
    lowercase letters, digits or underscores, ends with a letter or digit,
    and is at least three characters long. The last two conditions mirror
    Chroma's own rules (minimum length 3, alphanumeric first and last
    character), so a name accepted here is not rejected for those reasons.
    Chroma's maximum length differs between releases and is not checked.

    Args:
        name: Candidate collection name.

    Returns:
        True if the name follows the convention; False otherwise, including
        when ``name`` is not a string.
    """
    import re
    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline, which let names such as "abc\n" through.
    pattern = r'[a-z][a-z0-9_]+[a-z0-9]'
    return isinstance(name, str) and bool(re.fullmatch(pattern, name))

元数据设计 #

python

# Example of descriptive, self-documenting collection metadata.
# NOTE(review): "distance_function" here is a plain descriptive key; the
# distance snippets earlier in this document select the metric through
# "hnsw:space", so this key presumably does not configure the index —
# confirm.
collection = client.create_collection(
    name="documents",
    metadata={
        "description": "文档存储集合",
        "version": "1.0",
        "created_at": "2024-01-01",
        "embedding_model": "all-MiniLM-L6-v2",
        "distance_function": "cosine"
    }
)

集合模板 #

python

def create_document_collection(client, name: str, description: str = ""):
    """Create a collection preconfigured for document storage.

    Args:
        client: Object exposing ``create_collection(name=..., metadata=...)``.
        name: Name of the collection to create.
        description: Free-text description stored in the metadata.

    Returns:
        Whatever ``client.create_collection`` returns.
    """
    # Imported locally because the snippet never imports `datetime`; without
    # it the first call fails with NameError.
    from datetime import datetime

    return client.create_collection(
        name=name,
        metadata={
            "description": description,
            "type": "document",
            # Naive local timestamp in ISO 8601 form.
            "created_at": datetime.now().isoformat(),
            "hnsw:space": "cosine"
        }
    )

def create_qa_collection(client, name: str):
    """Create a collection tagged for question/answer pairs.

    Args:
        client: Object exposing ``create_collection(name=..., metadata=...)``.
        name: Name of the collection to create.

    Returns:
        Whatever ``client.create_collection`` returns.
    """
    # Imported locally because the snippet never imports `datetime`; without
    # it the first call fails with NameError.
    from datetime import datetime

    return client.create_collection(
        name=name,
        metadata={
            "description": "问答对集合",
            "type": "qa",
            # Naive local timestamp in ISO 8601 form.
            "created_at": datetime.now().isoformat()
        }
    )

def create_product_collection(client, name: str):
    """Create a collection tagged for product vectors.

    Args:
        client: Object exposing ``create_collection(name=..., metadata=...)``.
        name: Name of the collection to create.

    Returns:
        Whatever ``client.create_collection`` returns.
    """
    product_metadata = dict(
        [
            ("description", "产品向量集合"),
            ("type", "product"),
            ("hnsw:space", "cosine"),
        ]
    )
    return client.create_collection(name=name, metadata=product_metadata)

多集合管理 #

集合管理器 #

python

class CollectionManager:
    """Thin cache of collection handles on top of a Chroma-style client.

    Handles are cached by name in ``self.collections`` so repeated lookups do
    not call the client again. The cache is not refreshed when a collection
    is changed or removed through some other path.
    """

    def __init__(self, client):
        """Store the client and start with an empty handle cache."""
        self.client = client
        self.collections = {}  # name -> collection handle

    def get_or_create(self, name: str, **kwargs):
        """Return the cached handle for ``name``, fetching/creating it once.

        ``kwargs`` are forwarded to ``client.get_or_create_collection`` only
        on the first call for a given name; later calls return the cached
        handle and ignore them.
        """
        if name not in self.collections:
            self.collections[name] = self.client.get_or_create_collection(
                name=name,
                **kwargs
            )
        return self.collections[name]

    def get(self, name: str):
        """Return the cached handle for ``name``, fetching it on first use."""
        if name not in self.collections:
            self.collections[name] = self.client.get_collection(name=name)
        return self.collections[name]

    def delete(self, name: str):
        """Drop ``name`` from the cache and delete it through the client."""
        self.collections.pop(name, None)
        self.client.delete_collection(name=name)

    def list_all(self):
        """Return the client's collection listing unchanged."""
        return self.client.list_collections()

    def clear_all(self):
        """Delete every collection the client lists and empty the cache."""
        for entry in self.list_all():
            # Listings hold Collection objects in most Chroma releases and
            # bare name strings in the v0.6.x line; accept both.
            self.delete(entry if isinstance(entry, str) else entry.name)
        # Drop any handle the client no longer lists.
        self.collections.clear()

# Wrap a fresh client in the manager.
manager = CollectionManager(chromadb.Client())

# The extra keyword argument is forwarded to get_or_create_collection.
docs = manager.get_or_create("documents", metadata={"type": "doc"})
qa = manager.get_or_create("qa_pairs", metadata={"type": "qa"})

按类型组织集合 #

python

import chromadb

client = chromadb.Client()

# Registry mapping a logical type name to its collection handle. Each entry
# is fetched or created when this dict literal is evaluated.
# NOTE: this rebinds `collections`, which earlier snippets used for the
# result of client.list_collections().
collections = {
    "documents": client.get_or_create_collection(
        name="documents",
        metadata={"type": "document"}
    ),
    "qa_pairs": client.get_or_create_collection(
        name="qa_pairs",
        metadata={"type": "qa"}
    ),
    "products": client.get_or_create_collection(
        name="products",
        metadata={"type": "product"}
    ),
    "users": client.get_or_create_collection(
        name="users",
        metadata={"type": "user"}
    )
}

def get_collection_by_type(type_name: str):
    """Look up ``type_name`` in the module-level ``collections`` registry.

    Returns the registered collection handle, or None when the registry has
    no entry for that type name.
    """
    if type_name in collections:
        return collections[type_name]
    return None

集合备份与恢复 #

导出集合数据 #

python

import json

def export_collection(collection, filepath: str):
    """Dump a collection's records to a UTF-8 JSON file.

    The file holds the collection name, its metadata and one entry per record
    with the keys ``id``, ``document``, ``embedding`` and ``metadata``.

    Args:
        collection: Object exposing ``name``, ``metadata`` and
            ``get(include=...)`` returning a mapping with ``ids`` plus the
            requested fields.
        filepath: Destination path; an existing file is overwritten.
    """
    # Embeddings must be requested explicitly: a bare get() returns only
    # documents and metadatas, which would leave every exported "embedding"
    # as null.
    data = collection.get(include=["documents", "metadatas", "embeddings"])

    def _field(values, index):
        """Return the JSON-ready value at ``index``, or None if absent."""
        # Use explicit None/len checks: array-like results (e.g. numpy) raise
        # on plain truthiness tests.
        if values is None or len(values) == 0:
            return None
        value = values[index]
        # Array-like embeddings are not JSON serializable; convert to lists.
        return value.tolist() if hasattr(value, "tolist") else value

    documents = data.get("documents")
    embeddings = data.get("embeddings")
    metadatas = data.get("metadatas")

    export_data = {
        "name": collection.name,
        "metadata": collection.metadata,
        "documents": [
            {
                "id": record_id,
                "document": _field(documents, index),
                "embedding": _field(embeddings, index),
                "metadata": _field(metadatas, index),
            }
            for index, record_id in enumerate(data["ids"])
        ],
    }

    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(export_data, f, ensure_ascii=False, indent=2)

    print(f"导出 {len(export_data['documents'])} 个文档到 {filepath}")

# Export the collection currently bound to `collection` into a JSON file in
# the working directory.
export_collection(collection, "collection_backup.json")

导入集合数据 #

python

import json

def import_collection(client, filepath: str):
    """Load a JSON backup and add its records to a collection.

    The file is expected to hold ``name``, optional ``metadata`` and a
    ``documents`` list whose entries carry ``id``, ``document``,
    ``embedding`` and ``metadata``.

    Args:
        client: Object exposing ``get_or_create_collection(name=..., metadata=...)``.
        filepath: Path of the JSON backup to read.

    Returns:
        The collection the records were added to.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    collection = client.get_or_create_collection(
        name=data["name"],
        # Pass None rather than an empty dict when the backup has no
        # metadata. NOTE(review): some Chroma releases reject empty metadata
        # dicts — confirm for your version.
        metadata=data.get("metadata") or None
    )

    # Filtering each field on its own would yield lists shorter than `ids`
    # whenever a record lacks a document, embedding or metadata, pairing
    # values with the wrong ids. Instead, group records by which of
    # document/embedding they carry and add each group with aligned lists.
    groups = {}
    for record in data["documents"]:
        shape = (
            record.get("document") is not None,
            record.get("embedding") is not None,
        )
        groups.setdefault(shape, []).append(record)

    for (has_document, has_embedding), batch in groups.items():
        # Empty metadata is normalized to None so positions stay aligned.
        metadatas = [record.get("metadata") or None for record in batch]
        collection.add(
            ids=[record["id"] for record in batch],
            documents=[record["document"] for record in batch] if has_document else None,
            embeddings=[record["embedding"] for record in batch] if has_embedding else None,
            metadatas=metadatas if any(m is not None for m in metadatas) else None
        )

    print(f"导入 {len(data['documents'])} 个文档到集合 {data['name']}")
    return collection

# Restore from the JSON file written by the export example; the returned
# collection is discarded here.
import_collection(client, "collection_backup.json")

常见问题 #

问题 1：集合名称冲突 #

python

# Try to create the collection; on any error, report it and fall back to
# fetching the collection by the same name.
# NOTE(review): `except Exception` treats every failure as "already exists",
# which can mask unrelated errors. client.get_or_create_collection(), shown
# earlier in this document, covers the same case without a try/except.
try:
    collection = client.create_collection(name="existing_collection")
except Exception as e:
    print(f"集合已存在: {e}")
    collection = client.get_collection(name="existing_collection")

问题 2：集合数量限制 #

python

# Count the collections the client reports.
collections = client.list_collections()
print(f"当前集合数: {len(collections)}")

# 100 is this example's own warning threshold.
# NOTE(review): it does not appear to be a limit enforced by Chroma — confirm.
if len(collections) > 100:
    print("警告: 集合数量过多，考虑合并或清理")

问题 3：集合性能优化 #

python

# Index settings for a large dataset. All three values are at or above the
# defaults listed in this document's HNSW table (100, 16 and 100).
# "hnsw:space" is not set here, so the distance function is left at its
# default.
large_collection = client.create_collection(
    name="large_dataset",
    metadata={
        "hnsw:construction_ef": 200,
        "hnsw:M": 32,
        "hnsw:batch_size": 1000
    }
)

下一步 #

现在你已经掌握了集合管理，接下来学习文档操作，了解如何高效地管理文档数据！