快速开始 #
第一个 Chroma 程序 #
让我们从一个简单的例子开始,体验 Chroma 的核心功能。
最简示例 #
python
import chromadb
# Create a client with default settings and a collection to hold the documents.
chroma_client = chromadb.Client()
collection = chroma_client.create_collection(name="my_collection")

# Store three documents, each under its own ID.
sample_documents = ["这是第一个文档", "这是第二个文档", "这是第三个文档"]
sample_ids = ["id1", "id2", "id3"]
collection.add(documents=sample_documents, ids=sample_ids)

# Query by text, asking for two results, and print the raw result dict.
results = collection.query(query_texts=["第一个"], n_results=2)
print(results)
输出:
python
{
'ids': [['id1', 'id2']],
'documents': [['这是第一个文档', '这是第二个文档']],
'metadatas': [[None, None]],
'distances': [[0.6, 0.8]]
}
完整示例:文档搜索系统 #
让我们构建一个完整的文档搜索系统,涵盖 Chroma 的主要功能。
步骤 1:创建客户端和集合 #
python
import chromadb
# Create a client, then a collection named "documents" whose metadata
# carries a human-readable description.
client = chromadb.Client()
collection_metadata = {"description": "文档搜索系统"}
collection = client.create_collection(
    name="documents",
    metadata=collection_metadata,
)
print(f"集合创建成功: {collection.name}")
步骤 2:添加文档 #
python
# Each entry pairs a document's text with the metadata stored alongside it.
corpus = [
    ("Python 是一种高级编程语言,由 Guido van Rossum 创建",
     {"category": "programming", "language": "Python"}),
    ("JavaScript 是一种用于网页开发的脚本语言",
     {"category": "programming", "language": "JavaScript"}),
    ("机器学习是人工智能的一个分支,专注于让计算机从数据中学习",
     {"category": "ai", "topic": "machine-learning"}),
    ("深度学习使用神经网络进行模式识别和预测",
     {"category": "ai", "topic": "deep-learning"}),
    ("自然语言处理让计算机理解和生成人类语言",
     {"category": "ai", "topic": "nlp"}),
    ("向量数据库用于存储和检索高维向量数据",
     {"category": "database", "type": "vector"}),
    ("RAG 是检索增强生成的缩写,用于提高 LLM 的准确性",
     {"category": "ai", "topic": "rag"}),
    ("Chroma 是一个开源的向量数据库,专为 AI 应用设计",
     {"category": "database", "type": "vector"}),
]
documents = [text for text, _ in corpus]
metadatas = [metadata for _, metadata in corpus]

# IDs are "doc1" .. "docN", in the same order as the documents.
ids = [f"doc{number}" for number in range(1, len(documents) + 1)]

collection.add(documents=documents, metadatas=metadatas, ids=ids)
print(f"添加了 {len(documents)} 个文档")
步骤 3:查询文档 #
python
results = collection.query(query_texts=["编程语言"], n_results=3)

print("查询结果:")
# A single query text was sent, so its hits sit at index 0 of each result list.
hits = zip(
    results['documents'][0],
    results['distances'][0],
    results['metadatas'][0],
)
for rank, (doc, distance, metadata) in enumerate(hits, start=1):
    print(f"{rank}. {doc}")
    print(f" 距离: {distance}")
    print(f" 元数据: {metadata}")
    print()
步骤 4:带过滤的查询 #
python
# Restrict the search to records whose "category" metadata equals "ai".
ai_only = {"category": "ai"}
results = collection.query(
    query_texts=["人工智能"],
    where=ai_only,
    n_results=3,
)

print("AI 相关文档:")
matched_docs = results['documents'][0]
for doc in matched_docs:
    print(f"- {doc}")
步骤 5:更新文档 #
python
# Replace both the text and the metadata stored under ID "doc1".
updated_text = "Python 是一种高级编程语言,广泛用于数据科学和 AI"
updated_metadata = {"category": "programming", "language": "Python", "updated": True}
collection.update(
    ids=["doc1"],
    documents=[updated_text],
    metadatas=[updated_metadata],
)
print("文档更新成功")
步骤 6:删除文档 #
python
# Remove the record stored under ID "doc2".
ids_to_remove = ["doc2"]
collection.delete(ids=ids_to_remove)
print("文档删除成功")

# Fetch what is left in the collection and count the returned IDs.
remaining = collection.get()
remaining_count = len(remaining['ids'])
print(f"剩余文档数: {remaining_count}")
完整代码 #
python
import chromadb
def main():
    """Run the document-search demo end to end.

    Creates a collection, adds eight documents with metadata, runs one plain
    query and one metadata-filtered query, then lists every stored document.
    All output goes to stdout.
    """
    separator = "=" * 50

    def print_banner(title, leading_blank_line=True):
        # Section header: separator, title, separator.
        print(("\n" if leading_blank_line else "") + separator)
        print(title)
        print(separator)

    client = chromadb.Client()
    collection = client.create_collection(
        name="documents",
        metadata={"description": "文档搜索系统"},
    )

    # Each entry pairs a document's text with the metadata stored alongside it.
    corpus = [
        ("Python 是一种高级编程语言,由 Guido van Rossum 创建",
         {"category": "programming", "language": "Python"}),
        ("JavaScript 是一种用于网页开发的脚本语言",
         {"category": "programming", "language": "JavaScript"}),
        ("机器学习是人工智能的一个分支,专注于让计算机从数据中学习",
         {"category": "ai", "topic": "machine-learning"}),
        ("深度学习使用神经网络进行模式识别和预测",
         {"category": "ai", "topic": "deep-learning"}),
        ("自然语言处理让计算机理解和生成人类语言",
         {"category": "ai", "topic": "nlp"}),
        ("向量数据库用于存储和检索高维向量数据",
         {"category": "database", "type": "vector"}),
        ("RAG 是检索增强生成的缩写,用于提高 LLM 的准确性",
         {"category": "ai", "topic": "rag"}),
        ("Chroma 是一个开源的向量数据库,专为 AI 应用设计",
         {"category": "database", "type": "vector"}),
    ]
    documents = [text for text, _ in corpus]
    metadatas = [metadata for _, metadata in corpus]
    # IDs are "doc1" .. "docN", in the same order as the documents.
    ids = [f"doc{number}" for number in range(1, len(documents) + 1)]

    collection.add(documents=documents, metadatas=metadatas, ids=ids)
    print(f"添加了 {len(documents)} 个文档\n")

    # Plain text query.
    print_banner("查询: 编程语言", leading_blank_line=False)
    results = collection.query(query_texts=["编程语言"], n_results=3)
    for rank, doc in enumerate(results['documents'][0], start=1):
        print(f"{rank}. {doc}")

    # Same kind of query, restricted to records whose category is "ai".
    print_banner("查询: 人工智能 (仅 AI 分类)")
    results = collection.query(
        query_texts=["人工智能"],
        where={"category": "ai"},
        n_results=3,
    )
    for doc in results['documents'][0]:
        print(f"- {doc}")

    # Dump every stored document, showing only the first 30 characters of each.
    print_banner("获取所有文档")
    all_docs = collection.get()
    for doc_id, doc in zip(all_docs['ids'], all_docs['documents']):
        preview = doc[:30]
        print(f"{doc_id}: {preview}...")


if __name__ == "__main__":
    main()
持久化存储 #
使用持久化客户端 #
python
import chromadb
# Open (or create) a database under ./chroma_db.
db_path = "./chroma_db"
client = chromadb.PersistentClient(path=db_path)

# get_or_create_collection lets this snippet be run more than once.
collection = client.get_or_create_collection(name="persistent_docs")

persisted_documents = ["持久化存储的文档"]
persisted_ids = ["persistent1"]
collection.add(documents=persisted_documents, ids=persisted_ids)
print("数据已持久化到 ./chroma_db 目录")
重新加载持久化数据 #
python
import chromadb
# Point a new client at the same directory and look up the existing collection.
db_path = "./chroma_db"
client = chromadb.PersistentClient(path=db_path)
collection = client.get_collection(name="persistent_docs")

results = collection.get()
loaded_count = len(results['ids'])
print(f"加载了 {loaded_count} 个文档")
使用自定义嵌入函数 #
使用 Sentence Transformers #
python
import chromadb
from chromadb.utils import embedding_functions
# Build an embedding function backed by the "all-MiniLM-L6-v2" model.
model_name = "all-MiniLM-L6-v2"
embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=model_name,
)

# Attach the embedding function to the collection when creating it.
client = chromadb.Client()
collection = client.create_collection(
    name="custom_embeddings",
    embedding_function=embedding_function,
)

collection.add(documents=["使用自定义嵌入函数的文档"], ids=["custom1"])

results = collection.query(query_texts=["自定义嵌入"], n_results=1)
print(results)
使用 OpenAI Embeddings #
python
import chromadb
from chromadb.utils import embedding_functions
import os
# The API key is read from the OPENAI_API_KEY environment variable
# (os.getenv yields None when the variable is not set).
api_key = os.getenv("OPENAI_API_KEY")
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=api_key,
    model_name="text-embedding-ada-002",
)

# Attach the embedding function to the collection when creating it.
client = chromadb.Client()
collection = client.create_collection(
    name="openai_embeddings",
    embedding_function=openai_ef,
)

collection.add(documents=["使用 OpenAI 嵌入的文档"], ids=["openai1"])
批量操作 #
批量添加文档 #
python
import chromadb
client = chromadb.Client()
collection = client.create_collection(name="batch_docs")

batch_size = 100
total_docs = 1000

# Add documents in slices of batch_size; the last slice is clamped to total_docs.
for start in range(0, total_docs, batch_size):
    stop = min(start + batch_size, total_docs)
    batch_numbers = range(start, stop)
    collection.add(
        documents=[f"文档内容 {j}" for j in batch_numbers],
        ids=[f"doc{j}" for j in batch_numbers],
    )
    print(f"已添加 {stop}/{total_docs} 个文档")

print(f"总共添加了 {collection.count()} 个文档")
批量查询 #
python
# Send three query texts in one call; results['documents'] then holds one
# list of hits per query, in the same order.
batch_queries = ["文档内容 100", "文档内容 500", "文档内容 900"]
results = collection.query(query_texts=batch_queries, n_results=3)

for query_number, docs in enumerate(results['documents'], start=1):
    print(f"查询 {query_number}:")
    for doc in docs:
        print(f" - {doc}")
实用示例 #
示例 1:问答系统 #
python
import chromadb
client = chromadb.Client()
qa_collection = client.create_collection(name="qa_system")

qa_pairs = [
    {"question": "Python 的创始人是谁?", "answer": "Python 由 Guido van Rossum 创建"},
    {"question": "什么是机器学习?", "answer": "机器学习是 AI 的分支,让计算机从数据中学习"},
    {"question": "Chroma 是什么?", "answer": "Chroma 是开源的向量数据库,专为 AI 应用设计"}
]

# The answer is the stored document; the original question is kept as metadata.
answers = []
question_metadatas = []
qa_ids = []
for number, pair in enumerate(qa_pairs, start=1):
    answers.append(pair["answer"])
    question_metadatas.append({"question": pair["question"]})
    qa_ids.append(f"qa{number}")

qa_collection.add(documents=answers, metadatas=question_metadatas, ids=qa_ids)


def ask_question(question: str):
    """Query the QA collection with *question* and return the top answer text.

    Exactly one result is requested, and the document of that result is
    returned.
    """
    hits = qa_collection.query(query_texts=[question], n_results=1)
    # One query text was sent, so its hits are at index 0.
    top_documents = hits['documents'][0]
    return top_documents[0]


print(ask_question("谁发明了 Python?"))
print(ask_question("Chroma 是用来做什么的?"))
示例 2:文档去重 #
python
import chromadb
client = chromadb.Client()
# Use cosine space so that a query distance equals 1 - cosine similarity;
# find_duplicates() relies on this to turn distances into similarities.
dedup_collection = client.create_collection(
    name="dedup_docs",
    metadata={"hnsw:space": "cosine"},
)

documents = [
    "Python 是编程语言",
    "Python 是一种编程语言",
    "JavaScript 用于网页开发",
    "JS 用于网页",
    "机器学习是 AI 的分支"
]
dedup_collection.add(
    documents=documents,
    ids=[f"doc{number}" for number in range(1, len(documents) + 1)]
)


def find_duplicates(threshold: float = 0.95):
    """Find pairs of stored documents that are near-duplicates of each other.

    Every stored document is used as a query against the whole collection.
    A pair is reported when its cosine similarity (1 - distance) is at least
    *threshold*.

    Args:
        threshold: Minimum cosine similarity for two documents to count as
            duplicates. Higher values are stricter.

    Returns:
        A list of ``(doc_id, other_id, distance)`` tuples, one per unordered
        pair of documents.
    """
    all_docs = dedup_collection.get()
    doc_ids = all_docs['ids']
    if not doc_ids:
        # Nothing stored, so there is nothing to compare.
        return []

    # Collection.query has no "query by ID" argument, so the stored texts
    # themselves are sent as the query texts, all in a single batch.
    results = dedup_collection.query(
        query_texts=all_docs['documents'],
        n_results=len(doc_ids)
    )

    duplicates = []
    reported_pairs = set()
    for doc_id, result_ids, distances in zip(doc_ids, results['ids'], results['distances']):
        for result_id, distance in zip(result_ids, distances):
            # Skip the document itself by ID; do not assume it is ranked first.
            if result_id == doc_id:
                continue
            pair = frozenset((doc_id, result_id))
            # A <-> B would otherwise show up again as B <-> A.
            if pair in reported_pairs:
                continue
            # Smaller distance means more similar, so compare the similarity.
            if 1 - distance >= threshold:
                reported_pairs.add(pair)
                duplicates.append((doc_id, result_id, distance))
    return duplicates


duplicates = find_duplicates()
print("可能的重复文档:")
for doc1, doc2, dist in duplicates:
    print(f" {doc1} <-> {doc2}: {dist:.3f}")
示例 3:推荐系统 #
python
import chromadb
from typing import Optional

client = chromadb.Client()
products = client.create_collection(name="products")

# Product descriptions are the documents; category, brand and price are metadata.
products.add(
    documents=[
        "iPhone 15 Pro Max 256GB 深空黑色",
        "Samsung Galaxy S24 Ultra 钛灰色",
        "MacBook Pro 14英寸 M3 Pro 芯片",
        "iPad Pro 12.9英寸 M2 芯片",
        "AirPods Pro 第二代"
    ],
    metadatas=[
        {"category": "phone", "brand": "Apple", "price": 9999},
        {"category": "phone", "brand": "Samsung", "price": 8999},
        {"category": "laptop", "brand": "Apple", "price": 14999},
        {"category": "tablet", "brand": "Apple", "price": 8999},
        {"category": "audio", "brand": "Apple", "price": 1899}
    ],
    ids=["p1", "p2", "p3", "p4", "p5"]
)


def recommend(query: str, category: Optional[str] = None, n: int = 3) -> list:
    """Return up to *n* products matching *query*.

    Args:
        query: Free-text description of what the user is looking for.
        category: When given (and non-empty), only products whose "category"
            metadata equals this value are considered.
        n: Number of results to request.

    Returns:
        A list of ``(document, metadata)`` pairs for the returned products.
    """
    # No category means no metadata filter at all.
    where_filter = {"category": category} if category else None
    results = products.query(
        query_texts=[query],
        where=where_filter,
        n_results=n
    )
    # One query text was sent, so its hits are at index 0 of each result list.
    return list(zip(results['documents'][0], results['metadatas'][0]))


print("推荐 - 苹果手机:")
for doc, meta in recommend("苹果手机", category="phone"):
    print(f" {doc} - ¥{meta['price']}")

print("\n推荐 - 生产力工具:")
for doc, meta in recommend("生产力工具"):
    print(f" {doc} - ¥{meta['price']}")
下一步 #
现在你已经掌握了 Chroma 的基本操作,接下来学习 集合管理,深入了解集合的高级功能!
最后更新:2026-04-04