查询与检索 #
查询基础 #
基本查询 #
python
# Basic query walk-through: create an in-memory client, add four short
# documents, then fetch the two nearest neighbours for one query text.
import chromadb

client = chromadb.Client()
collection = client.create_collection(name="documents")

# Documents are embedded automatically by the collection's default
# embedding function; ids are mandatory alongside the texts.
collection.add(
    documents=[
        "Python 是一种编程语言",
        "JavaScript 用于网页开发",
        "机器学习是 AI 的分支",
        "深度学习使用神经网络"
    ],
    ids=["doc1", "doc2", "doc3", "doc4"]
)

# n_results caps how many neighbours come back per query text.
results = collection.query(
    query_texts=["编程"],
    n_results=2
)

print("查询结果:")
# Result fields are nested lists: index 0 corresponds to the first
# (and here only) query text.
for doc, dist in zip(results['documents'][0], results['distances'][0]):
    print(f" 文档: {doc}")
    print(f" 距离: {dist:.4f}")
    print()
查询返回结构 #
text
┌─────────────────────────────────────────────────────────────┐
│ 查询返回结构 │
├─────────────────────────────────────────────────────────────┤
│ │
│ { │
│ 'ids': [['doc1', 'doc2']], │
│ 'documents': [['文档1', '文档2']], │
│ 'metadatas': [[{...}, {...}]], │
│ 'embeddings': None, │
│ 'distances': [[0.72, 0.85]], │
│ 'uris': None, │
│ 'data': None │
│ } │
│ │
│ 说明: │
│ - 所有字段都是嵌套列表(支持批量查询) │
│ - distances: 距离分数(越小越相似) │
│ - 结果按距离从小到大排序 │
│ │
└─────────────────────────────────────────────────────────────┘
查询方式 #
文本查询 #
python
# Text query: Chroma embeds the query string with the collection's
# embedding function and returns the closest documents.
results = collection.query(
    query_texts=["人工智能"],
    n_results=3
)
向量查询 #
python
# Vector query: supply a pre-computed embedding directly.
# NOTE(review): the vector's length must match the collection's embedding
# dimension — this 4-d example is illustrative only; verify before use.
query_embedding = [0.1, 0.2, 0.3, 0.4]
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=3
)
ID 查询 #
python
# "Find documents similar to an existing document": Chroma's query() has
# no `query_ids` parameter, so first fetch the stored embedding of the
# anchor document, then query with that embedding.
anchor = collection.get(ids=["doc1"], include=["embeddings"])
results = collection.query(
    query_embeddings=[anchor['embeddings'][0]],
    n_results=3
)
print("与 doc1 最相似的文档:")
# Skip the first hit: the anchor document is its own nearest neighbour
# (distance 0).
for doc in results['documents'][0][1:]:
    print(f" - {doc}")
批量查询 #
python
# Batch query: several query texts in one call; entry i of every
# returned nested list lines up with query text i.
batch_queries = ["编程", "AI", "网页"]
results = collection.query(
    query_texts=batch_queries,
    n_results=2
)
for query, docs in zip(batch_queries, results['documents']):
    print(f"查询: {query}")
    for doc in docs:
        print(f" - {doc}")
    print()
元数据过滤 #
基本过滤 #
python
# Attach metadata dicts at add time; each dict aligns with the document
# at the same position.
# NOTE(review): assumes a fresh collection — these ids repeat an earlier
# snippet's ids and add() rejects duplicates on the same collection.
collection.add(
    documents=["Python 教程", "JavaScript 教程", "机器学习入门"],
    ids=["doc1", "doc2", "doc3"],
    metadatas=[
        {"category": "programming", "level": "beginner"},
        {"category": "programming", "level": "intermediate"},
        {"category": "ai", "level": "beginner"}
    ]
)

# `where` narrows candidates by metadata before ranking; a bare
# {"key": value} pair means equality.
results = collection.query(
    query_texts=["教程"],
    where={"category": "programming"},
    n_results=3
)

print("编程类教程:")
for doc in results['documents'][0]:
    print(f" - {doc}")
过滤操作符 #
text
┌─────────────────────────────────────────────────────────────┐
│ 过滤操作符 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 相等比较 │
│ {"key": "value"} 等于 │
│ {"key": {"$ne": "value"}} 不等于 │
│ │
│ 数值比较 │
│ {"key": {"$gt": 10}} 大于 │
│ {"key": {"$gte": 10}} 大于等于 │
│ {"key": {"$lt": 10}} 小于 │
│ {"key": {"$lte": 10}} 小于等于 │
│ │
│ 字符串操作(用于 where_document 文档内容过滤,而非元数据 where) │
│ {"$contains": "text"} 包含 │
│ {"$not_contains": "text"} 不包含 │
│ │
│ 数组操作 │
│ {"key": {"$in": ["a", "b"]}} 在列表中 │
│ {"key": {"$nin": ["a", "b"]}} 不在列表中 │
│ │
│ 逻辑操作 │
│ {"$and": [条件1, 条件2]} 且 │
│ {"$or": [条件1, 条件2]} 或 │
│ (注:Chroma 的 where 过滤不支持 $not 操作符) │
│ │
└─────────────────────────────────────────────────────────────┘
过滤示例 #
python
# Sample catalogue: programming entries carry "language", AI entries
# carry "topic"; all carry numeric "level" and "price".
# NOTE(review): assumes a fresh collection — ids doc1..doc3 repeat
# earlier snippets' ids; add() rejects duplicates on one collection.
collection.add(
    documents=[
        "Python 基础教程",
        "Python 高级教程",
        "JavaScript 入门",
        "机器学习实战",
        "深度学习进阶"
    ],
    ids=["doc1", "doc2", "doc3", "doc4", "doc5"],
    metadatas=[
        {"category": "programming", "language": "Python", "level": 1, "price": 99},
        {"category": "programming", "language": "Python", "level": 3, "price": 199},
        {"category": "programming", "language": "JavaScript", "level": 1, "price": 79},
        {"category": "ai", "topic": "ml", "level": 2, "price": 149},
        {"category": "ai", "topic": "dl", "level": 4, "price": 299}
    ]
)

# Numeric comparison: keep level >= 2.
results = collection.query(
    query_texts=["教程"],
    where={"level": {"$gte": 2}},
    n_results=5
)
print("中级及以上课程:")
for doc, meta in zip(results['documents'][0], results['metadatas'][0]):
    print(f" {doc} (level: {meta['level']})")

# Numeric comparison: keep price < 150.
results = collection.query(
    query_texts=["教程"],
    where={"price": {"$lt": 150}},
    n_results=5
)
print("\n价格低于150的课程:")
for doc, meta in zip(results['documents'][0], results['metadatas'][0]):
    print(f" {doc} (price: {meta['price']})")
复合过滤 #
python
# $and: every listed condition must hold.
results = collection.query(
    query_texts=["教程"],
    where={
        "$and": [
            {"category": "programming"},
            {"level": {"$lte": 2}}
        ]
    },
    n_results=5
)
print("编程入门课程:")
for doc in results['documents'][0]:
    print(f" - {doc}")

# $or: at least one listed condition must hold.
results = collection.query(
    query_texts=["教程"],
    where={
        "$or": [
            {"language": "Python"},
            {"topic": "ml"}
        ]
    },
    n_results=5
)
print("\nPython 或机器学习课程:")
for doc in results['documents'][0]:
    print(f" - {doc}")
Where Document 过滤 #
文档内容过滤 #
python
# NOTE(review): assumes a fresh collection — doc1..doc4 repeat ids used
# in earlier snippets; add() rejects duplicate ids on one collection.
collection.add(
    documents=[
        "Python 是一种编程语言",
        "JavaScript 用于网页开发",
        "机器学习是人工智能分支",
        "深度学习使用神经网络"
    ],
    ids=["doc1", "doc2", "doc3", "doc4"]
)

# where_document filters on the document text itself (substring match),
# independently of any metadata `where` filter.
results = collection.query(
    query_texts=["技术"],
    where_document={"$contains": "学习"},
    n_results=3
)
print("包含'学习'的文档:")
for doc in results['documents'][0]:
    print(f" - {doc}")
Where Document 操作符 #
python
# $not_contains keeps documents whose text lacks the given substring.
results = collection.query(
    query_texts=["技术"],
    where_document={"$not_contains": "Python"},
    n_results=3
)
print("不包含'Python'的文档:")
for doc in results['documents'][0]:
    print(f" - {doc}")
查询选项 #
包含字段 #
python
# `include` controls which fields are materialised in the response;
# embeddings are omitted by default because they are large.
results = collection.query(
    query_texts=["编程"],
    n_results=2,
    include=["documents", "metadatas", "distances", "embeddings"]
)
print(f"返回字段: {list(results.keys())}")
# First query's first hit: the embedding's length is the vector dimension.
print(f"嵌入维度: {len(results['embeddings'][0][0])}")
分页查询 #
python
def paginated_query(collection, query_text, page=1, page_size=10):
    """Emulate pagination by over-fetching and slicing locally.

    Chroma's query() has no offset parameter, so this fetches the first
    `page * page_size` results and returns only the requested page.

    Args:
        collection: Chroma collection to query.
        query_text: single query string.
        page: 1-based page number.
        page_size: items per page.

    Returns:
        dict shaped like a (single-query) Chroma response for the page,
        plus 'page' and 'page_size' echoed back.
    """
    start = (page - 1) * page_size
    stop = start + page_size
    fetched = collection.query(
        query_texts=[query_text],
        n_results=stop
    )
    window = slice(start, stop)
    return {
        'documents': [fetched['documents'][0][window]],
        'ids': [fetched['ids'][0][window]],
        'distances': [fetched['distances'][0][window]],
        'page': page,
        'page_size': page_size
    }
# Demo: fetch page 1 and page 2 of the same ranked result list.
results = paginated_query(collection, "教程", page=1, page_size=2)
print(f"第1页: {results['documents'][0]}")
results = paginated_query(collection, "教程", page=2, page_size=2)
print(f"第2页: {results['documents'][0]}")
高级查询技术 #
相似度阈值过滤 #
python
def query_with_threshold(collection, query_text, threshold=0.8, n_results=10):
    """Query and keep only results within a distance threshold.

    Chroma's `distances` are dissimilarity scores: smaller means more
    similar. The previous version kept results with distance >= threshold,
    which retains the *least* similar hits; filtering now keeps
    distance <= threshold.

    Args:
        collection: Chroma collection to query.
        query_text: single query string.
        threshold: maximum allowed distance (inclusive).
        n_results: how many candidates to fetch before filtering.

    Returns:
        list of {'document', 'distance', 'metadata'} dicts in the
        collection's ranking order (ascending distance).
    """
    results = collection.query(
        query_texts=[query_text],
        n_results=n_results,
        include=["documents", "distances", "metadatas"]
    )
    kept = []
    for doc, dist, meta in zip(
        results['documents'][0],
        results['distances'][0],
        results['metadatas'][0]
    ):
        # Keep near neighbours only; drop anything beyond the cutoff.
        if dist <= threshold:
            kept.append({
                'document': doc,
                'distance': dist,
                'metadata': meta
            })
    return kept
# NOTE(review): Chroma distances are lower-is-more-similar; this label
# treats the threshold as a similarity — verify the comparison direction
# against query_with_threshold's implementation.
results = query_with_threshold(collection, "编程", threshold=0.7)
print(f"相似度 >= 0.7 的结果: {len(results)}")
多查询合并 #
python
def multi_query_merge(collection, queries, n_results=5):
    """Run several queries and merge the hits by document id.

    Each document's `avg_score` is the mean of its distances across the
    queries that returned it. Chroma distances are lower-is-more-similar,
    so merged results are sorted ascending (the previous version sorted
    descending, putting the least similar documents first).

    Args:
        collection: Chroma collection to query.
        queries: iterable of query strings.
        n_results: hits to request per query.

    Returns:
        list of (doc_id, {'document', 'scores', 'avg_score'}) tuples,
        best (lowest average distance) first.
    """
    per_query = []
    for query in queries:
        per_query.append(collection.query(
            query_texts=[query],
            n_results=n_results
        ))
    merged = {}
    for results in per_query:
        for doc_id, doc, dist in zip(
            results['ids'][0],
            results['documents'][0],
            results['distances'][0]
        ):
            entry = merged.setdefault(doc_id, {'document': doc, 'scores': []})
            entry['scores'].append(dist)
    for data in merged.values():
        data['avg_score'] = sum(data['scores']) / len(data['scores'])
    # Ascending: smaller mean distance == more relevant overall.
    return sorted(merged.items(), key=lambda item: item[1]['avg_score'])
# Demo: merge rankings for three related queries.
queries = ["编程", "Python", "教程"]
merged_results = multi_query_merge(collection, queries)
print("多查询合并结果:")
for doc_id, data in merged_results[:5]:
    print(f" {doc_id}: {data['avg_score']:.3f}")
混合检索 #
python
def hybrid_search(collection, query_text, metadata_filter=None, alpha=0.5, n_results=10):
    """Blend an unfiltered and a metadata-filtered vector search.

    Chroma returns distances (lower is better), but this blend wants
    higher-is-better scores so that a document absent from the filtered
    result set can default to a neutral 0. Distances are therefore
    mapped to similarities via 1 / (1 + distance), which is monotone
    decreasing and stays in (0, 1]. (The previous version blended raw
    distances yet sorted descending, ranking the least similar first.)

    Args:
        collection: Chroma collection to query.
        query_text: query string.
        metadata_filter: optional `where` filter for the second pass.
        alpha: weight of the unfiltered vector score, in [0, 1].
        n_results: number of blended results to return.

    Returns:
        list of (doc_id, data) tuples sorted by 'combined_score',
        highest first; data holds 'vector_score', 'filter_score',
        'combined_score', 'document', 'metadata'.
    """
    def to_similarity(distance):
        # Monotone decreasing in distance; equals 1.0 at distance 0.
        return 1.0 / (1.0 + distance)

    vector_results = collection.query(
        query_texts=[query_text],
        n_results=n_results * 2,  # over-fetch so the blend has candidates
        include=["documents", "distances", "metadatas"]
    )
    if metadata_filter:
        filtered_results = collection.query(
            query_texts=[query_text],
            where=metadata_filter,
            n_results=n_results * 2,
            include=["documents", "distances", "metadatas"]
        )
    else:
        # No filter requested: both passes are the same result set.
        filtered_results = vector_results

    combined_scores = {}
    for i, (doc_id, dist) in enumerate(zip(vector_results['ids'][0], vector_results['distances'][0])):
        combined_scores[doc_id] = {
            'vector_score': to_similarity(dist),
            'filter_score': 0,  # stays 0 if the filtered pass misses it
            'document': vector_results['documents'][0][i],
            'metadata': vector_results['metadatas'][0][i]
        }
    for doc_id, dist in zip(filtered_results['ids'][0], filtered_results['distances'][0]):
        if doc_id in combined_scores:
            combined_scores[doc_id]['filter_score'] = to_similarity(dist)
    for data in combined_scores.values():
        data['combined_score'] = (
            alpha * data['vector_score'] +
            (1 - alpha) * data['filter_score']
        )
    ranked = sorted(
        combined_scores.items(),
        key=lambda item: item[1]['combined_score'],
        reverse=True
    )
    return ranked[:n_results]
# Demo: blend semantic relevance with a category restriction,
# weighting the unfiltered vector score at 0.7.
results = hybrid_search(
    collection,
    "教程",
    metadata_filter={"category": "programming"},
    alpha=0.7
)
print("混合检索结果:")
for doc_id, data in results:
    print(f" {doc_id}: {data['combined_score']:.3f}")
查询优化 #
索引优化 #
python
# HNSW index tuning via collection metadata:
# - hnsw:space           distance metric ("cosine" here)
# - hnsw:construction_ef candidate-list size at build time (quality vs speed)
# - hnsw:M               graph out-degree (recall vs memory)
collection = client.create_collection(
    name="optimized_collection",
    metadata={
        "hnsw:space": "cosine",
        "hnsw:construction_ef": 200,
        "hnsw:M": 32
    }
)
批量查询优化 #
python
def optimized_batch_query(collection, queries, batch_size=10):
    """Issue queries in fixed-size batches and collect the document lists.

    Args:
        collection: Chroma collection to query.
        queries: list of query strings.
        batch_size: how many query texts to send per call.

    Returns:
        list of per-query document lists, in the original query order.
    """
    collected = []
    position = 0
    while position < len(queries):
        chunk = queries[position:position + batch_size]
        response = collection.query(
            query_texts=chunk,
            n_results=5
        )
        collected.extend(response['documents'])
        position += batch_size
    return collected
# Demo: 100 queries issued in batches of 10 (the default batch_size).
queries = [f"查询{i}" for i in range(100)]
results = optimized_batch_query(collection, queries)
缓存查询结果 #
python
from functools import lru_cache
import hashlib
class CachedQuery:
    """Memoizing wrapper around a collection's query() call.

    Responses are keyed by an MD5 digest of (query text, n_results,
    where), so repeating an identical query is served from memory.
    """

    def __init__(self, collection):
        self.collection = collection
        self.cache = {}

    def query(self, query_text, n_results=5, where=None):
        """Return the (possibly cached) query response."""
        digest = hashlib.md5(
            f"{query_text}:{n_results}:{where}".encode()
        ).hexdigest()
        try:
            return self.cache[digest]
        except KeyError:
            pass
        response = self.collection.query(
            query_texts=[query_text],
            n_results=n_results,
            where=where
        )
        self.cache[digest] = response
        return response

    def clear_cache(self):
        """Drop all memoized responses."""
        self.cache.clear()
# Wrap the demo collection so repeated queries are served from memory.
cached_query = CachedQuery(collection)
查询示例 #
示例 1:智能问答 #
python
# QA example: store answers as documents and keep each source question
# in metadata so a match can be explained.
import chromadb

client = chromadb.Client()
qa_collection = client.create_collection(name="qa_system")
qa_pairs = [
    {"q": "Python 的创始人是谁?", "a": "Python 由 Guido van Rossum 创建"},
    {"q": "什么是机器学习?", "a": "机器学习是 AI 的分支,让计算机从数据中学习"},
    {"q": "Chroma 是什么?", "a": "Chroma 是开源向量数据库,专为 AI 应用设计"},
    {"q": "什么是 RAG?", "a": "RAG 是检索增强生成,结合检索和生成提高准确性"}
]
qa_collection.add(
    documents=[qa["a"] for qa in qa_pairs],
    metadatas=[{"question": qa["q"]} for qa in qa_pairs],
    ids=[f"qa{i+1}" for i in range(len(qa_pairs))]
)
def ask(question: str, threshold: float = 0.5):
    """Answer a question from the QA collection.

    The nearest stored answer is accepted only when its distance is at
    most `threshold`. Chroma distances are lower-is-more-similar; the
    previous version compared with >=, which accepted only *poor*
    matches and rejected good ones.

    Args:
        question: user question.
        threshold: maximum accepted distance (inclusive).

    Returns:
        dict with 'answer', 'confidence' (the raw distance — lower is
        better) and, on a hit, 'matched_question'.
    """
    results = qa_collection.query(
        query_texts=[question],
        n_results=1,
        include=["documents", "metadatas", "distances"]
    )
    best_distance = results['distances'][0][0]
    if best_distance <= threshold:
        return {
            'answer': results['documents'][0][0],
            'matched_question': results['metadatas'][0][0]['question'],
            'confidence': best_distance
        }
    return {
        'answer': "抱歉,我没有找到相关答案",
        'confidence': best_distance
    }


print(ask("谁发明了 Python?"))
print(ask("RAG 是什么意思?"))
示例 2:推荐系统 #
python
# Recommendation example: product catalogue with structured metadata
# (category, brand, price) for filtered recommendations.
import chromadb

client = chromadb.Client()
products = client.create_collection(name="products")
products.add(
    documents=[
        "iPhone 15 Pro Max 256GB 深空黑",
        "Samsung Galaxy S24 Ultra 钛灰",
        "MacBook Pro 14英寸 M3 Pro",
        "iPad Pro 12.9英寸 M2芯片",
        "AirPods Pro 第二代"
    ],
    metadatas=[
        {"category": "phone", "brand": "Apple", "price": 9999},
        {"category": "phone", "brand": "Samsung", "price": 8999},
        {"category": "laptop", "brand": "Apple", "price": 14999},
        {"category": "tablet", "brand": "Apple", "price": 8999},
        {"category": "audio", "brand": "Apple", "price": 1899}
    ],
    ids=["p1", "p2", "p3", "p4", "p5"]
)
def recommend(query: str, category: str = None, max_price: int = None, n: int = 3):
    """Recommend products matching a text query plus optional constraints.

    Args:
        query: free-text description of what the user wants.
        category: optional exact-match metadata category.
        max_price: optional inclusive price ceiling. Tested with
            `is not None` so a budget of 0 is honoured (the previous
            truthiness test silently ignored max_price=0).
        n: number of recommendations to return.

    Returns:
        list of {'product', 'brand', 'price', 'relevance'} dicts;
        'relevance' is the raw Chroma distance (lower is better).
    """
    # Collect active conditions, then combine with $and only when more
    # than one applies; a single condition is passed through directly.
    conditions = []
    if category:
        conditions.append({"category": category})
    if max_price is not None:
        conditions.append({"price": {"$lte": max_price}})
    if not conditions:
        where_filter = None
    elif len(conditions) == 1:
        where_filter = conditions[0]
    else:
        where_filter = {"$and": conditions}

    results = products.query(
        query_texts=[query],
        where=where_filter,
        n_results=n
    )
    recommendations = []
    for doc, meta, dist in zip(
        results['documents'][0],
        results['metadatas'][0],
        results['distances'][0]
    ):
        recommendations.append({
            'product': doc,
            'brand': meta['brand'],
            'price': meta['price'],
            'relevance': dist
        })
    return recommendations
# Demo: category-constrained and budget-constrained recommendations.
print("推荐 - 苹果手机:")
for r in recommend("苹果手机", category="phone"):
    print(f" {r['product']} - ¥{r['price']}")
print("\n推荐 - 5000元以下:")
for r in recommend("电子产品", max_price=5000):
    print(f" {r['product']} - ¥{r['price']}")
下一步 #
现在你已经掌握了查询与检索,接下来学习 高级配置,了解 Chroma 的高级功能和优化技巧!
最后更新:2026-04-04