搜索查询 #
本章详细介绍 Weaviate 的搜索查询功能。
搜索类型概览 #
text
Weaviate 搜索类型:
┌─────────────────────────────────────────────────────────────┐
│ 搜索类型 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 向量搜索 (Vector Search) │
│ ├── nearVector: 向量相似性搜索 │
│ ├── nearText: 文本向量搜索 │
│ └── nearObject: 对象相似性搜索 │
│ │
│ 关键词搜索 (Keyword Search) │
│ └── bm25: BM25 算法搜索 │
│ │
│ 混合搜索 (Hybrid Search) │
│ └── 结合向量和关键词搜索 │
│ │
│ 过滤搜索 (Filtered Search) │
│ └── 基于属性的精确过滤 │
│ │
│ 地理搜索 (Geo Search) │
│ └── 基于地理位置的搜索 │
│ │
└─────────────────────────────────────────────────────────────┘
向量搜索 #
nearText 搜索 #
python
import weaviate.classes as wvc
# Semantic search: the query text is vectorized by the collection's vectorizer.
articles = client.collections.get("Article")
response = articles.query.near_text(
    query="向量数据库入门教程",
    limit=5,
    # Metadata is only returned when explicitly requested; without this,
    # obj.metadata.distance below would be None.
    return_metadata=wvc.query.MetadataQuery(distance=True),
)
for obj in response.objects:
    print(f"Title: {obj.properties['title']}")
    print(f"Distance: {obj.metadata.distance}")
    print()
nearVector 搜索 #
python
import numpy as np
from openai import OpenAI
openai_client = OpenAI()
def get_embedding(text):
    """Return the embedding of ``text`` produced by the OpenAI embeddings API.

    The request goes through the module-level ``openai_client`` using the
    ``text-embedding-3-small`` model; the ``embedding`` attribute of the first
    item in the response's ``data`` is returned.
    """
    result = openai_client.embeddings.create(
        input=text,
        model="text-embedding-3-small",
    )
    first_item = result.data[0]
    return first_item.embedding
# Embed the query text ourselves, then search with the resulting vector.
query_vector = get_embedding("向量数据库入门教程")
response = articles.query.near_vector(limit=5, near_vector=query_vector)
for hit in response.objects:
    print(f"Title: {hit.properties['title']}")
nearObject 搜索 #
python
# Fetch one stored article and use its UUID as the reference object.
articles = client.collections.get("Article")
all_articles = articles.query.fetch_objects(limit=1)
first_article = all_articles.objects[0]
reference_uuid = first_article.uuid

# Search for up to five objects near the reference object.
response = articles.query.near_object(limit=5, near_object=reference_uuid)

print("与指定对象相似的文章:")
for similar in response.objects:
    print(f"- {similar.properties['title']}")
搜索参数 #
python
response = articles.query.near_text(
    query="向量数据库",
    # certainty and distance are mutually exclusive thresholds: pass only one.
    # To threshold on distance instead, replace this line with distance=0.3.
    certainty=0.7,
    limit=10,
    offset=0,
    # The Python client exposes Weaviate's "autocut" feature as auto_limit.
    auto_limit=1,
    # Request certainty explicitly, otherwise obj.metadata.certainty is None.
    return_metadata=wvc.query.MetadataQuery(certainty=True),
)
# Query results carry no total_count attribute; report how many objects
# were actually returned.
print(f"Total: {len(response.objects)}")
for obj in response.objects:
    print(f"- {obj.properties['title']} (certainty: {obj.metadata.certainty})")
参数说明 #
text
搜索参数说明:
certainty:
├── 相似度阈值 (0-1)
├── 值越大,结果越相似
└── 与 distance 互斥
distance:
├── 距离阈值
├── 值越小,结果越相似
└── 与 certainty 互斥
limit:
├── 返回结果数量
└── 默认 10
offset:
├── 分页偏移量
└── 用于分页
autocut(Python v4 客户端中对应的参数名为 auto_limit):
├── 自动截断结果
├── 值为 1 表示返回第一组相似结果
└── 值为 2 表示返回前两组
关键词搜索 (BM25) #
基本 BM25 搜索 #
python
# Keyword (BM25) search over the collection.
response = articles.query.bm25(
    query="向量数据库",
    limit=10,
    # The BM25 score is only populated when requested via return_metadata.
    return_metadata=wvc.query.MetadataQuery(score=True),
)
for obj in response.objects:
    print(f"Title: {obj.properties['title']}")
    print(f"Score: {obj.metadata.score}")
指定搜索属性 #
python
# Restrict the BM25 keyword match to the listed properties.
searched_properties = ["title", "content"]
response = articles.query.bm25(
    limit=10,
    query="向量数据库",
    query_properties=searched_properties,
)
BM25 参数 #
python
# Per-property weighting is expressed with the "^<factor>" suffix inside
# query_properties ("title^2" doubles the weight of title matches).
# bm25() has no separate `boost` keyword argument.
response = articles.query.bm25(
    query="向量数据库",
    query_properties=["title^2", "content"],
    limit=10,
)
混合搜索 #
基本混合搜索 #
python
# Hybrid search: alpha weights the vector result against the keyword result.
response = articles.query.hybrid(
    query="向量数据库",
    alpha=0.5,
    limit=10,
    # The fused score is only populated when requested via return_metadata.
    return_metadata=wvc.query.MetadataQuery(score=True),
)
for obj in response.objects:
    print(f"Title: {obj.properties['title']}")
    print(f"Score: {obj.metadata.score}")
Alpha 参数 #
text
Alpha 参数说明:
alpha = 0:
├── 纯 BM25 关键词搜索
└── 不考虑向量相似性
alpha = 0.5:
├── 向量和关键词各占 50%
└── 平衡语义和关键词
alpha = 1:
├── 纯向量搜索
└── 不考虑关键词匹配
混合搜索与向量 #
python
# Supply our own query vector for the vector half of the hybrid search;
# the query text is still passed for the keyword half.
query_vector = get_embedding("向量数据库")
response = articles.query.hybrid(
    alpha=0.7,
    limit=10,
    query="向量数据库",
    vector=query_vector,
)
指定搜索属性 #
python
# Hybrid search with the query restricted to the listed properties.
searched_properties = ["title", "content"]
response = articles.query.hybrid(
    alpha=0.5,
    limit=10,
    query="向量数据库",
    query_properties=searched_properties,
)
过滤查询 #
基本过滤 #
python
from weaviate.classes.query import Filter
# Fetch objects whose "category" property equals the given value.
category_filter = Filter.by_property("category").equal("技术")
response = articles.query.fetch_objects(filters=category_filter, limit=10)
for article in response.objects:
    print(f"- {article.properties['title']}")
过滤操作符 #
python
# Each query below demonstrates one comparison operator on the "views" property.

# views > 1000
over_1000 = Filter.by_property("views").greater_than(1000)
response = articles.query.fetch_objects(filters=over_1000, limit=10)

# views >= 1000
at_least_1000 = Filter.by_property("views").greater_or_equal(1000)
response = articles.query.fetch_objects(filters=at_least_1000, limit=10)

# views < 5000
under_5000 = Filter.by_property("views").less_than(5000)
response = articles.query.fetch_objects(filters=under_5000, limit=10)

# views <= 5000
at_most_5000 = Filter.by_property("views").less_or_equal(5000)
response = articles.query.fetch_objects(filters=at_most_5000, limit=10)
文本过滤 #
python
# Wildcard match: "*" stands for any run of characters.
response = articles.query.fetch_objects(
    filters=Filter.by_property("title").like("*向量*"),
    limit=10,
)

# Match any of several values. The Filter API has no equal_any();
# contains_any() is the operator for "value is one of ...".
response = articles.query.fetch_objects(
    filters=Filter.by_property("category").contains_any(["技术", "AI"]),
    limit=10,
)

# Match titles containing at least one of the given terms.
response = articles.query.fetch_objects(
    filters=Filter.by_property("title").contains_any(["向量", "数据库"]),
    limit=10,
)
数组过滤 #
python
# Array property contains at least one of the listed tags.
any_tag = Filter.by_property("tags").contains_any(["AI", "数据库"])
response = articles.query.fetch_objects(filters=any_tag, limit=10)

# Array property contains every one of the listed tags.
all_tags = Filter.by_property("tags").contains_all(["AI", "教程"])
response = articles.query.fetch_objects(filters=all_tags, limit=10)
日期过滤 #
python
from datetime import datetime, timezone

# Use a timezone-aware datetime so the cutoff is unambiguous; a naive
# datetime leaves the timezone implicit.
published_after = datetime(2024, 1, 1, tzinfo=timezone.utc)
response = articles.query.fetch_objects(
    filters=Filter.by_property("published").greater_than(published_after),
    limit=10,
)
组合过滤 #
python
# AND: both conditions must hold.
tech_and_popular = (
    Filter.by_property("category").equal("技术")
    & Filter.by_property("views").greater_than(1000)
)
response = articles.query.fetch_objects(filters=tech_and_popular, limit=10)

# OR: either condition may hold.
tech_or_ai = (
    Filter.by_property("category").equal("技术")
    | Filter.by_property("category").equal("AI")
)
response = articles.query.fetch_objects(filters=tech_or_ai, limit=10)

# Nested: category matches AND (views condition OR featured flag).
popular_or_featured = (
    Filter.by_property("views").greater_than(1000)
    | Filter.by_property("featured").equal(True)
)
tech_and_notable = Filter.by_property("category").equal("技术") & popular_or_featured
response = articles.query.fetch_objects(filters=tech_and_notable, limit=10)
否定过滤 #
python
# Negate an equality condition with the dedicated not_equal() operator
# rather than relying on Python's "~" operator on filter objects.
response = articles.query.fetch_objects(
    filters=Filter.by_property("category").not_equal("技术"),
    limit=10,
)
向量搜索与过滤 #
python
# Vector search combined with a property filter.
category_filter = Filter.by_property("category").equal("技术")
response = articles.query.near_text(
    filters=category_filter,
    limit=10,
    query="向量数据库",
)
混合搜索与过滤 #
python
# Hybrid search combined with a property filter.
views_filter = Filter.by_property("views").greater_than(500)
response = articles.query.hybrid(
    alpha=0.5,
    filters=views_filter,
    limit=10,
    query="向量数据库",
)
地理搜索 #
创建地理数据 #
python
# Create a collection with a text name and a geo-coordinates property.
name_property = wvc.config.Property(
    name="name",
    data_type=wvc.config.DataType.TEXT,
)
coordinates_property = wvc.config.Property(
    name="coordinates",
    data_type=wvc.config.DataType.GEO_COORDINATES,
)
locations = client.collections.create(
    name="Location",
    properties=[name_property, coordinates_property],
)

# Insert the sample places one at a time, in the order listed.
sample_places = [
    {
        "name": "北京天安门",
        "coordinates": {"latitude": 39.9042, "longitude": 116.4074},
    },
    {
        "name": "上海东方明珠",
        "coordinates": {"latitude": 31.2397, "longitude": 121.4998},
    },
]
for place in sample_places:
    locations.data.insert(place)
地理范围搜索 #
python
from weaviate.classes.query import Filter, GeoCoordinate

# within_geo_range() takes the centre point as a GeoCoordinate object plus a
# distance in meters; it does not accept latitude/longitude keyword arguments.
response = locations.query.fetch_objects(
    filters=Filter.by_property("coordinates").within_geo_range(
        coordinate=GeoCoordinate(latitude=39.9042, longitude=116.4074),
        distance=10000,
    ),
    limit=10,
)
print("10km 范围内的地点:")
for obj in response.objects:
    print(f"- {obj.properties['name']}")
排序 #
基本排序 #
python
# Sort.by_property() takes `ascending` (default True); descending order is
# requested with ascending=False. There is no `descending` keyword.
response = articles.query.fetch_objects(
    limit=10,
    sort=wvc.query.Sort.by_property("views", ascending=False),
)
for obj in response.objects:
    print(f"- {obj.properties['title']} (views: {obj.properties['views']})")
多字段排序 #
python
# Multiple sort keys are expressed by chaining by_property() calls:
# first by category (ascending), then by views (descending).
response = articles.query.fetch_objects(
    limit=10,
    sort=wvc.query.Sort.by_property("category").by_property(
        "views", ascending=False
    ),
)
向量搜索排序 #
python
# near_text() has no `sort` parameter: vector search results come back
# ordered by similarity. To present them in another order, re-sort the
# returned objects on the client side.
response = articles.query.near_text(
    query="向量数据库",
    limit=10,
)
sorted_objects = sorted(
    response.objects,
    key=lambda obj: obj.properties["views"],
    reverse=True,
)
分页 #
Offset 分页 #
python
# Offset paging: skip the objects of all earlier pages (pages are 1-based).
page_size = 10
page = 2
skipped = (page - 1) * page_size
response = articles.query.fetch_objects(offset=skipped, limit=page_size)
print(f"Page {page}:")
for article in response.objects:
    print(f"- {article.properties['title']}")
Cursor 分页 #
python
# Cursor paging: continue after the last object of an earlier result set.
last_seen_uuid = all_articles.objects[-1].uuid
response = articles.query.fetch_objects(after=last_seen_uuid, limit=10)
聚合查询 #
计数统计 #
python
# Request the total object count for the collection.
response = articles.aggregate.over_all(total_count=True)
article_total = response.total_count
print(f"Total articles: {article_total}")
分组统计 #
python
from weaviate.classes.aggregate import GroupByAggregate
# Group the aggregation by the "category" property and print each group's count.
category_grouping = GroupByAggregate(prop="category")
response = articles.aggregate.over_all(group_by=category_grouping)
print("按类别统计:")
for bucket in response.groups:
    print(f"- {bucket.grouped_by.value}: {bucket.total_count}")
数值统计 #
python
# Request numeric statistics for the integer property "views".
response = articles.aggregate.over_all(
    return_metrics=wvc.aggregate.Metrics("views").integer(
        mean=True,
        maximum=True,
        minimum=True,
        sum_=True,
        mode=True,
        median=True,
    )
)
# response.properties["views"] is already the integer aggregation result;
# it has no nested `.integer` attribute.
metrics = response.properties["views"]
print("Views 统计:")
print(f" 平均值: {metrics.mean}")
print(f" 最大值: {metrics.maximum}")
print(f" 最小值: {metrics.minimum}")
print(f" 总和: {metrics.sum_}")
print(f" 中位数: {metrics.median}")
# mode was requested above, so report it as well.
print(f" 众数: {metrics.mode}")
文本统计 #
python
# Request the most frequent values of the text property "title".
# Metrics.text() selects the occurrence value and count separately and caps
# the number of entries with `limit`.
response = articles.aggregate.over_all(
    return_metrics=wvc.aggregate.Metrics("title").text(
        top_occurrences_count=True,
        top_occurrences_value=True,
        limit=5,
    )
)
print("标题高频词:")
# The property result exposes top_occurrences directly (no `.text` accessor),
# and each entry carries `value` and `count`.
for occurrence in response.properties["title"].top_occurrences:
    print(f"- {occurrence.value}: {occurrence.count}")
过滤聚合 #
python
# Count only the objects that match the category filter.
category_filter = Filter.by_property("category").equal("技术")
response = articles.aggregate.over_all(
    total_count=True,
    filters=category_filter,
)
print(f"技术类文章数量: {response.total_count}")
分组聚合 #
python
# Per-category mean and sum of the integer property "views".
response = articles.aggregate.over_all(
    group_by=GroupByAggregate(prop="category"),
    return_metrics=wvc.aggregate.Metrics("views").integer(mean=True, sum_=True),
)
print("各类别阅读统计:")
for group in response.groups:
    category = group.grouped_by.value
    # The per-group property result is the integer aggregation itself;
    # it has no nested `.integer` attribute.
    metrics = group.properties["views"]
    print(f"- {category}: 平均 {metrics.mean:.0f} 次,总计 {metrics.sum_} 次")
GraphQL 查询 #
基本 GraphQL 查询 #
python
# Raw GraphQL Get query: fetch three properties of up to five Article objects.
article_query = """
{
Get {
Article(limit: 5) {
title
content
views
}
}
}
"""
response = client.graphql_raw_query(article_query)
for article in response.get["Article"]:
    print(f"- {article['title']}")
向量搜索 GraphQL #
python
# Raw GraphQL nearText query that also asks for distance and certainty
# through the _additional field.
near_text_query = """
{
Get {
Article(
nearText: {query: "向量数据库"}
limit: 5
) {
title
content
_additional {
distance
certainty
}
}
}
}
"""
response = client.graphql_raw_query(near_text_query)
过滤 GraphQL #
python
# Raw GraphQL query with a `where` filter combining two operands with And.
filtered_query = """
{
Get {
Article(
where: {
operator: And,
operands: [
{path: ["category"], operator: Equal, valueText: "技术"},
{path: ["views"], operator: GreaterThan, valueInt: 1000}
]
}
limit: 10
) {
title
views
}
}
}
"""
response = client.graphql_raw_query(filtered_query)
聚合 GraphQL #
python
# Grouped Aggregate query. Grouping is requested with the `groupBy` argument
# on the class, and each resulting group is described by the `groupedBy`
# field; there is no `grouping { groupBy { ... } }` field.
response = client.graphql_raw_query("""
{
  Aggregate {
    Article(groupBy: ["category"]) {
      groupedBy {
        path
        value
      }
      meta {
        count
      }
    }
  }
}
""")
搜索最佳实践 #
搜索策略选择 #
text
搜索策略选择指南:
语义搜索场景:
├── 用户查询意图不明确
├── 需要理解同义词
├── 跨语言搜索
└── 使用 nearText 或 nearVector
关键词搜索场景:
├── 精确匹配需求
├── 专业术语搜索
├── 代码或标识符搜索
└── 使用 bm25
混合搜索场景:
├── 需要平衡语义和关键词
├── 综合搜索质量要求高
└── 使用 hybrid
过滤搜索场景:
├── 精确条件筛选
├── 分类或标签过滤
└── 使用 filters
性能优化 #
text
搜索性能优化:
1. 合理设置 limit
└── 避免返回过多结果
2. 使用过滤预筛选
└── 减少向量计算量
3. 配置索引
└── 为过滤字段创建索引
4. 调整 HNSW 参数
└── 平衡召回率和速度
5. 使用量化
└── 减少内存占用
小结 #
本章介绍了 Weaviate 的搜索查询功能:
- 向量搜索(nearText、nearVector、nearObject)
- 关键词搜索(BM25)
- 混合搜索
- 过滤查询
- 地理搜索
- 排序和分页
- 聚合查询
- GraphQL 查询
下一步 #
继续学习 对象管理,了解数据的增删改查操作!
最后更新:2026-04-04