向量嵌入 #
什么是向量嵌入? #
向量嵌入是将文本、图像等数据转换为高维向量表示的过程。这些向量捕获了数据的语义信息,使得语义相似的内容在向量空间中距离更近。
text
┌─────────────────────────────────────────────────────────────┐
│ 向量嵌入原理 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 文本 → 嵌入模型 → 向量 │
│ │
│ "苹果" → [0.1, 0.5, -0.3, 0.8, ...] │
│ "iPhone" → [0.12, 0.48, -0.28, 0.82, ...] │
│ "香蕉" → [0.8, 0.2, 0.1, -0.5, ...] │
│ │
│ 相似度计算: │
│ "苹果" vs "iPhone" → 高相似度 (0.92) │
│ "苹果" vs "香蕉" → 低相似度 (0.35) │
│ │
└─────────────────────────────────────────────────────────────┘
内置嵌入函数 #
Sentence Transformers #
python
import chromadb
from chromadb.utils import embedding_functions

# Local Sentence-Transformers embedding function -- runs on-device, no API key.
embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)

client = chromadb.Client()
collection = client.create_collection(
    name="sentence_transformer_docs",
    embedding_function=embedding_function
)

# Documents are embedded automatically by the collection's embedding function.
collection.add(
    documents=["这是测试文档", "另一个测试文档"],
    ids=["doc1", "doc2"]
)

# Fixed: the first print used an f-string with no placeholders (ruff F541).
print("使用模型: all-MiniLM-L6-v2")
print(f"向量维度: {len(collection.get(ids=['doc1'], include=['embeddings'])['embeddings'][0])}")
常用 Sentence Transformers 模型 #
text
┌─────────────────────────────────────────────────────────────┐
│ Sentence Transformers 模型对比 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 模型名称 维度 速度 质量 大小 │
│ ───────────────────────────────────────────────────── │
│ all-MiniLM-L6-v2 384 快 中 80MB │
│ all-mpnet-base-v2 768 中 高 420MB │
│ paraphrase-MiniLM 384 快 中 80MB │
│ multi-qa-MiniLM 384 快 中 80MB │
│ distiluse-base 512 中 中 250MB │
│ │
│ 推荐: │
│ - 快速原型: all-MiniLM-L6-v2 │
│ - 高质量: all-mpnet-base-v2 │
│ - 多语言: paraphrase-multilingual-MiniLM │
│ │
└─────────────────────────────────────────────────────────────┘
OpenAI Embeddings #
python
import chromadb
from chromadb.utils import embedding_functions
import os

# Hosted OpenAI embeddings; the API key is read from the environment.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=os.getenv("OPENAI_API_KEY"),
    model_name="text-embedding-ada-002",
)

client = chromadb.Client()

collection = client.create_collection(
    embedding_function=openai_ef,
    name="openai_docs",
)

# Each add() call sends the documents to the OpenAI API for embedding.
collection.add(
    ids=["doc1", "doc2"],
    documents=["OpenAI 嵌入测试", "另一个测试"],
)
OpenAI 模型对比 #
text
┌─────────────────────────────────────────────────────────────┐
│ OpenAI 嵌入模型对比 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 模型名称 维度 价格/1K tokens │
│ ───────────────────────────────────────────────────── │
│ text-embedding-ada-002 1536 $0.0001 │
│ text-embedding-3-small 1536 $0.00002 │
│ text-embedding-3-large 3072 $0.00013 │
│ │
│ 特点: │
│ - 高质量英文嵌入 │
│ - 支持中文,但效果不如专用模型 │
│ - 需要 API Key │
│ - 有使用成本 │
│ │
└─────────────────────────────────────────────────────────────┘
Cohere Embeddings #
python
import chromadb
from chromadb.utils import embedding_functions
import os

# Cohere-hosted embeddings; requires COHERE_API_KEY in the environment.
cohere_ef = embedding_functions.CohereEmbeddingFunction(
    api_key=os.getenv("COHERE_API_KEY"),
    model_name="embed-english-v3.0",
)

client = chromadb.Client()

collection = client.create_collection(
    embedding_function=cohere_ef,
    name="cohere_docs",
)
Google PaLM Embeddings #
python
import chromadb
from chromadb.utils import embedding_functions
import os

# Google PaLM embeddings; requires GOOGLE_API_KEY in the environment.
google_ef = embedding_functions.GooglePalmEmbeddingFunction(
    api_key=os.getenv("GOOGLE_API_KEY"),
)

client = chromadb.Client()

collection = client.create_collection(
    embedding_function=google_ef,
    name="google_docs",
)
Hugging Face Embeddings #
python
import os

import chromadb
from chromadb.utils import embedding_functions

# Hugging Face Inference API embeddings (remote, needs an API token).
# Fixed: the original snippet called os.getenv without importing os (NameError).
hf_ef = embedding_functions.HuggingFaceEmbeddingFunction(
    api_key=os.getenv("HUGGINGFACE_API_KEY"),
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
client = chromadb.Client()
collection = client.create_collection(
    name="hf_docs",
    embedding_function=hf_ef
)
自定义嵌入函数 #
基本自定义 #
python
import hashlib

import chromadb
from chromadb.api.types import EmbeddingFunction, Documents, Embeddings


class CustomEmbeddingFunction(EmbeddingFunction):
    """Toy embedding function: a deterministic 384-dim vector from an MD5 hash.

    Useful only for demos/tests -- the vectors carry no semantic information.
    """

    def __call__(self, input: Documents) -> Embeddings:
        return [self._compute_embedding(text) for text in input]

    def _compute_embedding(self, text: str) -> list:
        # 16 MD5 bytes scaled into [0, 1], then tiled up to 384 dimensions.
        digest = hashlib.md5(text.encode()).digest()
        embedding = [float(b) / 255.0 for b in digest]
        while len(embedding) < 384:
            embedding = embedding + embedding
        return embedding[:384]


# Fixed: the original snippet called chromadb.Client() without importing chromadb.
custom_ef = CustomEmbeddingFunction()
client = chromadb.Client()
collection = client.create_collection(
    name="custom_docs",
    embedding_function=custom_ef
)
使用本地模型 #
python
import chromadb
import torch
from chromadb.api.types import EmbeddingFunction, Documents, Embeddings
from transformers import AutoTokenizer, AutoModel


class LocalModelEmbeddingFunction(EmbeddingFunction):
    """Embed documents with a locally loaded Hugging Face transformer.

    Pools by averaging last_hidden_state over the sequence axis. NOTE: the
    average includes padding positions; since texts are tokenized one at a
    time here, no padding is actually added.
    """

    def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.model.eval()  # inference mode: disables dropout etc.

    def __call__(self, input: Documents) -> Embeddings:
        embeddings = []
        with torch.no_grad():  # no gradients needed for inference
            for text in input:
                inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
                outputs = self.model(**inputs)
                # Mean-pool token embeddings into one vector per document.
                embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().tolist())
        return embeddings


# Fixed: the original snippet called chromadb.Client() without importing chromadb.
local_ef = LocalModelEmbeddingFunction()
client = chromadb.Client()
collection = client.create_collection(
    name="local_model_docs",
    embedding_function=local_ef
)
带缓存的嵌入函数 #
python
import hashlib

from chromadb.api.types import EmbeddingFunction, Documents, Embeddings
from chromadb.utils import embedding_functions


class CachedEmbeddingFunction(EmbeddingFunction):
    """Wrap another embedding function with an in-memory cache.

    Cache keys are MD5 hexdigests of the document text, so identical texts
    are embedded only once per process. The cache is unbounded.
    """

    def __init__(self, base_embedding_function: EmbeddingFunction):
        self.base_ef = base_embedding_function
        self.cache = {}  # md5(text) -> embedding; grows without bound

    def __call__(self, input: Documents) -> Embeddings:
        embeddings = []
        uncached_texts = []
        uncached_indices = []
        for i, text in enumerate(input):
            cache_key = hashlib.md5(text.encode()).hexdigest()
            if cache_key in self.cache:
                embeddings.append(self.cache[cache_key])
            else:
                embeddings.append(None)  # placeholder, filled in below
                uncached_texts.append(text)
                uncached_indices.append((i, cache_key))
        if uncached_texts:
            # One batched call for all cache misses, preserving positions.
            new_embeddings = self.base_ef(uncached_texts)
            for (i, cache_key), embedding in zip(uncached_indices, new_embeddings):
                self.cache[cache_key] = embedding
                embeddings[i] = embedding
        return embeddings


# Fixed: the original snippet used embedding_functions without importing it.
base_ef = embedding_functions.SentenceTransformerEmbeddingFunction()
cached_ef = CachedEmbeddingFunction(base_ef)
多模态嵌入 #
图像嵌入 #
python
import chromadb
import torch
from chromadb.api.types import EmbeddingFunction, Documents, Embeddings
from PIL import Image
from transformers import CLIPProcessor, CLIPModel


class ImageEmbeddingFunction(EmbeddingFunction):
    """Embed images with CLIP; `input` is a list of image file paths."""

    def __init__(self):
        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    def __call__(self, input: Documents) -> Embeddings:
        embeddings = []
        for image_path in input:
            image = Image.open(image_path)
            inputs = self.processor(images=image, return_tensors="pt")
            with torch.no_grad():  # inference only
                embedding = self.model.get_image_features(**inputs)
            embeddings.append(embedding.squeeze().tolist())
        return embeddings


# Fixed: the original snippet called chromadb.Client() without importing chromadb.
image_ef = ImageEmbeddingFunction()
client = chromadb.Client()
image_collection = client.create_collection(
    name="images",
    embedding_function=image_ef
)
文本-图像联合嵌入 #
python
from chromadb.api.types import EmbeddingFunction, Documents, Embeddings
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch


class CLIPEmbeddingFunction(EmbeddingFunction):
    """Joint text/image embeddings in CLIP's shared vector space.

    For modality="text", `input` holds raw strings; for modality="image",
    it holds image file paths. Raises ValueError for any other modality.
    """

    def __init__(self):
        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    def __call__(self, input: Documents, modality: str = "text") -> Embeddings:
        embeddings = []
        if modality == "text":
            for text in input:
                inputs = self.processor(text=[text], return_tensors="pt", padding=True)
                with torch.no_grad():
                    embedding = self.model.get_text_features(**inputs)
                embeddings.append(embedding.squeeze().tolist())
        elif modality == "image":
            for image_path in input:
                image = Image.open(image_path)
                inputs = self.processor(images=image, return_tensors="pt")
                with torch.no_grad():
                    embedding = self.model.get_image_features(**inputs)
                embeddings.append(embedding.squeeze().tolist())
        else:
            # Fixed: the original silently returned [] for unknown modalities,
            # hiding caller typos. Fail loudly instead.
            raise ValueError(f"未知的 modality: {modality},应为 'text' 或 'image'")
        return embeddings


clip_ef = CLIPEmbeddingFunction()
text_embeddings = clip_ef(["一只猫", "一只狗"], modality="text")
嵌入管理 #
预计算嵌入 #
python
import chromadb
from chromadb.utils import embedding_functions

# Compute embeddings up front, then hand them to Chroma verbatim --
# the collection stores them as-is instead of re-embedding.
ef = embedding_functions.SentenceTransformerEmbeddingFunction()

documents = ["文档1", "文档2", "文档3"]
embeddings = ef(documents)

client = chromadb.Client()
collection = client.create_collection(name="precomputed")

collection.add(
    embeddings=embeddings,
    documents=documents,
    ids=["doc1", "doc2", "doc3"],
)
嵌入维度管理 #
python
def check_embedding_dimension(collection, expected_dim: int):
    """Verify that embeddings stored in `collection` have `expected_dim` dims.

    Fetches a single record; prints a confirmation on success, prints a notice
    when the collection is empty, and raises ValueError on a dimension mismatch.
    """
    results = collection.get(limit=1, include=["embeddings"])
    embeddings = results['embeddings']
    # Explicit None/length check: newer Chroma versions return a NumPy array
    # here, whose truth value is ambiguous under a bare `if`.
    if embeddings is not None and len(embeddings) > 0:
        actual_dim = len(embeddings[0])
        if actual_dim != expected_dim:
            raise ValueError(f"嵌入维度不匹配: 期望 {expected_dim}, 实际 {actual_dim}")
        print(f"嵌入维度正确: {actual_dim}")
    else:
        print("集合为空,无法检查嵌入维度")
嵌入质量检查 #
python
import numpy as np

def check_embedding_quality(embeddings):
    """Print basic diagnostics for a batch of embedding vectors.

    Reports the range of vector norms, the spread of distances to the batch
    centroid, and pairwise cosine similarity over a sample of up to 100
    vectors (self-similarities on the diagonal are zeroed out).
    """
    embeddings_array = np.array(embeddings)
    norms = np.linalg.norm(embeddings_array, axis=1)
    print(f"向量范数范围: [{norms.min():.3f}, {norms.max():.3f}]")
    mean_embedding = embeddings_array.mean(axis=0)
    distances = np.linalg.norm(embeddings_array - mean_embedding, axis=1)
    print(f"到中心距离范围: [{distances.min():.3f}, {distances.max():.3f}]")
    if embeddings_array.shape[0] > 1:
        sample_size = min(100, embeddings_array.shape[0])
        sample = embeddings_array[:sample_size]
        # Fixed: normalize rows before the dot product so this reports true
        # cosine similarity; raw dot products scale with vector norms.
        unit = sample / np.linalg.norm(sample, axis=1, keepdims=True)
        similarity_matrix = np.dot(unit, unit.T)
        np.fill_diagonal(similarity_matrix, 0)
        print(f"样本间相似度范围: [{similarity_matrix.min():.3f}, {similarity_matrix.max():.3f}]")
嵌入函数选择指南 #
text
┌─────────────────────────────────────────────────────────────┐
│ 嵌入函数选择 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 快速原型 / 本地开发 │
│ → Sentence Transformers (all-MiniLM-L6-v2) │
│ - 免费 │
│ - 快速 │
│ - 无需 API Key │
│ │
│ 高质量英文任务 │
│ → OpenAI text-embedding-3-small │
│ - 高质量 │
│ - 低成本 │
│ - 易于使用 │
│ │
│ 中文任务 │
│ → Sentence Transformers (paraphrase-multilingual) │
│ - 多语言支持 │
│ - 免费 │
│ - 本地运行 │
│ │
│ 多模态任务 │
│ → CLIP │
│ - 文本和图像 │
│ - 跨模态搜索 │
│ - 开源 │
│ │
│ 企业级应用 │
│ → OpenAI / Cohere │
│ - 高可用性 │
│ - 技术支持 │
│ - SLA 保证 │
│ │
└─────────────────────────────────────────────────────────────┘
下一步 #
现在你已经掌握了向量嵌入,接下来学习 查询与检索,了解如何高效地搜索和检索数据!
最后更新:2026-04-04