OpenAI 文本嵌入 #
什么是文本嵌入? #
文本嵌入(Text Embeddings)是将文本转换为高维向量的技术。这些向量能够捕捉文本的语义信息,使语义相似的文本在向量空间中距离更近。
text
┌─────────────────────────────────────────────────────────────┐
│ 文本嵌入原理 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 文本 向量 │
│ ───────────────────────────────────────────────────── │
│ "猫是一种宠物" ───> [0.12, -0.34, 0.56, ...] │
│ "狗是常见的宠物" ───> [0.15, -0.30, 0.52, ...] │
│ "汽车是交通工具" ───> [-0.45, 0.78, -0.23, ...] │
│ │
│ 向量空间中: │
│ - "猫"和"狗"的向量距离近(语义相似) │
│ - "宠物"和"汽车"的向量距离远(语义不同) │
│ │
└─────────────────────────────────────────────────────────────┘
嵌入的应用场景 #
text
┌─────────────────────────────────────────────────────────────┐
│ 嵌入应用场景 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 🔍 语义搜索 │
│ 理解用户意图,返回语义相关的结果 │
│ │
│ 📊 文本聚类 │
│ 将相似文本分组,发现主题模式 │
│ │
│ 🎯 推荐系统 │
│ 基于内容相似度推荐相关内容 │
│ │
│ 🏷️ 分类任务 │
│ 文本分类、情感分析 │
│ │
│ 🔗 相似度计算 │
│ 检测重复内容、抄袭检测 │
│ │
│ 💬 问答系统 │
│ 匹配问题和答案 │
│ │
└─────────────────────────────────────────────────────────────┘
基本用法 #
生成嵌入向量 #
python
# Generate an embedding for a single string.
# Requires OPENAI_API_KEY in the environment; performs a network call.
from openai import OpenAI

client = OpenAI()

response = client.embeddings.create(
    model="text-embedding-3-small",
    input="你好,世界!"
)

# response.data holds one entry per input; each entry carries the vector.
embedding = response.data[0].embedding
print(f"向量维度: {len(embedding)}")
print(f"前10个值: {embedding[:10]}")
Node.js 示例 #
javascript
// Fetch the embedding vector for `text`.
// Requires OPENAI_API_KEY in the environment; performs a network call.
import OpenAI from 'openai';

const client = new OpenAI();

async function getEmbedding(text) {
  const response = await client.embeddings.create({
    model: 'text-embedding-3-small',
    input: text
  });
  // One result per input; return its embedding array.
  return response.data[0].embedding;
}

const embedding = await getEmbedding('你好,世界!');
console.log(`向量维度: ${embedding.length}`);
cURL 示例 #
bash
# Request an embedding directly via the REST API (uses $OPENAI_API_KEY).
curl https://api.openai.com/v1/embeddings \
  -H "Authorization: Bearer $OPENAI_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "input": "你好,世界!",
    "model": "text-embedding-3-small"
  }'
模型选择 #
可用模型 #
text
┌─────────────────────────────────────────────────────────────┐
│ 嵌入模型对比 │
├─────────────────────────────────────────────────────────────┤
│ │
│ text-embedding-3-large(推荐) │
│ ───────────────────────────────────────────────────────── │
│ 维度: 3072 │
│ 性能: 最高 │
│ 价格: $0.13 / 1M tokens │
│ 适用: 需要最高精度的场景 │
│ │
│ text-embedding-3-small │
│ ───────────────────────────────────────────────────────── │
│ 维度: 1536 │
│ 性能: 良好 │
│ 价格: $0.02 / 1M tokens │
│ 适用: 平衡性能和成本 │
│ │
│ text-embedding-ada-002(旧版) │
│ ───────────────────────────────────────────────────────── │
│ 维度: 1536 │
│ 性能: 一般 │
│ 价格: $0.10 / 1M tokens │
│ 适用: 兼容旧项目 │
│ │
└─────────────────────────────────────────────────────────────┘
模型选择建议 #
python
def get_model_for_use_case(use_case: str) -> str:
    """Map a use-case label to an embedding model name.

    Recognized labels: "high_accuracy", "balanced", "legacy".
    Any other label falls back to "text-embedding-3-small".
    """
    if use_case == "high_accuracy":
        return "text-embedding-3-large"
    if use_case == "legacy":
        return "text-embedding-ada-002"
    # "balanced" and unrecognized labels both resolve to the small model.
    return "text-embedding-3-small"
参数详解 #
input(输入文本) #
text
┌─────────────────────────────────────────────────────────────┐
│ input 参数 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 类型: string 或 array │
│ 最大: 2048 个输入 / 批量请求 │
│ │
│ 单个文本: │
│ input: "这是一段文本" │
│ │
│ 多个文本: │
│ input: ["文本1", "文本2", "文本3"] │
│ │
│ Token 限制: │
│ - text-embedding-3-small: 8191 tokens │
│ - text-embedding-3-large: 8191 tokens │
│ │
└─────────────────────────────────────────────────────────────┘
encoding_format(编码格式) #
text
┌─────────────────────────────────────────────────────────────┐
│ encoding_format 参数 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 可选值: │
│ ───────────────────────────────────────────────────────── │
│ float 浮点数数组(默认) │
│ base64 Base64 编码(更高效传输) │
│ │
└─────────────────────────────────────────────────────────────┘
python
# Explicitly request float output (the default); "base64" is more compact
# on the wire when transferring many vectors.
response = client.embeddings.create(
    model="text-embedding-3-small",
    input="你好",
    encoding_format="float"
)
dimensions(维度缩减) #
text
┌─────────────────────────────────────────────────────────────┐
│ dimensions 参数 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 仅 text-embedding-3-* 模型支持 │
│ ───────────────────────────────────────────────────────── │
│ 可以减少向量维度以节省存储空间 │
│ 最低可减至 256 维 │
│ │
│ 示例: │
│ dimensions: 512 │
│ dimensions: 256 │
│ │
└─────────────────────────────────────────────────────────────┘
python
# Shrink the returned vector to 512 dimensions to save storage.
# Only text-embedding-3-* models support the dimensions parameter.
response = client.embeddings.create(
    model="text-embedding-3-large",
    input="你好",
    dimensions=512
)
print(f"向量维度: {len(response.data[0].embedding)}")
相似度计算 #
余弦相似度 #
python
import numpy as np
from openai import OpenAI

# Shared client for the similarity examples below
# (reads OPENAI_API_KEY from the environment).
client = OpenAI()
def cosine_similarity(vec1: list, vec2: list) -> float:
    """Return the cosine similarity of two vectors (1.0 = same direction).

    A zero vector has no direction, so its similarity to anything is
    defined here as 0.0 — the original divided by zero in that case,
    producing NaN (with a runtime warning) instead of a usable score.
    """
    v1 = np.asarray(vec1, dtype=float)
    v2 = np.asarray(vec2, dtype=float)
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if norm_product == 0.0:
        return 0.0
    # Return a plain Python float rather than np.float64.
    return float(np.dot(v1, v2) / norm_product)
def get_embedding(text: str) -> list:
    """Return the embedding vector for *text* (one API call)."""
    reply = client.embeddings.create(
        model="text-embedding-3-small",
        input=text
    )
    return reply.data[0].embedding
# Demo: semantically related sentences (cat/dog, both pets) should score
# higher than unrelated ones (cat/car).
text1 = "猫是一种可爱的宠物"
text2 = "狗是人类的好朋友"
text3 = "汽车是一种交通工具"

emb1 = get_embedding(text1)
emb2 = get_embedding(text2)
emb3 = get_embedding(text3)

print(f"猫 vs 狗: {cosine_similarity(emb1, emb2):.4f}")
print(f"猫 vs 汽车: {cosine_similarity(emb1, emb3):.4f}")
print(f"狗 vs 汽车: {cosine_similarity(emb2, emb3):.4f}")
其他相似度度量 #
python
def euclidean_distance(vec1: list, vec2: list) -> float:
    """Euclidean (L2) distance between two vectors; 0.0 means identical."""
    diff = np.array(vec1) - np.array(vec2)
    return np.linalg.norm(diff)
def manhattan_distance(vec1: list, vec2: list) -> float:
    """Manhattan (L1) distance: sum of absolute coordinate differences."""
    delta = np.array(vec1) - np.array(vec2)
    return np.abs(delta).sum()
def dot_product(vec1: list, vec2: list) -> float:
    """Dot product of two vectors (unnormalized similarity score)."""
    a, b = np.array(vec1), np.array(vec2)
    return np.dot(a, b)
语义搜索 #
基本实现 #
python
from openai import OpenAI
import numpy as np

# Module-level client shared by the SemanticSearch class below.
client = OpenAI()
class SemanticSearch:
    """A tiny in-memory semantic search index.

    Document texts and their embedding vectors live in two parallel
    lists; search ranks documents by cosine similarity to the embedded
    query. Each added document costs one embeddings API call.
    """

    def __init__(self):
        self.documents = []   # raw document texts
        self.embeddings = []  # one embedding vector per document

    def add_document(self, text: str):
        """Embed *text* and append it to the index."""
        self.documents.append(text)
        self.embeddings.append(self._get_embedding(text))

    def _get_embedding(self, text: str) -> list:
        """Request the embedding vector for *text* from the API."""
        reply = client.embeddings.create(
            model="text-embedding-3-small",
            input=text
        )
        return reply.data[0].embedding

    def search(self, query: str, top_k: int = 3) -> list:
        """Return the top_k matches as (index, similarity, text) tuples."""
        query_vec = self._get_embedding(query)
        scored = [
            (idx, self._cosine_similarity(query_vec, doc_vec), self.documents[idx])
            for idx, doc_vec in enumerate(self.embeddings)
        ]
        # Highest similarity first.
        scored.sort(key=lambda entry: entry[1], reverse=True)
        return scored[:top_k]

    def _cosine_similarity(self, vec1: list, vec2: list) -> float:
        a = np.array(vec1)
        b = np.array(vec2)
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
# Demo: index five short documents, then run one semantic query
# (one embeddings API call per document, plus one for the query).
search_engine = SemanticSearch()

documents = [
    "Python 是一种流行的编程语言",
    "机器学习是人工智能的一个分支",
    "深度学习使用神经网络进行学习",
    "JavaScript 主要用于网页开发",
    "自然语言处理让计算机理解人类语言"
]

for doc in documents:
    search_engine.add_document(doc)

results = search_engine.search("编程语言有哪些")
for i, sim, doc in results:
    print(f"相似度: {sim:.4f} - {doc}")
批量处理 #
python
def batch_embed(texts: list, batch_size: int = 100) -> list:
    """Embed *texts* in API batches of *batch_size* each.

    Returns one embedding per input text, preserving input order.
    """
    results = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        reply = client.embeddings.create(
            model="text-embedding-3-small",
            input=chunk
        )
        # reply.data preserves input order within the batch.
        results.extend(entry.embedding for entry in reply.data)
    return results
# Demo: five short texts embedded in a single batched request.
texts = ["文本1", "文本2", "文本3", "文本4", "文本5"]
embeddings = batch_embed(texts)
print(f"生成了 {len(embeddings)} 个嵌入向量")
文本聚类 #
K-Means 聚类 #
python
from openai import OpenAI
import numpy as np
from sklearn.cluster import KMeans

# Shared client for the clustering example below.
client = OpenAI()
def cluster_texts(texts: list, n_clusters: int = 3):
    """Group *texts* into *n_clusters* clusters via K-Means on embeddings.

    Returns a dict mapping cluster id -> list of member texts.
    """
    # One batched embeddings request covers all texts.
    reply = client.embeddings.create(
        model="text-embedding-3-small",
        input=texts
    )
    matrix = np.array([entry.embedding for entry in reply.data])

    # Fixed random_state keeps cluster assignments reproducible.
    model = KMeans(n_clusters=n_clusters, random_state=42)
    assignments = model.fit_predict(matrix)

    grouped = {cluster: [] for cluster in range(n_clusters)}
    for text, cluster in zip(texts, assignments):
        grouped[cluster].append(text)
    return grouped
# Demo: cluster eight article titles into three groups.
texts = [
    "Python 编程入门",
    "JavaScript 基础教程",
    "机器学习算法介绍",
    "深度学习神经网络",
    "Java 开发指南",
    "自然语言处理技术",
    "Go 语言实战",
    "计算机视觉应用"
]

clusters = cluster_texts(texts, n_clusters=3)

# Loop variable renamed to `members`: the original used `cluster_texts`,
# which shadowed the cluster_texts() function defined above.
for cluster_id, members in clusters.items():
    print(f"\n聚类 {cluster_id}:")
    for text in members:
        print(f" - {text}")
向量数据库集成 #
使用 Pinecone #
python
from openai import OpenAI
from pinecone import Pinecone

openai_client = OpenAI()
# The Pinecone API key and index name are placeholders; replace them
# with real values before running.
pinecone_client = Pinecone(api_key="your-pinecone-key")
index = pinecone_client.Index("your-index-name")
def upsert_document(id: str, text: str, metadata: dict = None):
    """Embed *text* and upsert it into the Pinecone index under *id*.

    NOTE(review): the parameter name `id` shadows the builtin; renaming
    it would break keyword callers, so it is only flagged here.
    """
    reply = openai_client.embeddings.create(
        model="text-embedding-3-small",
        input=text
    )
    vector = reply.data[0].embedding
    # Pinecone accepts (id, values, metadata) tuples.
    index.upsert([(id, vector, metadata or {})])
def search_similar(query: str, top_k: int = 5):
    """Return the top_k nearest vectors to *query* from the Pinecone index."""
    reply = openai_client.embeddings.create(
        model="text-embedding-3-small",
        input=query
    )
    return index.query(
        vector=reply.data[0].embedding,
        top_k=top_k,
        include_metadata=True
    )
使用 Chroma #
python
import chromadb
from openai import OpenAI

openai_client = OpenAI()
# In-memory Chroma instance; data is lost when the process exits.
chroma_client = chromadb.Client()
collection = chroma_client.create_collection("documents")
def add_documents(texts: list, ids: list):
    """Embed *texts* in one batched request and store them in the collection.

    *texts* and *ids* must be parallel lists of equal length.
    """
    reply = openai_client.embeddings.create(
        model="text-embedding-3-small",
        input=texts
    )
    vectors = [entry.embedding for entry in reply.data]
    collection.add(embeddings=vectors, documents=texts, ids=ids)
def query_similar(query: str, n_results: int = 5):
    """Return the n_results documents nearest to *query* from the collection."""
    reply = openai_client.embeddings.create(
        model="text-embedding-3-small",
        input=query
    )
    return collection.query(
        query_embeddings=[reply.data[0].embedding],
        n_results=n_results
    )
实际应用示例 #
文档问答系统 #
python
from openai import OpenAI
import numpy as np

# Shared client for the DocumentQA example below.
client = OpenAI()
class DocumentQA:
    """Minimal retrieval-augmented Q&A.

    Documents are embedded once; each question is embedded, the most
    similar documents are retrieved by cosine similarity, and a chat
    model answers using only that retrieved context.
    """

    def __init__(self):
        self.documents = []   # source document texts
        self.embeddings = []  # one embedding vector per document

    def load_documents(self, documents: list):
        """Embed all *documents* in a single batched API request."""
        self.documents = documents
        reply = client.embeddings.create(
            model="text-embedding-3-small",
            input=documents
        )
        self.embeddings = [entry.embedding for entry in reply.data]

    def find_relevant(self, query: str, top_k: int = 3) -> list:
        """Return the top_k document texts most similar to *query*."""
        reply = client.embeddings.create(
            model="text-embedding-3-small",
            input=query
        )
        query_vec = np.array(reply.data[0].embedding)
        query_norm = np.linalg.norm(query_vec)
        scored = []
        for idx, doc_vec in enumerate(self.embeddings):
            doc_vec = np.array(doc_vec)
            # Cosine similarity between query and document vectors.
            sim = np.dot(query_vec, doc_vec) / (query_norm * np.linalg.norm(doc_vec))
            scored.append((idx, sim))
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return [self.documents[idx] for idx, _ in scored[:top_k]]

    def answer(self, question: str) -> str:
        """Answer *question* grounded on the most relevant documents."""
        context = "\n\n".join(self.find_relevant(question))
        reply = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "根据以下文档回答问题。如果文档中没有相关信息,请说明。"
                },
                {
                    "role": "user",
                    "content": f"文档:\n{context}\n\n问题:{question}"
                }
            ]
        )
        return reply.choices[0].message.content
# Demo: four policy snippets answer customer-service questions.
qa = DocumentQA()

documents = [
    "公司的退货政策是购买后30天内可以退货。",
    "客服电话是400-123-4567,工作时间为周一至周五9:00-18:00。",
    "产品保修期为一年,从购买日期开始计算。",
    "支持支付宝、微信支付和银行卡支付。"
]
qa.load_documents(documents)

print(qa.answer("如何退货?"))
print(qa.answer("客服电话是多少?"))
推荐系统 #
python
class ContentRecommender:
    """Content-based recommender.

    Stored items are ranked for a query by cosine similarity between
    the query embedding and each item's content embedding.
    """

    def __init__(self):
        self.items = []       # dicts: id, content, embedding, metadata
        self.embeddings = []  # kept for interface compatibility; unused here

    def add_item(self, item_id: str, content: str, metadata: dict = None):
        """Embed *content* and register it under *item_id* (one API call)."""
        reply = client.embeddings.create(
            model="text-embedding-3-small",
            input=content
        )
        record = {
            "id": item_id,
            "content": content,
            "embedding": reply.data[0].embedding,
            "metadata": metadata or {}
        }
        self.items.append(record)

    def recommend(self, query: str, top_k: int = 5) -> list:
        """Return the top_k (item_id, metadata, similarity) tuples for *query*."""
        reply = client.embeddings.create(
            model="text-embedding-3-small",
            input=query
        )
        query_vec = reply.data[0].embedding
        query_norm = np.linalg.norm(query_vec)
        ranked = []
        for item in self.items:
            # Cosine similarity against each stored item's embedding.
            sim = np.dot(query_vec, item["embedding"]) / (
                query_norm * np.linalg.norm(item["embedding"])
            )
            ranked.append((item, sim))
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return [(entry["id"], entry["metadata"], sim) for entry, sim in ranked[:top_k]]
# Demo: index five articles, then recommend content for a learning query.
recommender = ContentRecommender()

articles = [
    ("article-1", "Python 编程入门教程", {"category": "编程"}),
    ("article-2", "机器学习算法详解", {"category": "AI"}),
    ("article-3", "JavaScript 前端开发", {"category": "编程"}),
    ("article-4", "深度学习实战指南", {"category": "AI"}),
    ("article-5", "数据结构与算法", {"category": "编程"}),
]

for article_id, content, metadata in articles:
    recommender.add_item(article_id, content, metadata)

recommendations = recommender.recommend("我想学习编程")
for item_id, metadata, score in recommendations:
    print(f"推荐: {item_id} (相似度: {score:.4f})")
最佳实践 #
1. 文本预处理 #
python
def preprocess_text(text: str) -> str:
    """Normalize text: trim, collapse all internal whitespace, lowercase."""
    # str.split() with no argument drops leading/trailing whitespace too,
    # so a separate strip() is unnecessary.
    collapsed = " ".join(text.split())
    return collapsed.lower()
def chunk_text(text: str, max_tokens: int = 8000) -> list:
    """Split *text* into chunks whose estimated token count fits *max_tokens*.

    Token counts are approximated as len(word) // 4 + 1 per word; use a
    real tokenizer (e.g. tiktoken) when exact limits matter. A single
    word whose estimate exceeds max_tokens becomes its own chunk.
    """
    chunks = []
    current_chunk = []
    current_length = 0
    for word in text.split():
        word_length = len(word) // 4 + 1  # rough ~4 chars/token estimate
        if current_chunk and current_length + word_length > max_tokens:
            # Flush only non-empty chunks: the original appended an empty
            # string whenever the first word of a chunk exceeded max_tokens.
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = word_length
        else:
            current_chunk.append(word)
            current_length += word_length
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
2. 缓存嵌入 #
python
import hashlib
import json
class CachedEmbeddings:
    """Embedding fetcher with an in-process memo cache keyed by (model, text)."""

    def __init__(self):
        self.client = OpenAI()
        self.cache = {}  # md5 digest -> embedding vector

    def _get_cache_key(self, text: str, model: str) -> str:
        """Stable cache key: md5 hex digest of "model:text"."""
        payload = f"{model}:{text}".encode()
        return hashlib.md5(payload).hexdigest()

    def get_embedding(self, text: str, model: str = "text-embedding-3-small") -> list:
        """Return the embedding for *text*, calling the API only on a cache miss."""
        key = self._get_cache_key(text, model)
        if key in self.cache:
            return self.cache[key]
        reply = self.client.embeddings.create(model=model, input=text)
        vector = reply.data[0].embedding
        self.cache[key] = vector
        return vector
3. 错误处理 #
python
from openai import OpenAI, APIError, RateLimitError
import time
def safe_get_embedding(text: str, retries: int = 3) -> list:
    """Fetch an embedding with exponential-backoff retries.

    Sleeps 1s, 2s, 4s... after rate-limit errors; transient API errors are
    logged and retried, with the last one re-raised. The original slept
    pointlessly after the final rate-limited attempt and then discarded
    the caught error, raising an uninformative bare Exception; now the
    last error itself is re-raised (still an Exception subclass, so
    existing `except Exception` callers are unaffected).
    """
    client = OpenAI()
    last_error = None
    for attempt in range(retries):
        try:
            response = client.embeddings.create(
                model="text-embedding-3-small",
                input=text
            )
            return response.data[0].embedding
        except RateLimitError as e:
            last_error = e
            if attempt == retries - 1:
                break  # no point sleeping when there is no retry left
            wait = 2 ** attempt
            print(f"速率限制,等待 {wait} 秒...")
            time.sleep(wait)
        except APIError as e:
            last_error = e
            print(f"API 错误: {e}")
            if attempt == retries - 1:
                raise
    if last_error is not None:
        raise last_error
    # Only reachable if retries <= 0.
    raise Exception("获取嵌入失败")
下一步 #
现在你已经掌握了文本嵌入的使用方法,接下来学习 Assistants API,了解如何构建功能强大的 AI 助手!
最后更新:2026-03-29