向量嵌入 #
概述 #
向量嵌入(Embeddings)是将文本转换为高维数值向量的技术:语义相近的文本会被映射到向量空间中相近的位置,因此可以用向量之间的距离或相似度来衡量文本的语义关系,从而支持语义搜索、相似度计算等应用。
基本概念 #
什么是向量嵌入? #
text
┌─────────────────────────────────────────────────────────────┐
│ 向量嵌入原理 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 文本 向量 │
│ ┌─────────────────┐ ┌─────────────────────┐ │
│ │ "人工智能" │ ──▶ │ [0.1, 0.8, 0.3, │ │
│ │ │ │ -0.2, 0.5, ...] │ │
│ └─────────────────┘ └─────────────────────┘ │
│ │
│ 特点: │
│ - 语义相似的文本 → 向量距离近 │
│ - 语义不同的文本 → 向量距离远 │
│ - 支持数学运算(相似度计算) │
│ │
│ 示例: │
│ "机器学习" ≈ "人工智能" (向量相似) │
│ "苹果" ≈ "水果" (向量相似) │
│ "汽车" ≠ "水果" (向量不同) │
│ │
└─────────────────────────────────────────────────────────────┘
配置嵌入服务 #
OpenAI 嵌入 #
csharp
using Microsoft.SemanticKernel;
// Read the API key up front and fail fast with a clear message: the environment
// lookup returns null when the variable is not set, and an empty value is useless too.
var openAiApiKey = Environment.GetEnvironmentVariable("OPENAI_API_KEY");
if (string.IsNullOrWhiteSpace(openAiApiKey))
{
    throw new InvalidOperationException(
        "The OPENAI_API_KEY environment variable is not set.");
}

var builder = Kernel.CreateBuilder();

// Register the OpenAI text-embedding service for the chosen model.
builder.AddOpenAITextEmbeddingGeneration(
    modelId: "text-embedding-3-small",
    apiKey: openAiApiKey
);

var kernel = builder.Build();
Azure OpenAI 嵌入 #
csharp
// Azure OpenAI connection settings. The endpoint and key below are placeholders;
// substitute the values of your own resource.
const string azureDeployment = "text-embedding-ada-002";
const string azureEndpoint = "https://your-resource.openai.azure.com/";
const string azureApiKey = "api-key";

// Register the Azure OpenAI text-embedding service on the existing builder.
builder.AddAzureOpenAITextEmbeddingGeneration(
    deploymentName: azureDeployment,
    endpoint: azureEndpoint,
    apiKey: azureApiKey);
支持的模型 #
| 模型 | 维度 | 描述 |
|---|---|---|
| text-embedding-3-small | 1536 | 高性价比 |
| text-embedding-3-large | 3072 | 最高质量 |
| text-embedding-ada-002 | 1536 | 旧版稳定 |
生成嵌入 #
基本使用 #
csharp
// Resolve the embedding generator that was registered on the kernel.
ITextEmbeddingGenerationService embeddingGenerator =
    kernel.GetRequiredService<ITextEmbeddingGenerationService>();

// Embed a single piece of text and print the vector's dimension.
ReadOnlyMemory<float> embedding = await embeddingGenerator.GenerateEmbeddingAsync("人工智能");
Console.WriteLine($"维度: {embedding.Length}"); // e.g. 1536

// Embed several texts with a single call.
string[] texts = { "人工智能", "机器学习", "苹果" };
var embeddings = await embeddingGenerator.GenerateEmbeddingsAsync(texts);
批量处理 #
csharp
/// <summary>
/// Generates an embedding for every document, sending the texts to the service in batches.
/// </summary>
/// <param name="service">Embedding service used to embed each batch.</param>
/// <param name="documents">Texts to embed; enumerated once, in order.</param>
/// <param name="batchSize">Maximum number of texts sent per call. Defaults to 100.</param>
/// <param name="cancellationToken">Token forwarded to every embedding call.</param>
/// <returns>One <c>EmbeddingItem</c> per input text, in input order.</returns>
/// <exception cref="ArgumentNullException"><paramref name="service"/> or <paramref name="documents"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="batchSize"/> is not positive.</exception>
/// <exception cref="InvalidOperationException">The service returned a different number of embeddings than texts sent.</exception>
public async Task<List<EmbeddingItem>> ProcessDocumentsAsync(
    ITextEmbeddingGenerationService service,
    IEnumerable<string> documents,
    int batchSize = 100,
    System.Threading.CancellationToken cancellationToken = default)
{
    ArgumentNullException.ThrowIfNull(service);
    ArgumentNullException.ThrowIfNull(documents);
    if (batchSize <= 0)
    {
        throw new ArgumentOutOfRangeException(
            nameof(batchSize), batchSize, "Batch size must be positive.");
    }

    var results = new List<EmbeddingItem>();

    foreach (var batch in documents.Chunk(batchSize))
    {
        var embeddings = await service.GenerateEmbeddingsAsync(
            batch, cancellationToken: cancellationToken);

        // Texts and embeddings are paired by position, so a count mismatch would
        // either throw an index error or silently misalign the results.
        if (embeddings.Count != batch.Length)
        {
            throw new InvalidOperationException(
                $"Expected {batch.Length} embeddings but received {embeddings.Count}.");
        }

        for (int i = 0; i < batch.Length; i++)
        {
            results.Add(new EmbeddingItem
            {
                Text = batch[i],
                Embedding = embeddings[i]
            });
        }
    }

    return results;
}
相似度计算 #
余弦相似度 #
csharp
/// <summary>
/// Computes the cosine similarity of two vectors: their dot product divided by
/// the product of their Euclidean norms.
/// </summary>
/// <param name="a">First vector.</param>
/// <param name="b">Second vector; must have the same length as <paramref name="a"/>.</param>
/// <returns>
/// The cosine similarity, or 0 when either vector has zero magnitude
/// (this includes empty vectors).
/// </returns>
/// <exception cref="ArgumentException">The vectors have different lengths.</exception>
public static float CosineSimilarity(ReadOnlyMemory<float> a, ReadOnlyMemory<float> b)
{
    // Without this check a shorter b throws an index error mid-loop and a longer b
    // is silently truncated.
    if (a.Length != b.Length)
    {
        throw new ArgumentException(
            $"Vectors must have the same length (got {a.Length} and {b.Length}).",
            nameof(b));
    }

    var spanA = a.Span;
    var spanB = b.Span;

    float dotProduct = 0;
    float normA = 0;
    float normB = 0;

    for (int i = 0; i < spanA.Length; i++)
    {
        dotProduct += spanA[i] * spanB[i];
        normA += spanA[i] * spanA[i];
        normB += spanB[i] * spanB[i];
    }

    var denominator = MathF.Sqrt(normA) * MathF.Sqrt(normB);

    // A zero-magnitude vector has no direction; return 0 instead of the NaN that 0/0 yields.
    return denominator == 0f ? 0f : dotProduct / denominator;
}
使用示例 #
csharp
// Embed three phrases: two related concepts and one unrelated word.
var aiVector = await embeddingGenerator.GenerateEmbeddingAsync("人工智能");
var mlVector = await embeddingGenerator.GenerateEmbeddingAsync("机器学习");
var appleVector = await embeddingGenerator.GenerateEmbeddingAsync("苹果");

// Compare the first phrase against each of the other two.
var aiVsMl = CosineSimilarity(aiVector, mlVector);       // expected to be high
var aiVsApple = CosineSimilarity(aiVector, appleVector); // expected to be low

// The trailing numbers are illustrative values, not guaranteed outputs.
Console.WriteLine($"AI vs ML: {aiVsMl:F4}"); // ~0.9
Console.WriteLine($"AI vs Apple: {aiVsApple:F4}"); // ~0.3
欧几里得距离 #
csharp
/// <summary>
/// Computes the Euclidean (straight-line) distance between two vectors.
/// </summary>
/// <param name="a">First vector.</param>
/// <param name="b">Second vector; must have the same length as <paramref name="a"/>.</param>
/// <returns>The square root of the summed squared component differences; 0 for empty vectors.</returns>
/// <exception cref="ArgumentException">The vectors have different lengths.</exception>
public static float EuclideanDistance(ReadOnlyMemory<float> a, ReadOnlyMemory<float> b)
{
    // Without this check a shorter b throws an index error mid-loop and a longer b
    // is silently truncated.
    if (a.Length != b.Length)
    {
        throw new ArgumentException(
            $"Vectors must have the same length (got {a.Length} and {b.Length}).",
            nameof(b));
    }

    var spanA = a.Span;
    var spanB = b.Span;

    float sum = 0;
    for (int i = 0; i < spanA.Length; i++)
    {
        var diff = spanA[i] - spanB[i];
        sum += diff * diff;
    }

    return MathF.Sqrt(sum);
}
向量数据库 #
内存存储 #
csharp
using Microsoft.SemanticKernel.Memory;
var memoryStore = new VolatileMemoryStore();

// Create the collection before writing to it, as the other store examples in this
// document do.
// NOTE(review): the in-memory store is expected to reject upserts into a collection
// that does not exist — confirm against your Semantic Kernel version.
await memoryStore.CreateCollectionAsync("collection");

// Add a vector: the metadata describes the stored text, and `embedding` is its vector.
await memoryStore.UpsertAsync("collection", new MemoryRecord(
    new MemoryRecordMetadata(
        isReference: false,
        id: "doc-1",
        text: "人工智能是计算机科学的一个分支",
        description: "",
        externalSourceName: "",
        additionalMetadata: ""
    ),
    embedding,
    null
));

// Search. The call returns an async stream: enumerate it with `await foreach`
// to obtain the (record, score) pairs.
var results = memoryStore.GetNearestMatchesAsync(
    "collection",
    queryEmbedding,
    limit: 5,
    minRelevanceScore: 0.7
);
Qdrant 集成 #
csharp
using Microsoft.SemanticKernel.Connectors.Qdrant;
// Connect to a local Qdrant instance. 1536 is the vector size listed for
// text-embedding-3-small and text-embedding-ada-002 in the model table.
// NOTE(review): the separate host/port constructor belongs to pre-1.0 Semantic Kernel;
// the 1.x connector is understood to take a single endpoint URL — confirm against the
// package version you reference.
var memoryStore = new QdrantMemoryStore(
    "http://localhost:6333",
    vectorSize: 1536
);

// Create the collection.
await memoryStore.CreateCollectionAsync("documents");

// Insert data: embed each document's content and store it with its metadata.
foreach (var doc in documents)
{
    var embedding = await embeddingGenerator.GenerateEmbeddingAsync(doc.Content);

    await memoryStore.UpsertAsync("documents", new MemoryRecord(
        new MemoryRecordMetadata(
            isReference: false,
            id: doc.Id,
            text: doc.Content,
            description: doc.Title,
            externalSourceName: "",
            // Extra metadata is stored as a JSON string.
            additionalMetadata: JsonSerializer.Serialize(doc.Metadata)
        ),
        embedding,
        null
    ));
}
Redis 集成 #
csharp
using Microsoft.SemanticKernel.Connectors.Redis;
// Connection string and collection name used throughout this example.
const string redisConnection = "localhost:6379";
const string knowledgeCollection = "knowledge";

var memoryStore = new RedisMemoryStore(redisConnection);
await memoryStore.CreateCollectionAsync(knowledgeCollection);

// Store a vector record in the collection.
await memoryStore.UpsertAsync(knowledgeCollection, record);
语义搜索 #
完整示例 #
csharp
/// <summary>
/// Indexes documents into a memory store and runs nearest-match searches against
/// them, using an embedding service to vectorize both documents and queries.
/// </summary>
public class SemanticSearchService
{
    private readonly ITextEmbeddingGenerationService _embeddingService;
    private readonly IMemoryStore _memoryStore;
    private const string Collection = "documents";

    /// <summary>Creates the service from its two collaborators.</summary>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public SemanticSearchService(
        ITextEmbeddingGenerationService embeddingService,
        IMemoryStore memoryStore)
    {
        _embeddingService = embeddingService ?? throw new ArgumentNullException(nameof(embeddingService));
        _memoryStore = memoryStore ?? throw new ArgumentNullException(nameof(memoryStore));
    }

    /// <summary>
    /// Creates the collection, then embeds each document's content and upserts it
    /// with its id, title, and JSON-serialized metadata.
    /// </summary>
    /// <param name="documents">Documents to index; processed one at a time, in order.</param>
    /// <exception cref="ArgumentNullException"><paramref name="documents"/> is null.</exception>
    public async Task IndexDocumentsAsync(IEnumerable<Document> documents)
    {
        ArgumentNullException.ThrowIfNull(documents);

        await _memoryStore.CreateCollectionAsync(Collection);

        foreach (var doc in documents)
        {
            var embedding = await _embeddingService.GenerateEmbeddingAsync(doc.Content);

            await _memoryStore.UpsertAsync(Collection, new MemoryRecord(
                new MemoryRecordMetadata(
                    isReference: false,
                    id: doc.Id,
                    text: doc.Content,
                    description: doc.Title,
                    externalSourceName: "",
                    additionalMetadata: JsonSerializer.Serialize(doc.Metadata)
                ),
                embedding,
                null
            ));
        }
    }

    /// <summary>
    /// Embeds the query and streams the nearest matches from the collection.
    /// </summary>
    /// <param name="query">Query text to embed.</param>
    /// <param name="limit">Maximum number of matches requested from the store.</param>
    /// <param name="minScore">Minimum relevance score passed to the store.</param>
    /// <returns>An async stream of search results built from the matched records.</returns>
    public async IAsyncEnumerable<SearchResult> SearchAsync(
        string query,
        int limit = 5,
        float minScore = 0.7f)
    {
        var queryEmbedding = await _embeddingService.GenerateEmbeddingAsync(query);

        var results = _memoryStore.GetNearestMatchesAsync(
            Collection,
            queryEmbedding,
            limit,
            minScore
        );

        await foreach (var (record, score) in results)
        {
            yield return new SearchResult
            {
                Id = record.Metadata.Id,
                Title = record.Metadata.Description,
                Content = record.Metadata.Text,
                Score = score,
                Metadata = ParseMetadata(record.Metadata.AdditionalMetadata)
            };
        }
    }

    // Parses the JSON metadata string. Records may carry an empty metadata string,
    // which the JSON deserializer rejects with an exception, so blank input (and a
    // JSON "null") map to an empty dictionary instead.
    private static Dictionary<string, string> ParseMetadata(string additionalMetadata)
    {
        if (string.IsNullOrWhiteSpace(additionalMetadata))
        {
            return new Dictionary<string, string>();
        }

        return JsonSerializer.Deserialize<Dictionary<string, string>>(additionalMetadata)
            ?? new Dictionary<string, string>();
    }
}
嵌入优化 #
文本预处理 #
csharp
/// <summary>
/// Normalizes text before it is embedded: trims it, collapses whitespace runs,
/// and caps its length.
/// </summary>
public class TextPreprocessor
{
    // Maximum number of UTF-16 characters kept. This is a character count, not a token count.
    private const int MaxLength = 8000;

    // Matches one or more consecutive whitespace characters; built once and reused.
    private static readonly System.Text.RegularExpressions.Regex WhitespaceRun =
        new System.Text.RegularExpressions.Regex(
            @"\s+", System.Text.RegularExpressions.RegexOptions.Compiled);

    /// <summary>
    /// Trims the text, replaces every whitespace run with a single space, and
    /// truncates the result to at most 8000 characters.
    /// </summary>
    /// <param name="text">Text to clean.</param>
    /// <returns>The cleaned and, if necessary, truncated text.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public string Preprocess(string text)
    {
        ArgumentNullException.ThrowIfNull(text);

        // Trim, then collapse internal whitespace.
        text = WhitespaceRun.Replace(text.Trim(), " ");

        if (text.Length > MaxLength)
        {
            // Cutting right after a high surrogate would leave half of a surrogate pair
            // (an invalid character) at the end, so drop that character as well.
            var cut = char.IsHighSurrogate(text[MaxLength - 1]) ? MaxLength - 1 : MaxLength;
            text = text.Substring(0, cut);
        }

        return text;
    }
}
分块策略 #
csharp
/// <summary>
/// Splits text into sentence-aligned chunks that stay within an estimated token budget.
/// </summary>
public class TextChunker
{
    /// <summary>
    /// Splits <paramref name="text"/> into chunks of whole sentences. Each sentence keeps
    /// its terminating punctuation, and consecutive chunks share trailing sentences worth
    /// up to <paramref name="overlap"/> estimated tokens.
    /// </summary>
    /// <param name="text">Text to split. Null or empty yields an empty list.</param>
    /// <param name="maxTokens">
    /// Estimated token budget per chunk. A single sentence larger than the budget still
    /// becomes its own chunk.
    /// </param>
    /// <param name="overlap">
    /// Estimated token budget of trailing sentences repeated at the start of the next
    /// chunk. Zero or negative disables overlap.
    /// </param>
    /// <returns>The chunks, in document order.</returns>
    public List<string> ChunkText(string text, int maxTokens = 500, int overlap = 50)
    {
        var chunks = new List<string>();
        if (string.IsNullOrEmpty(text))
        {
            return chunks;
        }

        // Sentences of the chunk being built, and their estimated token total.
        var window = new List<string>();
        var windowTokens = 0;

        foreach (var sentence in SplitSentences(text))
        {
            var sentenceTokens = EstimateTokens(sentence);

            if (windowTokens + sentenceTokens > maxTokens && window.Count > 0)
            {
                chunks.Add(string.Concat(window));

                // Seed the next chunk with the tail of the one just emitted.
                windowTokens = KeepOverlapTail(window, overlap);

                // If the carried tail leaves no room for the new sentence, drop it so
                // the next chunk is not just a repeat of already-emitted text.
                if (windowTokens + sentenceTokens > maxTokens)
                {
                    window.Clear();
                    windowTokens = 0;
                }
            }

            window.Add(sentence);
            windowTokens += sentenceTokens;
        }

        if (window.Count > 0)
        {
            chunks.Add(string.Concat(window));
        }

        return chunks;
    }

    // Splits text after each run of sentence terminators, keeping the terminators
    // attached to their sentence. Segments made only of terminators are skipped.
    private static List<string> SplitSentences(string text)
    {
        var sentences = new List<string>();
        var start = 0;
        var i = 0;

        while (i < text.Length)
        {
            if (!IsTerminator(text[i]))
            {
                i++;
                continue;
            }

            // Absorb the whole terminator run (e.g. "..." or "?!") into this sentence.
            while (i < text.Length && IsTerminator(text[i]))
            {
                i++;
            }

            AddSegment(sentences, text, start, i);
            start = i;
        }

        // Trailing text without a terminator.
        AddSegment(sentences, text, start, text.Length);
        return sentences;
    }

    // Adds text[start..end) unless it is empty or starts with a terminator
    // (which means it contains terminators only).
    private static void AddSegment(List<string> sentences, string text, int start, int end)
    {
        if (end > start && !IsTerminator(text[start]))
        {
            sentences.Add(text.Substring(start, end - start));
        }
    }

    // Sentence terminators: ideographic full stop, full-width '!' and '?', and ASCII '.', '!', '?'.
    private static bool IsTerminator(char c) =>
        c is '。' or '\uFF01' or '\uFF1F' or '.' or '!' or '?';

    // Trims the window down to its trailing sentences whose estimated tokens fit within
    // the overlap budget, and returns the token total of what remains.
    private static int KeepOverlapTail(List<string> window, int overlap)
    {
        if (overlap <= 0)
        {
            window.Clear();
            return 0;
        }

        var kept = 0;
        var tokens = 0;
        for (var i = window.Count - 1; i >= 0; i--)
        {
            var sentenceTokens = EstimateTokens(window[i]);
            if (tokens + sentenceTokens > overlap)
            {
                break;
            }

            tokens += sentenceTokens;
            kept++;
        }

        window.RemoveRange(0, window.Count - kept);
        return tokens;
    }

    // Rough estimate: one token per two UTF-16 characters, rounded down. This is a
    // heuristic, not a real tokenizer.
    private static int EstimateTokens(string text)
    {
        return text.Length / 2;
    }
}
最佳实践 #
1. 选择合适的模型 #
csharp
// NOTE(review): no API key is passed in these calls — presumably credentials are
// supplied elsewhere; confirm before copying this into real code.

// When quality matters most.
builder.AddOpenAITextEmbeddingGeneration(modelId: "text-embedding-3-large");

// When cost matters most.
builder.AddOpenAITextEmbeddingGeneration(modelId: "text-embedding-3-small");
2. 批量处理 #
csharp
// Recommended: embed the whole list with a single call.
var embeddings = await service.GenerateEmbeddingsAsync(texts);

// Avoid: embedding the texts one call at a time is inefficient.
foreach (var item in texts)
{
    var single = await service.GenerateEmbeddingAsync(item);
}
3. 缓存嵌入 #
csharp
/// <summary>
/// Wraps an embedding service with a memory cache so that each distinct text
/// is embedded only while it is absent from the cache.
/// </summary>
public class CachedEmbeddingService
{
    private readonly ITextEmbeddingGenerationService _service;
    private readonly IMemoryCache _cache;

    /// <summary>Creates the caching wrapper from the service it wraps and the cache it uses.</summary>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public CachedEmbeddingService(ITextEmbeddingGenerationService service, IMemoryCache cache)
    {
        _service = service ?? throw new ArgumentNullException(nameof(service));
        _cache = cache ?? throw new ArgumentNullException(nameof(cache));
    }

    /// <summary>
    /// Returns the cached embedding for <paramref name="text"/>, generating and caching it on a miss.
    /// </summary>
    /// <param name="text">Text to embed.</param>
    /// <returns>The embedding vector for the text.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public async Task<ReadOnlyMemory<float>> GetEmbeddingAsync(string text)
    {
        ArgumentNullException.ThrowIfNull(text);

        // Key on a SHA-256 digest of the text. string.GetHashCode() is only 32 bits,
        // so two different texts could share a key and return the wrong embedding.
        var cacheKey = $"embedding:{ComputeContentHash(text)}";

        // NOTE(review): entries are created without an expiration or size limit —
        // confirm this matches the intended cache policy.
        return await _cache.GetOrCreateAsync(cacheKey, async _ =>
            await _service.GenerateEmbeddingAsync(text)
        );
    }

    // Hex-encoded SHA-256 digest of the UTF-8 bytes of the text.
    private static string ComputeContentHash(string text)
    {
        var bytes = System.Text.Encoding.UTF8.GetBytes(text);
        return Convert.ToHexString(System.Security.Cryptography.SHA256.HashData(bytes));
    }
}
下一步 #
现在你已经掌握了向量嵌入,接下来学习 RAG 应用,了解如何构建检索增强生成系统!
最后更新:2026-04-04