文档问答系统 #
项目概述 #
本节将构建一个企业级文档问答系统,具备以下功能:
- 多格式文档支持(PDF、Word、TXT、Markdown)
- 智能文档分块
- 向量化存储与检索
- 精准问答生成
- 来源追溯
架构设计 #
text
┌─────────────────────────────────────────────────────────────┐
│ 文档问答系统架构 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 文档导入流程: │
│ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │
│ │ 文档上传 │───▶│ 文档解析 │───▶│ 文本分块 │───▶│ 向量存储 │ │
│ └─────────┘ └─────────┘ └─────────┘ └─────────┘ │
│ │
│ 问答流程: │
│ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │
│ │ 用户提问 │───▶│ 向量检索 │───▶│ 构建提示 │───▶│ 生成回答 │ │
│ └─────────┘ └─────────┘ └─────────┘ └─────────┘ │
│ │
└─────────────────────────────────────────────────────────────┘
项目结构 #
text
DocumentQA/
├── Controllers/
│ ├── DocumentController.cs
│ └── QAController.cs
├── Services/
│ ├── IDocumentService.cs
│ ├── DocumentService.cs
│ ├── IQAService.cs
│ ├── QAService.cs
│ ├── IEmbeddingService.cs
│ └── EmbeddingService.cs
├── Processors/
│ ├── IDocumentProcessor.cs
│ ├── PdfProcessor.cs
│ ├── WordProcessor.cs
│ └── TextProcessor.cs
├── Models/
│ ├── Document.cs
│ ├── DocumentChunk.cs
│ ├── QAResponse.cs
│ └── SearchResult.cs
└── Program.cs
核心代码实现 #
1. 文档模型 #
csharp
/// <summary>
/// An imported source document together with its descriptive fields.
/// </summary>
public class Document
{
    /// <summary>Unique identifier; defaults to a newly generated GUID string.</summary>
    public string Id { get; set; } = Guid.NewGuid().ToString();

    /// <summary>Display title of the document.</summary>
    public string Title { get; set; } = string.Empty;

    /// <summary>Text content of the document.</summary>
    public string Content { get; set; } = string.Empty;

    /// <summary>Where the document came from (the importer stores the file name here).</summary>
    public string Source { get; set; } = string.Empty;

    /// <summary>File type of the document (the importer stores the file extension here).</summary>
    public string FileType { get; set; } = string.Empty;

    /// <summary>Size of the original file.</summary>
    public long FileSize { get; set; }

    /// <summary>Creation timestamp in UTC; defaults to the moment the object is constructed.</summary>
    public DateTime CreatedAt { get; set; } = DateTime.UtcNow;

    /// <summary>Optional descriptive metadata; never null by default.</summary>
    public DocumentMetadata Metadata { get; set; } = new DocumentMetadata();
}
/// <summary>
/// One slice of a document's text, the unit that is embedded and indexed.
/// </summary>
public class DocumentChunk
{
    /// <summary>Unique identifier; defaults to a newly generated GUID string.</summary>
    public string Id { get; set; } = Guid.NewGuid().ToString();

    /// <summary>Identifier of the document this chunk was cut from.</summary>
    public string DocumentId { get; set; } = string.Empty;

    /// <summary>Text of the chunk.</summary>
    public string Content { get; set; } = string.Empty;

    /// <summary>Ordinal of the chunk within its document.</summary>
    public int ChunkIndex { get; set; }

    /// <summary>Position at which the chunk starts.</summary>
    public int StartPosition { get; set; }

    /// <summary>Position at which the chunk ends.</summary>
    public int EndPosition { get; set; }

    /// <summary>Embedding vector of the chunk, or null when none has been attached.</summary>
    public float[]? Embedding { get; set; }
}
/// <summary>
/// Optional descriptive attributes attached to a document.
/// </summary>
public class DocumentMetadata
{
    /// <summary>Author of the document; empty when unknown.</summary>
    public string Author { get; set; } = string.Empty;

    /// <summary>Category label; empty when not categorised.</summary>
    public string Category { get; set; } = string.Empty;

    /// <summary>Free-form tags; starts as an empty list owned by this instance.</summary>
    public List<string> Tags { get; set; } = new List<string>();
}
2. 文档处理器 #
csharp
/// <summary>
/// Extracts plain text from one family of document formats.
/// </summary>
public interface IDocumentProcessor
{
/// <summary>
/// Reports whether this processor handles the given file type.
/// </summary>
/// <param name="fileType">File type to test; the implementations in this file compare it against extensions such as ".pdf".</param>
/// <returns>True when this processor can extract text from the file type.</returns>
bool CanProcess(string fileType);
/// <summary>
/// Extracts the text of a document.
/// </summary>
/// <param name="content">Stream holding the raw document.</param>
/// <returns>The extracted text.</returns>
Task<string> ProcessAsync(Stream content);
}
/// <summary>
/// Extracts text from PDF files, one line block per page.
/// </summary>
public class PdfProcessor : IDocumentProcessor
{
    /// <summary>
    /// Returns true for the ".pdf" extension (case-insensitive); false for anything else, including null.
    /// </summary>
    public bool CanProcess(string fileType) =>
        string.Equals(fileType, ".pdf", StringComparison.OrdinalIgnoreCase);

    /// <summary>
    /// Reads every page of the PDF and returns the page texts, each followed by a line break.
    /// </summary>
    /// <param name="content">Stream holding the PDF.</param>
    /// <returns>A completed task carrying the extracted text.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="content"/> is null.</exception>
    public Task<string> ProcessAsync(Stream content)
    {
        ArgumentNullException.ThrowIfNull(content);

        // Parsing is synchronous, so the method is not marked async; wrapping the
        // result avoids a state machine that would never await anything.
        using var document = PdfDocument.Open(content);
        var text = new StringBuilder();
        foreach (var page in document.GetPages())
        {
            text.AppendLine(page.Text);
        }

        return Task.FromResult(text.ToString());
    }
}
/// <summary>
/// Extracts text from Word (.docx) files, one line per paragraph.
/// </summary>
public class WordProcessor : IDocumentProcessor
{
    /// <summary>
    /// Returns true for the ".docx" extension (case-insensitive); false for anything else, including null.
    /// </summary>
    public bool CanProcess(string fileType) =>
        string.Equals(fileType, ".docx", StringComparison.OrdinalIgnoreCase);

    /// <summary>
    /// Opens the document read-only and returns its paragraph texts joined by line breaks.
    /// </summary>
    /// <param name="content">Stream holding the .docx package.</param>
    /// <returns>A completed task carrying the extracted text; empty when the document has no body.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="content"/> is null.</exception>
    public Task<string> ProcessAsync(Stream content)
    {
        ArgumentNullException.ThrowIfNull(content);

        using var document = WordprocessingDocument.Open(content, false);
        var body = document.MainDocumentPart?.Document.Body;
        if (body is null)
        {
            return Task.FromResult("");
        }

        // Body.InnerText concatenates every run with no separator, which glues the last
        // word of one paragraph to the first word of the next. Emit one line per paragraph
        // instead. Paragraphs nested inside another paragraph are skipped because the
        // outer paragraph's InnerText already contains their text.
        var paragraphTexts = body
            .Descendants<DocumentFormat.OpenXml.Wordprocessing.Paragraph>()
            .Where(p => !p.Ancestors<DocumentFormat.OpenXml.Wordprocessing.Paragraph>().Any())
            .Select(p => p.InnerText);

        // Evaluated eagerly here, before the document is disposed on return.
        return Task.FromResult(string.Join("\n", paragraphTexts));
    }
}
/// <summary>
/// Reads plain-text and Markdown files as-is.
/// </summary>
public class TextProcessor : IDocumentProcessor
{
    /// <summary>
    /// Returns true for the ".txt" and ".md" extensions (case-insensitive); false for anything else, including null.
    /// </summary>
    public bool CanProcess(string fileType) =>
        string.Equals(fileType, ".txt", StringComparison.OrdinalIgnoreCase) ||
        string.Equals(fileType, ".md", StringComparison.OrdinalIgnoreCase);

    /// <summary>
    /// Reads the whole stream as text.
    /// </summary>
    /// <param name="content">Stream holding the text; it is left open for the caller.</param>
    /// <returns>The full text of the stream.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="content"/> is null.</exception>
    public async Task<string> ProcessAsync(Stream content)
    {
        ArgumentNullException.ThrowIfNull(content);

        // leaveOpen: the stream belongs to the caller, which may still query it
        // (for example its Length) after the text has been read.
        using var reader = new StreamReader(content, leaveOpen: true);
        return await reader.ReadToEndAsync();
    }
}
3. 文档服务 #
csharp
/// <summary>
/// Imports documents: extracts their text with a matching <see cref="IDocumentProcessor"/>,
/// cuts the text into overlapping chunks, embeds each chunk and stores it in the
/// "documents" memory collection.
/// </summary>
public class DocumentService : IDocumentService
{
    private const string CollectionName = "documents";

    // Target upper bound for a chunk, in characters. A single sentence longer than this
    // is kept whole, so individual chunks can exceed it.
    private const int ChunkSize = 500;

    // Number of trailing characters of a finished chunk repeated at the start of the next one.
    private const int ChunkOverlap = 50;

    // Sentence terminators: CJK full stop, ASCII and full-width '!' / '?', ASCII '.', line break.
    private static readonly char[] SentenceDelimiters =
        { '。', '!', '?', '.', '\uFF01', '\uFF1F', '\n' };

    private readonly IEnumerable<IDocumentProcessor> _processors;
    private readonly ITextEmbeddingGenerationService _embeddingService;
    private readonly IMemoryStore _memoryStore;
    private readonly ILogger<DocumentService> _logger;

    /// <summary>
    /// Creates the service from its injected dependencies.
    /// </summary>
    /// <exception cref="ArgumentNullException">Any dependency is null.</exception>
    public DocumentService(
        IEnumerable<IDocumentProcessor> processors,
        ITextEmbeddingGenerationService embeddingService,
        IMemoryStore memoryStore,
        ILogger<DocumentService> logger)
    {
        _processors = processors ?? throw new ArgumentNullException(nameof(processors));
        _embeddingService = embeddingService ?? throw new ArgumentNullException(nameof(embeddingService));
        _memoryStore = memoryStore ?? throw new ArgumentNullException(nameof(memoryStore));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Extracts, chunks and indexes a document.
    /// </summary>
    /// <param name="content">Stream holding the raw file.</param>
    /// <param name="fileName">Original file name; its extension-less form becomes the title.</param>
    /// <param name="fileType">File type used to pick a processor (the controller passes the file extension).</param>
    /// <param name="metadata">Optional metadata; an empty instance is used when null.</param>
    /// <returns>The descriptor of the imported document.</returns>
    /// <exception cref="ArgumentNullException">A required argument is null.</exception>
    /// <exception cref="NotSupportedException">No registered processor accepts <paramref name="fileType"/>.</exception>
    public async Task<Document> ImportDocumentAsync(
        Stream content,
        string fileName,
        string fileType,
        DocumentMetadata? metadata = null)
    {
        ArgumentNullException.ThrowIfNull(content);
        ArgumentNullException.ThrowIfNull(fileName);
        ArgumentNullException.ThrowIfNull(fileType);

        var processor = _processors.FirstOrDefault(p => p.CanProcess(fileType));
        if (processor is null)
        {
            throw new NotSupportedException($"不支持的文件类型: {fileType}");
        }

        // Capture the size before the processor touches the stream: a processor may
        // dispose it, and non-seekable streams do not support Length at all.
        var fileSize = content.CanSeek ? content.Length : 0;

        var text = await processor.ProcessAsync(content);

        // NOTE(review): Document.Content is left at its default, as before — confirm
        // whether the extracted text is meant to be returned with the descriptor.
        var document = new Document
        {
            Title = Path.GetFileNameWithoutExtension(fileName),
            Source = fileName,
            FileType = fileType,
            FileSize = fileSize,
            Metadata = metadata ?? new DocumentMetadata()
        };

        var chunks = CreateChunks(document.Id, text);
        await IndexChunksAsync(chunks);

        _logger.LogInformation(
            "Indexed {ChunkCount} chunks for document {DocumentId}", chunks.Count, document.Id);

        return document;
    }

    /// <summary>
    /// Removes every indexed chunk of a document from the memory collection.
    /// </summary>
    /// <param name="documentId">Identifier of the document whose chunks are removed.</param>
    public async Task DeleteDocumentAsync(string documentId)
    {
        // NOTE(review): GetDocumentChunksAsync is not defined in the visible part of this
        // class — it has to be supplied before this method compiles.
        var chunks = await GetDocumentChunksAsync(documentId);
        foreach (var chunk in chunks)
        {
            await _memoryStore.RemoveAsync(CollectionName, chunk.Id);
        }
    }

    /// <summary>
    /// Groups consecutive sentences into chunks of roughly <see cref="ChunkSize"/> characters.
    /// Every chunk is a contiguous slice of <paramref name="text"/>, so StartPosition and
    /// EndPosition are real offsets into it and the content keeps its punctuation.
    /// </summary>
    private static List<DocumentChunk> CreateChunks(string documentId, string text)
    {
        var chunks = new List<DocumentChunk>();
        if (string.IsNullOrEmpty(text))
        {
            return chunks;
        }

        var chunkStart = -1; // -1 until the first sentence has been seen
        var chunkEnd = -1;

        foreach (var (sentenceStart, sentenceEnd) in SplitSentenceSpans(text))
        {
            if (chunkStart < 0)
            {
                chunkStart = sentenceStart;
            }
            else if (sentenceEnd - chunkStart > ChunkSize)
            {
                // Adding this sentence would overflow: close the current chunk and start
                // the next one ChunkOverlap characters before its end (or at its start
                // when the chunk is shorter than the overlap).
                chunks.Add(BuildChunk(documentId, text, chunks.Count, chunkStart, chunkEnd));
                chunkStart = Math.Max(chunkStart, chunkEnd - ChunkOverlap);
            }

            chunkEnd = sentenceEnd;
        }

        if (chunkStart >= 0)
        {
            chunks.Add(BuildChunk(documentId, text, chunks.Count, chunkStart, chunkEnd));
        }

        return chunks;
    }

    /// <summary>Materialises the slice [start, end) of <paramref name="text"/> as a chunk.</summary>
    private static DocumentChunk BuildChunk(
        string documentId, string text, int index, int start, int end) =>
        new DocumentChunk
        {
            DocumentId = documentId,
            Content = text.Substring(start, end - start),
            ChunkIndex = index,
            StartPosition = start,
            EndPosition = end
        };

    /// <summary>
    /// Yields the [Start, End) offsets of each sentence in <paramref name="text"/>. A sentence
    /// keeps its terminating delimiters; surrounding whitespace is excluded and
    /// whitespace-only segments are skipped.
    /// </summary>
    private static IEnumerable<(int Start, int End)> SplitSentenceSpans(string text)
    {
        var segmentStart = 0;
        for (var i = 0; i < text.Length; i++)
        {
            if (Array.IndexOf(SentenceDelimiters, text[i]) < 0)
            {
                continue;
            }

            // Swallow a run of delimiters ("...", "?!", ".\n") into the same sentence.
            while (i + 1 < text.Length && Array.IndexOf(SentenceDelimiters, text[i + 1]) >= 0)
            {
                i++;
            }

            if (TryTrim(text, segmentStart, i + 1, out var span))
            {
                yield return span;
            }

            segmentStart = i + 1;
        }

        if (TryTrim(text, segmentStart, text.Length, out var tail))
        {
            yield return tail;
        }
    }

    /// <summary>
    /// Shrinks [start, end) past leading and trailing whitespace; returns false when nothing is left.
    /// </summary>
    private static bool TryTrim(string text, int start, int end, out (int Start, int End) span)
    {
        while (start < end && char.IsWhiteSpace(text[start]))
        {
            start++;
        }

        while (end > start && char.IsWhiteSpace(text[end - 1]))
        {
            end--;
        }

        span = (start, end);
        return end > start;
    }

    /// <summary>
    /// Embeds each chunk and upserts it into the memory collection, creating the
    /// collection first when it does not exist yet.
    /// </summary>
    private async Task IndexChunksAsync(List<DocumentChunk> chunks)
    {
        if (chunks.Count == 0)
        {
            return;
        }

        if (!await _memoryStore.DoesCollectionExistAsync(CollectionName))
        {
            await _memoryStore.CreateCollectionAsync(CollectionName);
        }

        foreach (var chunk in chunks)
        {
            var embedding = await _embeddingService.GenerateEmbeddingAsync(chunk.Content);

            // The anonymous object is serialised with its property names as JSON keys;
            // the numeric members are written as JSON numbers.
            var record = new MemoryRecord(
                new MemoryRecordMetadata(
                    isReference: false,
                    id: chunk.Id,
                    text: chunk.Content,
                    description: $"Document: {chunk.DocumentId}, Chunk: {chunk.ChunkIndex}",
                    externalSourceName: "",
                    additionalMetadata: JsonSerializer.Serialize(new
                    {
                        chunk.DocumentId,
                        chunk.ChunkIndex,
                        chunk.StartPosition,
                        chunk.EndPosition
                    })
                ),
                embedding,
                null
            );

            await _memoryStore.UpsertAsync(CollectionName, record);
        }
    }
}
4. 问答服务 #
csharp
/// <summary>
/// Answers questions from the indexed documents: embeds the question, retrieves the
/// nearest chunks from the "documents" memory collection and asks the kernel to answer
/// from those chunks.
/// </summary>
public class QAService : IQAService
{
    private const string CollectionName = "documents";

    // Maximum number of characters of a chunk echoed back in the source list.
    private const int SourcePreviewLength = 200;

    private readonly Kernel _kernel;
    private readonly ITextEmbeddingGenerationService _embeddingService;
    private readonly IMemoryStore _memoryStore;
    private readonly ILogger<QAService> _logger;

    /// <summary>
    /// Creates the service from its injected dependencies.
    /// </summary>
    /// <exception cref="ArgumentNullException">Any dependency is null.</exception>
    public QAService(
        Kernel kernel,
        ITextEmbeddingGenerationService embeddingService,
        IMemoryStore memoryStore,
        ILogger<QAService> logger)
    {
        _kernel = kernel ?? throw new ArgumentNullException(nameof(kernel));
        _embeddingService = embeddingService ?? throw new ArgumentNullException(nameof(embeddingService));
        _memoryStore = memoryStore ?? throw new ArgumentNullException(nameof(memoryStore));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Answers a question from the indexed chunks.
    /// </summary>
    /// <param name="question">The user's question.</param>
    /// <param name="topK">Maximum number of chunks to retrieve.</param>
    /// <param name="minScore">Minimum relevance score a chunk needs to be used.</param>
    /// <returns>
    /// The answer with its sources, or a fixed "nothing found" answer with no sources
    /// when no chunk reaches <paramref name="minScore"/>.
    /// </returns>
    /// <exception cref="ArgumentException"><paramref name="question"/> is null or blank.</exception>
    public async Task<QAResponse> AskAsync(
        string question,
        int topK = 5,
        float minScore = 0.7f)
    {
        if (string.IsNullOrWhiteSpace(question))
        {
            throw new ArgumentException("Question must not be empty.", nameof(question));
        }

        var queryEmbedding = await _embeddingService.GenerateEmbeddingAsync(question);

        var matches = _memoryStore.GetNearestMatchesAsync(
            CollectionName,
            queryEmbedding,
            limit: topK,
            minRelevanceScore: minScore
        );

        var searchResults = new List<SearchResult>();
        await foreach (var (record, score) in matches)
        {
            var (documentId, chunkIndex) = ReadChunkMetadata(record.Metadata.AdditionalMetadata);
            searchResults.Add(new SearchResult
            {
                ChunkId = record.Metadata.Id,
                DocumentId = documentId,
                Content = record.Metadata.Text,
                Score = score,
                ChunkIndex = chunkIndex
            });
        }

        if (searchResults.Count == 0)
        {
            return new QAResponse
            {
                Question = question,
                Answer = "抱歉,我在知识库中没有找到相关信息。",
                Sources = new List<SourceInfo>()
            };
        }

        var answer = await GenerateAnswerAsync(question, searchResults);

        return new QAResponse
        {
            Question = question,
            Answer = answer,
            Sources = searchResults.Select(r => new SourceInfo
            {
                DocumentId = r.DocumentId,
                Content = r.Content.Length > SourcePreviewLength
                    ? r.Content.Substring(0, SourcePreviewLength) + "..."
                    : r.Content,
                Relevance = r.Score
            }).ToList()
        };
    }

    /// <summary>
    /// Reads the "DocumentId" and "ChunkIndex" members from a record's additional-metadata
    /// JSON. The index is accepted as a JSON number or a numeric string. Missing or
    /// malformed data yields ("", 0) instead of failing the whole query.
    /// </summary>
    private (string DocumentId, int ChunkIndex) ReadChunkMetadata(string? additionalMetadata)
    {
        if (string.IsNullOrWhiteSpace(additionalMetadata))
        {
            return ("", 0);
        }

        try
        {
            // Parsed as a JSON tree rather than Dictionary<string, string>: the latter
            // throws as soon as a value is a JSON number.
            using var json = JsonDocument.Parse(additionalMetadata);
            var root = json.RootElement;
            if (root.ValueKind != JsonValueKind.Object)
            {
                return ("", 0);
            }

            var documentId = "";
            if (root.TryGetProperty("DocumentId", out var idElement) &&
                idElement.ValueKind == JsonValueKind.String)
            {
                documentId = idElement.GetString() ?? "";
            }

            var chunkIndex = 0;
            if (root.TryGetProperty("ChunkIndex", out var indexElement))
            {
                if (indexElement.ValueKind == JsonValueKind.Number)
                {
                    indexElement.TryGetInt32(out chunkIndex);
                }
                else if (indexElement.ValueKind == JsonValueKind.String)
                {
                    int.TryParse(
                        indexElement.GetString(),
                        System.Globalization.NumberStyles.Integer,
                        System.Globalization.CultureInfo.InvariantCulture,
                        out chunkIndex);
                }
            }

            return (documentId, chunkIndex);
        }
        catch (JsonException ex)
        {
            _logger.LogWarning(ex, "Ignoring malformed chunk metadata in a search result");
            return ("", 0);
        }
    }

    /// <summary>
    /// Builds the prompt from the numbered search results and the question, and returns
    /// the kernel's answer as text.
    /// </summary>
    private async Task<string> GenerateAnswerAsync(
        string question,
        List<SearchResult> searchResults)
    {
        // Each chunk is prefixed with its 1-based number so the answer can cite it as [n].
        var context = string.Join("\n\n---\n\n",
            searchResults.Select((r, i) => $"[{i + 1}] {r.Content}"));

        // NOTE(review): the retrieved text and the question are interpolated straight into
        // the string handed to InvokePromptAsync; presumably any prompt-template syntax they
        // contain would be interpreted rather than passed through — confirm and consider
        // template variables if so.
        var prompt = $"""
            你是一个专业的文档问答助手。请根据以下参考资料回答问题。
            要求:
            1. 回答必须基于参考资料
            2. 如果资料中没有相关信息,请明确说明
            3. 引用来源时使用 [编号] 格式
            4. 回答要准确、简洁
            参考资料:
            {context}
            问题:{question}
            回答:
            """;

        var result = await _kernel.InvokePromptAsync(prompt);
        return result.ToString();
    }
}
5. API 控制器 #
csharp
/// <summary>
/// HTTP endpoints for uploading and deleting documents.
/// </summary>
[ApiController]
[Route("api/[controller]")]
public class DocumentController : ControllerBase
{
    private readonly IDocumentService _documentService;

    /// <summary>
    /// Creates the controller with the injected document service.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="documentService"/> is null.</exception>
    public DocumentController(IDocumentService documentService)
    {
        _documentService = documentService ?? throw new ArgumentNullException(nameof(documentService));
    }

    /// <summary>
    /// Imports an uploaded file.
    /// </summary>
    /// <param name="file">The uploaded file.</param>
    /// <returns>
    /// 200 with the imported document; 400 when no file (or an empty one) was sent;
    /// 415 when the document service rejects the file type.
    /// </returns>
    [HttpPost("upload")]
    public async Task<ActionResult<Document>> Upload(IFormFile file)
    {
        if (file == null || file.Length == 0)
        {
            return BadRequest("请选择文件");
        }

        // The file name is supplied by the client; keep only its final path segment
        // so directory components never travel further into the system.
        var fileName = Path.GetFileName(file.FileName);
        var fileType = Path.GetExtension(fileName);

        await using var stream = file.OpenReadStream();
        try
        {
            var document = await _documentService.ImportDocumentAsync(
                stream,
                fileName,
                fileType
            );
            return Ok(document);
        }
        catch (NotSupportedException ex)
        {
            // Thrown by the document service when no processor accepts the file type:
            // that is a client error, not a server fault.
            return StatusCode(StatusCodes.Status415UnsupportedMediaType, ex.Message);
        }
    }

    /// <summary>
    /// Deletes a document's indexed data.
    /// </summary>
    /// <param name="id">Identifier of the document.</param>
    /// <returns>204 once the service call completes.</returns>
    [HttpDelete("{id}")]
    public async Task<IActionResult> Delete(string id)
    {
        await _documentService.DeleteDocumentAsync(id);
        return NoContent();
    }
}
/// <summary>
/// HTTP endpoint for asking questions against the indexed documents.
/// </summary>
[ApiController]
[Route("api/[controller]")]
public class QAController : ControllerBase
{
    // Fallbacks used when the request leaves the optional fields out.
    private const int DefaultTopK = 5;
    private const float DefaultMinScore = 0.7f;

    private readonly IQAService _qaService;

    /// <summary>
    /// Creates the controller with the injected question-answering service.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="qaService"/> is null.</exception>
    public QAController(IQAService qaService)
    {
        _qaService = qaService ?? throw new ArgumentNullException(nameof(qaService));
    }

    /// <summary>
    /// Answers the question in the request body.
    /// </summary>
    /// <param name="request">Question plus optional TopK and MinScore overrides.</param>
    /// <returns>200 with the answer; 400 when the question is missing or blank.</returns>
    [HttpPost("ask")]
    public async Task<ActionResult<QAResponse>> Ask([FromBody] QuestionRequest request)
    {
        if (string.IsNullOrWhiteSpace(request?.Question))
        {
            return BadRequest("请输入问题");
        }

        var response = await _qaService.AskAsync(
            request.Question,
            request.TopK ?? DefaultTopK,
            request.MinScore ?? DefaultMinScore
        );
        return Ok(response);
    }
}
6. 程序配置 #
csharp
// Application entry point: registers controllers, Swagger, the Semantic Kernel services,
// the Qdrant memory store, the document processors and the two application services,
// then starts the web host.

// Used when the "Qdrant:Endpoint" setting is absent.
const string DefaultQdrantEndpoint = "http://localhost:6333";

// Must equal the embedding model's output dimension (1536 for text-embedding-3-small
// at its default size).
const int EmbeddingVectorSize = 1536;

var builder = WebApplication.CreateBuilder(args);

builder.Services.AddControllers();
builder.Services.AddEndpointsApiExplorer();
builder.Services.AddSwaggerGen();

builder.Services.AddKernel()
    .AddAzureOpenAIChatCompletion(
        deploymentName: RequireSetting("AzureOpenAI:DeploymentName"),
        endpoint: RequireSetting("AzureOpenAI:Endpoint"),
        apiKey: RequireSetting("AzureOpenAI:ApiKey")
    )
    .AddOpenAITextEmbeddingGeneration(
        modelId: "text-embedding-3-small",
        apiKey: RequireSetting("OpenAI:ApiKey")
    );

// The store takes the service URL and the vector size; host and port are not
// separate arguments.
var qdrantEndpoint = builder.Configuration["Qdrant:Endpoint"] ?? DefaultQdrantEndpoint;
builder.Services.AddSingleton<IMemoryStore, QdrantMemoryStore>(_ =>
    new QdrantMemoryStore(qdrantEndpoint, EmbeddingVectorSize));

builder.Services.AddSingleton<IDocumentProcessor, PdfProcessor>();
builder.Services.AddSingleton<IDocumentProcessor, WordProcessor>();
builder.Services.AddSingleton<IDocumentProcessor, TextProcessor>();

builder.Services.AddScoped<IDocumentService, DocumentService>();
builder.Services.AddScoped<IQAService, QAService>();

var app = builder.Build();

if (app.Environment.IsDevelopment())
{
    app.UseSwagger();
    app.UseSwaggerUI();
}

app.MapControllers();
app.Run();

// Reads a mandatory configuration value and fails at startup with the offending key
// in the message instead of passing null further down.
string RequireSetting(string key)
{
    var value = builder.Configuration[key];
    if (string.IsNullOrWhiteSpace(value))
    {
        throw new InvalidOperationException($"Missing required configuration value '{key}'.");
    }

    return value;
}
使用示例 #
上传文档 #
bash
# Upload document.pdf from the current directory as the multipart form field "file".
curl -X POST "http://localhost:5000/api/document/upload" \
-H "Content-Type: multipart/form-data" \
-F "file=@document.pdf"
提问 #
bash
# Ask a question as a JSON body; "topK" is set to 5 and no "minScore" is sent.
curl -X POST "http://localhost:5000/api/qa/ask" \
-H "Content-Type: application/json" \
-d '{"question": "什么是 Semantic Kernel?", "topK": 5}'
响应示例 #
json
{
"question": "什么是 Semantic Kernel?",
"answer": "Semantic Kernel 是微软开源的轻量级 SDK,用于帮助开发者将大语言模型集成到应用程序中 [1][2]。",
"sources": [
{
"documentId": "doc-123",
"content": "Semantic Kernel 是微软开源的轻量级 SDK...",
"relevance": 0.92
}
]
}
最佳实践 #
1. 文档分块优化 #
csharp
/// <summary>
/// Picks a chunking strategy according to the kind of document.
/// </summary>
public class SmartChunker
{
    /// <summary>
    /// Splits <paramref name="text"/> with the strategy that matches <paramref name="type"/>.
    /// </summary>
    /// <param name="text">Text to split.</param>
    /// <param name="type">Kind of document the text comes from.</param>
    /// <returns>The chunks produced by the selected strategy.</returns>
    public List<DocumentChunk> Chunk(string text, DocumentType type)
    {
        switch (type)
        {
            case DocumentType.Technical:
                return ChunkBySection(text);

            case DocumentType.FAQ:
                return ChunkByQuestion(text);

            case DocumentType.Article:
                return ChunkByParagraph(text);

            default:
                // Every other document type falls back to size-based chunking.
                return ChunkBySize(text);
        }
    }
}
2. 混合检索 #
csharp
/// <summary>
/// Runs a vector search and then a keyword search for the same query and returns
/// the merged, re-ranked result list.
/// </summary>
/// <param name="query">The search query passed to both searches.</param>
/// <returns>The combined results after merging and re-ranking.</returns>
public async Task<List<SearchResult>> HybridSearchAsync(string query)
{
    var semanticHits = await VectorSearchAsync(query);
    var lexicalHits = await KeywordSearchAsync(query);

    var combined = MergeAndRerank(semanticHits, lexicalHits);
    return combined;
}
下一步 #
现在你已经掌握了文档问答系统,接下来学习 AI Agent 开发,构建自主决策的 AI 智能体!
最后更新:2026-04-04