文档问答系统 #

项目概述 #

本节将构建一个企业级文档问答系统,具备以下功能:

  • 多格式文档支持(PDF、Word、TXT、Markdown)
  • 智能文档分块
  • 向量化存储与检索
  • 精准问答生成
  • 来源追溯

架构设计 #

text
┌─────────────────────────────────────────────────────────────┐
│                    文档问答系统架构                          │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  文档导入流程:                                              │
│  ┌─────────┐    ┌─────────┐    ┌─────────┐    ┌─────────┐  │
│  │ 文档上传 │───▶│ 文档解析 │───▶│ 文本分块 │───▶│ 向量存储 │  │
│  └─────────┘    └─────────┘    └─────────┘    └─────────┘  │
│                                                             │
│  问答流程:                                                  │
│  ┌─────────┐    ┌─────────┐    ┌─────────┐    ┌─────────┐  │
│  │ 用户提问 │───▶│ 向量检索 │───▶│ 构建提示 │───▶│ 生成回答 │  │
│  └─────────┘    └─────────┘    └─────────┘    └─────────┘  │
│                                                             │
└─────────────────────────────────────────────────────────────┘

项目结构 #

text
DocumentQA/
├── Controllers/
│   ├── DocumentController.cs
│   └── QAController.cs
├── Services/
│   ├── IDocumentService.cs
│   ├── DocumentService.cs
│   ├── IQAService.cs
│   ├── QAService.cs
│   ├── IEmbeddingService.cs
│   └── EmbeddingService.cs
├── Processors/
│   ├── IDocumentProcessor.cs
│   ├── PdfProcessor.cs
│   ├── WordProcessor.cs
│   └── TextProcessor.cs
├── Models/
│   ├── Document.cs
│   ├── DocumentChunk.cs
│   ├── QAResponse.cs
│   └── SearchResult.cs
└── Program.cs

核心代码实现 #

1. 文档模型 #

csharp
/// <summary>
/// A document that has been imported into the question-answering system.
/// </summary>
public class Document
{
    /// <summary>Unique identifier; defaults to a freshly generated GUID string.</summary>
    public string Id { get; set; } = Guid.NewGuid().ToString();

    /// <summary>Title of the document.</summary>
    public string Title { get; set; } = string.Empty;

    /// <summary>Text content of the document; empty by default.</summary>
    public string Content { get; set; } = string.Empty;

    /// <summary>Where the document came from (for example, the uploaded file name).</summary>
    public string Source { get; set; } = string.Empty;

    /// <summary>File type identifier, such as a file extension.</summary>
    public string FileType { get; set; } = string.Empty;

    /// <summary>Size of the source file.</summary>
    public long FileSize { get; set; }

    /// <summary>Creation timestamp; defaults to the UTC time at which the instance was constructed.</summary>
    public DateTime CreatedAt { get; set; } = DateTime.UtcNow;

    /// <summary>Descriptive metadata; defaults to an empty <see cref="DocumentMetadata"/>.</summary>
    public DocumentMetadata Metadata { get; set; } = new DocumentMetadata();
}

/// <summary>
/// One piece of a document's text, the unit that is embedded and stored for retrieval.
/// </summary>
public class DocumentChunk
{
    /// <summary>Unique identifier of the chunk; defaults to a freshly generated GUID string.</summary>
    public string Id { get; set; } = Guid.NewGuid().ToString();

    /// <summary>Identifier of the document this chunk belongs to.</summary>
    public string DocumentId { get; set; } = string.Empty;

    /// <summary>Text carried by this chunk.</summary>
    public string Content { get; set; } = string.Empty;

    /// <summary>Ordinal of the chunk within its document.</summary>
    public int ChunkIndex { get; set; }

    /// <summary>Position at which the chunk starts.</summary>
    public int StartPosition { get; set; }

    /// <summary>Position at which the chunk ends.</summary>
    public int EndPosition { get; set; }

    /// <summary>Embedding vector for <see cref="Content"/>; <c>null</c> until one is assigned.</summary>
    public float[]? Embedding { get; set; }
}

/// <summary>
/// Optional descriptive information attached to a document.
/// </summary>
public class DocumentMetadata
{
    /// <summary>Author of the document; empty by default.</summary>
    public string Author { get; set; } = string.Empty;

    /// <summary>Category of the document; empty by default.</summary>
    public string Category { get; set; } = string.Empty;

    /// <summary>Tags attached to the document; an empty list by default.</summary>
    public List<string> Tags { get; set; } = new List<string>();
}

2. 文档处理器 #

csharp
/// <summary>
/// Converts an uploaded file of a particular type into plain text.
/// </summary>
public interface IDocumentProcessor
{
    /// <summary>
    /// Reports whether this processor handles the given file type.
    /// </summary>
    /// <param name="fileType">File type identifier; the implementations in this file compare it against extensions such as ".pdf".</param>
    /// <returns><c>true</c> when this processor can extract text from that file type.</returns>
    bool CanProcess(string fileType);

    /// <summary>
    /// Extracts the text of the document read from <paramref name="content"/>.
    /// </summary>
    /// <param name="content">Stream holding the raw file content.</param>
    /// <returns>A task producing the extracted text.</returns>
    Task<string> ProcessAsync(Stream content);
}

/// <summary>
/// Extracts plain text from PDF files, page by page.
/// </summary>
public class PdfProcessor : IDocumentProcessor
{
    /// <summary>
    /// Reports whether <paramref name="fileType"/> is the ".pdf" extension (case-insensitive).
    /// </summary>
    /// <param name="fileType">File extension including the leading dot.</param>
    /// <returns><c>true</c> for ".pdf"; otherwise <c>false</c>.</returns>
    public bool CanProcess(string fileType) =>
        string.Equals(fileType, ".pdf", StringComparison.OrdinalIgnoreCase);

    /// <summary>
    /// Reads the PDF from <paramref name="content"/> and returns the text of every page,
    /// each page followed by a line break.
    /// </summary>
    /// <param name="content">Stream containing the PDF file.</param>
    /// <returns>A completed task holding the extracted text, or a faulted task if parsing fails.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="content"/> is <c>null</c>.</exception>
    public Task<string> ProcessAsync(Stream content)
    {
        ArgumentNullException.ThrowIfNull(content);

        // The extraction is purely synchronous, so the method is not marked async
        // (an async method without await only adds a state machine and warning CS1998).
        // Failures are still reported through the returned task, as they were before.
        try
        {
            using var document = PdfDocument.Open(content);
            var text = new StringBuilder();

            foreach (var page in document.GetPages())
            {
                text.AppendLine(page.Text);
            }

            return Task.FromResult(text.ToString());
        }
        catch (Exception ex)
        {
            return Task.FromException<string>(ex);
        }
    }
}

/// <summary>
/// Extracts plain text from Word (.docx) files.
/// </summary>
public class WordProcessor : IDocumentProcessor
{
    /// <summary>
    /// Reports whether <paramref name="fileType"/> is the ".docx" extension (case-insensitive).
    /// </summary>
    /// <param name="fileType">File extension including the leading dot.</param>
    /// <returns><c>true</c> for ".docx"; otherwise <c>false</c>.</returns>
    public bool CanProcess(string fileType) =>
        string.Equals(fileType, ".docx", StringComparison.OrdinalIgnoreCase);

    /// <summary>
    /// Opens the document read-only and returns the text of its body, with one line per
    /// top-level body element.
    /// </summary>
    /// <param name="content">Stream containing the .docx package.</param>
    /// <returns>
    /// A completed task holding the extracted text (empty when the document has no body),
    /// or a faulted task if the package cannot be read.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="content"/> is <c>null</c>.</exception>
    public Task<string> ProcessAsync(Stream content)
    {
        ArgumentNullException.ThrowIfNull(content);

        // Extraction is synchronous, so the method is not marked async; failures are
        // still reported through the returned task.
        try
        {
            using var document = WordprocessingDocument.Open(content, false);
            var body = document.MainDocumentPart?.Document.Body;
            if (body is null)
            {
                return Task.FromResult("");
            }

            // Body.InnerText concatenates every paragraph with no separator, which fuses
            // the last word of one paragraph with the first word of the next. Joining the
            // top-level elements with '\n' keeps the same characters but preserves the
            // paragraph boundaries.
            var blocks = body.ChildElements
                .Select(element => element.InnerText)
                .Where(blockText => !string.IsNullOrEmpty(blockText));

            // string.Join enumerates immediately, i.e. before the document is disposed.
            return Task.FromResult(string.Join("\n", blocks));
        }
        catch (Exception ex)
        {
            return Task.FromException<string>(ex);
        }
    }
}

/// <summary>
/// Reads plain-text and Markdown files as-is.
/// </summary>
public class TextProcessor : IDocumentProcessor
{
    // Extensions handled by this processor; compared case-insensitively.
    private static readonly string[] SupportedExtensions = { ".txt", ".md" };

    /// <summary>
    /// Reports whether <paramref name="fileType"/> is ".txt" or ".md" (case-insensitive).
    /// </summary>
    /// <param name="fileType">File extension including the leading dot.</param>
    /// <returns><c>true</c> for a supported extension; otherwise <c>false</c>.</returns>
    public bool CanProcess(string fileType) =>
        SupportedExtensions.Contains(fileType, StringComparer.OrdinalIgnoreCase);

    /// <summary>
    /// Reads the remaining content of <paramref name="content"/> as text.
    /// </summary>
    /// <param name="content">Stream containing the text file. It is left open for the caller.</param>
    /// <returns>A task producing the text read from the stream.</returns>
    public async Task<string> ProcessAsync(Stream content)
    {
        ArgumentNullException.ThrowIfNull(content);

        // leaveOpen: the stream belongs to the caller. Disposing it here would make any
        // later use of the stream (for example reading its Length) throw
        // ObjectDisposedException.
        using var reader = new StreamReader(content, leaveOpen: true);
        return await reader.ReadToEndAsync();
    }
}

3. 文档服务 #

csharp
/// <summary>
/// Imports documents: extracts their text, splits it into overlapping chunks, embeds
/// each chunk and stores it in the vector store.
/// </summary>
public class DocumentService : IDocumentService
{
    // Name of the vector-store collection that holds all chunks.
    private const string CollectionName = "documents";

    // Target maximum chunk length and the number of trailing characters repeated at the
    // start of the next chunk, both in UTF-16 code units.
    private const int ChunkSize = 500;
    private const int ChunkOverlap = 50;

    // Sentence terminators: CJK full stop, full-width and ASCII '!' / '?', ASCII '.', and line break.
    private static readonly char[] SentenceDelimiters = { '。', '!', '?', '.', '!', '?', '\n' };

    private readonly IEnumerable<IDocumentProcessor> _processors;
    private readonly ITextEmbeddingGenerationService _embeddingService;
    private readonly IMemoryStore _memoryStore;
    private readonly ILogger<DocumentService> _logger;

    /// <summary>
    /// Creates the service with its dependencies (resolved by the DI container).
    /// </summary>
    /// <param name="processors">All registered document processors.</param>
    /// <param name="embeddingService">Service used to embed chunk text.</param>
    /// <param name="memoryStore">Vector store that receives the chunk records.</param>
    /// <param name="logger">Logger for this service.</param>
    public DocumentService(
        IEnumerable<IDocumentProcessor> processors,
        ITextEmbeddingGenerationService embeddingService,
        IMemoryStore memoryStore,
        ILogger<DocumentService> logger)
    {
        _processors = processors ?? throw new ArgumentNullException(nameof(processors));
        _embeddingService = embeddingService ?? throw new ArgumentNullException(nameof(embeddingService));
        _memoryStore = memoryStore ?? throw new ArgumentNullException(nameof(memoryStore));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Extracts the text of an uploaded file, chunks it and indexes the chunks.
    /// </summary>
    /// <param name="content">Stream containing the file.</param>
    /// <param name="fileName">Original file name; used for the title and source.</param>
    /// <param name="fileType">File extension used to pick a processor.</param>
    /// <param name="metadata">Optional metadata; an empty instance is used when omitted.</param>
    /// <returns>The descriptor of the imported document.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="content"/> is <c>null</c>.</exception>
    /// <exception cref="NotSupportedException">No registered processor accepts <paramref name="fileType"/>.</exception>
    public async Task<Document> ImportDocumentAsync(
        Stream content,
        string fileName,
        string fileType,
        DocumentMetadata? metadata = null)
    {
        ArgumentNullException.ThrowIfNull(content);

        var processor = _processors.FirstOrDefault(p => p.CanProcess(fileType));

        if (processor == null)
        {
            throw new NotSupportedException($"不支持的文件类型: {fileType}");
        }

        // Capture the size before processing: a processor may consume or dispose the
        // stream, after which Length is no longer readable. Non-seekable streams report 0.
        var fileSize = content.CanSeek ? content.Length : 0L;

        var text = await processor.ProcessAsync(content);

        var document = new Document
        {
            Title = Path.GetFileNameWithoutExtension(fileName),
            Source = fileName,
            FileType = fileType,
            FileSize = fileSize,
            Metadata = metadata ?? new DocumentMetadata()
        };

        var chunks = CreateChunks(document.Id, text);

        await IndexChunksAsync(chunks);

        _logger.LogInformation(
            "Imported document {DocumentId} from {FileName} as {ChunkCount} chunk(s)",
            document.Id,
            fileName,
            chunks.Count);

        return document;
    }

    /// <summary>
    /// Splits <paramref name="text"/> into chunks that end on sentence boundaries.
    /// Every chunk is a verbatim slice of the text, so
    /// <c>Content == text.Substring(StartPosition, EndPosition - StartPosition)</c>;
    /// consecutive chunks share up to <see cref="ChunkOverlap"/> characters.
    /// A single sentence longer than <see cref="ChunkSize"/> is kept whole.
    /// </summary>
    private List<DocumentChunk> CreateChunks(string documentId, string text)
    {
        var chunks = new List<DocumentChunk>();

        if (string.IsNullOrWhiteSpace(text))
        {
            return chunks;
        }

        var chunkStart = 0;
        var chunkEnd = 0;

        foreach (var sentenceEnd in FindSentenceEnds(text))
        {
            var chunkLength = chunkEnd - chunkStart;
            var sentenceLength = sentenceEnd - chunkEnd;

            // Close the current chunk when the next sentence would overflow it.
            if (chunkLength > 0 && chunkLength + sentenceLength > ChunkSize)
            {
                AddChunk(chunks, documentId, text, chunkStart, chunkEnd);
                chunkStart = GetOverlapStart(text, chunkStart, chunkEnd);
            }

            chunkEnd = sentenceEnd;
        }

        if (chunkEnd > chunkStart)
        {
            AddChunk(chunks, documentId, text, chunkStart, chunkEnd);
        }

        return chunks;
    }

    /// <summary>
    /// Appends the slice [start, end) of <paramref name="text"/> as the next chunk,
    /// skipping slices that contain only whitespace.
    /// </summary>
    private static void AddChunk(
        List<DocumentChunk> chunks,
        string documentId,
        string text,
        int start,
        int end)
    {
        var content = text.Substring(start, end - start);

        if (string.IsNullOrWhiteSpace(content))
        {
            return;
        }

        chunks.Add(new DocumentChunk
        {
            DocumentId = documentId,
            Content = content,
            ChunkIndex = chunks.Count,
            StartPosition = start,
            EndPosition = end
        });
    }

    /// <summary>
    /// Embeds every chunk and upserts it into the vector store, creating the collection
    /// first when it does not exist yet.
    /// </summary>
    private async Task IndexChunksAsync(List<DocumentChunk> chunks)
    {
        if (chunks.Count == 0)
        {
            return;
        }

        // Upserting into a collection that was never created fails, so make sure it exists.
        if (!await _memoryStore.DoesCollectionExistAsync(CollectionName))
        {
            await _memoryStore.CreateCollectionAsync(CollectionName);
        }

        foreach (var chunk in chunks)
        {
            var embedding = await _embeddingService.GenerateEmbeddingAsync(chunk.Content);

            var record = new MemoryRecord(
                new MemoryRecordMetadata(
                    isReference: false,
                    id: chunk.Id,
                    text: chunk.Content,
                    description: $"Document: {chunk.DocumentId}, Chunk: {chunk.ChunkIndex}",
                    externalSourceName: "",
                    additionalMetadata: JsonSerializer.Serialize(new
                    {
                        chunk.DocumentId,
                        chunk.ChunkIndex,
                        chunk.StartPosition,
                        chunk.EndPosition
                    })
                ),
                embedding,
                null
            );

            await _memoryStore.UpsertAsync(CollectionName, record);
        }
    }

    /// <summary>
    /// Removes every stored chunk of the given document from the vector store.
    /// </summary>
    /// <param name="documentId">Identifier of the document to delete.</param>
    public async Task DeleteDocumentAsync(string documentId)
    {
        // NOTE(review): GetDocumentChunksAsync is not defined in the code shown here;
        // it must be supplied elsewhere and return the chunks (with their Id) of the document.
        var chunks = await GetDocumentChunksAsync(documentId);

        foreach (var chunk in chunks)
        {
            await _memoryStore.RemoveAsync(CollectionName, chunk.Id);
        }
    }

    /// <summary>
    /// Yields the exclusive end index of every sentence in <paramref name="text"/>.
    /// A sentence ends after a run of delimiter characters, or at the end of the text,
    /// so the delimiters stay attached to the sentence they terminate.
    /// </summary>
    private static IEnumerable<int> FindSentenceEnds(string text)
    {
        for (var i = 0; i < text.Length; i++)
        {
            var isLastChar = i == text.Length - 1;

            if (isLastChar || (IsDelimiter(text[i]) && !IsDelimiter(text[i + 1])))
            {
                yield return i + 1;
            }
        }
    }

    private static bool IsDelimiter(char c) => Array.IndexOf(SentenceDelimiters, c) >= 0;

    /// <summary>
    /// Returns the start index of the next chunk: at most <see cref="ChunkOverlap"/>
    /// characters before <paramref name="chunkEnd"/>, never before
    /// <paramref name="chunkStart"/>, and never in the middle of a surrogate pair.
    /// </summary>
    private static int GetOverlapStart(string text, int chunkStart, int chunkEnd)
    {
        var start = Math.Max(chunkStart, chunkEnd - ChunkOverlap);

        // Step back onto the high surrogate so the overlap does not begin with half a character.
        if (start > chunkStart && char.IsLowSurrogate(text[start]))
        {
            start--;
        }

        return start;
    }
}

4. 问答服务 #

csharp
/// <summary>
/// Answers questions by retrieving the most relevant document chunks from the vector
/// store and asking the language model to answer from them.
/// </summary>
public class QAService : IQAService
{
    // Name of the vector-store collection that holds all chunks.
    private const string CollectionName = "documents";

    // Maximum number of characters of a chunk shown in a source preview.
    private const int SourcePreviewLength = 200;

    private readonly Kernel _kernel;
    private readonly ITextEmbeddingGenerationService _embeddingService;
    private readonly IMemoryStore _memoryStore;
    private readonly ILogger<QAService> _logger;

    /// <summary>
    /// Creates the service with its dependencies (resolved by the DI container).
    /// </summary>
    /// <param name="kernel">Kernel used to invoke the answer prompt.</param>
    /// <param name="embeddingService">Service used to embed the question.</param>
    /// <param name="memoryStore">Vector store that is searched for matching chunks.</param>
    /// <param name="logger">Logger for this service.</param>
    public QAService(
        Kernel kernel,
        ITextEmbeddingGenerationService embeddingService,
        IMemoryStore memoryStore,
        ILogger<QAService> logger)
    {
        _kernel = kernel ?? throw new ArgumentNullException(nameof(kernel));
        _embeddingService = embeddingService ?? throw new ArgumentNullException(nameof(embeddingService));
        _memoryStore = memoryStore ?? throw new ArgumentNullException(nameof(memoryStore));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Answers <paramref name="question"/> from the indexed documents.
    /// </summary>
    /// <param name="question">The user's question.</param>
    /// <param name="topK">Maximum number of chunks to retrieve.</param>
    /// <param name="minScore">Minimum relevance score a chunk must reach to be used.</param>
    /// <returns>
    /// The answer together with the chunks it was based on; a fixed "not found" answer
    /// with no sources when nothing relevant was retrieved.
    /// </returns>
    /// <exception cref="ArgumentException"><paramref name="question"/> is null, empty or whitespace.</exception>
    public async Task<QAResponse> AskAsync(
        string question,
        int topK = 5,
        float minScore = 0.7f)
    {
        if (string.IsNullOrWhiteSpace(question))
        {
            throw new ArgumentException("Question must not be empty.", nameof(question));
        }

        var queryEmbedding = await _embeddingService.GenerateEmbeddingAsync(question);

        var searchResults = new List<SearchResult>();
        var results = _memoryStore.GetNearestMatchesAsync(
            CollectionName,
            queryEmbedding,
            limit: topK,
            minRelevanceScore: minScore
        );

        await foreach (var (record, score) in results)
        {
            var (documentId, chunkIndex) = ReadChunkMetadata(
                record.Metadata.AdditionalMetadata,
                record.Metadata.Id);

            searchResults.Add(new SearchResult
            {
                ChunkId = record.Metadata.Id,
                DocumentId = documentId,
                Content = record.Metadata.Text,
                Score = score,
                ChunkIndex = chunkIndex
            });
        }

        if (!searchResults.Any())
        {
            return new QAResponse
            {
                Question = question,
                Answer = "抱歉,我在知识库中没有找到相关信息。",
                Sources = new List<SourceInfo>()
            };
        }

        var answer = await GenerateAnswerAsync(question, searchResults);

        return new QAResponse
        {
            Question = question,
            Answer = answer,
            Sources = searchResults.Select(r => new SourceInfo
            {
                DocumentId = r.DocumentId,
                Content = r.Content.Length > SourcePreviewLength
                    ? r.Content.Substring(0, SourcePreviewLength) + "..."
                    : r.Content,
                Relevance = r.Score
            }).ToList()
        };
    }

    /// <summary>
    /// Reads the document id and chunk index from a record's additional-metadata JSON.
    /// The index is stored as a JSON number, so the payload cannot be deserialized into
    /// a <c>Dictionary&lt;string, string&gt;</c>; the values are read element by element
    /// instead. Missing or malformed values fall back to an empty id and index 0.
    /// </summary>
    private (string DocumentId, int ChunkIndex) ReadChunkMetadata(
        string? additionalMetadata,
        string chunkId)
    {
        if (string.IsNullOrWhiteSpace(additionalMetadata))
        {
            return ("", 0);
        }

        try
        {
            using var json = JsonDocument.Parse(additionalMetadata);
            var root = json.RootElement;

            if (root.ValueKind != JsonValueKind.Object)
            {
                return ("", 0);
            }

            var documentId = "";
            if (root.TryGetProperty("DocumentId", out var idElement) &&
                idElement.ValueKind == JsonValueKind.String)
            {
                documentId = idElement.GetString() ?? "";
            }

            var chunkIndex = 0;
            if (root.TryGetProperty("ChunkIndex", out var indexElement))
            {
                if (indexElement.ValueKind == JsonValueKind.Number)
                {
                    if (indexElement.TryGetInt32(out var number))
                    {
                        chunkIndex = number;
                    }
                }
                else if (indexElement.ValueKind == JsonValueKind.String &&
                         int.TryParse(
                             indexElement.GetString(),
                             System.Globalization.NumberStyles.Integer,
                             System.Globalization.CultureInfo.InvariantCulture,
                             out var parsed))
                {
                    // Tolerate an index that was written as a string.
                    chunkIndex = parsed;
                }
            }

            return (documentId, chunkIndex);
        }
        catch (JsonException ex)
        {
            _logger.LogWarning(ex, "Ignoring malformed metadata on chunk {ChunkId}", chunkId);
            return ("", 0);
        }
    }

    /// <summary>
    /// Builds the prompt from the numbered search results and the question, and returns
    /// the model's reply as text.
    /// </summary>
    private async Task<string> GenerateAnswerAsync(
        string question,
        List<SearchResult> searchResults)
    {
        // Number the passages so the model can cite them as [1], [2], ...
        var context = string.Join("\n\n---\n\n",
            searchResults.Select((r, i) => $"[{i + 1}] {r.Content}"));

        // NOTE(review): the rendered string is handed to InvokePromptAsync as a prompt
        // template, so document text or questions containing template markers such as
        // "{{" may be interpreted by the template engine - confirm, and consider passing
        // them as prompt arguments instead of interpolating them.
        var prompt = $"""
            你是一个专业的文档问答助手。请根据以下参考资料回答问题。
            
            要求:
            1. 回答必须基于参考资料
            2. 如果资料中没有相关信息,请明确说明
            3. 引用来源时使用 [编号] 格式
            4. 回答要准确、简洁
            
            参考资料:
            {context}
            
            问题:{question}
            
            回答:
            """;

        var result = await _kernel.InvokePromptAsync(prompt);
        return result.ToString();
    }
}

5. API 控制器 #

csharp
/// <summary>
/// HTTP endpoints for uploading and deleting documents.
/// </summary>
[ApiController]
[Route("api/[controller]")]
public class DocumentController : ControllerBase
{
    private readonly IDocumentService _documentService;

    /// <summary>
    /// Creates the controller with the document service resolved by the DI container.
    /// </summary>
    /// <param name="documentService">Service that imports and deletes documents.</param>
    public DocumentController(IDocumentService documentService)
    {
        _documentService = documentService ?? throw new ArgumentNullException(nameof(documentService));
    }

    /// <summary>
    /// Imports the uploaded file.
    /// </summary>
    /// <param name="file">The uploaded file (multipart form field "file").</param>
    /// <returns>
    /// 200 with the imported document; 400 when no file was sent; 415 when the file
    /// type is not supported.
    /// </returns>
    [HttpPost("upload")]
    public async Task<ActionResult<Document>> Upload(IFormFile file)
    {
        if (file == null || file.Length == 0)
        {
            return BadRequest("请选择文件");
        }

        var fileType = Path.GetExtension(file.FileName);

        await using var stream = file.OpenReadStream();

        try
        {
            var document = await _documentService.ImportDocumentAsync(
                stream,
                file.FileName,
                fileType
            );

            return Ok(document);
        }
        catch (NotSupportedException ex)
        {
            // An unsupported file type is a client error, not a server failure.
            return StatusCode(StatusCodes.Status415UnsupportedMediaType, ex.Message);
        }
    }

    /// <summary>
    /// Deletes the document with the given id.
    /// </summary>
    /// <param name="id">Identifier of the document to delete.</param>
    /// <returns>204 once the deletion has completed.</returns>
    [HttpDelete("{id}")]
    public async Task<IActionResult> Delete(string id)
    {
        await _documentService.DeleteDocumentAsync(id);
        return NoContent();
    }
}

/// <summary>
/// HTTP endpoint for asking questions against the indexed documents.
/// </summary>
[ApiController]
[Route("api/[controller]")]
public class QAController : ControllerBase
{
    private readonly IQAService _qaService;

    /// <summary>
    /// Creates the controller with the question-answering service resolved by the DI container.
    /// </summary>
    /// <param name="qaService">Service that answers questions.</param>
    public QAController(IQAService qaService)
    {
        _qaService = qaService ?? throw new ArgumentNullException(nameof(qaService));
    }

    /// <summary>
    /// Answers the question in the request body.
    /// </summary>
    /// <param name="request">
    /// The question plus optional retrieval settings; TopK defaults to 5 and MinScore to 0.7.
    /// </param>
    /// <returns>200 with the answer and its sources; 400 when the question is empty.</returns>
    [HttpPost("ask")]
    public async Task<ActionResult<QAResponse>> Ask([FromBody] QuestionRequest request)
    {
        // Reject empty questions here instead of spending an embedding call on them.
        if (request is null || string.IsNullOrWhiteSpace(request.Question))
        {
            return BadRequest("问题不能为空");
        }

        var response = await _qaService.AskAsync(
            request.Question,
            request.TopK ?? 5,
            request.MinScore ?? 0.7f
        );
        return Ok(response);
    }
}

6. 程序配置 #

csharp
// Application entry point: registers MVC, Swagger, the Semantic Kernel services,
// the vector store, the document processors and the application services.
var builder = WebApplication.CreateBuilder(args);

// Reads a mandatory configuration value and fails at startup with a message naming
// the missing key, instead of passing null on to the SDK.
string RequireSetting(string key) =>
    builder.Configuration[key]
    ?? throw new InvalidOperationException($"Missing required configuration value '{key}'.");

// Vector size the Qdrant collection is created with; it must match the embedding model's output size.
const int EmbeddingDimensions = 1536;

builder.Services.AddControllers();
builder.Services.AddEndpointsApiExplorer();
builder.Services.AddSwaggerGen();

builder.Services.AddKernel()
    .AddAzureOpenAIChatCompletion(
        deploymentName: RequireSetting("AzureOpenAI:DeploymentName"),
        endpoint: RequireSetting("AzureOpenAI:Endpoint"),
        apiKey: RequireSetting("AzureOpenAI:ApiKey")
    )
    .AddOpenAITextEmbeddingGeneration(
        modelId: "text-embedding-3-small",
        apiKey: RequireSetting("OpenAI:ApiKey")
    );

// QdrantMemoryStore takes the endpoint as a single URL (scheme, host and port) followed
// by the vector size. The endpoint can be overridden through configuration.
var qdrantEndpoint = builder.Configuration["Qdrant:Endpoint"] ?? "http://localhost:6333";

builder.Services.AddSingleton<IMemoryStore, QdrantMemoryStore>(sp =>
    new QdrantMemoryStore(qdrantEndpoint, EmbeddingDimensions));

// Every processor is registered under the same interface so that the document service
// receives all of them as an IEnumerable<IDocumentProcessor>.
builder.Services.AddSingleton<IDocumentProcessor, PdfProcessor>();
builder.Services.AddSingleton<IDocumentProcessor, WordProcessor>();
builder.Services.AddSingleton<IDocumentProcessor, TextProcessor>();

builder.Services.AddScoped<IDocumentService, DocumentService>();
builder.Services.AddScoped<IQAService, QAService>();

var app = builder.Build();

// Swagger is exposed in development only.
if (app.Environment.IsDevelopment())
{
    app.UseSwagger();
    app.UseSwaggerUI();
}

app.MapControllers();
app.Run();

使用示例 #

上传文档 #

bash
curl -X POST "http://localhost:5000/api/document/upload" \
  -H "Content-Type: multipart/form-data" \
  -F "file=@document.pdf"

提问 #

bash
curl -X POST "http://localhost:5000/api/qa/ask" \
  -H "Content-Type: application/json" \
  -d '{"question": "什么是 Semantic Kernel?", "topK": 5}'

响应示例 #

json
{
    "question": "什么是 Semantic Kernel?",
    "answer": "Semantic Kernel 是微软开源的轻量级 SDK,用于帮助开发者将大语言模型集成到应用程序中 [1][2]。",
    "sources": [
        {
            "documentId": "doc-123",
            "content": "Semantic Kernel 是微软开源的轻量级 SDK...",
            "relevance": 0.92
        }
    ]
}

最佳实践 #

1. 文档分块优化 #

csharp
/// <summary>
/// Picks a chunking strategy according to the kind of document.
/// </summary>
public class SmartChunker
{
    /// <summary>
    /// Splits <paramref name="text"/> using the strategy that corresponds to <paramref name="type"/>.
    /// </summary>
    /// <param name="text">The document text to split.</param>
    /// <param name="type">The kind of document, which selects the strategy.</param>
    /// <returns>The chunks produced by the selected strategy.</returns>
    public List<DocumentChunk> Chunk(string text, DocumentType type)
    {
        // NOTE(review): the ChunkBy* helpers are not part of this snippet.
        switch (type)
        {
            case DocumentType.Technical:
                return ChunkBySection(text);

            case DocumentType.FAQ:
                return ChunkByQuestion(text);

            case DocumentType.Article:
                return ChunkByParagraph(text);

            default:
                // Every other document type falls back to size-based chunking.
                return ChunkBySize(text);
        }
    }
}

2. 混合检索 #

csharp
/// <summary>
/// Runs a vector search and then a keyword search for <paramref name="query"/> and
/// returns the merged, re-ranked results.
/// </summary>
/// <param name="query">The search query.</param>
/// <returns>The combined result list produced by <c>MergeAndRerank</c>.</returns>
public async Task<List<SearchResult>> HybridSearchAsync(string query)
{
    // Arguments are evaluated left to right, so the vector search still completes
    // before the keyword search starts.
    return MergeAndRerank(
        await VectorSearchAsync(query),
        await KeywordSearchAsync(query));
}

下一步 #

现在你已经掌握了文档问答系统,接下来学习 AI Agent 开发,构建自主决策的 AI 智能体!

最后更新:2026-04-04