多模态 RAG #

概述 #

多模态 RAG 扩展了传统 RAG 的能力,支持图像、文本等多种模态的检索和生成,为更丰富的应用场景提供支持。

text
┌─────────────────────────────────────────────────────────────┐
│                    多模态 RAG 架构                           │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│   输入                                                       │
│   ┌─────────┐ ┌─────────┐ ┌─────────┐                      │
│   │  文本   │ │  图像   │ │  表格   │                      │
│   └─────────┘ └─────────┘ └─────────┘                      │
│        │           │           │                             │
│        ▼           ▼           ▼                             │
│   ┌─────────────────────────────────────────────────────┐  │
│   │              多模态 Embedding                        │  │
│   │  Text Embedding │ Image Embedding │ Table Embedding │  │
│   └─────────────────────────────────────────────────────┘  │
│                           │                                  │
│                           ▼                                  │
│   ┌─────────────────────────────────────────────────────┐  │
│   │              多模态 Vector Store                    │  │
│   └─────────────────────────────────────────────────────┘  │
│                           │                                  │
│                           ▼                                  │
│   ┌─────────────────────────────────────────────────────┐  │
│   │              多模态 LLM 生成                        │  │
│   │              GPT-4V / Claude 3                      │  │
│   └─────────────────────────────────────────────────────┘  │
│                                                             │
└─────────────────────────────────────────────────────────────┘

图像处理 #

加载图像文档 #

python
from llama_index.core import SimpleDirectoryReader
from llama_index.readers.file import ImageReader

# Load every image in the directory; SimpleDirectoryReader dispatches
# files to an image-capable reader based on the extension filter below.
reader = SimpleDirectoryReader(
    input_dir="./images",
    required_exts=[".jpg", ".jpeg", ".png"],
)

# One Document per image file.
documents = reader.load_data()

使用 ImageNode #

python
from llama_index.core.schema import ImageNode, TextNode

# An ImageNode pairs an image file with an optional text caption.
image_node = ImageNode(
    image_path="./images/photo.jpg",
    text="图像描述",
)

# A plain TextNode holds text-only content alongside the image node.
text_node = TextNode(
    text="相关文本内容",
)

多模态 Embedding #

CLIP Embedding #

bash
pip install llama-index-embeddings-clip
python
from llama_index.embeddings.clip import ClipEmbedding

# CLIP maps text and images into one shared vector space,
# which is what makes cross-modal similarity search possible.
clip_embed = ClipEmbedding()

text_embedding = clip_embed.get_text_embedding("一只猫")

image_embedding = clip_embed.get_image_embedding("./images/cat.jpg")

# Both embeddings share the same dimensionality (e.g. 512 for ViT-B/32).
print(f"文本向量维度: {len(text_embedding)}")
print(f"图像向量维度: {len(image_embedding)}")

多模态索引 #

python
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.schema import ImageNode, TextNode
from llama_index.embeddings.clip import ClipEmbedding

clip_embed = ClipEmbedding()

# Text and image nodes can live in the same index because CLIP
# embeds both modalities into the same vector space.
text_nodes = [
    TextNode(text="这是一只猫的图片"),
    TextNode(text="这是一只狗的图片"),
]

image_nodes = [
    ImageNode(image_path="./images/cat.jpg"),
    ImageNode(image_path="./images/dog.jpg"),
]

all_nodes = text_nodes + image_nodes

# Build directly from nodes (not from_documents) since the nodes
# are already constructed; CLIP is used for all embeddings.
index = VectorStoreIndex(all_nodes, embed_model=clip_embed)

GPT-4V / GPT-4o 集成 #

图像理解 #

python
from llama_index.llms.openai import OpenAI
from llama_index.core.schema import ImageNode
from llama_index.core import SimpleDirectoryReader

# gpt-4o is OpenAI's vision-capable model.
llm = OpenAI(model="gpt-4o")

image_node = ImageNode(image_path="./images/chart.png")

# NOTE(review): passing image_nodes to OpenAI.complete assumes a
# multimodal-aware wrapper — confirm against the installed llama-index
# version (older releases expose this via OpenAIMultiModal instead).
response = llm.complete(
    "请描述这张图片的内容:",
    image_nodes=[image_node],
)

print(response.text)

多模态查询引擎 #

python
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.schema import ImageNode

llm = OpenAI(model="gpt-4o")
embed_model = OpenAIEmbedding()

# Build a text-only index; images are handled separately by the helper below.
documents = SimpleDirectoryReader("./data").load_data()
text_index = VectorStoreIndex.from_documents(documents)

# Example image node (the query helper builds its own per call).
image_node = ImageNode(image_path="./images/diagram.png")

def multimodal_query(query: str, image_path: str = None) -> str:
    """Answer a question, optionally grounded in an image.

    If an image path is given, the vision LLM answers from the image;
    otherwise the question is routed through the text index.

    Args:
        query: natural-language question.
        image_path: optional path to an image to ground the answer in.

    Returns:
        The answer as a string in both branches.
    """
    if image_path:
        image_node = ImageNode(image_path=image_path)

        # NOTE(review): image_nodes support on llm.complete depends on the
        # installed llama-index version — confirm before shipping.
        response = llm.complete(
            f"基于图片和以下问题回答:\n问题:{query}",
            image_nodes=[image_node],
        )
        return response.text

    # Text-only path: wrap in str() so both branches return the same type
    # (query() yields a Response object, not a string).
    query_engine = text_index.as_query_engine(llm=llm)
    return str(query_engine.query(query))

# Demo: ask a question about a chart image.
response = multimodal_query("这张图说明了什么?", "./images/chart.png")
print(response)

多模态 RAG 实现 #

完整实现 #

python
import os
from typing import List, Optional
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.schema import ImageNode, TextNode, NodeWithScore
from llama_index.core.retrievers import BaseRetriever
from llama_index.core import QueryBundle

class MultiModalRetriever(BaseRetriever):
    """Retriever over a text index that also carries a pool of image nodes.

    NOTE(review): the image pool is stored but not yet searched;
    _retrieve currently returns text hits only.
    """

    def __init__(
        self,
        text_index: VectorStoreIndex,
        image_nodes: List[ImageNode],
        similarity_top_k: int = 3,
    ):
        """
        Args:
            text_index: vector index over the text corpus.
            image_nodes: candidate image nodes (not yet used in retrieval).
            similarity_top_k: number of text hits to return.
        """
        self.text_index = text_index
        self.image_nodes = image_nodes
        self.similarity_top_k = similarity_top_k
        # BaseRetriever.__init__ wires up the callback manager and shared
        # state; the original skipped it, which breaks retrieve() on
        # current llama-index versions.
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Return the top-k text nodes for the query.

        TODO: blend in image results scored against self.image_nodes.
        """
        text_retriever = self.text_index.as_retriever(
            similarity_top_k=self.similarity_top_k
        )
        return text_retriever.retrieve(query_bundle)

class MultiModalRAG:
    """Multimodal RAG: text retrieval plus vision-LLM answers over images.

    Text documents are indexed for retrieval; images are kept as a flat
    pool of ImageNodes and handed to the vision LLM on demand.
    """

    def __init__(
        self,
        data_dir: str = "./data",
        image_dir: str = "./images",
    ):
        """
        Args:
            data_dir: directory of text documents to index.
            image_dir: directory of .jpg/.jpeg/.png images to register.
        """
        self.data_dir = data_dir
        self.image_dir = image_dir

        self.llm = OpenAI(model="gpt-4o")  # vision-capable model
        self.embed_model = OpenAIEmbedding()

        # Register as process-wide defaults so index construction uses them.
        Settings.llm = self.llm
        Settings.embed_model = self.embed_model

        # Populated by initialize(); queries fail fast until then.
        self.text_index: Optional[VectorStoreIndex] = None
        self.image_nodes: List[ImageNode] = []

    def _require_index(self) -> VectorStoreIndex:
        """Return the text index, raising a clear error if uninitialized.

        The original code crashed with an opaque AttributeError on None
        when query() was called before initialize().
        """
        if self.text_index is None:
            raise RuntimeError("call initialize() before querying")
        return self.text_index

    def initialize(self):
        """Index the text corpus and register every image in image_dir."""
        text_documents = SimpleDirectoryReader(self.data_dir).load_data()
        self.text_index = VectorStoreIndex.from_documents(text_documents)
        print(f"加载了 {len(text_documents)} 个文本文档")

        if os.path.exists(self.image_dir):
            for filename in os.listdir(self.image_dir):
                if filename.lower().endswith((".jpg", ".jpeg", ".png")):
                    image_path = os.path.join(self.image_dir, filename)
                    self.image_nodes.append(ImageNode(image_path=image_path))
            print(f"加载了 {len(self.image_nodes)} 个图像")

        print("多模态 RAG 初始化完成")

    def query(self, question: str, image_path: Optional[str] = None) -> str:
        """Answer a question; when image_path exists, ground it in the image.

        Raises:
            RuntimeError: if initialize() has not been called.
        """
        index = self._require_index()

        if image_path and os.path.exists(image_path):
            image_node = ImageNode(image_path=image_path)

            # Retrieve supporting text context for the question.
            text_retriever = index.as_retriever(similarity_top_k=3)
            context_nodes = text_retriever.retrieve(question)

            context_text = "\n".join([node.text for node in context_nodes])

            prompt = f"""基于以下上下文和图片回答问题:

上下文:
{context_text}

问题:{question}

请结合图片和上下文给出详细回答。"""

            # NOTE(review): image_nodes support on llm.complete depends on
            # the installed llama-index version — confirm before shipping.
            response = self.llm.complete(prompt, image_nodes=[image_node])
            return response.text

        query_engine = index.as_query_engine(llm=self.llm)
        response = query_engine.query(question)
        return str(response)

    def describe_image(self, image_path: str) -> str:
        """Return a vision-LLM description of the image at image_path."""
        if not os.path.exists(image_path):
            return "图像文件不存在"

        image_node = ImageNode(image_path=image_path)

        response = self.llm.complete(
            "请详细描述这张图片的内容:",
            image_nodes=[image_node],
        )

        return response.text

    def search_images(self, query: str) -> List[str]:
        """Return paths of registered images whose description mentions query.

        NOTE(review): issues one LLM call per image on every search —
        cache descriptions (or embed them) for anything beyond a demo.
        """
        relevant_images = []
        query_lower = query.lower()  # hoist the loop-invariant lowercase

        for image_node in self.image_nodes:
            description = self.describe_image(image_node.image_path)

            # Naive substring match against the generated description.
            if query_lower in description.lower():
                relevant_images.append(image_node.image_path)

        return relevant_images

def main():
    """Interactive REPL for the multimodal RAG system."""
    rag = MultiModalRAG(
        data_dir="./data",
        image_dir="./images",
    )
    rag.initialize()

    print("\n=== 多模态 RAG 系统 ===")
    print("输入 'quit' 退出")
    print("输入 'image:路径 问题' 进行图文查询\n")

    while True:
        try:
            user_input = input("问题: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Treat Ctrl-D / Ctrl-C like 'quit' instead of a traceback.
            break

        if user_input.lower() == "quit":
            break

        if not user_input:
            continue

        if user_input.startswith("image:"):
            # Syntax: "image:<path> <question>" — split off the first token.
            parts = user_input.split(" ", 1)
            # Strip only the leading "image:" tag; str.replace would also
            # mangle any later "image:" occurrence inside the path itself.
            image_path = parts[0][len("image:"):]
            question = parts[1] if len(parts) > 1 else "描述这张图片"

            print(f"\n回答: {rag.query(question, image_path)}\n")
        else:
            print(f"\n回答: {rag.query(user_input)}\n")

if __name__ == "__main__":
    main()

表格处理 #

表格解析 #

python
from llama_index.core import SimpleDirectoryReader
from llama_index.readers.file import CSVReader, PandasCSVReader

# CSVReader turns each CSV file into plain-text documents.
csv_reader = CSVReader()
documents = csv_reader.load_data(file="./data/table.csv")

# PandasCSVReader parses via pandas, preserving more tabular structure.
# NOTE(review): this overwrites the documents loaded above — in real code
# pick one reader (or use distinct variable names).
pandas_reader = PandasCSVReader()
documents = pandas_reader.load_data(file="./data/table.csv")

表格理解 #

python
from llama_index.llms.openai import OpenAI
import pandas as pd

llm = OpenAI(model="gpt-4o")

df = pd.read_csv("./data/sales.csv")

# Render the entire table into the prompt. Fine for small tables, but
# large frames can exceed the context window — sample or aggregate first.
table_summary = llm.complete(f"""
请分析以下表格数据:

{df.to_string()}

总结关键发现和趋势:
""")

print(table_summary.text)

PDF 多模态处理 #

使用 LlamaParse #

bash
pip install llama-parse
python
from llama_parse import LlamaParse
from llama_index.core import VectorStoreIndex

# LlamaParse is a hosted parsing service: it needs an API key and returns
# layout-aware markdown, which keeps tables and headings intact.
parser = LlamaParse(
    api_key="your-llama-parse-key",
    result_type="markdown",
    verbose=True,
)

documents = parser.load_data("./document.pdf")

index = VectorStoreIndex.from_documents(documents)

PDF 图像提取 #

python
import os

import fitz
from llama_index.core.schema import ImageNode, TextNode

def extract_pdf_content(pdf_path: str):
    """Split a PDF into per-page text nodes and extracted image nodes.

    Embedded images are written under ./extracted_images/ and referenced
    by path from the returned ImageNodes.

    Args:
        pdf_path: path to the PDF file.

    Returns:
        (text_nodes, image_nodes) tuple of lists.
    """
    output_dir = "./extracted_images"
    # The original assumed this directory already existed and crashed
    # with FileNotFoundError on the first image otherwise.
    os.makedirs(output_dir, exist_ok=True)

    text_nodes = []
    image_nodes = []

    doc = fitz.open(pdf_path)
    try:
        for page_num, page in enumerate(doc):
            text = page.get_text()
            if text.strip():
                text_nodes.append(TextNode(
                    text=text,
                    metadata={"page": page_num + 1},  # 1-based page numbers
                ))

            for img_index, img in enumerate(page.get_images()):
                xref = img[0]  # first tuple element is the image xref
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]

                # NOTE(review): bytes are saved verbatim under a .png name
                # even when the embedded format differs (base_image["ext"]
                # carries the real extension) — confirm before relying on it.
                image_path = f"{output_dir}/page{page_num}_img{img_index}.png"
                with open(image_path, "wb") as f:
                    f.write(image_bytes)

                image_nodes.append(ImageNode(
                    image_path=image_path,
                    metadata={"page": page_num + 1},
                ))
    finally:
        doc.close()  # release the file handle even on partial failure

    return text_nodes, image_nodes

多模态检索优化 #

图文联合检索 #

python
from llama_index.core import VectorStoreIndex
from llama_index.core.schema import ImageNode, TextNode
from llama_index.embeddings.clip import ClipEmbedding

clip_embed = ClipEmbedding()

# One shared CLIP space lets a text query score against image nodes
# and vice versa.
text_nodes = [TextNode(text="猫的图片")]
image_nodes = [ImageNode(image_path="./cat.jpg")]

all_nodes = text_nodes + image_nodes

index = VectorStoreIndex(all_nodes, embed_model=clip_embed)

query_engine = index.as_query_engine()

# A terse text query retrieving across both modalities.
text_results = query_engine.query("猫")

# A more descriptive query phrased for image matching.
image_results = query_engine.query("一张猫的照片")

跨模态检索 #

python
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.schema import NodeWithScore

def cross_modal_search(
    index: VectorStoreIndex,
    query: str,
    modality: str = "all",
):
    """Retrieve nodes for a query, optionally filtered to one modality.

    Args:
        index: multimodal vector index to search.
        query: natural-language query string.
        modality: "text", "image", or "all" (no filtering).

    Returns:
        List of NodeWithScore, filtered by node type when requested.
    """
    # Imported locally because this snippet's header imports only the
    # retriever types; without these names the isinstance filters below
    # raised NameError in the original.
    from llama_index.core.schema import ImageNode, TextNode

    retriever = VectorIndexRetriever(index=index, similarity_top_k=10)
    nodes = retriever.retrieve(query)

    if modality == "text":
        return [n for n in nodes if isinstance(n.node, TextNode)]
    if modality == "image":
        return [n for n in nodes if isinstance(n.node, ImageNode)]
    return nodes

完整示例 #

python
import os
from typing import List, Optional
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.schema import ImageNode

class MultiModalDocumentQA:
    """Question answering over text documents, optionally grounded in an image."""

    def __init__(self):
        self.llm = OpenAI(model="gpt-4o")  # vision-capable model
        self.embed_model = OpenAIEmbedding()

        # Register as process-wide defaults so from_documents picks them up.
        Settings.llm = self.llm
        Settings.embed_model = self.embed_model

        # Populated by load_documents(); queries fail fast until then.
        self.index: Optional[VectorStoreIndex] = None

    def load_documents(self, directory: str):
        """Index every document found under `directory`."""
        documents = SimpleDirectoryReader(directory).load_data()
        self.index = VectorStoreIndex.from_documents(documents)
        print(f"加载了 {len(documents)} 个文档")

    def query_with_image(
        self,
        question: str,
        image_path: Optional[str] = None,
    ) -> str:
        """Answer `question`, combining the image at `image_path` when given.

        Raises:
            RuntimeError: if load_documents() has not been called yet.
        """
        # The original crashed with an opaque AttributeError on None here.
        if self.index is None:
            raise RuntimeError("call load_documents() before querying")

        if image_path and os.path.exists(image_path):
            image_node = ImageNode(image_path=image_path)

            # Retrieve supporting text context for the question.
            retriever = self.index.as_retriever(similarity_top_k=3)
            context_nodes = retriever.retrieve(question)
            context = "\n".join([n.text for n in context_nodes])

            prompt = f"""上下文信息:
{context}

问题:{question}

请结合图片和上下文回答问题。"""

            # NOTE(review): image_nodes support on llm.complete depends on
            # the installed llama-index version — confirm before shipping.
            response = self.llm.complete(prompt, image_nodes=[image_node])
            return response.text

        query_engine = self.index.as_query_engine()
        return str(query_engine.query(question))

# Usage: build the index once, then query with or without an image.
qa = MultiModalDocumentQA()
qa.load_documents("./data")

# Text-only question.
response = qa.query_with_image("文档的主要内容是什么?")
print(response)

# Image-grounded question.
response = qa.query_with_image(
    "这张图和文档有什么关系?",
    "./images/diagram.png",
)
print(response)

总结 #

多模态 RAG 扩展了传统 RAG 的能力,支持:

  • 图像理解与检索
  • 图文混合问答
  • 表格数据处理
  • PDF 多模态解析

通过结合 GPT-4V、CLIP 等模型,可以构建更智能的多模态应用。

下一步 #

恭喜你完成了 LlamaIndex 学习之旅!现在你已经掌握了:

  1. 基础概念和安装配置
  2. 核心组件的使用
  3. 高级 RAG 技术
  4. 智能代理开发
  5. 实战项目构建

继续探索更多 LlamaIndex 功能,构建更强大的 AI 应用!

最后更新:2026-03-30