多模态 RAG #
概述 #
多模态 RAG 扩展了传统 RAG 的能力,支持图像、文本等多种模态的检索和生成,为更丰富的应用场景提供支持。
text
┌─────────────────────────────────────────────────────────────┐
│ 多模态 RAG 架构 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 输入 │
│ ┌─────────┐ ┌─────────┐ ┌─────────┐ │
│ │ 文本 │ │ 图像 │ │ 表格 │ │
│ └─────────┘ └─────────┘ └─────────┘ │
│ │ │ │ │
│ ▼ ▼ ▼ │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ 多模态 Embedding │ │
│ │ Text Embedding │ Image Embedding │ Table Embedding │ │
│ └─────────────────────────────────────────────────────┘ │
│ │ │
│ ▼ │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ 多模态 Vector Store │ │
│ └─────────────────────────────────────────────────────┘ │
│ │ │
│ ▼ │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ 多模态 LLM 生成 │ │
│ │ GPT-4V / Claude 3 │ │
│ └─────────────────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────┘
图像处理 #
加载图像文档 #
python
from llama_index.core import SimpleDirectoryReader
from llama_index.readers.file import ImageReader

# Load every .jpg/.jpeg/.png under ./images in a single pass; the reader
# is constructed and consumed inline since it is not reused afterwards.
documents = SimpleDirectoryReader(
    input_dir="./images",
    required_exts=[".jpg", ".jpeg", ".png"],
).load_data()
使用 ImageNode #
python
from llama_index.core.schema import ImageNode, TextNode
# An ImageNode pairs an image file (by path) with an optional text caption.
image_node = ImageNode(
image_path="./images/photo.jpg",
text="图像描述",
)
# A TextNode holds text-only content that can live alongside image nodes.
text_node = TextNode(
text="相关文本内容",
)
多模态 Embedding #
CLIP Embedding #
bash
pip install llama-index-embeddings-clip
python
from llama_index.embeddings.clip import ClipEmbedding
# CLIP embeds text and images into one shared vector space, which is what
# makes cross-modal similarity search possible with a single index.
clip_embed = ClipEmbedding()
text_embedding = clip_embed.get_text_embedding("一只猫")
image_embedding = clip_embed.get_image_embedding("./images/cat.jpg")
# The two printed dimensions should match, since both vectors live in the
# same CLIP embedding space.
print(f"文本向量维度: {len(text_embedding)}")
print(f"图像向量维度: {len(image_embedding)}")
多模态索引 #
python
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.schema import ImageNode, TextNode
from llama_index.embeddings.clip import ClipEmbedding

# One CLIP model embeds both modalities into the same vector space.
clip_embed = ClipEmbedding()

# Build the two node groups from parallel literals.
text_nodes = [
    TextNode(text=caption)
    for caption in ("这是一只猫的图片", "这是一只狗的图片")
]
image_nodes = [
    ImageNode(image_path=path)
    for path in ("./images/cat.jpg", "./images/dog.jpg")
]

# A single index over text and image nodes, embedded with the same model.
all_nodes = text_nodes + image_nodes
index = VectorStoreIndex(all_nodes, embed_model=clip_embed)
GPT-4V 集成 #
图像理解 #
python
from llama_index.llms.openai import OpenAI
from llama_index.core.schema import ImageNode
from llama_index.core import SimpleDirectoryReader
# NOTE(review): the text-only OpenAI LLM class may ignore or reject the
# `image_nodes` kwarg on complete(); llama-index's documented image route
# is the OpenAIMultiModal wrapper — confirm against the installed version.
llm = OpenAI(model="gpt-4o")
# The image to describe; only the path is needed, bytes are read lazily.
image_node = ImageNode(image_path="./images/chart.png")
response = llm.complete(
"请描述这张图片的内容:",
image_nodes=[image_node],
)
print(response.text)
多模态查询引擎 #
python
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.schema import ImageNode

llm = OpenAI(model="gpt-4o")
embed_model = OpenAIEmbedding()

# Text corpus -> vector index, used by the text-only query path below.
documents = SimpleDirectoryReader("./data").load_data()
text_index = VectorStoreIndex.from_documents(documents)

image_node = ImageNode(image_path="./images/diagram.png")


def multimodal_query(query: str, image_path: str = None) -> str:
    """Answer *query*, optionally grounding the answer in an image.

    With an image path, the question goes straight to the multimodal LLM;
    without one, it is routed through the text vector index.

    BUG FIX: the original returned `response.text` (a str) on the image
    branch but a Response object on the text branch; str() makes the
    return type uniform.
    """
    if image_path:
        image_node = ImageNode(image_path=image_path)
        response = llm.complete(
            f"基于图片和以下问题回答:\n问题:{query}",
            image_nodes=[image_node],
        )
        return response.text
    # Text-only fallback: classic RAG over the vector index.
    query_engine = text_index.as_query_engine(llm=llm)
    return str(query_engine.query(query))


response = multimodal_query("这张图说明了什么?", "./images/chart.png")
print(response)
多模态 RAG 实现 #
完整实现 #
python
import os
from typing import List, Optional
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.schema import ImageNode, TextNode, NodeWithScore
from llama_index.core.retrievers import BaseRetriever
from llama_index.core import QueryBundle
class MultiModalRetriever(BaseRetriever):
    """Retriever over the text index; image nodes are stored but not yet used.

    BUG FIX: the original never called ``super().__init__()``. BaseRetriever
    initializes its callback manager there, so skipping it breaks
    ``.retrieve()`` on recent llama-index versions.
    """

    def __init__(
        self,
        text_index: VectorStoreIndex,
        image_nodes: List[ImageNode],
        similarity_top_k: int = 3,
    ):
        self.text_index = text_index
        # Kept for a future cross-modal ranking step; not consulted yet.
        self.image_nodes = image_nodes
        self.similarity_top_k = similarity_top_k
        super().__init__()  # set up BaseRetriever internals (callbacks)

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Delegate retrieval to a vector retriever over the text index."""
        text_retriever = self.text_index.as_retriever(
            similarity_top_k=self.similarity_top_k
        )
        return text_retriever.retrieve(query_bundle)
class MultiModalRAG:
    """Multimodal RAG facade: text vector retrieval plus image understanding.

    Text documents are indexed into a VectorStoreIndex; images are held as
    ImageNode objects and passed to the LLM at query time.
    """

    def __init__(
        self,
        data_dir: str = "./data",
        image_dir: str = "./images",
    ):
        self.data_dir = data_dir
        self.image_dir = image_dir
        self.llm = OpenAI(model="gpt-4o")
        self.embed_model = OpenAIEmbedding()
        # Register globally so index/query-engine construction picks them up.
        Settings.llm = self.llm
        Settings.embed_model = self.embed_model
        # Populated by initialize().
        self.text_index: Optional[VectorStoreIndex] = None
        self.image_nodes: List[ImageNode] = []
        # image_path -> LLM description. Each miss costs a full LLM call,
        # so search_images() would otherwise re-describe every image on
        # every search.
        self._description_cache: dict = {}

    def initialize(self):
        """Load text documents into a vector index and collect image nodes."""
        text_documents = SimpleDirectoryReader(self.data_dir).load_data()
        self.text_index = VectorStoreIndex.from_documents(text_documents)
        print(f"加载了 {len(text_documents)} 个文本文档")
        if os.path.exists(self.image_dir):
            for filename in os.listdir(self.image_dir):
                # endswith accepts a tuple: one call covers all extensions.
                if filename.lower().endswith((".jpg", ".jpeg", ".png")):
                    image_path = os.path.join(self.image_dir, filename)
                    self.image_nodes.append(ImageNode(image_path=image_path))
        print(f"加载了 {len(self.image_nodes)} 个图像")
        print("多模态 RAG 初始化完成")

    def query(self, question: str, image_path: Optional[str] = None) -> str:
        """Answer *question*, optionally grounded in the image at *image_path*.

        Raises:
            RuntimeError: if initialize() has not been called yet.
        """
        # Robustness fix: a clear error beats AttributeError on None.
        if self.text_index is None:
            raise RuntimeError("call initialize() before query()")
        if image_path and os.path.exists(image_path):
            image_node = ImageNode(image_path=image_path)
            text_retriever = self.text_index.as_retriever(similarity_top_k=3)
            context_nodes = text_retriever.retrieve(question)
            context_text = "\n".join([node.text for node in context_nodes])
            prompt = f"""基于以下上下文和图片回答问题:
上下文:
{context_text}
问题:{question}
请结合图片和上下文给出详细回答。"""
            # NOTE(review): `image_nodes` on the text OpenAI LLM may not be
            # supported; OpenAIMultiModal is the documented image path —
            # confirm against the installed llama-index version.
            response = self.llm.complete(prompt, image_nodes=[image_node])
            return response.text
        # Text-only path (also taken when the image path does not exist).
        query_engine = self.text_index.as_query_engine(llm=self.llm)
        return str(query_engine.query(question))

    def describe_image(self, image_path: str) -> str:
        """Return an LLM-generated description of the image (cached)."""
        if not os.path.exists(image_path):
            return "图像文件不存在"
        # Serve from cache when possible: each miss is a full LLM round-trip.
        if image_path in self._description_cache:
            return self._description_cache[image_path]
        image_node = ImageNode(image_path=image_path)
        response = self.llm.complete(
            "请详细描述这张图片的内容:",
            image_nodes=[image_node],
        )
        self._description_cache[image_path] = response.text
        return response.text

    def search_images(self, query: str) -> List[str]:
        """Naive image search: substring-match *query* in each description."""
        return [
            node.image_path
            for node in self.image_nodes
            if query.lower() in self.describe_image(node.image_path).lower()
        ]
def main():
    """Interactive CLI loop for the multimodal RAG demo.

    Commands: plain text queries the corpus; `image:PATH question` sends
    the image plus question to the multimodal path; `quit` exits.
    """
    rag = MultiModalRAG(
        data_dir="./data",
        image_dir="./images",
    )
    rag.initialize()
    print("\n=== 多模态 RAG 系统 ===")
    print("输入 'quit' 退出")
    print("输入 'image:路径 问题' 进行图文查询\n")
    while True:
        user_input = input("问题: ").strip()
        if user_input.lower() == "quit":
            break
        if not user_input:
            continue
        if user_input.startswith("image:"):
            parts = user_input.split(" ", 1)
            # BUG FIX: str.replace() would strip "image:" anywhere in the
            # path; removeprefix only drops the leading tag.
            image_path = parts[0].removeprefix("image:")
            question = parts[1] if len(parts) > 1 else "描述这张图片"
            print(f"\n回答: {rag.query(question, image_path)}\n")
        else:
            print(f"\n回答: {rag.query(user_input)}\n")


if __name__ == "__main__":
    main()
表格处理 #
表格解析 #
python
from llama_index.core import SimpleDirectoryReader
from llama_index.readers.file import CSVReader, PandasCSVReader
# Plain CSV reader: loads the file as-is.
csv_reader = CSVReader()
documents = csv_reader.load_data(file="./data/table.csv")
# Pandas-backed alternative; note this overwrites `documents` from above —
# the two readers are shown as interchangeable options, not a pipeline.
pandas_reader = PandasCSVReader()
documents = pandas_reader.load_data(file="./data/table.csv")
表格理解 #
python
from llama_index.llms.openai import OpenAI
import pandas as pd
llm = OpenAI(model="gpt-4o")
df = pd.read_csv("./data/sales.csv")
# The whole table is serialized into the prompt. Fine for small tables,
# but a large DataFrame will exceed the model's context window.
table_summary = llm.complete(f"""
请分析以下表格数据:
{df.to_string()}
总结关键发现和趋势:
""")
print(table_summary.text)
PDF 多模态处理 #
使用 LlamaParse #
bash
pip install llama-parse
python
from llama_parse import LlamaParse
from llama_index.core import VectorStoreIndex
# NOTE(review): avoid hard-coding credentials in source. Prefer an
# environment variable; LlamaParse reportedly reads LLAMA_CLOUD_API_KEY
# when api_key is omitted — confirm for the installed version.
parser = LlamaParse(
api_key="your-llama-parse-key",
result_type="markdown",
verbose=True,
)
# The parsed markdown documents are indexed like any other text.
documents = parser.load_data("./document.pdf")
index = VectorStoreIndex.from_documents(documents)
PDF 图像提取 #
python
import fitz
from llama_index.core.schema import ImageNode, TextNode
def extract_pdf_content(pdf_path: str):
    """Split a PDF into TextNodes (one per non-empty page) and ImageNodes.

    Embedded images are written under ./extracted_images/ and referenced by
    path; every node carries its 1-based page number in metadata.

    Args:
        pdf_path: Path to the PDF file to process.

    Returns:
        A ``(text_nodes, image_nodes)`` tuple of lists.
    """
    import os

    out_dir = "./extracted_images"
    # BUG FIX: the original wrote into ./extracted_images without creating
    # it, so open(..., "wb") raised FileNotFoundError on a fresh checkout.
    os.makedirs(out_dir, exist_ok=True)

    text_nodes = []
    image_nodes = []
    doc = fitz.open(pdf_path)
    try:
        for page_num, page in enumerate(doc):
            text = page.get_text()
            if text.strip():
                text_nodes.append(TextNode(
                    text=text,
                    metadata={"page": page_num + 1},
                ))
            for img_index, img in enumerate(page.get_images()):
                xref = img[0]  # first tuple entry is the image's xref id
                base_image = doc.extract_image(xref)
                # NOTE(review): always saved as .png even though
                # base_image["ext"] may report a different format —
                # consider using the real extension.
                image_path = os.path.join(
                    out_dir, f"page{page_num}_img{img_index}.png"
                )
                with open(image_path, "wb") as f:
                    f.write(base_image["image"])
                image_nodes.append(ImageNode(
                    image_path=image_path,
                    metadata={"page": page_num + 1},
                ))
    finally:
        doc.close()  # release the file handle even if parsing fails
    return text_nodes, image_nodes
多模态检索优化 #
图文联合检索 #
python
from llama_index.core import VectorStoreIndex
from llama_index.core.schema import ImageNode, TextNode
from llama_index.embeddings.clip import ClipEmbedding
clip_embed = ClipEmbedding()
text_nodes = [TextNode(text="猫的图片")]
image_nodes = [ImageNode(image_path="./cat.jpg")]
# Text and image nodes can share one index because CLIP embeds both
# modalities into the same vector space.
all_nodes = text_nodes + image_nodes
index = VectorStoreIndex(all_nodes, embed_model=clip_embed)
query_engine = index.as_query_engine()
# Either phrasing may retrieve text nodes, image nodes, or both.
text_results = query_engine.query("猫")
image_results = query_engine.query("一张猫的照片")
跨模态检索 #
python
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.schema import NodeWithScore
def cross_modal_search(
    index: VectorStoreIndex,
    query: str,
    modality: str = "all",
):
    """Retrieve nodes for *query*, optionally filtered to one modality.

    Args:
        index: Vector index containing text and/or image nodes.
        query: Natural-language query string.
        modality: "text", "image", or "all" (no filtering).

    Returns:
        List of NodeWithScore, filtered by node type when requested.
    """
    # BUG FIX: TextNode/ImageNode were referenced without being imported
    # in this snippet (NameError); a local import keeps it self-contained.
    from llama_index.core.schema import ImageNode, TextNode

    retriever = VectorIndexRetriever(index=index, similarity_top_k=10)
    nodes = retriever.retrieve(query)
    if modality == "text":
        return [n for n in nodes if isinstance(n.node, TextNode)]
    if modality == "image":
        return [n for n in nodes if isinstance(n.node, ImageNode)]
    return nodes
完整示例 #
python
import os
from typing import List, Optional
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.schema import ImageNode
class MultiModalDocumentQA:
    """Question answering over a text corpus with optional image grounding."""

    def __init__(self):
        self.llm = OpenAI(model="gpt-4o")
        self.embed_model = OpenAIEmbedding()
        # Register globally so index/query-engine construction uses them.
        Settings.llm = self.llm
        Settings.embed_model = self.embed_model
        # Built lazily by load_documents().
        self.index: Optional[VectorStoreIndex] = None

    def load_documents(self, directory: str):
        """Load and index every document found under *directory*."""
        documents = SimpleDirectoryReader(directory).load_data()
        self.index = VectorStoreIndex.from_documents(documents)
        print(f"加载了 {len(documents)} 个文档")

    def query_with_image(
        self,
        question: str,
        image_path: Optional[str] = None,
    ) -> str:
        """Answer *question*; if *image_path* exists, ground in the image too.

        Raises:
            RuntimeError: if load_documents() has not been called yet.
        """
        # Robustness fix: fail with a clear message instead of an
        # AttributeError on self.index being None.
        if self.index is None:
            raise RuntimeError("call load_documents() before query_with_image()")
        if image_path and os.path.exists(image_path):
            image_node = ImageNode(image_path=image_path)
            retriever = self.index.as_retriever(similarity_top_k=3)
            context_nodes = retriever.retrieve(question)
            context = "\n".join([n.text for n in context_nodes])
            prompt = f"""上下文信息:
{context}
问题:{question}
请结合图片和上下文回答问题。"""
            # NOTE(review): `image_nodes` on the text OpenAI LLM may not be
            # supported; OpenAIMultiModal is the documented image route.
            response = self.llm.complete(prompt, image_nodes=[image_node])
            return response.text
        # Text-only path (also taken when the image path does not exist).
        query_engine = self.index.as_query_engine()
        return str(query_engine.query(question))
# Demo: build the QA system, then run a text-only and an image-grounded query.
qa = MultiModalDocumentQA()
qa.load_documents("./data")
response = qa.query_with_image("文档的主要内容是什么?")
print(response)
response = qa.query_with_image(
"这张图和文档有什么关系?",
"./images/diagram.png",
)
print(response)
总结 #
多模态 RAG 扩展了传统 RAG 的能力,支持:
- 图像理解与检索
- 图文混合问答
- 表格数据处理
- PDF 多模态解析
通过结合 GPT-4V、CLIP 等模型,可以构建更智能的多模态应用。
下一步 #
恭喜你完成了 LlamaIndex 学习之旅!现在你已经掌握了:
- 基础概念和安装配置
- 核心组件的使用
- 高级 RAG 技术
- 智能代理开发
- 实战项目构建
继续探索更多 LlamaIndex 功能,构建更强大的 AI 应用!
最后更新:2026-03-30