数据连接器 #
概述 #
数据连接器(Data Connectors)是 LlamaIndex 中用于加载各种数据源的组件,它们将不同格式的数据转换为统一的 Document 对象。
text
┌─────────────────────────────────────────────────────────────┐
│ 数据连接器架构 │
├─────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ 数据源 │ │
│ │ 文件 │ 网页 │ 数据库 │ API │ 云存储 │ 其他 │ │
│ └─────────────────────────────────────────────────────┘ │
│ │ │
│ ▼ │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ 数据连接器 │ │
│ │ SimpleDirectoryReader │ PDFReader │ WebReader ... │ │
│ └─────────────────────────────────────────────────────┘ │
│ │ │
│ ▼ │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ Documents │ │
│ │ 统一的文档对象,包含文本和元数据 │ │
│ └─────────────────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────┘
SimpleDirectoryReader #
SimpleDirectoryReader 是最常用的数据加载器,支持从目录加载多种文件格式。
基本用法 #
python
from llama_index.core import SimpleDirectoryReader
documents = SimpleDirectoryReader("./data").load_data()
documents = SimpleDirectoryReader(
input_files=["./file1.txt", "./file2.pdf"]
).load_data()
documents = SimpleDirectoryReader(
input_dir="./data",
recursive=True,
).load_data()
配置选项 #
python
from llama_index.core import SimpleDirectoryReader
# Configure a reader that walks ./data recursively, keeping only the
# listed extensions and skipping temp/backup files and hidden dotfiles.
reader = SimpleDirectoryReader(
    input_dir="./data",
    recursive=True,  # descend into subdirectories
    required_exts=[".pdf", ".txt", ".md"],  # only load these extensions
    exclude=["*.tmp", "*.bak"],  # glob patterns to skip
    exclude_hidden=True,  # ignore hidden files
    encoding="utf-8",  # text decoding used for plain-text files
    errors="ignore",  # drop undecodable bytes instead of raising
)
documents = reader.load_data()
print(f"加载了 {len(documents)} 个文档")
获取文件信息 #
python
from llama_index.core import SimpleDirectoryReader
reader = SimpleDirectoryReader("./data")
documents = reader.load_data()
for doc in documents:
print(f"文件: {doc.metadata.get('file_name')}")
print(f"路径: {doc.metadata.get('file_path')}")
print(f"大小: {len(doc.text)} 字符")
print("---")
文件加载器 #
PDF 文件 #
bash
pip install llama-index-readers-file pypdf
python
from llama_index.core import SimpleDirectoryReader
from llama_index.readers.file import PDFReader
reader = SimpleDirectoryReader(
input_dir="./pdfs",
required_exts=[".pdf"],
)
documents = reader.load_data()
pdf_reader = PDFReader()
documents = pdf_reader.load_data(file="./document.pdf")
Word 文档 #
bash
pip install llama-index-readers-file python-docx
python
from llama_index.readers.file import DocxReader
docx_reader = DocxReader()
documents = docx_reader.load_data(file="./document.docx")
Markdown 文件 #
python
from llama_index.core import SimpleDirectoryReader
documents = SimpleDirectoryReader(
input_dir="./markdown",
required_exts=[".md"],
).load_data()
CSV 文件 #
python
from llama_index.readers.file import CSVReader
csv_reader = CSVReader()
documents = csv_reader.load_data(file="./data.csv")
JSON 文件 #
python
from llama_index.readers.file import JSONReader
json_reader = JSONReader()
documents = json_reader.load_data(file="./data.json")
代码文件 #
python
from llama_index.core import SimpleDirectoryReader
documents = SimpleDirectoryReader(
input_dir="./src",
required_exts=[".py", ".js", ".ts", ".java"],
recursive=True,
).load_data()
网页加载器 #
简单网页读取 #
bash
pip install llama-index-readers-web
python
from llama_index.readers.web import SimpleWebPageReader
reader = SimpleWebPageReader(html_to_text=True)
documents = reader.load_data(
urls=["https://example.com/article"]
)
print(documents[0].text)
BeautifulSoup 网页读取 #
bash
pip install llama-index-readers-web beautifulsoup4
python
from llama_index.readers.web import BeautifulSoupWebReader
reader = BeautifulSoupWebReader()
documents = reader.load_data(
urls=["https://example.com/article"],
)
for doc in documents:
print(f"URL: {doc.metadata.get('url')}")
print(f"标题: {doc.metadata.get('title')}")
Sitemap 读取 #
python
from llama_index.readers.web import SitemapReader
reader = SitemapReader()
documents = reader.load_data(
sitemap_url="https://example.com/sitemap.xml",
)
RSS 订阅读取 #
python
from llama_index.readers.web import RssReader
reader = RssReader()
documents = reader.load_data(
urls=["https://example.com/feed.xml"],
)
数据库加载器 #
SQL 数据库 #
bash
pip install llama-index-readers-database sqlalchemy
python
from llama_index.readers.database import DatabaseReader
# DatabaseReader's `sql_database` parameter expects a SQLDatabase object;
# a plain connection string must be passed via `uri` instead, otherwise
# construction fails at runtime.
reader = DatabaseReader(
    uri="sqlite:///mydb.db"
)
# Each row returned by the query becomes one Document.
documents = reader.load_data(
    query="SELECT title, content FROM articles"
)
for doc in documents:
    print(doc.text[:100])
PostgreSQL #
python
from llama_index.readers.database import DatabaseReader
# Pass the connection string through `uri`; `sql_database` expects a
# SQLDatabase object, not a string.
reader = DatabaseReader(
    uri="postgresql://user:pass@localhost/mydb"
)
documents = reader.load_data(
    query="SELECT * FROM documents WHERE category = 'tech'"
)
MongoDB #
bash
pip install llama-index-readers-mongodb pymongo
python
from llama_index.readers.mongodb import SimpleMongoReader
# Connect to a local MongoDB instance on the default port.
reader = SimpleMongoReader(
    host="localhost",
    port=27017,
)
# Pull matching records from the given collection; only the listed
# fields are turned into Document text.
documents = reader.load_data(
    db_name="mydb",
    collection_name="documents",
    query_dict={"category": "tech"},  # standard MongoDB filter document
    field_names=["title", "content"],  # fields concatenated into Document text
)
云存储加载器 #
AWS S3 #
bash
pip install llama-index-readers-s3 boto3
python
from llama_index.readers.s3 import S3Reader
reader = S3Reader(
bucket="my-bucket",
prefix="documents/",
aws_access_key_id="your-key",
aws_secret_access_key="your-secret",
)
documents = reader.load_data()
Google Cloud Storage #
bash
pip install llama-index-readers-gcs google-cloud-storage
python
from llama_index.readers.gcs import GCSReader
reader = GCSReader(
bucket="my-bucket",
prefix="documents/",
)
documents = reader.load_data()
Azure Blob Storage #
bash
pip install llama-index-readers-azure-blob-storage azure-storage-blob
python
from llama_index.readers.azure_blob_storage import AzureBlobStorageReader
reader = AzureBlobStorageReader(
container_name="my-container",
blob_prefix="documents/",
connection_string="your-connection-string",
)
documents = reader.load_data()
API 加载器 #
Notion #
bash
pip install llama-index-readers-notion
python
from llama_index.readers.notion import NotionPageReader
reader = NotionPageReader(integration_token="your-token")
documents = reader.load_data(
page_ids=["page-id-1", "page-id-2"],
)
Slack #
bash
pip install llama-index-readers-slack
python
from llama_index.readers.slack import SlackReader
reader = SlackReader(slack_token="xoxb-your-token")
documents = reader.load_data(
channel_ids=["C12345678"],
)
Discord #
bash
pip install llama-index-readers-discord
python
from llama_index.readers.discord import DiscordReader
reader = DiscordReader(discord_token="your-token")
documents = reader.load_data(
channel_ids=["123456789"],
)
GitHub #
bash
pip install llama-index-readers-github
python
from llama_index.readers.github import GithubRepositoryReader, GithubClient
# The reader class is spelled `GithubRepositoryReader` (lowercase "h") and
# is constructed from a GithubClient rather than a raw `github_token` kwarg.
github_client = GithubClient(github_token="your-token")
reader = GithubRepositoryReader(
    github_client=github_client,
    owner="run-llama",
    repo="llama_index",
)
documents = reader.load_data(branch="main")
Confluence #
bash
pip install llama-index-readers-confluence
python
from llama_index.readers.confluence import ConfluenceReader
reader = ConfluenceReader(
base_url="https://your-company.atlassian.net/wiki",
api_token="your-token",
)
documents = reader.load_data(
space_key="DOC",
)
自定义加载器 #
创建简单加载器 #
python
from llama_index.core import Document
from typing import List
def load_custom_data(source: str) -> List[Document]:
    """Read a UTF-8 text file and wrap its contents in a single Document.

    The source path is recorded in the document metadata so the origin
    of the text can be traced later.
    """
    with open(source, "r", encoding="utf-8") as handle:
        contents = handle.read()
    doc = Document(text=contents, metadata={"source": source})
    return [doc]
documents = load_custom_data("./custom_data.txt")
创建自定义 Reader 类 #
python
from llama_index.core import Document
from llama_index.core.readers.base import BaseReader
from typing import List, Optional
class CustomReader(BaseReader):
    """Reader that loads every ``.custom`` file from a directory.

    Each matching file becomes one Document carrying its filename and
    full path as metadata.
    """

    def __init__(self, source_path: str):
        self.source_path = source_path

    def load_data(self, *args, **kwargs) -> List[Document]:
        """Scan the source directory and return one Document per ``.custom`` file."""
        import os
        docs = []
        for entry in os.listdir(self.source_path):
            if not entry.endswith(".custom"):
                continue
            full_path = os.path.join(self.source_path, entry)
            with open(full_path, "r", encoding="utf-8") as handle:
                body = handle.read()
            docs.append(
                Document(
                    text=body,
                    metadata={"filename": entry, "source": full_path},
                )
            )
        return docs
reader = CustomReader("./custom_data")
documents = reader.load_data()
批量加载 #
并行加载 #
python
from llama_index.core import SimpleDirectoryReader
import multiprocessing
# SimpleDirectoryReader has no `num_files_per_batch` constructor argument;
# passing it raises TypeError. Parallelism is controlled entirely through
# the `num_workers` argument of load_data().
reader = SimpleDirectoryReader(
    input_dir="./data",
)
# Spread file parsing across one worker per CPU core, with a progress bar.
documents = reader.load_data(
    num_workers=multiprocessing.cpu_count(),
    show_progress=True,
)
分批处理 #
python
from llama_index.core import SimpleDirectoryReader
reader = SimpleDirectoryReader("./data")
batch_size = 50
# iter_data() yields a *list* of Documents per file (one file may split
# into several Documents), so iterate the inner list before touching
# Document attributes — calling .metadata on the yielded value directly
# would raise AttributeError.
processed = 0
for file_docs in reader.iter_data():
    for doc in file_docs:
        processed += 1
        print(f"处理文档 {processed}: {doc.metadata.get('file_name')}")
        if processed % batch_size == 0:
            pass
元数据处理 #
添加自定义元数据 #
python
from llama_index.core import SimpleDirectoryReader
def custom_metadata_func(file_path: str) -> dict:
    """Build extra metadata attached to every Document loaded by the reader.

    The hook is called once per file with that file's path. Echoing the
    path back into the returned dict (instead of ignoring the argument,
    as the original did) demonstrates per-file metadata; existing keys
    are kept so downstream examples remain valid.
    """
    return {
        "custom_field": "value",
        "processed_at": "2024-01-01",
        "file_path": file_path,
    }
reader = SimpleDirectoryReader(
input_dir="./data",
file_metadata=custom_metadata_func,
)
documents = reader.load_data()
for doc in documents:
print(doc.metadata)
过滤文档 #
python
from llama_index.core import SimpleDirectoryReader
reader = SimpleDirectoryReader("./data")
documents = reader.load_data()
filtered_docs = [
doc for doc in documents
if len(doc.text) > 100
]
print(f"原始: {len(documents)} 个文档")
print(f"过滤后: {len(filtered_docs)} 个文档")
完整示例 #
python
import os
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.readers.file import PDFReader
from llama_index.readers.web import SimpleWebPageReader
# Step 1: load local PDF files from ./pdfs.
pdf_docs = SimpleDirectoryReader(
    input_dir="./pdfs",
    required_exts=[".pdf"],
).load_data()
print(f"PDF 文档: {len(pdf_docs)}")
# Step 2: fetch a web page and convert its HTML to plain text.
web_docs = SimpleWebPageReader(html_to_text=True).load_data(
    urls=[
        "https://docs.llamaindex.ai/en/stable/",
    ]
)
print(f"网页文档: {len(web_docs)}")
# Step 3: merge both sources and build a single vector index over them.
all_documents = pdf_docs + web_docs
index = VectorStoreIndex.from_documents(all_documents)
query_engine = index.as_query_engine()
# Step 4: query across all loaded documents.
response = query_engine.query("总结所有文档的主要内容")
print(response)
下一步 #
了解数据加载后,接下来学习 文档与节点 掌握文档处理和节点分割!
最后更新:2026-03-30