数据连接器 #

概述 #

数据连接器(Data Connectors)是 LlamaIndex 中用于加载各种数据源的组件,它们将不同格式的数据转换为统一的 Document 对象。

text
┌─────────────────────────────────────────────────────────────┐
│                    数据连接器架构                            │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  ┌─────────────────────────────────────────────────────┐   │
│  │                    数据源                           │   │
│  │  文件 │ 网页 │ 数据库 │ API │ 云存储 │ 其他         │   │
│  └─────────────────────────────────────────────────────┘   │
│                           │                                  │
│                           ▼                                  │
│  ┌─────────────────────────────────────────────────────┐   │
│  │                  数据连接器                         │   │
│  │  SimpleDirectoryReader │ PDFReader │ WebReader ...  │   │
│  └─────────────────────────────────────────────────────┘   │
│                           │                                  │
│                           ▼                                  │
│  ┌─────────────────────────────────────────────────────┐   │
│  │                    Documents                        │   │
│  │  统一的文档对象,包含文本和元数据                     │   │
│  └─────────────────────────────────────────────────────┘   │
│                                                             │
└─────────────────────────────────────────────────────────────┘

SimpleDirectoryReader #

SimpleDirectoryReader 是最常用的数据加载器,支持从目录加载多种文件格式。

基本用法 #

python
from llama_index.core import SimpleDirectoryReader

documents = SimpleDirectoryReader("./data").load_data()

documents = SimpleDirectoryReader(
    input_files=["./file1.txt", "./file2.pdf"]
).load_data()

documents = SimpleDirectoryReader(
    input_dir="./data",
    recursive=True,
).load_data()

配置选项 #

python
from llama_index.core import SimpleDirectoryReader

reader = SimpleDirectoryReader(
    input_dir="./data",
    
    recursive=True,
    
    required_exts=[".pdf", ".txt", ".md"],
    
    exclude=["*.tmp", "*.bak"],
    
    exclude_hidden=True,
    
    encoding="utf-8",
    
    errors="ignore",
)

documents = reader.load_data()
print(f"加载了 {len(documents)} 个文档")

获取文件信息 #

python
from llama_index.core import SimpleDirectoryReader

reader = SimpleDirectoryReader("./data")
documents = reader.load_data()

for doc in documents:
    print(f"文件: {doc.metadata.get('file_name')}")
    print(f"路径: {doc.metadata.get('file_path')}")
    print(f"大小: {len(doc.text)} 字符")
    print("---")

文件加载器 #

PDF 文件 #

bash
pip install llama-index-readers-file pypdf
python
from llama_index.core import SimpleDirectoryReader
from llama_index.readers.file import PDFReader

reader = SimpleDirectoryReader(
    input_dir="./pdfs",
    required_exts=[".pdf"],
)
documents = reader.load_data()

pdf_reader = PDFReader()
documents = pdf_reader.load_data(file="./document.pdf")

Word 文档 #

bash
pip install llama-index-readers-file python-docx
python
from llama_index.readers.file import DocxReader

docx_reader = DocxReader()
documents = docx_reader.load_data(file="./document.docx")

Markdown 文件 #

python
from llama_index.core import SimpleDirectoryReader

documents = SimpleDirectoryReader(
    input_dir="./markdown",
    required_exts=[".md"],
).load_data()

CSV 文件 #

python
from llama_index.readers.file import CSVReader

csv_reader = CSVReader()
documents = csv_reader.load_data(file="./data.csv")

JSON 文件 #

python
from llama_index.readers.file import JSONReader

json_reader = JSONReader()
documents = json_reader.load_data(file="./data.json")

代码文件 #

python
from llama_index.core import SimpleDirectoryReader

documents = SimpleDirectoryReader(
    input_dir="./src",
    required_exts=[".py", ".js", ".ts", ".java"],
    recursive=True,
).load_data()

网页加载器 #

简单网页读取 #

bash
pip install llama-index-readers-web
python
from llama_index.readers.web import SimpleWebPageReader

reader = SimpleWebPageReader(html_to_text=True)
documents = reader.load_data(
    urls=["https://example.com/article"]
)

print(documents[0].text)

BeautifulSoup 网页读取 #

bash
pip install llama-index-readers-web beautifulsoup4
python
from llama_index.readers.web import BeautifulSoupWebReader

reader = BeautifulSoupWebReader()
documents = reader.load_data(
    urls=["https://example.com/article"],
)

for doc in documents:
    print(f"URL: {doc.metadata.get('url')}")
    print(f"标题: {doc.metadata.get('title')}")

Sitemap 读取 #

python
from llama_index.readers.web import SitemapReader

reader = SitemapReader()
documents = reader.load_data(
    sitemap_url="https://example.com/sitemap.xml",
)

RSS 订阅读取 #

python
from llama_index.readers.web import RssReader

reader = RssReader()
documents = reader.load_data(
    urls=["https://example.com/feed.xml"],
)

数据库加载器 #

SQL 数据库 #

bash
pip install llama-index-readers-database sqlalchemy
python
from llama_index.readers.database import DatabaseReader

# NOTE: `sql_database=` expects a SQLDatabase object; a connection
# string must be passed via the `uri` parameter instead.
reader = DatabaseReader(
    uri="sqlite:///mydb.db"
)

# Each row returned by the query becomes one Document.
documents = reader.load_data(
    query="SELECT title, content FROM articles"
)

for doc in documents:
    print(doc.text[:100])

PostgreSQL #

python
from llama_index.readers.database import DatabaseReader

# Pass the PostgreSQL connection string through `uri`
# (`sql_database=` expects a SQLDatabase object, not a string).
reader = DatabaseReader(
    uri="postgresql://user:pass@localhost/mydb"
)

documents = reader.load_data(
    query="SELECT * FROM documents WHERE category = 'tech'"
)

MongoDB #

bash
pip install llama-index-readers-mongodb pymongo
python
from llama_index.readers.mongodb import SimpleMongoReader

reader = SimpleMongoReader(
    host="localhost",
    port=27017,
)

documents = reader.load_data(
    db_name="mydb",
    collection_name="documents",
    query_dict={"category": "tech"},
    field_names=["title", "content"],
)

云存储加载器 #

AWS S3 #

bash
pip install llama-index-readers-s3 boto3
python
from llama_index.readers.s3 import S3Reader

reader = S3Reader(
    bucket="my-bucket",
    prefix="documents/",
    aws_access_key_id="your-key",
    aws_secret_access_key="your-secret",
)

documents = reader.load_data()

Google Cloud Storage #

bash
pip install llama-index-readers-gcs google-cloud-storage
python
from llama_index.readers.gcs import GCSReader

reader = GCSReader(
    bucket="my-bucket",
    prefix="documents/",
)

documents = reader.load_data()

Azure Blob Storage #

bash
pip install llama-index-readers-azure-blob-storage azure-storage-blob
python
from llama_index.readers.azure_blob_storage import AzureBlobStorageReader

reader = AzureBlobStorageReader(
    container_name="my-container",
    blob_prefix="documents/",
    connection_string="your-connection-string",
)

documents = reader.load_data()

API 加载器 #

Notion #

bash
pip install llama-index-readers-notion
python
from llama_index.readers.notion import NotionPageReader

reader = NotionPageReader(integration_token="your-token")

documents = reader.load_data(
    page_ids=["page-id-1", "page-id-2"],
)

Slack #

bash
pip install llama-index-readers-slack
python
from llama_index.readers.slack import SlackReader

reader = SlackReader(slack_token="xoxb-your-token")

documents = reader.load_data(
    channel_ids=["C12345678"],
)

Discord #

bash
pip install llama-index-readers-discord
python
from llama_index.readers.discord import DiscordReader

reader = DiscordReader(discord_token="your-token")

documents = reader.load_data(
    channel_ids=["123456789"],
)

GitHub #

bash
pip install llama-index-readers-github
python
from llama_index.readers.github import GithubRepositoryReader, GithubClient

# The reader takes an authenticated GithubClient instance; the
# `github_token` keyword is not accepted by the reader itself.
github_client = GithubClient(github_token="your-token")

reader = GithubRepositoryReader(
    github_client=github_client,
    owner="run-llama",
    repo="llama_index",
)

documents = reader.load_data(branch="main")

Confluence #

bash
pip install llama-index-readers-confluence
python
from llama_index.readers.confluence import ConfluenceReader

reader = ConfluenceReader(
    base_url="https://your-company.atlassian.net/wiki",
    api_token="your-token",
)

documents = reader.load_data(
    space_key="DOC",
)

自定义加载器 #

创建简单加载器 #

python
from llama_index.core import Document
from typing import List

def load_custom_data(source: str) -> List[Document]:
    """Read *source* as UTF-8 text and wrap it in a single Document.

    The file path is recorded in the document's metadata under
    the ``source`` key.
    """
    with open(source, "r", encoding="utf-8") as handle:
        content = handle.read()

    doc = Document(
        text=content,
        metadata={"source": source},
    )
    return [doc]

documents = load_custom_data("./custom_data.txt")

创建自定义 Reader 类 #

python
from llama_index.core import Document
from llama_index.core.readers.base import BaseReader
from typing import List, Optional

class CustomReader(BaseReader):
    """Reader that loads every ``.custom`` file in a directory.

    Each matching file becomes one Document whose metadata records
    the bare filename and the full path it was read from.
    """

    def __init__(self, source_path: str):
        # Directory scanned by load_data().
        self.source_path = source_path

    def load_data(self, *args, **kwargs) -> List[Document]:
        """Read all ``.custom`` files and return them as Documents."""
        import os

        results: List[Document] = []
        for name in os.listdir(self.source_path):
            # Skip anything that is not a .custom file.
            if not name.endswith(".custom"):
                continue
            full_path = os.path.join(self.source_path, name)
            with open(full_path, "r", encoding="utf-8") as handle:
                body = handle.read()
            results.append(Document(
                text=body,
                metadata={
                    "filename": name,
                    "source": full_path,
                },
            ))
        return results

reader = CustomReader("./custom_data")
documents = reader.load_data()

批量加载 #

并行加载 #

python
from llama_index.core import SimpleDirectoryReader
import multiprocessing

# NOTE: `num_files_per_batch` is not a SimpleDirectoryReader
# constructor argument; parallelism is configured on load_data().
reader = SimpleDirectoryReader(
    input_dir="./data",
)

# Distribute file parsing across worker processes and show a
# progress bar while loading.
documents = reader.load_data(
    num_workers=multiprocessing.cpu_count(),
    show_progress=True,
)

分批处理 #

python
from llama_index.core import SimpleDirectoryReader

reader = SimpleDirectoryReader("./data")

batch_size = 50
count = 0
# iter_data() yields one *list* of Documents per file (a file can
# produce multiple Documents), so a nested loop is required; iterating
# the yielded lists directly and calling .metadata on them would fail.
for docs in reader.iter_data():
    for doc in docs:
        count += 1
        print(f"处理文档 {count}: {doc.metadata.get('file_name')}")

        if count % batch_size == 0:
            pass  # process the accumulated batch here

元数据处理 #

添加自定义元数据 #

python
from llama_index.core import SimpleDirectoryReader

def custom_metadata_func(file_path: str) -> dict:
    """Return extra metadata to attach to every loaded file.

    *file_path* is supplied by SimpleDirectoryReader for each file but
    is not used here — the same static fields are returned every time.
    """
    metadata = {"custom_field": "value"}
    metadata["processed_at"] = "2024-01-01"
    return metadata

reader = SimpleDirectoryReader(
    input_dir="./data",
    file_metadata=custom_metadata_func,
)

documents = reader.load_data()

for doc in documents:
    print(doc.metadata)

过滤文档 #

python
from llama_index.core import SimpleDirectoryReader

reader = SimpleDirectoryReader("./data")
documents = reader.load_data()

filtered_docs = [
    doc for doc in documents
    if len(doc.text) > 100
]

print(f"原始: {len(documents)} 个文档")
print(f"过滤后: {len(filtered_docs)} 个文档")

完整示例 #

python
import os
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.readers.file import PDFReader
from llama_index.readers.web import SimpleWebPageReader

pdf_docs = SimpleDirectoryReader(
    input_dir="./pdfs",
    required_exts=[".pdf"],
).load_data()
print(f"PDF 文档: {len(pdf_docs)}")

web_docs = SimpleWebPageReader(html_to_text=True).load_data(
    urls=[
        "https://docs.llamaindex.ai/en/stable/",
    ]
)
print(f"网页文档: {len(web_docs)}")

all_documents = pdf_docs + web_docs

index = VectorStoreIndex.from_documents(all_documents)

query_engine = index.as_query_engine()
response = query_engine.query("总结所有文档的主要内容")
print(response)

下一步 #

了解数据加载后,接下来学习《文档与节点》,掌握文档处理和节点分割!

最后更新:2026-03-30