存储与持久化 #

概述 #

LlamaIndex 提供了灵活的存储抽象,支持向量存储、文档存储、索引存储等多种存储后端,可以轻松实现数据的持久化和高效检索。

text
┌─────────────────────────────────────────────────────────────┐
│                    存储架构                                  │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  StorageContext                                             │
│  ├── VectorStore        向量存储                            │
│  │   ├── Chroma                                            │
│  │   ├── Pinecone                                          │
│  │   ├── Qdrant                                            │
│  │   └── Weaviate                                          │
│  │                                                         │
│  ├── DocumentStore       文档存储                           │
│  │   ├── SimpleDocumentStore                               │
│  │   └── MongoDBDocumentStore                              │
│  │                                                         │
│  └── IndexStore          索引存储                           │
│      ├── SimpleIndexStore                                  │
│      └── RedisIndexStore                                   │
│                                                             │
└─────────────────────────────────────────────────────────────┘

本地持久化 #

基本持久化 #

python
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

documents = SimpleDirectoryReader("./data").load_data()
index = VectorStoreIndex.from_documents(documents)

index.storage_context.persist(persist_dir="./storage")

from llama_index.core import StorageContext, load_index_from_storage

storage_context = StorageContext.from_defaults(persist_dir="./storage")
index = load_index_from_storage(storage_context)

query_engine = index.as_query_engine()
response = query_engine.query("你的问题")

指定存储组件 #

python
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.storage.index_store import SimpleIndexStore
from llama_index.core.vector_stores import SimpleVectorStore

storage_context = StorageContext.from_defaults(
    docstore=SimpleDocumentStore(),
    index_store=SimpleIndexStore(),
    vector_store=SimpleVectorStore(),
)

index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
)

storage_context.persist(persist_dir="./storage")

向量存储 #

Chroma #

bash
pip install llama-index-vector-stores-chroma chromadb
python
import chromadb
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext
from llama_index.vector_stores.chroma import ChromaVectorStore

db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("my_collection")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

documents = SimpleDirectoryReader("./data").load_data()
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
)

db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_collection("my_collection")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
index = VectorStoreIndex.from_vector_store(vector_store)

Pinecone #

bash
pip install llama-index-vector-stores-pinecone pinecone-client
python
from pinecone import Pinecone
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.pinecone import PineconeVectorStore

pc = Pinecone(api_key="your-api-key")
pinecone_index = pc.Index("my-index")

vector_store = PineconeVectorStore(
    pinecone_index=pinecone_index,
    namespace="my-namespace",
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
)

index = VectorStoreIndex.from_vector_store(vector_store)

Qdrant #

bash
pip install llama-index-vector-stores-qdrant qdrant-client
python
from qdrant_client import QdrantClient
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore

client = QdrantClient(url="http://localhost:6333")

vector_store = QdrantVectorStore(
    client=client,
    collection_name="my_collection",
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
)

Weaviate #

bash
pip install llama-index-vector-stores-weaviate weaviate-client
python
import weaviate
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.weaviate import WeaviateVectorStore

client = weaviate.Client("http://localhost:8080")

vector_store = WeaviateVectorStore(
    weaviate_client=client,
    index_name="MyIndex",
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
)

Milvus #

bash
pip install llama-index-vector-stores-milvus pymilvus
python
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.milvus import MilvusVectorStore

vector_store = MilvusVectorStore(
    uri="http://localhost:19530",
    collection_name="my_collection",
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
)

Redis #

bash
pip install llama-index-vector-stores-redis redis
python
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.redis import RedisVectorStore

vector_store = RedisVectorStore(
    redis_url="redis://localhost:6379",
    index_name="my_index",
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
)

文档存储 #

SimpleDocumentStore #

python
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core import StorageContext

docstore = SimpleDocumentStore()
storage_context = StorageContext.from_defaults(docstore=docstore)

index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
)

doc = storage_context.docstore.get_document(doc_id)
all_docs = storage_context.docstore.docs

MongoDB Document Store #

bash
pip install llama-index-storage-docstore-mongodb pymongo
python
from llama_index.storage.docstore.mongodb import MongoDocumentStore
from llama_index.core import StorageContext

docstore = MongoDocumentStore.from_uri(
    uri="mongodb://localhost:27017",
    db_name="llamaindex",
    collection_name="documents",
)
storage_context = StorageContext.from_defaults(docstore=docstore)

索引存储 #

SimpleIndexStore #

python
from llama_index.core.storage.index_store import SimpleIndexStore
from llama_index.core import StorageContext

index_store = SimpleIndexStore()
storage_context = StorageContext.from_defaults(index_store=index_store)

Redis Index Store #

bash
pip install llama-index-storage-index-store-redis redis
python
from llama_index.storage.index_store.redis import RedisIndexStore
from llama_index.core import StorageContext

index_store = RedisIndexStore.from_host_and_port(
    host="localhost",
    port=6379,
)
storage_context = StorageContext.from_defaults(index_store=index_store)

增量更新 #

添加文档 #

python
from llama_index.core import VectorStoreIndex, Document

index = VectorStoreIndex.from_documents([
    Document(text="初始文档")
])

index.insert(Document(text="新文档 1"))
index.insert(Document(text="新文档 2"))

index.insert_nodes([node1, node2])

for doc in new_documents:
    index.insert(doc)

删除文档 #

python
index.delete_ref_doc(doc_id)

index.delete_nodes([node_id1, node_id2])

index.delete(doc_id)

更新文档 #

python
index.delete_ref_doc(old_doc_id)
index.insert(new_document)

缓存 #

嵌入缓存 #

python
from llama_index.core import VectorStoreIndex
from llama_index.core.storage.storage_context import StorageContext
from llama_index.core.storage.docstore import SimpleDocumentStore

storage_context = StorageContext.from_defaults(
    docstore=SimpleDocumentStore(),
)

index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    use_async=True,
)

Redis 缓存 #

python
from llama_index.storage.kvstore.redis import RedisKVStore

kvstore = RedisKVStore.from_host_and_port(
    host="localhost",
    port=6379,
)

kvstore.put("key", "value")
value = kvstore.get("key")

多索引管理 #

管理多个索引 #

python
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.core.storage.index_store import SimpleIndexStore

index_store = SimpleIndexStore()

storage_context1 = StorageContext.from_defaults(index_store=index_store)
index1 = VectorStoreIndex.from_documents(
    docs1,
    storage_context=storage_context1,
)
index1.set_index_id("index_1")

storage_context2 = StorageContext.from_defaults(index_store=index_store)
index2 = VectorStoreIndex.from_documents(
    docs2,
    storage_context=storage_context2,
)
index2.set_index_id("index_2")

index_store.persist(persist_dir="./index_storage")

加载特定索引 #

python
from llama_index.core import load_index_from_storage, StorageContext
from llama_index.core.storage.index_store import SimpleIndexStore

index_store = SimpleIndexStore.from_persist_dir("./index_storage")

index1 = load_index_from_storage(
    StorageContext.from_defaults(index_store=index_store),
    index_id="index_1",
)

index2 = load_index_from_storage(
    StorageContext.from_defaults(index_store=index_store),
    index_id="index_2",
)

完整示例 #

python
import os
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
    Settings,
    Document,
)
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore

os.environ["OPENAI_API_KEY"] = "sk-your-key"

Settings.llm = OpenAI(model="gpt-4o-mini")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("knowledge_base")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

documents = SimpleDirectoryReader("./data").load_data()
print(f"加载了 {len(documents)} 个文档")

index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    show_progress=True,
)
print("索引构建完成")

query_engine = index.as_query_engine()
response = query_engine.query("文档的主要内容是什么?")
print(f"\n回答: {response}")

new_doc = Document(text="这是新添加的文档内容。")
index.insert(new_doc)
print("新文档已添加")

db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_collection("knowledge_base")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
index = VectorStoreIndex.from_vector_store(vector_store)

query_engine = index.as_query_engine()
response = query_engine.query("新文档的内容是什么?")
print(f"\n回答: {response}")

向量存储对比 #

text
┌─────────────────────────────────────────────────────────────┐
│                    向量存储对比                              │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  存储类型        特点                    适用场景            │
│  ─────────────────────────────────────────────────────────  │
│  Chroma         开源、易用、本地存储     开发测试            │
│  Pinecone       云托管、高性能          生产环境            │
│  Qdrant         开源、高性能            自托管生产          │
│  Weaviate       开源、功能丰富          企业级应用          │
│  Milvus         开源、分布式            大规模数据          │
│  Redis          快速、易部署            缓存场景            │
│                                                             │
└─────────────────────────────────────────────────────────────┘

下一步 #

掌握存储与持久化后,接下来学习 文档问答系统 开始实战项目!

最后更新:2026-03-30