向量操作 #

向量结构 #

在 Pinecone 中，每个向量由三部分组成：

text

┌─────────────────────────────────────────────────────────────┐
│                    向量结构                                  │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  {                                                          │
│    "id": "doc-1",              // 唯一标识符                │
│    "values": [0.1, 0.2, ...],  // 向量值（浮点数数组）      │
│    "metadata": {               // 元数据（可选）            │
│      "title": "Document 1",                                 │
│      "category": "tech"                                     │
│    }                                                        │
│  }                                                          │
│                                                             │
│  字段说明：                                                  │
│  - id: 字符串，最大 512 字符                                │
│  - values: 浮点数数组，长度必须与索引维度匹配               │
│  - metadata: 键值对，可选                                   │
│                                                             │
└─────────────────────────────────────────────────────────────┘

插入向量 #

单个向量插入 #

python

from pinecone import Pinecone

pc = Pinecone(api_key="your-api-key")
index = pc.Index("my-index")

index.upsert(
    vectors=[
        ("doc-1", [0.1, 0.2, 0.3, ...], {"title": "Document 1"})
    ]
)

print("向量插入成功")

使用字典格式 #

python

index.upsert(
    vectors=[
        {
            "id": "doc-1",
            "values": [0.1, 0.2, 0.3, ...],
            "metadata": {"title": "Document 1", "category": "tech"}
        }
    ]
)

批量插入 #

python

vectors = [
    ("doc-1", [0.1, 0.2, ...], {"title": "Document 1"}),
    ("doc-2", [0.3, 0.4, ...], {"title": "Document 2"}),
    ("doc-3", [0.5, 0.6, ...], {"title": "Document 3"}),
]

index.upsert(vectors=vectors)

print(f"成功插入 {len(vectors)} 个向量")

大规模批量插入 #

text

┌─────────────────────────────────────────────────────────────┐
│                    批量插入限制                              │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  每次请求限制：                                              │
│  - 最多 1000 个向量                                         │
│  - 总大小不超过 2MB                                         │
│                                                             │
│  大规模数据插入策略：                                        │
│  - 分批插入，每批 100-1000 个向量                           │
│  - 使用异步操作提高效率                                     │
│  - 监控插入进度                                             │
│                                                             │
└─────────────────────────────────────────────────────────────┘

python

import itertools

def batch_upsert(index, vectors, batch_size=100):
    """Upsert ``vectors`` into ``index`` in chunks of at most ``batch_size``.

    Args:
        index: Object exposing ``upsert(vectors=...)``.
        vectors: Any iterable of records. Lists and generators both work, so
            a large dataset does not have to be materialized up front.
        batch_size: Maximum number of records per ``upsert`` call (>= 1).

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Only sized inputs can report "done/total"; other iterables report a
    # running count instead.
    total = len(vectors) if hasattr(vectors, "__len__") else None

    remaining = iter(vectors)
    inserted = 0
    while True:
        # islice pulls at most batch_size items without needing slicing support.
        batch = list(itertools.islice(remaining, batch_size))
        if not batch:
            break
        index.upsert(vectors=batch)
        inserted += len(batch)
        if total is None:
            print(f"已插入 {inserted} 个向量")
        else:
            print(f"已插入 {inserted}/{total} 个向量")

vectors = [
    (f"doc-{i}", [0.1 * i, 0.2 * i, ...], {"id": i})
    for i in range(10000)
]

batch_upsert(index, vectors, batch_size=100)

更新向量 #

使用 upsert 更新 #

python

index.upsert(
    vectors=[
        ("doc-1", [0.7, 0.8, 0.9, ...], {"title": "Updated Document 1"})
    ]
)

print("向量更新成功")

text

┌─────────────────────────────────────────────────────────────┐
│                    upsert 行为说明                           │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  upsert = update + insert                                   │
│                                                             │
│  如果 ID 不存在：插入新向量                                 │
│  如果 ID 已存在：更新现有向量                               │
│                                                             │
│  更新行为：                                                  │
│  - 完全替换原有向量值                                       │
│  - 完全替换原有元数据                                       │
│  - upsert 不能部分更新；只改部分字段请用 update 操作        │
│                                                             │
└─────────────────────────────────────────────────────────────┘

更新元数据 #

python

# Change only the metadata of doc-1. update() sends just the metadata fields,
# so there is no fetch-then-upsert round trip: that approach re-sends the whole
# vector, fails with KeyError when the ID is missing, and can overwrite a
# concurrent write made between the fetch and the upsert.
# NOTE: set_metadata merges into the stored metadata; keys not listed here are
# kept. Use upsert when the metadata must be replaced wholesale.
index.update(
    id="doc-1",
    set_metadata={"title": "New Title", "updated": True}
)

删除向量 #

删除单个向量 #

python

index.delete(ids=["doc-1"])

print("向量已删除")

删除多个向量 #

python

index.delete(ids=["doc-1", "doc-2", "doc-3"])

print("多个向量已删除")

删除命名空间中的所有向量 #

python

index.delete(delete_all=True, namespace="my-namespace")

print("命名空间中的所有向量已删除")

按元数据删除 #

python

index.delete(
    filter={"category": {"$eq": "obsolete"}},
    namespace="my-namespace"
)

print("符合条件的向量已删除")

查询向量 #

获取单个向量 #

python

# Fetch by ID; the response's "vectors" mapping is indexed by vector ID below.
result = index.fetch(ids=["doc-1"])

# Membership test guards the lookup (presumably IDs that do not exist are
# simply absent from the mapping — confirm against the SDK docs).
if "doc-1" in result["vectors"]:
    vector = result["vectors"]["doc-1"]
    print(f"ID: {vector.id}")
    # Show only the first five components to keep the output short.
    print(f"Values: {vector.values[:5]}...")
    print(f"Metadata: {vector.metadata}")

获取多个向量 #

python

# Fetch several records in one request.
result = index.fetch(ids=["doc-1", "doc-2", "doc-3"])

# The loop variable is vector_id rather than id so that the built-in id() is
# not rebound at module level for the rest of the script.
for vector_id, vector in result["vectors"].items():
    print(f"ID: {vector_id}")
    print(f"Metadata: {vector.metadata}")
    print("---")

向量操作完整示例 #

python

import os
from pinecone import Pinecone

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index = pc.Index("my-index")

def insert_sample_vectors():
    """Upsert three sample 3-component records into the module-level index.

    Prints how many records were sent once the upsert call returns.
    """
    # (id, values, title, category) rows; the metadata dict is built below.
    samples = (
        ("doc-1", [0.1, 0.2, 0.3], "Python 教程", "programming"),
        ("doc-2", [0.4, 0.5, 0.6], "JavaScript 教程", "programming"),
        ("doc-3", [0.7, 0.8, 0.9], "机器学习入门", "ai"),
    )
    records = [
        (doc_id, embedding, {"title": title, "category": category})
        for doc_id, embedding, title, category in samples
    ]

    index.upsert(vectors=records)
    print(f"成功插入 {len(records)} 个向量")

def query_vectors():
    """Fetch doc-1 and doc-2 by ID and print each returned title and category."""
    fetched = index.fetch(ids=["doc-1", "doc-2"])["vectors"]

    for vector_id, record in fetched.items():
        print(f"ID: {vector_id}")
        print(f"  标题: {record.metadata.get('title')}")
        print(f"  类别: {record.metadata.get('category')}")

def update_vector():
    """Upsert new values and metadata under the existing ID doc-1."""
    new_metadata = {"title": "Python 高级教程", "category": "programming", "level": "advanced"}
    replacement = ("doc-1", [0.2, 0.3, 0.4], new_metadata)

    index.upsert(vectors=[replacement])
    print("向量已更新")

def delete_vectors():
    """Delete the record with ID doc-3 and print a confirmation."""
    doomed_ids = ["doc-3"]
    index.delete(ids=doomed_ids)
    print("向量已删除")

def show_stats():
    """Print the total vector count reported by describe_index_stats()."""
    total = index.describe_index_stats().total_vector_count
    print(f"向量总数: {total}")

if __name__ == "__main__":
    # Walk through insert -> stats -> fetch -> update -> delete -> stats in order.
    demo_steps = (
        insert_sample_vectors,
        show_stats,
        query_vectors,
        update_vector,
        delete_vectors,
        show_stats,
    )
    for step in demo_steps:
        step()

Node.js 示例 #

javascript

const { Pinecone } = require('@pinecone-database/pinecone');

/**
 * Run an upsert -> fetch -> update -> delete -> stats round trip against
 * the index named "my-index".
 *
 * The API key is read from the PINECONE_API_KEY environment variable.
 * The returned promise rejects if any awaited SDK call fails.
 */
async function vectorOperations() {
  const pc = new Pinecone({
    apiKey: process.env.PINECONE_API_KEY
  });

  const index = pc.index('my-index');

  // Insert three sample records.
  await index.upsert([
    { id: 'doc-1', values: [0.1, 0.2, 0.3], metadata: { title: 'Document 1' } },
    { id: 'doc-2', values: [0.4, 0.5, 0.6], metadata: { title: 'Document 2' } },
    { id: 'doc-3', values: [0.7, 0.8, 0.9], metadata: { title: 'Document 3' } }
  ]);

  console.log('向量插入成功');

  // Fetch two of them back by ID; the result exposes them under `records`.
  const fetchResult = await index.fetch(['doc-1', 'doc-2']);
  console.log('获取向量:', Object.keys(fetchResult.records));

  // Update only the metadata of doc-1 (no values are sent).
  await index.update({
    id: 'doc-1',
    metadata: { title: 'Updated Document 1', updated: true }
  });
  console.log('向量更新成功');

  await index.deleteMany(['doc-3']);
  console.log('向量删除成功');

  const stats = await index.describeIndexStats();
  console.log('向量总数:', stats.totalRecordCount);
}

// Report failures explicitly instead of leaving an unhandled promise rejection.
vectorOperations().catch((error) => {
  console.error('向量操作失败:', error);
  process.exitCode = 1;
});

最佳实践 #

ID 设计 #

text

┌─────────────────────────────────────────────────────────────┐
│                    ID 设计建议                               │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  推荐格式：                                                  │
│  - {type}_{source}_{unique_id}                             │
│                                                             │
│  示例：                                                     │
│  - doc_wikipedia_12345                                     │
│  - product_amazon_B08N5KWB9H                               │
│  - user_profile_abc123                                     │
│                                                             │
│  注意事项：                                                  │
│  ✅ 使用有意义的标识符                                      │
│  ✅ 保持格式一致                                            │
│  ✅ 避免特殊字符                                            │
│  ❌ 不要使用敏感信息                                        │
│                                                             │
└─────────────────────────────────────────────────────────────┘

元数据设计 #

text

┌─────────────────────────────────────────────────────────────┐
│                    元数据设计建议                            │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  推荐结构：                                                  │
│  {                                                          │
│    "title": "文档标题",        // 用于展示                  │
│    "category": "分类",          // 用于过滤                 │
│    "source": "来源",            // 用于追踪                 │
│    "created_at": "2024-01-15", // 用于排序                  │
│    "tags": ["tag1", "tag2"]    // 用于多值过滤              │
│  }                                                          │
│                                                             │
│  限制：                                                     │
│  - 单个元数据最大 40KB                                      │
│  - 字符串值最大 1000 字符                                   │
│  - 列表最多 1000 个元素                                     │
│                                                             │
│  性能建议：                                                  │
│  - 只存储必要的元数据                                       │
│  - 避免存储大文本                                           │
│  - 使用合适的数据类型                                       │
│                                                             │
└─────────────────────────────────────────────────────────────┘

批量操作优化 #

python

import asyncio
from pinecone import Pinecone

async def async_batch_upsert(index, vectors, batch_size=100):
    """Upsert ``vectors`` in batches that run concurrently on worker threads.

    ``index.upsert`` is invoked as a plain blocking call. Calling it directly
    inside a coroutine would run the batches one after another and stall the
    event loop, so each batch is handed to the loop's default executor and
    the resulting futures are awaited together.

    NOTE(review): assumes the index client tolerates concurrent ``upsert``
    calls from multiple threads — confirm for the SDK version in use.

    Args:
        index: Object exposing a blocking ``upsert(vectors=...)`` method.
        vectors: Sliceable sequence of records (``len`` and slicing are used).
        batch_size: Maximum number of records per ``upsert`` call.

    Raises:
        Exception: The first exception raised by any batch is propagated.
    """
    loop = asyncio.get_running_loop()

    def submit(batch):
        # ``batch`` is a parameter, so each executor job sees its own slice.
        return loop.run_in_executor(None, lambda: index.upsert(vectors=batch))

    pending = [
        submit(vectors[start:start + batch_size])
        for start in range(0, len(vectors), batch_size)
    ]
    await asyncio.gather(*pending)

    print(f"成功插入 {len(vectors)} 个向量")

下一步 #

现在你已经掌握了向量操作，接下来学习查询与搜索，了解如何进行向量相似性搜索！