DynamoDB Scan扫描 #
一、Scan概述 #
1.1 Scan特点 #
text
Scan特点:
├── 扫描整个表或索引
├── 不需要指定主键
├── 消耗大量RCU
├── 适合导出、分析等场景
└── 应尽量避免在生产中使用
1.2 Scan vs Query #
| 特性 | Scan | Query |
|---|---|---|
| 查询方式 | 全表扫描 | 基于索引 |
| 效率 | 低 | 高 |
| 成本 | 高 | 低 |
| 分区键 | 不需要 | 必须指定 |
| 适用场景 | 导出、分析 | 业务查询 |
二、基本扫描 #
2.1 使用CLI #
bash
# Run a Scan against the Users table with no filter or projection.
aws dynamodb scan \
--table-name Users
2.2 使用JavaScript SDK #
javascript
const { DynamoDBClient } = require('@aws-sdk/client-dynamodb');
const { DynamoDBDocumentClient, ScanCommand } = require('@aws-sdk/lib-dynamodb');
// Build a document client on top of the base DynamoDB client for us-east-1.
const client = new DynamoDBClient({ region: 'us-east-1' });
const docClient = DynamoDBDocumentClient.from(client);

// Scan the Users table with no filter and print what this response contains.
const scanUsers = new ScanCommand({ TableName: 'Users' });
const { Count, Items } = await docClient.send(scanUsers);
console.log(`Total items: ${Count}`);
console.log('Items:', Items);
2.3 使用Python SDK #
python
import boto3
# Resolve the Users table through the boto3 resource API and scan it once.
users_table = boto3.resource('dynamodb').Table('Users')
result = users_table.scan()
print(f"Total items: {result['Count']}")
print("Items:", result['Items'])
三、过滤表达式 #
3.1 基本过滤 #
javascript
// Keep only the items whose Age is greater than the bound :minAge value.
const scanInput = {
  TableName: 'Users',
  FilterExpression: 'Age > :minAge',
  ExpressionAttributeValues: { ':minAge': 25 }
};
const response = await docClient.send(new ScanCommand(scanInput));
3.2 过滤条件类型 #
javascript
// Comparison operators
FilterExpression: 'Age > :minAge'
// Status is a DynamoDB reserved word: reference it through a placeholder and pass
// ExpressionAttributeNames: { '#status': 'Status' } with the request.
FilterExpression: '#status = :status'
FilterExpression: 'Price BETWEEN :min AND :max'
// Logical operators
FilterExpression: '#status = :status AND Age > :minAge'
FilterExpression: '#status = :status OR #status = :status2'
FilterExpression: 'NOT IsDeleted = :deleted'
// Attribute checks
FilterExpression: 'attribute_exists(Email)'
FilterExpression: 'attribute_not_exists(DeletedAt)'
FilterExpression: 'attribute_type(Age, :type)'
// String operations. Name is also a reserved word, so it needs
// ExpressionAttributeNames: { '#name': 'Name' }.
FilterExpression: 'begins_with(#name, :prefix)'
FilterExpression: 'contains(#name, :substring)'
// Set membership
FilterExpression: 'contains(Tags, :tag)'
// Size check
FilterExpression: 'size(Description) < :maxSize'
// IN operator
FilterExpression: '#status IN (:s1, :s2, :s3)'
3.3 组合过滤 #
javascript
// Three conditions joined with AND: active flag, minimum age, and no DeletedAt attribute.
const combinedFilter = `
IsActive = :active
AND Age >= :minAge
AND attribute_not_exists(DeletedAt)
`;
const filterValues = { ':active': true, ':minAge': 18 };
const response = await docClient.send(new ScanCommand({
  TableName: 'Users',
  FilterExpression: combinedFilter,
  ExpressionAttributeValues: filterValues
}));
3.4 过滤注意事项 #
text
重要提示:
├── FilterExpression在扫描后应用
├── 不减少RCU消耗
├── 单页可能返回空结果(即使后续页仍有匹配数据)
└── 需要处理分页
四、投影 #
4.1 指定返回属性 #
javascript
// Name is a DynamoDB reserved word, so the projection refers to it through the
// #name placeholder declared in ExpressionAttributeNames.
const response = await docClient.send(new ScanCommand({
  TableName: 'Users',
  ProjectionExpression: 'UserId, #name, Email',
  ExpressionAttributeNames: { '#name': 'Name' }
}));
4.2 嵌套属性投影 #
javascript
// Address.City selects a nested attribute. Name is a DynamoDB reserved word, so it
// is referenced through the #name placeholder declared in ExpressionAttributeNames.
const response = await docClient.send(new ScanCommand({
  TableName: 'Users',
  ProjectionExpression: 'UserId, #name, Address.City',
  ExpressionAttributeNames: { '#name': 'Name' }
}));
4.3 投影优势 #
text
投影优势:
├── 减少数据传输量
├── 降低网络延迟
├── 减少客户端内存使用
└── 注意:投影不会减少RCU消耗(RCU按读取的完整项目大小计算)
五、分页 #
5.1 使用Limit #
javascript
// Send one Scan request with Limit set to 100, then report whether a continuation key came back.
const response = await docClient.send(new ScanCommand({ TableName: 'Users', Limit: 100 }));
const hasMore = Boolean(response.LastEvaluatedKey);
console.log(`Items: ${response.Items.length}`);
console.log(`Has more: ${hasMore}`);
5.2 使用ExclusiveStartKey #
javascript
/**
 * Scans every page for the given request and returns all items in one array.
 *
 * Relies on `docClient` and `ScanCommand` being in scope.
 *
 * @param {object} params - ScanCommand input (TableName, Limit, FilterExpression, ...).
 *   Any ExclusiveStartKey in it is overridden by the pagination loop.
 * @returns {Promise<object[]>} The items of all pages, in the order they were returned.
 */
async function scanAll(params) {
  const items = [];
  // Leave the start key undefined on the first request: null is not a valid key map.
  let lastKey;
  do {
    const response = await docClient.send(new ScanCommand({
      ...params,
      ExclusiveStartKey: lastKey
    }));
    // Tolerate a response that carries no Items array.
    items.push(...(response.Items || []));
    lastKey = response.LastEvaluatedKey;
  } while (lastKey);
  return items;
}
// Usage example: read every page of the Users table with Limit set to 100 per request.
const scanRequest = { TableName: 'Users', Limit: 100 };
const allUsers = await scanAll(scanRequest);
5.3 分页工具函数 #
javascript
/**
 * Async generator that yields one array of items per Scan page, so callers can
 * process a table batch by batch instead of holding it all in memory.
 *
 * Relies on `docClient` and `ScanCommand` being in scope.
 *
 * @param {object} params - ScanCommand input; any ExclusiveStartKey in it is
 *   overridden by the pagination loop.
 * @yields {object[]} The items of one page (an empty array when the page has none).
 */
async function* scanPaginator(params) {
  // Leave the start key undefined on the first request: null is not a valid key map.
  let lastKey;
  do {
    const response = await docClient.send(new ScanCommand({
      ...params,
      ExclusiveStartKey: lastKey
    }));
    // Always hand the consumer an iterable, even when the page carries no Items.
    yield response.Items || [];
    lastKey = response.LastEvaluatedKey;
  } while (lastKey);
}
// Usage example: walk the Users table page by page and print each item's UserId.
const userPages = scanPaginator({ TableName: 'Users', Limit: 100 });
for await (const batch of userPages) {
  batch.forEach((item) => {
    console.log(item.UserId);
  });
}
六、并行扫描 #
6.1 并行扫描概念 #
text
并行扫描:
├── 将表分成多个段
├── 多个线程/进程同时扫描
├── 提高扫描速度
└── 单位时间内消耗更多RCU(总消耗量不变,但更集中)
6.2 实现并行扫描 #
javascript
/**
 * Scans a table with several segment workers running concurrently and returns
 * the items of all segments in one array (segment 0 first).
 *
 * Relies on `docClient` and `ScanCommand` being in scope.
 *
 * @param {string} tableName - Table to scan.
 * @param {object} [options]
 * @param {number} [options.segments=4] - Number of segments; must be a positive integer.
 * @param {string} [options.filterExpression] - Optional FilterExpression.
 * @param {object} [options.expressionAttributeValues] - Values for the filter;
 *   only sent when filterExpression is set.
 * @returns {Promise<object[]>} All items from every segment.
 * @throws {RangeError} When segments is not a positive integer.
 */
async function parallelScan(tableName, options = {}) {
  const { segments = 4, filterExpression, expressionAttributeValues } = options;

  // Zero or fractional segments would otherwise scan nothing (or an invalid
  // segment layout) and look like an empty table.
  if (!Number.isInteger(segments) || segments < 1) {
    throw new RangeError(`segments must be a positive integer, got ${segments}`);
  }

  // Pages through a single segment until no continuation key is returned.
  const scanSegment = async (segment) => {
    const items = [];
    // Leave the start key undefined on the first request: null is not a valid key map.
    let lastKey;
    do {
      const params = {
        TableName: tableName,
        Segment: segment,
        TotalSegments: segments,
        ExclusiveStartKey: lastKey
      };
      if (filterExpression) {
        params.FilterExpression = filterExpression;
        params.ExpressionAttributeValues = expressionAttributeValues;
      }
      const response = await docClient.send(new ScanCommand(params));
      // Tolerate a response that carries no Items array.
      items.push(...(response.Items || []));
      lastKey = response.LastEvaluatedKey;
    } while (lastKey);
    return items;
  };

  const results = await Promise.all(
    Array.from({ length: segments }, (_, segment) => scanSegment(segment))
  );
  return results.flat();
}
// Usage example: scan Users in 8 segments, keeping only items whose IsActive is true.
const parallelOptions = {
  segments: 8,
  filterExpression: 'IsActive = :active',
  expressionAttributeValues: { ':active': true }
};
const allItems = await parallelScan('Users', parallelOptions);
6.3 并行扫描最佳实践 #
text
最佳实践:
├── 根据表大小选择分段数
├── 监控RCU消耗
├── 避免在高峰期使用
├── 使用指数退避处理限流
└── 考虑使用预置容量
七、扫描索引 #
7.1 扫描全局二级索引 #
javascript
// Scan the EmailIndex index of the Users table instead of the base table.
const indexScanInput = { TableName: 'Users', IndexName: 'EmailIndex' };
const response = await docClient.send(new ScanCommand(indexScanInput));
7.2 索引扫描优势 #
text
索引扫描优势:
├── 投影属性更少,数据量更小
├── 可能有更好的性能
└── 适合特定属性查询
八、一致性读取 #
8.1 强一致性扫描 #
javascript
// Request a strongly consistent read for this scan via ConsistentRead.
const consistentScanInput = { TableName: 'Users', ConsistentRead: true };
const response = await docClient.send(new ScanCommand(consistentScanInput));
8.2 一致性选择 #
text
一致性选择:
├── 最终一致性(默认)
│ ├── 每4KB消耗0.5 RCU
│ └── 可能读到旧数据
└── 强一致性
├── 每4KB消耗1 RCU
└── 保证读到最新数据
九、实用示例 #
9.1 导出所有数据 #
javascript
/**
 * Reads a whole table page by page (Limit 100 per request) and returns every
 * item, logging the running total after each page.
 *
 * Relies on `docClient` and `ScanCommand` being in scope.
 *
 * @param {string} tableName - Table to export.
 * @returns {Promise<object[]>} All items of the table.
 */
async function exportAllData(tableName) {
  const items = [];
  // Leave the start key undefined on the first request: null is not a valid key map.
  let lastKey;
  do {
    const response = await docClient.send(new ScanCommand({
      TableName: tableName,
      ExclusiveStartKey: lastKey,
      Limit: 100
    }));
    // Tolerate a response that carries no Items array.
    items.push(...(response.Items || []));
    lastKey = response.LastEvaluatedKey;
    // items.length is the running total, so no separate counter is needed.
    console.log(`Exported ${items.length} items...`);
  } while (lastKey);
  return items;
}
9.2 统计数据 #
javascript
/**
 * Counts the items of a table, optionally restricted by a filter, by summing
 * the Count of every Scan page requested with Select: 'COUNT'.
 *
 * Relies on `docClient` and `ScanCommand` being in scope.
 *
 * @param {string} tableName - Table to count.
 * @param {string} [filterExpression] - Optional FilterExpression.
 * @param {object} [expressionAttributeValues] - Values for the filter; only
 *   sent when filterExpression is set.
 * @returns {Promise<number>} Sum of the per-page counts.
 */
async function countItems(tableName, filterExpression, expressionAttributeValues) {
  const params = {
    TableName: tableName,
    Select: 'COUNT'
  };
  if (filterExpression) {
    params.FilterExpression = filterExpression;
    params.ExpressionAttributeValues = expressionAttributeValues;
  }

  let count = 0;
  // Leave the start key undefined on the first request: null is not a valid key map.
  let lastKey;
  do {
    const response = await docClient.send(new ScanCommand({
      ...params,
      ExclusiveStartKey: lastKey
    }));
    count += response.Count;
    lastKey = response.LastEvaluatedKey;
  } while (lastKey);
  return count;
}
// Usage example: count the users whose IsActive attribute is true.
const activeFilterValues = { ':active': true };
const activeUsers = await countItems('Users', 'IsActive = :active', activeFilterValues);
console.log(`Active users: ${activeUsers}`);
9.3 批量更新 #
javascript
// Requires UpdateCommand from '@aws-sdk/lib-dynamodb' in addition to ScanCommand.
/**
 * Sets Status (and UpdatedAt) on every item of a table whose key is PK + SK.
 * Scans 25 keys at a time and updates each batch concurrently.
 *
 * Relies on `docClient`, `ScanCommand` and `UpdateCommand` being in scope.
 *
 * @param {string} tableName - Table to update.
 * @param {*} newStatus - Value written to the Status attribute.
 * @returns {Promise<number>} Number of items updated.
 */
async function batchUpdateStatus(tableName, newStatus) {
  // Leave the start key undefined on the first request: null is not a valid key map.
  let lastKey;
  let updated = 0;
  do {
    const scanResponse = await docClient.send(new ScanCommand({
      TableName: tableName,
      ExclusiveStartKey: lastKey,
      Limit: 25,
      // Only the key attributes are needed to address each update.
      ProjectionExpression: 'PK, SK'
    }));
    // Tolerate a response that carries no Items array.
    const keys = scanResponse.Items || [];
    if (keys.length > 0) {
      const updatePromises = keys.map(item =>
        docClient.send(new UpdateCommand({
          TableName: tableName,
          Key: { PK: item.PK, SK: item.SK },
          // Status is a DynamoDB reserved word, so it must go through a
          // placeholder; using it directly fails expression validation.
          UpdateExpression: 'SET #status = :status, UpdatedAt = :now',
          ExpressionAttributeNames: { '#status': 'Status' },
          ExpressionAttributeValues: {
            ':status': newStatus,
            ':now': new Date().toISOString()
          }
        }))
      );
      await Promise.all(updatePromises);
      updated += keys.length;
      console.log(`Updated ${updated} items...`);
    }
    lastKey = scanResponse.LastEvaluatedKey;
  } while (lastKey);
  return updated;
}
9.4 数据迁移 #
javascript
/**
 * Copies a table into another one, 25 items per page, passing every item
 * through `transformer` before it is written.
 *
 * Relies on `docClient`, `ScanCommand` and a `batchWriteItems(table, items)`
 * helper being in scope; the helper is not defined in this section.
 *
 * @param {string} sourceTable - Table to read from.
 * @param {string} targetTable - Table handed to batchWriteItems.
 * @param {Function} transformer - Called through Array.prototype.map for each item.
 * @returns {Promise<number>} Number of source items migrated.
 */
async function migrateData(sourceTable, targetTable, transformer) {
  // Leave the start key undefined on the first request: null is not a valid key map.
  let lastKey;
  let migrated = 0;
  do {
    const response = await docClient.send(new ScanCommand({
      TableName: sourceTable,
      ExclusiveStartKey: lastKey,
      Limit: 25
    }));
    // Tolerate a response that carries no Items array.
    const sourceItems = response.Items || [];
    if (sourceItems.length > 0) {
      const transformedItems = sourceItems.map(transformer);
      await batchWriteItems(targetTable, transformedItems);
      migrated += sourceItems.length;
      console.log(`Migrated ${migrated} items...`);
    }
    lastKey = response.LastEvaluatedKey;
  } while (lastKey);
  return migrated;
}
十、性能优化 #
10.1 减少扫描数据量 #
text
优化方法:
├── 使用FilterExpression过滤(仅减少返回的数据量,不减少扫描量和RCU)
├── 使用ProjectionExpression投影(仅减少返回的数据量)
├── 使用Limit限制每次请求评估的项目数
└── 扫描索引而非表
10.2 并行扫描优化 #
javascript
/**
 * Parallel scan with per-request exponential backoff on throttling.
 *
 * Each segment is paged independently. A request that fails with
 * ProvisionedThroughputExceededException is retried after
 * 2^attempt * baseDelayMs; any other error, or running out of retries,
 * rejects the whole scan.
 *
 * Relies on `docClient` and `ScanCommand` being in scope.
 *
 * @param {string} tableName - Table to scan.
 * @param {object} [options]
 * @param {number} [options.segments=4] - Number of segments; must be a positive integer.
 * @param {number} [options.limit=100] - Limit sent with every request.
 * @param {string} [options.filterExpression]
 * @param {string} [options.projectionExpression]
 * @param {object} [options.expressionAttributeValues]
 * @param {number} [options.maxRetries=5] - Retries allowed per request.
 * @param {number} [options.baseDelayMs=100] - Base of the backoff delay.
 * @returns {Promise<object[]>} All items from every segment (segment 0 first).
 * @throws {RangeError} When segments is not a positive integer.
 */
async function optimizedParallelScan(tableName, options = {}) {
  const {
    segments = 4,
    limit = 100,
    filterExpression,
    projectionExpression,
    expressionAttributeValues,
    maxRetries = 5,
    baseDelayMs = 100
  } = options;

  // Zero or fractional segments would otherwise scan nothing (or an invalid
  // segment layout) and look like an empty table.
  if (!Number.isInteger(segments) || segments < 1) {
    throw new RangeError(`segments must be a positive integer, got ${segments}`);
  }

  // Sends one request, retrying only on throttling. The retry lives in its own
  // loop so that a throttled first page (no continuation key yet) cannot end
  // the segment's pagination loop and silently drop the segment.
  const sendWithRetry = async (params) => {
    for (let attempt = 0; ; attempt++) {
      try {
        return await docClient.send(new ScanCommand(params));
      } catch (error) {
        const isThrottled = error.name === 'ProvisionedThroughputExceededException';
        if (!isThrottled || attempt >= maxRetries) {
          throw error;
        }
        const delayMs = Math.pow(2, attempt + 1) * baseDelayMs;
        await new Promise(resolve => setTimeout(resolve, delayMs));
      }
    }
  };

  // Pages through a single segment until no continuation key is returned.
  const scanSegment = async (segment) => {
    const items = [];
    // Leave the start key undefined on the first request: null is not a valid key map.
    let lastKey;
    do {
      const params = {
        TableName: tableName,
        Segment: segment,
        TotalSegments: segments,
        Limit: limit,
        ExclusiveStartKey: lastKey
      };
      if (filterExpression) params.FilterExpression = filterExpression;
      if (projectionExpression) params.ProjectionExpression = projectionExpression;
      if (expressionAttributeValues) {
        params.ExpressionAttributeValues = expressionAttributeValues;
      }
      const response = await sendWithRetry(params);
      // Tolerate a response that carries no Items array.
      items.push(...(response.Items || []));
      lastKey = response.LastEvaluatedKey;
    } while (lastKey);
    return items;
  };

  const promises = Array.from({ length: segments }, (_, i) => scanSegment(i));
  const results = await Promise.all(promises);
  return results.flat();
}
10.3 RCU消耗计算 #
text
RCU计算:
├── 最终一致性:0.5 RCU / 4KB
├── 强一致性:1 RCU / 4KB
└── 按扫描的数据总量计算,向上取整
示例:
├── 扫描1000个项目,每个2KB(共2000KB)
├── 强一致性:2000 / 4 = 500 RCU
├── 最终一致性:500 / 2 = 250 RCU
└── FilterExpression不影响RCU
十一、最佳实践 #
11.1 避免使用Scan的场景 #
text
应避免:
├── 业务查询(使用Query)
├── 高频查询(使用索引)
├── 实时查询(使用缓存)
└── 大表扫描(使用并行扫描或导出)
11.2 适合使用Scan的场景 #
text
适合场景:
├── 数据导出
├── 数据迁移
├── 批量处理
├── 统计分析
├── 后台任务
└── 低频管理操作
11.3 扫描建议 #
text
建议:
├── 在低峰期执行
├── 使用Limit控制速率
├── 使用并行扫描加速
├── 监控RCU消耗
├── 使用投影减少数据传输
└── 实现重试机制
十二、总结 #
Scan操作要点:
| 特性 | 说明 |
|---|---|
| 基本功能 | 全表扫描 |
| 过滤 | FilterExpression |
| 投影 | ProjectionExpression |
| 分页 | Limit + ExclusiveStartKey |
| 并行 | Segment + TotalSegments |
| 成本 | 消耗大量RCU |
下一步,让我们学习高级查询!
最后更新:2026-03-27