DynamoDB监控与告警 #
一、监控概述 #
1.1 监控维度 #
text
DynamoDB监控维度:
├── 性能指标
│ ├── 延迟
│ ├── 吞吐量
│ └── 错误率
├── 容量指标
│ ├── 消耗容量
│ ├── 限流事件
│ └── 利用率
├── 存储指标
│ ├── 表大小
│ ├── 项目数量
│ └── 索引大小
└── 运维指标
  ├── 备份状态
  ├── TTL状态
  └── 复制延迟
1.2 监控工具 #
| 工具 | 用途 |
|---|---|
| CloudWatch Metrics | 指标监控 |
| CloudWatch Alarms | 告警配置 |
| CloudWatch Logs | 日志分析 |
| CloudTrail | API审计 |
| X-Ray | 请求追踪 |
二、关键指标 #
2.1 性能指标 #
延迟指标:
text
延迟指标:
├── SuccessfulRequestLatency
│ └── 成功请求延迟
├── 按操作类型分类
│ ├── GetItem
│ ├── PutItem
│ ├── Query
│ ├── Scan
│ └── UpdateItem
└── 单位:毫秒
监控示例:
javascript
const { CloudWatchClient, GetMetricStatisticsCommand } = require('@aws-sdk/client-cloudwatch');
const cloudwatch = new CloudWatchClient({ region: 'us-east-1' });
/**
 * Fetches the last hour of SuccessfulRequestLatency datapoints (5-minute
 * periods) for one operation on one table.
 *
 * CloudWatch's GetMetricStatistics accepts percentiles such as p99 only via
 * `ExtendedStatistics`, and a single call may carry either `Statistics` or
 * `ExtendedStatistics`, not both. Two requests are therefore issued in
 * parallel and their datapoints are merged by timestamp.
 *
 * @param {string} tableName - Value for the TableName dimension.
 * @param {string} operation - Value for the Operation dimension (e.g. 'GetItem').
 * @returns {Promise<Array<object>>} Datapoints sorted by ascending timestamp;
 *   each carries whatever both responses returned for that timestamp
 *   (Average, Maximum, and ExtendedStatistics.p99 when CloudWatch provides them).
 */
async function getLatencyMetrics(tableName, operation) {
  const endTime = new Date();
  const baseInput = {
    Namespace: 'AWS/DynamoDB',
    MetricName: 'SuccessfulRequestLatency',
    Dimensions: [
      { Name: 'TableName', Value: tableName },
      { Name: 'Operation', Value: operation }
    ],
    StartTime: new Date(endTime.getTime() - 3600000),
    EndTime: endTime,
    Period: 300
  };

  const [standard, percentile] = await Promise.all([
    cloudwatch.send(new GetMetricStatisticsCommand({
      ...baseInput,
      Statistics: ['Average', 'Maximum']
    })),
    cloudwatch.send(new GetMetricStatisticsCommand({
      ...baseInput,
      ExtendedStatistics: ['p99']
    }))
  ]);

  // Merge the two result sets so each timestamp yields a single datapoint.
  const byTimestamp = new Map();
  const allPoints = [...(standard.Datapoints ?? []), ...(percentile.Datapoints ?? [])];
  for (const point of allPoints) {
    const key = new Date(point.Timestamp).getTime();
    byTimestamp.set(key, { ...byTimestamp.get(key), ...point });
  }

  // CloudWatch does not guarantee datapoint order, so sort chronologically.
  return [...byTimestamp.entries()]
    .sort(([a], [b]) => a - b)
    .map(([, point]) => point);
}
2.2 容量指标 #
吞吐量指标:
text
吞吐量指标:
├── ConsumedReadCapacityUnits
│ └── 消耗的RCU
├── ConsumedWriteCapacityUnits
│ └── 消耗的WCU
├── ReadThrottleEvents
│ └── 读取限流事件
└── WriteThrottleEvents
  └── 写入限流事件
监控示例:
javascript
/**
 * Retrieves the last hour of consumed-capacity and throttle metrics for a
 * table, in 5-minute periods.
 *
 * @param {string} tableName - Value for the TableName dimension.
 * @returns {Promise<{consumedRCU: Array, consumedWCU: Array, readThrottle: Array, writeThrottle: Array}>}
 *   The `Datapoints` array of each of the four CloudWatch responses.
 */
async function getCapacityMetrics(tableName) {
  const windowEnd = new Date();
  const windowStart = new Date(windowEnd.getTime() - 3600000);

  // All four requests share the namespace, dimension, window and period;
  // only the metric name and the requested statistics differ.
  const fetchMetric = (metricName, statistics) =>
    cloudwatch.send(new GetMetricStatisticsCommand({
      Namespace: 'AWS/DynamoDB',
      MetricName: metricName,
      Dimensions: [{ Name: 'TableName', Value: tableName }],
      StartTime: windowStart,
      EndTime: windowEnd,
      Period: 300,
      Statistics: statistics
    }));

  const responses = await Promise.all([
    fetchMetric('ConsumedReadCapacityUnits', ['Sum', 'Average']),
    fetchMetric('ConsumedWriteCapacityUnits', ['Sum', 'Average']),
    fetchMetric('ReadThrottleEvents', ['Sum']),
    fetchMetric('WriteThrottleEvents', ['Sum'])
  ]);

  return {
    consumedRCU: responses[0].Datapoints,
    consumedWCU: responses[1].Datapoints,
    readThrottle: responses[2].Datapoints,
    writeThrottle: responses[3].Datapoints
  };
}
2.3 错误指标 #
text
错误指标:
├── SystemErrors
│ └── 系统错误(5xx)
├── UserErrors
│ └── 用户错误(4xx)
├── ConditionalCheckFailedRequests
│ └── 条件检查失败
└── TransactionConflict
  └── 事务冲突
2.4 存储指标 #
text
存储指标:
├── OnlineStoreSizeBytes
│ └── 表大小(字节)
├── OnlineStoreItemCount
│ └── 项目数量
└── GlobalSecondaryIndexSizeBytes
  └── GSI大小
三、CloudWatch告警 #
3.1 创建告警 #
使用CLI:
bash
# Create a read-throttle alarm for the "Users" table.
# The alarm evaluates the Sum of ReadThrottleEvents over a single 300-second
# period and triggers when that sum is greater than 10; the action listed in
# --alarm-actions is the SNS topic that gets notified.
aws cloudwatch put-metric-alarm \
--alarm-name "DynamoDB-Users-ReadThrottle" \
--alarm-description "DynamoDB read throttle events" \
--metric-name ReadThrottleEvents \
--namespace AWS/DynamoDB \
--dimensions Name=TableName,Value=Users \
--statistic Sum \
--period 300 \
--evaluation-periods 1 \
--threshold 10 \
--comparison-operator GreaterThanThreshold \
--alarm-actions arn:aws:sns:us-east-1:123456789012:alerts
使用JavaScript SDK:
javascript
const { CloudWatchClient, PutMetricAlarmCommand } = require('@aws-sdk/client-cloudwatch');
const cloudwatch = new CloudWatchClient({ region: 'us-east-1' });
/**
 * Creates (or overwrites) a CloudWatch alarm on a table's ReadThrottleEvents.
 *
 * The alarm compares the 5-minute Sum against a threshold of 10 over one
 * evaluation period, treats missing data as not breaching, and sends its
 * alarm action to the given SNS topic.
 *
 * @param {string} tableName - Table name; used in the alarm name, description and dimension.
 * @param {string} snsTopicArn - ARN placed in the alarm's AlarmActions list.
 * @returns {Promise<void>}
 */
async function createThrottleAlarm(tableName, snsTopicArn) {
  const alarmInput = {
    AlarmName: `DynamoDB-${tableName}-ReadThrottle`,
    AlarmDescription: `DynamoDB read throttle events for ${tableName}`,
    MetricName: 'ReadThrottleEvents',
    Namespace: 'AWS/DynamoDB',
    Dimensions: [{ Name: 'TableName', Value: tableName }],
    Statistic: 'Sum',
    Period: 300,
    EvaluationPeriods: 1,
    Threshold: 10,
    ComparisonOperator: 'GreaterThanThreshold',
    AlarmActions: [snsTopicArn],
    TreatMissingData: 'notBreaching'
  };
  const command = new PutMetricAlarmCommand(alarmInput);
  await cloudwatch.send(command);
}
3.2 常用告警配置 #
延迟告警:
javascript
/**
 * Creates (or overwrites) a CloudWatch alarm on the average
 * SuccessfulRequestLatency of one operation on one table.
 *
 * The alarm compares the 5-minute Average against `threshold` and requires
 * two evaluation periods.
 *
 * @param {string} tableName - Table name; used in the alarm name, description and dimension.
 * @param {string} operation - DynamoDB operation name for the Operation dimension (e.g. 'Query').
 * @param {number} threshold - Latency threshold passed through as the alarm Threshold.
 * @param {string} [snsTopicArn] - Optional SNS topic ARN. When provided it is set
 *   as the alarm action so the alarm can notify someone; when omitted the
 *   request is identical to the one sent before this parameter existed.
 * @returns {Promise<void>}
 */
async function createLatencyAlarm(tableName, operation, threshold, snsTopicArn) {
  const alarmInput = {
    AlarmName: `DynamoDB-${tableName}-${operation}-Latency`,
    AlarmDescription: `High latency for ${operation} on ${tableName}`,
    MetricName: 'SuccessfulRequestLatency',
    Namespace: 'AWS/DynamoDB',
    Dimensions: [
      { Name: 'TableName', Value: tableName },
      { Name: 'Operation', Value: operation }
    ],
    Statistic: 'Average',
    Period: 300,
    EvaluationPeriods: 2,
    Threshold: threshold,
    ComparisonOperator: 'GreaterThanThreshold'
  };

  // Without an action the alarm only changes state and never notifies anyone.
  if (snsTopicArn) {
    alarmInput.AlarmActions = [snsTopicArn];
  }

  await cloudwatch.send(new PutMetricAlarmCommand(alarmInput));
}
容量利用率告警:
javascript
/**
 * Creates (or overwrites) a metric-math alarm that fires when a table's read
 * capacity utilization stays above 80% for two consecutive 5-minute periods.
 *
 * Utilization is derived from two metrics:
 *   - `consumed`: Sum of ConsumedReadCapacityUnits over the period, and
 *   - `provisioned`: Average of ProvisionedReadCapacityUnits (a per-second rate).
 * Because `consumed` is a total over the period while `provisioned` is per
 * second, the sum is divided by PERIOD(consumed) before computing the ratio.
 *
 * @param {string} tableName - Table name; used in the alarm name and dimensions.
 * @param {string} [snsTopicArn] - Optional SNS topic ARN to set as the alarm action.
 * @returns {Promise<void>}
 */
async function createCapacityUtilizationAlarm(tableName, snsTopicArn) {
  // Builds one raw-metric query. ReturnData is false because PutMetricAlarm
  // requires exactly one query (the expression) to return data.
  const rawMetric = (id, metricName, stat) => ({
    Id: id,
    MetricStat: {
      Metric: {
        Namespace: 'AWS/DynamoDB',
        MetricName: metricName,
        Dimensions: [{ Name: 'TableName', Value: tableName }]
      },
      Period: 300,
      Stat: stat
    },
    ReturnData: false
  });

  const alarmInput = {
    AlarmName: `DynamoDB-${tableName}-HighCapacityUtilization`,
    AlarmDescription: 'High capacity utilization',
    Metrics: [
      rawMetric('consumed', 'ConsumedReadCapacityUnits', 'Sum'),
      rawMetric('provisioned', 'ProvisionedReadCapacityUnits', 'Average'),
      {
        Id: 'utilization',
        // Convert the period total to units per second before comparing it
        // with the provisioned (per-second) capacity.
        Expression: '(consumed / PERIOD(consumed)) / provisioned * 100',
        Label: 'Utilization %',
        ReturnData: true
      }
    ],
    EvaluationPeriods: 2,
    Threshold: 80,
    ComparisonOperator: 'GreaterThanThreshold'
  };

  if (snsTopicArn) {
    alarmInput.AlarmActions = [snsTopicArn];
  }

  await cloudwatch.send(new PutMetricAlarmCommand(alarmInput));
}
3.3 告警最佳实践 #
text
告警建议:
├── 设置合理的阈值
├── 使用多个评估周期
├── 配置告警通知
├── 设置告警级别
└── 定期审查告警
四、监控仪表板 #
4.1 CloudWatch仪表板 #
javascript
const { CloudWatchClient, PutDashboardCommand } = require('@aws-sdk/client-cloudwatch');
/**
 * Creates (or overwrites) a CloudWatch dashboard named `DynamoDB-<tableName>`
 * with three metric widgets for the table: capacity consumption, request
 * latency (GetItem / PutItem / Query) and throttle events.
 *
 * Every widget uses a 300-second period and region 'us-east-1'.
 *
 * @param {string} tableName - Table whose metrics are charted.
 * @returns {Promise<void>}
 */
async function createDynamoDBDashboard(tableName) {
  // Key order is kept stable because the body is serialized with JSON.stringify.
  const metricWidget = (title, stat, metrics) => ({
    type: 'metric',
    properties: {
      metrics,
      period: 300,
      stat,
      region: 'us-east-1',
      title
    }
  });

  // In the metrics arrays, '.' repeats the value from the same position of
  // the previous row (dashboard body shorthand).
  const capacityWidget = metricWidget('Capacity Consumption', 'Sum', [
    ['AWS/DynamoDB', 'ConsumedReadCapacityUnits', 'TableName', tableName],
    ['.', 'ConsumedWriteCapacityUnits', '.', '.']
  ]);
  const latencyWidget = metricWidget('Latency', 'Average', [
    ['AWS/DynamoDB', 'SuccessfulRequestLatency', 'TableName', tableName, 'Operation', 'GetItem'],
    ['.', '.', '.', '.', '.', 'PutItem'],
    ['.', '.', '.', '.', '.', 'Query']
  ]);
  const throttleWidget = metricWidget('Throttle Events', 'Sum', [
    ['AWS/DynamoDB', 'ReadThrottleEvents', 'TableName', tableName],
    ['.', 'WriteThrottleEvents', '.', '.']
  ]);

  const body = JSON.stringify({
    widgets: [capacityWidget, latencyWidget, throttleWidget]
  });

  await cloudwatch.send(new PutDashboardCommand({
    DashboardName: `DynamoDB-${tableName}`,
    DashboardBody: body
  }));
}
五、日志分析 #
5.1 启用错误日志 #
javascript
// DynamoDB本身不直接输出日志到CloudWatch Logs
// 但可以通过Lambda或应用日志记录
// Lambda函数记录DynamoDB操作
exports.handler = async (event) => {
const startTime = Date.now();
try {
const result = await docClient.send(new GetCommand({
TableName: 'Users',
Key: { UserId: event.userId }
}));
console.log(JSON.stringify({
operation: 'GetItem',
tableName: 'Users',
key: { UserId: event.userId },
latency: Date.now() - startTime,
success: true
}));
return result;
} catch (error) {
console.error(JSON.stringify({
operation: 'GetItem',
tableName: 'Users',
key: { UserId: event.userId },
latency: Date.now() - startTime,
success: false,
error: error.message
}));
throw error;
}
};
5.2 CloudTrail审计 #
text
CloudTrail记录:
├── 所有DynamoDB API调用
├── 调用者身份
├── 调用时间
├── 源IP地址
└── 请求参数
六、性能监控 #
6.1 性能基线 #
javascript
/**
 * Computes baseline latency and capacity figures for a table over a trailing
 * window ending now.
 *
 * Three series are requested through `getMetricStatistics` (latency, consumed
 * read capacity, consumed write capacity) and each requested field is reduced
 * with `average`.
 *
 * NOTE(review): 'p99' is a percentile statistic. CloudWatch's
 * GetMetricStatistics takes percentiles via ExtendedStatistics and returns
 * them under Datapoint.ExtendedStatistics, so this relies on the
 * `getMetricStatistics` helper exposing the value as `p99` on each datapoint —
 * confirm against the helper.
 *
 * @param {string} tableName - Table name forwarded to `getMetricStatistics`.
 * @param {number} [duration=86400000] - Window length in milliseconds.
 * @returns {Promise<{latency: {average: *, p99: *}, capacity: {readAverage: *, writeAverage: *}}>}
 */
async function establishBaseline(tableName, duration = 86400000) {
  const windowEnd = new Date();
  const windowStart = new Date(windowEnd.getTime() - duration);

  const [latencyStats, readStats, writeStats] = await Promise.all([
    getMetricStatistics(tableName, 'SuccessfulRequestLatency', windowStart, windowEnd, ['Average', 'p99']),
    getMetricStatistics(tableName, 'ConsumedReadCapacityUnits', windowStart, windowEnd, ['Sum', 'Average']),
    getMetricStatistics(tableName, 'ConsumedWriteCapacityUnits', windowStart, windowEnd, ['Sum', 'Average'])
  ]);

  // Reduce one field of a response's datapoints to a single number.
  const meanOf = (stats, field) => average(stats.Datapoints.map((point) => point[field]));

  const latency = {
    average: meanOf(latencyStats, 'Average'),
    p99: meanOf(latencyStats, 'p99')
  };
  const capacity = {
    readAverage: meanOf(readStats, 'Average'),
    writeAverage: meanOf(writeStats, 'Average')
  };
  return { latency, capacity };
}
6.2 异常检测 #
javascript
/**
 * Compares a table's current metrics with a baseline and reports anomalies.
 *
 * Two checks are made, in this order:
 *   - LATENCY_HIGH when the current average latency is more than twice the
 *     baseline average latency;
 *   - THROTTLE_DETECTED when the current throttle event count is above zero.
 *
 * @param {string} tableName - Table name forwarded to `getCurrentMetrics`.
 * @param {{latency: {average: number}}} baseline - Baseline to compare against.
 * @returns {Promise<Array<object>>} Detected anomalies; empty when none.
 */
async function detectAnomalies(tableName, baseline) {
  const observed = await getCurrentMetrics(tableName);

  const latencyFindings =
    observed.latency.average > baseline.latency.average * 2
      ? [{
          type: 'LATENCY_HIGH',
          value: observed.latency.average,
          baseline: baseline.latency.average
        }]
      : [];

  const throttleFindings =
    observed.throttleEvents > 0
      ? [{ type: 'THROTTLE_DETECTED', value: observed.throttleEvents }]
      : [];

  return [...latencyFindings, ...throttleFindings];
}
七、监控最佳实践 #
7.1 监控策略 #
text
监控策略:
├── 关键指标监控
│ ├── 延迟
│ ├── 吞吐量
│ ├── 错误率
│ └── 限流事件
├── 告警配置
│ ├── 多级告警
│ ├── 合理阈值
│ └── 及时通知
└── 定期审查
  ├── 评估告警有效性
  ├── 调整阈值
  └── 优化监控策略
7.2 常见监控场景 #
text
场景监控:
├── 高延迟
│ ├── 检查查询模式
│ ├── 检查索引使用
│ └── 检查容量配置
├── 限流
│ ├── 检查容量是否足够
│ ├── 检查热点分区
│ └── 考虑Auto Scaling
└── 错误增加
  ├── 检查应用逻辑
  ├── 检查条件表达式
  └── 检查网络问题
八、总结 #
监控与告警要点:
| 类别 | 关键指标 |
|---|---|
| 性能 | 延迟、吞吐量 |
| 容量 | RCU/WCU消耗、限流 |
| 错误 | 系统错误、用户错误 |
| 存储 | 表大小、项目数量 |
DynamoDB完全指南学习完成!
最后更新:2026-03-27