DynamoDB监控与告警 #

一、监控概述 #

1.1 监控维度 #

text
DynamoDB监控维度:
├── 性能指标
│   ├── 延迟
│   ├── 吞吐量
│   └── 错误率
├── 容量指标
│   ├── 消耗容量
│   ├── 限流事件
│   └── 利用率
├── 存储指标
│   ├── 表大小
│   ├── 项目数量
│   └── 索引大小
└── 运维指标
    ├── 备份状态
    ├── TTL状态
    └── 复制延迟

1.2 监控工具 #

工具 用途
CloudWatch Metrics 指标监控
CloudWatch Alarms 告警配置
CloudWatch Logs 日志分析
CloudTrail API审计
X-Ray 请求追踪

二、关键指标 #

2.1 性能指标 #

延迟指标:

text
延迟指标:
├── SuccessfulRequestLatency
│   └── 成功请求延迟
├── 按操作类型分类
│   ├── GetItem
│   ├── PutItem
│   ├── Query
│   ├── Scan
│   └── UpdateItem
└── 单位:毫秒

监控示例:

javascript
const { CloudWatchClient, GetMetricStatisticsCommand } = require('@aws-sdk/client-cloudwatch');

const cloudwatch = new CloudWatchClient({ region: 'us-east-1' });

/**
 * Fetches the last hour of SuccessfulRequestLatency datapoints (5-minute
 * periods) for one operation on one table.
 *
 * Average and Maximum are requested as regular statistics and p99 as an
 * extended statistic. The two result sets are merged by timestamp, so each
 * returned datapoint carries `Average`, `Maximum` and, when CloudWatch
 * returned one for that timestamp, `ExtendedStatistics.p99`.
 *
 * @param {string} tableName - Value for the TableName dimension.
 * @param {string} operation - Value for the Operation dimension (e.g. 'GetItem').
 * @returns {Promise<object[]>} Merged datapoints, in the order CloudWatch
 *   returned the Average/Maximum series (empty array when there is no data).
 */
async function getLatencyMetrics(tableName, operation) {
  const endTime = new Date();
  const baseQuery = {
    Namespace: 'AWS/DynamoDB',
    MetricName: 'SuccessfulRequestLatency',
    Dimensions: [
      { Name: 'TableName', Value: tableName },
      { Name: 'Operation', Value: operation },
    ],
    StartTime: new Date(endTime.getTime() - 3600000),
    EndTime: endTime,
    Period: 300,
  };

  // Percentiles are not valid entries of `Statistics`; they must be passed in
  // `ExtendedStatistics`, and GetMetricStatistics does not accept both lists
  // in a single call. Hence two requests over the identical time window.
  const [basic, percentile] = await Promise.all([
    cloudwatch.send(new GetMetricStatisticsCommand({
      ...baseQuery,
      Statistics: ['Average', 'Maximum'],
    })),
    cloudwatch.send(new GetMetricStatisticsCommand({
      ...baseQuery,
      ExtendedStatistics: ['p99'],
    })),
  ]);

  // Index the percentile series by timestamp so the merge is a single pass.
  const extendedByTime = new Map(
    (percentile.Datapoints ?? []).map((point) => [
      new Date(point.Timestamp).getTime(),
      point.ExtendedStatistics,
    ]),
  );

  return (basic.Datapoints ?? []).map((point) => {
    const extended = extendedByTime.get(new Date(point.Timestamp).getTime());
    return extended === undefined
      ? point
      : { ...point, ExtendedStatistics: extended };
  });
}

2.2 容量指标 #

吞吐量指标:

text
吞吐量指标:
├── ConsumedReadCapacityUnits
│   └── 消耗的RCU
├── ConsumedWriteCapacityUnits
│   └── 消耗的WCU
├── ReadThrottleEvents
│   └── 读取限流事件
└── WriteThrottleEvents
    └── 写入限流事件

监控示例:

javascript
/**
 * Fetches the last hour of capacity-related datapoints (5-minute periods)
 * for a table: consumed read/write capacity units and read/write throttle
 * events. The four CloudWatch requests are issued in parallel.
 *
 * @param {string} tableName - Value for the TableName dimension.
 * @returns {Promise<{consumedRCU: object[], consumedWCU: object[],
 *   readThrottle: object[], writeThrottle: object[]}>} The `Datapoints` of
 *   each response, keyed by metric.
 */
async function getCapacityMetrics(tableName) {
  const windowEnd = new Date();
  const windowStart = new Date(windowEnd.getTime() - 3600000);

  // All four requests differ only in metric name and requested statistics.
  const fetchMetric = (metricName, statistics) =>
    cloudwatch.send(new GetMetricStatisticsCommand({
      Namespace: 'AWS/DynamoDB',
      MetricName: metricName,
      Dimensions: [{ Name: 'TableName', Value: tableName }],
      StartTime: windowStart,
      EndTime: windowEnd,
      Period: 300,
      Statistics: statistics,
    }));

  const responses = await Promise.all([
    fetchMetric('ConsumedReadCapacityUnits', ['Sum', 'Average']),
    fetchMetric('ConsumedWriteCapacityUnits', ['Sum', 'Average']),
    fetchMetric('ReadThrottleEvents', ['Sum']),
    fetchMetric('WriteThrottleEvents', ['Sum']),
  ]);

  const [consumedRCU, consumedWCU, readThrottle, writeThrottle] =
    responses.map((response) => response.Datapoints);

  return { consumedRCU, consumedWCU, readThrottle, writeThrottle };
}

2.3 错误指标 #

text
错误指标:
├── SystemErrors
│   └── 系统错误(5xx)
├── UserErrors
│   └── 用户错误(4xx)
├── ConditionalCheckFailedRequests
│   └── 条件检查失败
└── TransactionConflict
    └── 事务冲突

2.4 存储指标 #

text
存储指标(表大小与项目数量不是CloudWatch指标,需通过DescribeTable获取,约每6小时更新一次):
├── TableSizeBytes
│   └── 表大小(字节)
├── ItemCount
│   └── 项目数量
└── GlobalSecondaryIndexes[].IndexSizeBytes
    └── GSI大小

三、CloudWatch告警 #

3.1 创建告警 #

使用CLI:

bash
# Create a read-throttle alarm for the Users table.
# The alarm fires when the sum of ReadThrottleEvents over one 5-minute period
# exceeds 10, and notifies the given SNS topic.
# --treat-missing-data notBreaching keeps this alarm consistent with the SDK
# example (TreatMissingData: 'notBreaching'): periods without datapoints are
# evaluated as within the threshold rather than as missing.
aws cloudwatch put-metric-alarm \
  --alarm-name "DynamoDB-Users-ReadThrottle" \
  --alarm-description "DynamoDB read throttle events" \
  --metric-name ReadThrottleEvents \
  --namespace AWS/DynamoDB \
  --dimensions Name=TableName,Value=Users \
  --statistic Sum \
  --period 300 \
  --evaluation-periods 1 \
  --threshold 10 \
  --comparison-operator GreaterThanThreshold \
  --treat-missing-data notBreaching \
  --alarm-actions arn:aws:sns:us-east-1:123456789012:alerts

使用JavaScript SDK:

javascript
const { CloudWatchClient, PutMetricAlarmCommand } = require('@aws-sdk/client-cloudwatch');

const cloudwatch = new CloudWatchClient({ region: 'us-east-1' });

/**
 * Creates (or overwrites) a CloudWatch alarm on a table's ReadThrottleEvents.
 *
 * The alarm triggers when the 5-minute sum exceeds 10 in a single evaluation
 * period, sends its alarm action to the given SNS topic, and treats missing
 * data as not breaching.
 *
 * @param {string} tableName - Table to watch; also used in the alarm name.
 * @param {string} snsTopicArn - ARN used as the single alarm action.
 * @returns {Promise<void>}
 */
async function createThrottleAlarm(tableName, snsTopicArn) {
  // Which metric is watched.
  const watchedMetric = {
    MetricName: 'ReadThrottleEvents',
    Namespace: 'AWS/DynamoDB',
    Dimensions: [{ Name: 'TableName', Value: tableName }],
  };

  // How the metric is evaluated against the threshold.
  const evaluationRule = {
    Statistic: 'Sum',
    Period: 300,
    EvaluationPeriods: 1,
    Threshold: 10,
    ComparisonOperator: 'GreaterThanThreshold',
    TreatMissingData: 'notBreaching',
  };

  const command = new PutMetricAlarmCommand({
    AlarmName: `DynamoDB-${tableName}-ReadThrottle`,
    AlarmDescription: `DynamoDB read throttle events for ${tableName}`,
    ...watchedMetric,
    ...evaluationRule,
    AlarmActions: [snsTopicArn],
  });

  await cloudwatch.send(command);
}

3.2 常用告警配置 #

延迟告警:

javascript
/**
 * Creates (or overwrites) a CloudWatch alarm on the average
 * SuccessfulRequestLatency of one operation on one table.
 *
 * The alarm triggers when the 5-minute average exceeds `threshold` for two
 * consecutive evaluation periods.
 *
 * @param {string} tableName - Table to watch; also used in the alarm name.
 * @param {string} operation - Value for the Operation dimension (e.g. 'Query').
 * @param {number} threshold - Latency threshold, passed through as `Threshold`.
 * @param {string} [snsTopicArn] - Optional SNS topic ARN to notify when the
 *   alarm fires. When omitted the alarm is created without actions, exactly
 *   as before this parameter existed.
 * @returns {Promise<void>}
 */
async function createLatencyAlarm(tableName, operation, threshold, snsTopicArn) {
  await cloudwatch.send(new PutMetricAlarmCommand({
    AlarmName: `DynamoDB-${tableName}-${operation}-Latency`,
    AlarmDescription: `High latency for ${operation} on ${tableName}`,
    MetricName: 'SuccessfulRequestLatency',
    Namespace: 'AWS/DynamoDB',
    Dimensions: [
      { Name: 'TableName', Value: tableName },
      { Name: 'Operation', Value: operation },
    ],
    Statistic: 'Average',
    Period: 300,
    EvaluationPeriods: 2,
    Threshold: threshold,
    ComparisonOperator: 'GreaterThanThreshold',
    // Mirror createThrottleAlarm: an alarm without actions changes state
    // silently, so attach the notification target whenever one is supplied.
    ...(snsTopicArn ? { AlarmActions: [snsTopicArn] } : {}),
  }));
}

容量利用率告警:

javascript
/**
 * Creates (or overwrites) a CloudWatch alarm that fires when a table's read
 * capacity utilization exceeds 80% for two consecutive 5-minute periods.
 *
 * Utilization is computed with a metric-math expression from two input
 * series: consumed read capacity (Sum per period) and provisioned read
 * capacity (Average per period).
 *
 * @param {string} tableName - Table to watch; also used in the alarm name.
 * @returns {Promise<void>}
 */
async function createCapacityUtilizationAlarm(tableName) {
  const periodSeconds = 300;
  const tableDimensions = [{ Name: 'TableName', Value: tableName }];

  await cloudwatch.send(new PutMetricAlarmCommand({
    AlarmName: `DynamoDB-${tableName}-HighCapacityUtilization`,
    AlarmDescription: 'High capacity utilization',
    Metrics: [
      {
        Id: 'consumed',
        MetricStat: {
          Metric: {
            Namespace: 'AWS/DynamoDB',
            MetricName: 'ConsumedReadCapacityUnits',
            Dimensions: tableDimensions,
          },
          Period: periodSeconds,
          Stat: 'Sum',
        },
        // Input series only; the alarm must not evaluate it directly.
        ReturnData: false,
      },
      {
        Id: 'provisioned',
        MetricStat: {
          Metric: {
            Namespace: 'AWS/DynamoDB',
            MetricName: 'ProvisionedReadCapacityUnits',
            Dimensions: tableDimensions,
          },
          Period: periodSeconds,
          Stat: 'Average',
        },
        ReturnData: false,
      },
      {
        Id: 'utilization',
        // `consumed` is the total number of units used during the whole
        // period, while `provisioned` is a per-second rate, so the sum is
        // first converted to units per second before taking the ratio.
        Expression: `(consumed / ${periodSeconds}) / provisioned * 100`,
        Label: 'Utilization %',
        // Exactly one entry may return data: the series the alarm watches.
        ReturnData: true,
      },
    ],
    EvaluationPeriods: 2,
    Threshold: 80,
    ComparisonOperator: 'GreaterThanThreshold',
  }));
}

3.3 告警最佳实践 #

text
告警建议:
├── 设置合理的阈值
├── 使用多个评估周期
├── 配置告警通知
├── 设置告警级别
└── 定期审查告警

四、监控仪表板 #

4.1 CloudWatch仪表板 #

javascript
const { CloudWatchClient, PutDashboardCommand } = require('@aws-sdk/client-cloudwatch');

/**
 * Creates (or overwrites) a CloudWatch dashboard named `DynamoDB-<tableName>`
 * with three metric widgets: capacity consumption, request latency
 * (GetItem / PutItem / Query) and throttle events.
 *
 * @param {string} tableName - Table whose metrics are charted; also used in
 *   the dashboard name.
 * @returns {Promise<void>}
 */
async function createDynamoDBDashboard(tableName) {
  // Every widget shares the same type, period and region; only the title,
  // statistic and metric rows vary.
  const metricWidget = (title, stat, metrics) => ({
    type: 'metric',
    properties: {
      metrics,
      period: 300,
      stat,
      region: 'us-east-1',
      title,
    },
  });

  const capacityWidget = metricWidget('Capacity Consumption', 'Sum', [
    ['AWS/DynamoDB', 'ConsumedReadCapacityUnits', 'TableName', tableName],
    ['.', 'ConsumedWriteCapacityUnits', '.', '.'],
  ]);

  const latencyWidget = metricWidget('Latency', 'Average', [
    ['AWS/DynamoDB', 'SuccessfulRequestLatency', 'TableName', tableName, 'Operation', 'GetItem'],
    ['.', '.', '.', '.', '.', 'PutItem'],
    ['.', '.', '.', '.', '.', 'Query'],
  ]);

  const throttleWidget = metricWidget('Throttle Events', 'Sum', [
    ['AWS/DynamoDB', 'ReadThrottleEvents', 'TableName', tableName],
    ['.', 'WriteThrottleEvents', '.', '.'],
  ]);

  const command = new PutDashboardCommand({
    DashboardName: `DynamoDB-${tableName}`,
    // The API takes the dashboard definition as a JSON string.
    DashboardBody: JSON.stringify({
      widgets: [capacityWidget, latencyWidget, throttleWidget],
    }),
  });

  await cloudwatch.send(command);
}

五、日志分析 #

5.1 启用错误日志 #

javascript
// DynamoDB本身不直接输出日志到CloudWatch Logs
// 但可以通过Lambda或应用日志记录

// Lambda函数记录DynamoDB操作
exports.handler = async (event) => {
  const startTime = Date.now();
  
  try {
    const result = await docClient.send(new GetCommand({
      TableName: 'Users',
      Key: { UserId: event.userId }
    }));
    
    console.log(JSON.stringify({
      operation: 'GetItem',
      tableName: 'Users',
      key: { UserId: event.userId },
      latency: Date.now() - startTime,
      success: true
    }));
    
    return result;
  } catch (error) {
    console.error(JSON.stringify({
      operation: 'GetItem',
      tableName: 'Users',
      key: { UserId: event.userId },
      latency: Date.now() - startTime,
      success: false,
      error: error.message
    }));
    
    throw error;
  }
};

5.2 CloudTrail审计 #

text
CloudTrail记录:
├── DynamoDB API调用(控制面操作默认记录;GetItem、PutItem等数据面操作需启用数据事件)
├── 调用者身份
├── 调用时间
├── 源IP地址
└── 请求参数

六、性能监控 #

6.1 性能基线 #

javascript
/**
 * Builds a performance baseline for a table from recent metric history.
 *
 * Relies on two helpers that are not defined in this block:
 * `getMetricStatistics(tableName, metricName, start, end, statistics)`, whose
 * result is read via `.Datapoints`, and `average(values)`.
 *
 * @param {string} tableName - Table to build the baseline for.
 * @param {number} [duration=86400000] - Look-back window in milliseconds
 *   (default: 24 hours).
 * @returns {Promise<{latency: {average: *, p99: *},
 *   capacity: {readAverage: *, writeAverage: *}}>} Averages of the
 *   per-datapoint values, as computed by `average`.
 */
async function establishBaseline(tableName, duration = 86400000) {
  const now = new Date();
  const startTime = new Date(now.getTime() - duration);

  const [latency, reads, writes] = await Promise.all([
    getMetricStatistics(tableName, 'SuccessfulRequestLatency', startTime, now, ['Average', 'p99']),
    getMetricStatistics(tableName, 'ConsumedReadCapacityUnits', startTime, now, ['Sum', 'Average']),
    getMetricStatistics(tableName, 'ConsumedWriteCapacityUnits', startTime, now, ['Sum', 'Average']),
  ]);

  // CloudWatch datapoints expose percentiles under `ExtendedStatistics`, not
  // as a top-level `p99` field. The top-level fallback is kept in case
  // getMetricStatistics flattens its datapoints — TODO confirm against it.
  const p99Of = (point) => point.ExtendedStatistics?.p99 ?? point.p99;

  return {
    latency: {
      average: average(latency.Datapoints.map((point) => point.Average)),
      p99: average(latency.Datapoints.map(p99Of)),
    },
    capacity: {
      readAverage: average(reads.Datapoints.map((point) => point.Average)),
      writeAverage: average(writes.Datapoints.map((point) => point.Average)),
    },
  };
}

6.2 异常检测 #

javascript
/**
 * Compares a table's current metrics against a baseline and reports
 * anomalies.
 *
 * Current metrics come from `getCurrentMetrics(tableName)` (defined outside
 * this block); its result is read via `latency.average` and `throttleEvents`.
 *
 * @param {string} tableName - Table to check.
 * @param {{latency: {average: number}}} baseline - Baseline to compare with.
 * @returns {Promise<object[]>} Zero, one or two anomaly records: a
 *   `LATENCY_HIGH` entry first (when present), then `THROTTLE_DETECTED`.
 */
async function detectAnomalies(tableName, baseline) {
  const { latency, throttleEvents } = await getCurrentMetrics(tableName);
  const observedLatency = latency.average;
  const expectedLatency = baseline.latency.average;

  // Each rule pairs its trigger condition with the record to report.
  const rules = [
    {
      // Latency: more than double the baseline average.
      triggered: observedLatency > expectedLatency * 2,
      anomaly: {
        type: 'LATENCY_HIGH',
        value: observedLatency,
        baseline: expectedLatency,
      },
    },
    {
      // Throttling: any throttle event at all.
      triggered: throttleEvents > 0,
      anomaly: {
        type: 'THROTTLE_DETECTED',
        value: throttleEvents,
      },
    },
  ];

  return rules
    .filter((rule) => rule.triggered)
    .map((rule) => rule.anomaly);
}

七、监控最佳实践 #

7.1 监控策略 #

text
监控策略:
├── 关键指标监控
│   ├── 延迟
│   ├── 吞吐量
│   ├── 错误率
│   └── 限流事件
├── 告警配置
│   ├── 多级告警
│   ├── 合理阈值
│   └── 及时通知
└── 定期审查
    ├── 评估告警有效性
    ├── 调整阈值
    └── 优化监控策略

7.2 常见监控场景 #

text
场景监控:
├── 高延迟
│   ├── 检查查询模式
│   ├── 检查索引使用
│   └── 检查容量配置
├── 限流
│   ├── 检查容量是否足够
│   ├── 检查热点分区
│   └── 考虑Auto Scaling
└── 错误增加
    ├── 检查应用逻辑
    ├── 检查条件表达式
    └── 检查网络问题

八、总结 #

监控与告警要点:

类别 关键指标
性能 延迟、吞吐量
容量 RCU/WCU消耗、限流
错误 系统错误、用户错误
存储 表大小、项目数量

DynamoDB完全指南学习完成!

最后更新:2026-03-27