RocksDB监控指标 #
一、监控概述 #
1.1 为什么需要监控 #
text
监控的重要性:
├── 性能洞察 - 了解系统运行状态
├── 问题诊断 - 快速定位问题
├── 容量规划 - 预测资源需求
├── 告警预警 - 及时发现问题
└── 优化指导 - 指导性能优化
1.2 监控维度 #
| 维度 | 说明 |
|---|---|
| 性能指标 | 吞吐、延迟、命中率 |
| 资源指标 | 内存、磁盘、CPU |
| 存储指标 | 数据量、文件数、压缩比 |
| 操作指标 | 读写次数、Compaction状态 |
二、启用统计 #
2.1 创建Statistics对象 #
cpp
#include <rocksdb/db.h>
#include <rocksdb/statistics.h>
#include <rocksdb/options.h>
// Returns an Options value with statistics collection attached.
// A new Statistics object is created, its level is set to kAll, and it is
// stored in the returned options.
rocksdb::Options GetOptionsWithStatistics() {
  auto collector = rocksdb::CreateDBStatistics();
  collector->set_stats_level(rocksdb::StatsLevel::kAll);

  rocksdb::Options configured;
  configured.statistics = collector;
  return configured;
}
2.2 统计级别 #
cpp
// Illustrative listing of the statistics levels discussed in this section.
// NOTE(review): this listing may not match the StatsLevel enum declared in
// rocksdb/statistics.h for the RocksDB version in use (enumerator names and
// ordering may differ) -- confirm against the header before relying on it.
enum StatsLevel {
kExceptHistogramOrTimers, // excludes histograms
kExceptHistogram, // excludes detailed histograms
kExceptTimers, // excludes timers
kAll // all statistics
};
// Set the level on an existing Statistics object.
options.statistics->set_stats_level(rocksdb::StatsLevel::kAll);
三、性能指标 #
3.1 读写性能 #
cpp
#include <rocksdb/statistics.h>
#include <iostream>
// Prints cumulative write and read counters (key counts and megabytes) taken
// from the Statistics object attached to the database's options.
//
// db: an open database. When its options carry no Statistics object, a notice
//     is written to stderr and the function returns without printing counters.
void PrintReadWriteStats(rocksdb::DB* db) {
  const auto stats = db->GetOptions().statistics;
  if (!stats) {
    // The statistics pointer is only set when the caller installed one
    // (e.g. via CreateDBStatistics()); dereferencing a null one would crash.
    std::cerr << "Statistics are not enabled for this DB" << std::endl;
    return;
  }

  // Write counters.
  const uint64_t keys_written = stats->getTickerCount(rocksdb::NUMBER_KEYS_WRITTEN);
  const uint64_t bytes_written = stats->getTickerCount(rocksdb::BYTES_WRITTEN);
  std::cout << "Keys written: " << keys_written << std::endl;
  std::cout << "Bytes written: " << bytes_written / 1024 / 1024 << " MB" << std::endl;

  // Read counters.
  const uint64_t keys_read = stats->getTickerCount(rocksdb::NUMBER_KEYS_READ);
  const uint64_t bytes_read = stats->getTickerCount(rocksdb::BYTES_READ);
  std::cout << "Keys read: " << keys_read << std::endl;
  std::cout << "Bytes read: " << bytes_read / 1024 / 1024 << " MB" << std::endl;
}
3.2 延迟统计 #
cpp
#include <rocksdb/statistics.h>
// Prints the median, 95th and 99th percentile of the DB_WRITE and DB_GET
// histograms recorded by the Statistics object attached to the database.
//
// db: an open database. When its options carry no Statistics object, a notice
//     is written to stderr and nothing else is printed.
void PrintLatencyStats(rocksdb::DB* db) {
  const auto stats = db->GetOptions().statistics;
  if (!stats) {
    // Without an installed Statistics object there is nothing to query, and
    // dereferencing the null pointer would crash.
    std::cerr << "Statistics are not enabled for this DB" << std::endl;
    return;
  }

  // The C++ Statistics interface fills a caller-provided HistogramData via
  // histogramData(type, &data); it has no value-returning getHistogramData().
  const auto print_histogram = [&stats](const char* title, uint32_t histogram) {
    rocksdb::HistogramData data;
    stats->histogramData(histogram, &data);
    std::cout << title << std::endl;
    std::cout << " P50: " << data.median << " us" << std::endl;
    std::cout << " P95: " << data.percentile95 << " us" << std::endl;
    std::cout << " P99: " << data.percentile99 << " us" << std::endl;
  };

  print_histogram("Write latency:", rocksdb::DB_WRITE);
  print_histogram("Read latency:", rocksdb::DB_GET);
}
3.3 缓存命中率 #
cpp
#include <rocksdb/statistics.h>
// Prints the block cache and MemTable hit rates as percentages, computed from
// the hit/miss tickers of the Statistics object attached to the database.
//
// db: an open database. When its options carry no Statistics object, a notice
//     is written to stderr and nothing else is printed.
void PrintCacheHitRate(rocksdb::DB* db) {
  const auto stats = db->GetOptions().statistics;
  if (!stats) {
    // Dereferencing a null statistics pointer would crash.
    std::cerr << "Statistics are not enabled for this DB" << std::endl;
    return;
  }

  // Percentage of hits among all lookups; 0 when no lookup has been recorded,
  // which avoids the 0/0 division that would otherwise print "nan".
  const auto hit_rate_percent = [](uint64_t hits, uint64_t misses) {
    const uint64_t lookups = hits + misses;
    if (lookups == 0) {
      return 0.0;
    }
    return static_cast<double>(hits) / static_cast<double>(lookups) * 100;
  };

  // Block cache.
  const double block_hit_rate =
      hit_rate_percent(stats->getTickerCount(rocksdb::BLOCK_CACHE_HIT),
                       stats->getTickerCount(rocksdb::BLOCK_CACHE_MISS));
  std::cout << "Block cache hit rate: " << block_hit_rate << "%" << std::endl;

  // MemTable.
  const double mem_hit_rate =
      hit_rate_percent(stats->getTickerCount(rocksdb::MEMTABLE_HIT),
                       stats->getTickerCount(rocksdb::MEMTABLE_MISS));
  std::cout << "MemTable hit rate: " << mem_hit_rate << "%" << std::endl;
}
四、资源指标 #
4.1 内存使用 #
cpp
#include <rocksdb/db.h>
#include <iostream>
// Prints four memory-related integer properties of the database, in whole
// megabytes. A property the database does not report is printed as "n/a"
// instead of an uninitialized or stale number.
//
// db: an open database.
void PrintMemoryStats(rocksdb::DB* db) {
  // Looks up one integer property and prints it in MB; GetIntProperty()
  // returns false when the property is unavailable, in which case the output
  // value must not be used.
  const auto print_mb = [db](const char* label, const char* property) {
    uint64_t bytes = 0;
    if (db->GetIntProperty(property, &bytes)) {
      std::cout << label << bytes / 1024 / 1024 << " MB" << std::endl;
    } else {
      std::cout << label << "n/a" << std::endl;
    }
  };

  // MemTable memory.
  print_mb("MemTable memory: ", "rocksdb.cur-size-all-mem-tables");
  // Block cache usage.
  print_mb("Block cache usage: ", "rocksdb.block-cache-usage");
  // Block cache usage by pinned entries.
  print_mb("Block cache pinned: ", "rocksdb.block-cache-pinned-usage");
  // Estimated memory used by table readers (this is one component, not the
  // total memory footprint of the database).
  print_mb("Table readers memory: ", "rocksdb.estimate-table-readers-mem");
}
4.2 磁盘使用 #
cpp
#include <rocksdb/db.h>
// Prints the total SST size (whole GB), the WAL size (whole MB) and the
// estimated key count. A property the database does not report is printed as
// "n/a" instead of an uninitialized or stale number.
//
// db: an open database.
void PrintDiskStats(rocksdb::DB* db) {
  uint64_t value = 0;

  // Total SST file size. Integer division truncates, so anything below 1 GB
  // prints as 0.
  if (db->GetIntProperty("rocksdb.total-sst-files-size", &value)) {
    std::cout << "Total SST size: " << value / 1024 / 1024 / 1024 << " GB" << std::endl;
  } else {
    std::cout << "Total SST size: n/a" << std::endl;
  }

  // WAL file size.
  // NOTE(review): "rocksdb.wal-file-size" may not be a property that RocksDB
  // exposes -- confirm against DB::Properties in rocksdb/db.h. The return
  // value check keeps the output honest either way.
  if (db->GetIntProperty("rocksdb.wal-file-size", &value)) {
    std::cout << "WAL size: " << value / 1024 / 1024 << " MB" << std::endl;
  } else {
    std::cout << "WAL size: n/a" << std::endl;
  }

  // Estimated number of keys.
  if (db->GetIntProperty("rocksdb.estimate-num-keys", &value)) {
    std::cout << "Estimated keys: " << value << std::endl;
  } else {
    std::cout << "Estimated keys: n/a" << std::endl;
  }
}
五、存储指标 #
5.1 各层文件统计 #
cpp
#include <rocksdb/db.h>
#include <iostream>
// Prints, for every LSM level, the number of SST files and then the total
// size of that level in whole megabytes.
//
// The figures come from DB::GetColumnFamilyMetaData() rather than from
// per-level integer properties: "rocksdb.num-files-at-level<N>" is a string
// property that GetIntProperty() does not serve, and the level count is taken
// from the metadata instead of being assumed to be 7.
// NOTE(review): this overload reports the default column family only --
// confirm that is the intended scope.
//
// db: an open database.
void PrintLevelStats(rocksdb::DB* db) {
  rocksdb::ColumnFamilyMetaData meta;
  db->GetColumnFamilyMetaData(&meta);

  // File count per level.
  for (const auto& level : meta.levels) {
    std::cout << "Level " << level.level << " files: " << level.files.size() << std::endl;
  }

  // Size per level.
  for (const auto& level : meta.levels) {
    std::cout << "Level " << level.level << " size: " << level.size / 1024 / 1024 << " MB" << std::endl;
  }
}
5.2 Compaction状态 #
cpp
#include <rocksdb/db.h>
// Prints the estimated pending compaction volume (whole MB) and the number of
// compactions and flushes currently running. A property the database does not
// report is printed as "n/a" instead of an uninitialized or stale number.
//
// db: an open database.
void PrintCompactionStats(rocksdb::DB* db) {
  uint64_t value = 0;

  // Bytes waiting to be compacted.
  if (db->GetIntProperty("rocksdb.estimate-pending-compaction-bytes", &value)) {
    std::cout << "Pending compaction: " << value / 1024 / 1024 << " MB" << std::endl;
  } else {
    std::cout << "Pending compaction: n/a" << std::endl;
  }

  // Compactions in progress.
  if (db->GetIntProperty("rocksdb.num-running-compactions", &value)) {
    std::cout << "Running compactions: " << value << std::endl;
  } else {
    std::cout << "Running compactions: n/a" << std::endl;
  }

  // Flushes in progress.
  if (db->GetIntProperty("rocksdb.num-running-flushes", &value)) {
    std::cout << "Running flushes: " << value << std::endl;
  } else {
    std::cout << "Running flushes: n/a" << std::endl;
  }
}
六、关键指标清单 #
6.1 性能指标 #
| 指标 | 获取方式 | 说明 |
|---|---|---|
| 读QPS | NUMBER_KEYS_READ | 累计读取键数;对相邻两次采样求差并除以采集间隔,得到每秒读取次数 |
| 写QPS | NUMBER_KEYS_WRITTEN | 累计写入键数;对相邻两次采样求差并除以采集间隔,得到每秒写入次数 |
| 读延迟 | DB_GET直方图 | 读取延迟分布 |
| 写延迟 | DB_WRITE直方图 | 写入延迟分布 |
| Block Cache命中率 | BLOCK_CACHE_HIT/MISS | 缓存效果 |
| MemTable命中率 | MEMTABLE_HIT/MISS | 内存命中效果 |
6.2 资源指标 #
| 指标 | 属性名 | 说明 |
|---|---|---|
| MemTable内存 | cur-size-all-mem-tables | MemTable占用内存 |
| Block Cache使用 | block-cache-usage | 缓存使用量 |
| SST文件大小 | total-sst-files-size | 磁盘数据量 |
| WAL大小 | wal-file-size | WAL文件大小 |
6.3 存储指标 #
| 指标 | 属性名 | 说明 |
|---|---|---|
| 各层文件数 | num-files-at-levelN | 每层SST文件数 |
| 键数量 | estimate-num-keys | 估计键数量 |
| 待Compaction | estimate-pending-compaction-bytes | 待合并数据量 |
七、监控集成 #
7.1 Prometheus集成 #
cpp
#include <rocksdb/db.h>
#include <string>
#include <map>
// Collects a fixed set of integer properties from a database and renders
// them as "name value" lines.
//
// The database pointer is borrowed, not owned: this class never deletes it.
class RocksDBMetrics {
 public:
  // db: the database to query; stored as-is.
  RocksDBMetrics(rocksdb::DB* db) : db_(db) {}

  // Returns metric name -> value for every property the database reports.
  // A property whose lookup fails is left out of the map, so no uninitialized
  // or stale number is ever exported under its name.
  std::map<std::string, uint64_t> CollectMetrics() {
    struct Gauge {
      const char* property;  // RocksDB property name to query
      const char* metric;    // exported metric name
    };
    static constexpr Gauge kGauges[] = {
        // Memory
        {"rocksdb.cur-size-all-mem-tables", "rocksdb_memtable_bytes"},
        {"rocksdb.block-cache-usage", "rocksdb_block_cache_bytes"},
        // Storage
        {"rocksdb.total-sst-files-size", "rocksdb_sst_bytes"},
        {"rocksdb.estimate-num-keys", "rocksdb_num_keys"},
        // Compaction
        {"rocksdb.estimate-pending-compaction-bytes", "rocksdb_pending_compaction_bytes"},
    };

    std::map<std::string, uint64_t> metrics;
    for (const Gauge& gauge : kGauges) {
      uint64_t value = 0;
      if (db_->GetIntProperty(gauge.property, &value)) {
        metrics[gauge.metric] = value;
      }
    }
    return metrics;
  }

  // Returns the collected metrics as text, one "name value" pair per line,
  // in the map's (lexicographic) key order.
  std::string ExportPrometheus() {
    const auto metrics = CollectMetrics();
    std::string output;
    for (const auto& [name, value] : metrics) {
      output += name;
      output += ' ';
      output += std::to_string(value);
      output += '\n';
    }
    return output;
  }

 private:
  rocksdb::DB* db_;  // borrowed; never deleted by this class
};
7.2 定时采集 #
cpp
#include <rocksdb/db.h>
#include <chrono>
#include <thread>
// Samples MemTable memory, block cache usage and total SST size every
// interval_seconds and prints them on one line, in whole megabytes.
//
// The loop has no exit condition, so this function never returns.
//
// db:               an open database.
// interval_seconds: pause between two samples, in seconds.
void MonitorLoop(rocksdb::DB* db, int interval_seconds) {
  while (true) {
    // Start from zero on every round: GetIntProperty() leaves its output
    // untouched when a lookup fails, and reading an uninitialized value is
    // undefined behaviour.
    uint64_t mem_usage = 0;
    uint64_t cache_usage = 0;
    uint64_t sst_size = 0;
    db->GetIntProperty("rocksdb.cur-size-all-mem-tables", &mem_usage);
    db->GetIntProperty("rocksdb.block-cache-usage", &cache_usage);
    db->GetIntProperty("rocksdb.total-sst-files-size", &sst_size);

    // Print, or forward to a monitoring system.
    std::cout << "Memory: " << mem_usage / 1024 / 1024 << " MB, "
              << "Cache: " << cache_usage / 1024 / 1024 << " MB, "
              << "SST: " << sst_size / 1024 / 1024 << " MB" << std::endl;

    std::this_thread::sleep_for(std::chrono::seconds(interval_seconds));
  }
}
八、告警规则 #
8.1 告警阈值建议 #
| 指标 | 警告阈值 | 严重阈值 | 说明 |
|---|---|---|---|
| Block Cache命中率 | < 80% | < 60% | 缓存效果下降 |
| MemTable数量 | > 6 | > 8 | 写入压力大 |
| L0文件数 | > 20 | > 30 | Compaction滞后 |
| 待Compaction | > 10GB | > 50GB | Compaction积压 |
| 读延迟P99 | > 10ms | > 50ms | 读取性能下降 |
| 写延迟P99 | > 20ms | > 100ms | 写入性能下降 |
8.2 告警检查函数 #
cpp
#include <rocksdb/db.h>
#include <string>
#include <vector>
// One finding produced by an alert check.
struct Alert {
  // Short identifier of the check that fired.
  std::string name;
  // Severity: "warning" or "critical".
  std::string level;
  // Human-readable description of the finding.
  std::string message;
};
// Evaluates the L0 file count, the immutable MemTable count and the pending
// compaction volume against warning/critical thresholds and returns one Alert
// per exceeded threshold (the critical tier takes precedence over warning).
// A metric the database does not report produces no alert.
//
// db: an open database.
std::vector<Alert> CheckAlerts(rocksdb::DB* db) {
  // 64-bit on purpose: 50 * 1024 * 1024 * 1024 evaluated in int overflows.
  constexpr uint64_t kGiB = 1024ULL * 1024 * 1024;

  std::vector<Alert> alerts;

  // L0 file count. "rocksdb.num-files-at-level<N>" is a string property, so
  // it is read with GetProperty() and parsed rather than via GetIntProperty().
  std::string l0_text;
  if (db->GetProperty("rocksdb.num-files-at-level0", &l0_text) &&
      !l0_text.empty() && l0_text[0] >= '0' && l0_text[0] <= '9') {
    // The leading-digit check above guarantees stoull finds a number.
    const uint64_t l0_files = std::stoull(l0_text);
    if (l0_files > 30) {
      alerts.push_back({"l0_files", "critical",
                        "L0 files " + std::to_string(l0_files) + " > 30"});
    } else if (l0_files > 20) {
      alerts.push_back({"l0_files", "warning",
                        "L0 files " + std::to_string(l0_files) + " > 20"});
    }
  }

  // Immutable MemTable count.
  uint64_t immutable_memtables = 0;
  if (db->GetIntProperty("rocksdb.num-immutable-mem-table", &immutable_memtables)) {
    if (immutable_memtables > 8) {
      alerts.push_back({"memtable_count", "critical",
                        "Immutable MemTables " + std::to_string(immutable_memtables) + " > 8"});
    } else if (immutable_memtables > 6) {
      alerts.push_back({"memtable_count", "warning",
                        "Immutable MemTables " + std::to_string(immutable_memtables) + " > 6"});
    }
  }

  // Pending compaction volume.
  uint64_t pending_bytes = 0;
  if (db->GetIntProperty("rocksdb.estimate-pending-compaction-bytes", &pending_bytes)) {
    if (pending_bytes > 50 * kGiB) {
      alerts.push_back({"pending_compaction", "critical",
                        "Pending compaction > 50GB"});
    } else if (pending_bytes > 10 * kGiB) {
      alerts.push_back({"pending_compaction", "warning",
                        "Pending compaction > 10GB"});
    }
  }

  return alerts;
}
九、最佳实践 #
9.1 监控建议 #
- 持续监控:建立长期监控机制
- 设置告警:配置关键指标告警
- 定期分析:分析性能趋势
- 基线对比:建立性能基线
- 容量规划:预测资源需求
9.2 监控清单 #
| 类别 | 指标 | 频率 |
|---|---|---|
| 性能 | QPS、延迟 | 实时 |
| 资源 | 内存、磁盘 | 1分钟 |
| 存储 | 文件数、大小 | 5分钟 |
| Compaction | 状态、进度 | 1分钟 |
十、总结 #
10.1 关键监控指标 #
| 类别 | 核心指标 |
|---|---|
| 性能 | QPS、延迟、命中率 |
| 资源 | 内存、磁盘、CPU |
| 存储 | 文件数、数据量 |
| Compaction | 待处理量、进度 |
10.2 关键要点 #
- 启用统计:创建Statistics对象
- 关键指标:关注性能和资源指标
- 设置告警:配置合理的告警阈值
- 定期分析:分析趋势和异常
- 持续优化:基于监控数据优化
下一步,让我们学习工具命令!
最后更新:2026-03-27