RocksDB监控指标 #

一、监控概述 #

1.1 为什么需要监控 #

text
监控的重要性:
├── 性能洞察 - 了解系统运行状态
├── 问题诊断 - 快速定位问题
├── 容量规划 - 预测资源需求
├── 告警预警 - 及时发现问题
└── 优化指导 - 指导性能优化

1.2 监控维度 #

维度 说明
性能指标 吞吐、延迟、命中率
资源指标 内存、磁盘、CPU
存储指标 数据量、文件数、压缩比
操作指标 读写次数、Compaction状态

二、启用统计 #

2.1 创建Statistics对象 #

cpp
#include <rocksdb/db.h>
#include <rocksdb/statistics.h>
#include <rocksdb/options.h>

// Returns a default-constructed Options whose `statistics` member points at a
// newly created statistics object with its stats level set to kAll.
rocksdb::Options GetOptionsWithStatistics() {
    // Build and configure the collector first, then attach it.
    auto collector = rocksdb::CreateDBStatistics();
    collector->set_stats_level(rocksdb::StatsLevel::kAll);

    rocksdb::Options result;
    result.statistics = collector;
    return result;
}

2.2 统计级别 #

cpp
// Illustrative listing of the statistics levels as presented by this guide.
// NOTE(review): the enumerator names and their order may not match the
// StatsLevel declared in <rocksdb/statistics.h> for your RocksDB version;
// confirm against the header before relying on any name other than kAll.
enum StatsLevel {
    kExceptHistogramOrTimers,  // excludes histograms
    kExceptHistogram,          // excludes detailed histograms
    kExceptTimers,             // excludes timers
    kAll                       // all statistics
};

// Set the level
options.statistics->set_stats_level(rocksdb::StatsLevel::kAll);

三、性能指标 #

3.1 读写性能 #

cpp
#include <rocksdb/statistics.h>
#include <iostream>

// Prints the write and read ticker totals (key counts and byte volumes) of
// `db` to stdout.
//
// Reads NUMBER_KEYS_WRITTEN, BYTES_WRITTEN, NUMBER_KEYS_READ and BYTES_READ
// from the statistics object attached to the DB options. Byte totals are
// printed in whole MB (integer division by 1024 twice).
//
// db: open database handle; must not be null.
void PrintReadWriteStats(rocksdb::DB* db) {
    const auto stats = db->GetOptions().statistics;
    // The statistics pointer is empty unless one was attached to the options
    // before opening the DB; report that instead of dereferencing null.
    if (!stats) {
        std::cerr << "Statistics are not enabled for this DB" << std::endl;
        return;
    }

    // Write-side counters.
    const uint64_t keys_written = stats->getTickerCount(rocksdb::NUMBER_KEYS_WRITTEN);
    const uint64_t bytes_written = stats->getTickerCount(rocksdb::BYTES_WRITTEN);

    std::cout << "Keys written: " << keys_written << std::endl;
    std::cout << "Bytes written: " << bytes_written / 1024 / 1024 << " MB" << std::endl;

    // Read-side counters.
    const uint64_t keys_read = stats->getTickerCount(rocksdb::NUMBER_KEYS_READ);
    const uint64_t bytes_read = stats->getTickerCount(rocksdb::BYTES_READ);

    std::cout << "Keys read: " << keys_read << std::endl;
    std::cout << "Bytes read: " << bytes_read / 1024 / 1024 << " MB" << std::endl;
}

3.2 延迟统计 #

cpp
#include <rocksdb/statistics.h>

// Prints the P50 / P95 / P99 values of the DB_WRITE and DB_GET histograms of
// `db` to stdout, labelled in microseconds.
//
// db: open database handle; must not be null.
void PrintLatencyStats(rocksdb::DB* db) {
    const auto stats = db->GetOptions().statistics;
    // The statistics pointer is empty unless one was attached to the options
    // before opening the DB; report that instead of dereferencing null.
    if (!stats) {
        std::cerr << "Statistics are not enabled for this DB" << std::endl;
        return;
    }

    // Statistics exposes histograms through histogramData(type, &out), which
    // fills a caller-provided HistogramData; there is no value-returning
    // getHistogramData() accessor.
    const auto print_percentiles = [&stats](const char* title, uint32_t histogram) {
        rocksdb::HistogramData data;
        stats->histogramData(histogram, &data);

        std::cout << title << std::endl;
        std::cout << "  P50: " << data.median << " us" << std::endl;
        std::cout << "  P95: " << data.percentile95 << " us" << std::endl;
        std::cout << "  P99: " << data.percentile99 << " us" << std::endl;
    };

    // Write latency
    print_percentiles("Write latency:", rocksdb::DB_WRITE);

    // Read latency
    print_percentiles("Read latency:", rocksdb::DB_GET);
}

3.3 缓存命中率 #

cpp
#include <rocksdb/statistics.h>

// Prints the block cache and MemTable hit rates of `db` to stdout as
// percentages, computed as hits / (hits + misses) * 100 from the
// BLOCK_CACHE_HIT/MISS and MEMTABLE_HIT/MISS tickers.
//
// db: open database handle; must not be null.
void PrintCacheHitRate(rocksdb::DB* db) {
    const auto stats = db->GetOptions().statistics;
    // The statistics pointer is empty unless one was attached to the options
    // before opening the DB; report that instead of dereferencing null.
    if (!stats) {
        std::cerr << "Statistics are not enabled for this DB" << std::endl;
        return;
    }

    // With no lookups recorded yet the ratio would be 0/0 (NaN); report 0
    // in that case so the output stays a valid number.
    const auto hit_rate_percent = [](uint64_t hits, uint64_t misses) {
        const uint64_t lookups = hits + misses;
        if (lookups == 0) {
            return 0.0;
        }
        return static_cast<double>(hits) / static_cast<double>(lookups) * 100.0;
    };

    // Block cache
    const uint64_t block_hits = stats->getTickerCount(rocksdb::BLOCK_CACHE_HIT);
    const uint64_t block_misses = stats->getTickerCount(rocksdb::BLOCK_CACHE_MISS);
    std::cout << "Block cache hit rate: " << hit_rate_percent(block_hits, block_misses)
              << "%" << std::endl;

    // MemTable
    const uint64_t mem_hits = stats->getTickerCount(rocksdb::MEMTABLE_HIT);
    const uint64_t mem_misses = stats->getTickerCount(rocksdb::MEMTABLE_MISS);
    std::cout << "MemTable hit rate: " << hit_rate_percent(mem_hits, mem_misses)
              << "%" << std::endl;
}

四、资源指标 #

4.1 内存使用 #

cpp
#include <rocksdb/db.h>
#include <iostream>

// Prints four memory-related integer properties of `db` to stdout, each in
// whole MB (integer division by 1024 twice). A property the DB cannot supply
// is printed as "N/A" instead of an indeterminate value.
//
// db: open database handle; must not be null.
void PrintMemoryStats(rocksdb::DB* db) {
    // GetIntProperty() returns false when the property is unavailable; the
    // output variable must not be read in that case.
    const auto print_mb = [db](const char* label, const char* property) {
        uint64_t bytes = 0;
        if (db->GetIntProperty(property, &bytes)) {
            std::cout << label << ": " << bytes / 1024 / 1024 << " MB" << std::endl;
        } else {
            std::cout << label << ": N/A" << std::endl;
        }
    };

    // MemTable memory
    print_mb("MemTable memory", "rocksdb.cur-size-all-mem-tables");

    // Block cache usage
    print_mb("Block cache usage", "rocksdb.block-cache-usage");

    // Block cache pinned usage
    print_mb("Block cache pinned", "rocksdb.block-cache-pinned-usage");

    // Estimated memory held by table readers (this is one component, not the
    // total memory footprint of the DB).
    print_mb("Table readers memory", "rocksdb.estimate-table-readers-mem");
}

4.2 磁盘使用 #

cpp
#include <rocksdb/db.h>

// Prints disk-related integer properties of `db` to stdout: total SST size in
// whole GB, WAL size in whole MB, and the estimated key count. A property the
// DB cannot supply is printed as "N/A" rather than reusing a stale value.
//
// db: open database handle; must not be null.
void PrintDiskStats(rocksdb::DB* db) {
    // Each read gets its own zero-initialised variable, and the value is only
    // used when GetIntProperty() reports success.

    // Total size of SST files
    uint64_t sst_bytes = 0;
    if (db->GetIntProperty("rocksdb.total-sst-files-size", &sst_bytes)) {
        std::cout << "Total SST size: " << sst_bytes / 1024 / 1024 / 1024 << " GB" << std::endl;
    } else {
        std::cout << "Total SST size: N/A" << std::endl;
    }

    // WAL file size
    // NOTE(review): "rocksdb.wal-file-size" does not appear to be a standard
    // DB property name; confirm against DB::Properties for your version.
    uint64_t wal_bytes = 0;
    if (db->GetIntProperty("rocksdb.wal-file-size", &wal_bytes)) {
        std::cout << "WAL size: " << wal_bytes / 1024 / 1024 << " MB" << std::endl;
    } else {
        std::cout << "WAL size: N/A" << std::endl;
    }

    // Estimated number of keys
    uint64_t key_count = 0;
    if (db->GetIntProperty("rocksdb.estimate-num-keys", &key_count)) {
        std::cout << "Estimated keys: " << key_count << std::endl;
    } else {
        std::cout << "Estimated keys: N/A" << std::endl;
    }
}

五、存储指标 #

5.1 各层文件统计 #

cpp
#include <rocksdb/db.h>
#include <iostream>

// Prints, for every LSM level of `db`, the number of SST files and then the
// level size in whole MB. A property the DB cannot supply is printed as "N/A"
// rather than reusing the value read for a previous level.
//
// db: open database handle; must not be null.
void PrintLevelStats(rocksdb::DB* db) {
    // Ask the DB how many levels it is configured with instead of assuming 7.
    const int num_levels = db->NumberLevels();

    // File count per level
    for (int level = 0; level < num_levels; ++level) {
        const std::string property =
            "rocksdb.num-files-at-level" + std::to_string(level);
        uint64_t file_count = 0;
        if (db->GetIntProperty(property, &file_count)) {
            std::cout << "Level " << level << " files: " << file_count << std::endl;
        } else {
            std::cout << "Level " << level << " files: N/A" << std::endl;
        }
    }

    // Size per level
    // NOTE(review): "rocksdb.live-sst-files-size-at-level<N>" does not appear
    // to be a standard DB property name; confirm against DB::Properties for
    // your version.
    for (int level = 0; level < num_levels; ++level) {
        const std::string property =
            "rocksdb.live-sst-files-size-at-level" + std::to_string(level);
        uint64_t level_bytes = 0;
        if (db->GetIntProperty(property, &level_bytes)) {
            std::cout << "Level " << level << " size: " << level_bytes / 1024 / 1024
                      << " MB" << std::endl;
        } else {
            std::cout << "Level " << level << " size: N/A" << std::endl;
        }
    }
}

5.2 Compaction状态 #

cpp
#include <rocksdb/db.h>

// Prints compaction-related integer properties of `db` to stdout: estimated
// pending compaction volume in whole MB, and the numbers of running
// compactions and running flushes. A property the DB cannot supply is printed
// as "N/A" instead of an indeterminate or stale value.
//
// db: open database handle; must not be null.
void PrintCompactionStats(rocksdb::DB* db) {
    // Bytes waiting to be compacted
    uint64_t pending_bytes = 0;
    if (db->GetIntProperty("rocksdb.estimate-pending-compaction-bytes", &pending_bytes)) {
        std::cout << "Pending compaction: " << pending_bytes / 1024 / 1024 << " MB" << std::endl;
    } else {
        std::cout << "Pending compaction: N/A" << std::endl;
    }

    // Compactions currently running
    uint64_t running_compactions = 0;
    if (db->GetIntProperty("rocksdb.num-running-compactions", &running_compactions)) {
        std::cout << "Running compactions: " << running_compactions << std::endl;
    } else {
        std::cout << "Running compactions: N/A" << std::endl;
    }

    // Flushes currently running
    uint64_t running_flushes = 0;
    if (db->GetIntProperty("rocksdb.num-running-flushes", &running_flushes)) {
        std::cout << "Running flushes: " << running_flushes << std::endl;
    } else {
        std::cout << "Running flushes: N/A" << std::endl;
    }
}

六、关键指标清单 #

6.1 性能指标 #

指标 获取方式 说明
读QPS NUMBER_KEYS_READ 每秒读取次数(该计数器为累计值,需对相邻两次采样求差并除以采样间隔)
写QPS NUMBER_KEYS_WRITTEN 每秒写入次数(该计数器为累计值,需对相邻两次采样求差并除以采样间隔)
读延迟 DB_GET直方图 读取延迟分布
写延迟 DB_WRITE直方图 写入延迟分布
Block Cache命中率 BLOCK_CACHE_HIT/MISS 缓存效果
MemTable命中率 MEMTABLE_HIT/MISS 内存命中效果

6.2 资源指标 #

指标 属性名 说明
MemTable内存 cur-size-all-mem-tables MemTable占用内存
Block Cache使用 block-cache-usage 缓存使用量
SST文件大小 total-sst-files-size 磁盘数据量
WAL大小 wal-file-size WAL文件大小(注:请先确认所用RocksDB版本支持该属性;属性不可用时GetIntProperty返回false)

6.3 存储指标 #

指标 属性名 说明
各层文件数 num-files-at-levelN 每层SST文件数
键数量 estimate-num-keys 估计键数量
待Compaction estimate-pending-compaction-bytes 待合并数据量

七、监控集成 #

7.1 Prometheus集成 #

cpp
#include <rocksdb/db.h>
#include <string>
#include <map>

// Collects a fixed set of integer DB properties and renders them as
// "<metric_name> <value>" lines, one metric per line.
//
// The object stores the DB pointer it is given and does not take ownership;
// the caller must keep the DB alive for as long as this object is used.
class RocksDBMetrics {
public:
    RocksDBMetrics(rocksdb::DB* db) : db_(db) {}

    // Reads each tracked property and returns a map from exported metric name
    // to its current value. A property the DB cannot supply is left out of
    // the map, so no indeterminate or stale value is ever exported.
    std::map<std::string, uint64_t> CollectMetrics() {
        struct Source {
            const char* metric;    // exported metric name
            const char* property;  // RocksDB property to read
        };
        static constexpr Source kSources[] = {
            // Memory metrics
            {"rocksdb_memtable_bytes", "rocksdb.cur-size-all-mem-tables"},
            {"rocksdb_block_cache_bytes", "rocksdb.block-cache-usage"},
            // Storage metrics
            {"rocksdb_sst_bytes", "rocksdb.total-sst-files-size"},
            {"rocksdb_num_keys", "rocksdb.estimate-num-keys"},
            // Compaction metrics
            {"rocksdb_pending_compaction_bytes",
             "rocksdb.estimate-pending-compaction-bytes"},
        };

        std::map<std::string, uint64_t> metrics;
        for (const Source& source : kSources) {
            uint64_t value = 0;
            // GetIntProperty() returns false when the property is unavailable.
            if (db_->GetIntProperty(source.property, &value)) {
                metrics[source.metric] = value;
            }
        }
        return metrics;
    }

    // Renders the collected metrics as text: one "<name> <value>\n" line per
    // metric, in the map's (lexicographic) key order.
    std::string ExportPrometheus() {
        const auto metrics = CollectMetrics();
        std::string output;

        for (const auto& [name, value] : metrics) {
            output += name;
            output += ' ';
            output += std::to_string(value);
            output += '\n';
        }

        return output;
    }

private:
    rocksdb::DB* db_;  // not owned
};

7.2 定时采集 #

cpp
#include <rocksdb/db.h>
#include <chrono>
#include <thread>

// Repeatedly samples three size properties of `db`, prints them to stdout in
// whole MB on a single line, then sleeps for `interval_seconds`.
//
// This function never returns; run it on a dedicated thread.
//
// db:               open database handle; must not be null and must outlive
//                   the loop.
// interval_seconds: pause between samples, in seconds.
void MonitorLoop(rocksdb::DB* db, int interval_seconds) {
    // Returns the property value in whole MB. A property the DB cannot supply
    // is reported as 0 rather than an indeterminate value.
    const auto read_mb = [db](const char* property) -> uint64_t {
        uint64_t bytes = 0;
        if (!db->GetIntProperty(property, &bytes)) {
            return 0;
        }
        return bytes / 1024 / 1024;
    };

    while (true) {
        // Sample the metrics
        const uint64_t mem_mb = read_mb("rocksdb.cur-size-all-mem-tables");
        const uint64_t cache_mb = read_mb("rocksdb.block-cache-usage");
        const uint64_t sst_mb = read_mb("rocksdb.total-sst-files-size");

        // Print, or forward to a monitoring system
        std::cout << "Memory: " << mem_mb << " MB, "
                  << "Cache: " << cache_mb << " MB, "
                  << "SST: " << sst_mb << " MB" << std::endl;

        std::this_thread::sleep_for(std::chrono::seconds(interval_seconds));
    }
}

八、告警规则 #

8.1 告警阈值建议 #

指标 警告阈值 严重阈值 说明
Block Cache命中率 < 80% < 60% 缓存效果下降
MemTable数量 > 6 > 8 写入压力大
L0文件数 > 20 > 30 Compaction滞后
待Compaction > 10GB > 50GB Compaction积压
读延迟P99 > 10ms > 50ms 读取性能下降
写延迟P99 > 20ms > 100ms 写入性能下降

8.2 告警检查函数 #

cpp
#include <rocksdb/db.h>
#include <string>

// One alert produced by a threshold check.
struct Alert {
    std::string name;     // identifier of the check that fired
    std::string level;  // "warning" or "critical"
    std::string message;  // human-readable description of the condition
};

// Evaluates threshold checks against integer properties of `db` and returns
// the alerts that fired (empty when everything is within limits).
//
// Checks performed:
//   - L0 file count:            warning > 20,   critical > 30
//   - immutable MemTable count: warning > 6,    critical > 8
//   - pending compaction bytes: warning > 10GB, critical > 50GB
//
// A property the DB cannot supply is skipped; no alert is derived from an
// unread value.
//
// db: open database handle; must not be null.
std::vector<Alert> CheckAlerts(rocksdb::DB* db) {
    // Thresholds are computed in 64-bit arithmetic: 50 * 1024 * 1024 * 1024
    // evaluated as `int` overflows.
    constexpr uint64_t kGiB = 1024ULL * 1024 * 1024;
    constexpr uint64_t kPendingCritical = 50 * kGiB;
    constexpr uint64_t kPendingWarning = 10 * kGiB;

    std::vector<Alert> alerts;
    uint64_t value = 0;

    // L0 file count
    if (db->GetIntProperty("rocksdb.num-files-at-level0", &value)) {
        if (value > 30) {
            alerts.push_back({"l0_files", "critical",
                "L0 files " + std::to_string(value) + " > 30"});
        } else if (value > 20) {
            alerts.push_back({"l0_files", "warning",
                "L0 files " + std::to_string(value) + " > 20"});
        }
    }

    // Immutable MemTable count
    if (db->GetIntProperty("rocksdb.num-immutable-mem-table", &value)) {
        if (value > 8) {
            alerts.push_back({"memtable_count", "critical",
                "Immutable MemTables " + std::to_string(value) + " > 8"});
        } else if (value > 6) {
            alerts.push_back({"memtable_count", "warning",
                "Immutable MemTables " + std::to_string(value) + " > 6"});
        }
    }

    // Pending compaction volume
    if (db->GetIntProperty("rocksdb.estimate-pending-compaction-bytes", &value)) {
        if (value > kPendingCritical) {
            alerts.push_back({"pending_compaction", "critical",
                "Pending compaction > 50GB"});
        } else if (value > kPendingWarning) {
            alerts.push_back({"pending_compaction", "warning",
                "Pending compaction > 10GB"});
        }
    }

    return alerts;
}

九、最佳实践 #

9.1 监控建议 #

  1. 持续监控:建立长期监控机制
  2. 设置告警:配置关键指标告警
  3. 定期分析:分析性能趋势
  4. 基线对比:建立性能基线
  5. 容量规划:预测资源需求

9.2 监控清单 #

类别 指标 频率
性能 QPS、延迟 实时
资源 内存、磁盘 1分钟
存储 文件数、大小 5分钟
Compaction 状态、进度 1分钟

十、总结 #

10.1 关键监控指标 #

类别 核心指标
性能 QPS、延迟、命中率
资源 内存、磁盘、CPU
存储 文件数、数据量
Compaction 待处理量、进度

10.2 关键要点 #

  1. 启用统计:创建Statistics对象
  2. 关键指标:关注性能和资源指标
  3. 设置告警:配置合理的告警阈值
  4. 定期分析:分析趋势和异常
  5. 持续优化:基于监控数据优化

下一步,让我们学习工具命令!

最后更新:2026-03-27