监控运维 #
一、监控概述 #
1.1 监控维度 #
text
监控维度:
系统监控
├── CPU使用率
├── 内存使用率
├── 磁盘I/O
├── 网络流量
└── 系统负载
数据库监控
├── 连接数
├── 查询性能
├── 锁等待
├── 事务状态
└── 缓存命中率
TimescaleDB监控
├── 超表状态
├── 分片状态
├── 压缩状态
├── 策略执行
└── 后台作业
1.2 监控工具 #
text
监控工具选择:
PostgreSQL工具
├── pg_stat_activity
├── pg_stat_statements
├── pg_stat_database
└── pg_stat_user_tables
TimescaleDB工具
├── timescaledb_information视图
├── 后台作业监控
└── 分片状态监控
第三方工具
├── Prometheus + Grafana
├── pgAdmin
├── Datadog
└── Zabbix
二、健康检查 #
2.1 数据库健康检查 #
sql
-- 检查数据库连接
SELECT
datname,
numbackends as active_connections,
xact_commit,
xact_rollback,
blks_read,
blks_hit
FROM pg_stat_database
WHERE datname = 'tsdb';
-- 检查活跃连接
SELECT
pid,
usename,
application_name,
client_addr,
state,
query_start,
query
FROM pg_stat_activity
WHERE state = 'active';
-- 检查长时间运行的查询
SELECT
pid,
now() - query_start AS duration,
query,
state
FROM pg_stat_activity
WHERE (now() - query_start) > interval '5 minutes'
ORDER BY duration DESC;
2.2 超表健康检查 #
sql
-- 检查超表状态
SELECT
hypertable_name,
num_chunks,
table_size,
index_size,
toast_size,
total_size
FROM timescaledb_information.hypertables;
-- 检查分片状态
SELECT
hypertable_name,
count(*) as total_chunks,
sum(CASE WHEN is_compressed THEN 1 ELSE 0 END) as compressed_chunks,
sum(CASE WHEN NOT is_compressed THEN 1 ELSE 0 END) as uncompressed_chunks
FROM timescaledb_information.chunks
GROUP BY hypertable_name;
-- 检查分片大小分布
SELECT
hypertable_name,
pg_size_pretty(avg(total_bytes)::bigint) as avg_size,
pg_size_pretty(min(total_bytes)::bigint) as min_size,
pg_size_pretty(max(total_bytes)::bigint) as max_size
FROM chunk_relation_size('sensor_data')
GROUP BY hypertable_name;
2.3 策略健康检查 #
sql
-- 检查后台作业状态
SELECT
job_id,
application_name,
schedule_interval,
last_run,
next_run,
last_run_success,
total_runs,
total_failures
FROM timescaledb_information.jobs j
LEFT JOIN timescaledb_information.job_stats s ON j.job_id = s.job_id
ORDER BY job_id;
-- 检查策略执行错误
SELECT
job_id,
err_time,
err_message
FROM timescaledb_information.job_errors
ORDER BY err_time DESC
LIMIT 10;
-- 检查压缩策略
SELECT
hypertable_name,
config->'compress_after' as compress_after
FROM timescaledb_information.jobs
WHERE proc_name = 'policy_compression';
-- 检查保留策略
SELECT
hypertable_name,
config->'drop_after' as drop_after
FROM timescaledb_information.jobs
WHERE proc_name = 'policy_retention';
三、性能监控 #
3.1 查询性能监控 #
sql
-- 启用pg_stat_statements扩展
CREATE EXTENSION IF NOT EXISTS pg_stat_statements;
-- 查看慢查询
SELECT
query,
calls,
total_time / 1000 as total_seconds,
mean_time / 1000 as mean_seconds,
max_time / 1000 as max_seconds,
rows
FROM pg_stat_statements
ORDER BY mean_time DESC
LIMIT 10;
-- 查看最耗时的查询
SELECT
query,
total_time / 1000 as total_seconds,
calls
FROM pg_stat_statements
ORDER BY total_time DESC
LIMIT 10;
-- 查看最频繁的查询
SELECT
query,
calls,
mean_time / 1000 as mean_seconds
FROM pg_stat_statements
ORDER BY calls DESC
LIMIT 10;
3.2 索引使用监控 #
sql
-- 查看索引使用情况
SELECT
schemaname,
tablename,
indexrelname,
idx_scan,
idx_tup_read,
idx_tup_fetch,
pg_size_pretty(pg_relation_size(indexrelid)) as size
FROM pg_stat_user_indexes
ORDER BY idx_scan;
-- 查找未使用的索引
SELECT
schemaname,
tablename,
indexrelname,
pg_size_pretty(pg_relation_size(indexrelid)) as size
FROM pg_stat_user_indexes
WHERE idx_scan = 0
ORDER BY pg_relation_size(indexrelid) DESC;
-- 查看表扫描统计
SELECT
schemaname,
tablename,
seq_scan,
seq_tup_read,
idx_scan,
idx_tup_fetch,
CASE
WHEN seq_scan > idx_scan THEN 'SEQ_SCAN_HIGH'
ELSE 'OK'
END as scan_status
FROM pg_stat_user_tables
ORDER BY seq_tup_read DESC;
3.3 缓存命中率监控 #
sql
-- 表缓存命中率
SELECT
schemaname,
tablename,
heap_blks_read,
heap_blks_hit,
CASE
WHEN heap_blks_hit + heap_blks_read = 0 THEN 0
ELSE round(100.0 * heap_blks_hit / (heap_blks_hit + heap_blks_read), 2)
END as cache_hit_ratio
FROM pg_statio_user_tables
ORDER BY heap_blks_read DESC;
-- 索引缓存命中率
SELECT
schemaname,
tablename,
idx_blks_read,
idx_blks_hit,
CASE
WHEN idx_blks_hit + idx_blks_read = 0 THEN 0
ELSE round(100.0 * idx_blks_hit / (idx_blks_hit + idx_blks_read), 2)
END as cache_hit_ratio
FROM pg_statio_user_tables
WHERE idx_blks_read > 0
ORDER BY idx_blks_read DESC;
-- 整体缓存命中率
SELECT
sum(heap_blks_read) as heap_read,
sum(heap_blks_hit) as heap_hit,
round(100.0 * sum(heap_blks_hit) / (sum(heap_blks_hit) + sum(heap_blks_read)), 2) as heap_ratio,
sum(idx_blks_read) as idx_read,
sum(idx_blks_hit) as idx_hit,
round(100.0 * sum(idx_blks_hit) / (sum(idx_blks_hit) + sum(idx_blks_read)), 2) as idx_ratio
FROM pg_statio_user_tables;
四、日志管理 #
4.1 配置日志 #
ini
# postgresql.conf
# 启用日志收集
logging_collector = on
log_directory = 'log'
log_filename = 'postgresql-%Y-%m-%d_%H%M%S.log'
log_rotation_age = 1d
log_rotation_size = 100MB
# 日志内容配置
log_min_duration_statement = 1000 # 记录超过1秒的查询
log_checkpoints = on
log_connections = on
log_disconnections = on
log_lock_waits = on
log_temp_files = 0
# 日志格式
log_line_prefix = '%t [%p]: [%l-1] user=%u,db=%d,app=%a,client=%h '
log_timezone = 'UTC'
4.2 查看日志 #
bash
# 查看最新日志
tail -f /var/log/postgresql/postgresql-15-main.log
# 搜索慢查询
grep "duration:" /var/log/postgresql/*.log | tail -20
# 搜索错误
grep "ERROR" /var/log/postgresql/*.log | tail -20
# 搜索特定表
grep "sensor_data" /var/log/postgresql/*.log | tail -20
4.3 日志分析 #
bash
# 使用pgBadger分析日志
pgbadger /var/log/postgresql/postgresql-15-main.log -o report.html
# 分析特定时间范围
pgbadger /var/log/postgresql/*.log -b "2024-01-01 00:00:00" -e "2024-01-31 23:59:59" -o report.html
# 只分析慢查询
pgbadger /var/log/postgresql/*.log -f stderr -o report.html
五、告警配置 #
5.1 告警规则 #
text
告警规则建议:
系统告警
├── CPU使用率 > 80%
├── 内存使用率 > 85%
├── 磁盘使用率 > 85%
└── 系统负载 > CPU核心数
数据库告警
├── 活跃连接数 > max_connections * 0.8
├── 慢查询 > 10秒
├── 锁等待 > 30秒
└── 复制延迟 > 1分钟
TimescaleDB告警
├── 压缩策略失败
├── 保留策略失败
├── 连续聚合刷新失败
└── 分片数量异常
5.2 监控脚本 #
bash
#!/bin/bash
# check_timescaledb_health.sh
# 配置
ALERT_EMAIL="admin@example.com"
DB_NAME="tsdb"
# 检查连接数
CONNECTIONS=$(psql -U postgres -d $DB_NAME -t -c "
SELECT count(*) FROM pg_stat_activity;
")
MAX_CONNECTIONS=$(psql -U postgres -d $DB_NAME -t -c "
SELECT setting FROM pg_settings WHERE name = 'max_connections';
")
RATIO=$(echo "scale=2; $CONNECTIONS / $MAX_CONNECTIONS" | bc)
if [ $(echo "$RATIO > 0.8" | bc) -eq 1 ]; then
echo "Warning: Connection usage is ${RATIO}%" | mail -s "TimescaleDB Alert" $ALERT_EMAIL
fi
# 检查慢查询
SLOW_QUERIES=$(psql -U postgres -d $DB_NAME -t -c "
SELECT count(*) FROM pg_stat_activity
WHERE now() - query_start > interval '5 minutes';
")
if [ "$SLOW_QUERIES" -gt 0 ]; then
echo "Warning: $SLOW_QUERIES slow queries detected" | mail -s "TimescaleDB Alert" $ALERT_EMAIL
fi
# 检查策略执行
FAILED_JOBS=$(psql -U postgres -d $DB_NAME -t -c "
SELECT count(*) FROM timescaledb_information.job_stats
WHERE last_run_success = false;
")
if [ "$FAILED_JOBS" -gt 0 ]; then
echo "Warning: $FAILED_JOBS jobs failed" | mail -s "TimescaleDB Alert" $ALERT_EMAIL
fi
5.3 Prometheus监控 #
yaml
# prometheus.yml
scrape_configs:
- job_name: 'postgresql'
static_configs:
- targets: ['localhost:9187']
# postgres_exporter配置
DATA_SOURCE_NAME="postgresql://postgres:password@localhost:5432/tsdb?sslmode=disable"
sql
-- 自定义监控指标
CREATE OR REPLACE FUNCTION timescaledb_metrics()
RETURNS TABLE(
metric_name TEXT,
metric_value DOUBLE PRECISION
) AS $$
BEGIN
RETURN QUERY
SELECT 'chunk_count'::TEXT, count(*)::DOUBLE PRECISION
FROM timescaledb_information.chunks;
RETURN QUERY
SELECT 'compressed_chunk_ratio'::TEXT,
(SELECT count(*) FROM timescaledb_information.chunks WHERE is_compressed)::DOUBLE PRECISION /
NULLIF((SELECT count(*) FROM timescaledb_information.chunks), 0);
RETURN QUERY
SELECT 'hypertable_count'::TEXT, count(*)::DOUBLE PRECISION
FROM timescaledb_information.hypertables;
END;
$$ LANGUAGE plpgsql;
六、运维任务 #
6.1 定期维护任务 #
sql
-- 更新统计信息
ANALYZE;
-- 重建索引
REINDEX DATABASE tsdb;
-- 清理空间
VACUUM;
-- 检查数据库一致性
-- 需要在维护窗口执行
-- pg_checksums --check /var/lib/postgresql/15/main
6.2 自动维护脚本 #
bash
#!/bin/bash
# maintenance.sh
# 配置
DB_NAME="tsdb"
LOG_FILE="/var/log/timescaledb/maintenance.log"
# 记录日志函数
log() {
echo "$(date): $1" >> $LOG_FILE
}
# 更新统计信息
log "Running ANALYZE..."
psql -U postgres -d $DB_NAME -c "ANALYZE;" >> $LOG_FILE 2>&1
# 清理空间
log "Running VACUUM..."
psql -U postgres -d $DB_NAME -c "VACUUM;" >> $LOG_FILE 2>&1
# 检查索引使用
log "Checking unused indexes..."
psql -U postgres -d $DB_NAME -c "
SELECT indexrelname, pg_size_pretty(pg_relation_size(indexrelid)) as size
FROM pg_stat_user_indexes
WHERE idx_scan = 0
ORDER BY pg_relation_size(indexrelid) DESC
LIMIT 10;
" >> $LOG_FILE 2>&1
log "Maintenance completed"
6.3 定时任务 #
bash
# crontab配置
# 每天凌晨3点执行维护
0 3 * * * /path/to/maintenance.sh
# 每小时检查健康状态
0 * * * * /path/to/check_health.sh
# 每周检查备份
0 4 * * 0 /path/to/verify_backup.sh
七、故障排查 #
7.1 连接问题 #
sql
-- 检查最大连接数
SHOW max_connections;
-- 检查当前连接数
SELECT count(*) FROM pg_stat_activity;
-- 检查连接来源
SELECT
client_addr,
count(*) as connection_count
FROM pg_stat_activity
GROUP BY client_addr
ORDER BY connection_count DESC;
-- 终止空闲连接
SELECT pg_terminate_backend(pid)
FROM pg_stat_activity
WHERE state = 'idle'
AND query_start < NOW() - INTERVAL '1 hour';
7.2 性能问题 #
sql
-- 检查锁等待
SELECT
blocked_locks.pid AS blocked_pid,
blocked_activity.usename AS blocked_user,
blocking_locks.pid AS blocking_pid,
blocking_activity.usename AS blocking_user,
blocked_activity.query AS blocked_statement,
blocking_activity.query AS blocking_statement
FROM pg_catalog.pg_locks blocked_locks
JOIN pg_catalog.pg_stat_activity blocked_activity ON blocked_activity.pid = blocked_locks.pid
JOIN pg_catalog.pg_locks blocking_locks ON blocking_locks.locktype = blocked_locks.locktype
AND blocking_locks.relation = blocked_locks.relation
AND blocking_locks.pid != blocked_locks.pid
JOIN pg_catalog.pg_stat_activity blocking_activity ON blocking_activity.pid = blocking_locks.pid;
-- 检查表膨胀
SELECT
schemaname,
tablename,
n_live_tup,
n_dead_tup,
CASE
WHEN n_live_tup = 0 THEN 0
ELSE round(100.0 * n_dead_tup / n_live_tup, 2)
END as dead_ratio
FROM pg_stat_user_tables
WHERE n_dead_tup > 0
ORDER BY n_dead_tup DESC;
7.3 策略问题 #
sql
-- 检查策略执行状态
SELECT
j.job_id,
j.application_name,
j.schedule_interval,
s.last_run,
s.last_run_success,
s.total_failures
FROM timescaledb_information.jobs j
LEFT JOIN timescaledb_information.job_stats s ON j.job_id = s.job_id
WHERE s.last_run_success = false;
-- 检查策略错误
SELECT
job_id,
err_time,
err_message
FROM timescaledb_information.job_errors
ORDER BY err_time DESC;
-- 手动执行策略
SELECT run_job(job_id)
FROM timescaledb_information.jobs
WHERE application_name = 'policy_retention';
八、容量规划 #
8.1 存储容量监控 #
sql
-- 查看数据库大小
SELECT
datname,
pg_size_pretty(pg_database_size(datname)) as size
FROM pg_database;
-- 查看超表大小
SELECT
hypertable_name,
pg_size_pretty(total_size) as total_size,
pg_size_pretty(table_size) as table_size,
pg_size_pretty(index_size) as index_size
FROM timescaledb_information.hypertables;
-- 查看分片大小趋势
SELECT
date_trunc('week', range_start) as week,
count(*) as chunk_count,
pg_size_pretty(sum(total_bytes)) as size
FROM chunk_relation_size('sensor_data') c
JOIN timescaledb_information.chunks ch ON c.chunk_name = ch.chunk_name
GROUP BY date_trunc('week', range_start)
ORDER BY week DESC;
8.2 增长预测 #
sql
-- 计算数据增长速度
WITH daily_sizes AS (
SELECT
date_trunc('day', range_start) as day,
sum(total_bytes) as daily_size
FROM chunk_relation_size('sensor_data') c
JOIN timescaledb_information.chunks ch ON c.chunk_name = ch.chunk_name
GROUP BY date_trunc('day', range_start)
)
SELECT
avg(daily_size) as avg_daily_growth,
pg_size_pretty(avg(daily_size)::bigint) as avg_daily_growth_pretty,
avg(daily_size) * 30 as projected_monthly_growth,
pg_size_pretty((avg(daily_size) * 30)::bigint) as projected_monthly_growth_pretty
FROM daily_sizes;
九、总结 #
监控运维要点:
| 监控维度 | 关键指标 | 工具 |
|---|---|---|
| 系统 | CPU、内存、磁盘 | 系统工具 |
| 数据库 | 连接、查询、锁 | pg_stat_* |
| TimescaleDB | 超表、分片、策略 | timescaledb_information |
最佳实践:
- 定期监控:设置监控和告警
- 日志管理:配置日志收集和分析
- 定期维护:执行维护任务
- 容量规划:监控数据增长
恭喜你完成TimescaleDB完全指南的学习!
最后更新:2026-03-27