监控告警 #
一、监控概述 #
1.1 监控指标 #
| 指标类型 | 关键指标 |
|---|---|
| 性能指标 | QPS、延迟、命中率 |
| 资源指标 | CPU、内存、连接数 |
| 业务指标 | 缓存大小、对象数 |
| 错误指标 | 错误率、后端故障 |
1.2 监控架构 #
text
┌─────────────────────────────────────────────────────────┐
│                        监控架构                         │
├─────────────────────────────────────────────────────────┤
│                                                         │
│ Varnish ──► varnish_exporter ──► Prometheus ──► Grafana │
│                                       │                 │
│                                       ▼                 │
│                                 Alertmanager            │
│                                                         │
└─────────────────────────────────────────────────────────┘
二、Prometheus集成 #
2.1 安装varnish_exporter #
bash
# Download the release tarball.
# NOTE(review): upstream release tags appear to carry no "v" prefix and the
# assets are named prometheus_varnish_exporter-<version>.linux-amd64.tar.gz;
# confirm the exact URL on the project's Releases page.
wget https://github.com/jonnenauha/prometheus_varnish_exporter/releases/download/1.6/prometheus_varnish_exporter-1.6.linux-amd64.tar.gz
# Unpack; the archive extracts into a directory named after the tarball.
tar xzf prometheus_varnish_exporter-1.6.linux-amd64.tar.gz
# Install the binary as /usr/local/bin/varnish_exporter, the path used by the
# systemd unit. install(1) copies the file and sets the executable mode in one
# step, so no separate chmod is needed.
sudo install -m 0755 \
  prometheus_varnish_exporter-1.6.linux-amd64/prometheus_varnish_exporter \
  /usr/local/bin/varnish_exporter
2.2 创建服务 #
ini
# /etc/systemd/system/varnish_exporter.service
# systemd unit that runs the Prometheus Varnish exporter on port 9131.
[Unit]
Description=Varnish Exporter for Prometheus
After=network.target

[Service]
# NOTE(review): the varnish account must be allowed to run varnishstat against
# the running instance's shared memory — confirm on the target host.
User=varnish
Group=varnish
# The exporter collects its data by running varnishstat only.
# NOTE(review): the exporter does not appear to define a -varnishlog-path flag,
# and an undefined flag makes the process exit at startup, so it is not passed
# here — confirm with `varnish_exporter -h`.
ExecStart=/usr/local/bin/varnish_exporter \
    -web.listen-address=:9131 \
    -varnishstat-path=/usr/bin/varnishstat
# Bring the exporter back if it crashes (e.g. while varnishd is restarting).
Restart=on-failure
RestartSec=5

[Install]
WantedBy=multi-user.target
2.3 启动服务 #
bash
# Reload unit files so systemd sees the new service definition
sudo systemctl daemon-reload
# Start the exporter now and enable it at boot in a single step
sudo systemctl enable --now varnish_exporter
# Verify: the endpoint should return Prometheus-format metrics
curl http://localhost:9131/metrics
2.4 Prometheus配置 #
yaml
# prometheus.yml
# Scrape configuration for the Varnish exporter.
scrape_configs:
  - job_name: 'varnish'
    static_configs:
      # Address the exporter listens on (-web.listen-address=:9131)
      - targets: ['localhost:9131']
        labels:
          # Replaces the default instance label (the target address) with a
          # readable host name
          instance: 'varnish-server-1'
三、关键指标 #
3.1 缓存命中率 #
promql
# Cache hit ratio as a percentage: hits / (hits + misses) * 100
(
  rate(varnish_main_cache_hit[5m])
  /
  (rate(varnish_main_cache_hit[5m]) + rate(varnish_main_cache_miss[5m]))
) * 100
# Cache hits per second
rate(varnish_main_cache_hit[5m])
# Cache misses per second
rate(varnish_main_cache_miss[5m])
3.2 请求统计 #
promql
# Client requests per second
rate(varnish_main_client_req[5m])
# New client connections accepted per second. MAIN.sess_conn is a cumulative
# counter (sessions accepted since start), so the raw value is a lifetime
# total, not the number of connections currently open.
# NOTE(review): some exporter versions may publish MAIN.sess_* as
# varnish_main_sessions{type="conn"} — confirm the series name in /metrics.
rate(varnish_main_sess_conn[5m])
# Active connections
# NOTE(review): varnish-counters does not appear to list a MAIN.sess_active
# counter — confirm this series exists in /metrics before relying on it.
varnish_main_sess_active
3.3 后端指标 #
promql
# Successful backend connections per second (MAIN.backend_conn is a
# cumulative counter, so it is only meaningful through rate())
rate(varnish_main_backend_conn[5m])
# Failed backend connections per second
rate(varnish_main_backend_fail[5m])
# Backend connections not attempted because the backend was marked unhealthy,
# per second. MAIN.backend_unhealthy is a cumulative counter of skipped
# fetches, not the number of backends that are currently sick.
rate(varnish_main_backend_unhealthy[5m])
3.4 资源指标 #
promql
# Bytes currently allocated from cache storage "s0".
# Storage counters belong to the SMA (malloc storage) group, not MAIN.
# NOTE(review): the exporter is expected to publish them as
# varnish_sma_*{type="<storage name>"} — confirm the exact series and label
# with `curl localhost:9131/metrics | grep g_bytes`.
varnish_sma_g_bytes{type="s0"}
# Bytes still available in cache storage "s0"
varnish_sma_g_space{type="s0"}
# Objects currently held in cache
varnish_main_n_object
# Objects expired per second
rate(varnish_main_n_expired[5m])
3.5 线程指标 #
promql
# Worker threads currently running (gauge)
varnish_main_threads
# Worker threads created per second
rate(varnish_main_threads_created[5m])
# Times per second that thread creation was refused because the thread limit
# was reached. MAIN.threads_limited is a cumulative counter, so it is wrapped
# in rate() to show whether the limit is being hit right now.
rate(varnish_main_threads_limited[5m])
四、Grafana仪表板 #
4.1 安装Grafana #
bash
# Ubuntu/Debian
# The grafana package is published in Grafana's own APT repository rather than
# the stock distribution archives, so register the repository and its signing
# key before installing.
sudo apt-get install -y apt-transport-https wget gnupg
sudo mkdir -p /etc/apt/keyrings
wget -q -O - https://apt.grafana.com/gpg.key | gpg --dearmor | sudo tee /etc/apt/keyrings/grafana.gpg > /dev/null
echo "deb [signed-by=/etc/apt/keyrings/grafana.gpg] https://apt.grafana.com stable main" | sudo tee /etc/apt/sources.list.d/grafana.list
sudo apt-get update
sudo apt-get install -y grafana
# Start Grafana now and enable it at boot
sudo systemctl start grafana-server
sudo systemctl enable grafana-server
4.2 导入仪表板 #
推荐仪表板:
- Grafana ID: 9913 (Varnish Dashboard)
- Grafana ID: 13499 (Varnish Overview)
4.3 自定义仪表板 #
缓存命中率面板:
json
{
  "title": "Cache Hit Rate",
  "type": "gauge",
  "targets": [
    {
      "expr": "rate(varnish_main_cache_hit[5m]) / (rate(varnish_main_cache_hit[5m]) + rate(varnish_main_cache_miss[5m])) * 100",
      "legendFormat": "Hit Rate"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "percent",
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "red", "value": 0 },
          { "color": "yellow", "value": 70 },
          { "color": "green", "value": 90 }
        ]
      }
    }
  }
}
QPS面板:
json
{
  "title": "Requests Per Second",
  "type": "timeseries",
  "targets": [
    {
      "expr": "rate(varnish_main_client_req[5m])",
      "legendFormat": "QPS"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "reqps"
    }
  }
}
五、告警配置 #
5.1 Prometheus告警规则 #
yaml
# alerts.yml
# Prometheus alerting rules for Varnish, built on varnish_exporter metrics.
# Most MAIN.* values are cumulative counters, so they are wrapped in rate() or
# increase(); comparing a raw counter to a threshold would keep the alert
# firing forever after the first event.
groups:
  - name: varnish_alerts
    rules:
      # Low cache hit rate. With a comparison operator $value is the left-hand
      # side, i.e. the hit rate in percent.
      - alert: VarnishLowCacheHitRate
        expr: |
          rate(varnish_main_cache_hit[5m]) /
          (rate(varnish_main_cache_hit[5m]) + rate(varnish_main_cache_miss[5m])) * 100 < 70
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Varnish cache hit rate is low"
          description: "Cache hit rate is {{ $value | printf \"%.2f\" }}%"

      # Backend down. MAIN.backend_unhealthy counts fetches that were not
      # attempted because the backend was unhealthy; increase() restricts the
      # check to the last 5 minutes so the alert resolves once it recovers.
      - alert: VarnishBackendDown
        expr: increase(varnish_main_backend_unhealthy[5m]) > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Varnish backend is down"
          description: "{{ $value | printf \"%.0f\" }} backend fetch(es) were skipped in the last 5m because the backend was unhealthy"

      # High error rate. The ratio is the left operand of `and`, so $value is
      # the error percentage rather than the request rate; the right operand
      # only suppresses the alert on low traffic (<= 100 req/s).
      # The Varnish counter for synthetic responses is MAIN.s_synth.
      # NOTE(review): synthetic responses also include non-error replies
      # generated in VCL (e.g. redirects) — confirm this proxy fits your VCL.
      - alert: VarnishHighErrorRate
        expr: |
          (
            100 * rate(varnish_main_s_synth[5m]) / rate(varnish_main_client_req[5m]) > 5
          )
          and rate(varnish_main_client_req[5m]) > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Varnish high error rate"
          description: "Error rate is {{ $value | printf \"%.2f\" }}%"

      # Low cache storage: less than 100 MiB free in storage "s0".
      # NOTE(review): storage counters are expected under
      # varnish_sma_*{type="s0"} — confirm the series name in /metrics.
      - alert: VarnishMemoryLow
        expr: varnish_sma_g_space{type="s0"} < 100 * 1024 * 1024
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Varnish memory is low"
          description: "Only {{ $value | humanize1024 }}B of cache space remaining"

      # Thread pool exhausted: thread creation is being refused.
      - alert: VarnishThreadsLimited
        expr: rate(varnish_main_threads_limited[5m]) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Varnish thread creation limited"
          description: "Thread creation is being limited"

      # Backend slow / saturated. varnishstat exposes only counters and gauges,
      # with no latency histogram buckets, so histogram_quantile() cannot be
      # used. MAIN.backend_busy counts fetches refused because the backend's
      # connection limit was reached, the closest counter-based sign that a
      # backend is not keeping up.
      - alert: VarnishBackendSlow
        expr: rate(varnish_main_backend_busy[5m]) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Varnish backend response slow"
          description: "Backend connection limit is being hit {{ $value | printf \"%.2f\" }} times per second"
5.2 Alertmanager配置 #
yaml
# alertmanager.yml
# Routes Varnish alerts by severity: critical alerts go to e-mail, Slack and
# PagerDuty; warnings and everything else go to e-mail only.
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.example.com:587'
  smtp_from: 'alerts@example.com'
  smtp_auth_username: 'alerts@example.com'
  # Placeholder only — do not commit a real password to version control.
  smtp_auth_password: 'password'

route:
  group_by: ['alertname', 'severity']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  # Fallback receiver for alerts that match no child route
  receiver: 'default'
  routes:
    # `matchers` replaces the deprecated `match` syntax
    - matchers:
        - severity="critical"
      receiver: 'critical'
    - matchers:
        - severity="warning"
      receiver: 'warning'

receivers:
  - name: 'default'
    email_configs:
      - to: 'admin@example.com'

  - name: 'critical'
    email_configs:
      - to: 'admin@example.com'
    # Slack incoming webhooks need slack_configs: the generic webhook_configs
    # receiver posts Alertmanager's own JSON payload, which Slack rejects.
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/xxx'
        channel: '#alerts'
    pagerduty_configs:
      - service_key: 'xxx'

  - name: 'warning'
    email_configs:
      - to: 'admin@example.com'
六、监控脚本 #
6.1 健康检查脚本 #
bash
#!/bin/bash
# health_check.sh
#
# Nagios-style health check for a local Varnish instance.
# Exit status: 0 = OK, 1 = WARNING, 2 = CRITICAL.
# Requires: pgrep, nc, varnishadm, varnishstat, awk.

# Print the value of a single varnishstat counter (e.g. MAIN.cache_hit).
# Prints 0 when the counter cannot be read, so callers can always do integer
# arithmetic on the result.
stat_value() {
  local value
  value=$(varnishstat -1 -f "$1" | awk 'NR == 1 { print $2 }')
  if [[ "$value" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "$value"
  else
    printf '0\n'
  fi
}

# The varnishd process must exist.
if ! pgrep -x varnishd > /dev/null; then
  echo "CRITICAL: Varnish process not running"
  exit 2
fi

# The HTTP listener must accept connections.
if ! nc -z localhost 6081; then
  echo "CRITICAL: Port 6081 not responding"
  exit 2
fi

# The management interface must answer.
if ! varnishadm status > /dev/null 2>&1; then
  echo "WARNING: Management interface not responding"
  exit 1
fi

# Cache hit rate. The counters are cumulative, so this is the hit rate since
# varnishd started. It is only evaluated after more than 100 lookups so a
# freshly started cache does not raise a warning.
hits=$(stat_value MAIN.cache_hit)
misses=$(stat_value MAIN.cache_miss)
total=$(( hits + misses ))
if (( total > 100 )); then
  # Hit rate in hundredths of a percent; integer-only, so bc is not required.
  rate_x100=$(( hits * 10000 / total ))
  if (( rate_x100 < 5000 )); then
    printf 'WARNING: Cache hit rate is low: %d.%02d%%\n' \
      "$(( rate_x100 / 100 ))" "$(( rate_x100 % 100 ))"
    exit 1
  fi
fi

# Backend status. Match case-insensitively: depending on the Varnish version
# backend.list prints either "Sick" or "sick".
unhealthy=$(varnishadm backend.list | grep -ci 'sick')
if (( unhealthy > 0 )); then
  echo "WARNING: $unhealthy backend(s) are unhealthy"
  exit 1
fi

echo "OK: Varnish is healthy"
exit 0
6.2 性能监控脚本 #
bash
#!/bin/bash
# performance_monitor.sh
#
# Prints a one-shot snapshot of Varnish counters: cache hit rate, request and
# connection totals, backend status, storage usage and thread status.
# Requires: varnishstat, varnishadm, awk.

# Print the value column of a single varnishstat counter; prints nothing when
# the counter cannot be read.
stat_value() {
  varnishstat -1 -f "$1" | awk 'NR == 1 { print $2 }'
}

echo "=== Varnish Performance Monitor ==="
echo "Time: $(date)"
echo ""

# Cache hit rate since varnishd started (the counters are cumulative).
hits=$(stat_value MAIN.cache_hit)
misses=$(stat_value MAIN.cache_miss)
total=$(( hits + misses ))
if (( total > 0 )); then
  # Hundredths of a percent; integer-only, so bc is not required.
  rate_x100=$(( hits * 10000 / total ))
  printf 'Cache Hit Rate: %d.%02d%%\n' \
    "$(( rate_x100 / 100 ))" "$(( rate_x100 % 100 ))"
fi

# Total client requests: a cumulative count, not a per-second rate.
total_requests=$(stat_value MAIN.client_req)
echo "Total Requests: $total_requests"

# Total client connections accepted since start.
total_connections=$(stat_value MAIN.sess_conn)
echo "Total Connections: $total_connections"

# Backend status
echo ""
echo "Backend Status:"
varnishadm backend.list

# Storage usage. Storage counters live in the SMA group (malloc storage), not
# in MAIN. Assumes the default storage name "s0" — adjust if varnishd was
# started with a named -s storage.
echo ""
echo "Memory Usage:"
varnishstat -1 -f SMA.s0.g_bytes -f SMA.s0.g_space

# Thread status
echo ""
echo "Thread Status:"
varnishstat -1 -f MAIN.threads -f MAIN.threads_limited
七、日志监控 #
7.1 错误日志监控 #
bash
#!/bin/bash
# error_monitor.sh
#
# Scans the Varnish shared-memory log that is already buffered (-d) and warns
# about 5xx client responses and sick backends.
# Requires: varnishncsa, varnishlog.

# 5xx responses. varnishncsa prints exactly one line per client request, so
# wc -l yields a request count. varnishlog prints many lines per transaction,
# which would inflate the number far past the threshold.
errors=$(varnishncsa -d -q 'RespStatus >= 500' | wc -l)
if (( errors > 100 )); then
  echo "WARNING: High number of 5xx errors: $errors"
fi

# Backend health probes. The log tag is Backend_health, and its records are
# not part of any request transaction, so they are only visible with raw
# grouping (-g raw); -i selects just that tag.
backend_errors=$(varnishlog -d -g raw -i Backend_health | grep -ci 'sick')
if (( backend_errors > 0 )); then
  echo "WARNING: Backend health issues detected"
fi
7.2 慢请求监控 #
bash
#!/bin/bash
# slow_request_monitor.sh
#
# Warns when the buffered Varnish log (-d) holds more than 10 client requests
# whose Process timestamp is over 1.0 second since the start of the request.
# Requires: varnishncsa.

# varnishncsa prints one line per matching request, so wc -l is a request
# count; counting varnishlog output lines would overstate it many times over.
slow=$(varnishncsa -d -q 'Timestamp:Process[2] > 1.0' | wc -l)
if (( slow > 10 )); then
  echo "WARNING: High number of slow requests: $slow"
fi
八、总结 #
本章我们学习了:
- 监控概述:指标类型、监控架构
- Prometheus集成:安装、配置
- 关键指标:缓存命中率、请求统计、后端指标、资源指标、线程指标
- Grafana仪表板:安装、导入、自定义
- 告警配置:Prometheus规则、Alertmanager
- 监控脚本:健康检查、性能监控
- 日志监控:错误监控、慢请求监控
掌握监控告警后,让我们进入下一章,学习故障排查!
最后更新:2026-03-28