Prometheus快速入门 #
一、环境准备 #
1.1 实验环境 #
text
实验环境架构:
┌─────────────────────────────────────────────┐
│ 监控系统组件 │
├─────────────────────────────────────────────┤
│ │
│ Prometheus Server (9090) │
│ │ │
│ ├── Node Exporter (9100) │
│ │ │
│ └── Alertmanager (9093) │
│ │
└─────────────────────────────────────────────┘
我们将搭建:
• Prometheus Server:核心服务器
• Node Exporter:主机监控
• Alertmanager:告警管理
1.2 创建工作目录 #
bash
# 创建工作目录
mkdir -p ~/prometheus-lab/{prometheus/alerts,alertmanager,grafana}
cd ~/prometheus-lab
# 查看目录结构
tree .
二、启动Prometheus #
2.1 创建配置文件 #
yaml
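# prometheus/prometheus.yml
# 注意:Prometheus运行在容器中时,localhost指向Prometheus容器自身。
# 使用第六节的Docker Compose部署时,请把下面的 localhost:9100、localhost:9093
# 分别改为 node-exporter:9100、alertmanager:9093(同一网络内通过服务名互访);
# 使用docker run单独启动时,可为各容器加上 --network host(仅Linux)以继续使用localhost。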
global:
scrape_interval: 15s
evaluation_interval: 15s
external_labels:
monitor: 'local-monitor'
alerting:
alertmanagers:
- static_configs:
- targets:
- 'localhost:9093'
rule_files:
- '/etc/prometheus/alerts/*.yml'
scrape_configs:
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
labels:
env: 'local'
- job_name: 'node-exporter'
static_configs:
- targets: ['localhost:9100']
labels:
env: 'local'
2.2 创建告警规则 #
yaml
# prometheus/alerts/rules.yml
groups:
- name: node_alerts
rules:
- alert: InstanceDown
expr: up == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."
- alert: HighMemoryUsage
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80
for: 5m
labels:
severity: warning
annotations:
summary: "High memory usage on {{ $labels.instance }}"
description: "Memory usage is {{ $value | printf \"%.2f\" }}%"
- alert: HighCPUUsage
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
for: 5m
labels:
severity: warning
annotations:
summary: "High CPU usage on {{ $labels.instance }}"
description: "CPU usage is {{ $value | printf \"%.2f\" }}%"
- alert: DiskSpaceLow
expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 20
for: 5m
labels:
severity: warning
annotations:
summary: "Low disk space on {{ $labels.instance }}"
description: "Disk {{ $labels.mountpoint }} has only {{ $value | printf \"%.2f\" }}% free space"
2.3 启动Prometheus #
bash
# 使用Docker启动Prometheus
docker run -d \
--name prometheus \
-p 9090:9090 \
-v "$(pwd)/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml" \
-v "$(pwd)/prometheus/alerts:/etc/prometheus/alerts" \
prom/prometheus:v2.48.0 \
--config.file=/etc/prometheus/prometheus.yml \
--storage.tsdb.retention.time=7d \
--web.enable-lifecycle
# 查看日志
docker logs -f prometheus
# 访问Web界面
# http://localhost:9090
三、启动Node Exporter #
3.1 启动Node Exporter #
bash
# 使用Docker启动Node Exporter
docker run -d \
--name node-exporter \
-p 9100:9100 \
-v /proc:/host/proc:ro \
-v /sys:/host/sys:ro \
-v /:/rootfs:ro \
prom/node-exporter:v1.7.0 \
--path.procfs=/host/proc \
--path.sysfs=/host/sys \
--path.rootfs=/rootfs
# 查看指标
curl http://localhost:9100/metrics
# 查看日志
docker logs -f node-exporter
3.2 验证采集 #
bash
# 检查Prometheus目标状态
curl http://localhost:9090/api/v1/targets
# 查询node exporter指标
curl 'http://localhost:9090/api/v1/query?query=up'
# 查询CPU时间原始指标(计数器,使用率需配合rate/irate计算)
curl 'http://localhost:9090/api/v1/query?query=node_cpu_seconds_total'
四、启动Alertmanager #
4.1 创建Alertmanager配置 #
yaml
# alertmanager/alertmanager.yml
global:
resolve_timeout: 5m
route:
group_by: ['alertname', 'severity']
group_wait: 30s
group_interval: 5m
repeat_interval: 1h
receiver: 'default-receiver'
routes:
- match:
severity: critical
receiver: 'critical-receiver'
- match:
severity: warning
receiver: 'warning-receiver'
receivers:
- name: 'default-receiver'
webhook_configs:
- url: 'http://localhost:5001/webhook'
send_resolved: true
- name: 'critical-receiver'
webhook_configs:
- url: 'http://localhost:5001/webhook/critical'
send_resolved: true
- name: 'warning-receiver'
webhook_configs:
- url: 'http://localhost:5001/webhook/warning'
send_resolved: true
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'instance']
4.2 启动Alertmanager #
bash
# 使用Docker启动Alertmanager
docker run -d \
--name alertmanager \
-p 9093:9093 \
-v "$(pwd)/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml" \
prom/alertmanager:v0.26.0 \
--config.file=/etc/alertmanager/alertmanager.yml \
--storage.path=/alertmanager
# 访问Web界面
# http://localhost:9093
五、常用查询示例 #
5.1 基础查询 #
promql
# 查看所有实例状态
up
# 查看特定job的实例状态
up{job="node-exporter"}
# 查看CPU时间
node_cpu_seconds_total
# 按模式过滤CPU时间
node_cpu_seconds_total{mode="idle"}
# 查看内存信息
node_memory_MemTotal_bytes
node_memory_MemAvailable_bytes
node_memory_MemFree_bytes
5.2 计算查询 #
promql
# 计算CPU使用率(百分比)
100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
# 计算内存使用率(百分比)
(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
# 计算磁盘使用率(百分比)
(1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100
# 计算网络接收速率(MB/s)
irate(node_network_receive_bytes_total{device!~"lo|veth.*"}[5m]) / 1024 / 1024
# 计算网络发送速率(MB/s)
irate(node_network_transmit_bytes_total{device!~"lo|veth.*"}[5m]) / 1024 / 1024
5.3 聚合查询 #
promql
# 按实例分组计算CPU使用率(先按核心汇总各非idle模式,再对核心取平均)
avg by(instance) (sum by(instance, cpu) (irate(node_cpu_seconds_total{mode!="idle"}[5m]))) * 100
# 按CPU核心分组计算使用率(各非idle模式需求和,而不是取平均)
sum by(instance, cpu) (irate(node_cpu_seconds_total{mode!="idle"}[5m])) * 100
# 计算总内存使用量
sum(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
# 按挂载点计算磁盘使用量
sum by(mountpoint) (node_filesystem_size_bytes - node_filesystem_avail_bytes)
5.4 范围查询 #
promql
# 过去5分钟内CPU空闲时间的平均每秒增长率(即空闲占比,使用率 = 1 - 该值)
rate(node_cpu_seconds_total{mode="idle"}[5m])
# 过去1小时的可用内存原始样本(范围向量,只能在Table视图查看,不能直接绘图)
node_memory_MemAvailable_bytes[1h]
# 过去30分钟的磁盘IO
rate(node_disk_io_time_seconds_total[30m])
六、使用Docker Compose #
6.1 创建docker-compose.yml #
yaml
# docker-compose.yml
version: '3.8'
services:
prometheus:
image: prom/prometheus:v2.48.0
container_name: prometheus
restart: unless-stopped
ports:
- "9090:9090"
volumes:
- ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
- ./prometheus/alerts:/etc/prometheus/alerts
- prometheus_data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--storage.tsdb.retention.time=7d'
- '--web.enable-lifecycle'
networks:
- monitoring
node-exporter:
image: prom/node-exporter:v1.7.0
container_name: node-exporter
restart: unless-stopped
ports:
- "9100:9100"
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
command:
- '--path.procfs=/host/proc'
- '--path.sysfs=/host/sys'
- '--path.rootfs=/rootfs'
networks:
- monitoring
alertmanager:
image: prom/alertmanager:v0.26.0
container_name: alertmanager
restart: unless-stopped
ports:
- "9093:9093"
volumes:
- ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
- alertmanager_data:/alertmanager
networks:
- monitoring
grafana:
image: grafana/grafana:10.2.0
container_name: grafana
restart: unless-stopped
ports:
- "3000:3000"
volumes:
- grafana_data:/var/lib/grafana
environment:
- GF_SECURITY_ADMIN_USER=admin
- GF_SECURITY_ADMIN_PASSWORD=admin
networks:
- monitoring
networks:
monitoring:
driver: bridge
volumes:
prometheus_data:
alertmanager_data:
grafana_data:
6.2 启动所有服务 #
bash
# 启动所有服务(若前面已用docker run启动过同名容器,需先执行:
# docker rm -f prometheus node-exporter alertmanager,否则容器名和端口会冲突)
docker-compose up -d
# 查看服务状态
docker-compose ps
# 查看日志
docker-compose logs -f
# 停止所有服务
docker-compose down
# 停止并删除数据卷
docker-compose down -v
七、验证监控系统 #
7.1 检查服务状态 #
bash
# 检查所有容器状态
docker ps
# 检查Prometheus
curl http://localhost:9090/-/healthy
curl http://localhost:9090/-/ready
# 检查Node Exporter
curl http://localhost:9100/metrics | head -20
# 检查Alertmanager
curl http://localhost:9093/-/healthy
7.2 Web界面验证 #
text
访问各服务Web界面:
Prometheus
├── http://localhost:9090
├── Status → Targets:查看采集目标
├── Status → Configuration:查看配置
└── Alerts:查看告警规则
Alertmanager
├── http://localhost:9093
└── 查看告警状态
Grafana
├── http://localhost:3000
├── 用户名:admin
├── 密码:admin
└── 添加Prometheus数据源
7.3 测试告警 #
bash
# 停止node-exporter触发告警
docker stop node-exporter
# 等待约1~2分钟后查看告警(for: 1m 之外还需加上采集间隔和规则评估间隔)
curl http://localhost:9090/api/v1/alerts
# 查看Alertmanager中的告警(v1 API已废弃并在0.27中移除,请使用v2)
curl http://localhost:9093/api/v2/alerts
# 恢复node-exporter
docker start node-exporter
八、常用操作 #
8.1 热加载配置 #
bash
# 修改配置文件后热加载
curl -X POST http://localhost:9090/-/reload
# 查看当前配置
curl http://localhost:9090/api/v1/status/config
8.2 查询API #
bash
# 即时查询
curl 'http://localhost:9090/api/v1/query?query=up'
# 范围查询
curl 'http://localhost:9090/api/v1/query_range?query=up&start=1700000000&end=1700003600&step=15s'
# 查询标签
curl 'http://localhost:9090/api/v1/labels'
# 查询标签值
curl 'http://localhost:9090/api/v1/label/job/values'
# 查询元数据
curl 'http://localhost:9090/api/v1/metadata'
8.3 管理API #
bash
# 注意:本节所有管理API都需要在启动Prometheus时加上 --web.enable-admin-api(前文的启动命令未启用该参数)
# 删除时间序列
curl -X POST -g 'http://localhost:9090/api/v1/admin/tsdb/delete_series?match[]={job="node-exporter"}'
# 清理数据
curl -X POST 'http://localhost:9090/api/v1/admin/tsdb/clean_tombstones'
# 快照
curl -X POST 'http://localhost:9090/api/v1/admin/tsdb/snapshot'
九、监控仪表盘 #
9.1 配置Grafana #
text
Grafana配置步骤:
1. 访问 http://localhost:3000
2. 登录(admin/admin)
3. 添加数据源
├── 选择 Prometheus
├── URL: http://prometheus:9090
└── Save & Test
4. 导入仪表盘
├── + → Import
├── 输入仪表盘ID
│ • Node Exporter Full: 1860
│ • Prometheus 2.0 Overview: 3662
└── 选择数据源并导入
9.2 创建简单仪表盘 #
text
创建仪表盘步骤:
1. 创建新仪表盘
+ → Dashboard → Add new panel
2. 添加CPU使用率面板
Query: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
Legend: {{ instance }}
Unit: Percent (0-100)
3. 添加内存使用率面板
Query: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
Legend: {{ instance }}
Unit: Percent (0-100)
4. 添加磁盘使用率面板
Query: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100
Legend: {{ instance }} - {{ mountpoint }}
Unit: Percent (0-100)
5. 保存仪表盘
十、总结 #
快速入门要点:
| 步骤 | 内容 |
|---|---|
| 1 | 启动Prometheus Server |
| 2 | 启动Node Exporter |
| 3 | 配置采集目标 |
| 4 | 编写告警规则 |
| 5 | 启动Alertmanager |
| 6 | 配置Grafana可视化 |
常用查询:
| 场景 | PromQL |
|---|---|
| CPU使用率 | 100 - avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100 |
| 内存使用率 | (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 |
| 磁盘使用率 | (1 - node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 |
下一步,让我们学习Prometheus的数据模型!
最后更新:2026-03-27