容器化部署 prometheus/grafana/alertmanager/node_exporter/cadvisor

目录及结构

.
├── alertmanager
│   └── alertmanager.yml
├── docker-compose.yml
├── grafana
│   ├── dashboards
│   └── provisioning
│   └── datasources
│   └── prometheus.yml
├── prometheus
│   ├── prometheus.yml
│   └── rules
│   └── alert.yml

Alertmanager 告警配置

global:
smtp_smarthost: 'localhost:587'
smtp_from: '[email protected]'

route:
group_by: ['alertname']
group_wait: 10s
group_interval: 10s
repeat_interval: 1h
receiver: 'web.hook'

receivers:
- name: 'web.hook'
webhook_configs:
- url: 'http://127.0.0.1:5001/'

inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'dev', 'instance']

Prometheus 配置文件

global:
scrape_interval: 15s
evaluation_interval: 15s

rule_files:
- "rules/*.yml"

alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093

scrape_configs:
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']

- job_name: 'node-exporter'
static_configs:
- targets: ['node-exporter:9100']

- job_name: 'grafana'
static_configs:
- targets: ['grafana:3000']

- job_name: 'alertmanager'
static_configs:
- targets: ['alertmanager:9093']

- job_name: 'cadvisor'
static_configs:
- targets: ['cadvisor:8080']

告警规则 alert.yml

groups:
- name: node_alerts
rules:
- alert: InstanceDown
expr: up == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."

- alert: HighCpuUsage
expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
for: 2m
labels:
severity: warning
annotations:
summary: "High CPU usage on {{ $labels.instance }}"
description: "CPU usage is above 80% for more than 2 minutes on {{ $labels.instance }}"

docker-compose.yml

version: '3.8'

services:
prometheus:
image: prom/prometheus:latest
container_name: prometheus
restart: unless-stopped
ports:
- "9090:9090"
volumes:
- ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
- ./prometheus/rules:/etc/prometheus/rules
- prometheus_data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.console.libraries=/etc/prometheus/console_libraries'
- '--web.console.templates=/etc/prometheus/consoles'
- '--storage.tsdb.retention.time=720h'
- '--storage.tsdb.min-block-duration=2h'
- '--storage.tsdb.max-block-duration=2h'
- '--web.enable-lifecycle'
networks:
- monitoring

alertmanager:
image: prom/alertmanager:latest
container_name: alertmanager
restart: unless-stopped
ports:
- "9093:9093"
volumes:
- ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
- alertmanager_data:/alertmanager
command:
- '--config.file=/etc/alertmanager/alertmanager.yml'
- '--storage.path=/alertmanager'
- '--web.external-url=http://localhost:9093'
networks:
- monitoring

grafana:
image: grafana/grafana:latest
container_name: grafana
restart: unless-stopped
ports:
- "3000:3000"
volumes:
- grafana_data:/var/lib/grafana
- ./grafana/provisioning:/etc/grafana/provisioning
- ./grafana/dashboards:/var/lib/grafana/dashboards
environment:
- GF_SECURITY_ADMIN_USER=admin
- GF_SECURITY_ADMIN_PASSWORD=admin123
- GF_USERS_ALLOW_SIGN_UP=false
- GF_INSTALL_PLUGINS=grafana-piechart-panel
networks:
- monitoring

node-exporter:
image: prom/node-exporter:latest
container_name: node-exporter
restart: unless-stopped
ports:
- "9100:9100"
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
command:
- '--path.procfs=/host/proc'
- '--path.rootfs=/rootfs'
- '--path.sysfs=/host/sys'
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
networks:
- monitoring

cadvisor:
image: gcr.io/cadvisor/cadvisor:latest
container_name: cadvisor
restart: unless-stopped
ports:
- "8080:8080"
volumes:
- /:/rootfs:ro
- /var/run:/var/run:rw
- /sys:/sys:ro
- /var/lib/docker/:/var/lib/docker:ro
- /dev/disk/:/dev/disk:ro
privileged: true
devices:
- /dev/kmsg
networks:
- monitoring


networks:
monitoring:
driver: bridge

volumes:
prometheus_data:
grafana_data:
alertmanager_data: