1. process_exporter
1.1 介绍
process-exporter 是一个 Prometheus exporter,通过读取 /proc 采集指定进程(按配置分组)的运行指标,用于监控系统进程
1.2 部署
1.容器方式
bash
docker run -d --rm -p 9256:9256 --privileged -v /proc:/host/proc -v `pwd`:/config ncabatoff/process-exporter --procfs /host/proc -config.path /config/filename.yml
docker run -d --rm -p 9256:9256 --privileged -v /proc:/host/proc -v `pwd`:/config ncabatoff/process-exporter --procfs /host/proc -config.path /config/filename.yml
2.二进制方式
- 下载
bash
wget https://github.com/ncabatoff/process-exporter/releases/download/v0.8.4/process-exporter-0.8.4.linux-amd64.tar.gz
wget https://github.com/ncabatoff/process-exporter/releases/download/v0.8.4/process-exporter-0.8.4.linux-amd64.tar.gz
1.3 配置文件
bash
cat > /data/apps/process-exporter/process_name.yaml <<EOF
process_names:
#监控所有进程
# - name: "{{.Comm}}"
# cmdline:
# - '.+'
- name: "{{.Matches}}"
cmdline:
- 'nginx'
- name: "{{.Matches}}"
cmdline:
- '/opt/atlassian/confluence/bin/tomcat-juli.jar'
- name: "{{.Matches}}"
cmdline:
- 'vsftpd'
- name: "{{.Matches}}"
cmdline:
- 'redis-server'
EOF
cat > /data/apps/process-exporter/process_name.yaml <<EOF
process_names:
#监控所有进程
# - name: "{{.Comm}}"
# cmdline:
# - '.+'
- name: "{{.Matches}}"
cmdline:
- 'nginx'
- name: "{{.Matches}}"
cmdline:
- '/opt/atlassian/confluence/bin/tomcat-juli.jar'
- name: "{{.Matches}}"
cmdline:
- 'vsftpd'
- name: "{{.Matches}}"
cmdline:
- 'redis-server'
EOF
💡 说明
cmdline: 所选进程的唯一标识(用于匹配进程命令行的正则表达式),可通过 ps -ef 查询到。如果该进程不存在,则不会采集到该进程的数据
例如:> ps -ef | grep redis
redis 4287 4127 0 Oct31 ? 00:58:12 redis-server *:6379
1.4 配置systemd
bash
cat > /usr/lib/systemd/system/process_exporter.service <<'EOF'
[Unit]
Description=Prometheus exporter for processors metrics, written in Go with pluggable metric collectors.
Documentation=https://github.com/ncabatoff/process-exporter
After=network.target
[Service]
Type=simple
User=prometheus
WorkingDirectory=/data/apps/process-exporter
ExecStart=/data/apps/process-exporter/process-exporter -config.path=/data/apps/process-exporter/process_name.yaml
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF
cat > /usr/lib/systemd/system/process_exporter.service <<'EOF'
[Unit]
Description=Prometheus exporter for processors metrics, written in Go with pluggable metric collectors.
Documentation=https://github.com/ncabatoff/process-exporter
After=network.target
[Service]
Type=simple
User=prometheus
WorkingDirectory=/data/apps/process-exporter
ExecStart=/data/apps/process-exporter/process-exporter -config.path=/data/apps/process-exporter/process_name.yaml
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF
- 启动
bash
systemctl daemon-reload
systemctl enable --now process_exporter
systemctl daemon-reload
systemctl enable --now process_exporter
- 验证数据
bash
curl http://localhost:9256/metrics
curl http://localhost:9256/metrics
1.5 添加prometheus
yaml
- job_name: 'process_exporter'
scrape_interval: 10s
honor_labels: true
metrics_path: '/metrics'
static_configs:
- targets: ['192.168.10.73:9256']
- job_name: 'process_exporter'
scrape_interval: 10s
honor_labels: true
metrics_path: '/metrics'
static_configs:
- targets: ['192.168.10.73:9256']
- 重新加载配置(热加载,无需重启;需要 Prometheus 启动时指定 --web.enable-lifecycle 参数)
curl -X POST http://127.0.0.1:9090/-/reload
curl -X POST http://127.0.0.1:9090/-/reload
1.6 报警规则
yaml
alert: 进程告警
expr: sum by(cluster, job, instance, groupname) (namedprocess_namegroup_states{state="Zombie"}) > 0
for: 1m
labels:
severity: warning
annotations:
value: 当前产生 {{ $value }} 个僵尸进程
alert: 进程重启告警
expr: ceil(time() - max by(cluster, job, instance, groupname) (namedprocess_namegroup_oldest_start_time_seconds)) < 60
for: 25s
labels:
label: alert_once
severity: warning
annotations:
value: 进程 {{ $labels.groupname }} 在 {{ $value }} 秒前发生重启
alert: 进程退出告警
expr: up{export="process_exporter"} == 0 or max by(cluster, job, instance, groupname) (delta(namedprocess_namegroup_oldest_start_time_seconds{groupname=~"^map.*"}[10d])) < 0
for: 55s
labels:
severity: warning
annotations:
value: 进程 {{ $labels.export}} 已退出
alert: 进程告警
expr: sum by(cluster, job, instance, groupname) (namedprocess_namegroup_states{state="Zombie"}) > 0
for: 1m
labels:
severity: warning
annotations:
value: 当前产生 {{ $value }} 个僵尸进程
alert: 进程重启告警
expr: ceil(time() - max by(cluster, job, instance, groupname) (namedprocess_namegroup_oldest_start_time_seconds)) < 60
for: 25s
labels:
label: alert_once
severity: warning
annotations:
value: 进程 {{ $labels.groupname }} 在 {{ $value }} 秒前发生重启
alert: 进程退出告警
expr: up{export="process_exporter"} == 0 or max by(cluster, job, instance, groupname) (delta(namedprocess_namegroup_oldest_start_time_seconds{groupname=~"^map.*"}[10d])) < 0
for: 55s
labels:
severity: warning
annotations:
value: 进程 {{ $labels.export}} 已退出
- 检查语法
/usr/local/prometheus/promtool check rules rules.yml
/usr/local/prometheus/promtool check rules rules.yml
1.7 grafana
https://grafana.com/grafana/dashboards/8378-system-processes-metrics/