Skip to content

1. process_exporter

1.1 介绍

process_exporter 是一个 Prometheus exporter,可按配置的规则对系统进程进行分组并采集其运行指标,从而实现对系统进程的监控。

1.2 部署

1.容器方式

bash
  docker run -d --rm -p 9256:9256 --privileged -v /proc:/host/proc -v `pwd`:/config ncabatoff/process-exporter --procfs /host/proc -config.path /config/filename.yml
  docker run -d --rm -p 9256:9256 --privileged -v /proc:/host/proc -v `pwd`:/config ncabatoff/process-exporter --procfs /host/proc -config.path /config/filename.yml

2.二进制方式

  • 下载
bash
wget https://github.com/ncabatoff/process-exporter/releases/download/v0.8.4/process-exporter-0.8.4.linux-amd64.tar.gz
wget https://github.com/ncabatoff/process-exporter/releases/download/v0.8.4/process-exporter-0.8.4.linux-amd64.tar.gz

1.3 配置文件

bash
cat > /data/apps/process-exporter/process_name.yaml <<EOF

process_names:
#监控所有进程
#  - name: "{{.Comm}}"
#    cmdline:
#    - '.+'

  - name: "{{.Matches}}"
    cmdline:
    - 'nginx'

  - name: "{{.Matches}}"
    cmdline:
    - '/opt/atlassian/confluence/bin/tomcat-juli.jar'

  - name: "{{.Matches}}"
    cmdline:
    - 'vsftpd'

  - name: "{{.Matches}}"
    cmdline:
    - 'redis-server'
EOF
cat > /data/apps/process-exporter/process_name.yaml <<EOF

process_names:
#监控所有进程
#  - name: "{{.Comm}}"
#    cmdline:
#    - '.+'

  - name: "{{.Matches}}"
    cmdline:
    - 'nginx'

  - name: "{{.Matches}}"
    cmdline:
    - '/opt/atlassian/confluence/bin/tomcat-juli.jar'

  - name: "{{.Matches}}"
    cmdline:
    - 'vsftpd'

  - name: "{{.Matches}}"
    cmdline:
    - 'redis-server'
EOF

💡 说明

cmdline: 用于匹配目标进程命令行的正则表达式,应能唯一标识所选进程,可通过 ps -ef 查询进程的命令行。如果该进程不存在,则不会采集到该进程的数据。

例如:> ps -ef | grep redis

redis 4287 4127 0 Oct31 ? 00:58:12 redis-server *:6379

image-20241218160404645

1.4 配置systemd

bash
cat > /usr/lib/systemd/system/process_exporter.service <<'EOF'
 
[Unit]
Description=Prometheus exporter for processors metrics, written in Go with pluggable metric collectors.
Documentation=https://github.com/ncabatoff/process-exporter
After=network.target
 
[Service]
Type=simple
User=prometheus
WorkingDirectory=/data/apps/process-exporter
ExecStart=/data/apps/process-exporter/process-exporter -config.path=/data/apps/process-exporter/process_name.yaml
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
Restart=on-failure
 
[Install]
WantedBy=multi-user.target
EOF
cat > /usr/lib/systemd/system/process_exporter.service <<'EOF'
 
[Unit]
Description=Prometheus exporter for processors metrics, written in Go with pluggable metric collectors.
Documentation=https://github.com/ncabatoff/process-exporter
After=network.target
 
[Service]
Type=simple
User=prometheus
WorkingDirectory=/data/apps/process-exporter
ExecStart=/data/apps/process-exporter/process-exporter -config.path=/data/apps/process-exporter/process_name.yaml
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
Restart=on-failure
 
[Install]
WantedBy=multi-user.target
EOF
  • 启动
bash
systemctl daemon-reload

systemctl enable --now process_exporter
systemctl daemon-reload

systemctl enable --now process_exporter
  • 验证数据
bash
curl http://localhost:9256/metrics
curl http://localhost:9256/metrics

1.5 添加prometheus

yaml
- job_name: 'process_exporter'
  scrape_interval: 10s
  honor_labels: true
  metrics_path: '/metrics'
  static_configs:
  - targets: ['192.168.10.73:9256']
- job_name: 'process_exporter'
  scrape_interval: 10s
  honor_labels: true
  metrics_path: '/metrics'
  static_configs:
  - targets: ['192.168.10.73:9256']
  • 重新加载配置(热加载,需要 Prometheus 启动时开启 --web.enable-lifecycle 参数)
curl -X POST http://127.0.0.1:9090/-/reload
curl -X POST http://127.0.0.1:9090/-/reload

1.6 报警规则

yaml
alert: 进程告警
expr: sum by(cluster, job, instance, groupname) (namedprocess_namegroup_states{state="Zombie"}) > 0
for: 1m
labels:
  severity: warning
annotations:
  value: 当前产生 {{ $value }} 个僵尸进程

alert: 进程重启告警
expr: ceil(time() - max by(cluster, job, instance, groupname) (namedprocess_namegroup_oldest_start_time_seconds)) < 60
for: 25s
labels:
  label: alert_once
  severity: warning
annotations:
  value: 进程 {{ $labels.groupname }} 在 {{ $value }} 秒前发生重启

alert: 进程退出告警
expr: up{export="process_exporter"} == 0 or max by(cluster, job, instance, groupname) (delta(namedprocess_namegroup_oldest_start_time_seconds{groupname=~"^map.*"}[10d])) < 0
for: 55s
labels:
  severity: warning
annotations:
  value: 进程 {{ $labels.export}} 已退出
alert: 进程告警
expr: sum by(cluster, job, instance, groupname) (namedprocess_namegroup_states{state="Zombie"}) > 0
for: 1m
labels:
  severity: warning
annotations:
  value: 当前产生 {{ $value }} 个僵尸进程

alert: 进程重启告警
expr: ceil(time() - max by(cluster, job, instance, groupname) (namedprocess_namegroup_oldest_start_time_seconds)) < 60
for: 25s
labels:
  label: alert_once
  severity: warning
annotations:
  value: 进程 {{ $labels.groupname }} 在 {{ $value }} 秒前发生重启

alert: 进程退出告警
expr: up{export="process_exporter"} == 0 or max by(cluster, job, instance, groupname) (delta(namedprocess_namegroup_oldest_start_time_seconds{groupname=~"^map.*"}[10d])) < 0
for: 55s
labels:
  severity: warning
annotations:
  value: 进程 {{ $labels.export}} 已退出
  • 检查语法
/usr/local/prometheus/promtool  check rules  rules.yml
/usr/local/prometheus/promtool  check rules  rules.yml

1.7 grafana

https://grafana.com/grafana/dashboards/8378-system-processes-metrics/