Skip to content

1. 容器方式

bash
docker run -d --restart=always -p 9100:9100 prom/node-exporter
docker run -d --restart=always -p 9100:9100 prom/node-exporter

1.1 docker-compose

yaml
version: '3'
services:
  node-exporter:
    image: prom/node-exporter
    container_name: node-exporter
    restart: always
    ports:
      - "9100:9100"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
      - /var/run/dbus/system_bus_socket:/var/run/dbus/system_bus_socket:ro
    command:
      # Point the collectors at the host mounts declared above. Without
      # these --path flags node_exporter reads the container's own /proc
      # and /sys, and the ro volume mounts have no effect.
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--path.rootfs=/rootfs'
      - '--collector.processes'
      - '--no-collector.hwmon'
      - '--no-collector.dmi'
      - '--no-collector.arp'
      - '--no-collector.infiniband'
      - '--collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/containerd/.+|/var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)'
      - '--collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$'
      - '--collector.systemd'
version: '3'
services:
  node-exporter:
    image: prom/node-exporter
    container_name: node-exporter
    restart: always
    ports:
      - "9100:9100"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
      - /var/run/dbus/system_bus_socket:/var/run/dbus/system_bus_socket:ro
    command:
      # Point the collectors at the host mounts declared above. Without
      # these --path flags node_exporter reads the container's own /proc
      # and /sys, and the ro volume mounts have no effect.
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--path.rootfs=/rootfs'
      - '--collector.processes'
      - '--no-collector.hwmon'
      - '--no-collector.dmi'
      - '--no-collector.arp'
      - '--no-collector.infiniband'
      - '--collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/containerd/.+|/var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)'
      - '--collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$'
      - '--collector.systemd'

2. 二进制方式

2.1 下载

官网

https://github.com/prometheus/node_exporter/releases

bash
wget https://github.com/prometheus/node_exporter/releases/download/v1.8.2/node_exporter-1.8.2.linux-amd64.tar.gz

#或者
curl -LO https://github.com/prometheus/node_exporter/releases/download/v1.8.2/node_exporter-1.8.2.linux-amd64.tar.gz
wget https://github.com/prometheus/node_exporter/releases/download/v1.8.2/node_exporter-1.8.2.linux-amd64.tar.gz

#或者
curl -LO https://github.com/prometheus/node_exporter/releases/download/v1.8.2/node_exporter-1.8.2.linux-amd64.tar.gz

2.2 部署

  • 创建目录
bash
mkdir /opt/node_exporter
mkdir /opt/node_exporter
  • 创建用户
useradd -rs /bin/false node_exporter
useradd -rs /bin/false node_exporter
  • 解压
bash
tar xvfz node_exporter-*.*-amd64.tar.gz

cd node_exporter-*.*-amd64

mv node_exporter /opt/node_exporter

chown -R node_exporter. /opt/node_exporter
tar xvfz node_exporter-*.*-amd64.tar.gz

cd node_exporter-*.*-amd64

mv node_exporter /opt/node_exporter

chown -R node_exporter. /opt/node_exporter
  • 创建systemd
yaml
# Write the systemd unit. The here-doc delimiter MUST be quoted ('EOF'):
# with an unquoted EOF the shell expands $MAINPID while writing the file,
# leaving "ExecReload=/bin/kill -HUP " (empty) in the generated unit.
cat > /etc/systemd/system/node_exporter.service <<'EOF'
[Unit]
Description=Node Exporter
Documentation=https://prometheus.io/docs/guides/node-exporter/
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
User=node_exporter
Group=node_exporter
ExecStart=/opt/node_exporter/node_exporter \
		--web.listen-address=:9100 \
		--collector.processes \
		--no-collector.hwmon \
		--no-collector.dmi \
		--no-collector.arp \
		--collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/containerd/.+|/var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/) \
		--collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$

Restart=on-failure
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process

[Install]
WantedBy=multi-user.target
EOF
# Write the systemd unit. The here-doc delimiter MUST be quoted ('EOF'):
# with an unquoted EOF the shell expands $MAINPID while writing the file,
# leaving "ExecReload=/bin/kill -HUP " (empty) in the generated unit.
cat > /etc/systemd/system/node_exporter.service <<'EOF'
[Unit]
Description=Node Exporter
Documentation=https://prometheus.io/docs/guides/node-exporter/
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
User=node_exporter
Group=node_exporter
ExecStart=/opt/node_exporter/node_exporter \
		--web.listen-address=:9100 \
		--collector.processes \
		--no-collector.hwmon \
		--no-collector.dmi \
		--no-collector.arp \
		--collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/containerd/.+|/var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/) \
		--collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$

Restart=on-failure
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process

[Install]
WantedBy=multi-user.target
EOF
bash
chmod 644 /etc/systemd/system/node_exporter.service
chmod 644 /etc/systemd/system/node_exporter.service
  • 启动
bash
systemctl daemon-reload
systemctl enable --now node_exporter
systemctl daemon-reload
systemctl enable --now node_exporter

3. 指标

3.1 cpu

1.CPU负载

CPU负载是指某段时间内占用CPU时间的进程和等待CPU时间的进程数之和

node_load1
node_load5
node_load15


node_load1 > on (instance) 4 * count by (instance)(node_cpu_seconds_total{mode="idle"})
node_load1
node_load5
node_load15


node_load1 > on (instance) 4 * count by (instance)(node_cpu_seconds_total{mode="idle"})

2.CPU使用率

node_cpu_seconds_total
node_cpu_seconds_total
bash
100 - avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100
100 - avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100

3.2 mem

node_memory_MemTotal_bytes    #总内存大小
node_memory_MemFree_bytes     #空闲内存大小
node_memory_Buffers_bytes     #缓冲缓存大小
node_memory_Cached_bytes      #页面缓存大小
node_memory_MemTotal_bytes    #总内存大小
node_memory_MemFree_bytes     #空闲内存大小
node_memory_Buffers_bytes     #缓冲缓存大小
node_memory_Cached_bytes      #页面缓存大小

计算的公式为:(总内存 -(空闲内存 + 缓冲缓存 + 页面缓存))/ 总内存 * 100

(node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes+node_memory_Cached_bytes ))/node_memory_MemTotal_bytes * 100

swap内存使用率

Swap为交换内存分区,它使用磁盘上的部分空间来充当服务器内存,当系统物理内存吃紧时,Linux 会将内存中不常访问的数据保存到 swap 上,这样系统就有更多的物理内存为各个进程服务。而当系统需要访问 swap 上存储的内容时,再将 swap 上的数据加载到内存中,这就是常说的换出和换入。交换空间可以在一定程度上缓解内存不足的情况,但是它需要读写磁盘数据,所以性能不是很高。

node_memory_SwapTotal_bytes  #swap内存总大小
node_memory_SwapFree_bytes   #swap空闲内存大小
node_memory_SwapTotal_bytes  #swap内存总大小
node_memory_SwapFree_bytes   #swap空闲内存大小

计算的公式如下:(node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes)/node_memory_SwapTotal_bytes * 100

内存饱和度

node_vmstat_pswpin:系统每秒从磁盘读到内存的字节数

node_vmstat_pswpout:系统每秒从内存写到磁盘的字节数

两者都是自上次启动以来的字节数,以KB为单位

为了获得饱合度指标,对每个指标计算每一分钟的速率,将两个速率相加,然后乘以1024获得字节数

1024 * sum by (instance) ((rate(node_vmstat_pswpin[1m]) + rate(node_vmstat_pswpout[1m])))
1024 * sum by (instance) ((rate(node_vmstat_pswpin[1m]) + rate(node_vmstat_pswpout[1m])))

3.3 disk

1.分区使用率

node_filesystem_size_bytes  # 分区空间总容量
node_filesystem_free_bytes  # 分区空闲容量
node_filesystem_size_bytes  # 分区空间总容量
node_filesystem_free_bytes  # 分区空闲容量
bash
(node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})/node_filesystem_size_bytes{mountpoint="/"} * 100
(node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})/node_filesystem_size_bytes{mountpoint="/"} * 100

2.磁盘吞吐量

node_disk_read_bytes_total  #分区读总字节数
node_disk_written_bytes_total #分区写总字节数
node_disk_read_bytes_total  #分区读总字节数
node_disk_written_bytes_total #分区写总字节数

上面两个指标分别对应了分区读写的总字节数,指标为counter类型。前面文章讲过,counter类型会不断的累加,该指标直接使用对于监控没有意义,但可通过下面公式转化为磁盘的每秒读写速率。device代表对应的磁盘分区。

bash
irate(node_disk_read_bytes_total{device="vda"}[5m]) 
irate(node_disk_written_bytes_total{device="vda"}[5m])
irate(node_disk_read_bytes_total{device="vda"}[5m]) 
irate(node_disk_written_bytes_total{device="vda"}[5m])

3.磁盘IOPS

IOPS表示每秒对磁盘的读写次数,它与吞吐量都是衡量磁盘的重要指标

node_disk_reads_completed_total  #分区读总次数
node_disk_writes_completed_total  #分区写总次数
node_disk_reads_completed_total  #分区读总次数
node_disk_writes_completed_total  #分区写总次数

计算公式与上面相似,使用我们熟悉的irate或rate函数来处理

irate(node_disk_reads_completed_total{device="vda"}[5m])
irate(node_disk_writes_completed_total{device="vda"}[5m])
irate(node_disk_reads_completed_total{device="vda"}[5m])
irate(node_disk_writes_completed_total{device="vda"}[5m])

4.磁盘未来耗尽

可以使用predict_linear函数来构建在未来什么时间会耗尽磁盘空间

bash
predict_linear(node_filesystem_free_bytes{mountpoint="/"}[1h],4*3600)<0
predict_linear(node_filesystem_free_bytes{mountpoint="/"}[1h],4*3600)<0

上面是指定根文件系统,还可以通过指定作业名称或使用正则表达式来选择所有文件系统

bash
predict_linear(node_filesystem_free_bytes{job="node"}[1h], 4*3600) < 0
predict_linear(node_filesystem_free_bytes{job="node"}[1h], 4*3600) < 0

在上面中,我们选择一小时的时间窗口,并将此时间序列快照放在predict_linear函数中。该函数使用简单的线性回归,根 据以前的增长情况来确定文件系统何时会耗尽空间。该函数参数包括一个范围向量,即一小时窗口,以及未来需要预测的 时间点。这些都是以秒为单位的,因此这里使用4*3600秒,即四小时。最后<0,即文件系统空间不足

3.4 network

node_network_receive_bytes_total  #下载流量总字节数
node_network_transmit_bytes_total  #上传流量总字节数
node_network_receive_bytes_total  #下载流量总字节数
node_network_transmit_bytes_total  #上传流量总字节数

计算公式如下,此处排除Loopback 网卡

irate(node_network_receive_bytes_total{device!="lo"}[1m])
irate(node_network_receive_bytes_total{device!="lo"}[1m])

4. 收集器

4.1 textfile

暴露自定义指标时特别有用。这些自定义指标可能是批处理或cron作业等无法抓取的,可能是没有exporter的源,甚至可能是为主机提供上下文的静态指标。收集器通过扫描指定目录中的文件,提取所有格式为

Prometheus指标的字符串,然后暴露它们以便抓取

该参数自动加载,不用配置参数,但是需要指定textfile的目录

1 创建

  • 创建目录
bash
mkdir -p /data/monitor/prometheus/textfile_collector
mkdir -p /data/monitor/prometheus/textfile_collector
  • 创建文件
bash
echo 'metadata{role="docker_server",datacenter="NJ"} 1' >>  /data/monitor/prometheus/textfile_collector/metadata.prom
echo 'metadata{role="docker_server",datacenter="NJ"} 1' >>  /data/monitor/prometheus/textfile_collector/metadata.prom

2 启动

需要指定--collector.textfile.directory="/data/monitor/prometheus/textfile_collector"(与上面创建的目录保持一致)

3.配置

Text-based格式*.prom文件文件书写规范

每行必须使用换行符\n结束,空行会被忽略。
#符号开头,后面不接HELP或TYPE的行,视为注释。

# HELP开头,后面第一个字段是metric名,再后面的字段或字符被视为对metric的描述。

# TYPE开头,后面第一个字段是metric名,第二个字段是metric类型,metric类型有counter, gauge, histogram, summary, or untyped。
每行必须使用换行符\n结束,空行会被忽略。
#符号开头,后面不接HELP或TYPE的行,视为注释。

# HELP开头,后面第一个字段是metric名,再后面的字段或字符被视为对metric的描述。

# TYPE开头,后面第一个字段是metric名,第二个字段是metric类型,metric类型有counter, gauge, histogram, summary, or untyped。

相同的metric名只能有一个TYPE,并且TYPE这行要放在metric取样之前,如果没有为metric设置TYPE,metric类型被设置为untyped

比如:

bash
# cat  /data/monitor/prometheus/textfile_collector/worker_num.prom 
# HELP worker_num airflow worker number
# TYPE worker_num gauge
worker_num 2001
# cat  /data/monitor/prometheus/textfile_collector/worker_num.prom 
# HELP worker_num airflow worker number
# TYPE worker_num gauge
worker_num 2001
  • 采集脚本
bash
# cat /data/monitor/prometheus/textfile_collector/scripts/worker_exporter.sh
# Count running celery worker processes and expose the count through the
# node_exporter textfile collector. Write to a staging file first, then
# mv (an atomic rename on the same filesystem) so node_exporter never
# scrapes a half-written .prom file.
worker_num=$(ps uax | grep celery | grep -v grep | wc -l)
echo "# HELP worker_num airflow worker number" > /data/monitor/prometheus/textfile_collector/worker_num
echo "# TYPE worker_num gauge" >> /data/monitor/prometheus/textfile_collector/worker_num
echo "worker_num ${worker_num}" >> /data/monitor/prometheus/textfile_collector/worker_num
# Bug fix: the mv source must be the file written above; the original
# moved an unrelated /home/hadoop/airflow/... path, so the .prom file
# never contained the freshly collected value.
mv /data/monitor/prometheus/textfile_collector/worker_num /data/monitor/prometheus/textfile_collector/worker_num.prom
# cat /data/monitor/prometheus/textfile_collector/scripts/worker_exporter.sh
# Count running celery worker processes and expose the count through the
# node_exporter textfile collector. Write to a staging file first, then
# mv (an atomic rename on the same filesystem) so node_exporter never
# scrapes a half-written .prom file.
worker_num=$(ps uax | grep celery | grep -v grep | wc -l)
echo "# HELP worker_num airflow worker number" > /data/monitor/prometheus/textfile_collector/worker_num
echo "# TYPE worker_num gauge" >> /data/monitor/prometheus/textfile_collector/worker_num
echo "worker_num ${worker_num}" >> /data/monitor/prometheus/textfile_collector/worker_num
# Bug fix: the mv source must be the file written above; the original
# moved an unrelated /home/hadoop/airflow/... path, so the .prom file
# never contained the freshly collected value.
mv /data/monitor/prometheus/textfile_collector/worker_num /data/monitor/prometheus/textfile_collector/worker_num.prom
  • 计划任务

...

监控端口案例

1.创建文件

bash
# Write the port-monitor script. The here-doc delimiter MUST be quoted
# ('EOF'): with an unquoted EOF the outer shell expands $port,
# ${MONITORED_PORTS[@]}, ${MONITOR_DATA_OUTPUT_FILE} and even runs the
# $(netstat ...) command substitution while writing the file, producing
# a broken script. (Also fixed the DADA -> DATA variable-name typo.)
cat > /opt/prometheus/monitor_open_ports.sh <<'EOF'
#!/bin/bash

MONITOR_DATA_OUTPUT_FILE='/opt/prometheus/data/open_ports.prom'

# Define the ports to monitor. Add or remove ports as needed.
MONITORED_PORTS=(7380 7381 7382 17380 17381 17382)

# Emit one gauge sample per monitored port: the number of LISTEN
# sockets found on that port (0 when the port is closed).
get_open_ports() {
  for port in "${MONITORED_PORTS[@]}"; do
    open_port_count=$(netstat -an | grep -c -E ":${port}.*LISTEN")
    if [[ $open_port_count -gt 0 ]]; then
      echo "node_exporter_open_ports{port=\"$port\"} $open_port_count" >> "${MONITOR_DATA_OUTPUT_FILE}"
    else
      echo "node_exporter_open_ports{port=\"$port\"} 0" >> "${MONITOR_DATA_OUTPUT_FILE}"
    fi
  done
}

# Define Metrics Type and clear OLD DATA.
echo "# TYPE node_exporter_open_ports gauge" > "${MONITOR_DATA_OUTPUT_FILE}"

get_open_ports
EOF
# Write the port-monitor script. The here-doc delimiter MUST be quoted
# ('EOF'): with an unquoted EOF the outer shell expands $port,
# ${MONITORED_PORTS[@]}, ${MONITOR_DATA_OUTPUT_FILE} and even runs the
# $(netstat ...) command substitution while writing the file, producing
# a broken script. (Also fixed the DADA -> DATA variable-name typo.)
cat > /opt/prometheus/monitor_open_ports.sh <<'EOF'
#!/bin/bash

MONITOR_DATA_OUTPUT_FILE='/opt/prometheus/data/open_ports.prom'

# Define the ports to monitor. Add or remove ports as needed.
MONITORED_PORTS=(7380 7381 7382 17380 17381 17382)

# Emit one gauge sample per monitored port: the number of LISTEN
# sockets found on that port (0 when the port is closed).
get_open_ports() {
  for port in "${MONITORED_PORTS[@]}"; do
    open_port_count=$(netstat -an | grep -c -E ":${port}.*LISTEN")
    if [[ $open_port_count -gt 0 ]]; then
      echo "node_exporter_open_ports{port=\"$port\"} $open_port_count" >> "${MONITOR_DATA_OUTPUT_FILE}"
    else
      echo "node_exporter_open_ports{port=\"$port\"} 0" >> "${MONITOR_DATA_OUTPUT_FILE}"
    fi
  done
}

# Define Metrics Type and clear OLD DATA.
echo "# TYPE node_exporter_open_ports gauge" > "${MONITOR_DATA_OUTPUT_FILE}"

get_open_ports
EOF

指标记录中的字符串格式的值要用 "",不能使用 '' 。否则会报错

2.启动配置

ExecStart=/usr/bin/node_exporter --collector.textfile.directory=/opt/prometheus/data/
ExecStart=/usr/bin/node_exporter --collector.textfile.directory=/opt/prometheus/data/

3.监控*prom文件是否变化

yaml
- alert: TextFileNotUpdated
  expr: (time() - node_textfile_mtime_seconds{file="your_file.prom"}) > 600
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: "Textfile not updated within expected timeframe"
    description: "The file your_file.prom has not been updated in the last 10 minutes."
- alert: TextFileNotUpdated
  expr: (time() - node_textfile_mtime_seconds{file="your_file.prom"}) > 600
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: "Textfile not updated within expected timeframe"
    description: "The file your_file.prom has not been updated in the last 10 minutes."

4.2 systemd收集器

1.启动

/opt/apps/node_exporter/bin/node_exporter --collector.systemd
/opt/apps/node_exporter/bin/node_exporter --collector.systemd

添加白名单

--collector.systemd.unit-whitelist "(docker|ssh|kube-apiserver|kube-controller-manager|kube-scheduler|kubelet|kube-proxy).service"
--collector.systemd.unit-whitelist "(docker|ssh|kube-apiserver|kube-controller-manager|kube-scheduler|kubelet|kube-proxy).service"
  • PromQL查询
node_systemd_unit_state{name="kube-apiserver.service",state="active"}
node_systemd_unit_state{name="kube-apiserver.service",state="active"}

或者在node_exporter上执行

shell
$ curl -g -X GET 10.103.236.199:9100/metrics?collect[]=systemd
$ curl -g -X GET 10.103.236.199:9100/metrics?collect[]=systemd

image-20250213135852624