1. 容器方式
docker run -d --restart=always -p 9100:9100 prom/node-exporter
docker run -d --restart=always -p 9100:9100 prom/node-exporter
1.1 docker-compose
version: '3'
services:
  node-exporter:
    image: prom/node-exporter
    container_name: node-exporter
    restart: always
    ports:
      - "9100:9100"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
      - /var/run/dbus/system_bus_socket:/var/run/dbus/system_bus_socket:ro
    command:
      # Point the exporter at the host filesystems mounted above; without
      # these flags it would read the container's own /proc and /sys.
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--path.rootfs=/rootfs'
      - '--collector.processes'
      - '--no-collector.hwmon'
      - '--no-collector.dmi'
      - '--no-collector.arp'
      - '--no-collector.infiniband'
      # Fixed stray leading '/' before var/lib/docker (pattern is already anchored with ^/)
      - '--collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/containerd/.+|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)'
      - '--collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$'
      - '--collector.systemd'
version: '3'
services:
  node-exporter:
    image: prom/node-exporter
    container_name: node-exporter
    restart: always
    ports:
      - "9100:9100"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
      - /var/run/dbus/system_bus_socket:/var/run/dbus/system_bus_socket:ro
    command:
      # Point the exporter at the host filesystems mounted above; without
      # these flags it would read the container's own /proc and /sys.
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--path.rootfs=/rootfs'
      - '--collector.processes'
      - '--no-collector.hwmon'
      - '--no-collector.dmi'
      - '--no-collector.arp'
      - '--no-collector.infiniband'
      # Fixed stray leading '/' before var/lib/docker (pattern is already anchored with ^/)
      - '--collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/containerd/.+|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)'
      - '--collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$'
      - '--collector.systemd'
2. 二进制方式
2.1 下载
https://github.com/prometheus/node_exporter/releases
wget https://github.com/prometheus/node_exporter/releases/download/v1.8.2/node_exporter-1.8.2.linux-amd64.tar.gz
#或者
curl -LO https://github.com/prometheus/node_exporter/releases/download/v1.8.2/node_exporter-1.8.2.linux-amd64.tar.gz
wget https://github.com/prometheus/node_exporter/releases/download/v1.8.2/node_exporter-1.8.2.linux-amd64.tar.gz
#或者
curl -LO https://github.com/prometheus/node_exporter/releases/download/v1.8.2/node_exporter-1.8.2.linux-amd64.tar.gz
2.2 部署
- 创建目录
mkdir /opt/node_exporter
mkdir /opt/node_exporter
- 创建用户
useradd -rs /bin/false node_exporter
useradd -rs /bin/false node_exporter
- 解压
# Unpack the release tarball and install the binary under /opt.
tar xvfz node_exporter-*.*-amd64.tar.gz
cd node_exporter-*.*-amd64
mv node_exporter /opt/node_exporter
# Use the ':' owner:group separator — the 'user.' form is deprecated GNU syntax.
chown -R node_exporter:node_exporter /opt/node_exporter
tar xvfz node_exporter-*.*-amd64.tar.gz
cd node_exporter-*.*-amd64
mv node_exporter /opt/node_exporter
chown -R node_exporter:node_exporter /opt/node_exporter
- 创建systemd
# Write the systemd unit. The here-doc delimiter is quoted ('EOF') so the
# shell does NOT expand $MAINPID at write time — the original unquoted EOF
# left ExecReload with an empty argument in the generated unit file.
cat > /etc/systemd/system/node_exporter.service <<'EOF'
[Unit]
Description=Node Exporter
Documentation=https://prometheus.io/docs/guides/node-exporter/
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
User=node_exporter
Group=node_exporter
ExecStart=/opt/node_exporter/node_exporter \
  --web.listen-address=:9100 \
  --collector.processes \
  --no-collector.hwmon \
  --no-collector.dmi \
  --no-collector.arp \
  --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/containerd/.+|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/) \
  --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$
Restart=on-failure
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process

[Install]
WantedBy=multi-user.target
EOF
# Write the systemd unit. The here-doc delimiter is quoted ('EOF') so the
# shell does NOT expand $MAINPID at write time — the original unquoted EOF
# left ExecReload with an empty argument in the generated unit file.
cat > /etc/systemd/system/node_exporter.service <<'EOF'
[Unit]
Description=Node Exporter
Documentation=https://prometheus.io/docs/guides/node-exporter/
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
User=node_exporter
Group=node_exporter
ExecStart=/opt/node_exporter/node_exporter \
  --web.listen-address=:9100 \
  --collector.processes \
  --no-collector.hwmon \
  --no-collector.dmi \
  --no-collector.arp \
  --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/containerd/.+|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/) \
  --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$
Restart=on-failure
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process

[Install]
WantedBy=multi-user.target
EOF
chmod 644 /etc/systemd/system/node_exporter.service
chmod 644 /etc/systemd/system/node_exporter.service
- 启动
systemctl daemon-reload
systemctl enable --now node_exporter
systemctl daemon-reload
systemctl enable --now node_exporter
3. 指标
3.1 cpu
1.CPU负载
CPU负载是指某段时间内占用CPU时间的进程和等待CPU时间的进程数之和
node_load1
node_load5
node_load15
node_load1 > on (instance) 4 * count by (instance)(node_cpu_seconds_total{mode="idle"})
node_load1
node_load5
node_load15
node_load1 > on (instance) 4 * count by (instance)(node_cpu_seconds_total{mode="idle"})
2.CPU使用率
node_cpu_seconds_total
node_cpu_seconds_total
100 -avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance)* 100
100 -avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance)* 100
3.2 mem
node_memory_MemTotal_bytes #总内存大小
node_memory_MemFree_bytes #空闲内存大小
node_memory_Buffers_bytes #缓冲缓存大小
node_memory_Cached_bytes #页面缓存大小
node_memory_MemTotal_bytes #总内存大小
node_memory_MemFree_bytes #空闲内存大小
node_memory_Buffers_bytes #缓冲缓存大小
node_memory_Cached_bytes #页面缓存大小
计算的公式为:(总内存 -(空闲内存 + 缓冲缓存 + 页面缓存))/ 总内存 * 100
(node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes+node_memory_Cached_bytes ))/node_memory_MemTotal_bytes * 100
swap内存使用率
Swap为交换内存分区,它使用磁盘上的部分空间来充当服务器内存,当系统物理内存吃紧时,Linux 会将内存中不常访问的数据保存到 swap 上,这样系统就有更多的物理内存为各个进程服务。而当系统需要访问 swap 上存储的内容时,再将 swap 上的数据加载到内存中,这就是常说的换出和换入。交换空间可以在一定程度上缓解内存不足的情况,但是它需要读写磁盘数据,所以性能不是很高。
node_memory_SwapTotal_bytes #swap内存总大小
node_memory_SwapFree_bytes #swap空闲内存大小
node_memory_SwapTotal_bytes #swap内存总大小
node_memory_SwapFree_bytes #swap空闲内存大小
计算的公式如下:(node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes)/node_memory_SwapTotal_bytes * 100
内存饱和度
node_vmstat_pgpgin:系统从磁盘读入内存的数据量(page in)
node_vmstat_pgpgout:系统从内存写出到磁盘的数据量(page out)
两者都是自上次启动以来的累计值,以KB为单位
为了获得饱和度指标,对每个指标计算每一分钟的速率,将两个速率相加,然后乘以1024换算为字节数
1024 * sum by (instance) ((rate(node_vmstat_pgpgin[1m]) + rate(node_vmstat_pgpgout[1m])))
1024 * sum by (instance) ((rate(node_vmstat_pgpgin[1m]) + rate(node_vmstat_pgpgout[1m])))
3.3 disk
1.分区使用率
node_filesystem_size_bytes # 分区空间总容量
node_filesystem_free_bytes # 分区空闲容量
node_filesystem_size_bytes # 分区空间总容量
node_filesystem_free_bytes # 分区空闲容量
(node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})/node_filesystem_size_bytes{mountpoint="/"} * 100
(node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})/node_filesystem_size_bytes{mountpoint="/"} * 100
2.磁盘吞吐量
node_disk_read_bytes_total #分区读总字节数
node_disk_written_bytes_total #分区写总字节数
node_disk_read_bytes_total #分区读总字节数
node_disk_written_bytes_total #分区写总字节数
上面两个指标分别对应了分区读写的总字节数,指标为counter类型。前面文章讲过,counter类型会不断的累加,该指标直接使用对于监控没有意义,但可通过下面公式转化为磁盘的每秒读写速率。device代表对应的磁盘分区。
irate(node_disk_read_bytes_total{device="vda"}[5m])
irate(node_disk_written_bytes_total{device="vda"}[5m])
irate(node_disk_read_bytes_total{device="vda"}[5m])
irate(node_disk_written_bytes_total{device="vda"}[5m])
3.磁盘IOPS
IOPS表示每秒对磁盘的读写次数,它与吞吐量都是衡量磁盘的重要指标
node_disk_reads_completed_total #分区读总次数
node_disk_writes_completed_total #分区写总次数
node_disk_reads_completed_total #分区读总次数
node_disk_writes_completed_total #分区写总次数
计算公式与上面相似,使用我们熟悉的irate或rate函数来处理
irate(node_disk_reads_completed_total{device="vda"}[5m])
irate(node_disk_writes_completed_total{device="vda"}[5m])
irate(node_disk_reads_completed_total{device="vda"}[5m])
irate(node_disk_writes_completed_total{device="vda"}[5m])
4.磁盘未来耗尽
可以使用predict_linear函数来构建在未来什么时间会耗尽磁盘空间
predict_linear(node_filesystem_free_bytes{mountpoint="/"}[1h], 4*3600) < 0
predict_linear(node_filesystem_free_bytes{mountpoint="/"}[1h], 4*3600) < 0
上面是指定根文件系统,还可以通过指定作业名称或使用正则表达式来选择所有文件系统
predict_linear(node_filesystem_free_bytes{job="node"}[1h], 4*3600) < 0
predict_linear(node_filesystem_free_bytes{job="node"}[1h], 4*3600) < 0
在上面中,我们选择一小时的时间窗口,并将此时间序列快照放在predict_linear函数中。该函数使用简单的线性回归,根 据以前的增长情况来确定文件系统何时会耗尽空间。该函数参数包括一个范围向量,即一小时窗口,以及未来需要预测的 时间点。这些都是以秒为单位的,因此这里使用4*3600秒,即四小时。最后<0,即文件系统空间不足
3.4 network
node_network_receive_bytes_total #下载流量总字节数
node_network_transmit_bytes_total #上传流量总字节数
node_network_receive_bytes_total #下载流量总字节数
node_network_transmit_bytes_total #上传流量总字节数
计算公式如下,此处排除Loopback 网卡
irate(node_network_receive_bytes_total{device != "lo"}[1m])
irate(node_network_receive_bytes_total{device != "lo"}[1m])
4. 收集器
4.1 textfile
暴露自定义指标时特别有用。这些自定义指标可能是批处理或cron作业等无法抓取的,可能是没有exporter的源,甚至可能是为主机提供上下文的静态指标。收集器通过扫描指定目录中的文件,提取所有格式为
Prometheus指标的字符串,然后暴露它们以便抓取
该参数自动加载,不用配置参数,但是需要指定textfile的目录
1 创建
- 创建目录
mkdir -p /data/monitor/prometheus/textfile_collector
mkdir -p /data/monitor/prometheus/textfile_collector
- 创建文件
# Use '>' (truncate) rather than '>>': appending on every run accumulates
# duplicate samples of the same series, which the textfile collector rejects.
echo 'metadata{role="docker_server",datacenter="NJ"} 1' > /data/monitor/prometheus/textfile_collector/metadata.prom
echo 'metadata{role="docker_server",datacenter="NJ"} 1' > /data/monitor/prometheus/textfile_collector/metadata.prom
2 启动
需要指定--collector.textfile.directory="/data/monitor/prometheus/textfile_collector"(与上面创建的目录保持一致)
3.配置
Text-based格式*.prom
文件书写规范
每行必须使用换行符\n结束,空行会被忽略。
#符号开头,后面不接HELP或TYPE的行,视为注释。
# HELP开头,后面第一个字段是metric名,再后面的字段或字符被视为对metric的描述。
# TYPE开头,后面第一个字段是metric名,第二个字段是metric类型,metric类型有counter, gauge, histogram, summary, or untyped。
每行必须使用换行符\n结束,空行会被忽略。
#符号开头,后面不接HELP或TYPE的行,视为注释。
# HELP开头,后面第一个字段是metric名,再后面的字段或字符被视为对metric的描述。
# TYPE开头,后面第一个字段是metric名,第二个字段是metric类型,metric类型有counter, gauge, histogram, summary, or untyped。
相同的metric名只能有一个TYPE,并且TYPE这行要放在metric取样之前,如果没有为metric设置TYPE,metric类型被设置为untyped
比如:
# cat /data/monitor/prometheus/textfile_collector/worker_num.prom
# HELP worker_num airflow worker number
# TYPE worker_num gauge
worker_num 2001
# cat /data/monitor/prometheus/textfile_collector/worker_num.prom
# HELP worker_num airflow worker number
# TYPE worker_num gauge
worker_num 2001
- 采集脚本
# cat /data/monitor/prometheus/textfile_collector/scripts/worker_exporter.sh
worker_num=$(ps uax | grep celery | grep -v grep | wc -l)
echo "# HELP worker_num airflow worker number" > /data/monitor/prometheus/textfile_collector/worker_num
echo "# TYPE worker_num gauge" >> /data/monitor/prometheus/textfile_collector/worker_num
echo "worker_num $worker_num" >> /data/monitor/prometheus/textfile_collector/worker_num
# Atomically publish the file we just wrote. The original mv'ed an unrelated
# path under /home/hadoop/... that this script never writes.
mv /data/monitor/prometheus/textfile_collector/worker_num /data/monitor/prometheus/textfile_collector/worker_num.prom
# cat /data/monitor/prometheus/textfile_collector/scripts/worker_exporter.sh
worker_num=$(ps uax | grep celery | grep -v grep | wc -l)
echo "# HELP worker_num airflow worker number" > /data/monitor/prometheus/textfile_collector/worker_num
echo "# TYPE worker_num gauge" >> /data/monitor/prometheus/textfile_collector/worker_num
echo "worker_num $worker_num" >> /data/monitor/prometheus/textfile_collector/worker_num
# Atomically publish the file we just wrote. The original mv'ed an unrelated
# path under /home/hadoop/... that this script never writes.
mv /data/monitor/prometheus/textfile_collector/worker_num /data/monitor/prometheus/textfile_collector/worker_num.prom
- 计划任务
...
监控端口案例
1.创建文件
# Quote the here-doc delimiter: the script body contains $port, $(...) and
# ${...} expansions that must be written literally into the file — with an
# unquoted EOF the outer shell expands them all at write time, producing a
# broken script. Also fixed the DADA -> DATA variable-name typo throughout.
cat > /opt/prometheus/monitor_open_ports.sh <<'EOF'
#!/bin/bash
MONITOR_DATA_OUTPUT_FILE='/opt/prometheus/data/open_ports.prom'
# Define the ports to monitor. Add or remove ports as needed.
MONITORED_PORTS=(7380 7381 7382 17380 17381 17382)

# Emit one gauge sample per monitored port: listener count, or 0 if closed.
get_open_ports() {
  for port in "${MONITORED_PORTS[@]}"; do
    open_port_count=$(netstat -an | grep -c -E ":${port}.*LISTEN")
    if [[ $open_port_count -gt 0 ]]; then
      echo "node_exporter_open_ports{port=\"$port\"} $open_port_count" >> "${MONITOR_DATA_OUTPUT_FILE}"
    else
      echo "node_exporter_open_ports{port=\"$port\"} 0" >> "${MONITOR_DATA_OUTPUT_FILE}"
    fi
  done
}

# Define Metrics Type and clear OLD DATA.
echo "# TYPE node_exporter_open_ports gauge" > "${MONITOR_DATA_OUTPUT_FILE}"
get_open_ports
EOF
# Quote the here-doc delimiter: the script body contains $port, $(...) and
# ${...} expansions that must be written literally into the file — with an
# unquoted EOF the outer shell expands them all at write time, producing a
# broken script. Also fixed the DADA -> DATA variable-name typo throughout.
cat > /opt/prometheus/monitor_open_ports.sh <<'EOF'
#!/bin/bash
MONITOR_DATA_OUTPUT_FILE='/opt/prometheus/data/open_ports.prom'
# Define the ports to monitor. Add or remove ports as needed.
MONITORED_PORTS=(7380 7381 7382 17380 17381 17382)

# Emit one gauge sample per monitored port: listener count, or 0 if closed.
get_open_ports() {
  for port in "${MONITORED_PORTS[@]}"; do
    open_port_count=$(netstat -an | grep -c -E ":${port}.*LISTEN")
    if [[ $open_port_count -gt 0 ]]; then
      echo "node_exporter_open_ports{port=\"$port\"} $open_port_count" >> "${MONITOR_DATA_OUTPUT_FILE}"
    else
      echo "node_exporter_open_ports{port=\"$port\"} 0" >> "${MONITOR_DATA_OUTPUT_FILE}"
    fi
  done
}

# Define Metrics Type and clear OLD DATA.
echo "# TYPE node_exporter_open_ports gauge" > "${MONITOR_DATA_OUTPUT_FILE}"
get_open_ports
EOF
指标记录中字符串格式的标签值要用双引号"",不能使用单引号'',否则会报错
2.启动配置
ExecStart=/usr/bin/node_exporter --collector.textfile.directory=/opt/prometheus/data/
ExecStart=/usr/bin/node_exporter --collector.textfile.directory=/opt/prometheus/data/
3.监控*prom文件是否变化
- alert: TextFileNotUpdated
expr: (time() - node_textfile_mtime_seconds{file="your_file.prom"}) > 600
for: 1m
labels:
severity: warning
annotations:
summary: "Textfile not updated within expected timeframe"
description: "The file your_file.prom has not been updated in the last 10 minutes."
- alert: TextFileNotUpdated
expr: (time() - node_textfile_mtime_seconds{file="your_file.prom"}) > 600
for: 1m
labels:
severity: warning
annotations:
summary: "Textfile not updated within expected timeframe"
description: "The file your_file.prom has not been updated in the last 10 minutes."
4.2 systemd收集器
1.启动
/opt/apps/node_exporter/bin/node_exporter --collector.systemd
/opt/apps/node_exporter/bin/node_exporter --collector.systemd
添加白名单
--collector.systemd.unit-whitelist "(docker|ssh|kube-apiserver|kube-controller-manager|kube-scheduler|kubelet|kube-proxy).service"
--collector.systemd.unit-whitelist "(docker|ssh|kube-apiserver|kube-controller-manager|kube-scheduler|kubelet|kube-proxy).service"
- PromQL查询
node_systemd_unit_state{name="kube-apiserver.service",state="active"}
node_systemd_unit_state{name="kube-apiserver.service",state="active"}
或者在node_exporter上执行
$ curl -g -X GET 10.103.236.199:9100/metrics?collect[]=systemd
$ curl -g -X GET 10.103.236.199:9100/metrics?collect[]=systemd