第一种:RabbitMQ内部集成Prometheus来获取指标
- 3.8.0之前版本,RabbitMQ可以使用单独的插件prometheus_rabbitmq_exporter来向Prometheus公开指标,要单独下载到RabbitMQ安装目录中进行安装;
prometheus_rabbitmq_exporter:https://github.com/deadtrickster/prometheus_rabbitmq_exporter
- 3.8.0版开始,RabbitMQ附带了内置的Prometheus&Grafana支持。该插件虽已随安装包内置,但默认未启用,需要手动启用(rabbitmq-plugins enable rabbitmq_prometheus)
rabbitmq-prometheus:https://github.com/rabbitmq/rabbitmq-prometheus
第二种:使用独立程序来获取指标(RabbitMQ_exporter)
不管什么版本都能使用,要单独启动exporter进程
rabbitmq_exporter:https://github.com/kbudde/rabbitmq_exporter
RabbitMQ 官方监控介绍:
- https://www.rabbitmq.com/monitoring.html
- https://www.rabbitmq.com/prometheus.html#overview-prometheus
1、rabbitmq_exporter采集介绍
rabbitmq_exporter通过RabbitMQ管理接口(HTTP API)读取监控数据,将其转换成Prometheus指标格式,并通过HTTP接口暴露出来供Prometheus采集。
2、部署rabbitmq_exporter
https://github.com/kbudde/rabbitmq_exporter
[root@node1 prometheus]# wget https://github.com/kbudde/rabbitmq_exporter/releases/download/v1.0.0-RC7/rabbitmq_exporter-1.0.0-RC7.linux-amd64.tar.gz
[root@node1 prometheus]# tar xf rabbitmq_exporter-1.0.0-RC7.linux-amd64.tar.gz
[root@node1 prometheus]# mv -f rabbitmq_exporter-1.0.0-RC7.linux-amd64/ rabbitmq_exporter
[root@node1 prometheus]# cd rabbitmq_exporter
[root@node1 prometheus]# wget https://github.com/kbudde/rabbitmq_exporter/releases/download/v1.0.0-RC7/rabbitmq_exporter-1.0.0-RC7.linux-amd64.tar.gz
[root@node1 prometheus]# tar xf rabbitmq_exporter-1.0.0-RC7.linux-amd64.tar.gz
[root@node1 prometheus]# mv -f rabbitmq_exporter-1.0.0-RC7.linux-amd64/ rabbitmq_exporter
[root@node1 prometheus]# cd rabbitmq_exporter
- 配置文件
vim config.example.json
{
"rabbit_url": "http://127.0.0.1:15672",
"rabbit_user": "guest",
"rabbit_pass": "guest",
"publish_port": "9419",
"publish_addr": "",
"output_format": "TTY",
"ca_file": "ca.pem",
"cert_file": "client-cert.pem",
"key_file": "client-key.pem",
"insecure_skip_verify": false,
"exlude_metrics": [],
"include_exchanges": ".*",
"skip_exchanges": "^$",
"include_queues": ".*",
"skip_queues": "^$",
"skip_vhost": "^$",
"include_vhost": ".*",
"rabbit_capabilities": "no_sort,bert",
"aliveness_vhost": "/",
"enabled_exporters": [
"exchange",
"node",
"overview",
"queue",
"aliveness"
],
"timeout": 30,
"max_queues": 0
}
vim config.example.json
{
"rabbit_url": "http://127.0.0.1:15672",
"rabbit_user": "guest",
"rabbit_pass": "guest",
"publish_port": "9419",
"publish_addr": "",
"output_format": "TTY",
"ca_file": "ca.pem",
"cert_file": "client-cert.pem",
"key_file": "client-key.pem",
"insecure_skip_verify": false,
"exlude_metrics": [],
"include_exchanges": ".*",
"skip_exchanges": "^$",
"include_queues": ".*",
"skip_queues": "^$",
"skip_vhost": "^$",
"include_vhost": ".*",
"rabbit_capabilities": "no_sort,bert",
"aliveness_vhost": "/",
"enabled_exporters": [
"exchange",
"node",
"overview",
"queue",
"aliveness"
],
"timeout": 30,
"max_queues": 0
}
配置rabbitmq_exporter服务启动
[root@node1 rabbitmq_exporter]# cat /usr/lib/systemd/system/rabbitmq_exporter.service
[Service]
User=root
Group=root
ExecStart=/data/prometheus/rabbitmq_exporter/rabbitmq_exporter -config-file /data/prometheus/rabbitmq_exporter/config.json
[Install]
WantedBy=multi-user.target
[Unit]
Description=rabbitmq_exporter
After=network.target
[root@node1 rabbitmq_exporter]# cat /usr/lib/systemd/system/rabbitmq_exporter.service
[Service]
User=root
Group=root
ExecStart=/data/prometheus/rabbitmq_exporter/rabbitmq_exporter -config-file /data/prometheus/rabbitmq_exporter/config.json
[Install]
WantedBy=multi-user.target
[Unit]
Description=rabbitmq_exporter
After=network.target
3、添加采集监控配置
采集多个rabbitmq_exporter通常用label加以区分
scrape_configs:
# 线上rabbitmq 监控
- job_name: 'prod-rabbitmq01'
honor_timestamps: true
scrape_interval: 30s
scrape_timeout: 10s
metrics_path: /metrics
scheme: http
static_configs:
- targets: ['10.x.x.47:9099']
labels:
instance: prod-rabbitmq-01
# 线上rabbitmq-seckill 监控
- job_name: 'prod-rabbitmq02'
static_configs:
- targets: ['10.0.x.x:19099']
labels:
instance: prod-rabbitmq-02
scrape_configs:
# 线上rabbitmq 监控
- job_name: 'prod-rabbitmq01'
honor_timestamps: true
scrape_interval: 30s
scrape_timeout: 10s
metrics_path: /metrics
scheme: http
static_configs:
- targets: ['10.x.x.47:9099']
labels:
instance: prod-rabbitmq-01
# 线上rabbitmq-seckill 监控
- job_name: 'prod-rabbitmq02'
static_configs:
- targets: ['10.0.x.x:19099']
labels:
instance: prod-rabbitmq-02
4、添加监控面板
5、添加告警Alert Rule
https://chenzhonzhou.github.io/2020/04/02/prometheusalert-duo-qu-dao-gao-jing-tong-zhi-gong-ju/
编辑prometheus-rule文件添加
#mq 节点状态
- alert: rabbitmq-status
expr: rabbitmq_running{job="prod-rabbitmq"} != 1
for: 10s
labels:
severity: 严重
app: ops
annotations:
summary: "rabbitmq Instance down"
description: "生产rabbitmq:{{ $labels.node }} 节点宕机 (当前值: {{ $value }})"
#mq节点内存使用
- alert: rabbitmq-node_mem_used
expr: rabbitmq_node_mem_used_re{job="prod-rabbitmq"} / rabbitmq_node_mem_limit_re{job="prod-rabbitmq"} > 0.8
for: 10s
labels:
severity: 严重
app: ops
annotations:
summary: "rabbitmq pod节点内存使用"
description: "rabbitmq:{{ $labels.node }} 内存实际使用超80% (当前值: {{ $value }})"
#mq堆积消息监控
- alert: rabbitmq_queue_messages_unack
expr: sum by (queue ,job)(rabbitmq_queue_messages_unacknowledged{job="prod-rabbitmq"}) > 500
for: 5m
labels:
severity: critical
app: ops
annotations:
summary: "{{ $labels.queue }} MQ队列unack消息> 500 "
description: "{{ $labels.queue }} MQ队列unack消息> 500 (当前值: {{ $value }})"
#MQ推送错误监控
- alert: PushMqFailed
expr: sum by (job) (rabbitmq_failed_to_publish_total{job != ""}) >5
for: 30s
labels:
severity: critical
app: ops
annotations:
summary: "{{ $labels.job }} {{ $labels.instance }} 推送MQ 错误大于5 "
description: "{{ $labels.job }} {{ $labels.instance }} 推送MQ 错误大于5 (当前值: {{ $value }})"
#mq队列消息监控
- alert: rabbitmq_queue_messages_ready
expr: sum by(queue ,job) (rabbitmq_queue_messages_ready_re{job="prod-rabbitmq"})
for: 5m
labels:
severity: critical
app: ops
annotations:
summary: "{{ $labels.queue }} MQ队列ready消息> 500 "
description: "{{ $labels.queue }} MQ队列ready消息数> 5000 持续10min (当前值: {{ $value }})"
#mq 节点状态
- alert: rabbitmq-status
expr: rabbitmq_running{job="prod-rabbitmq"} != 1
for: 10s
labels:
severity: 严重
app: ops
annotations:
summary: "rabbitmq Instance down"
description: "生产rabbitmq:{{ $labels.node }} 节点宕机 (当前值: {{ $value }})"
#mq节点内存使用
- alert: rabbitmq-node_mem_used
expr: rabbitmq_node_mem_used_re{job="prod-rabbitmq"} / rabbitmq_node_mem_limit_re{job="prod-rabbitmq"} > 0.8
for: 10s
labels:
severity: 严重
app: ops
annotations:
summary: "rabbitmq pod节点内存使用"
description: "rabbitmq:{{ $labels.node }} 内存实际使用超80% (当前值: {{ $value }})"
#mq堆积消息监控
- alert: rabbitmq_queue_messages_unack
expr: sum by (queue ,job)(rabbitmq_queue_messages_unacknowledged{job="prod-rabbitmq"}) > 500
for: 5m
labels:
severity: critical
app: ops
annotations:
summary: "{{ $labels.queue }} MQ队列unack消息> 500 "
description: "{{ $labels.queue }} MQ队列unack消息> 500 (当前值: {{ $value }})"
#MQ推送错误监控
- alert: PushMqFailed
expr: sum by (job) (rabbitmq_failed_to_publish_total{job != ""}) >5
for: 30s
labels:
severity: critical
app: ops
annotations:
summary: "{{ $labels.job }} {{ $labels.instance }} 推送MQ 错误大于5 "
description: "{{ $labels.job }} {{ $labels.instance }} 推送MQ 错误大于5 (当前值: {{ $value }})"
#mq队列消息监控
- alert: rabbitmq_queue_messages_ready
expr: sum by(queue ,job) (rabbitmq_queue_messages_ready_re{job="prod-rabbitmq"})
for: 5m
labels:
severity: critical
app: ops
annotations:
summary: "{{ $labels.queue }} MQ队列ready消息> 500 "
description: "{{ $labels.queue }} MQ队列ready消息数> 5000 持续10min (当前值: {{ $value }})"