Skip to content

第一种:RabbitMQ内部集成Prometheus来获取指标

  • 3.8.0之前版本,RabbitMQ可以使用单独的插件prometheus_rabbitmq_exporter来向Prometheus公开指标,要单独下载到RabbitMQ安装目录中进行安装;

prometheus_rabbitmq_exporter:https://github.com/deadtrickster/prometheus_rabbitmq_exporter

  • 3.8.0版开始,RabbitMQ附带了内置的Prometheus&Grafana支持。该插件虽已随RabbitMQ一起发布,但默认未启用,需要手动启用(rabbitmq-plugins enable rabbitmq_prometheus)

rabbitmq-prometheus:https://github.com/rabbitmq/rabbitmq-prometheus

第二种:使用独立程序来获取指标(RabbitMQ_exporter)

不管什么版本都能使用,要单独启动exporter进程

rabbitmq_exporter:https://github.com/kbudde/rabbitmq_exporter

RabbitMQ 官方监控介绍:

1、rabbitmq_exporter采集介绍

rabbitmq_exporter通过RabbitMQ的管理接口(即下文配置中的rabbit_url,示例为15672端口)读取监控数据,将其转换成Prometheus指标格式,并通过HTTP接口暴露出来供Prometheus采集:

2、部署rabbitmq_exporter

https://github.com/kbudde/rabbitmq_exporter

[root@node1 prometheus]# wget https://github.com/kbudde/rabbitmq_exporter/releases/download/v1.0.0-RC7/rabbitmq_exporter-1.0.0-RC7.linux-amd64.tar.gz
[root@node1 prometheus]# tar xf rabbitmq_exporter-1.0.0-RC7.linux-amd64.tar.gz
[root@node1 prometheus]# mv -f rabbitmq_exporter-1.0.0-RC7.linux-amd64/ rabbitmq_exporter
[root@node1 prometheus]# cd rabbitmq_exporter
[root@node1 prometheus]# wget https://github.com/kbudde/rabbitmq_exporter/releases/download/v1.0.0-RC7/rabbitmq_exporter-1.0.0-RC7.linux-amd64.tar.gz
[root@node1 prometheus]# tar xf rabbitmq_exporter-1.0.0-RC7.linux-amd64.tar.gz
[root@node1 prometheus]# mv -f rabbitmq_exporter-1.0.0-RC7.linux-amd64/ rabbitmq_exporter
[root@node1 prometheus]# cd rabbitmq_exporter
  • 配置文件(示例文件为config.example.json,修改后需保存为下文服务启动命令中-config-file所指向的config.json)
vim config.example.json
{
    "rabbit_url": "http://127.0.0.1:15672",
    "rabbit_user": "guest",
    "rabbit_pass": "guest",
    "publish_port": "9419",
    "publish_addr": "",
    "output_format": "TTY",
    "ca_file": "ca.pem",
    "cert_file": "client-cert.pem",
    "key_file": "client-key.pem",
    "insecure_skip_verify": false,
    "exlude_metrics": [],
    "include_exchanges": ".*",
    "skip_exchanges": "^$",
    "include_queues": ".*",
    "skip_queues": "^$",
    "skip_vhost": "^$",
    "include_vhost": ".*",
    "rabbit_capabilities": "no_sort,bert",
    "aliveness_vhost": "/",
    "enabled_exporters": [
            "exchange",
            "node",
            "overview",
            "queue",
            "aliveness"
    ],
    "timeout": 30,
    "max_queues": 0
}
vim config.example.json
{
    "rabbit_url": "http://127.0.0.1:15672",
    "rabbit_user": "guest",
    "rabbit_pass": "guest",
    "publish_port": "9419",
    "publish_addr": "",
    "output_format": "TTY",
    "ca_file": "ca.pem",
    "cert_file": "client-cert.pem",
    "key_file": "client-key.pem",
    "insecure_skip_verify": false,
    "exlude_metrics": [],
    "include_exchanges": ".*",
    "skip_exchanges": "^$",
    "include_queues": ".*",
    "skip_queues": "^$",
    "skip_vhost": "^$",
    "include_vhost": ".*",
    "rabbit_capabilities": "no_sort,bert",
    "aliveness_vhost": "/",
    "enabled_exporters": [
            "exchange",
            "node",
            "overview",
            "queue",
            "aliveness"
    ],
    "timeout": 30,
    "max_queues": 0
}

配置rabbitmq_exporter服务启动

[root@node1 rabbitmq_exporter]# cat /usr/lib/systemd/system/rabbitmq_exporter.service
[Service]
User=root
Group=root
ExecStart=/data/prometheus/rabbitmq_exporter/rabbitmq_exporter -config-file /data/prometheus/rabbitmq_exporter/config.json

[Install]
WantedBy=multi-user.target

[Unit]
Description=rabbitmq_exporter
After=network.target
[root@node1 rabbitmq_exporter]# cat /usr/lib/systemd/system/rabbitmq_exporter.service
[Service]
User=root
Group=root
ExecStart=/data/prometheus/rabbitmq_exporter/rabbitmq_exporter -config-file /data/prometheus/rabbitmq_exporter/config.json

[Install]
WantedBy=multi-user.target

[Unit]
Description=rabbitmq_exporter
After=network.target

3、添加采集监控配置

采集多个rabbitmq_exporter通常用label加以区分

    scrape_configs:

    # 线上rabbitmq 监控

    - job_name: 'prod-rabbitmq01'

      honor_timestamps: true

      scrape_interval: 30s

      scrape_timeout: 10s

      metrics_path: /metrics

      scheme: http

      static_configs:

        - targets: ['10.x.x.47:9099']

          labels:

            instance: prod-rabbitmq-01



    # 线上rabbitmq-seckill 监控

    - job_name: 'prod-rabbitmq02'

      static_configs:

        - targets: ['10.0.x.x:19099']

          labels:

            instance: prod-rabbitmq-02
    scrape_configs:

    # 线上rabbitmq 监控

    - job_name: 'prod-rabbitmq01'

      honor_timestamps: true

      scrape_interval: 30s

      scrape_timeout: 10s

      metrics_path: /metrics

      scheme: http

      static_configs:

        - targets: ['10.x.x.47:9099']

          labels:

            instance: prod-rabbitmq-01



    # 线上rabbitmq-seckill 监控

    - job_name: 'prod-rabbitmq02'

      static_configs:

        - targets: ['10.0.x.x:19099']

          labels:

            instance: prod-rabbitmq-02

4、添加监控面板

5、添加告警Alert Rule

https://chenzhonzhou.github.io/2020/04/02/prometheusalert-duo-qu-dao-gao-jing-tong-zhi-gong-ju/

编辑prometheus-rule文件添加(注意:规则表达式中的job标签值需与scrape_configs中配置的job_name保持一致,否则规则匹配不到任何数据)

 #mq 节点状态

      - alert: rabbitmq-status

        expr: rabbitmq_running{job="prod-rabbitmq"} != 1

        for: 10s

        labels:

          severity: 严重

          app: ops

        annotations:

          summary: "rabbitmq Instance down"

          description: "生产rabbitmq:{{ $labels.node }} 节点宕机 (当前值: {{ $value }})"



      #mq节点内存使用

      - alert: rabbitmq-node_mem_used

        expr: rabbitmq_node_mem_used_re{job="prod-rabbitmq"} / rabbitmq_node_mem_limit_re{job="prod-rabbitmq"}  > 0.8
        for: 10s

        labels:

          severity: 严重

          app: ops

        annotations:

          summary: "rabbitmq pod节点内存使用"

          description: "rabbitmq:{{ $labels.node }} 内存实际使用超80% (当前值: {{ $value }})"
          
          
          
  #mq堆积消息监控

      - alert: rabbitmq_queue_messages_unack

        expr: sum by (queue ,job)(rabbitmq_queue_messages_unacknowledged{job="prod-rabbitmq"}) > 500
        for: 5m

        labels:

          severity: critical

          app: ops

        annotations:

          summary: "{{ $labels.queue }} MQ队列unack消息> 500 "

          description: "{{ $labels.queue }} MQ队列unack消息> 500 (当前值: {{ $value }})"
           #MQ推送错误监控

      - alert: PushMqFailed

        expr:  sum by (job) (rabbitmq_failed_to_publish_total{job != ""}) >5

        for: 30s

        labels:

          severity: critical

          app: ops

        annotations:

          summary: "{{ $labels.job }} {{ $labels.instance }} 推送MQ 错误大于5 "

          description: "{{ $labels.job }} {{ $labels.instance }} 推送MQ 错误大于5 (当前值: {{ $value }})"
 
  #mq队列消息监控

      - alert: rabbitmq_queue_messages_ready

        expr: sum by(queue ,job) (rabbitmq_queue_messages_ready_re{job="prod-rabbitmq"}) > 500
   
        for: 5m

        labels:

          severity: critical

          app: ops

        annotations:

          summary: "{{ $labels.queue }} MQ队列ready消息> 500 "

          description: "{{ $labels.queue }} MQ队列ready消息数> 500 持续5min (当前值: {{ $value }})"
 #mq 节点状态

      - alert: rabbitmq-status

        expr: rabbitmq_running{job="prod-rabbitmq"} != 1

        for: 10s

        labels:

          severity: 严重

          app: ops

        annotations:

          summary: "rabbitmq Instance down"

          description: "生产rabbitmq:{{ $labels.node }} 节点宕机 (当前值: {{ $value }})"



      #mq节点内存使用

      - alert: rabbitmq-node_mem_used

        expr: rabbitmq_node_mem_used_re{job="prod-rabbitmq"} / rabbitmq_node_mem_limit_re{job="prod-rabbitmq"}  > 0.8
        for: 10s

        labels:

          severity: 严重

          app: ops

        annotations:

          summary: "rabbitmq pod节点内存使用"

          description: "rabbitmq:{{ $labels.node }} 内存实际使用超80% (当前值: {{ $value }})"
          
          
          
  #mq堆积消息监控

      - alert: rabbitmq_queue_messages_unack

        expr: sum by (queue ,job)(rabbitmq_queue_messages_unacknowledged{job="prod-rabbitmq"}) > 500
        for: 5m

        labels:

          severity: critical

          app: ops

        annotations:

          summary: "{{ $labels.queue }} MQ队列unack消息> 500 "

          description: "{{ $labels.queue }} MQ队列unack消息> 500 (当前值: {{ $value }})"
           #MQ推送错误监控

      - alert: PushMqFailed

        expr:  sum by (job) (rabbitmq_failed_to_publish_total{job != ""}) >5

        for: 30s

        labels:

          severity: critical

          app: ops

        annotations:

          summary: "{{ $labels.job }} {{ $labels.instance }} 推送MQ 错误大于5 "

          description: "{{ $labels.job }} {{ $labels.instance }} 推送MQ 错误大于5 (当前值: {{ $value }})"
 
  #mq队列消息监控

      - alert: rabbitmq_queue_messages_ready

        expr: sum by(queue ,job) (rabbitmq_queue_messages_ready_re{job="prod-rabbitmq"}) > 500
   
        for: 5m

        labels:

          severity: critical

          app: ops

        annotations:

          summary: "{{ $labels.queue }} MQ队列ready消息> 500 "

          description: "{{ $labels.queue }} MQ队列ready消息数> 500 持续5min (当前值: {{ $value }})"