Skip to content

1. 基于Prometheus的全方位监控平台

1.1 部署配置

用的主要技术栈如下:

名字含义
Prometheus监控主服务
node-exporter数据采集器
kube-state-metrics数据采集器
metrics-server数据采集器
Consul自动发现
blackbox黑盒拨测
Alertmanager监控告警服务
Grafana数据展示服务
prometheusAlert告警消息转发服务

1.2 安装

1.2.1 部署顺序

image-20240626142101988

部署对外可访问Prometheus:

  1. 首先需要创建Prometheus所在命名空间;
  2. 然后创建Prometheus使用的RBAC规则;
  3. 创建Prometheus的configmap来保存配置文件;
  4. 创建service暴露Prometheus服务;
  5. 创建deployment部署Prometheus容器;
  6. 最后创建Ingress实现外部域名访问Prometheus。

1.2.2 创建命名空间

bash
[root@kube-master Prometheus]# kubectl create namespace monitor
namespace/monitor created
[root@kube-master Prometheus]# kubectl create namespace monitor
namespace/monitor created

1.2.3 创建RBAC规则

创建1.prometheus-rbac.yaml

yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: monitor
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
- apiGroups: [""]
  resources: ["nodes","nodes/proxy","services","endpoints","pods"]
  verbs: ["get", "list", "watch"]
- apiGroups: ["extensions"]
  resources: ["ingress"]
  verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics"]
  verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: cluster-admin
subjects:
- kind: ServiceAccount
  name: prometheus
  namespace: monitor
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: monitor
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
- apiGroups: [""]
  resources: ["nodes","nodes/proxy","services","endpoints","pods"]
  verbs: ["get", "list", "watch"]
- apiGroups: ["extensions"]
  resources: ["ingress"]
  verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics"]
  verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: cluster-admin
subjects:
- kind: ServiceAccount
  name: prometheus
  namespace: monitor
bash
[root@kube-master prometheus]# kubectl apply -f 1.prometheus-rbac.yaml
serviceaccount/prometheus created
clusterrole.rbac.authorization.k8s.io/prometheus created
clusterrolebinding.rbac.authorization.k8s.io/prometheus created
[root@kube-master prometheus]# kubectl apply -f 1.prometheus-rbac.yaml
serviceaccount/prometheus created
clusterrole.rbac.authorization.k8s.io/prometheus created
clusterrolebinding.rbac.authorization.k8s.io/prometheus created
  • 查看
bash
kubectl get sa prometheus -n monitor

kubectl get clusterrole prometheus

kubectl get clusterrolebinding prometheus
kubectl get sa prometheus -n monitor

kubectl get clusterrole prometheus

kubectl get clusterrolebinding prometheus

1.2.4 创建config

yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: monitor
data:
  prometheus.yml: |
    global:
      scrape_interval:     15s
      evaluation_interval: 15s
      external_labels:
        cluster: "kubernetes"


    ############ 数据采集job ###################
    scrape_configs:
    - job_name: prometheus
      static_configs:
      - targets: ['127.0.0.1:9090']
        labels:
          instance: prometheus

    ############ 指定告警规则文件路径位置 ###################
    rule_files:
    - /etc/prometheus/rules/*.rules
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: monitor
data:
  prometheus.yml: |
    global:
      scrape_interval:     15s
      evaluation_interval: 15s
      external_labels:
        cluster: "kubernetes"


    ############ 数据采集job ###################
    scrape_configs:
    - job_name: prometheus
      static_configs:
      - targets: ['127.0.0.1:9090']
        labels:
          instance: prometheus

    ############ 指定告警规则文件路径位置 ###################
    rule_files:
    - /etc/prometheus/rules/*.rules
bash
kubectl apply -f 2.prometheus-config.yaml

#查看
kubectl get cm prometheus-config -n monitor
kubectl apply -f 2.prometheus-config.yaml

#查看
kubectl get cm prometheus-config -n monitor

1.2.5 创建ConfigMap类型的rules

yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-rules
  namespace: monitor
data:
  general.rules: |
    groups:
    - name: general.rules
      rules:
      - alert: InstanceDown
        expr: |
          up{job=~"other-ECS|k8s-nodes|prometheus"} == 0
        for: 5s
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} 停止工作"
          description: "{{ $labels.instance }} 主机名:{{ $labels.hostname }} 已经停止1分钟以上."

  node.rules: |
    groups:
    - name: node.rules
      rules:
      - alert: NodeFilesystemUsage
        expr: |
          100 - (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 > 85
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} : {{ $labels.mountpoint }} 分区使用率过高"
          description: "{{ $labels.instance }} 主机名:{{ $labels.hostname }} : {{ $labels.mountpoint }} 分区使用大于85% (当前值: {{ $value }})"
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-rules
  namespace: monitor
data:
  general.rules: |
    groups:
    - name: general.rules
      rules:
      - alert: InstanceDown
        expr: |
          up{job=~"other-ECS|k8s-nodes|prometheus"} == 0
        for: 5s
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} 停止工作"
          description: "{{ $labels.instance }} 主机名:{{ $labels.hostname }} 已经停止1分钟以上."

  node.rules: |
    groups:
    - name: node.rules
      rules:
      - alert: NodeFilesystemUsage
        expr: |
          100 - (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 > 85
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} : {{ $labels.mountpoint }} 分区使用率过高"
          description: "{{ $labels.instance }} 主机名:{{ $labels.hostname }} : {{ $labels.mountpoint }} 分区使用大于85% (当前值: {{ $value }})"
bash
kubectl apply -f 3.prometheus-rules.yaml

#查看
kubectl get cm -n monitor prometheus-rules
kubectl apply -f 3.prometheus-rules.yaml

#查看
kubectl get cm -n monitor prometheus-rules

1.2.6 创建prometheus svc

bash
kubectl apply -f 4.prometheus-svc.yaml

#查看
kubectl get svc -n monitor
kubectl apply -f 4.prometheus-svc.yaml

#查看
kubectl get svc -n monitor

1.2.7 创建prometheus deploy

使用NFS提供的StorageClass来做数据存储

yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: monitor
  labels:
    k8s-app: prometheus
spec:
  replicas: 1
  selector:
    matchLabels:
      k8s-app: prometheus
  template:
    metadata:
      labels:
        k8s-app: prometheus
    spec:
      serviceAccountName: prometheus
      containers:
      - name: prometheus
        image: prom/prometheus:v2.36.0
        imagePullPolicy: IfNotPresent
        ports:
        - name: http
          containerPort: 9090
        securityContext:
          runAsUser: 65534
          privileged: true
        command:
        - "/bin/prometheus"
        args:
        - "--config.file=/etc/prometheus/prometheus.yml"
        - "--web.enable-lifecycle"
        - "--storage.tsdb.path=/prometheus"
        - "--storage.tsdb.retention.time=10d"
        - "--web.console.libraries=/etc/prometheus/console_libraries"
        - "--web.console.templates=/etc/prometheus/consoles"
        resources:
          limits:
            cpu: 2000m
            memory: 2048Mi
          requests:
            cpu: 1000m
            memory: 512Mi
        readinessProbe:
          httpGet:
            path: /-/ready
            port: 9090
          initialDelaySeconds: 5
          timeoutSeconds: 10
        livenessProbe:
          httpGet:
            path: /-/healthy
            port: 9090
          initialDelaySeconds: 30
          timeoutSeconds: 30
        volumeMounts:
        - name: data
          mountPath: /prometheus
          subPath: prometheus
        - name: config
          mountPath: /etc/prometheus
        - name: prometheus-rules
          mountPath: /etc/prometheus/rules
      - name: configmap-reload
        image: jimmidyson/configmap-reload:v0.5.0
        imagePullPolicy: IfNotPresent
        args:
        - "--volume-dir=/etc/config"
        - "--webhook-url=http://localhost:9090/-/reload"
        resources:
          limits:
            cpu: 100m
            memory: 100Mi
          requests:
            cpu: 10m
            memory: 10Mi
        volumeMounts:
        - name: config
          mountPath: /etc/config
          readOnly: true
      volumes:
      - name: data
        persistentVolumeClaim:
          claimName: prometheus-data-pvc
      - name: prometheus-rules
        configMap:
          name: prometheus-rules
      - name: config
        configMap:
          name: prometheus-config
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: monitor
  labels:
    k8s-app: prometheus
spec:
  replicas: 1
  selector:
    matchLabels:
      k8s-app: prometheus
  template:
    metadata:
      labels:
        k8s-app: prometheus
    spec:
      serviceAccountName: prometheus
      containers:
      - name: prometheus
        image: prom/prometheus:v2.36.0
        imagePullPolicy: IfNotPresent
        ports:
        - name: http
          containerPort: 9090
        securityContext:
          runAsUser: 65534
          privileged: true
        command:
        - "/bin/prometheus"
        args:
        - "--config.file=/etc/prometheus/prometheus.yml"
        - "--web.enable-lifecycle"
        - "--storage.tsdb.path=/prometheus"
        - "--storage.tsdb.retention.time=10d"
        - "--web.console.libraries=/etc/prometheus/console_libraries"
        - "--web.console.templates=/etc/prometheus/consoles"
        resources:
          limits:
            cpu: 2000m
            memory: 2048Mi
          requests:
            cpu: 1000m
            memory: 512Mi
        readinessProbe:
          httpGet:
            path: /-/ready
            port: 9090
          initialDelaySeconds: 5
          timeoutSeconds: 10
        livenessProbe:
          httpGet:
            path: /-/healthy
            port: 9090
          initialDelaySeconds: 30
          timeoutSeconds: 30
        volumeMounts:
        - name: data
          mountPath: /prometheus
          subPath: prometheus
        - name: config
          mountPath: /etc/prometheus
        - name: prometheus-rules
          mountPath: /etc/prometheus/rules
      - name: configmap-reload
        image: jimmidyson/configmap-reload:v0.5.0
        imagePullPolicy: IfNotPresent
        args:
        - "--volume-dir=/etc/config"
        - "--webhook-url=http://localhost:9090/-/reload"
        resources:
          limits:
            cpu: 100m
            memory: 100Mi
          requests:
            cpu: 10m
            memory: 10Mi
        volumeMounts:
        - name: config
          mountPath: /etc/config
          readOnly: true
      volumes:
      - name: data
        persistentVolumeClaim:
          claimName: prometheus-data-pvc
      - name: prometheus-rules
        configMap:
          name: prometheus-rules
      - name: config
        configMap:
          name: prometheus-config
bash
#查看
kubectl get deploy -n monitor

kubectl get pods -n monitor
#查看
kubectl get deploy -n monitor

kubectl get pods -n monitor
  • 参数解释

Deployment 资源文件中的 containers 部分配置了两个容器,分别是:

  • prometheus: Prometheus 容器是主容器,用于运行 Prometheus 进程
  • configmap-reload:用于监听指定的ConfigMap文件中的内容,如果内容发生更改,则执行webhookurl请求因为Prometheus支持通过接口重新加载配置文件,所以这里使用这个容器提供的机制来完成PrometheusConfigMap配置文件内容一有更改,就执行Prometheus的/-/reload接口,进行更新配置操作。

Prometheus 参数说明

--web.enable-lifecycle: 启用 Prometheus 用于重新加载配置的 /-/reload 接口

--config.file: 指定 Prometheus 配置文件所在地址,这个地址是相对于容器内部而言的

--storage.tsdb.path: 指定 Prometheus 数据存储目录地址,这个地址是相对于容器而言的

--storage.tsdb.retention.time: 指定删除旧数据的时间,默认为 15d

--web.console.libraries: 指定控制台组件依赖的存储路径

--web.console.templates: 指定控制台模板的存储路径

1.2.8 创建ingress对外提供访问地址

创建7.prometheus-ing.yaml

yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  namespace: monitor
  name: prometheus-ingress
spec:
  ingressClassName: nginx
  rules:
  - host: prometheus.ikubernetes.net
    http:
      paths:
        - pathType: Prefix
          backend:
            service:
              name: prometheus
              port:
                number: 9090
          path: /
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  namespace: monitor
  name: prometheus-ingress
spec:
  ingressClassName: nginx
  rules:
  - host: prometheus.ikubernetes.net
    http:
      paths:
        - pathType: Prefix
          backend:
            service:
              name: prometheus
              port:
                number: 9090
          path: /
bash
kubectl apply -f 7.prometheus-ing.yaml

#查看
kubectl get ingress -n monitor
NAME                 CLASS   HOSTS                        ADDRESS         PORTS   AGE
prometheus-ingress   nginx   prometheus.ikubernetes.net   10.103.236.70   80      9m35s
kubectl apply -f 7.prometheus-ing.yaml

#查看
kubectl get ingress -n monitor
NAME                 CLASS   HOSTS                        ADDRESS         PORTS   AGE
prometheus-ingress   nginx   prometheus.ikubernetes.net   10.103.236.70   80      9m35s
  • 访问验证

浏览器进行访问, prometheus.ikubernetes.net

image-20240626172317936