1. 基于Prometheus的全方位监控平台
1.1 部署配置
使用的主要技术栈如下:
名字 | 含义 |
---|---|
Prometheus | 监控主服务 |
node-exporter | 节点(主机)指标采集器 |
kube-state-metrics | Kubernetes资源对象状态指标采集器 |
metrics-server | 集群资源用量(CPU/内存)指标聚合器 |
Consul | 自动发现 |
blackbox | 黑盒拨测 |
Alertmanager | 监控告警服务 |
Grafana | 数据展示服务 |
prometheusAlert | 告警消息转发服务 |
1.2 安装
1.2.1 部署顺序
部署对外可访问Prometheus:
- 首先需要创建Prometheus所在命名空间;
- 然后创建Prometheus使用的RBAC规则;
- 创建Prometheus的configmap来保存配置文件和告警规则;
- 创建service暴露Prometheus服务;
- 创建deployment部署Prometheus容器;
- 最后创建Ingress实现外部域名访问Prometheus。
1.2.2 创建命名空间
bash
# Create the namespace that holds every resource deployed in this guide.
[root@kube-master Prometheus]# kubectl create namespace monitor
namespace/monitor created
1.2.3 创建RBAC规则
创建1.prometheus-rbac.yaml
yaml
# ServiceAccount that the Prometheus pod runs as.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: monitor
---
# Read-only permissions used for Kubernetes service discovery and scraping.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  - apiGroups: [""]
    resources: ["nodes", "nodes/proxy", "services", "endpoints", "pods"]
    verbs: ["get", "list", "watch"]
  # RBAC resource names are plural ("ingresses"). Ingress objects are served
  # from networking.k8s.io; "extensions" is kept only for older clusters.
  - apiGroups: ["extensions", "networking.k8s.io"]
    resources: ["ingresses"]
    verbs: ["get", "list", "watch"]
  - nonResourceURLs: ["/metrics"]
    verbs: ["get"]
---
# Grant the ServiceAccount the ClusterRole defined above.
# Binding to cluster-admin would give Prometheus full control of the cluster
# and leave the "prometheus" ClusterRole unused.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
  - kind: ServiceAccount
    name: prometheus
    namespace: monitor
bash
# Create the ServiceAccount, ClusterRole and ClusterRoleBinding in one step.
[root@kube-master prometheus]# kubectl apply -f 1.prometheus-rbac.yaml
serviceaccount/prometheus created
clusterrole.rbac.authorization.k8s.io/prometheus created
clusterrolebinding.rbac.authorization.k8s.io/prometheus created
- 查看
bash
# Confirm that each of the three RBAC objects exists.
kubectl get sa prometheus -n monitor
kubectl get clusterrole prometheus
kubectl get clusterrolebinding prometheus
1.2.4 创建config
yaml
# ConfigMap that carries the main Prometheus configuration file.
# The Deployment mounts it so that prometheus.yml appears under /etc/prometheus.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: monitor
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      evaluation_interval: 15s
      external_labels:
        cluster: "kubernetes"
    ############ 数据采集job ###################
    scrape_configs:
      - job_name: prometheus
        static_configs:
          - targets: ['127.0.0.1:9090']
            labels:
              instance: prometheus
    ############ 指定告警规则文件路径位置 ###################
    rule_files:
      - /etc/prometheus/rules/*.rules
bash
# Create (or update) the ConfigMap that carries prometheus.yml.
kubectl apply -f 2.prometheus-config.yaml
# Verify that the ConfigMap exists.
kubectl get cm prometheus-config -n monitor
1.2.5 创建ConfigMap类型的rules
yaml
# ConfigMap that carries the alerting rule files.
# Each key ends in ".rules" so that it matches the rule_files glob
# /etc/prometheus/rules/*.rules once mounted into the Prometheus container.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-rules
  namespace: monitor
data:
  # InstanceDown waits 1m before firing so that the alert agrees with its own
  # description text ("stopped for more than 1 minute"); a hold time shorter
  # than the rule evaluation interval would add no real debounce.
  general.rules: |
    groups:
      - name: general.rules
        rules:
          - alert: InstanceDown
            expr: |
              up{job=~"other-ECS|k8s-nodes|prometheus"} == 0
            for: 1m
            labels:
              severity: critical
            annotations:
              summary: "Instance {{ $labels.instance }} 停止工作"
              description: "{{ $labels.instance }} 主机名:{{ $labels.hostname }} 已经停止1分钟以上."
  # Fires when a filesystem has been more than 85% full for one minute.
  node.rules: |
    groups:
      - name: node.rules
        rules:
          - alert: NodeFilesystemUsage
            expr: |
              100 - (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 > 85
            for: 1m
            labels:
              severity: warning
            annotations:
              summary: "Instance {{ $labels.instance }} : {{ $labels.mountpoint }} 分区使用率过高"
              description: "{{ $labels.instance }} 主机名:{{ $labels.hostname }} : {{ $labels.mountpoint }} 分区使用大于85% (当前值: {{ $value }})"
bash
# Create (or update) the ConfigMap that carries the alerting rules.
kubectl apply -f 3.prometheus-rules.yaml
# Verify that the ConfigMap exists.
kubectl get cm -n monitor prometheus-rules
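1.2.6 创建prometheus svc
创建4.prometheus-svc.yaml:需要在 monitor 命名空间下定义名为 prometheus 的 Service,通过标签 k8s-app: prometheus 选择 Prometheus 的 Pod,并暴露 9090 端口(后面的 Ingress 会按这个 Service 名称和端口转发流量)。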
bash
# Create the Service that fronts the Prometheus pod.
kubectl apply -f 4.prometheus-svc.yaml
# List the Services in the namespace to confirm it exists.
kubectl get svc -n monitor
1.2.7 创建prometheus deploy
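使用NFS提供的StorageClass来做数据存储。下面的 Deployment 引用了名为 prometheus-data-pvc 的 PVC,部署前需要先在 monitor 命名空间中基于该 StorageClass 创建这个 PVC,否则 Pod 会一直处于 Pending 状态。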
yaml
# Single-replica Prometheus Deployment with a config-reload sidecar.
# Requires the ServiceAccount "prometheus", the ConfigMaps "prometheus-config"
# and "prometheus-rules", and the PVC "prometheus-data-pvc" in this namespace.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: monitor
  labels:
    k8s-app: prometheus
spec:
  replicas: 1
  selector:
    matchLabels:
      k8s-app: prometheus
  template:
    metadata:
      labels:
        k8s-app: prometheus
    spec:
      serviceAccountName: prometheus
      containers:
        # Main container: runs the Prometheus server.
        - name: prometheus
          image: prom/prometheus:v2.36.0
          imagePullPolicy: IfNotPresent
          ports:
            - name: http
              containerPort: 9090
          securityContext:
            runAsUser: 65534
            # NOTE(review): Prometheus does not normally need a privileged
            # container; confirm whether this can be dropped.
            privileged: true
          command:
            - "/bin/prometheus"
          args:
            - "--config.file=/etc/prometheus/prometheus.yml"
            # Enables the /-/reload endpoint that the sidecar calls.
            - "--web.enable-lifecycle"
            - "--storage.tsdb.path=/prometheus"
            - "--storage.tsdb.retention.time=10d"
            # The ConfigMap mounted at /etc/prometheus hides the image's
            # console links there, so point at the real directories.
            - "--web.console.libraries=/usr/share/prometheus/console_libraries"
            - "--web.console.templates=/usr/share/prometheus/consoles"
          resources:
            limits:
              cpu: 2000m
              memory: 2048Mi
            requests:
              cpu: 1000m
              memory: 512Mi
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9090
            initialDelaySeconds: 5
            timeoutSeconds: 10
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: 9090
            initialDelaySeconds: 30
            timeoutSeconds: 30
          volumeMounts:
            - name: data
              mountPath: /prometheus
              subPath: prometheus
            - name: config
              mountPath: /etc/prometheus
            - name: prometheus-rules
              mountPath: /etc/prometheus/rules
        # Sidecar: watches the mounted ConfigMaps and asks Prometheus to reload
        # when their content changes.
        - name: configmap-reload
          image: jimmidyson/configmap-reload:v0.5.0
          imagePullPolicy: IfNotPresent
          args:
            - "--volume-dir=/etc/config"
            # Also watch the rules ConfigMap; otherwise edits to alerting
            # rules never trigger a reload.
            - "--volume-dir=/etc/rules"
            - "--webhook-url=http://localhost:9090/-/reload"
          resources:
            limits:
              cpu: 100m
              memory: 100Mi
            requests:
              cpu: 10m
              memory: 10Mi
          volumeMounts:
            - name: config
              mountPath: /etc/config
              readOnly: true
            - name: prometheus-rules
              mountPath: /etc/rules
              readOnly: true
      volumes:
        - name: data
          persistentVolumeClaim:
            claimName: prometheus-data-pvc
        - name: prometheus-rules
          configMap:
            name: prometheus-rules
        - name: config
          configMap:
            name: prometheus-config
bash
# Check that the Deployment is available and that its pod is Running.
# NOTE(review): the command that applies the Deployment manifest is not shown
# in this guide; apply that manifest before running these checks.
kubectl get deploy -n monitor
kubectl get pods -n monitor
- 参数解释
Deployment 资源文件中的 containers 部分配置了两个容器,分别是:
- prometheus: Prometheus 容器是主容器,用于运行 Prometheus 进程
- configmap-reload:用于监听指定的 ConfigMap 文件中的内容,如果内容发生更改,则向 webhook-url 指定的地址发起请求。因为 Prometheus 支持通过接口重新加载配置文件,所以这里借助该容器提供的机制,在 Prometheus 的 ConfigMap 配置文件内容发生更改时,自动调用 Prometheus 的 /-/reload 接口来更新配置。
Prometheus 参数说明
--web.enable-lifecycle: 启用 Prometheus 用于重新加载配置的 /-/reload 接口
--config.file: 指定 Prometheus 配置文件所在地址,这个地址是相对于容器内部而言的
--storage.tsdb.path: 指定 Prometheus 数据存储目录地址,这个地址是相对于容器而言的
--storage.tsdb.retention.time: 指定删除旧数据的时间,默认为 15d
--web.console.libraries: 指定控制台组件依赖的存储路径
--web.console.templates: 指定控制台模板的存储路径
1.2.8 创建ingress对外提供访问地址
创建7.prometheus-ing.yaml
yaml
# Ingress that publishes the Prometheus web UI under an external host name.
# All paths on the host are routed to the Service "prometheus" on port 9090.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: prometheus-ingress
  namespace: monitor
spec:
  ingressClassName: nginx
  rules:
    - host: prometheus.ikubernetes.net
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: prometheus
                port:
                  number: 9090
bash
# Create the Ingress.
kubectl apply -f 7.prometheus-ing.yaml
# Verify that it exists and has been assigned an address.
kubectl get ingress -n monitor
NAME CLASS HOSTS ADDRESS PORTS AGE
prometheus-ingress nginx prometheus.ikubernetes.net 10.103.236.70 80 9m35s
- 访问验证
在浏览器中访问 http://prometheus.ikubernetes.net (需要先将该域名解析到 Ingress 的地址,例如在本机 hosts 文件中添加对应记录)。