1. postgres_exporter
1.1 介绍
Prometheus exporter for PostgreSQL server metrics.
CI Tested PostgreSQL versions: 11
, 12
, 13
, 14
, 15
, 16
1.2 配置pg
- 创建帐户
sql
CREATE USER monitoring WITH PASSWORD 'hello123456'; #线上修改复杂点
CREATE USER monitoring WITH PASSWORD 'hello123456'; #线上修改复杂点
- 授权
sql
GRANT pg_read_all_stats to monitoring;
GRANT pg_read_all_stats to monitoring;
- 远程访问
bash
vim pg_hba.conf
host all all 0.0.0.0/0 password
#重启服务或者热更新
vim pg_hba.conf
host all all 0.0.0.0/0 password
#重启服务或者热更新
- pg_stat_statements
bash
#创建
psql -h localhost -p 5432 -U postgres -W -c "CREATE EXTENSION pg_stat_statements;"
#查看
psql -h localhost -p 5432 -U postgres -W -c "select name from pg_available_extensions where name = 'pg_stat_statements';"
#创建
psql -h localhost -p 5432 -U postgres -W -c "CREATE EXTENSION pg_stat_statements;"
#查看
psql -h localhost -p 5432 -U postgres -W -c "select name from pg_available_extensions where name = 'pg_stat_statements';"
1.3 部署
- 下载
bash
wget https://github.com/prometheus-community/postgres_exporter/releases/download/v0.16.0/postgres_exporter-0.16.0.linux-amd64.tar.gz
wget https://github.com/prometheus-community/postgres_exporter/releases/download/v0.16.0/postgres_exporter-0.16.0.linux-amd64.tar.gz
- 安装
由于是二进制包,解压
bash
tar -zxvf postgres_exporter-0.16.0.linux-amd64.tar.gz -C /opt
cd /opt/
mv postgres_exporter-0.16.0.linux-amd64 /opt/postgres_exporter
tar -zxvf postgres_exporter-0.16.0.linux-amd64.tar.gz -C /opt
cd /opt/
mv postgres_exporter-0.16.0.linux-amd64 /opt/postgres_exporter
- 配置开机启动
queries.yaml 文件没有用到
cat /opt/postgres_exporter/.env
PG_EXPORTER_EXTEND_QUERY_PATH="/opt/postgres_exporter/queries.yaml"
DATA_SOURCE_NAME=postgresql://monitoring:hello123456@localhost:5532/postgres?sslmode=disable
cat /opt/postgres_exporter/.env
PG_EXPORTER_EXTEND_QUERY_PATH="/opt/postgres_exporter/queries.yaml"
DATA_SOURCE_NAME=postgresql://monitoring:hello123456@localhost:5532/postgres?sslmode=disable
- queries.yaml
yaml
pg_replication:
query: "SELECT CASE WHEN NOT pg_is_in_recovery() THEN 0 ELSE GREATEST (0, EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))) END AS lag"
master: true
metrics:
- lag:
usage: "GAUGE"
description: "Replication lag behind master in seconds"
pg_postmaster:
query: "SELECT pg_postmaster_start_time as start_time_seconds from pg_postmaster_start_time()"
master: true
metrics:
- start_time_seconds:
usage: "GAUGE"
description: "Time at which postmaster started"
pg_database:
query: "SELECT sum(pg_database_size(pg_database.datname)) as size FROM pg_database"
master: true
metrics:
- size:
usage: "GAUGE"
description: "DB Size"
pg_stat_user_tables:
query: "SELECT
current_database() datname,
schemaname,
relname,
seq_scan,
seq_tup_read,
idx_scan,
idx_tup_fetch,
n_tup_ins,
n_tup_upd,
n_tup_del,
n_tup_hot_upd,
n_live_tup,
n_dead_tup,
n_mod_since_analyze,
COALESCE(last_vacuum, '1970-01-01Z') as last_vacuum,
COALESCE(last_autovacuum, '1970-01-01Z') as last_autovacuum,
COALESCE(last_analyze, '1970-01-01Z') as last_analyze,
COALESCE(last_autoanalyze, '1970-01-01Z') as last_autoanalyze,
vacuum_count,
autovacuum_count,
analyze_count,
autoanalyze_count
FROM
pg_stat_user_tables"
metrics:
- datname:
usage: "LABEL"
description: "Name of current database"
- schemaname:
usage: "LABEL"
description: "Name of the schema that this table is in"
- relname:
usage: "LABEL"
description: "Name of this table"
- seq_scan:
usage: "COUNTER"
description: "Number of sequential scans initiated on this table"
- seq_tup_read:
usage: "COUNTER"
description: "Number of live rows fetched by sequential scans"
- idx_scan:
usage: "COUNTER"
description: "Number of index scans initiated on this table"
- idx_tup_fetch:
usage: "COUNTER"
description: "Number of live rows fetched by index scans"
- n_tup_ins:
usage: "COUNTER"
description: "Number of rows inserted"
- n_tup_upd:
usage: "COUNTER"
description: "Number of rows updated"
- n_tup_del:
usage: "COUNTER"
description: "Number of rows deleted"
- n_tup_hot_upd:
usage: "COUNTER"
description: "Number of rows HOT updated (i.e., with no separate index update required)"
- n_live_tup:
usage: "GAUGE"
description: "Estimated number of live rows"
- n_dead_tup:
usage: "GAUGE"
description: "Estimated number of dead rows"
- n_mod_since_analyze:
usage: "GAUGE"
description: "Estimated number of rows changed since last analyze"
- last_vacuum:
usage: "GAUGE"
description: "Last time at which this table was manually vacuumed (not counting VACUUM FULL)"
- last_autovacuum:
usage: "GAUGE"
description: "Last time at which this table was vacuumed by the autovacuum daemon"
- last_analyze:
usage: "GAUGE"
description: "Last time at which this table was manually analyzed"
- last_autoanalyze:
usage: "GAUGE"
description: "Last time at which this table was analyzed by the autovacuum daemon"
- vacuum_count:
usage: "COUNTER"
description: "Number of times this table has been manually vacuumed (not counting VACUUM FULL)"
- autovacuum_count:
usage: "COUNTER"
description: "Number of times this table has been vacuumed by the autovacuum daemon"
- analyze_count:
usage: "COUNTER"
description: "Number of times this table has been manually analyzed"
- autoanalyze_count:
usage: "COUNTER"
description: "Number of times this table has been analyzed by the autovacuum daemon"
pg_statio_user_tables:
query: "SELECT current_database() datname, schemaname, relname, heap_blks_read, heap_blks_hit, idx_blks_read, idx_blks_hit, toast_blks_read, toast_blks_hit, tidx_blks_read, tidx_blks_hit FROM pg_statio_user_tables"
metrics:
- datname:
usage: "LABEL"
description: "Name of current database"
- schemaname:
usage: "LABEL"
description: "Name of the schema that this table is in"
- relname:
usage: "LABEL"
description: "Name of this table"
- heap_blks_read:
usage: "COUNTER"
description: "Number of disk blocks read from this table"
- heap_blks_hit:
usage: "COUNTER"
description: "Number of buffer hits in this table"
- idx_blks_read:
usage: "COUNTER"
description: "Number of disk blocks read from all indexes on this table"
- idx_blks_hit:
usage: "COUNTER"
description: "Number of buffer hits in all indexes on this table"
- toast_blks_read:
usage: "COUNTER"
description: "Number of disk blocks read from this table's TOAST table (if any)"
- toast_blks_hit:
usage: "COUNTER"
description: "Number of buffer hits in this table's TOAST table (if any)"
- tidx_blks_read:
usage: "COUNTER"
description: "Number of disk blocks read from this table's TOAST table indexes (if any)"
- tidx_blks_hit:
usage: "COUNTER"
description: "Number of buffer hits in this table's TOAST table indexes (if any)"
pg_stat_statements:
query: "select
t2.rolname,
t3.datname,
queryid,
calls,
(total_plan_time + total_exec_time) / 1000 as total_time_seconds,
(min_plan_time + min_exec_time) / 1000 as min_time_seconds,
(max_plan_time + max_exec_time) / 1000 as max_time_seconds,
(mean_plan_time + mean_exec_time) / 1000 as mean_time_seconds,
(stddev_plan_time + stddev_exec_time) / 1000 as stddev_time_seconds,
rows,
shared_blks_hit,
shared_blks_read,
shared_blks_dirtied,
shared_blks_written,
local_blks_hit,
local_blks_read,
local_blks_dirtied,
local_blks_written,
temp_blks_read,
temp_blks_written,
blk_read_time / 1000 as blk_read_time_seconds,
blk_write_time / 1000 as blk_write_time_seconds
from
pg_stat_statements t1
join pg_roles t2 on
(t1.userid = t2.oid)
join pg_database t3 on
(t1.dbid = t3.oid)
where
t2.rolname != 'rdsadmin'"
master: true
metrics:
- rolname:
usage: "LABEL"
description: "Name of user"
- datname:
usage: "LABEL"
description: "Name of database"
- queryid:
usage: "LABEL"
description: "Query ID"
- calls:
usage: "COUNTER"
description: "Number of times executed"
- total_time_seconds:
usage: "COUNTER"
description: "Total time spent in the statement, in milliseconds"
- min_time_seconds:
usage: "GAUGE"
description: "Minimum time spent in the statement, in milliseconds"
- max_time_seconds:
usage: "GAUGE"
description: "Maximum time spent in the statement, in milliseconds"
- mean_time_seconds:
usage: "GAUGE"
description: "Mean time spent in the statement, in milliseconds"
- stddev_time_seconds:
usage: "GAUGE"
description: "Population standard deviation of time spent in the statement, in milliseconds"
- rows:
usage: "COUNTER"
description: "Total number of rows retrieved or affected by the statement"
- shared_blks_hit:
usage: "COUNTER"
description: "Total number of shared block cache hits by the statement"
- shared_blks_read:
usage: "COUNTER"
description: "Total number of shared blocks read by the statement"
- shared_blks_dirtied:
usage: "COUNTER"
description: "Total number of shared blocks dirtied by the statement"
- shared_blks_written:
usage: "COUNTER"
description: "Total number of shared blocks written by the statement"
- local_blks_hit:
usage: "COUNTER"
description: "Total number of local block cache hits by the statement"
- local_blks_read:
usage: "COUNTER"
description: "Total number of local blocks read by the statement"
- local_blks_dirtied:
usage: "COUNTER"
description: "Total number of local blocks dirtied by the statement"
- local_blks_written:
usage: "COUNTER"
description: "Total number of local blocks written by the statement"
- temp_blks_read:
usage: "COUNTER"
description: "Total number of temp blocks read by the statement"
- temp_blks_written:
usage: "COUNTER"
description: "Total number of temp blocks written by the statement"
- blk_read_time_seconds:
usage: "COUNTER"
description: "Total time the statement spent reading blocks, in milliseconds (if track_io_timing is enabled, otherwise zero)"
- blk_write_time_seconds:
usage: "COUNTER"
description: "Total time the statement spent writing blocks, in milliseconds (if track_io_timing is enabled, otherwise zero)"
pg_process_idle:
query: |
WITH
metrics AS (
SELECT
application_name,
SUM(EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - state_change))::bigint)::float AS process_idle_seconds_sum,
COUNT(*) AS process_idle_seconds_count
FROM pg_stat_activity
WHERE state = 'idle'
GROUP BY application_name
),
buckets AS (
SELECT
application_name,
le,
SUM(
CASE WHEN EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - state_change)) <= le
THEN 1
ELSE 0
END
)::bigint AS bucket
FROM
pg_stat_activity,
UNNEST(ARRAY[1, 2, 5, 15, 30, 60, 90, 120, 300]) AS le
GROUP BY application_name, le
ORDER BY application_name, le
)
SELECT
application_name,
process_idle_seconds_sum as seconds_sum,
process_idle_seconds_count as seconds_count,
ARRAY_AGG(le) AS seconds,
ARRAY_AGG(bucket) AS seconds_bucket
FROM metrics JOIN buckets USING (application_name)
GROUP BY 1, 2, 3
metrics:
- application_name:
usage: "LABEL"
description: "Application Name"
- seconds:
usage: "HISTOGRAM"
description: "Idle time of server processes"
pg_active_lockedsql:
query: |
select case when replace(replace(pg_blocking_pids(pid)::text,'{',''),'}','')='' then 'numsofnopidblock' else 'numsofsomepidblock' end pidblock,
count(1) pidnums from pg_stat_activity
where state not in('idle') and query !='' group by pidblock order by pidblock;
metrics:
- pidblock:
usage: "LABEL"
description: "Possible values:numsofnopidblock--The processes that are not locked; numsofsomepidblock--The processes locked by some "
- pidnums:
usage: "COUNTER"
description: "The number of processes"
pg_active_slowsql:
query: |
select datname,usename,count(1) slowsql_count
from pg_stat_activity where state not in('idle') and query !=''
and extract(epoch from (now() - query_start)) > 60*5 group by datname,usename order by count(1) desc;
metrics:
- datname:
usage: "LABEL"
description: "Name of database"
- usename:
usage: "LABEL"
description: "Name of user"
- slowsql_count:
usage: "COUNTER"
description: "the numbers of slow sqls"
pg_never_used_indexes:
query: |
select pi.schemaname, pi.relname, pi.indexrelname,
pg_table_size(pi.indexrelid) as index_size from pg_indexes pis join
pg_stat_user_indexes pi on pis.schemaname = pi.schemaname
and pis.tablename = pi.relname and pis.indexname = pi.indexrelname
left join pg_constraint pco on pco.conname = pi.indexrelname
and pco.conrelid = pi.relid where pco.contype is distinct from 'p'
and pco.contype is distinct from 'u' and (idx_scan,idx_tup_read,idx_tup_fetch) = (0,0,0)
and pis.indexdef !~ ' UNIQUE INDEX ' and pi.relname !~ 'backup$'
order by pg_table_size(indexrelid) desc;
metrics:
- schemaname:
usage: "LABEL"
description: "Schema of table"
- relname:
usage: "LABEL"
description: "Name of table"
- indexrelname:
usage: "LABEL"
description: "Name of index"
- index_size:
usage: "GAUGE"
description: "Size of index"
pg_tablelocktops:
query: |
select db.datname,relname tbname,mode locktype,count(1) locknums
from pg_database db join pg_locks lk on db.oid=lk.database
join pg_class cl on lk.relation=cl.oid
join pg_stat_activity act on lk.pid=act.pid
where db.datname not in ('template0','template1') and fastpath='t'
and cl.oid not in (select oid from pg_class where relname in ('pg_class','pg_locks'))
and act.pid <>pg_backend_pid() and cl.reltablespace in (select oid from pg_tablespace)
group by db.datname,relname,mode order by count(1) desc limit 10;
metrics:
- datname:
usage: "LABEL"
description: "database of table"
- tbname:
usage: "LABEL"
description: "Name of table"
- locktype:
usage: "LABEL"
description: "type of lock"
- locknums:
usage: "COUNTER"
description: "the numbers of this lock"
pg_replication:
query: "SELECT CASE WHEN NOT pg_is_in_recovery() THEN 0 ELSE GREATEST (0, EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))) END AS lag"
master: true
metrics:
- lag:
usage: "GAUGE"
description: "Replication lag behind master in seconds"
pg_postmaster:
query: "SELECT pg_postmaster_start_time as start_time_seconds from pg_postmaster_start_time()"
master: true
metrics:
- start_time_seconds:
usage: "GAUGE"
description: "Time at which postmaster started"
pg_database:
query: "SELECT sum(pg_database_size(pg_database.datname)) as size FROM pg_database"
master: true
metrics:
- size:
usage: "GAUGE"
description: "DB Size"
pg_stat_user_tables:
query: "SELECT
current_database() datname,
schemaname,
relname,
seq_scan,
seq_tup_read,
idx_scan,
idx_tup_fetch,
n_tup_ins,
n_tup_upd,
n_tup_del,
n_tup_hot_upd,
n_live_tup,
n_dead_tup,
n_mod_since_analyze,
COALESCE(last_vacuum, '1970-01-01Z') as last_vacuum,
COALESCE(last_autovacuum, '1970-01-01Z') as last_autovacuum,
COALESCE(last_analyze, '1970-01-01Z') as last_analyze,
COALESCE(last_autoanalyze, '1970-01-01Z') as last_autoanalyze,
vacuum_count,
autovacuum_count,
analyze_count,
autoanalyze_count
FROM
pg_stat_user_tables"
metrics:
- datname:
usage: "LABEL"
description: "Name of current database"
- schemaname:
usage: "LABEL"
description: "Name of the schema that this table is in"
- relname:
usage: "LABEL"
description: "Name of this table"
- seq_scan:
usage: "COUNTER"
description: "Number of sequential scans initiated on this table"
- seq_tup_read:
usage: "COUNTER"
description: "Number of live rows fetched by sequential scans"
- idx_scan:
usage: "COUNTER"
description: "Number of index scans initiated on this table"
- idx_tup_fetch:
usage: "COUNTER"
description: "Number of live rows fetched by index scans"
- n_tup_ins:
usage: "COUNTER"
description: "Number of rows inserted"
- n_tup_upd:
usage: "COUNTER"
description: "Number of rows updated"
- n_tup_del:
usage: "COUNTER"
description: "Number of rows deleted"
- n_tup_hot_upd:
usage: "COUNTER"
description: "Number of rows HOT updated (i.e., with no separate index update required)"
- n_live_tup:
usage: "GAUGE"
description: "Estimated number of live rows"
- n_dead_tup:
usage: "GAUGE"
description: "Estimated number of dead rows"
- n_mod_since_analyze:
usage: "GAUGE"
description: "Estimated number of rows changed since last analyze"
- last_vacuum:
usage: "GAUGE"
description: "Last time at which this table was manually vacuumed (not counting VACUUM FULL)"
- last_autovacuum:
usage: "GAUGE"
description: "Last time at which this table was vacuumed by the autovacuum daemon"
- last_analyze:
usage: "GAUGE"
description: "Last time at which this table was manually analyzed"
- last_autoanalyze:
usage: "GAUGE"
description: "Last time at which this table was analyzed by the autovacuum daemon"
- vacuum_count:
usage: "COUNTER"
description: "Number of times this table has been manually vacuumed (not counting VACUUM FULL)"
- autovacuum_count:
usage: "COUNTER"
description: "Number of times this table has been vacuumed by the autovacuum daemon"
- analyze_count:
usage: "COUNTER"
description: "Number of times this table has been manually analyzed"
- autoanalyze_count:
usage: "COUNTER"
description: "Number of times this table has been analyzed by the autovacuum daemon"
pg_statio_user_tables:
query: "SELECT current_database() datname, schemaname, relname, heap_blks_read, heap_blks_hit, idx_blks_read, idx_blks_hit, toast_blks_read, toast_blks_hit, tidx_blks_read, tidx_blks_hit FROM pg_statio_user_tables"
metrics:
- datname:
usage: "LABEL"
description: "Name of current database"
- schemaname:
usage: "LABEL"
description: "Name of the schema that this table is in"
- relname:
usage: "LABEL"
description: "Name of this table"
- heap_blks_read:
usage: "COUNTER"
description: "Number of disk blocks read from this table"
- heap_blks_hit:
usage: "COUNTER"
description: "Number of buffer hits in this table"
- idx_blks_read:
usage: "COUNTER"
description: "Number of disk blocks read from all indexes on this table"
- idx_blks_hit:
usage: "COUNTER"
description: "Number of buffer hits in all indexes on this table"
- toast_blks_read:
usage: "COUNTER"
description: "Number of disk blocks read from this table's TOAST table (if any)"
- toast_blks_hit:
usage: "COUNTER"
description: "Number of buffer hits in this table's TOAST table (if any)"
- tidx_blks_read:
usage: "COUNTER"
description: "Number of disk blocks read from this table's TOAST table indexes (if any)"
- tidx_blks_hit:
usage: "COUNTER"
description: "Number of buffer hits in this table's TOAST table indexes (if any)"
pg_stat_statements:
query: "select
t2.rolname,
t3.datname,
queryid,
calls,
(total_plan_time + total_exec_time) / 1000 as total_time_seconds,
(min_plan_time + min_exec_time) / 1000 as min_time_seconds,
(max_plan_time + max_exec_time) / 1000 as max_time_seconds,
(mean_plan_time + mean_exec_time) / 1000 as mean_time_seconds,
(stddev_plan_time + stddev_exec_time) / 1000 as stddev_time_seconds,
rows,
shared_blks_hit,
shared_blks_read,
shared_blks_dirtied,
shared_blks_written,
local_blks_hit,
local_blks_read,
local_blks_dirtied,
local_blks_written,
temp_blks_read,
temp_blks_written,
blk_read_time / 1000 as blk_read_time_seconds,
blk_write_time / 1000 as blk_write_time_seconds
from
pg_stat_statements t1
join pg_roles t2 on
(t1.userid = t2.oid)
join pg_database t3 on
(t1.dbid = t3.oid)
where
t2.rolname != 'rdsadmin'"
master: true
metrics:
- rolname:
usage: "LABEL"
description: "Name of user"
- datname:
usage: "LABEL"
description: "Name of database"
- queryid:
usage: "LABEL"
description: "Query ID"
- calls:
usage: "COUNTER"
description: "Number of times executed"
- total_time_seconds:
usage: "COUNTER"
description: "Total time spent in the statement, in milliseconds"
- min_time_seconds:
usage: "GAUGE"
description: "Minimum time spent in the statement, in milliseconds"
- max_time_seconds:
usage: "GAUGE"
description: "Maximum time spent in the statement, in milliseconds"
- mean_time_seconds:
usage: "GAUGE"
description: "Mean time spent in the statement, in milliseconds"
- stddev_time_seconds:
usage: "GAUGE"
description: "Population standard deviation of time spent in the statement, in milliseconds"
- rows:
usage: "COUNTER"
description: "Total number of rows retrieved or affected by the statement"
- shared_blks_hit:
usage: "COUNTER"
description: "Total number of shared block cache hits by the statement"
- shared_blks_read:
usage: "COUNTER"
description: "Total number of shared blocks read by the statement"
- shared_blks_dirtied:
usage: "COUNTER"
description: "Total number of shared blocks dirtied by the statement"
- shared_blks_written:
usage: "COUNTER"
description: "Total number of shared blocks written by the statement"
- local_blks_hit:
usage: "COUNTER"
description: "Total number of local block cache hits by the statement"
- local_blks_read:
usage: "COUNTER"
description: "Total number of local blocks read by the statement"
- local_blks_dirtied:
usage: "COUNTER"
description: "Total number of local blocks dirtied by the statement"
- local_blks_written:
usage: "COUNTER"
description: "Total number of local blocks written by the statement"
- temp_blks_read:
usage: "COUNTER"
description: "Total number of temp blocks read by the statement"
- temp_blks_written:
usage: "COUNTER"
description: "Total number of temp blocks written by the statement"
- blk_read_time_seconds:
usage: "COUNTER"
description: "Total time the statement spent reading blocks, in milliseconds (if track_io_timing is enabled, otherwise zero)"
- blk_write_time_seconds:
usage: "COUNTER"
description: "Total time the statement spent writing blocks, in milliseconds (if track_io_timing is enabled, otherwise zero)"
pg_process_idle:
query: |
WITH
metrics AS (
SELECT
application_name,
SUM(EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - state_change))::bigint)::float AS process_idle_seconds_sum,
COUNT(*) AS process_idle_seconds_count
FROM pg_stat_activity
WHERE state = 'idle'
GROUP BY application_name
),
buckets AS (
SELECT
application_name,
le,
SUM(
CASE WHEN EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - state_change)) <= le
THEN 1
ELSE 0
END
)::bigint AS bucket
FROM
pg_stat_activity,
UNNEST(ARRAY[1, 2, 5, 15, 30, 60, 90, 120, 300]) AS le
GROUP BY application_name, le
ORDER BY application_name, le
)
SELECT
application_name,
process_idle_seconds_sum as seconds_sum,
process_idle_seconds_count as seconds_count,
ARRAY_AGG(le) AS seconds,
ARRAY_AGG(bucket) AS seconds_bucket
FROM metrics JOIN buckets USING (application_name)
GROUP BY 1, 2, 3
metrics:
- application_name:
usage: "LABEL"
description: "Application Name"
- seconds:
usage: "HISTOGRAM"
description: "Idle time of server processes"
pg_active_lockedsql:
query: |
select case when replace(replace(pg_blocking_pids(pid)::text,'{',''),'}','')='' then 'numsofnopidblock' else 'numsofsomepidblock' end pidblock,
count(1) pidnums from pg_stat_activity
where state not in('idle') and query !='' group by pidblock order by pidblock;
metrics:
- pidblock:
usage: "LABEL"
description: "Possible values:numsofnopidblock--The processes that are not locked; numsofsomepidblock--The processes locked by some "
- pidnums:
usage: "COUNTER"
description: "The number of processes"
pg_active_slowsql:
query: |
select datname,usename,count(1) slowsql_count
from pg_stat_activity where state not in('idle') and query !=''
and extract(epoch from (now() - query_start)) > 60*5 group by datname,usename order by count(1) desc;
metrics:
- datname:
usage: "LABEL"
description: "Name of database"
- usename:
usage: "LABEL"
description: "Name of user"
- slowsql_count:
usage: "COUNTER"
description: "the numbers of slow sqls"
pg_never_used_indexes:
query: |
select pi.schemaname, pi.relname, pi.indexrelname,
pg_table_size(pi.indexrelid) as index_size from pg_indexes pis join
pg_stat_user_indexes pi on pis.schemaname = pi.schemaname
and pis.tablename = pi.relname and pis.indexname = pi.indexrelname
left join pg_constraint pco on pco.conname = pi.indexrelname
and pco.conrelid = pi.relid where pco.contype is distinct from 'p'
and pco.contype is distinct from 'u' and (idx_scan,idx_tup_read,idx_tup_fetch) = (0,0,0)
and pis.indexdef !~ ' UNIQUE INDEX ' and pi.relname !~ 'backup$'
order by pg_table_size(indexrelid) desc;
metrics:
- schemaname:
usage: "LABEL"
description: "Schema of table"
- relname:
usage: "LABEL"
description: "Name of table"
- indexrelname:
usage: "LABEL"
description: "Name of index"
- index_size:
usage: "GAUGE"
description: "Size of index"
pg_tablelocktops:
query: |
select db.datname,relname tbname,mode locktype,count(1) locknums
from pg_database db join pg_locks lk on db.oid=lk.database
join pg_class cl on lk.relation=cl.oid
join pg_stat_activity act on lk.pid=act.pid
where db.datname not in ('template0','template1') and fastpath='t'
and cl.oid not in (select oid from pg_class where relname in ('pg_class','pg_locks'))
and act.pid <>pg_backend_pid() and cl.reltablespace in (select oid from pg_tablespace)
group by db.datname,relname,mode order by count(1) desc limit 10;
metrics:
- datname:
usage: "LABEL"
description: "database of table"
- tbname:
usage: "LABEL"
description: "Name of table"
- locktype:
usage: "LABEL"
description: "type of lock"
- locknums:
usage: "COUNTER"
description: "the numbers of this lock"
bash
tee /etc/systemd/system/postgres_exporter.service<<EOF
[Unit]
Description=Prometheus exporter for Postgresql
Wants=network-online.target
After=network-online.target
[Service]
User=ptgres
Group=ptgres
WorkingDirectory=/opt/postgres_exporter
EnvironmentFile=/opt/postgres_exporter/.env
ExecStart=/opt/postgres_exporter/postgres_exporter --web.listen-address=:9187 --collector.stat_statements --web.telemetry-path=/metrics
ExecStop=/bin/kill -s SIGTERM $MAINPID
Restart=always
[Install]
WantedBy=multi-user.target
EOF
tee /etc/systemd/system/postgres_exporter.service<<EOF
[Unit]
Description=Prometheus exporter for Postgresql
Wants=network-online.target
After=network-online.target
[Service]
User=ptgres
Group=ptgres
WorkingDirectory=/opt/postgres_exporter
EnvironmentFile=/opt/postgres_exporter/.env
ExecStart=/opt/postgres_exporter/postgres_exporter --web.listen-address=:9187 --collector.stat_statements --web.telemetry-path=/metrics
ExecStop=/bin/kill -s SIGTERM $MAINPID
Restart=always
[Install]
WantedBy=multi-user.target
EOF
bash
[root@kubeadm-master01 postgres_exporter]# cat .env
PG_EXPORTER_EXTEND_QUERY_PATH="/opt/postgres_exporter/queries.yaml"
DATA_SOURCE_NAME=postgresql://monitoring:hello123456@localhost:5532/postgres?sslmode=disable
[root@kubeadm-master01 postgres_exporter]# cat .env
PG_EXPORTER_EXTEND_QUERY_PATH="/opt/postgres_exporter/queries.yaml"
DATA_SOURCE_NAME=postgresql://monitoring:hello123456@localhost:5532/postgres?sslmode=disable
bash
systemctl daemon-reload
systemctl start postgres_exporter
systemctl enable postgres_exporter
systemctl daemon-reload
systemctl start postgres_exporter
systemctl enable postgres_exporter
- 访问
1.4 Prometheus采集
yaml
###################### postgres_exporter ######################
- job_name: "postgres_exporter"
static_configs:
- targets: ["ip:9187"]
###################### postgres_exporter ######################
- job_name: "postgres_exporter"
static_configs:
- targets: ["ip:9187"]
- 热更新
bash
curl -XPOST http://prometheus.ikubernetes.net/-/reload
curl -XPOST http://prometheus.ikubernetes.net/-/reload
1.报警规则
yaml
postgres.rules: |
groups:
- name: "postgres.rules"
rules:
- alert: Postgres_InstanceDown
expr: pg_up == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Postgres server instance is down"
description: "Postgres has not been responding for the past 1 minutes on {{ $labels.instance }}"
- alert: Postgresql_Restarted
expr: time() - pg_postmaster_start_time_seconds < 60
for: 5m
labels:
severity: critical
annotations:
summary: "Postgresql restarted (instance {{ $labels.instance }})"
description: "Postgresql restarted\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: Postgresql_ExporterError
expr: pg_exporter_last_scrape_error > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql exporter error (instance {{ $labels.instance }})"
description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: Postgresql_ReplicationLag
expr: |
(pg_replication_lag) > 10 and ON(instance) (pg_replication_is_replica == 1)
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql replication lag (instance {{ $labels.instance }})"
description: "PostgreSQL replication lag is going up (> 10s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: Postgresql_TooManyConnections
expr: |
sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > pg_settings_max_connections * 0.9
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql too many connections (instance {{ $labels.instance }})"
description: "PostgreSQL instance has too many connections\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: Postgresql_NotEnoughConnections
expr: sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql not enough connections (instance {{ $labels.instance }})"
description: "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: Postgresql_DeadLocks
expr: rate(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql dead locks (instance {{ $labels.instance }})"
description: "PostgreSQL has dead-locks\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: Postgresql_HighRollbackRate
expr: |
rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) / rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m]) > 0.02
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql high rollback rate (instance {{ $labels.instance }})"
description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value }}\nLABELS: {{ $labels }}"
- alert: Postgresql_CommitRateLow
expr: rate(pg_stat_database_xact_commit[1m]) < 10
for: 5m
labels:
severity: critical
annotations:
summary: "Postgresql commit rate low (instance {{ $labels.instance }})"
description: "Postgres seems to be processing very few transactions\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: Postgresql_TooManyLocksAcquired
expr: ((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20
for: 5m
labels:
severity: critical
annotations:
summary: "Postgresql too many locks acquired (instance {{ $labels.instance }})"
description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: Postgresql_ConfigurationChanged
expr: |
{__name__=~"pg_settings_.*"} != ON(__name__) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m
for: 0m
labels:
severity: info
annotations:
summary: Postgresql configuration changed (instance {{ $labels.instance }})
description: "Postgres Database configuration change has occurred\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgreSQL_SlowQueries
expr: |-
avg by (datname) (
rate (
pg_stat_activity_max_tx_duration{datname!~"template.*"}[2m]
)
) > 2 * 60
for: 2m
labels:
severity: warning
annotations:
summary: 'PostgreSQL high number of slow queries for database: {{ `{{ $labels.datname }}` }}'
description: 'PostgreSQL high number of slow queries for database {{ `{{ $labels.datname }}` }} with a value of {{ `{{ $value }}` }}'
- alert: PostgreSQL_QPS
expr: |-
avg by (datname) (
irate(
pg_stat_database_xact_commit{datname!~"template.*"}[5m]
)
+
irate(
pg_stat_database_xact_rollback{datname!~"template.*"}[5m]
)
) > 10000
for: 5m
labels:
severity: warning
annotations:
summary: 'PostgreSQL high number of queries per second for database {{ `{{ $labels.datname }}` }}'
description: 'PostgreSQL high number of queries per second for database {{ `{{ $labels.datname }}` }} witha value of {{ `{{ $value }}` }}'
- alert: PostgreSQL_CacheHitRatio
expr: |-
avg by (datname) (
rate(pg_stat_database_blks_hit{datname!~"template.*"}[5m])
/
(
rate(
pg_stat_database_blks_hit{datname!~"template.*"}[5m]
)
+
rate(
pg_stat_database_blks_read{datname!~"template.*"}[5m]
)
)
) < 0.98
for: 5m
labels:
severity: warning
annotations:
summary: 'PostgreSQL low cache hit rate for database {{`{{ $labels.datname }}`}}'
description: 'PostgreSQL low on cache hit rate for database {{`{{ $labels.datname }}`}} with a value of {{`{{ $value }}`}}'
############## postgresql_rules ###########
postgres.rules: |
groups:
- name: "postgres.rules"
rules:
- alert: Postgres_InstanceDown
expr: pg_up == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Postgres server instance is down"
description: "Postgres has not been responding for the past 1 minutes on {{ $labels.instance }}"
- alert: Postgresql_Restarted
expr: time() - pg_postmaster_start_time_seconds < 60
for: 5m
labels:
severity: critical
annotations:
summary: "Postgresql restarted (instance {{ $labels.instance }})"
description: "Postgresql restarted\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: Postgresql_ExporterError
expr: pg_exporter_last_scrape_error > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql exporter error (instance {{ $labels.instance }})"
description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: Postgresql_ReplicationLag
expr: |
(pg_replication_lag) > 10 and ON(instance) (pg_replication_is_replica == 1)
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql replication lag (instance {{ $labels.instance }})"
description: "PostgreSQL replication lag is going up (> 10s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: Postgresql_TooManyConnections
expr: |
sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > pg_settings_max_connections * 0.9
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql too many connections (instance {{ $labels.instance }})"
description: "PostgreSQL instance has too many connections\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: Postgresql_NotEnoughConnections
expr: sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql not enough connections (instance {{ $labels.instance }})"
description: "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: Postgresql_DeadLocks
expr: rate(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql dead locks (instance {{ $labels.instance }})"
description: "PostgreSQL has dead-locks\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: Postgresql_HighRollbackRate
expr: |
rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) / rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m]) > 0.02
for: 5m
labels:
severity: warning
annotations:
summary: "Postgresql high rollback rate (instance {{ $labels.instance }})"
description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value }}\nLABELS: {{ $labels }}"
- alert: Postgresql_CommitRateLow
expr: rate(pg_stat_database_xact_commit[1m]) < 10
for: 5m
labels:
severity: critical
annotations:
summary: "Postgresql commit rate low (instance {{ $labels.instance }})"
description: "Postgres seems to be processing very few transactions\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: Postgresql_TooManyLocksAcquired
expr: ((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20
for: 5m
labels:
severity: critical
annotations:
summary: "Postgresql too many locks acquired (instance {{ $labels.instance }})"
description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: Postgresql_ConfigurationChanged
expr: |
{__name__=~"pg_settings_.*"} != ON(__name__) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m
for: 0m
labels:
severity: info
annotations:
summary: Postgresql configuration changed (instance {{ $labels.instance }})
description: "Postgres Database configuration change has occurred\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgreSQL_SlowQueries
expr: |-
avg by (datname) (
rate (
pg_stat_activity_max_tx_duration{datname!~"template.*"}[2m]
)
) > 2 * 60
for: 2m
labels:
severity: warning
annotations:
summary: 'PostgreSQL high number of slow queries for database: {{ `{{ $labels.datname }}` }}'
description: 'PostgreSQL high number of slow queries for database {{ `{{ $labels.datname }}` }} with a value of {{ `{{ $value }}` }}'
- alert: PostgreSQL_QPS
expr: |-
avg by (datname) (
irate(
pg_stat_database_xact_commit{datname!~"template.*"}[5m]
)
+
irate(
pg_stat_database_xact_rollback{datname!~"template.*"}[5m]
)
) > 10000
for: 5m
labels:
severity: warning
annotations:
summary: 'PostgreSQL high number of queries per second for database {{ `{{ $labels.datname }}` }}'
description: 'PostgreSQL high number of queries per second for database {{ `{{ $labels.datname }}` }} witha value of {{ `{{ $value }}` }}'
- alert: PostgreSQL_CacheHitRatio
expr: |-
avg by (datname) (
rate(pg_stat_database_blks_hit{datname!~"template.*"}[5m])
/
(
rate(
pg_stat_database_blks_hit{datname!~"template.*"}[5m]
)
+
rate(
pg_stat_database_blks_read{datname!~"template.*"}[5m]
)
)
) < 0.98
for: 5m
labels:
severity: warning
annotations:
summary: 'PostgreSQL low cache hit rate for database {{`{{ $labels.datname }}`}}'
description: 'PostgreSQL low on cache hit rate for database {{`{{ $labels.datname }}`}} with a value of {{`{{ $value }}`}}'
############## postgresql_rules ###########
1.5 grafana
默认搜索,https://grafana.com/grafana/dashboards/
- 效果
2. k8s环境
- 验证
curl -s $(kubectl -n kube-system get svc prometheus -ojsonpath='{.spec.clusterIP}:{.spec.ports[0].port}')/prometheus/api/v1/query --data-urlencode 'query=up{job="postgres-exporter"}'
curl -s $(kubectl -n kube-system get svc prometheus -ojsonpath='{.spec.clusterIP}:{.spec.ports[0].port}')/prometheus/api/v1/query --data-urlencode 'query=up{job="postgres-exporter"}'
参考:
alert
https://sysdig.com/blog/postgresql-monitoring/
https://github.com/philyuchkoff/prometheus-alerts/blob/master/databases/postgresql.md
https://samber.github.io/awesome-prometheus-alerts/rules.html#postgresql
aws
https://www.perfectscale.io/blog/monitoring-postgresql-with-prometheus-operator