-
Notifications
You must be signed in to change notification settings - Fork 317
Open
Description
We can expose log metrics with mtail
- add mtail packages (rpm/deb, x86/arm) to the infra repo
- add mtail deployment task
- add mtail to prometheus monitor target
- add grafana dashboard
# Match only postgresql.csv
# Guard: when the name of the file the current line was read from does not
# end in "postgresql.csv", `stop` ends this program's processing of that line,
# so none of the patterns below are evaluated for other log files.
getfilename() !~ /postgresql\.csv$/ {
stop
}
# Exported metric declarations. Label names follow the `by` keyword.

# Total log messages from Postgres, labelled by the error_severity column
# (DEBUG1..5, INFO, NOTICE, WARNING, ERROR, LOG, FATAL, PANIC).
counter postgresql_logs_total by severity
# Histogram of slow query durations; bucket upper bounds are in seconds.
histogram postgresql_slow_queries_seconds buckets 0.0, 0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 50.0
# Count of specific types of errors -- notably statement timeouts.
# The `error` label is a short fixed name chosen by this program, not the raw message.
counter postgresql_errors_total by error
# Count of log_lock_waits messages ("process N still waiting for <mode> on <locktype> ...").
counter postgresql_lock_waits_total by mode, locktype
# event logs produced by a command like:
# DO $$ BEGIN raise log 'gitlab event: daily test'; END $$
counter postgresql_event_logs_total by event
# Count of "temporary file: path ..." log messages.
counter postgresql_temp_files_total
# Count authentication failures
counter postgresql_auth_failure_total by user_name, database_name, connection_from_host
# The section below is for CSV logs.
#
# CSV columns: log_time,user_name,database_name,process_id,connection_from,session_id,session_line_num,command_tag,session_start_time,virtual_transaction_id,transaction_id,error_severity,sql_state_code,message,detail,hint,internal_query,internal_query_pos,context,query,query_pos,location,application_name
#
# See Postgres documentation for more details on CSV log fields:
# https://www.postgresql.org/docs/12/runtime-config-logging.html#RUNTIME-CONFIG-LOGGING-CSVLOG
#
# The outer pattern captures the leading log_time column and the
# error_severity column; every nested pattern only runs for lines it matched.
#
# Note on sql_state_code: SQLSTATE values are five characters drawn from
# digits AND upper-case letters (e.g. 40P01 deadlock_detected, XX001
# data_corrupted, 28P01 invalid_password), so the nested patterns match the
# column with [0-9A-Z]{5} rather than digits only.
/^(?P<date>\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d\.\d+ [A-Z]{3}),.*,(?P<severity>DEBUG[1-5]|INFO|NOTICE|WARNING|ERROR|LOG|FATAL|PANIC),/ {
  # "MST" is the Go time-layout placeholder for a zone abbreviation, so any
  # three-letter abbreviation accepted by the regex above (GMT, UTC, ...) is
  # parsed instead of only the literal text "GMT".
  # NOTE(review): abbreviations unknown to mtail's configured timezone are
  # treated as a zero offset by Go's parser -- confirm if logs are not in UTC/GMT.
  strptime($date, "2006-01-02 15:04:05.000 MST")

  postgresql_logs_total[$severity]++

  # Slow query duration log:
  # 2020-02-28 13:35:55.119 GMT,"gitlab","gitlabhq_production",21016,"127.0.0.1:45520",5e5916c7.5218,3,"SELECT",2020-02-28 13:33:59 GMT,7/322653,0,LOG,00000,"duration: 1032.176 ms execute <unnamed>: SELECT ""services"".* FROM ""services"" WHERE ""services"".""template"" = TRUE
  /,LOG,(?P<sqlcode>[0-9A-Z]{5}),.*duration: (?P<query_time>\d+\.\d+) ms/ {
    # Postgres reports the duration in milliseconds; the histogram is in seconds.
    postgresql_slow_queries_seconds = $query_time / 1000.0
  }

  # These would be better aggregated by SQLCODE which we can add to the
  # error messages easily enough. See issue https://gitlab.com/gitlab-com/infrastructure/issues/3462
  #
  # We're primarily interested in these errors related to operational
  # issues. Other errors are either syntax errors or user data triggered
  # errors which are better handled through Sentry though we'll count
  # and alert based on the total error rate.
  # 2020-02-27 21:09:54.076 GMT,"gitlab-psql","gitlabhq_production",13976,"[local]",5e58301f.3698,2,"SHOW",2020-02-27 21:09:51 GMT,12/34,0,ERROR,42704,"unrecognized configuration parameter ""al""",,,,,,"show al;",,,"psql"
  /,ERROR,(?P<sqlcode>[0-9A-Z]{5}),(?P<message>.*)$/ {
    # $message holds everything after the sql_state_code column (the message
    # plus all trailing CSV columns). Each known operational error gets its
    # own fixed label; anything else falls through to "other".
    $message =~ /canceling statement due to statement timeout/ {
      postgresql_errors_total["statement_timeout"]++
    }
    $message =~ /current transaction is aborted, commands ignored until end of transaction block/ {
      postgresql_errors_total["transaction_is_aborted"]++
    }
    $message =~ /canceling autovacuum task/ {
      postgresql_errors_total["canceled_autovacuum"]++
    }
    $message =~ /deadlock detected/ {
      postgresql_errors_total["deadlock_detected"]++
    }
    $message =~ /canceling statement due to user request/ {
      postgresql_errors_total["canceled_by_user_request"]++
    }
    $message =~ /duplicate key value violates unique constraint/ {
      postgresql_errors_total["duplicate_key"]++
    }
    $message =~ /invalid page in block/ {
      postgresql_errors_total["invalid_page"]++
    }
    otherwise {
      postgresql_errors_total["other"]++
    }
  }

  /,FATAL,(?P<sqlcode>[0-9A-Z]{5}),/ {
    # Format for the connection_from field may be an IP and port ("127.0.0.1:48796") or the special string "[local]" for Unix socket.
    # Here we want to remove the client port if present to reduce the cardinality of the Prometheus metric label.
    # We only want to capture the client IP or the fact that it was a local socket connection.
    #
    # This pattern is matched against the whole log line, not against the text
    # after the sql_state_code column: user_name, database_name and
    # connection_from are the 2nd, 3rd and 5th CSV columns and therefore appear
    # BEFORE the severity column.
    /^.*?,\"(?P<user_name>.*?)\",\"(?P<database_name>.*?)\",[^,]*,\"(?P<connection_from_host>[^:\"]*).*\",.*,\"authentication\",.*authentication failed/ {
      postgresql_auth_failure_total[$user_name,$database_name,$connection_from_host]++
    }
  }

  /,LOG,(?P<sqlcode>[0-9A-Z]{5}),(?P<message>.*)$/ {
    # log_lock_waits message, e.g.
    # "process 123 still waiting for ShareLock on transaction 456 after 1000.123 ms"
    $message =~ /process [0-9]+ still waiting for (?P<mode>[a-zA-Z]+) on (?P<locktype>[a-zA-Z ]+) / {
      postgresql_lock_waits_total[$mode,$locktype]++
    }
    # The event name ends at the closing quote of the CSV message field, so
    # the trailing CSV columns (context, query, application_name, ...) do not
    # leak into the label value.
    $message =~ /gitlab event: (?P<event>[^"]*)/ {
      postgresql_event_logs_total[$event]++
    }
    $message =~ /temporary file: path/ {
      postgresql_temp_files_total++
    }
  }
  # end postgres pattern block
}
Metadata
Metadata
Assignees
Labels
MONITOR (Monitoring related)
Type
Projects
Status
Iced