[check_postgres] [PATCH] Add `pgagent_jobs` test.
David E. Wheeler
david at justatheory.com
Wed Jan 18 22:42:25 UTC 2012
From: "David E. Wheeler" <david at justatheory.com>
This patch adds support for checking for failed pgAgent jobs within a specified period of time. You can specify either --critical or --warning as a period of time, and it will report on failures within that period of time previous to the current time. Job failures are determined by a non-0 status in a job step record.
Using this test obviously requiers that the pgAgent schema be installed. I've also included a bunch of unit tests to make sure it works the way I would expect (the test will create a schema for testing) and documentation.
As part of this, I've introduced the `any_warning` argument to `validate_range()`. The `pgagent_jobs` test does not care if you specify a warning value greater than the critical value (indeed, I expect that if one used both at all, the warning would be much longer). So this new argument prevents the `range-warnbigtime` or `range-warnbigsize` failures from being triggered.
---
check_postgres.pl | 101 +++++++++++++++++++++-
t/02_bloat.t | 1 -
t/02_pgagent_jobs.t | 243 +++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 341 insertions(+), 4 deletions(-)
create mode 100644 t/02_pgagent_jobs.t
diff --git a/check_postgres.pl b/check_postgres.pl
index 22f3b44..49db354 100755
--- a/check_postgres.pl
+++ b/check_postgres.pl
@@ -189,6 +189,7 @@ our %msg = (
'opt-psql-nofind' => q{Could not find a suitable psql executable},
'opt-psql-nover' => q{Could not determine psql version},
'opt-psql-restrict' => q{Cannot use the --PSQL option when NO_PSQL_OPTION is on},
+ 'pgagent-jobs-ok' => q{No failed jobs},
'pgbouncer-pool' => q{Pool=$1 $2=$3},
'pgb-backends-mrtg' => q{DB=$1 Max connections=$2},
'pgb-backends-msg' => q{$1 of $2 connections ($3%)},
@@ -1985,6 +1986,8 @@ check_pgb_pool('maxwait') if $action eq 'pgb_pool_maxwait';
## Check how many clients are connected to pgbouncer compared to max_client_conn.
check_pgbouncer_backends() if $action eq 'pgbouncer_backends';
+check_pgagent_jobs() if $action eq 'pgagent_jobs';
+
##
## Everything past here does not hit a Postgres database
##
@@ -2826,7 +2829,7 @@ sub validate_range {
ndie msg('range-seconds', 'critical')
}
$critical = $1;
- if (length $warning and $warning > $critical) {
+ if (!$arg->{any_warning} and length $warning and $warning > $critical) {
ndie msg('range-warnbigtime', $warning, $critical);
}
}
@@ -2837,7 +2840,7 @@ sub validate_range {
if (! length $critical and ! length $warning) {
ndie msg('range-notime');
}
- if (length $warning and length $critical and $warning > $critical) {
+ if (!$arg->{any_warning} and length $warning and length $critical and $warning > $critical) {
ndie msg('range-warnbigtime', $warning, $critical);
}
}
@@ -2865,7 +2868,7 @@ sub validate_range {
ndie msg('range-badsize', 'warning');
}
$warning = size_in_bytes($1,$2);
- if (length $critical and $warning > $critical) {
+ if (!$arg->{any_warning} and length $critical and $warning > $critical) {
ndie msg('range-warnbigsize', $warning, $critical);
}
}
@@ -5429,6 +5432,73 @@ sub check_new_version_tnm {
} ## end of check_new_version_tnm
+sub check_pgagent_jobs {
+ ## Check for failed pgAgent jobs.
+ ## Supports: Nagios
+ ## Critical and warning are intervals.
+ ## Example: --critical="1 hour"
+ ## Example: --warning="2 hours"
+
+ my ($warning, $critical) = validate_range({ type => 'time', any_warning => 1 });
+
+ # Determine critcal warning column contents.
+ my $is_crit = $critical && $warning
+ ? "GREATEST($critical - EXTRACT('epoch' FROM NOW() - (jlog.jlgstart + jlog.jlgduration)), 0)"
+ : $critical ? 1 : 0;
+
+ # Determine max time to examine.
+ my $seconds = do {
+ no warnings;
+ $warning > $critical ? $warning : $critical;
+ };
+
+ $SQL = qq{
+ SELECT jlog.jlgid
+ , job.jobname
+ , step.jstname
+ , slog.jslresult
+ , slog.jsloutput
+ , $is_crit AS critical
+ FROM pgagent.pga_job job
+ JOIN pgagent.pga_joblog jlog ON job.jobid = jlog.jlgjobid
+ JOIN pgagent.pga_jobstep step ON job.jobid = step.jstjobid
+ JOIN pgagent.pga_jobsteplog slog ON jlog.jlgid = slog.jsljlgid AND step.jstid = slog.jsljstid
+ WHERE slog.jslresult <> 0
+ AND EXTRACT('epoch' FROM NOW() - (jlog.jlgstart + jlog.jlgduration)) < $seconds
+ };
+
+ my $info = run_command($SQL);
+
+ for $db (@{$info->{db}}) {
+ my @rows = @{ $db->{slurp} } or do {
+ add_ok msg('pgagent-jobs-ok');
+ next;
+ };
+
+ if ($rows[0]{critical} !~ /^(?:[01]|\d+[.]\d+)$/) {
+ add_unknown msg('invalid-query', $db->{slurp});
+ next;
+ }
+
+ my ($is_crit, @msg);
+ my $log_id = -1;
+ for my $step (@rows) {
+ my $output = $step->{jsloutput} || '(NO OUTPUT)';
+ push @msg => "$step->{jslresult} $step->{jobname}/$step->{jstname}: $output";
+ $is_crit ||= $step->{critical};
+ }
+
+ (my $msg = join '; ' => @msg) =~ s{\r?\n}{ }g;
+ if ($is_crit) {
+ add_critical $msg;
+ } else {
+ add_warning $msg;
+ }
+ }
+
+ return;
+}
+
sub check_pgbouncer_checksum {
## Verify the checksum of all pgbouncer settings
@@ -8885,6 +8955,31 @@ the fourth line gives the name of the database, plus the current
max_client_conn. If more than one database has been queried, the one with the
highest number of connections is output.
+=head2 B<pgagent_jobs>
+
+(C<symlink: check_postgres_pgagent_jobs>) Checks that all the pgAgent jobs
+that have executed in the preceding interval of time have succeeded. This is
+done by checking for any steps that have a non-zero result.
+
+Either C<--warning> or C<--critical>, or both, may be specified as times, and
+jobs will be checked for failures withing the specified periods of time before
+the current time. Valid units are seconds, minutes, hours, and days; all can
+be abbreviated to the first letter. If no units are given, 'seconds' are
+assumed.
+
+Example 1: Give a critical when any jobs executed in the last day have failed.
+
+ check_postgres_pgagent_jobs --critical=1d
+
+Example 2: Give a warning when any jobs executed in the last week have failed.
+
+ check_postgres_pgagent_jobs --warning=7d
+
+Example 3: Give a critical for jobs that have failed in the last 2 hours and a
+warning for jobs that have failed in the last 4 hours:
+
+ check_postgres_pgagent_jobs --critical=2h --warning=4h
+
=head2 B<pgbouncer_checksum>
(C<symlink: check_postgres_pgbouncer_checksum>) Checks that all the
diff --git a/t/02_bloat.t b/t/02_bloat.t
index cbb9c01..5614b88 100644
--- a/t/02_bloat.t
+++ b/t/02_bloat.t
@@ -5,7 +5,6 @@
use 5.006;
use strict;
use warnings;
-use Data::Dumper;
use Test::More tests => 30;
use lib 't','.';
use CP_Testing;
diff --git a/t/02_pgagent_jobs.t b/t/02_pgagent_jobs.t
new file mode 100644
index 0000000..acee568
--- /dev/null
+++ b/t/02_pgagent_jobs.t
@@ -0,0 +1,243 @@
+#!/usr/bin/perl -w
+
+## Test the "pgagent_jobs" action
+
+use 5.006;
+use strict;
+use warnings;
+use Test::More tests => 48;
+#use Test::More 'no_plan';
+use lib 't','.';
+use CP_Testing;
+
+my $cp = CP_Testing->new({ default_action => 'pgagent_jobs' });
+my $dbh = $cp->test_database_handle;
+my $S = q{Action 'pgagent_jobs'};
+my $label = 'POSTGRES_PGAGENT_JOBS';
+my $tname = 'cp_pgagent_jobs_test';
+
+# Mock NOW().
+like $cp->run('foobar=12'), qr{Usage:}, "$S fails when called with an invalid option";
+
+like $cp->run('-w=abc'), qr{must be a valid time}, "$S fails with invalid -w";
+like $cp->run('-c=abc'), qr{must be a valid time}, "$S fails with invalid -c";
+
+# Set up a dummy pgagent schema.
+$dbh->{AutoCommit} = 1;
+$dbh->do(q{
+ SET client_min_messages TO warning;
+ CREATE SCHEMA pgagent;
+
+ CREATE TABLE pgagent.pga_job (
+ jobid serial NOT NULL PRIMARY KEY,
+ jobname text NOT NULL
+ );
+
+ CREATE TABLE pgagent.pga_jobstep (
+ jstid serial NOT NULL PRIMARY KEY,
+ jstjobid int4 NOT NULL REFERENCES pgagent.pga_job(jobid),
+ jstname text NOT NULL
+ );
+
+ CREATE TABLE pgagent.pga_joblog (
+ jlgid serial NOT NULL PRIMARY KEY,
+ jlgjobid int4 NOT NULL REFERENCES pgagent.pga_job(jobid),
+ jlgstart timestamptz NOT NULL DEFAULT current_timestamp,
+ jlgduration interval NULL
+ );
+
+ CREATE TABLE pgagent.pga_jobsteplog (
+ jsljlgid int4 NOT NULL REFERENCES pgagent.pga_joblog(jlgid),
+ jsljstid int4 NOT NULL REFERENCES pgagent.pga_jobstep(jstid),
+ jslresult int4 NULL,
+ jsloutput text
+ );
+ RESET client_min_messages;
+});
+END { $dbh->do(q{
+ SET client_min_messages TO warning;
+ DROP SCHEMA pgagent CASCADE;
+ RESET client_min_messages;
+}) if $dbh; }
+
+like $cp->run('-c=1d'), qr{^$label OK: DB "postgres"}, "$S returns ok for no jobs";
+
+for my $time (qw/seconds minutes hours days/) {
+ like $cp->run("-w=1000000$time"), qr{^$label OK: DB "postgres"},
+ qq{$S returns ok for no pgagent_jobs with a unit of $time};
+ (my $singular = $time) =~ s/s$//;
+ like $cp->run("-w=1000000$singular"), qr{^$label OK: DB "postgres"},
+ qq{$S returns ok for no pgagent_jobs with a unit of $singular};
+ my $short = substr $time, 0, 1;
+ like $cp->run("-w=1000000$short"), qr{^$label OK: DB "postgres"},
+ qq{$S returns ok for no pgagent_jobs with a unit of $short};
+}
+
+my ($now, $back_6_hours, $back_30_hours) = $dbh->selectrow_array(q{
+ SELECT NOW(), NOW() - '6 hours'::interval, NOW() - '30 hours'::interval
+});
+
+# Let's add some jobs
+$dbh->do(qq{
+ -- Two jobs.
+ INSERT INTO pgagent.pga_job (jobid, jobname)
+ VALUES (1, 'Backup'), (2, 'Restore');
+
+ -- Each job has two steps.
+ INSERT INTO pgagent.pga_jobstep (jstid, jstjobid, jstname)
+ VALUES (11, 1, 'pd_dump'), (21, 1, 'vacuum'),
+ (12, 2, 'pd_restore'), (22, 2, 'analyze');
+
+ -- Execute each job twice.
+ INSERT INTO pgagent.pga_joblog (jlgid, jlgjobid, jlgstart, jlgduration)
+ VALUES (31, 1, '$back_6_hours', '1 hour'),
+ (41, 1, '$back_30_hours', '5m'),
+ (32, 2, '$back_6_hours', '01:02:00'),
+ (42, 2, '$back_30_hours', '7m');
+
+ -- Execute each step twice.
+ INSERT INTO pgagent.pga_jobsteplog (jsljlgid, jsljstid, jslresult, jsloutput)
+ VALUES (31, 11, 0, ''),
+ (31, 21, 0, ''),
+ (41, 11, 0, ''),
+ (41, 21, 0, ''),
+ (32, 12, 0, ''),
+ (32, 22, 0, ''),
+ (42, 12, 0, ''),
+ (42, 22, 0, '');
+});
+
+# There should be no failures.
+like $cp->run('-c=1d'), qr{^$label OK: DB "postgres"},
+ "$S returns ok with only successful jobs";
+
+# Make one job fail from before our time.
+ok $dbh->do(q{
+ UPDATE pgagent.pga_jobsteplog
+ SET jslresult = 255
+ , jsloutput = 'WTF!'
+ WHERE jsljlgid = 32
+ AND jsljstid = 22
+}), 'Make a job fail around 5 hours ago';
+
+like $cp->run('-c=2h'), qr{^$label OK: DB "postgres"},
+ "$S -c=2h returns ok with failed job before our time";
+
+like $cp->run('-c=6h'),
+ qr{^$label CRITICAL: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!},
+ "$S -c=6h returns critical with failed job within our time";
+
+like $cp->run('-w=2h'), qr{^$label OK: DB "postgres"},
+ "$S -w=2h returns ok with failed job before our time";
+
+like $cp->run('-w=6h'),
+ qr{^$label WARNING: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!},
+ "$S -w=6h returns warninf with failed job within our time";
+
+like $cp->run('-w=2h'), qr{^$label OK: DB "postgres"},
+ "$S -w=2h returns ok with failed job before our time";
+
+like $cp->run('-w=4h -c=2h'), qr{^$label OK: DB "postgres"},
+ "$S -w=4h =c=2h returns ok with failed job before our time";
+
+like $cp->run('-w=5h -c=2h'),
+ qr{^$label WARNING: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!},
+ "$S -w=5h =c=2h returns warning with failed job within our time";
+
+like $cp->run('-w=2h -c=5h'),
+ qr{^$label CRITICAL: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!},
+ "$S -w=2h =c=5h returns critical with failed job within our time";
+
+like $cp->run('-w=5h -c=5h'),
+ qr{^$label CRITICAL: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!},
+ "$S -w=5h =c=5h returns critical with failed job within our time";
+
+# Make a second job fail, back 30 hours.
+ok $dbh->do(q{
+ UPDATE pgagent.pga_jobsteplog
+ SET jslresult = 64
+ , jsloutput = 'OMGWTFLOL!'
+ WHERE jsljlgid = 42
+ AND jsljstid = 22
+}), 'Make a job fail around 29 hours ago';
+
+like $cp->run('-c=2h'), qr{^$label OK: DB "postgres"},
+ "$S -c=2h returns ok with failed job before our time";
+
+like $cp->run('-c=6h'),
+ qr{^$label CRITICAL: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!},
+ "$S -c=6h returns critical with failed job within our time";
+
+like $cp->run('-w=2h'), qr{^$label OK: DB "postgres"},
+ "$S -w=2h returns ok with failed job before our time";
+
+like $cp->run('-w=6h'),
+ qr{^$label WARNING: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!},
+ "$S -w=6h returns warninf with failed job within our time";
+
+like $cp->run('-w=2h'), qr{^$label OK: DB "postgres"},
+ "$S -w=2h returns ok with failed job before our time";
+
+like $cp->run('-w=4h -c=2h'), qr{^$label OK: DB "postgres"},
+ "$S -w=4h =c=2h returns ok with failed job before our time";
+
+like $cp->run('-w=5h -c=2h'),
+ qr{^$label WARNING: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!},
+ "$S -w=5h =c=2h returns warning with failed job within our time";
+
+like $cp->run('-w=2h -c=5h'),
+ qr{^$label CRITICAL: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!},
+ "$S -w=2h =c=5h returns critical with failed job within our time";
+
+like $cp->run('-w=5h -c=5h'),
+ qr{^$label CRITICAL: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!},
+ "$S -w=5h -c=5h returns critical with failed job within our time";
+
+# Go back futher in time!
+like $cp->run('-w=30h -c=2h'),
+ qr{^$label WARNING: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!},
+ "$S -w=30h -c=5h returns warning for older failed job";
+
+like $cp->run('-w=30h -c=6h'),
+ qr{^$label CRITICAL: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!; 64 Restore/analyze: OMGWTFLOL!},
+ "$S -w=30h -c=6h returns critical with both jobs, more recent critical";
+
+like $cp->run('-c=30h'),
+ qr{^$label CRITICAL: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!; 64 Restore/analyze: OMGWTFLOL!},
+ "$S -c=30h returns critical with both failed jobs";
+
+like $cp->run('-w=30h'),
+ qr{^$label WARNING: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!; 64 Restore/analyze: OMGWTFLOL!},
+ "$S -w=30h returns critical with both failed jobs";
+
+# Try with critical recent and warning longer ago.
+like $cp->run('-w=30h -c=6h'),
+ qr{^$label CRITICAL: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!; 64 Restore/analyze: OMGWTFLOL!},
+ "$S -w=30h -c=6h returns critical with both failed jobs";
+
+# Try with warning recent and critical longer ago.
+like $cp->run('-c=30h -w=6h'),
+ qr{^$label CRITICAL: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!; 64 Restore/analyze: OMGWTFLOL!},
+ "$S -c=30h -w=6h returns critical with both failed jobs";
+
+# Undo the more recent failure.
+ok $dbh->do(q{
+ UPDATE pgagent.pga_jobsteplog
+ SET jslresult = 0
+ , jsloutput = ''
+ WHERE jsljlgid = 32
+ AND jsljstid = 22
+}), 'Unfail the more recent failed job';
+
+like $cp->run('-c=6h'), qr{^$label OK: DB "postgres"},
+ "$S -c=6h should now return ok";
+
+like $cp->run('-c=30h'), qr{^$label CRITICAL: DB "postgres"},
+ "$S -c=30h should return critical";
+
+like $cp->run('-w=6h'), qr{^$label OK: DB "postgres"},
+ "$S -w=6h should now return ok";
+
+like $cp->run('-w=30h'), qr{^$label WARNING: DB "postgres"},
+ "$S -w=30h should return warning";
+
--
1.7.7.3
More information about the Check_postgres
mailing list