[check_postgres] [PATCH] Add `pgagent_jobs` test.

David E. Wheeler david at justatheory.com
Wed Jan 18 22:42:25 UTC 2012


From: "David E. Wheeler" <david at justatheory.com>

This patch adds support for checking for failed pgAgent jobs within a specified period of time. You can specify either --critical or --warning as a period of time, and it will report on failures within that period of time previous to the current time. Job failures are determined by a non-0 status in a job step record.

Using this test obviously requiers that the pgAgent schema be installed. I've also included a bunch of unit tests to make sure it works the way I would expect (the test will create a schema for testing) and documentation.

As part of this, I've introduced the `any_warning` argument to `validate_range()`. The `pgagent_jobs` test does not care if you specify a warning value greater than the critical value (indeed, I expect that if one used both at all, the warning would be much longer). So this new argument prevents the `range-warnbigtime` or `range-warnbigsize` failures from being triggered.
---
 check_postgres.pl   |  101 +++++++++++++++++++++-
 t/02_bloat.t        |    1 -
 t/02_pgagent_jobs.t |  243 +++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 341 insertions(+), 4 deletions(-)
 create mode 100644 t/02_pgagent_jobs.t

diff --git a/check_postgres.pl b/check_postgres.pl
index 22f3b44..49db354 100755
--- a/check_postgres.pl
+++ b/check_postgres.pl
@@ -189,6 +189,7 @@ our %msg = (
     'opt-psql-nofind'    => q{Could not find a suitable psql executable},
     'opt-psql-nover'     => q{Could not determine psql version},
     'opt-psql-restrict'  => q{Cannot use the --PSQL option when NO_PSQL_OPTION is on},
+    'pgagent-jobs-ok'    => q{No failed jobs},
     'pgbouncer-pool'     => q{Pool=$1 $2=$3},
     'pgb-backends-mrtg'  => q{DB=$1 Max connections=$2},
     'pgb-backends-msg'   => q{$1 of $2 connections ($3%)},
@@ -1985,6 +1986,8 @@ check_pgb_pool('maxwait') if $action eq 'pgb_pool_maxwait';
 ## Check how many clients are connected to pgbouncer compared to max_client_conn.
 check_pgbouncer_backends() if $action eq 'pgbouncer_backends';
 
+check_pgagent_jobs() if $action eq 'pgagent_jobs';
+
 ##
 ## Everything past here does not hit a Postgres database
 ##
@@ -2826,7 +2829,7 @@ sub validate_range {
                 ndie msg('range-seconds', 'critical')
             }
             $critical = $1;
-            if (length $warning and $warning > $critical) {
+            if (!$arg->{any_warning} and length $warning and $warning > $critical) {
                 ndie msg('range-warnbigtime', $warning, $critical);
             }
         }
@@ -2837,7 +2840,7 @@ sub validate_range {
         if (! length $critical and ! length $warning) {
             ndie msg('range-notime');
         }
-        if (length $warning and length $critical and $warning > $critical) {
+        if (!$arg->{any_warning} and length $warning and length $critical and $warning > $critical) {
             ndie msg('range-warnbigtime', $warning, $critical);
         }
     }
@@ -2865,7 +2868,7 @@ sub validate_range {
                 ndie msg('range-badsize', 'warning');
             }
             $warning = size_in_bytes($1,$2);
-            if (length $critical and $warning > $critical) {
+            if (!$arg->{any_warning} and length $critical and $warning > $critical) {
                 ndie msg('range-warnbigsize', $warning, $critical);
             }
         }
@@ -5429,6 +5432,73 @@ sub check_new_version_tnm {
 } ## end of check_new_version_tnm
 
 
+sub check_pgagent_jobs {
+    ## Check for failed pgAgent jobs.
+    ## Supports: Nagios
+    ## Critical and warning are intervals.
+    ## Example: --critical="1 hour"
+    ## Example: --warning="2 hours"
+
+    my ($warning, $critical) = validate_range({ type => 'time', any_warning => 1 });
+
+    # Determine critcal warning column contents.
+    my $is_crit = $critical && $warning
+        ? "GREATEST($critical - EXTRACT('epoch' FROM NOW() - (jlog.jlgstart + jlog.jlgduration)), 0)"
+        : $critical ? 1 : 0;
+
+    # Determine max time to examine.
+    my $seconds = do {
+        no warnings;
+        $warning > $critical ? $warning : $critical;
+    };
+
+    $SQL = qq{
+        SELECT jlog.jlgid
+             , job.jobname
+             , step.jstname
+             , slog.jslresult
+             , slog.jsloutput
+             , $is_crit AS critical
+          FROM pgagent.pga_job job
+          JOIN pgagent.pga_joblog     jlog ON job.jobid  = jlog.jlgjobid
+          JOIN pgagent.pga_jobstep    step ON job.jobid  = step.jstjobid
+          JOIN pgagent.pga_jobsteplog slog ON jlog.jlgid = slog.jsljlgid AND step.jstid = slog.jsljstid
+         WHERE slog.jslresult <> 0
+           AND EXTRACT('epoch' FROM NOW() - (jlog.jlgstart + jlog.jlgduration)) < $seconds
+    };
+
+    my $info = run_command($SQL);
+
+    for $db (@{$info->{db}}) {
+        my @rows = @{ $db->{slurp} } or do {
+            add_ok msg('pgagent-jobs-ok');
+            next;
+        };
+
+        if ($rows[0]{critical} !~ /^(?:[01]|\d+[.]\d+)$/) {
+            add_unknown msg('invalid-query', $db->{slurp});
+            next;
+        }
+
+        my ($is_crit, @msg);
+        my $log_id = -1;
+        for my $step (@rows) {
+            my $output = $step->{jsloutput} || '(NO OUTPUT)';
+            push @msg => "$step->{jslresult} $step->{jobname}/$step->{jstname}: $output";
+            $is_crit ||= $step->{critical};
+        }
+
+        (my $msg = join '; ' => @msg) =~ s{\r?\n}{ }g;
+        if ($is_crit) {
+            add_critical $msg;
+        } else {
+            add_warning $msg;
+        }
+    }
+
+    return;
+}
+
 sub check_pgbouncer_checksum {
 
     ## Verify the checksum of all pgbouncer settings
@@ -8885,6 +8955,31 @@ the fourth line gives the name of the database, plus the current
 max_client_conn. If more than one database has been queried, the one with the
 highest number of connections is output.
 
+=head2 B<pgagent_jobs>
+
+(C<symlink: check_postgres_pgagent_jobs>) Checks that all the pgAgent jobs
+that have executed in the preceding interval of time have succeeded. This is
+done by checking for any steps that have a non-zero result.
+
+Either C<--warning> or C<--critical>, or both, may be specified as times, and
+jobs will be checked for failures withing the specified periods of time before
+the current time. Valid units are seconds, minutes, hours, and days; all can
+be abbreviated to the first letter. If no units are given, 'seconds' are
+assumed.
+
+Example 1: Give a critical when any jobs executed in the last day have failed.
+
+  check_postgres_pgagent_jobs --critical=1d
+
+Example 2: Give a warning when any jobs executed in the last week have failed.
+
+  check_postgres_pgagent_jobs --warning=7d
+
+Example 3: Give a critical for jobs that have failed in the last 2 hours and a
+warning for jobs that have failed in the last 4 hours:
+
+  check_postgres_pgagent_jobs --critical=2h --warning=4h
+
 =head2 B<pgbouncer_checksum>
 
 (C<symlink: check_postgres_pgbouncer_checksum>) Checks that all the
diff --git a/t/02_bloat.t b/t/02_bloat.t
index cbb9c01..5614b88 100644
--- a/t/02_bloat.t
+++ b/t/02_bloat.t
@@ -5,7 +5,6 @@
 use 5.006;
 use strict;
 use warnings;
-use Data::Dumper;
 use Test::More tests => 30;
 use lib 't','.';
 use CP_Testing;
diff --git a/t/02_pgagent_jobs.t b/t/02_pgagent_jobs.t
new file mode 100644
index 0000000..acee568
--- /dev/null
+++ b/t/02_pgagent_jobs.t
@@ -0,0 +1,243 @@
+#!/usr/bin/perl -w
+
+## Test the "pgagent_jobs" action
+
+use 5.006;
+use strict;
+use warnings;
+use Test::More tests => 48;
+#use Test::More 'no_plan';
+use lib 't','.';
+use CP_Testing;
+
+my $cp    = CP_Testing->new({ default_action => 'pgagent_jobs' });
+my $dbh   = $cp->test_database_handle;
+my $S     = q{Action 'pgagent_jobs'};
+my $label = 'POSTGRES_PGAGENT_JOBS';
+my $tname = 'cp_pgagent_jobs_test';
+
+# Mock NOW().
+like $cp->run('foobar=12'), qr{Usage:}, "$S fails when called with an invalid option";
+
+like $cp->run('-w=abc'), qr{must be a valid time}, "$S fails with invalid -w";
+like $cp->run('-c=abc'), qr{must be a valid time}, "$S fails with invalid -c";
+
+# Set up a dummy pgagent schema.
+$dbh->{AutoCommit} = 1;
+$dbh->do(q{
+    SET client_min_messages TO warning;
+    CREATE SCHEMA pgagent;
+
+    CREATE TABLE pgagent.pga_job (
+        jobid     serial  NOT NULL PRIMARY KEY,
+        jobname   text    NOT NULL
+    );
+
+    CREATE TABLE pgagent.pga_jobstep (
+        jstid     serial  NOT NULL PRIMARY KEY,
+        jstjobid  int4    NOT NULL REFERENCES pgagent.pga_job(jobid),
+        jstname   text    NOT NULL
+    );
+
+    CREATE TABLE pgagent.pga_joblog (
+        jlgid        serial       NOT NULL PRIMARY KEY,
+        jlgjobid     int4         NOT NULL REFERENCES pgagent.pga_job(jobid),
+        jlgstart     timestamptz  NOT NULL DEFAULT current_timestamp,
+        jlgduration  interval     NULL
+    );
+
+    CREATE TABLE pgagent.pga_jobsteplog (
+        jsljlgid   int4  NOT NULL REFERENCES pgagent.pga_joblog(jlgid),
+        jsljstid   int4  NOT NULL REFERENCES pgagent.pga_jobstep(jstid),
+        jslresult  int4      NULL,
+        jsloutput  text
+    );
+    RESET client_min_messages;
+});
+END { $dbh->do(q{
+    SET client_min_messages TO warning;
+    DROP SCHEMA pgagent CASCADE;
+    RESET client_min_messages;
+}) if $dbh; }
+
+like $cp->run('-c=1d'), qr{^$label OK: DB "postgres"}, "$S returns ok for no jobs";
+
+for my $time (qw/seconds minutes hours days/) {
+    like $cp->run("-w=1000000$time"), qr{^$label OK: DB "postgres"},
+        qq{$S returns ok for no pgagent_jobs with a unit of $time};
+    (my $singular = $time) =~ s/s$//;
+    like $cp->run("-w=1000000$singular"), qr{^$label OK: DB "postgres"},
+        qq{$S returns ok for no pgagent_jobs with a unit of $singular};
+    my $short = substr $time, 0, 1;
+    like $cp->run("-w=1000000$short"), qr{^$label OK: DB "postgres"},
+        qq{$S returns ok for no pgagent_jobs with a unit of $short};
+}
+
+my ($now, $back_6_hours, $back_30_hours) = $dbh->selectrow_array(q{
+    SELECT NOW(), NOW() - '6 hours'::interval, NOW() - '30 hours'::interval
+});
+
+# Let's add some jobs
+$dbh->do(qq{
+    -- Two jobs.
+    INSERT INTO pgagent.pga_job (jobid, jobname)
+    VALUES (1, 'Backup'), (2, 'Restore');
+
+    -- Each job has two steps.
+    INSERT INTO pgagent.pga_jobstep (jstid, jstjobid, jstname)
+    VALUES (11, 1, 'pd_dump'), (21, 1, 'vacuum'),
+           (12, 2, 'pd_restore'), (22, 2, 'analyze');
+
+    -- Execute each job twice.
+    INSERT INTO pgagent.pga_joblog (jlgid, jlgjobid, jlgstart, jlgduration)
+    VALUES (31, 1, '$back_6_hours',  '1 hour'),
+           (41, 1, '$back_30_hours', '5m'),
+           (32, 2, '$back_6_hours',  '01:02:00'),
+           (42, 2, '$back_30_hours', '7m');
+
+    -- Execute each step twice.
+    INSERT INTO pgagent.pga_jobsteplog (jsljlgid, jsljstid, jslresult, jsloutput)
+    VALUES (31, 11, 0, ''),
+           (31, 21, 0, ''),
+           (41, 11, 0, ''),
+           (41, 21, 0, ''),
+           (32, 12, 0, ''),
+           (32, 22, 0, ''),
+           (42, 12, 0, ''),
+           (42, 22, 0, '');
+});
+
+# There should be no failures.
+like $cp->run('-c=1d'), qr{^$label OK: DB "postgres"},
+    "$S returns ok with only successful jobs";
+
+# Make one job fail from before our time.
+ok $dbh->do(q{
+    UPDATE pgagent.pga_jobsteplog
+       SET jslresult = 255
+         , jsloutput = 'WTF!'
+     WHERE jsljlgid = 32
+       AND jsljstid = 22
+}), 'Make a job fail around 5 hours ago';
+
+like $cp->run('-c=2h'), qr{^$label OK: DB "postgres"},
+    "$S -c=2h returns ok with failed job before our time";
+
+like $cp->run('-c=6h'),
+    qr{^$label CRITICAL: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!},
+    "$S -c=6h returns critical with failed job within our time";
+
+like $cp->run('-w=2h'), qr{^$label OK: DB "postgres"},
+    "$S -w=2h returns ok with failed job before our time";
+
+like $cp->run('-w=6h'),
+    qr{^$label WARNING: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!},
+    "$S -w=6h returns warninf with failed job within our time";
+
+like $cp->run('-w=2h'), qr{^$label OK: DB "postgres"},
+    "$S -w=2h returns ok with failed job before our time";
+
+like $cp->run('-w=4h -c=2h'), qr{^$label OK: DB "postgres"},
+    "$S -w=4h =c=2h returns ok with failed job before our time";
+
+like $cp->run('-w=5h -c=2h'),
+    qr{^$label WARNING: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!},
+    "$S -w=5h =c=2h returns warning with failed job within our time";
+
+like $cp->run('-w=2h -c=5h'),
+    qr{^$label CRITICAL: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!},
+    "$S -w=2h =c=5h returns critical with failed job within our time";
+
+like $cp->run('-w=5h -c=5h'),
+    qr{^$label CRITICAL: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!},
+    "$S -w=5h =c=5h returns critical with failed job within our time";
+
+# Make a second job fail, back 30 hours.
+ok $dbh->do(q{
+    UPDATE pgagent.pga_jobsteplog
+       SET jslresult = 64
+         , jsloutput = 'OMGWTFLOL!'
+     WHERE jsljlgid = 42
+       AND jsljstid = 22
+}), 'Make a job fail around 29 hours ago';
+
+like $cp->run('-c=2h'), qr{^$label OK: DB "postgres"},
+    "$S -c=2h returns ok with failed job before our time";
+
+like $cp->run('-c=6h'),
+    qr{^$label CRITICAL: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!},
+    "$S -c=6h returns critical with failed job within our time";
+
+like $cp->run('-w=2h'), qr{^$label OK: DB "postgres"},
+    "$S -w=2h returns ok with failed job before our time";
+
+like $cp->run('-w=6h'),
+    qr{^$label WARNING: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!},
+    "$S -w=6h returns warninf with failed job within our time";
+
+like $cp->run('-w=2h'), qr{^$label OK: DB "postgres"},
+    "$S -w=2h returns ok with failed job before our time";
+
+like $cp->run('-w=4h -c=2h'), qr{^$label OK: DB "postgres"},
+    "$S -w=4h =c=2h returns ok with failed job before our time";
+
+like $cp->run('-w=5h -c=2h'),
+    qr{^$label WARNING: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!},
+    "$S -w=5h =c=2h returns warning with failed job within our time";
+
+like $cp->run('-w=2h -c=5h'),
+    qr{^$label CRITICAL: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!},
+    "$S -w=2h =c=5h returns critical with failed job within our time";
+
+like $cp->run('-w=5h -c=5h'),
+    qr{^$label CRITICAL: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!},
+    "$S -w=5h -c=5h returns critical with failed job within our time";
+
+# Go back futher in time!
+like $cp->run('-w=30h -c=2h'),
+    qr{^$label WARNING: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!},
+    "$S -w=30h -c=5h returns warning for older failed job";
+
+like $cp->run('-w=30h -c=6h'),
+    qr{^$label CRITICAL: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!; 64 Restore/analyze: OMGWTFLOL!},
+    "$S -w=30h -c=6h returns critical with both jobs, more recent critical";
+
+like $cp->run('-c=30h'),
+    qr{^$label CRITICAL: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!; 64 Restore/analyze: OMGWTFLOL!},
+    "$S -c=30h returns critical with both failed jobs";
+
+like $cp->run('-w=30h'),
+    qr{^$label WARNING: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!; 64 Restore/analyze: OMGWTFLOL!},
+    "$S -w=30h returns critical with both failed jobs";
+
+# Try with critical recent and warning longer ago.
+like $cp->run('-w=30h -c=6h'),
+    qr{^$label CRITICAL: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!; 64 Restore/analyze: OMGWTFLOL!},
+    "$S -w=30h -c=6h returns critical with both failed jobs";
+
+# Try with warning recent and critical longer ago.
+like $cp->run('-c=30h -w=6h'),
+    qr{^$label CRITICAL: DB "postgres" [()][^)]+[)] 255 Restore/analyze: WTF!; 64 Restore/analyze: OMGWTFLOL!},
+    "$S -c=30h -w=6h returns critical with both failed jobs";
+
+# Undo the more recent failure.
+ok $dbh->do(q{
+    UPDATE pgagent.pga_jobsteplog
+       SET jslresult = 0
+         , jsloutput = ''
+     WHERE jsljlgid = 32
+       AND jsljstid = 22
+}), 'Unfail the more recent failed job';
+
+like $cp->run('-c=6h'), qr{^$label OK: DB "postgres"},
+    "$S -c=6h should now return ok";
+
+like $cp->run('-c=30h'), qr{^$label CRITICAL: DB "postgres"},
+    "$S -c=30h should return critical";
+
+like $cp->run('-w=6h'), qr{^$label OK: DB "postgres"},
+    "$S -w=6h should now return ok";
+
+like $cp->run('-w=30h'), qr{^$label WARNING: DB "postgres"},
+    "$S -w=30h should return warning";
+
-- 
1.7.7.3



More information about the Check_postgres mailing list