[check_postgres] Change to show query details for slow query time

Mon Mar 1 16:14:05 UTC 2010

Hi list,

We needed to see the client port of a slow query so that we could trace 
the pid of the process holding the connection open. I modified 
check_postgres to display this:

sub check_query_time {
	## Check the length of running queries
	## Supports: Nagios, MRTG
	## It makes no sense to run this more than once on the same cluster
	## Warning and critical are time limits - defaults to seconds
	## Valid units: s[econd], m[inute], h[our], d[ay]
	## All above may be written as plural as well (e.g. "2 hours")
	## Can also ignore databases with exclude and limit with include
	## Limit to a specific user with the includeuser option
	## Exclude users with the excludeuser option
	##
	## When the longest query reaches the warning or critical limit, the
	## message also carries the backend pid, client address and client port
	## of that query, so the client process holding the connection can be
	## traced (e.g. via netstat on the client host).
	## Takes no arguments and returns nothing; results are reported through
	## add_ok / add_warning / add_critical, or %stats when $MRTG is set.

	my ($warning, $critical) = validate_range
		({
		  type             => 'time',
		  default_warning  => '2 minutes',
		  default_critical => '5 minutes',
		  });

	## Bail early if stats_command_string is off
	$SQL = q{SELECT setting FROM pg_settings WHERE name = 'stats_command_string'};
	my $info = run_command($SQL);
	for my $db (@{$info->{db}}) {
		if ($db->{slurp} =~ /off/) {
			ndie msg('qtime-fail');
		}
	}

	## Only the single longest-running non-idle query is fetched (LIMIT 1)
	$SQL = q{SELECT client_addr, client_port, procpid, datname, }.
		q{max(COALESCE(ROUND(EXTRACT(epoch FROM now()-query_start)),0)) AS query_length }.
		qq{FROM pg_stat_activity WHERE current_query <> '<IDLE>'$USERWHERECLAUSE }.
		q{GROUP BY client_addr, client_port, procpid, datname }.
		q{ORDER BY query_length DESC LIMIT 1};

	$info = run_command($SQL, { regex => qr{\s*.+?\s+\|\s+\-?\d+}, emptyok => 1 } );

	## No "my" on the loop variable: this aliases the file-scoped $db for the
	## duration of the loop. Presumably the add_* helpers rely on that - confirm
	## before changing it.
	for $db (@{$info->{db}}) {

		if ($db->{slurp} !~ /\w/ and $USERWHERECLAUSE) {
			$stats{$db->{dbname}} = 0;
			add_ok msg('no-match-user');
			next;
		}

		my $max = 0;

		## Connection details of the query that produced $max. Reset for each
		## target so values never leak from one target to the next.
		my $client_addr = '0.0.0.0';
		my $client_port = 0;
		my $procpid = 0;

		## Columns: client_addr | client_port | procpid | datname | query_length
		## The \s* after each pipe keeps padding out of the captures, so
		## skip_item sees the bare database name. client_addr and client_port
		## may be empty (NULL) - presumably for local socket connections.
	  SLURP: while ($db->{slurp} =~ /([^\s|]*)\s*\|\s*(\-?\d*)\s*\|\s*(\d+)\s*\|\s*(.+?)\s*\|\s*(\-?\d+)\s*/gsm) {
			my ($addr, $port, $pid, $dbname, $current) = ($1, $2, $3, $4, int $5);

			next SLURP if skip_item($dbname);

			## Remember the details only for the row we will actually report
			if ($current >= $max) {
				$max = $current;
				$client_addr = $addr if length $addr;
				$client_port = $port if length $port;
				$procpid = $pid;
			}
		}
		if ($MRTG) {
			$stats{$db->{dbname}} = $max;
			next;
		}

		$db->{perf} .= "maxtime=$max;";
		$db->{perf} .= "$warning" if length $warning;
		$db->{perf} .= ';';
		$db->{perf} .= "$critical" if length $critical;

		my $msg = msg('qtime-msg', $max);

		## Kept on a single line: no embedded newline in the alert text
		my $details = " procpid:$procpid, client_addr:$client_addr, client_port:$client_port."
			. " Run sudo netstat -anp and grep $client_port?";

		if (length $critical and $max >= $critical) {
			add_critical $msg . $details;
		}
		elsif (length $warning and $max >= $warning) {
			add_warning $msg . $details;
		}
		else {
			add_ok $msg;
		}
	}

	return;

} ## end of check_query_time

-- 
Giles Westwood, Systems Administrator,
Semantico, Floor 1, 21-23 Dyke Road, Brighton BN1 3FE
<http://www.semantico.com/>
<mailto:Giles.Westwood at semantico.com>
<tel:+44-1273-358231> <fax:+44-1273-723232>