#!/usr/bin/perl -w
our $VERSION = '2.4 (2013-01-12)';
#
# check_afs_quotas -- Monitor AFS quota usage under Nagios.
#
# Expects a host with the -H option or a volume with the -n option and checks
# usage versus quota either for all volumes on that server or only for the
# specific volume listed.  Exits with status 1 if the percentage of quota used
# is above the warning threshold and with status 2 if it is above the critical
# threshold.
#
# Written by Russ Allbery <rra@stanford.edu>
# Based on a script by Steve Rader
# Copyright 2010, 2013
#     The Board of Trustees of the Leland Stanford Junior University
#
# This program is free software; you may redistribute it and/or modify it
# under the same terms as Perl itself.

##############################################################################
# Modules and declarations
##############################################################################

require 5.006;

use strict;

use Getopt::Long qw(GetOptions);

# Number::Format is an optional dependency.  $FORMAT records whether it could
# be loaded (and format_bytes imported) so that the reporting code knows
# whether it may call format_bytes.
our $FORMAT = 0;
if (eval { require Number::Format; Number::Format->import ('format_bytes'); 1 }) {
    $FORMAT = 1;
}

##############################################################################
# Site configuration
##############################################################################

# Default alert thresholds, expressed as the percentage of quota in use.  The
# -w and -c command-line options override them.
our $WARNINGS = 85;
our $CRITICAL = 90;

# Default number of seconds (enforced with alarm) to wait for vos operations.
our $TIMEOUT = 300;

# Location of vos.  The first executable candidate in the list wins; if none
# of them is executable, fall back to the bare command name.  Keep vos on
# local disk so that monitoring doesn't have an AFS dependency.
our $VOS;
for my $candidate (qw(/usr/bin/vos /usr/sbin/vos /usr/local/bin/vos
                      /usr/local/sbin/vos)) {
    next unless -x $candidate;
    $VOS = $candidate;
    last;
}
$VOS = 'vos' unless $VOS;

##############################################################################
# AFS operations
##############################################################################

# Given a volume name, runs vos examine on it and returns a hash describing
# the volume.  Each key is set only if the matching line is present in the
# vos output:
#
#   size      Current size of the volume in KB
#   quota     Volume quota in KB
#   rwserver  Server holding the read-write volume (first component of the
#             host name only)
#   rwpart    Partition holding the read-write volume
#
# Returns the empty list if no size line was found for the volume, which is
# what the caller sees when vos examine fails or prints something we don't
# recognize.  If vos cannot be started at all, prints a Nagios critical
# message and exits with status 2.
sub volinfo {
    my ($volume) = @_;
    my %results;

    # Use a lexical file handle rather than a global bareword.  The list form
    # of open passes the volume name as a separate argument to vos.
    my $vex;
    unless (open ($vex, '-|', $VOS, 'examine', $volume, '-noauth')) {
        print "AFS CRITICAL - cannot contact server\n";
        exit 2;
    }
    local $_;
    while (<$vex>) {
        if (/^\Q$volume\E\s+\d+ (?:RW|RO|BK)\s+(\d+) K\s+On-line\s*$/) {
            $results{size} = $1;
        } elsif (/^\s+server ([^.\s]+)\.\S+ partition (\S+) RW Site\s*/) {
            $results{rwserver} = $1;
            $results{rwpart} = $2;
        } elsif (/^\s+MaxQuota\s+(\d+) K\s*$/) {
            $results{quota} = $1;
        }
    }

    # The exit status of vos is deliberately not checked here: a failed
    # examine is detected below by the absence of a size line.
    close ($vex);

    # Test for definedness, not truth, so that a volume reporting 0 K of
    # usage is still treated as successfully examined.
    return unless defined ($results{size});
    return %results;
}

# Given a server, and optionally a partition and a regex, runs vos listvol
# -long and collects usage and quota information for every on-line read-write
# volume listed.  If a partition is given, only that partition is listed; if
# a regex is given, volumes whose names don't match it are skipped.  Returns
# the results as a hash where the keys are volume names and the values are
# hashes containing 'size' and 'quota' keys and values in KB.
#
# If vos cannot be started, or if it exits with a failure status, prints a
# Nagios critical message and exits with status 2.
sub serverinfo {
    my ($server, $partition, $regex) = @_;
    my @command = ($VOS, 'listvol', '-server', $server, '-long', '-noauth');
    if (defined ($partition)) {
        push (@command, '-partition', $partition);
    }
    my $lvol;
    unless (open ($lvol, '-|', @command)) {
        print "AFS CRITICAL - cannot contact server\n";
        exit 2;
    }

    # $volume tracks the volume whose detail lines we are currently reading.
    # It is undefined whenever we are between volumes or inside a volume we
    # are not interested in, so that stray MaxQuota lines are ignored.
    my ($volume, %results);
    local $_;
    while (<$lvol>) {
        if (/^(\S+)\s+\d+ RW\s+(\d+) K\s+On-line\s*$/) {
            # Copy the captures before matching against the user's regex,
            # since that match would overwrite them.
            my $size;
            ($volume, $size) = ($1, $2);
            if (!defined ($regex) || $volume =~ /$regex/) {
                $results{$volume}{size} = $size;
            } else {
                $volume = undef;
            }
        } elsif (/^\s+MaxQuota\s+(\d+) K\s*$/ && defined $volume) {
            $results{$volume}{quota} = $1;
            $volume = undef;
        } elsif (/^\s*$/) { # next volume
            $volume = undef;
        }
    }

    # Closing a pipe waits for the command and returns false if it exited
    # with a non-zero status.  Without this check a vos failure produces an
    # empty result, which would be reported as zero volumes with no problems.
    unless (close ($lvol)) {
        print "AFS CRITICAL - cannot get volume list from $server\n";
        exit 2;
    }
    return %results;
}

##############################################################################
# Reporting
##############################################################################

# Summarize a single volume.  Takes the volume name and a hash ref to the
# volume data ('quota' and 'size' keys, both in KB; a missing key is treated
# as 0) and returns a list consisting of the integer percent of quota used
# and a text summary.  The percentage is 0 when the quota is 0 and is capped
# at 100 when usage exceeds the quota.
sub summarize_volume {
    my ($volume, $results) = @_;
    my $total = defined ($$results{quota}) ? $$results{quota} : 0;
    my $used  = defined ($$results{size})  ? $$results{size}  : 0;
    my $free  = $total - $used;
    $free = 0 if ($free < 0);
    my $percent = 0;
    if ($total > 0) {
        # Multiply before dividing.  Computing the ratio first loses
        # precision in floating point and makes int truncate one too low
        # (29 of 100 would come out as 28).
        $percent = int ((($total - $free) * 100) / $total);
    }
    if ($FORMAT) {
        # The values are in KB but format_bytes expects a byte count.
        ($total, $free, $used)
            = map { format_bytes ($_ * 1024, mode => 'iec') }
                ($total, $free, $used);
    }
    my $summary = "$volume $percent% used ($total quota, $used used,"
        . " $free free)";
    return ($percent, $summary);
}

##############################################################################
# Main routine
##############################################################################

# Report a usage error and exit with status 3.  All arguments are joined
# together to form the message.  It is printed to stdout in the form Nagios
# expects from a plugin and also sent to stderr in the conventional
# "program: message" form for people running this outside of Nagios.
sub syntax {
    my $message = join ('', @_);
    print "AFS UNKNOWN - $message\n";
    warn "$0: $message\n";
    exit 3;
}

# Parse command line options.  -c, -w, and -t override the site defaults
# directly; everything else is collected into lexical variables.
my ($help, $host, $partition, $regex, $version, $volume);
Getopt::Long::config ('bundling', 'no_ignore_case');
GetOptions ('c|critical=i'  => \$CRITICAL,
            'H|hostname=s'  => \$host,
            'h|help'        => \$help,
            'n|volume=s'    => \$volume,
            'r|regex=s'     => \$regex,
            'p|partition=s' => \$partition,
            't|timeout=i'   => \$TIMEOUT,
            'V|version'     => \$version,
            'w|warning=i'   => \$WARNINGS)
    or syntax ("invalid option");
if ($help) {
    print "Feeding myself to perldoc, please wait....\n";
    exec ('perldoc', '-t', $0) or die "Cannot exec perldoc: $!\n";
} elsif ($version) {
    print "check_afs_quotas $VERSION\n";
    exit 0;
}
syntax ("extra arguments on command line") if @ARGV;
syntax ("host or volume to check not specified")
    unless (defined ($host) || defined ($volume));
syntax ("regex specified without host to check")
    if (defined ($regex) && !defined ($host));
if ($WARNINGS > $CRITICAL) {
    syntax ("warning level $WARNINGS greater than critical level $CRITICAL");
}

# Reject a malformed regex up front so that it is reported as a usage error
# in Nagios format rather than as a Perl exception while matching volumes.
if (defined ($regex) && !defined (eval { qr/$regex/ })) {
    syntax ("invalid regular expression '$regex'");
}

# Set up the alarm.  If the check takes longer than the timeout, report a
# critical error.
# NOTE(review): exit runs Perl's normal handle cleanup, which may wait for a
# still-running vos child on an open pipe -- confirm that the timeout really
# ends the check when vos hangs.
$SIG{ALRM} = sub {
    print "AFS CRITICAL - network timeout after $TIMEOUT seconds\n";
    exit 2;
};
alarm ($TIMEOUT);

# Do the actual check.  If a volume was given, it takes precedence over any
# host and we report the full summary for that one volume.  Otherwise, check
# every matching volume on the server and report only the ones over a
# threshold.  In both cases a volume is critical or warning only when its
# percent used is strictly greater than the corresponding threshold.
if (defined ($volume)) {
    my %results = volinfo ($volume);
    unless (%results) {
        print "AFS CRITICAL - cannot get information for volume $volume\n";
        exit 2;
    }
    my ($percent, $summary) = summarize_volume ($volume, \%results);
    if ($percent > $CRITICAL) {
        print "AFS CRITICAL - $summary\n";
        exit 2;
    } elsif ($percent > $WARNINGS) {
        print "AFS WARNING - $summary\n";
        exit 1;
    } else {
        print "AFS OK - $summary\n";
        exit 0;
    }
} else {
    my %results = serverinfo ($host, $partition, $regex);
    my (@ok, @warning, @critical);

    # Walk the volumes in sorted order so that the alert text is the same
    # from run to run instead of following the hash's arbitrary key order.
    for my $volume (sort keys %results) {
        my ($percent, $summary)
            = summarize_volume ($volume, $results{$volume});

        # Keep only "<volume> <percent>%" for the multi-volume report.
        $summary =~ s/\s*used\s*\(.*//;
        if ($percent > $CRITICAL) {
            push (@critical, $summary);
        } elsif ($percent > $WARNINGS) {
            push (@warning, $summary);
        } else {
            push (@ok, $summary);
        }
    }
    if (@critical > 0) {
        print "AFS CRITICAL - ", join (', ', @critical), "\n";
        exit 2;
    } elsif (@warning > 0) {
        print "AFS WARNING - ", join (', ', @warning), "\n";
        exit 1;
    } else {
        print "AFS OK - ", scalar (@ok), " volumes okay\n";
        exit 0;
    }
}

##############################################################################
# Documentation
##############################################################################

=for stopwords
AFS Crellin Nagios Rader afs-monitor -hV util -vv

=head1 NAME

check_afs_quotas - Monitor AFS quota usage under Nagios

=head1 SYNOPSIS

B<check_afs_quotas> [B<-hV>] [B<-c> I<threshold>] [B<-w> I<threshold>]
    [B<-t> I<timeout>]
    (B<-H> I<host> [B<-p> I<partition>] [B<-r> I<regex>] | B<-n> I<volume>)

=head1 DESCRIPTION

B<check_afs_quotas> is a Nagios plugin for checking free space in AFS
volumes based on the allocated quota for the volume.  It uses either C<vos
examine> or C<vos listvol> to obtain the quota and current usage for
either a single volume or all volumes on a server or server partition and
will return an alert if the percentage of quota used is over a threshold.
By default, it returns a critical error if the used quota is over 90% and
a warning if it is over 85% (changeable with the B<-c> and B<-w> options).

To check a single volume, specify the volume name with B<-n>.  To check a
whole server, specify the server name with B<-H>.  You can check only a
single partition on a server by using the B<-p> option to name the
partition in combination with B<-H>.  When checking an entire server, the
volumes checked can be filtered by using a regular expression (B<-r>).

If C<vos examine> or C<vos listvol> doesn't return within the timeout,
B<check_afs_quotas> will return a critical error.  The default timeout is
300 seconds, changeable with the B<-t> option.

B<check_afs_quotas> will always print out a single line of output, giving
the critical errors if any, otherwise giving the warnings if any,
otherwise either summarizing the volume usage (if B<-n> was given) or the
total number of volumes checked.

=head1 OPTIONS

=over 4

=item B<-c> I<threshold>, B<--critical>=I<threshold>

Change the critical percentage threshold to I<threshold>, which should be
an integer percentage.  The default is 90.

=item B<-H> I<host>, B<--hostname>=I<host>

The AFS file server whose volumes to check for quota usage.  Either this
option or the B<-n> option is required.

=item B<-h>, B<--help>

Print out this documentation (which is done simply by feeding the script
to C<perldoc -t>).

=item B<-n> I<volume>, B<--volume>=I<volume>

Check quota usage for a specific volume.  Either this option or the B<-H>
option is required.

=item B<-p> I<partition>, B<--partition>=I<partition>

Used in combination with the B<-H> option, limits quota checking to a
particular partition.  The partition can be given in any of the forms
recognized by the AFS tools (so both the partition letter C<a> and the
full partition name C</vicepa> will work).

=item B<-r> I<regex>, B<--regex>=I<regex>

When processing an entire server (B<-H>), ignore any volumes that don't
match the specified Perl regular expression.

=item B<-t> I<timeout>, B<--timeout>=I<timeout>

Change the timeout for the C<vos examine> or C<vos listvol> command.  The
default timeout is 300 seconds.

=item B<-V>, B<--version>

Print out the version of B<check_afs_quotas> and quit.

=item B<-w> I<threshold>, B<--warning>=I<threshold>

Change the warning percentage threshold to I<threshold>, which should be
an integer percentage.  The default is 85.

=back

=head1 EXIT STATUS

B<check_afs_quotas> follows the standard Nagios exit status requirements.
This means that it will exit with status 0 if there are no problems, with
status 2 if there is at least one volume over the critical threshold, and
with status 1 if there are no critical volumes but at least one volume
over the warning threshold.  For other errors, such as invalid syntax,
B<check_afs_quotas> will exit with status 3.

=head1 BUGS

The standard B<-v> verbose Nagios plugin option is not supported and
should be.  (For example, under B<-vv> we would want to show the actual
total, free, and used byte counts, not just the percentages.)

The usage message for invalid options and for the B<-h> option doesn't
conform to Nagios standards.

=head1 CAVEATS

This script does not use the Nagios util library or any of the defaults
that it provides, which makes it somewhat deficient as a Nagios plugin.
This is intentional, though, since this script can be used with other
monitoring systems as well.  It's not clear what a good solution to this
would be.

=head1 SEE ALSO

vos(1)

This script is part of the afs-monitor package, which includes various AFS
monitoring plugins for Nagios.  It is available from the AFS monitoring
tools page at L<http://www.eyrie.org/~eagle/software/afs-monitor/>.

=head1 AUTHORS

Written by Russ Allbery based on a similar script by Steve Rader.

=head1 COPYRIGHT AND LICENSE

Copyright 2010, 2013 The Board of Trustees of the Leland Stanford Junior
University

This program is free software; you may redistribute it and/or modify it
under the same terms as Perl itself.

=cut
