#!/usr/bin/perl
#
# Filename: /usr/local/etc/pingsweep.pl
#
# Purpose: ping a list of network electronics and report (email) a
# list of the devices that do not respond.
#
# Author: Michael McNamara
#
# Language: Perl
#
# Date: May 6, 2003
#
# Changes:
#
# Sep 20, 2007: cleanup code and comments in order to publish code to web
# Dec 30, 2005: cleaned up DEBUG and logging statements for troubleshooting
# Mar 18, 2005: added file logging for troubleshooting and monitoring
# Feb 19, 2005: added threshold logic to prevent flapping notifications
# Feb 17, 2005: fixed logic regarding alert notification and ping retry
# Jan 17, 2005: added code that will retry ping if # devices alarming is < 10 devices
# July 14, 2004: script will now report when devices are once again reachable
# Jan 5, 2004: record downtime for monthly computation of downtime %
# Jan 2, 2004: changed alert notifications so that device must be
# down for at least 60 seconds before email notification
# Also added 15 minute alert email notification
# Notifications now: > 1 min and < 5 min
# at 15 min
# at 60 min
# Dec 30, 2003: changed email notifications to HTML based messages
#
# Notes:
# Over the years I've built up this Perl script from a simple shell
# that would just call FPING into an all-encompassing tool which
# still performs its basic function yet goes well beyond its original
# design. What does it have now that it didn't have then, you ask?
# Well, let's name a few: debug output, comments are now allowed in
# the input files, ping retry, flapping notification thresholds, etc.
# It currently has timing logic to initiate both email
# notification and pager notification. It now has threshold logic
# to prevent those few occasions where I would find a few hundred
# alert messages in my inbox because of some flapping condition. It
# also has the ability to record the outages so that a downtime
# report can be generated from that data (reportdowntime.pl).
#
# This script requires fping; http://fping.sourceforge.net/
#
# This script was written and designed to run on a Fedora Core Linux
# server. It can probably be adapted to run on a Windows server.
#
# Load Modules
use strict;
use warnings;
use Fcntl qw(:DEFAULT :flock);
# Declare Constants
use constant DEBUG => 1; # DEBUG settings
use constant CONSOLE => 0; # CONSOLE DEBUG settings
# Global state shared by the subroutines of this script.

# Sweep results and the timestamps captured by initialize().
my ($result, $sdate, $date, $time, $currentTime);
my @devices;                        # device names reported by fping

# Timing knobs, in seconds.
my $retry = 10;                     # wait before re-pinging failed devices
my $pause = 7;                      # startup delay slept by initialize()

# Per-device hashes, keyed by device name.
my %disk;                           # previously-down devices read from $datafile
my %data;                           # devices currently down, after processing
my %live;                           # devices fping reported down on this run
my %notify;                         # devices to include in notifications
my %thres;                          # devices tracked for threshold checking

my $FPING = "/usr/local/etc/fping"; # location of the fping executable

# Email addresses and subject line (addresses are intentionally blank here).
my $MAILTO      = '';               # who should receive the email alerts
my $MAILFROM    = '';               # who the alerts appear to come from
my $MAILSUBJECT = 'ALERT: Urgent Network Status Report';
my $PAGERTO     = '';               # email-to-pager addresses

# Flag/count variables driving the alert logic.
my $NOTIFY;                         # email notification (deliberately left undef)
my ($PAGERNOTIFY, $SOMETHINGTODO, $RECOVER) = (0, 0, 0);

# Program identification.
my $program = "pingsweep.pl";
my $version = "v1.2";
my $author  = "Michael McNamara";
my $purpose = "This Perl script is designed to poll the network electronics and report any failures.";
my $usage   = "Usage: pingsweep.pl \[debug\]\n";

# Data file paths and locations.
my $electronics   = "/usr/local/etc/pingsweep.txt";
my $datafile      = "/usr/local/etc/pingsweep.dat";
my $flagfile      = "/tmp/pingsweep.flg";
my $recordfile    = "/usr/local/etc/pingsweeprecord.dat";
# The threshold data lives in the same file as the downtime record.
my $thresholdfile = "/usr/local/etc/pingsweeprecord.dat";
my $logfile       = "/usr/local/etc/logs/pingsweep.log";
my $lockfile      = "/tmp/pingsweep.tmp";
my $templock      = "/tmp/pingsweep.tmp.$$";

# Notification intervals (seconds) and thresholds.
our $firstalert      = 15 * 60;     # first alert notification
our $secondalert     = 60 * 60;     # second alert notification
# NOTE(review): 5*50 is 250 seconds, not five minutes - confirm this is intended.
our $minalert        = 5 * 50;      # minimum alert notification
our $largealert      = 10;          # device count that triggers an immediate alert
our $thresholdTime   = 120 * 60;    # window used for threshold checking
our $thresholdEvents = 3;           # number of events allowed within the window
our $NOW             = scalar localtime; # start-up date and time as a string
###########################################################################
# Signal and exit handling: remove whichever lock/temp files have been
# registered in the package variables $main::Cleanfile, $main::Cleanfile2
# and $main::Cleanfile3 before the process goes away.

# Unlink every defined path in the argument list; undefined entries are
# skipped.
sub _unlink_if_set {
    for my $path (@_) {
        unlink $path if defined $path;
    }
    return;
}

# INT and TERM: clean up all three files, warn, and exit with status 1.
$SIG{INT} = $SIG{TERM} = sub {
    my ($signame) = @_;
    _unlink_if_set($main::Cleanfile, $main::Cleanfile2, $main::Cleanfile3);
    warn "$NOW: ERROR: Bailout after SIG $signame\n";
    exit 1;
};

# HUP: same clean-up, but terminate through die.
$SIG{HUP} = sub {
    my ($signame) = @_;
    _unlink_if_set($main::Cleanfile, $main::Cleanfile2, $main::Cleanfile3);
    die "$NOW: ERROR: Bailout after SIG $signame\n";
};

# Normal exit: only the first two files are removed here; $? and $! are
# localized so the clean-up cannot disturb the exit status.
END {
    local ($?, $!);
    _unlink_if_set($main::Cleanfile, $main::Cleanfile2);
}
###########################################################################
# B E G I N M A I N
###########################################################################
# One complete sweep, executed top to bottom. Each step is called with an
# explicit empty argument list; the older "&name;" form silently hands the
# caller's @_ to the subroutine.

# Initialize program environment (timestamps, lock, logfile)
initialize();
# Load threshold data and identify problem devices
load_thres();
# Ping devices and get list of devices not responding
get_data();
# Load devices that failed to respond previously
load_data();
# Compare the list of devices responding against the old list
comp_data();
# Check to see if email alerts should be sent
check_alert();
# Store list of devices not responding for later use
write_data();
# Issue email for list of devices not responding or now responding
alert_notify();
# Issue pager email alert if more than 10 devices for 5 minutes
alert_pager();
# Store list of devices that are now responding with their total
# accumulated downtime for later reporting with reportdowntime.pl
record_downtime();
# Release the lock files, close the logfile and publish the data file
finishup();
exit 0;
###########################################################################
# E N D M A I N
###########################################################################
###########################################################################
# Subroutine initialize
#
# Purpose: perform all the initialization steps and procedures
###########################################################################
sub initialize {
    # Sets the global timestamps ($sdate, $date, $time, $currentTime), takes
    # the run lock via lockit(), opens and exclusively locks the logfile on
    # the LOGFILE handle, then sleeps $pause seconds.
    #
    # Takes no arguments and returns nothing. Dies if the logfile cannot be
    # opened or locked.

    # Initialize time and date
    $sdate = localtime;
    ($date, $time) = get_time();
    $currentTime = time;

    lockit($lockfile, $templock);

    # Open the logfile for appending. The bareword handle is kept because
    # the handle name LOGFILE is shared with the rest of the script. A failed
    # open is reported as such, instead of surfacing later as a lock error.
    open(LOGFILE, '>>', $logfile)
        or die "$program script unable to open logfile $logfile: $!\n";
    flock(LOGFILE, LOCK_EX)
        or die "$program script unable to lock logfile $logfile\n";

    logit("DEBUG: #################################################################", 1);
    logit("DEBUG: (initialize) $program $version starting up...", 1);
    logit("DEBUG: (initialize) logfile $logfile is open and flock complete", 1);
    # Note: this message is logged although no flagfile is created here.
    logit("DEBUG: (initialize) creating flagfile $flagfile", 1);
    logit("DEBUG: (initialize) sleeping $pause seconds",1) if (DEBUG);

    # Sleep $pause seconds to offset other test scripts
    sleep $pause;
    return;
} #end sub
###########################################################################
# Subroutine finishup
#
# Purpose: wrap up the program and close any open files
###########################################################################
sub finishup {
    # Wraps up a run: closes the LOCK handle and removes both lock files,
    # closes the logfile, then copies the list of down devices to the web
    # server directory.
    #
    # Takes no arguments and returns nothing. A failed copy is reported with
    # warn rather than being silently ignored.
    my $webcopy = '/var/www/html/pingsweep.dat';

    logit("DEBUG: (finishup) all finished let's remove the flagfile.",1) if (DEBUG);

    # Release the run lock (the flagfile itself is not touched here)
    close LOCK;
    unlink($templock, $lockfile);

    logit("DEBUG: (finishup) closing logfile $logfile",1) if (DEBUG);
    # Close the log data file
    close(LOGFILE);

    # Copy the list of down devices for the webserver to access. The list
    # form of system() avoids the shell, and the source path comes from
    # $datafile instead of a duplicated literal.
    system('cp', '-f', $datafile, $webcopy) == 0
        or warn "$program: unable to copy $datafile to $webcopy (status $?)\n";
    return;
}
###########################################################################
# Subroutine get_data
#
# Purpose: ping a list of devices and record those that don't respond
###########################################################################
# Split raw fping output into a list of device names, one per line, with
# any CR/LF characters removed and empty lines dropped. Returns an empty
# list for undefined or empty output.
sub _split_fping_output {
    my ($output) = @_;
    return () unless defined $output;
    my @names;
    for my $line (split /\n/, $output) {
        $line =~ s/[\r\n]+//g;
        push @names, $line if length $line;
    }
    return @names;
}

# True when a wait status ($?) shows that fping could not be started, was
# killed by a signal, or exited with a status above 2 (fping uses 0-2 for
# "ran normally"; higher values are usage or system errors).
sub _fping_failed {
    my ($wait_status) = @_;
    return 1 if $wait_status == -1;
    return 1 if $wait_status & 127;
    return ( ($wait_status >> 8) > 2 ) ? 1 : 0;
}

sub get_data {
    # Pings every device listed in $electronics with fping and records the
    # ones that do not respond in %live (device name => $currentTime).
    #
    # When fewer than $retry_limit devices fail, the script sleeps $retry
    # seconds and pings only those devices again; just the devices that
    # fail both passes are recorded. With more failures than that the first
    # result is recorded without a retry.
    #
    # Sets $SOMETHINGTODO to 1 when at least one device is recorded and to 0
    # otherwise. Updates the globals $result and @devices. Always returns 1.
    my $retry_limit = 10;   # at or above this many failures, skip the retry

    logit("DEBUG: (get_data) shelling out to exec system call to FPING",1) if (DEBUG);

    # Use FPING to ping all the electronics and store the list of devices
    # that don't respond in the string $result
    $result = `$FPING -u -f $electronics`;
    if (_fping_failed($?)) {
        # Without this check a missing or broken fping looks like "all up".
        logit("ERROR: (get_data) $FPING did not run cleanly (wait status $?)",1);
    }

    @devices = _split_fping_output($result);
    unless (@devices) {
        # There were no devices that failed to respond so there's nothing to do
        $SOMETHINGTODO = 0;
        logit("DEBUG: (get_data) there were no devices that failed to respond...",1) if (DEBUG);
        return 1;
    }

    $SOMETHINGTODO = 1;     # Set this flag for later processing
    chomp($result);
    logit("DEBUG: (get_data) some devices failed to respond to our ping",1) if (DEBUG);

    if (@devices < $retry_limit) {
        # Only a few devices failed: ping them again to confirm. A larger
        # failure count is treated as a wider problem and is not retried,
        # to avoid overloading the system and overrunning the run window.
        my $execstr = join(' ', $FPING, '-u', @devices) . ' ';
        logit("DEBUG: (get_data) some devices failed to responsd, retrying...",1) if (DEBUG);
        logit("DEBUG: (get_data) here's the execstr $execstr",1) if (DEBUG);
        logit("DEBUG: (get_data) sleeping the retry interval of $retry seconds",1) if (DEBUG);

        # Catch our breath for a few seconds before trying again
        sleep $retry;
        my $result2 = `$execstr`;

        if (_fping_failed($?)) {
            # The retry itself did not run; keep the first-pass list rather
            # than treating the missing output as a recovery.
            logit("ERROR: (get_data) retry of $FPING did not run cleanly (wait status $?)",1);
        }
        else {
            my @still_down = _split_fping_output($result2);
            unless (@still_down) {
                # Every device answered the second time around
                $SOMETHINGTODO = 0;
                logit("DEBUG: (get_data) all devices responded the SECOND time around.",1) if (DEBUG);
                return 1;
            }
            @devices = @still_down;
            logit("DEBUG: (get_data) some devices failed to respond a SECOND time",1) if (DEBUG);
        }
    }
    else {
        logit("DEBUG: (get_data) number of devices > 10 processing without retrying PING",1) if (DEBUG);
    }

    # Store the current time for each device that is considered down
    for my $name (@devices) {
        $live{$name} = $currentTime;
        logit("DEBUG: (get_data) live{$name} = $live{$name}",1) if (DEBUG);
    }
    return 1;
}
##########################################################################
# Subroutine alert_notify
#
# Purpose: compose an HTML based email message which details the devices
# that failed to response and also details those devices that
# are now reponding (recovered).
##########################################################################
sub alert_notify {
# Purpose: when $NOTIFY or $RECOVER is set, pipe an HTML email to sendmail
# that reports the devices that are down and the devices that recovered;
# otherwise only log that there is nothing to notify. Returns 1.
#
# NOTE(review): this listing looks damaged. The heredoc introducer and the
# HTML markup are missing, the table rows interpolate $name and $dTime
# although no loop assigning them is visible, and the quotes further down
# do not balance. Restore the body from a pristine copy before relying on it.
# Declare Local Variables
my $name;
my $oTime;
my $lTime;
my $dTime;
my $alert;
my $flag = 0;
# If there was some device that either failed to ping or has recovered
# (the two flags are combined with a bitwise OR)
if ($NOTIFY | $RECOVER) {
logit("DEBUG: (alert_notify) there is something todo",1) if (DEBUG);
# Lets open a filehandle to sendmail for an email alert; dies if the
# pipe cannot be opened
open(SENDMAIL, "| /usr/lib/sendmail $MAILTO") || die;
# We need to make sure we properly format the mail message: headers
# first, then a blank line before the HTML body
print(SENDMAIL "From: $MAILFROM\nTo: $MAILTO\nSubject: $MAILSUBJECT\n");
print(SENDMAIL "MIME-Version: 1.0\n");
print(SENDMAIL "Content-Type: text/html; charset=us-ascii\n\n");
print(SENDMAIL "\n");
print SENDMAIL <
Main Line Health Network Infrastructure Status Report
\n
Date : $sdate
EOF
# There was a device that failed to respond so we'll be alerting on it
if ($NOTIFY) {
logit("DEBUG: (alert_notify) within the down host section",1) if (DEBUG);
print SENDMAIL "
\n";
print SENDMAIL "The following devices failed to respond to an ICMP ping(s);
\n";
print SENDMAIL "
| Device Hostname or IP Address | "; print SENDMAIL "Time Down DD:HH:MM:SS | ";
print SENDMAIL "
| $name | \n"; print SENDMAIL "$dTime | \n"; print SENDMAIL "
\n";
print SENDMAIL "The following devices are now responding to ICMP ping(s);
\n";
print SENDMAIL "
| Device Hostname or IP Address | \n"; print SENDMAIL "Time Down DD:HH:MM:SS | \n";
print SENDMAIL "
| $name | \n"; print SENDMAIL "$dTime | \n"; print SENDMAIL "

\n

\n
EOF
close(SENDMAIL) || die;
} #end if ($NOTIFY | $RECOVER)
else {
logit("DEBUG: (alert_notify) there is nothing to notify",1) if (DEBUG);
}
return 1;
} #end sub alert_notify
########################################################################
# Subroutine get_time
#
# Purpose: calculate the time
########################################################################
sub get_time {
    # Returns a two-element list for the current local time:
    #   date - month-day-year, month and day without zero padding
    #   time - HH:MM:SS, every field zero padded to two digits
    my @clock = localtime;

    # localtime counts months from 0 and years from 1900
    my $stamp_date = join '-', $clock[4] + 1, $clock[3], $clock[5] + 1900;
    my $stamp_time = sprintf '%02d:%02d:%02d', @clock[2, 1, 0];

    return ($stamp_date, $stamp_time);
} #end sub get_time
########################################################################
# Subroutine load_data
#
# Purpose: load from file the list of devices that were previously down
########################################################################
sub load_data {
    # Loads the devices recorded in $datafile into %disk.
    #
    # Each data line is split on whitespace into name, original timestamp,
    # last-checked timestamp, accumulated downtime and alert count; any
    # further fields are ignored. The entry is stored as
    #   $disk{name} = "name oTime lTime dTime alert"
    # Blank lines and lines starting with '#' are skipped.
    #
    # Takes no arguments and returns nothing. Dies if $datafile cannot be
    # opened.

    # A lexical handle and three-argument open replace the bareword DATA
    # handle, which is also the name Perl uses for the __DATA__ section.
    open(my $data_fh, '<', $datafile) or die "Can't open $datafile: $!\n";
    logit("DEBUG: (load_data) starting to load hash \%disk",1) if (DEBUG);

    # Walk through the data file one line at a time. The read operator is
    # required here: a bare "while ()" loops forever without reading.
    while (my $line = <$data_fh>) {
        # Skip blank (or whitespace-only) lines
        next if $line =~ /^\s*$/;
        # Skip comments
        next if $line =~ /^#/;

        # Read a line of data; fields beyond the fifth are thrown away
        my ($name, $oTime, $lTime, $dTime, $alert) = split(' ', $line);

        # Build data structure
        $disk{$name} = "$name $oTime $lTime $dTime $alert";
        logit("DEBUG: (load_data) reading disk{$name} = $name $oTime $lTime $dTime $alert",1) if (DEBUG);
    } #end while

    close($data_fh);
    return;
} #end sub load_data
########################################################################
# Subroutine load_thres
#
# Purpose: load threshold data from file of devices that were previously down
########################################################################
sub load_thres {
# Declare Local Variables
my $oTime; # Original timestamp when device went down
my $lTime; # Last timestamp when device was checked
my $rTime; # Recovery timestap when device recovered
my $dTime; # Amount of time the device has been down
my $name; # FQDN of the device being checked
my $alert; # Number of Pager alerts sent
my $index; # Index variable for hash array
my $tmwindow = $currentTime - $thresholdTime;
#
# Load threshold data from \$thresholdfile for threshold checking
#
# We can use the pingsweeprecord.dat file to check for thresholds
# The format of that file appears below. We can load all the data and then
# count the number of events within the threshold window, perhaps 60 minutes.
#
# Device Hostname Original Recover Total Down Date Time
# Time Time Time Time
#---------------------------------------------------------------------------------------
#
# switchhostname.domain 1107277623 1107277801 238 0:00:03:58 2-1-2005 12:11:01
#
# Open data file
open THRESDATA, "$thresholdfile" or die "Can't open $thresholdfile: $!\n";
logit("DEBUG: (load_thres) starting to load hash \%thres",1) if (DEBUG);
logit("DEBUG: (load_thres) threshold events = $thresholdEvents and time window = $tmwindow",1) if (DEBUG);
# Walk through data file
while (