radwatch.in   [plain text]


#!/bin/sh
######################################################################
#
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
#
#    Copyright (C) 2009 Network RADIUS SARL <info@networkradius.com>
#
######################################################################
#
# radwatch - Start the radius daemon and restart upon crash.
#
#  It also catches signals sent to it, and then re-sends those signals
#  to the radius server it is watching.
#
#  If you want to watch and re-start the server, we recommend
#  reading the file doc/supervise-radiusd.txt

#
#  The daemon's name lives in exactly one variable.  That keeps the
#  rest of the script simple, and avoids most issues with (say)
#  Debian re-naming "radiusd" to "freeradius".
#
name="radiusd"

#
#  Installation directories.  The @...@ tokens are placeholders,
#  presumably substituted when this template is turned into the
#  installed script.  prefix and exec_prefix come first because the
#  substituted values of the later directories may refer to them.
#
prefix="@prefix@"
exec_prefix="@exec_prefix@"
sbindir="@sbindir@"
sysconfdir="@sysconfdir@"
localstatedir="@localstatedir@"
logdir="@logdir@"

#
#  Paths derived from the above:
#    rundir   - directory holding the PID files
#    pid_file - PID of the server being watched
#    log_file - log written by this script and by the server it starts
#
rundir="${localstatedir}/run/${name}"
pid_file="${rundir}/${name}.pid"
log_file="${logdir}/${name}_safe.log"

#
#  Work out how to ask tail for the last N lines.  "tail -n N" is
#  tried first; if that does not work here, fall back to "tail -N".
#
#  The line count is appended directly to $tail wherever it is used
#  (e.g. ${tail}20), which is why the first form ends in a space and
#  the second one ends in a dash.
#
tail="tail -n "
echo foo | ${tail}1 > /dev/null 2>&1 || tail="tail -"

#
#  Full path of the server binary, and the directory which holds
#  its configuration files.
#
RADIUSD="${sbindir}/${name}"
RADDBDIR="${sysconfdir}/raddb"

#
#  To have email sent when the server is restarted, set MAILTO to an
#  email address.  This part of the functionality hasn't been well
#  tested, so please test it before putting it into production.
#
#  It also presumes that you have a functioning mail system on
#  the machine running RADIUS.  You will need to check that the
#  "mail" command exists, and sends mail to the address below, e.g.:
#
#     echo test | mail -s "Testing" $MAILTO
#
#  If you receive the message, then enable MAILTO.  Otherwise, fix
#  your mail system so that it delivers mail.
#
#  Leaving MAILTO empty disables the email notifications.
#
MAILTO=""

#
#  Remember the command-line arguments so that they can be passed on
#  to the server every time it is started.  This allows e.g. "-X" to
#  be given when testing this script.
#
#  "$*" is used because the arguments are stored in one plain string;
#  assigning "$@" to a scalar has no well-defined meaning.
#
ARGS="$*"

#
#  There is nothing to watch if the server binary or its main
#  configuration file is missing.  Exit quietly, with success.
#
#  The operands are quoted so that an empty value is seen as a missing
#  file, instead of turning the check into "test -f", which is true.
#
test -f "$RADIUSD" || exit 0
test -f "$RADDBDIR/radiusd.conf" || exit 0

#
#  Remove the limit on the size of core files for this shell and for
#  the processes it starts.
#
ulimit -c unlimited

#
#  See if the PID file exists.  It might have been left over after
#  a crash, or it might be because the RADIUS server is still running.
#
#  All expansions are quoted: an empty or unset $pid_file must count
#  as "no PID file", rather than collapsing the check to "test -f"
#  (which is true) and leaving "cat" to read from standard input.
#
if test -f "$pid_file"
then
    PID=`cat "$pid_file"`

    #
    #  Check if the process exists, AND if it has the right name.
    #
    #  An empty PID file cannot refer to a running process, so ps is
    #  not run for it and the file is handled as stale below.
    #
    if test -n "$PID" && ps -p "$PID" | grep "$name" > /dev/null
    then
        echo "`date +'%a %b %e %H:%M:%S %Y'` : Fatal: A $name process already exists at PID $PID.  We cannot start another one." >> "$log_file"
        echo "A $name process already exists"
        exit 1
    fi

    #
    #  A RADIUS server doesn't exist.  Delete the stale PID file.
    #
    rm -f "$pid_file"

    #
    #  "rm -f" does not report every failure, so look again to see
    #  whether the file is really gone before carrying on.
    #
    if test -f "$pid_file"
    then
        echo "`date +'%a %b %e %H:%M:%S %Y'` : Fatal: Cannot remove the pid file: $pid_file" >> "$log_file"
        echo "Fatal error: Cannot remove the pid file: $pid_file"
        echo "Please remove it manually and start $0 again"
        echo "$name daemon not started"
        exit 1
    fi
fi

#
#  Book-keeping used by the restart loop:
#
#    started    - timestamp of the most recent start ("0" = never started)
#    restarts   - number of restarts counted since the last email
#    last_email - hour stamp of the last email sent ("0" = none sent yet)
#    now        - hour stamp taken when a restart is being reported
#
started=0 restarts=0 last_email=0 now=0

#
#  Save our PID (the PID of this watcher script, not of the server)
#  next to the server's own PID file.
#
#  The target is quoted so that a run directory containing whitespace
#  does not turn the redirection into an error.
#
echo "$$" > "${rundir}/${name}_safe.pid"

#
#  Loop forever, or until we're told to exit via a signal.
#
#  Each pass starts the server in the background, records its PID,
#  waits for it to exit, and then decides from the exit status whether
#  to stop (normal exit, or a signal sent to this script) or to go
#  round again and restart the server.
#
#  Backticks and "expr" are kept, rather than $(...) and $((...)),
#  presumably so that the script keeps working with older /bin/sh
#  implementations.
#
while :
do
    #
    #  The first time around, just start the server.
    #  After that, see if we are re-starting in the same second
    #  as the last time.  If so, sleep for a second.  Otherwise,
    #  if we're not starting in the same second, then just restart
    #  the server.
    #
    #  This helps prevent CPU spikes when something goes catastrophically
    #  wrong, and the server re-starts continuously.  (e.g. disk full, etc.)
    #
    now_s=`date +'%a %b %e %H:%M:%S %Y'`
    if test "$started" != "0"
    then
        #  Send mail when the server is being restarted
        if test "$MAILTO" != ""
        then
            # don't print minutes and seconds: cheap way
            # of sending email only once an hour.
            now=`date +'%a %b %e %H %Y'`
            restarts=`expr $restarts + 1`

            #
            #  The message body is the here-document, which the shell
            #  connects straight to mail's standard input.  Nothing is
            #  piped into mail: a "cat |" in front of it would only read
            #  this script's own standard input, and could block there.
            #
            #  $MAILTO is deliberately left unquoted, so that it may
            #  hold more than one address.
            #

            # send email the first time it restarts
            if test "$last_email" = "0"
            then
                mail -s "ERROR - $name died, restarting.." $MAILTO <<EOF
$name has restarted unexpectedly at $now

See $log_file for details.  Last 20 lines are:

----------------------------------------------------------------------
`${tail}20 $log_file`
EOF
                last_email="$now"
                restarts=0
            else
                #  Send email only once every hour (or so)
                if test "$now" != "$last_email"
                then
                    mail -s "ERROR - $name died, restarting.." $MAILTO <<EOF
$name has restarted $restarts times since last email at $last_email

See $log_file for details.  Last 100 lines are:

----------------------------------------------------------------------
`${tail}100 $log_file`
EOF
                    last_email="$now"
                    restarts=0
                fi
            fi
        fi

        if test "$started" = "$now_s"
        then
            #  Allow us to be killed
            trap - HUP INT QUIT TERM TSTP
            sleep 1
        fi
    fi
    started="$now_s"

    #
    #  Note any signal which is sent to *us* while the server runs.
    #  $mysig stays empty unless one of the traps fires.
    #
    #  NOTE(review): the numbers are only used as signal numbers when
    #  "wait" returns 0 (see below).  Signal numbers for TSTP differ
    #  between platforms, so 18 may not be right everywhere - confirm.
    #
    mysig=
    trap 'mysig=1' HUP
    trap 'mysig=2' INT
    trap 'mysig=3' QUIT
    trap 'mysig=15' TERM
    trap 'mysig=18' TSTP

    #
    #  Start the server in the background.  The status of "eval" has
    #  to be saved straight away: after "PID=$!" the value of $? is the
    #  status of that assignment, which is always zero.
    #
    eval "$RADIUSD -f $ARGS < /dev/null >> $log_file 2>&1 &"
    start_status=$?
    PID=$!

    if test "$start_status" != "0"
    then
        echo "Failed to start $name.  See $log_file for details"
        echo "$name daemon not started"
        exit 1
    fi

    echo $PID > "$pid_file"

    #
    #  Wait for the process to exit.
    #
    wait $PID
    code=$?

    #
    #  On *BSD and Linux, sending *us* a signal results in "wait" returning
    #  with 128+sig.  On Solaris, it results in "wait" returning with "0".
    #
    #  If this happens, we reset our expectations here so that the code
    #  below will work correctly.
    #
    if test "$code" = "0"
    then
        if test "$mysig" != ""
        then
            code=`expr $mysig + 128`
        fi
    fi

    case "$code" in
        0)
            #  Clean exit, and no signal was sent to us: stop watching.
            echo "`date +'%a %b %e %H:%M:%S %Y'` : Info: $name exited normally.  Exiting" | tee -a "$log_file"
            break
            ;;

        127)
            #  Go round the loop again and start the server afresh.
            echo "`date +'%a %b %e %H:%M:%S %Y'` : Info: $name exited unexpectedly.  Restarting it." | tee -a "$log_file"
            ;;

        *)
            #
            #  The server exited of its own accord.
            #
            if test "$code" -lt 128
            then
                echo "`date +'%a %b %e %H:%M:%S %Y'` : Info: $name exited unexpectedly on exit code $code.  Restarting it." | tee -a "$log_file"
            else
                sig=`expr $code - 128`

                #
                #  Was the signal sent to us, or to the child process?
                #
                if test "$mysig" != ""
                then
                    #  It was sent to us: pass it on, and stop watching.
                    echo "`date +'%a %b %e %H:%M:%S %Y'` : Info: Caught signal $sig: Signalling $name to exit." | tee -a "$log_file"
                    kill -$sig $PID
                    break
                else
                    #  The server itself was killed: restart it.
                    echo "`date +'%a %b %e %H:%M:%S %Y'` : Info: $name exited unexpectedly on signal $sig.  Restarting it." | tee -a "$log_file"
                fi
            fi
            ;;
    esac
done

#
#  The loop has ended: remove the server's PID file and our own one,
#  then report success.  The operands are quoted so that paths
#  containing whitespace are removed as single files.
#
rm -f "$pid_file" "${rundir}/${name}_safe.pid"
exit 0