#!/bin/sh # This is a "repair binary" for watchdog that allows the tests to fail N times # within a given period before a reboot is called. Note that this "grace # period" should really be a functionality of watchdog itself, IMHO. # # Erik Rossen <rossen@prolibre.com> # If one does not change the default watchdog loop time of 10 secords, N=12 # will allow two minutes of failures before a reboot is signaled. N=12 # CMAXAGE is the age in seconds that the counter file may have before it is # considered too old and is wiped out. CMAXAGE=20 ERR=$1 COUNTER=/var/run/watchdog.counter if test -f $COUNTER; then COUNTERAGE=$(stat -c %Y $COUNTER) NOW=$(date +%s) if test $(($COUNTERAGE+$CMAXAGE)) -lt $NOW ; then rm $COUNTER else I=$(cat $COUNTER) fi fi I=${I:-0} I=$(($I+1)) logger -t "watchdog[$$]" "Failure $I of $N" logger -t "watchdog[$$]" "PROCESS LIST:" ps auxww | logger -t "watchdog[$$]" if test "$I" -ge "$N" ; then logger -t "watchdog[$$]" Too many failures. Signalling reboot. rm $COUNTER exit $ERR fi echo $I > $COUNTER exit 0