For constant monitoring at no cost, check out Big Brother. You can monitor processes, CPU, disk... fantastic product.
You would still need something to check some other things (you could implement them in BB, but that's up to you).
This was written back in 1995 and has not changed much since then. It gives you a quick snapshot of what has happened in the last 24 hours (it runs once every 24 hours). It could be improved, but one does not always have the time! Sorry it's in csh, but it's more for knowledge than use. All the servers (over 60) send the snapshot report to one server, where a cron job collects all the info; if it doesn't find a report from a server in its list, it reports that too.
This is the script that runs on each server. If nothing is wrong, it sends a zero-byte file (which proves the network connection is working). If you want the other script, post back. This works on Solaris 2.6 and does not have to run as root. I also have one for HP.
#!/bin/csh -f
# Created 09/21/95 HOG A script file to gather info from all Unix Systems
#
# Builds a per-host report file named SI<node>.<mmddyy> under $defdir.
# The checks further down append one line per problem found, so an empty
# report means nothing was flagged.
# ========SET UP SYMBOLS===========================
set defdir = "/tmp"
set node = "`hostname`"
set today = "`date '+%m%d%y'`"
set theday = "`date '+%d'`"
# Deliberately NOT wrapped in double quotes: %e pads days 1-9 with a
# leading blank ("Sep  5"). Letting csh split the output into words and
# rejoining them via "$thedate" yields a single blank ("Sep 5"), which is
# the form the awk-built boot date is compared against.
set thedate = `date '+%b %e'`
set themonth = "`date '+%m'`"
set theyear = "`date '+%Y'`"
set tmpfile = "$defdir/SI$node.$today"
# The * is kept literal here (double quotes) and is only globbed where the
# variable is later used unquoted.
set y2kfile = "/opt/Y2K/sunscan.$node-$theyear.$themonth.$theday-*/README.$node"
set dailycopy = "oven:/usr/local/sysconfigs/daily"
set monthcopy = "oven:/usr/local/sysconfigs"
# Free-space threshold compared against the df -k figure (kilobytes).
set fsmin = "5000"
# Remove reports left by earlier runs. Without nonomatch an unmatched glob
# is a csh error ("No match") that aborts the script when no old report
# exists; with it the pattern is passed through and rm -f ignores it.
set nonomatch
/usr/bin/rm -f $defdir/SI$node.*
unset nonomatch
/usr/bin/touch $tmpfile
#
# ========RUN FOLLOWING COMMANDS ON ALL SYSTEMS====
# Check uptime: if the boot date printed by who -b (fields 4 and 5 of its
# output) equals the current date string, log the whole who -b line.
set bootday = `/usr/bin/who -b | awk '{print $4, $5}'`
if ("$bootday" == "$thedate") then
    echo "`/usr/bin/who -b`" >> $tmpfile
endif
# Check space on local filesystems.
# Candidate devices: first column of df -bl for entries containing "dsk"
# but not "vol".
set localdisks = `df -bl | grep dsk | grep -v vol | /usr/bin/awk '{print $1}'`
foreach disk ($localdisks)
    # Field 4 of the df -kl line is used as the free-space figure and
    # field 6 as the name reported for the filesystem.
    set kbfree = `/usr/bin/df -kl $disk | /usr/bin/awk '/dsk/ {print $4}'`
    set mountpt = `/usr/bin/df -kl $disk | /usr/bin/awk '/dsk/ {print $6}'`
    if ($kbfree < $fsmin) then
        echo "$mountpt is at $kbfree kilobytes" >> $tmpfile
    endif
end
# Check for OV status: count the RUNNING lines in the ovstatus output and
# report when there are fewer than 5.
if (-e /opt/OV/bin/ovstatus) then
    # casc-nms128 is skipped: OV is loaded there but not running.
    if ("$node" != "casc-nms128") then
        set ovstat = `/opt/OV/bin/ovstatus | /usr/bin/grep -c RUNNING`
        if ($ovstat < 5) then
            # The message must stay on one line: csh does not accept a
            # newline inside a double-quoted string.
            echo "Only $ovstat OV processes running. Please check." >> $tmpfile
        endif
    endif
endif
# Check on meta disks: take the second field of every "State:" line in the
# metastat output and count those that are not "Okay".
if (-e /usr/opt/SUNWmd/sbin/metastat) then
    # Kept on a single line: csh rejects a newline inside the quoted awk
    # program within a command substitution.
    set mdstat = `/usr/opt/SUNWmd/sbin/metastat | /usr/bin/grep "State:" | awk '{print $2}' | /usr/bin/grep -cv "Okay"`
    if ($mdstat > 0) then
        /usr/bin/echo "$mdstat errors found in metastat" >> $tmpfile
    endif
endif
# Check on volume manager disks - a normal user cannot run vxdisk, so the
# vxprint output is searched instead: count lines mentioning "recover"
# (any case) and report a non-zero count.
if (-e /usr/sbin/vxprint) then
    set recovering = `/usr/sbin/vxprint | grep -ic recover`
    if ($recovering != 0) then
        /usr/bin/echo "$recovering errors found in vxprint" >> $tmpfile
    endif
endif
# Check prtdiag for errors: the platform-specific prtdiag is located via
# uname -i, and a problem is logged when its output lacks the line
# "No failures found in System".
set diagcmd = "/usr/platform/`uname -i`/sbin/prtdiag"
if (-e $diagcmd) then
    set cleanlines = `$diagcmd | grep -c "No failures found in System"`
    if ($cleanlines == 0) then
        /usr/bin/echo "Prtdiag shows system errors" >> $tmpfile
    endif
endif
#
# Ship the daily report to the collection host.
/usr/bin/rcp $tmpfile $dailycopy
# On the first of the month, every host except oven also copies its Y2K
# scan README to the monthly area.
if ("$theday" == "01" && "$node" != "oven") then
    if (-e /opt/Y2K) then
        # $y2kfile contains a glob. Resolve it with nonomatch set so that a
        # missing scan directory does not raise "No match", which would
        # abort the script before the $node.all handling further down.
        set nonomatch
        set y2kfound = ($y2kfile)
        unset nonomatch
        # With no match the pattern comes back unchanged and fails -e.
        if (-e "$y2kfound[1]") then
            /usr/bin/rcp $y2kfound $monthcopy
        endif
    endif
endif
# If a /tmp/<node>.all file is present, copy it to the monthly area and
# rename it to <node>.old.
if (-e /tmp/$node.all) then
    /usr/bin/rcp /tmp/$node.all $monthcopy
    /usr/bin/mv /tmp/$node.all /tmp/$node.old
endif
# ==================================================
exit