Script for monitoring disk failures


 
Thread Tools Search this Thread
Top Forums Shell Programming and Scripting Script for monitoring disk failures
# 1  
Old 02-16-2012
Script for monitoring disk failures

Hi ,

Does anyone have a script for monitoring disk failures on a Solaris 10/11 box? I need one today, ASAP. Thanks in advance.
# 2  
Old 02-16-2012
Things are not always the way you want them: C code
This is for Solaris 10: a quick hack I wrote for a SAN problem and then adjusted a few days ago. It runs forever, or until you signal it to stop. It creates daily report files in /tmp, with hourly entries that show accumulated changes in error counts. On a zoned box, run it in the global zone.

Code:
/* myiostat.c monitor iostat for disk errors  - 
   specifically edited for EDITH problem 2/14/2012 9:31:18 AM jmc
   compile: gcc -Wall -o myiostat myiostat.c
   usage: ./myiostat [interval]
   interval is the number of seconds between samples (default=60); runs until signaled to stop
   invoked as an at routine:
   
   at -k now << !
   cd /path/to/.....
   ./myiostat
! 
  
*/

#include <sys/types.h>
#include <sys/time.h>
#include <sys/stat.h>
#include <dirent.h>
#include <errno.h>
#include <limits.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <time.h>
#include <unistd.h>



/*
 * One record of the disk table (elist): the error counters seen for a
 * single disk.  Each counter pair holds [0] = the value stored when the
 * disk was first recorded and [1] = the value from the latest sample
 * (see add_data()).
 * NOTE(review): keep `disk` as the first member -- lookup code in this
 * file has relied on a plain name string being readable through an
 * elist_t pointer.
 */
typedef struct  // errors on disks
{
   // device name; an empty string marks an unused slot
   char disk[PATH_MAX];
   // time the slot was initialised, reset when a disk is first recorded
   time_t start;
   // hard error counts: [0] baseline, [1] latest
   int hard[2];
   // soft error counts: [0] baseline, [1] latest
   int soft[2];
   // transport error counts: [0] baseline, [1] latest
   int tran[2];
   // time of the most recent sample stored for this disk (0 if none)
   time_t now;
} elist_t;

// Indices into the token array produced by split() for one line of iostat
// output; token 0 is the device name.
// NOTE(review): presumably these follow the `iostat -e` column order
// (device, s/w, h/w, trn) -- confirm against real output.
#define HARD 2
#define SOFT 1
#define TRAN 3


// number of slots in the disk table
#define ELIST_SZ 256
// size of the token array handed to split()
#define MAX_SPLIT 10
// the disk table itself; slots with an empty disk name are unused
static
elist_t elist[ELIST_SZ];
// base pointer to the table, used by the sort/search/report helpers
elist_t *e=elist;
// monitor log stream; stays NULL until main() opens it
FILE *LOG=NULL;

/*
 * timestamp - format the current local time as "YYYY-MM-DD-HH:MM:SS.ffff".
 *
 * Returns a pointer to a static buffer that is overwritten by the next
 * call, so the result must be consumed (or copied) before calling again.
 *
 * The fraction is the microsecond count truncated to four digits.  It is
 * deliberately NOT rounded: rounding 0.99996 s up would print ".0000"
 * while the seconds field still shows the old second, making the stamp
 * appear to jump backwards by almost a second.
 */
char *timestamp(void)
{
    static char ts[64]={0x0};
    struct timeval tv;
    struct tm *tm_now;
    time_t secs;
    size_t len=0;

    if(gettimeofday(&tv, NULL)!=0)
    {
        /* fall back to whole-second resolution */
        tv.tv_sec=time(NULL);
        tv.tv_usec=0;
    }
    secs=tv.tv_sec;

    tm_now=localtime(&secs);
    if(tm_now!=NULL)
        len=strftime(ts, sizeof ts, "%Y-%m-%d-%H:%M:%S", tm_now);
    if(len==0)
    {
        /* no broken-down time available: emit the raw epoch seconds */
        int n=snprintf(ts, sizeof ts, "%ld", (long)secs);
        len=(n>0)? (size_t)n : 0;
    }

    snprintf(ts+len, sizeof ts - len, ".%04ld", (long)(tv.tv_usec/100));
    return ts;
}
/*
 * cknull - terminate the program when handed a NULL pointer.
 *
 * val: pointer to test (typically the result of fopen()/opendir()).
 * A non-NULL value returns normally.  On NULL, "Fatal error" plus the
 * current errno text is printed via perror() and the process exits
 * with status 1.
 */
void cknull(const void *val)
{
    if(val!=NULL)
        return;

    perror("Fatal error");
    exit(1);
}

/*
 * exit_handler - atexit() hook: record the exit in the log and send a
 * notification mail through mailx.
 *
 * The handler can run before the log has been opened (for example when
 * opening the log itself fails and the program exits), so LOG is only
 * used when it is non-NULL.  errno is captured first because
 * timestamp() calls library routines that may change it.
 */
void exit_handler(void)
{
    const int saved_errno=errno;

    if(LOG!=NULL)
    {
        fprintf(LOG, "%s existing errors %s\n", timestamp(), strerror(saved_errno));
        fflush(LOG);
    }
    (void)system(
     "echo 'myiostat exiting' | /usr/bin/mailx -s 'myiostat down' mcnamara@foo.com");
}


/*
 * sig_hnd - handler for the signals registered in main(): write
 * "<SIGNAME> received exiting" to the log and exit with status 1.
 *
 * sig: the signal number being delivered.
 *
 * The message is written with write() directly on the log's descriptor.
 * If the log has not been opened yet (LOG is NULL) the message goes to
 * stderr instead of dereferencing a NULL stream.  Leaving through
 * exit() means the atexit() handler still runs.
 */
void sig_hnd(int sig)
{                    // yes, I know libc routines are bad.
    static const char lead[]="  \n";
    char signame[SIG2STR_MAX]={0x0};
    char tmp[256]={0x0};
    int fd=(LOG!=NULL)? fileno(LOG) : STDERR_FILENO;

    signal(sig, SIG_IGN);   // do not re-enter while shutting down
    if(sig2str(sig, signame)!=0)
        snprintf(signame, sizeof signame, "%d", sig);   // unknown signal: show the number
    snprintf(tmp, sizeof tmp, "%s received exiting\n", signame);
    // write the whole lead-in (the old length of 2 dropped its newline)
    (void)write(fd, lead, sizeof lead - 1);
    (void)write(fd, tmp, strlen(tmp));
    perror("exit with error");
    exit(1);
}

/*
 * compar - qsort()/bsearch() ordering for disk table entries.
 *
 * A, B: pointers to elist_t records.
 * Returns the strcmp() result of the two disk names, so entries sort
 * by name and unused slots (empty names) come first.
 */
int compar(const void *A, const void *B)
{
    const elist_t *lhs=A;
    const elist_t *rhs=B;

    return strcmp(lhs->disk, rhs->disk);
}

/*
 * sort_elist - sort the whole disk table (all ELIST_SZ slots) by disk
 * name using compar().
 */
void sort_elist(void)
{
    qsort(e, ELIST_SZ, sizeof(elist_t), compar);
}

/*
 * init_elist - reset every slot of the disk table: empty name, all
 * counters zero, `now` cleared and `start` set to the current time.
 */
void init_elist(void)
{
    const time_t started=time(NULL);
    elist_t *const end=e + ELIST_SZ;
    elist_t *slot;

    for(slot=e; slot!=end; ++slot)
    {
        memset(slot->disk, 0x0, PATH_MAX);
        slot->start=started;
        slot->now=0;
        slot->hard[0]=0;
        slot->hard[1]=0;
        slot->soft[0]=0;
        slot->soft[1]=0;
        slot->tran[0]=0;
        slot->tran[1]=0;
    }
}
// delimit fields

/*
 * split - break a line into delimiter-separated tokens, in place.
 *
 * result:  array with room for MAX_SPLIT pointers.  On return it holds
 *          the tokens in order followed by NULL in every unused slot,
 *          so it is always NULL-terminated and never carries tokens
 *          left over from a previous call.
 * working: the line to split; strtok() overwrites delimiters in it and
 *          the stored tokens point into this buffer.
 * delim:   set of delimiter characters.
 *
 * At most MAX_SPLIT-1 tokens are stored so the terminating NULL stays
 * inside the array; any further tokens are ignored.
 */
void split(char **result, char *working, const char *delim)
{
    int i;
    char *tok;

    for(i=0; i<MAX_SPLIT; i++)
        result[i]=NULL;

    i=0;
    for(tok=strtok(working, delim);
        tok!=NULL && i<MAX_SPLIT-1;
        tok=strtok(NULL, delim))
    {
        result[i++]=tok;
    }
}
elist_t *find_disk(const char *dname)
{
    elist_t *p=e;
    elist_t *retval=(elist_t *)bsearch(dname, p, ELIST_SZ, 
                                      sizeof(elist_t), compar);
    return retval;
}
// cleanup old files
/*
 * cleanup - delete old files from a directory.
 *
 * dirname: directory to scan; the program exits (via cknull) if it
 *          cannot be opened.
 * fname:   substring a file name must contain to be a candidate.
 *          Leading '/' characters are ignored because directory entry
 *          names never contain a slash; an empty pattern deletes
 *          nothing.
 * days:    files whose modification time is more than this many days
 *          in the past are removed.
 *
 * Each candidate is stat()ed through its full "dirname/name" path so
 * the result does not depend on the current working directory, and the
 * path buffer is sized per entry so long names cannot overflow it.
 * Deletions and failures are written to LOG when it is open.
 */
void cleanup(const char *dirname, const char *fname, const int days)
{
    DIR *dirp=opendir(dirname);
    struct dirent *dp=NULL;
    struct stat st;
    const time_t then=time(NULL) - (time_t)days * 86400;  // days in past for mtime

    cknull(dirp);

    while(*fname=='/')
        fname++;
    if(*fname=='\0')
    {
        closedir(dirp);
        return;
    }

    while((dp = readdir(dirp)) != NULL)
    {
        size_t need;
        char *path;

        if(strstr(dp->d_name, fname)==NULL)
            continue;

        need=strlen(dirname) + strlen(dp->d_name) + 2;   // '/' and '\0'
        path=malloc(need);
        if(path==NULL)
            break;
        snprintf(path, need, "%s/%s", dirname, dp->d_name);

        if(stat(path, &st)!= -1 && st.st_mtime < then)
        {
            if(LOG!=NULL)
                fprintf(LOG, "%s deleting old file: %s\n", timestamp(), path);
            if(remove(path)!=0)
            {
                const int err=errno;   // save before timestamp() can disturb it
                if(LOG!=NULL)
                    fprintf(LOG, "%s cannot delete %s: %s\n",
                            timestamp(), path, strerror(err));
            }
        }
        free(path);
    }
    closedir(dirp);
}



// sum: helper for gen_report (the hourly report, whose file rolls over daily).
// Adds up the three per-disk error deltas (hard, soft, transport) held in
// diff[0..2] and returns the total.
int sum(int *diff)
{
    int total=0;

    total+=diff[0];
    total+=diff[1];
    total+=diff[2];
    return total;
}



void gen_report(void)
{
	 char filetime[PATH_MAX]={0x0};
	 time_t now=time(NULL);
   size_t i=strftime(filetime, PATH_MAX, "/tmp/iostat_report.txt.%Y%m%d", localtime(&now) );
	 elist_t *p=e;
	 char tmp[128]={0x0};
	 char rpt[2048]={0x0};
   int diff[3]={0};
   FILE *RPT=fopen(filetime, "a");
	 
	 sort_elist();
	 sprintf(rpt, "%s Report\n", timestamp());
	 for(i=0; i<ELIST_SZ; i++, p++)
	 {
	 	  if(!*p->disk)
    	 	  continue;
	 	  
	 	  diff[0]=p->hard[1] - p->hard[0];
	 	  diff[1]=p->soft[1] - p->soft[0];
	 	  diff[2]=p->tran[1] - p->tran[0];
	 	  
	 	  if(sum(diff) )
	 	  {
	 	  	 time_t elapsed=p->now - p->start;
	 	  	 sprintf(tmp, "\tDisk %s: elapsed: %ld(total:%d) hard %d soft %d transport %d\n", 
	 	  	              p->disk, 
	 	  	              elapsed, 
	 	  	              sum(diff),  
	 	  	              diff[0], 
	 	  	              diff[1], 
	 	  	              diff[2]);
	 	  	 strcat(rpt, tmp);
	 	  }
	 }
	 
	      size_t fwrite(const void *ptr, size_t size,  size_t  nitems,
	      FILE *stream);                                              
	 
	 fwrite(rpt, strlen(rpt), 1, RPT);
	 fclose(RPT);
	 cleanup("/tmp", "/iostat_report.txt", 35); 
}



// add new data to struct elist array

void add_data(char **result)
{
	  time_t now=time(NULL);
	  elist_t *p=find_disk(result[0]);

	  if(p==NULL)
	  {
	  	  p=&elist[0];
	  	  strcpy(p->disk, result[0]);
	  	  p->now=now;
	  	  p->start=now;
        p->hard[0]=atoi(result[HARD]);
        p->soft[0]=atoi(result[SOFT]);
        p->tran[0]=atoi(result[TRAN]);
	  }
    p->hard[1]=atoi(result[HARD]);
    p->soft[1]=atoi(result[SOFT]);
    p->tran[1]=atoi(result[TRAN]);    
    p->now=now;	  
}

void analyze(const char *fname, const int report)
{
   char *result[MAX_SPLIT]={NULL};
   char tmp[256]={0x0};   	
   int start=0;
	 FILE *in=fopen(fname,"r");
	 
	 cknull(in);
	 
	 while(fgets(tmp, sizeof(tmp), in)!=NULL)
	 {
	 	  if(!start)
	 	  {
	 	      if(strstr(tmp, "device")!=NULL)
	 	      {
	 	           start=1;
	 	           continue;
	 	      }
	 	      continue;
	 	  }	 	  
	 	  split(result, tmp, " ");
	 	  sort_elist();
	 	  add_data(result);
	 }
	 if(!feof(in))
	 {
	    fprintf(LOG,"%s Fatal file error in analyze() %s\n",
	          timestamp(), strerror(errno));
	    return;      
	 }
	 fclose(in);
	
	 sort_elist();
	 if(report)
	 	  gen_report();
}


int process(const int report)
{
   int retval=0;   
   char cmd[PATH_MAX]={0x0};
   const char *io_cmd= (getenv("IOSTAT_CMD")==NULL)?  "iostat -e" :      getenv("IOSTAT_CMD");
   const char *io_dest=(getenv("IOSTAT_DEST")==NULL)?"/tmp/iostat.log" : getenv("IOSTAT_DEST");
   
   sprintf(cmd,"%s > %s", io_cmd, io_dest);
   retval=system(cmd);
   if(retval)
   {
      fprintf(LOG,	"%s: ERROR on iostat cmd, %d", timestamp(), retval);
      return 1;
   }
	 analyze(io_dest, report);
	 return 0;
}



/*
 * main - sample iostat forever, producing a report roughly once an hour.
 *
 * usage: myiostat [interval]
 *   interval  seconds to sleep between samples (default 60); must be a
 *             positive decimal number.
 *
 * The log /tmp/iostat_monitor.<pid>.log is opened before the exit and
 * signal handlers are installed, because those handlers write to it.
 * The number of samples between reports is derived from the interval,
 * so the report stays about hourly for any interval (60 samples at the
 * default of 60 seconds).
 */
int main(int argc, char **argv)
{
    const unsigned int report_period=3600;   // seconds between reports
    pid_t pid=getpid();
    unsigned long cnt=0;
    unsigned long samples_per_report=0;
    unsigned int interval=60;   // time to wait between samples
    char logname[PATH_MAX]={0x0};

    if(argc==2)
    {
        char *end=NULL;
        unsigned long val=0;

        errno=0;
        val=strtoul(argv[1], &end, 10);
        // reject junk, negative numbers, zero (would spin without sleeping)
        if(errno!=0 || end==argv[1] || *end!='\0' ||
           argv[1][0]=='-' || val==0 || val>UINT_MAX)
        {
            fprintf(stderr, "usage: %s [interval]\n", argv[0]);
            return 1;
        }
        interval=(unsigned int)val;
    }
    samples_per_report=(interval>=report_period)? 1 : report_period/interval;

    snprintf(logname, sizeof logname, "/tmp/iostat_monitor.%d.log", (int)pid);
    LOG=fopen(logname, "w");
    cknull(LOG);
    // line-buffer the log so entries reach the file as they are written
    setvbuf(LOG, NULL, _IOLBF, 0);

    atexit(exit_handler);
    signal(SIGHUP , sig_hnd);
    signal(SIGSEGV , sig_hnd);
    signal(SIGILL , sig_hnd);
    signal(SIGBUS , sig_hnd);
    signal(SIGTERM , sig_hnd);
    signal(SIGINT , sig_hnd);

    init_elist();
    for(;;)
    {
        int do_report;

        cnt++;
        do_report=((cnt % samples_per_report)==0)? 1 : 0;
        process(do_report);
        sleep(interval);
    }
    return 0;
}

[/code]

Last edited by jim mcnamara; 08-01-2013 at 11:34 AM..
# 3  
Old 02-16-2012
Thanks
Login or Register to Ask a Question

Previous Thread | Next Thread

10 More Discussions You Might Find Interesting

1. UNIX for Beginners Questions & Answers

Disk usage monitoring and record the disk used in last 24 hour

HI I am Trying to edit the below code to send email every day with difference of disk utilized in for last 24 hours but instead getting same usage everyday. can you please help me to point out where my calculation is going wrong. Thank you. ================= #!/bin/bash TODAY="at $(date... (0 Replies)
Discussion started by: Mi4304
0 Replies

2. Solaris

11.0 to 11.2 update failures

Attempting to update an 11.0 server with many non-global zones installed. pkg publisher is pkg.oracle.com/solaris/support. FMRI = pkg://solaris/entire@0.5.11,5.11-0.175.1.15.0.4.0:20131230T203500Z When we run pkg update --accept the server contacts oracle, checks packages, finds about 700... (4 Replies)
Discussion started by: CptCarrot
4 Replies

3. Shell Programming and Scripting

Shell script to set trap for finding cron job failures

Unix box: solaris 5.8 Server: IP Need to to set trap for cron job failures by writing a shell script (5 Replies)
Discussion started by: ChandruBala73
5 Replies

4. Shell Programming and Scripting

Disk Monitoring shell script giving incorrect information

Hi All, OS: Linux 86x64 bits Red Hat Linux I get the email alert for the following when Alert condition is set for 30: /dev/sda1 99M 21M 74M 22% /boot -> Below 30%(Should not get the email alert) Expected output as per E-Mail alert: /dev/sda3 20G ... (2 Replies)
Discussion started by: a1_win
2 Replies

5. Shell Programming and Scripting

Disk Space Monitoring Script - OLD and NEW

It's the old thread "Disk Space Monitoring Script", modified for UNIX This is the new code: df -k | awk ' { if ( int($4) > 90) {subject = $1 " More than 90% disk usage. Used: " $4 email = "email@test.com" print subject cmd = "mailx -s \"" subject "\" " email cmd | getline... (4 Replies)
Discussion started by: dungureanu
4 Replies

6. UNIX for Dummies Questions & Answers

Solaris Disk Monitoring?

Hi Guys, I'm looking for a way to monitor disk health/status for a Solaris 5.8 sparc machine. I'm looking for something similar to LSIutility or MegaCLI. Any suggestions? Output of `modinfo`: 30 102616fb 10be8 118 1 ssd (SCSI SSA/FCAL Disk Driver 1.151) 122 7821c000 18550 32 1 ... (2 Replies)
Discussion started by: tank126
2 Replies

7. Shell Programming and Scripting

Can users be notified of the disk failures.

Hi All, Just wanted to know if there is any way by which users can be notified about the failures in disks on Red Hat linux systems. Thanks for all the help!! nua7 (4 Replies)
Discussion started by: nua7
4 Replies

8. Filesystems, Disks and Memory

disk and memory monitoring problem

Hi all, I am looking for api to get me system monitoring statictics every 5 minutes. I am looking at the following statistics: 1. System CPU Usage 2. Process CPU Usage 3. Process Memory Usage 4. I/O Usage for a certain disk. 5. Process I/O bytes/sec utilization. I have seen very... (2 Replies)
Discussion started by: uiqbal
2 Replies

9. Shell Programming and Scripting

Disk Space Monitoring Script

#!/bin/bash # Disk Space Monitoring for more than 95 % # and Sending Alerts by Mail if ; then `df -k |awk '$5 > 95 {print $1 " ----------- " $5}' |mailx -s "More than 95% disk usage in DEV" email@test.com'; else exit 0 fi I get the... (8 Replies)
Discussion started by: sriram003
8 Replies

10. Programming

monitoring files copied onto hard disk

hi... i need pointers to books/website... 'm trytin to write a daemon that monitors files of particular type(eg. text or pdfs) copied onto the hard disk. the daemon should detect the above n write the file name (along with the absolute path) to a file. please DO NOT give me the code... (2 Replies)
Discussion started by: abhi_abhijith
2 Replies
Login or Register to Ask a Question