Code:
/* myiostat.c monitor iostat for disk errors -
specifically edited for EDITH problem 2/14/2012 9:31:18 AM jmc
compile: gcc -Wall -o myiostat myiostat.c
usage: ./myiostat [interval_seconds]
interval defaults to 60 seconds; runs until signaled to stop
invoked as an at routine:
at -k now << !
cd /path/to/.....
./myiostat
!
*/
#include <sys/types.h>
#include <sys/time.h>
#include <sys/stat.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
#include <stdio.h>
#include <limits.h>
#include <dirent.h>
#include <signal.h>
#include <errno.h>
/* One slot of the disk table: the error counters sampled for a single disk.
 * For each counter pair, [0] is the value recorded when the disk was first
 * seen and [1] is the value from the most recent sample (see add_data()). */
typedef struct // errors on disks
{
// Disk name; an empty string marks an unused slot.
// NOTE: must remain the first member -- find_disk() hands a bare name string
// to bsearch() as the key and compar() reads it through an elist_t pointer.
char disk[PATH_MAX];
// Set by init_elist() at start-up and again when the disk is first recorded.
time_t start;
// Hard error counts: [0] baseline, [1] latest.
int hard[2];
// Soft error counts: [0] baseline, [1] latest.
int soft[2];
// Transport error counts: [0] baseline, [1] latest.
int tran[2];
// Time of the most recent sample; 0 until the disk has been sampled.
time_t now;
} elist_t;
// Indexes into the token array filled by split() for one data line
// (token 0 is the disk name). Presumably these follow the column order of
// the `iostat -e` output -- TODO confirm against the target platform.
#define HARD 2
#define SOFT 1
#define TRAN 3
// Number of slots in the elist[] disk table.
#define ELIST_SZ 256
// Upper bound on the tokens split() extracts from one line.
#define MAX_SPLIT 10
// Table of tracked disks; slots with an empty disk name are unused.
static
elist_t elist[ELIST_SZ];
// Pointer to the first slot of elist[]; the sort/search/report routines
// walk the table through it.
elist_t *e=elist;
// Log stream for the monitor; NULL until main() has opened the log file.
FILE *LOG=NULL;
/* Build a local-time stamp of the form YYYY-MM-DD-HH:MM:SS.ffff, where ffff
 * is the sub-second part in units of 1/10000 s.
 *
 * Returns a pointer to a static buffer that is overwritten by the next call,
 * so the result must be consumed (or copied) before calling again.
 *
 * The fraction is derived with integer arithmetic and truncated; rounding it
 * through "%.4f" could yield "1.0000" and print ".0000" without advancing
 * the seconds field. */
char *timestamp(void)
{
    static char ts[64] = {0x0};
    struct timeval tv = {0};
    struct tm *tm_now = NULL;
    time_t secs = 0;
    size_t len = 0;

    gettimeofday(&tv, NULL);
    secs = tv.tv_sec;
    tm_now = localtime(&secs);
    if (tm_now != NULL)
        len = strftime(ts, sizeof ts, "%Y-%m-%d-%H:%M:%S", tm_now);
    /* len is 0 when the date part could not be formatted; the fraction is
     * then written at the start of the buffer so it is always terminated. */
    snprintf(ts + len, sizeof ts - len, ".%04ld", (long)(tv.tv_usec / 100));
    return ts;
}
/* Terminate the program when a required pointer is NULL.
 * val: pointer to validate.
 * A non-NULL pointer returns immediately; otherwise the errno text is
 * printed through perror() and the process exits with status 1. */
void cknull(const void *val)
{
    if (val != NULL)
        return;

    perror("Fatal error");
    exit(1);
}
/* atexit() hook: note the shutdown (with the current errno text) in the log
 * and mail a "myiostat down" notification.
 *
 * LOG starts out NULL and exit() can run before the log file has been
 * opened, so the log write is skipped in that case instead of passing a
 * NULL stream to fprintf(). */
void exit_handler(void)
{
    /* Capture errno first: timestamp() and the stdio calls may change it. */
    const int saved_errno = errno;
    int rc = 0;

    if (LOG != NULL)
    {
        fprintf(LOG, "%s existing errors %s\n", timestamp(), strerror(saved_errno));
        fflush(LOG);
    }
    rc = system(
        "echo 'myiostat exiting' | /usr/bin/mailx -s 'myiostat down' mcnamara@foo.com");
    if (rc != 0 && LOG != NULL)
    {
        fprintf(LOG, "%s exit notification mail failed, status %d\n", timestamp(), rc);
        fflush(LOG);
    }
}
/* Signal handler: write "<SIGNAME> received exiting" to the log's file
 * descriptor, then exit with status 1 (which also runs the atexit hook).
 * sig: number of the signal being handled.
 *
 * Original author's caveat still applies: this calls library routines that
 * are not async-signal-safe.
 *
 * LOG starts out NULL, so a signal that arrives before the log file is open
 * must not reach fileno(); the log write is skipped in that case. */
void sig_hnd(int sig)
{
    char signame[SIG2STR_MAX] = {0x0};
    char tmp[256] = {0x0};
    int len = 0;

    /* Ignore repeats of the same signal while shutting down. */
    signal(sig, SIG_IGN);
    /* Fall back to the numeric value when the signal has no known name. */
    if (sig2str(sig, signame) != 0)
        snprintf(signame, sizeof signame, "%d", sig);
    len = snprintf(tmp, sizeof tmp, "%s received exiting\n", signame);
    if (LOG != NULL && len > 0)
    {
        int fd = fileno(LOG);
        /* Best effort: nothing useful can be done if these writes fail. */
        write(fd, " \n", 2);
        write(fd, tmp, strlen(tmp));
    }
    perror("exit with error");
    exit(1);
}
/* Ordering callback handed to qsort()/bsearch() for the disk table.
 * A, B: pointers to elist_t entries.
 * Returns the strcmp() ordering of the two disk names. */
int compar(const void *A, const void *B)
{
    const char *left = ((const elist_t *)A)->disk;
    const char *right = ((const elist_t *)B)->disk;

    return strcmp(left, right);
}
/* Sort the whole disk table (all ELIST_SZ slots) by disk name with compar().
 * Unused slots carry an empty name and therefore sort to the front. */
void sort_elist(void)
{
    qsort(e, ELIST_SZ, sizeof(elist_t), compar);
}
/* Reset every slot of the disk table: empty name, zeroed counters,
 * start = current time, now = 0. */
void init_elist(void)
{
    const time_t started = time(NULL);
    int idx;

    for (idx = 0; idx < ELIST_SZ; idx++)
    {
        elist_t *slot = &e[idx];

        memset(slot->disk, 0x0, PATH_MAX);
        slot->start = started;
        slot->now = 0;
        slot->hard[0] = 0;
        slot->hard[1] = 0;
        slot->soft[0] = 0;
        slot->soft[1] = 0;
        slot->tran[0] = 0;
        slot->tran[1] = 0;
    }
}
/* Split a line into delimiter-separated tokens.
 *
 * result:  caller's array with room for MAX_SPLIT pointers. On return it
 *          holds the tokens followed by a terminating NULL; result[0] is
 *          NULL when the line contains no token at all.
 * working: NUL-terminated line; modified in place by strtok(), and the
 *          tokens point into it.
 * delim:   set of delimiter characters, as for strtok().
 *
 * At most MAX_SPLIT - 1 tokens are stored so that the terminating NULL
 * always fits inside the array; further tokens are ignored. Uses strtok()
 * and is therefore not reentrant. */
void split(char **result, char *working, const char *delim)
{
    int i = 0;
    char *p = NULL;

    /* Never leave tokens from a previous call visible to the caller. */
    result[0] = NULL;
    for (p = strtok(working, delim);
         p != NULL && i < MAX_SPLIT - 1;
         p = strtok(NULL, delim))
    {
        result[i++] = p;
        result[i] = NULL;
    }
}
elist_t *find_disk(const char *dname)
{
elist_t *p=e;
elist_t *retval=(elist_t *)bsearch(dname, p, ELIST_SZ,
sizeof(elist_t), compar);
return retval;
}
// cleanup old files
void cleanup(const char *dirname, const char *fname, const int days)
{
DIR *dirp=opendir(dirname);
struct dirent *dp=NULL;
char files_to_delete[128][64]={{0x0}};
struct stat st;
time_t then=time(NULL);
int found=0;
int i=0;
cknull(dirp);
errno = 0;
then-=(time_t)(days * 86400); // days in past for mtime
while((dp = readdir(dirp)) != NULL)
{
if(strstr(dp->d_name, fname)!=NULL &&
stat(dp->d_name, &st)!= -1 &&
st.st_mtime < then)
{
sprintf(files_to_delete[found++], "%s/%s", dirname, dp->d_name);
}
}
closedir(dirp);
for (i=0; i< found; i++)
{
fprintf(LOG, "%s deleting old file: %s\n", timestamp(), files_to_delete[i]);
remove(files_to_delete[i]);
}
}
// gen_report (defined after its helper sum()): makes an hourly report; the report file rolls over roughly daily
/* Add up the three counter deltas of one disk.
 * diff: array of exactly three ints.
 * Returns diff[0] + diff[1] + diff[2]. */
int sum(int *diff)
{
    const int *end = diff + 3;
    int total = 0;

    while (diff < end)
        total += *diff++;
    return total;
}
void gen_report(void)
{
char filetime[PATH_MAX]={0x0};
time_t now=time(NULL);
size_t i=strftime(filetime, PATH_MAX, "/tmp/iostat_report.txt.%Y%m%d", localtime(&now) );
elist_t *p=e;
char tmp[128]={0x0};
char rpt[2048]={0x0};
int diff[3]={0};
FILE *RPT=fopen(filetime, "a");
sort_elist();
sprintf(rpt, "%s Report\n", timestamp());
for(i=0; i<ELIST_SZ; i++, p++)
{
if(!*p->disk)
continue;
diff[0]=p->hard[1] - p->hard[0];
diff[1]=p->soft[1] - p->soft[0];
diff[2]=p->tran[1] - p->tran[0];
if(sum(diff) )
{
time_t elapsed=p->now - p->start;
sprintf(tmp, "\tDisk %s: elapsed: %ld(total:%d) hard %d soft %d transport %d\n",
p->disk,
elapsed,
sum(diff),
diff[0],
diff[1],
diff[2]);
strcat(rpt, tmp);
}
}
size_t fwrite(const void *ptr, size_t size, size_t nitems,
FILE *stream);
fwrite(rpt, strlen(rpt), 1, RPT);
fclose(RPT);
cleanup("/tmp", "/iostat_report.txt", 35);
}
// add new data to struct elist array
void add_data(char **result)
{
time_t now=time(NULL);
elist_t *p=find_disk(result[0]);
if(p==NULL)
{
p=&elist[0];
strcpy(p->disk, result[0]);
p->now=now;
p->start=now;
p->hard[0]=atoi(result[HARD]);
p->soft[0]=atoi(result[SOFT]);
p->tran[0]=atoi(result[TRAN]);
}
p->hard[1]=atoi(result[HARD]);
p->soft[1]=atoi(result[SOFT]);
p->tran[1]=atoi(result[TRAN]);
p->now=now;
}
void analyze(const char *fname, const int report)
{
char *result[MAX_SPLIT]={NULL};
char tmp[256]={0x0};
int start=0;
FILE *in=fopen(fname,"r");
cknull(in);
while(fgets(tmp, sizeof(tmp), in)!=NULL)
{
if(!start)
{
if(strstr(tmp, "device")!=NULL)
{
start=1;
continue;
}
continue;
}
split(result, tmp, " ");
sort_elist();
add_data(result);
}
if(!feof(in))
{
fprintf(LOG,"%s Fatal file error in analyze() %s\n",
timestamp(), strerror(errno));
return;
}
fclose(in);
sort_elist();
if(report)
gen_report();
}
int process(const int report)
{
int retval=0;
char cmd[PATH_MAX]={0x0};
const char *io_cmd= (getenv("IOSTAT_CMD")==NULL)? "iostat -e" : getenv("IOSTAT_CMD");
const char *io_dest=(getenv("IOSTAT_DEST")==NULL)?"/tmp/iostat.log" : getenv("IOSTAT_DEST");
sprintf(cmd,"%s > %s", io_cmd, io_dest);
retval=system(cmd);
if(retval)
{
fprintf(LOG, "%s: ERROR on iostat cmd, %d", timestamp(), retval);
return 1;
}
analyze(io_dest, report);
return 0;
}
/* Entry point: sample the disks every `interval` seconds until signaled,
 * asking for a report roughly once an hour.
 *
 * argv[1] (optional): seconds to wait between samples, default 60. A value
 * that is not a positive decimal number is rejected with a warning and the
 * default is used; an interval of 0 would otherwise loop without pausing.
 *
 * The log is written to /tmp/iostat_monitor.<pid>.log. */
int main(int argc, char **argv)
{
    pid_t pid = getpid();
    unsigned int interval = 60; // time to wait between samples
    unsigned int per_report = 0; // samples between two reports
    unsigned int cnt = 0;
    char logname[PATH_MAX] = {0x0};

    if (argc == 2)
    {
        char *end = NULL;
        unsigned long val = 0;

        errno = 0;
        val = strtoul(argv[1], &end, 10);
        if (errno != 0 || end == argv[1] || *end != '\0' ||
            val == 0 || val > UINT_MAX)
            fprintf(stderr, "%s: invalid interval '%s', using %u seconds\n",
                    argv[0], argv[1], interval);
        else
            interval = (unsigned int)val;
    }
    atexit(exit_handler);
    signal(SIGHUP , sig_hnd);
    signal(SIGSEGV , sig_hnd);
    signal(SIGILL , sig_hnd);
    signal(SIGBUS , sig_hnd);
    signal(SIGTERM , sig_hnd);
    signal(SIGINT , sig_hnd);
    snprintf(logname, sizeof logname, "/tmp/iostat_monitor.%d.log", (int)pid);
    LOG = fopen(logname, "w");
    cknull(LOG);
    init_elist();
    /* Number of samples that make up about an hour: 60 at the default
     * interval, and every sample once the interval is an hour or longer. */
    per_report = (interval >= 3600) ? 1 : 3600 / interval;
    for (;;)
    {
        int do_report = 0;

        if (++cnt >= per_report)
        {
            do_report = 1;
            cnt = 0;
        }
        process(do_report);
        sleep(interval);
    }
    return 0;
}