Code:
sed -n 's/^Data.*|//p' file | aggsx -l
$ aggsx --help
Usage:
aggsx [ -b ] [ -l ] [ -p <prefix> ] [ -u ] [ -d ] [ -h ]
Computes the count distinct, count null, min, count of min, max,
count of max, average (mean) of not null values if numeric,
median of not null values, largest of the most popular values,
count of that most popular value.
If -l is present, first prints out all values in order and their counts,
null last, but no aggregates.
If -b is present, prints out like -l and then prints aggregates.
If -p is present, the aggregate is prefixed with '<prefix>|'.
If -u is present, just immediately prints out unique values.
If -d is present, just immediately prints out duplicated values.
If -h is present, prefixes values line with header line:
CtD|CtN|Min|CtMin|Max|CtMax|Avg|Med|MPop|CtMPop
$ cat mysrc/aggsx.c
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
/*
 * File-scope working state shared by fmv() and main().
 */

/* Table of distinct non-null values.  vl[] holds the strings in ascending
 * strcmp() order; vct[] holds the occurrence count of the string at the
 * same index. */
static char **vl = NULL ; /* value list */
static unsigned long *vct = NULL ; /* value counts */
static unsigned long dct = 0 ; /* number of distinct values stored */
static unsigned long act = 0 ; /* number of slots allocated in vl[]/vct[] */

/* Input tallies. */
static unsigned long lct = 0 ; /* input lines read */
static unsigned long nct = 0 ; /* input lines equal to "<null>" */

/* Aggregate results. */
static long double sum = 0.0 ; /* count-weighted sum of numeric values, later their mean */
static long double nval ; /* one distinct value as converted by strtod() */
static unsigned long nvc = 0 ; /* occurrences that contributed to sum */
static unsigned long mpc = 0 ; /* occurrence count of the most popular value */
static char *mp = "" ; /* most popular value */
static char *me = "" ; /* median value */
static int num = 1 ; /* cleared once a non-empty value fails to parse as a number */

/* Command-line options. */
static int b = 0 ; /* -b option */
static int l = 0 ; /* -l option */
static int u = 0 ; /* -u option */
static int d = 0 ; /* -d option */
static char *pfx = NULL ; /* argument of -p, or NULL when absent */

/* Scratch variables used by main(). */
static int i ; /* argv index */
static unsigned long ll ; /* index over the distinct values */
static unsigned long ll2 ; /* median bookkeeping while scanning the distinct values */
static unsigned long *lp ; /* cursor into vct[] */
static char **cpp ; /* cursor into vl[] */
static char *cp ; /* general character cursor */
static char *cp2 ; /* first non-blank character of a line; also strtod() end pointer */
static char *cp3 ; /* last non-blank character of a line */
static int lfm ; /* line feed missing */
static char buf[66000] ; /* input line buffer, reused to format the average */
static void fmv( char *val )
{
unsigned long cv ;
unsigned long cl = 0 ;
unsigned long ch ;
int r ;
char **cf ;
char **ct ;
char **ce ;
unsigned long *lf ;
unsigned long *lt ;
if ( dct )
for ( cl = 0, ch = dct - 1 ; cl <= ch ; )
{
cv = ( ch + cl ) >> 1 ;
r = strcmp( val, vl[cv] );
if ( r > 0 )
{
cl = cv + 1 ;
}
else if ( r < 0 )
{
if ( cv )
ch = cv - 1 ;
else
break ;
}
else
{
lt = vct + cv ;
*lt += 1 ;
if ( d
&& *lt == 2 ) /* report dups */
{
if ( 0 > printf( "%s\n", val )
|| fflush( stdout ) )
{
if ( ferror( stdout ) )
{
perror( "stdout" );
exit( 1 );
}
exit( 0 );
}
}
return ;
}
}
if ( u ) /* report unique */
{
if ( 0 > printf( "%s\n", val )
|| fflush( stdout ) )
{
if ( ferror( stdout ) )
{
perror( "stdout" );
exit( 1 );
}
exit( 0 );
}
}
cv = dct ;
if ( ++dct > act )
{
act += 1024 ;
if ( !( vl = realloc( vl, act * sizeof( char* ) ) ) )
{
perror( "realloc()" );
exit( 1 );
}
if ( !( vct = realloc( vct, act * sizeof( long ) ) ) )
{
perror( "realloc()" );
exit( 1 );
}
}
for ( ce = vl + cl,
cf = ( ( ct = vl + cv ) - 1 ),
lf = ( ( lt = vct + cv ) - 1 ) ;
ct > ce ;
cf--, ct--, lf--, lt-- )
{
*ct = *cf ;
*lt = *lf ;
}
*lt = 1 ;
if ( !( *ct = malloc( strlen( val ) + 1 ) ) )
{
perror( "malloc()" );
exit( 1 );
}
strcpy( *ct, val );
return ;
}
/* Size handed to setvbuf() for stdin and stdout: PIPE_MAX where the
 * platform's headers define it, otherwise the stdio default BUFSIZ. */
#ifdef PIPE_MAX
#define AGGSX_IOBUF PIPE_MAX
#else
#define AGGSX_IOBUF BUFSIZ
#endif

/*
 * main - read one value per line from stdin and report on the values.
 *
 * Each line is stripped of leading and trailing blanks, tabs, CR and LF.
 * A line that is exactly "<null>" is counted as null; every other line
 * (including an empty one) is handed to fmv().
 *
 * Options: -b, -l, -u, -d, -h and -p <prefix>, as described in the usage
 * text below.  Any other argument prints the usage text to stderr and
 * exits with status 1.
 *
 * Exit status is 1 on a usage error, an over-long input line, or an I/O
 * error reported by ferror(); otherwise 0.
 */
int main( int argc, char **argv )
{
    unsigned long seen = 0 ;    /* occurrences before the current distinct value */

    setvbuf( stdin, NULL, _IOFBF, AGGSX_IOBUF );
    setvbuf( stdout, NULL, _IOFBF, AGGSX_IOBUF );

    for ( i = 1 ; i < argc ; i++ )
    {
        if ( !strcmp( argv[i], "-b" ) )
        {
            b = 1 ;
            continue ;
        }
        if ( !strcmp( argv[i], "-l" ) )
        {
            l = 1 ;
            continue ;
        }
        /* Only consume the next argument when there is one; a trailing
         * "-p" falls through to the usage message with argv[i] intact. */
        if ( !strcmp( argv[i], "-p" )
          && i + 1 < argc )
        {
            pfx = argv[++i];
            continue ;
        }
        if ( !strcmp( argv[i], "-u" ) )
        {
            u = 1 ;
            continue ;
        }
        if ( !strcmp( argv[i], "-d" ) )
        {
            d = 1 ;
            continue ;
        }
        if ( !strcmp( argv[i], "-h" ) )
        {
            /* The header is written immediately, ahead of any data. */
            fputs(
                "CtD|CtN|Min|CtMin|Max|CtMax|Avg|Med|MPop|CtMPop|Ct\n",
                stdout );
            continue ;
        }

        fputs(
            "Usage:\n"
            "\n"
            "aggsx [ -b ] [ -l ] [ -p <prefix> ] [ -u ] [ -d ] [ -h ]\n"
            "\n"
            "Computes the count distinct, count null, min, count of min, max,\n"
            "count of max, average (mean) of not null values if numeric,\n"
            "median of not null values, largest of the most popular values,\n"
            "count of that most popular value, count of all lines.\n"
            "\n"
            "If -l is present, first prints out all values in order and their counts,\n"
            "null last, but no aggregates.\n"
            "If -b is present, prints out like -l and then prints aggregates.\n"
            "If -p is present, the aggregate is prefixed with '<prefix>|'.\n"
            "If -u is present, just immediately prints out unique values.\n"
            "If -d is present, just immediately prints out duplicated values.\n"
            "If -h is present, prefixes values line with header line:\n"
            "CtD|CtN|Min|CtMin|Max|CtMax|Avg|Med|MPop|CtMPop|Ct\n"
            "\n" , stderr );
        exit( 1 );
    }

    while ( fgets( buf, sizeof( buf ), stdin ) )
    {
        lct++ ;

        /* Find the first (cp2) and last (cp3) characters that are not
         * blank, tab, CR or LF, and note whether a line feed was seen. */
        for ( cp = buf, cp2 = cp3 = NULL, lfm = 1 ; *cp ; cp++ )
        {
            switch ( *cp )
            {
            case '\n':
                lfm = 0 ;
                /* intentional fall through */
            case '\r':
            case ' ':
            case '\t':
                break ;
            default:
                if ( !cp2 )
                {
                    cp2 = cp ;
                }
                cp3 = cp ;
                break ;
            }
        }

        /* No line feed means the line did not fit in buf, unless this is
         * simply an unterminated last line at end of input. */
        if ( lfm
          && !feof( stdin ) )
        {
            fprintf( stderr, "\nFatal: Data line %lu too long!\n",
                     lct );
            exit( 1 );
        }

        if ( cp3 )
        {
            *(++cp3) = '\0' ;
            cp = cp2 ;
        }
        /* else: cp was left on the terminating NUL, i.e. an empty value */

        if ( strcmp( cp, "<null>" ) )
        {
            fmv( cp );
        }
        else
        {
            nct++ ;
        }
    }

    if ( ferror( stdin ) )
    {
        perror( "stdin" );
        exit( 1 );
    }

    /* -u and -d have already printed everything they report. */
    if ( u
      || d )
    {
        exit( 0 );
    }

    if ( l || b )
    {
        for ( ll = 0, cpp = vl, lp = vct ;
              ll < dct ;
              ll++, lp++, cpp++ )
        {
            if ( 0 > printf( "%lu\t%s\n", *lp, *cpp ) )
            {
                if ( ferror( stdout ) )
                {
                    perror( "stdout" );
                    exit( 1 );
                }
                exit( 0 );
            }
        }

        if ( nct
          && 0 > printf( "%lu\t%s\n", nct, "<null>" ) )
        {
            if ( ferror( stdout ) )
            {
                perror( "stdout" );
                exit( 1 );
            }
            exit( 0 );
        }

        if ( !b )
        {
            exit( 0 );
        }
    }

    /* One pass over the sorted distinct values gathers the most popular
     * value, the numeric sum and the median.  ll2 is the 0-based rank of
     * the median among the ( lct - nct ) non-null occurrences; for an even
     * count this selects the upper of the two middle values. */
    ll2 = ( lct - nct ) >> 1 ;
    for ( ll = 0L, cpp = vl, lp = vct ;
          ll < dct ; ll++, lp++, cpp++ )
    {
        cp = *cpp ;

        /* ">=" so that, among equally popular values, the largest wins. */
        if ( *lp >= mpc )
        {
            mpc = *lp ;
            mp = cp ;
        }

        /* Empty values are left out of the average; the first non-empty
         * value that is not a number turns the average off for good. */
        if ( *cp
          && num )
        {
            errno = 0 ;
            nval = strtod( cp, &cp2 );
            if ( errno          /* underflow or overflow */
              || ( cp2 == cp )  /* didn't like the characters */
              || *cp2 )         /* didn't like some */
            {
                num = 0 ;
            }
            else
            {
                sum += ( nval * *lp ) ;
                nvc += *lp ;
            }
        }

        /* This value covers ranks seen .. seen + *lp - 1, so the last
         * value whose first rank is <= ll2 is the one holding the median. */
        if ( seen <= ll2 )
        {
            me = cp ;
        }
        seen += *lp ;
    }

    if ( num
      && nvc )
    {
        sum /= nvc ;
        snprintf( buf, sizeof( buf ), "%-30.20LG", sum );
        /* The left-justified format pads with blanks; strip them. */
        for ( cp = buf + strlen( buf ) - 1 ;
              cp >= buf && *cp == ' ' ;
              cp-- )
        {
            *cp = '\0' ;
        }
    }
    else
    {
        strcpy( buf, "N/A" );
    }

    if ( ( ( pfx
          && 0 > printf( "%s|", pfx ) )
        || 0 > printf( "%lu|%lu|%s|%lu|%s|%lu|%s|%s|%s|%lu|%lu\n",
                       dct, nct,
                       ( dct ? vl[0] : "" ),
                       ( dct ? vct[0] : 0UL ),
                       ( dct ? vl[dct - 1] : "" ),
                       ( dct ? vct[dct - 1] : 0UL ),
                       buf, me, mp, mpc, lct ) )
      && ferror( stdout ) )
    {
        perror( "stdout" );
        exit( 1 );
    }

    exit( 0 );
}