m1join.c


 
Thread Tools Search this Thread
Top Forums UNIX for Dummies Questions & Answers m1join.c
# 1  
Old 06-07-2011
m1join.c

Joins many-to-one from pipes in a single pass, plus a few other tricks:

Code:
$ cat mysrc/m1join.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <limits.h>
/*
 * File-scope state shared by main().  Objects with static storage duration
 * start out zero / NULL, so only the non-zero defaults are spelled out.
 */

/* Option switches, set while the command line is parsed. */
static int  msep_ign = 1;                       /* cleared by -m */
static int  fullouter, leftouter, rightouter;   /* -a, -a1, -a2 */
static int  caseless;                           /* -i */
static int  paste;                              /* -p */
static int  keycols = 1;                        /* -c <n>: leading key columns */

/* Working values for the key scan and comparison. */
static int  keylen1, keylen2;   /* length of the current key column per file */
static int  i;                  /* argv index, then key-column counter */
static int  cmp;                /* <0, 0 or >0: file-1 key vs. file-2 key */

/* Per-input progress flags. */
static int  eof1, eof2;         /* end of file reached */
static int  unsent1, unsent2;   /* current line has not been output yet */

/* Separator set; its first character is also the output separator. */
static char seps[256] = "\t \r\n";

/* Input file names, scan cursors, streams and line buffers. */
static char *fn1, *fn2;
static char *cp1, *cp2;
static char *cpx, *cpy;         /* not referenced by main() */
static FILE *fp1, *fp2;
static char fb1[65536], fb2[65536];
int main( int argc, char **argv ){
        for ( i = 1 ; i < argc ; i++ ){
                if ( !strcmp( argv[i], "-a" ) ){
                        fullouter = 1 ;
                        continue ;
                }
                if ( !strcmp( argv[i], "-a1" ) ){
                        leftouter = 1 ;
                        continue ;
                }
                if ( !strcmp( argv[i], "-a2" ) ){
                        rightouter = 1 ;
                        continue ;
                }
                if ( !strcmp( argv[i], "-i" ) ){
                        caseless = 1 ;
                        continue ;
                }
                if ( !strcmp( argv[i], "-p" ) ){
                        paste = 1 ;
                        continue ;
                }
                if ( !strcmp( argv[i], "-m" ) ){
                        msep_ign = 0 ;
                        continue ;
                }
                if ( !strcmp( argv[i], "-c" ) ){
                        if ( ++i == argc
                          || 1 > ( keycols = atoi( argv[ i ] ) ) ){
                                fputs( "Invalid -c option!\n", stderr );
                                goto usage ;
                        }
                        continue ;
                }
                if ( !strcmp( argv[i], "-t" ) ){
                        if ( ++i == argc ){
                                fputs( "Invalid -t option!\n", stderr );
                                goto usage ;
                        }
                        sprintf( seps, "%s\r\n", argv[i] );
                        continue ;
                }
                if ( !fn1 ){
                        fn1 = argv[i] ;
                        continue ;
                }
                if ( !fn2 ){
                        fn2 = argv[i] ;
                        continue ;
                }
                fprintf( stderr, "Extra %d Options!\n", argc - i );
                goto usage ;
        }
        if ( !fn2 ){
                fputs( "Insufficient arguments!\n", stderr );
        usage:
                fputs(
"\n"
"Usage: m1_join [-i] [-a] [-t <sep_chars>] [-m] [-c <key_col_ct>] f1 f2\n"
"\n"
"Joins (possibly multiple) lines from sorted file f1 with each line from\n"
"sorted file f2, on leading key fields.  Leading separators are not ignored.\n"
"Output is all the f1 fields followed by the first separator character (tab)\n"
"followed by non-matched fields of f2.\n"
"** Does not mind pipes as files! **\n"
"** Does not support 'one to many' or 'many to many', just 'many to one'! **\n"
"** (Duplicates in the second file are not matched.) **\n"
"\n"
"Options:\n"
"\n"
" -a   All lines are output (full outer join).\n"
" -a1  All lines of f1 are output (left outer join).\n"
" -a2  All lines of f2 are output (right outer join).\n"
" -c   Only one column is matched, unless -c is specified.\n"
" -i   Keys (and the required sort order) are case-sensitive unless -i is\n"
"      specified, in which case all letters are treated as lower case in the\n"
"      ASCII binary sort order: both 'A' 0101 0x41 and 'a' 0141 0x61 are\n"
"      greater than '_' 0137 0x5F.\n"
" -m   Multiple separators are treated as one unless -m is specified.\n"
" -p   Turns on paste mode: forces one to one matching.\n"
" -t   Columns are separated by tab, space, carriage return or linefeed\n"
"      unless -t specifies a string of other character(s).\n"
"\n",
                        stderr );
                exit( 1 );
        }
        if ( !( fp1 = fopen( fn1, "r" ) ) ){
                perror( fn1 );
                exit( 1 );
        }
        if ( !( fp2 = fopen( fn2, "r" ) ) ){
                perror( fn2 );
                exit( 1 );
        }
        setvbuf( stdout, NULL, _IOFBF, PIPE_MAX );
        setvbuf( fp1, NULL, _IOFBF, 65536 );
        setvbuf( fp2, NULL, _IOFBF, 65536 );
 read2:
        if ( !fgets( fb2, sizeof(fb2), fp2 ) ){
                if ( ferror( fp2 ) ){
                        perror( fn2 );
                        exit( 2 );
                }
                eof2 = 1 ;
        }
        else
                unsent2 = 1 ;
 read1:
        if ( !fgets( fb1, sizeof(fb1), fp1 ) ){
                if ( ferror( fp1 ) ){
                        perror( fn1 );
                        exit( 1 );
                }
                eof1 = 1 ;
        }
        else
                unsent1 = 1 ;
 loop:
        if ( ( eof1
            && eof2 )
          || ( eof1
            && !rightouter
            && !fullouter )
          || ( eof2
            && !leftouter
            && !fullouter ) ){
                exit( 0 );
        }
        if ( eof1 )
                cmp = 1 ;
        else if ( eof2 )
                cmp = -1 ;
        else
                cmp = 0 ;
        cp1 = fb1 ;
        cp2 = fb2 ;
        i = 0 ;
        do {    /* Find keys and if necessary compare them */
                keylen1 = strcspn( cp1, seps );
                keylen2 = strcspn( cp2, seps );
                if ( !cmp ){
                        if ( keylen1 > keylen2 ){
                                cmp =   ( caseless
                                        ? strncasecmp( cp1, cp2, keylen2 )
                                        : strncmp( cp1, cp2, keylen2 ) );
                                if ( !cmp )
                                        cmp = 1 ;
                        } else {
                                cmp =   ( caseless
                                        ? strncasecmp( cp1, cp2, keylen1 )
                                        : strncmp( cp1, cp2, keylen1 ) );
                                if ( !cmp
                                  && keylen1 < keylen2 )
                                        cmp = -1 ;
                        }
                }
                cp1 += keylen1 ;
                cp2 += keylen2 ;
                if ( msep_ign ){
                        cp1 += strspn( cp1, seps ) ;
                        cp2 += strspn( cp2, seps ) ;
                } else {
                        cp1++ ;
                        cp2++ ;
                }
        } while ( ++i < keycols ) ;
        if ( cmp < 0 ){
                if ( ( fullouter
                    || leftouter )
                  && unsent1
                  && *fb1 ){
                        if ( 0 > printf( "%s", fb1 ) ){
                                if ( ferror( stdout ) ){
                                        perror( "stdout" );
                                        exit( 1 );
                                }
                                exit( 0 );
                        }
                }
                goto read1 ;
        }
        else if ( cmp > 0 ){
                if ( ( fullouter
                    || rightouter )
                  && unsent2
                  && *fb2 ){
                        if ( 0 > printf( "%.*s%c%s",
                                    cp2 - fb2, fb2, *seps, cp2 ) ){
                                if ( ferror( stdout ) ){
                                        perror( "stdout" );
                                        exit( 1 );
                                }
                                exit( 0 );
                        }
                }
                if ( !fgets( fb2, sizeof( fb2 ), fp2 ) ){
                        if ( ferror( fp2 ) ){
                                perror( fn2 );
                                exit( 1 );
                        }
                        eof2 = 1 ;
                }
                else
                        unsent2 = 1 ;
                goto loop ;
        }
        if ( !*cp2 )    /* Stepped over the linefeed! */
                cp2-- ;
        if ( 0 > printf( "%.*s%c%s", strlen( fb1 ) - 1, fb1, *seps, cp2 ) ){
                if ( ferror( stdout ) ){
                        perror( "stdout" );
                        exit( 1 );
                }
                exit( 0 );
        }
        unsent1 = 0 ;
        unsent2 = 0 ;
        if ( paste )
                goto read2 ;
        goto read1 ;
}


Last edited by pludi; 06-07-2011 at 03:47 PM..
 
Login or Register to Ask a Question

Previous Thread | Next Thread
Login or Register to Ask a Question