Text analysis


 
Thread Tools Search this Thread
Top Forums UNIX for Dummies Questions & Answers Text analysis
# 15  
Old 03-30-2011
In fact your document is a Microsoft Word document

Open it in word, and save it as a txt ...

Code:
# file walle.txt
walle.txt:      Microsoft Word Document

# 16  
Old 03-30-2011
You must change case to lower (I like tr), then separate all the words, say using sed, then count them by piping sort into uniq -c.
Code:
# Lower-case the text, put every run of non-letters on a line break so each
# word sits on its own line, then count identical lines.
tr '[:upper:]' '[:lower:]' <yourfile.txt | sed '
  s/[^a-z]\{1,99\}/\
/g
 ' | sort | uniq -c >wordcount.txt

For big files with mostly repeating words, you may need a less disk-space intensive, intelligent solution. I wrote this in C, instructions included; replace "sort | uniq -c" with "aggsx -l". (Some features are aimed at the Interbase/Firebird RDBMS.) (You could write a more elegant and faster one using a hash map container in C++ or Java, and do the case and separation in code, too.)
Code:
$ cat mysrc/aggsx.c
 
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
/*
 * File-scope state shared by fmv() and main().  All objects are static, so
 * the ones without an explicit initializer start out as zero / NULL.
 */
static  long double     sum = 0.0 ;     /* count-weighted sum of the numeric values; main() divides it by nvc */
static  long double     nval ;          /* strtod() result for the value currently examined */
static  unsigned long   lct = 0 ;       /* input lines read so far */
static  unsigned long   nct = 0 ;       /* lines whose trimmed text is "<null>" */
static  unsigned long   ll2 ;           /* running position used by main() to pick the median */
static  unsigned long   nvc = 0 ;       /* number of lines that contributed to sum */
static  unsigned long   dct = 0 ;       /* distinct values currently stored in vl/vct */
static  unsigned long   act = 0 ;       /* elements allocated for vl/vct (grown 1024 at a time) */
static  unsigned long   ll ;            /* loop index over the distinct values */
static  unsigned long   mpc = 0 ;       /* largest count seen while scanning vct */
static  unsigned long   *vct = NULL ;   /* value counts, parallel to vl */
static  unsigned long   *lp ;           /* cursor into vct */
static  char            **vl = NULL ;   /* value list, kept in strcmp() order by fmv() */
static  char            **cpp ;         /* cursor into vl */
static  char            *cp ;           /* scratch pointer: scan position / current value */
static  char            *cp2 ;          /* first non-blank character of a line; also strtod() end pointer */
static  char            *cp3 ;          /* last non-blank character of a line */
static  char            *me = "" ;      /* median value reported in the aggregate line */
static  char            *mp = "" ;      /* value that goes with mpc */
static  char            *pfx = NULL ;   /* argument of the -p option, if given */
static  int             i ;             /* argv index */
static  int             d = 0 ; /* -d option */
static  int             u = 0 ; /* -u option */
static  int             l = 0 ; /* -l option */
static  int             b = 0 ; /* -b option */
static  int             num = 1 ;       /* cleared once a non-empty value fails to parse as a number */
static  int             lfm ;   /* line feed missing */
static  char            buf[66000] ;    /* input line buffer; reused to format the average */
static  void            fmv( char *val )
{
        unsigned long   cv ;
        unsigned long   cl = 0 ;
        unsigned long   ch ;
        int             r ;
        char            **cf ;
        char            **ct ;
        char            **ce ;
        unsigned long   *lf ;
        unsigned long   *lt ;
        if ( dct )
                for ( cl = 0, ch = dct - 1 ; cl <= ch ; )
                {
                        cv = ( ch + cl ) >> 1 ;
                        r = strcmp( val, vl[cv] );
                        if ( r > 0 )
                        {
                                cl = cv + 1 ;
                        }
                        else if ( r < 0 )
                        {
                                if ( cv )
                                        ch = cv - 1 ;
                                else
                                        break ;
                        }
                        else
                        {
                                lt = vct + cv ;
                                *lt += 1 ;
                                if ( d
                                  && *lt == 2 ) /* report dups */
                                {
                                        if ( 0 > printf( "%s\n", val )
                                          || fflush( stdout ) )
                                        {
                                                if ( ferror( stdout ) )
                                                {
                                                        perror( "stdout" );
                                                        exit( 1 );
                                                }
                                                exit( 0 );
                                        }
                                }
                                return ;
                        }
                }
        if ( u ) /* report unique */
        {
                if ( 0 > printf( "%s\n", val )
                  || fflush( stdout ) )
                {
                        if ( ferror( stdout ) )
                        {
                                perror( "stdout" );
                                exit( 1 );
                        }
                        exit( 0 );
                }
        }
        cv = dct ;
        if ( ++dct > act )
        {
                act += 1024 ;
                if ( !( vl = realloc( vl, act * sizeof( char* ) ) ) )
                {
                        perror( "realloc()" );
                        exit( 1 );
                }
                if ( !( vct = realloc( vct, act * sizeof( long ) ) ) )
                {
                        perror( "realloc()" );
                        exit( 1 );
                }
        }
        for ( ce = vl + cl,
                cf = ( ( ct = vl + cv ) - 1 ),
                lf = ( ( lt = vct + cv ) - 1 ) ;
              ct > ce ;
              cf--, ct--, lf--, lt-- )
        {
                *ct = *cf ;
                *lt = *lf ;
        }
        *lt = 1 ;
        if ( !( *ct = malloc( strlen( val ) + 1 ) ) )
        {
                perror( "malloc()" );
                exit( 1 );
        }
        strcpy( *ct, val );
        return ;
}
/*
 * Stream buffer size handed to setvbuf().  PIPE_MAX is not provided by every
 * <limits.h>, so fall back to a fixed size where it is missing.
 */
#ifdef PIPE_MAX
#define AGGSX_IOBUF     PIPE_MAX
#else
#define AGGSX_IOBUF     65536
#endif

/*
 * Leave the program after a failed write to stdout: status 1 with a
 * diagnostic when the stream carries an error, status 0 otherwise.
 */
static  void            out_fail( void )
{
        if ( ferror( stdout ) )
        {
                perror( "stdout" );
                exit( 1 );
        }
        exit( 0 );
}

/*
 * aggsx: read one value per line from stdin and aggregate them.
 *
 * Each line is trimmed of blanks, tabs, CR and LF.  The text "<null>" is
 * counted as a null; every other value goes through fmv() into the sorted
 * distinct-value table.  Depending on the options the program then prints
 * the value/count listing (-l, -b) and/or one '|'-separated aggregate line:
 *
 *   CtD|CtN|Min|CtMin|Max|CtMax|Avg|Med|MPop|CtMPop|Ct
 *
 * With -u or -d, fmv() has already printed what is wanted and nothing more
 * is written.  Exit status is 0 on success and 1 on usage, input, output or
 * allocation errors.
 */
int main( int argc, char **argv ){
        char            *arg ;  /* option currently being examined */

        setvbuf( stdin, NULL, _IOFBF, AGGSX_IOBUF );
        setvbuf( stdout, NULL, _IOFBF, AGGSX_IOBUF );

        for ( i = 1 ; i < argc ; i++ )
        {
                arg = argv[i] ; /* examine the i-th argument, not always the first */
                if ( !strcmp( arg, "-b" ) )
                {
                        b = 1 ;
                        continue ;
                }
                if ( !strcmp( arg, "-l" ) )
                {
                        l = 1 ;
                        continue ;
                }
                /* -p needs a following argument; without one it falls to usage */
                if ( !strcmp( arg, "-p" )
                  && i + 1 < argc )
                {
                        pfx = argv[++i] ;
                        continue ;
                }
                if ( !strcmp( arg, "-u" ) )
                {
                        u = 1 ;
                        continue ;
                }
                if ( !strcmp( arg, "-d" ) )
                {
                        d = 1 ;
                        continue ;
                }
                if ( !strcmp( arg, "-h" ) )
                {
                        fputs(
"CtD|CtN|Min|CtMin|Max|CtMax|Avg|Med|MPop|CtMPop|Ct\n",
                                stdout );
                        continue ;
                }
                fputs(
"Usage:\n"
"\n"
"aggsx [ -b ] [ -l ] [ -p <prefix> ] [ -u ] [ -d ] [ -h ]\n"
"\n"
"Computes the count distinct, count null, min, count of min, max,\n"
"count of max, average (mean) of not null values if numeric,\n"
"median of not null values, largest of the most popular values,\n"
"count of that most popular value, total line count.\n"
"\n"
"If -l is present, first prints out all values in order and their counts,\n"
"null last, but no aggregates.\n"
"If -b is present, prints out like -l and then prints aggregates.\n"
"If -p is present, the aggregate is prefixed with '<prefix>|'.\n"
"If -u is present, just immediately prints out unique values.\n"
"If -d is present, just immediately prints out duplicated values.\n"
"If -h is present, prefixes values line with header line:\n"
"CtD|CtN|Min|CtMin|Max|CtMax|Avg|Med|MPop|CtMPop|Ct\n"
"\n"                    , stderr );
                exit( 1 );
        }

        while( fgets( buf, sizeof( buf ), stdin ) )
        {
                lct++ ;
                cp2 = cp3 = NULL ;
                lfm = 1 ;
                /* locate the first (cp2) and last (cp3) non-blank characters */
                for ( cp = buf ; *cp ; cp++ )
                {
                        switch( *cp )
                        {
                        case '\n':
                                lfm = 0 ;
                                break ;
                        case '\r':
                        case ' ':
                        case '\t':
                                break ;
                        default:
                                if ( !cp2 )
                                {
                                        cp2 = cp ;
                                }
                                cp3 = cp ;
                                break ;
                        }
                }
                /*
                 * No newline means the line did not fit into buf, unless
                 * this is simply the last line of input without one.
                 */
                if ( lfm
                  && !feof( stdin ) )
                {
                        fprintf( stderr, "\nFatal: Data line %lu too long!\n",
                                                lct );
                        exit( 1 );
                }
                if ( cp3 )
                {
                        cp3[1] = '\0' ; /* cut trailing blanks */
                        cp = cp2 ;
                }
                /* an all-blank line leaves cp on the terminator: value "" */
                if ( strcmp( cp, "<null>" ) )
                {
                        fmv( cp );
                }
                else
                {
                        nct++ ;
                }
        }
        if ( ferror( stdin ) )
        {
                perror( "stdin" );
                exit( 1 );
        }

        if ( u
          || d )
        {
                exit( 0 );      /* fmv() already reported everything */
        }

        if ( l || b )
        {
                for ( ll = 0, cpp = vl, lp = vct ;
                        ll < dct ;
                        ll++, lp++, cpp++ )
                {
                        if ( 0 > printf( "%lu\t%s\n", *lp, *cpp ) )
                        {
                                out_fail();
                        }
                }
                if ( nct
                  && 0 > printf( "%lu\t%s\n", nct, "<null>" ) )
                {
                        out_fail();
                }
                if ( !b )
                {
                        exit( 0 );
                }
        }

        /*
         * Median: ll2 starts at nct plus half the non-null lines, rounded
         * up, and each stored value advances it by its count.  The last
         * value reached while ll2 <= lct is the median (the upper of the two
         * middle values for an even number of lines).  Rounding down here
         * would select one value too far for an odd number of lines.
         */
        ll2 = ( ( lct - nct + 1 ) >> 1 ) + nct ;
        for ( ll = 0, cpp = vl, lp = vct ; ll < dct ; ll++, lp++, cpp++ )
        {
                cp = *cpp ;
                if ( *lp >= mpc )       /* >= : ties go to the larger value */
                {
                        mpc = *lp ;
                        mp = cp ;
                }
                if ( *cp
                  && num )
                {
                        errno = 0 ;
                        nval = strtod( cp, &cp2 );
                        if ( errno              /* underflow or overflow */
                          || ( cp2 == cp )      /* didn't like the characters */
                          || *cp2 )             /* didn't like some */
                        {
                                num = 0 ;
                        }
                        else
                        {
                                sum += ( nval * *lp ) ;
                                nvc += *lp ;
                        }
                }
                if ( ll2 <= lct )
                {
                        me = cp ;
                        ll2 += *lp ;
                }
        }

        if ( num
          && nvc )
        {
                sum /= nvc ;
                sprintf( buf, "%-30.20LG", sum );
                /* strip the padding produced by the left-justified format */
                for ( cp = buf + strlen( buf ) - 1 ;
                      cp >= buf && *cp == ' ' ;
                      cp-- )
                {
                        *cp = '\0' ;
                }
        }
        else
        {
                strcpy( buf, "N/A" );
        }

        if ( ( pfx
            && 0 > printf( "%s|", pfx ) )
          || 0 > printf( "%lu|%lu|%s|%lu|%s|%lu|%s|%s|%s|%lu|%lu\n",
                                dct, nct,
                                ( dct ? vl[0] : "" ),
                                ( dct ? vct[0] : 0UL ),
                                ( dct ? vl[dct - 1] : "" ),
                                ( dct ? vct[dct - 1] : 0UL ),
                                buf, me, mp, mpc, lct ) )
        {
                out_fail();
        }
        exit( 0 );
}

# 17  
Old 03-30-2011
I am not sure whether anybody mentioned to you but when you use a variable under 'awk' that is originally a string (because you piped it as a string from somewhere or you used it as a string in the first place), then you just have to add a number to it and 'awk' converts the result into a number. Usually you add 0 (zero) to it.

Here is an example that should work for you:

Code:
wc -w inputfile | cut -c 1-9 | awk '{print $1 + a} a=0'

In my case, the inputfile had 10 words and awk returned 10 at the end. You can easily see this when you add a number to the result (like "1" in the case below):

Code:
wc -w inputfile | cut -c 1-9 | awk 'END {c=$1+a; print c} a=1'
      10
11

# 18  
Old 03-30-2011
I think he wanted word frequency, not total word count. He could have used a tool like awk, perl and put 1 or value+1 to a variable for every word in lower case, then dump the variables.
This User Gave Thanks to DGPickett For This Post:
# 19  
Old 04-05-2011
Hey guys,

Does anybody know how to parse a large text file to remove punctuation completely, so it only displays text?

Thanks
# 20  
Old 04-05-2011
Quote:
Originally Posted by John0101
Hey guys,

Does anybody know how to parse a large text file to remove punctuation completely, so it only displays text?

Thanks
look into 'man tr'.
# 21  
Old 04-05-2011
tr may merge some words that shouldn't be merged, so maybe replacing the punctuation with a space would be preferable?
something like :
Code:
sed "s/[\"',.;?!:]/ /g;s/  */ /g" infile

(feel free to add or remove what you consider to be "punctuation" within the square brackets)

NOTE that :
If you want to include the closing square bracket ], it should then be FIRST in the list
If you want to include the Hyphen - , it should be the first OR the last of the list

look :

Code:
# echo "a^:]-,.;!?\"'a"
a^:]-,.;!?"'a
# echo "a^:]-,.;!?\"'a" | sed "s/[]^\"',.;?!:-]/ /g;s/  */ /g"
a a

---------- Post updated at 08:21 PM ---------- Previous update was at 08:06 PM ----------

For that one you can try this :
Code:
strings walle.txt | head -n 5521 | tail -n +91 >cleanfile.txt


Last edited by ctsgnb; 04-05-2011 at 03:19 PM..
 
Login or Register to Ask a Question

Previous Thread | Next Thread

8 More Discussions You Might Find Interesting

1. Infrastructure Monitoring

Nmon Analysis

Dear All, I am an performance tester. Now i am working in project where we are using linux 2.6.32. Now I got an oppurtunity to learn the monitoring the server. As part of this task i need to do analysis of the Nmon report. I was completely blank in this. So please suggest me how to start... (0 Replies)
Discussion started by: iamsengu
0 Replies

2. UNIX for Dummies Questions & Answers

Help with text analysis - UNIX

Hey Guys I recently posted yesterday about trying to count the amount of separate words that exists in a text file e.g. walle.txt. i want the output to give to give me a list of words with a number next indicating how many times its came up in the file e.g: cat 20 the 11 if 40 I'm... (0 Replies)
Discussion started by: John0101
0 Replies

3. Shell Programming and Scripting

Analysis of a script

what does this line in a script mean?? I have tried to give it at the command prompt and here is what it returns ksh: /db2home/db2dap1/sqllib/db2profile: not found. . /db2home/db2dap1/sqllib/db2profile i have tried the same thing for my home directory too and the result is the same .... (5 Replies)
Discussion started by: ramky79
5 Replies

4. Shell Programming and Scripting

Metacharacters analysis

:confused:Hi , Can someone please advise what is the meaning of metacharacters in below code? a_PROCESS=${0##*/} a_DPFX=${a_PROCESS%.*} a_LPFX="a_DPFX : $$ : " a_UPFX="Usage: $a_PROCESS" Regards, gehlnar (3 Replies)
Discussion started by: gehlnar
3 Replies

5. Shell Programming and Scripting

text file analysis

Hello, I have a text file containin 4 lines which are repeated along the file, ie the file looks like this: 16:20:12.060769 blablabla 40 16:20:12.093199 blablabla 640 16:20:12.209003 blablabla 640 16:20:12.273179 blablabla 216 16:20:27.217444 blablabla 40 16:20:27.235410 blablabla 640... (2 Replies)
Discussion started by: Celine19
2 Replies

6. Programming

Regarding stack analysis

I would like to know how I could do the following : void func(){ int a = 100; b=0; int c = a/b; } void sig_handler (int sig,siginfo_t *info,void *context){ //signal handling function //here I want to access the variables of func() } int main(){ struct sigaction *act =... (7 Replies)
Discussion started by: vpraveen84
7 Replies

7. Shell Programming and Scripting

AWK script: decrypt text uses frequency analysis

Ez all! I have a question how to decrypt text uses letter frequency analysis. I have code which count the letters, but what i need to do after that. Can anybody help me to write a code. VERY NEEDED! My code now: #!/usr/bin/awk -f BEGIN { FS="" } { for (i=1; i <= NF; i++) { if ($i... (4 Replies)
Discussion started by: SerJel
4 Replies

8. Solaris

Catalina Analysis

How can I make analysis for catalina.out (2 Replies)
Discussion started by: Burhan
2 Replies
Login or Register to Ask a Question