Fast string removal from large text collection

10-03-2011

Registered User

190, 1

Join Date: Jan 2011

Last Activity: 11 October 2017, 1:35 PM EDT

Location: Nowhere

Posts: 190

Thanks Given: 227

Thanked 1 Time in 1 Post

Fast string removal from large text collection

Hi All,

I don't want any codes for this problem. Just suggestions:

I have a huge collection of text files (around 300,000) which look like this:

1.fil

Code:

orange
apple
dskjdsk
computer
skjks

The entire text collection (referenced above) has about 1 billion words.

I have created another text file which contains some words like these:

junk.dat

Code:

dskjdsk
skjks

The above text file has about 300,000 such words.

As you can see the words that I have filtered out are some junk words which I want to remove from my text collection.

I have already written a code for this in C but the problem is when the code runs it seems that it will take days and months to finally clean up the entire text data.

My algorithm:
1. Read the junk.dat file and make a hash table of it.
2. Read each of the *.fil files one by one, search for words in the hash table.
3. If words present in junk.dat, then leave out that word and move onto the next word.
4. Keep doing this for the entire collection of 300000 files.

I have even tried to minimize the disk read-writes. Still it is very slow.

Below is my C code:

Code:

//This will remove one word from the entire collection.

#define _GNU_SOURCE
#define _BSD_SOURCE

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <inttypes.h>
#include <search.h>

#define FILE_LIST 3000000
#define UNIQUE_WORDS 400000 //change this based on the number of words in junk.dat
#define HIGHEST_WORDS 999999 //this is the maximum number of words that
//one document has...obtained using wc -l and then sort


char ** read_file ( char * , ssize_t * );
char *file_name_generator ( void );
char * itoa( int , char * );
char * reverse ( char [] );
char *file_names ( unsigned int );


int32_t main ( int32_t argc , char **argv )
{
	unsigned long int number_of_words = 0;
	unsigned long int i = 0;
	unsigned long int word_counter = 0;
	unsigned long  int j = 0;

	FILE *output_pointer = NULL;
	FILE *input_pointer = NULL;

	ENTRY e , *ep;

	char **words_from_dictionary = NULL;
	char **words_from_webpage = NULL;
	char **word_array = NULL;


	char *file_name = NULL;
	file_name = ( char * ) calloc ( 300 , sizeof ( char ) );
	if ( file_name == NULL )
	{
		perror ( "malloc() memory allocation failure" );
		return ( EXIT_FAILURE );
	}

	words_from_dictionary = read_file ( argv [ 1 ] , &number_of_words );
	word_array = ( char * ) malloc ( HIGHEST_WORDS * sizeof ( char ) );
	if ( word_array == NULL )
	{
		fprintf ( stderr , "malloc() memory allocation failure in word_array\n" );
	}


	hcreate ( number_of_words );

        //creating hash table for junk.dat
	for ( i = 0 ; i < number_of_words ; i++ ) 
	{
		e.key = words_from_dictionary [ i ];
		e.data = ( void * ) i;
		ep = hsearch ( e, ENTER );
		if ( ep == NULL )
		{
			fprintf(stderr, "entry failed\n");
			exit(1);
		}
	}
	
	i = 1;

	while ( i <= FILE_LIST )
	{
		file_name = file_name_generator ();
		input_pointer = fopen ( file_name , "r" );
		if ( input_pointer == NULL )
		{
			i++;
			memset ( file_name , 0 , strlen ( file_name ) );
			continue;//if file not found move on to next file
		}
		fclose ( input_pointer );
		words_from_webpage = read_file ( file_name , &number_of_words );//read entire words from file in an array
		for ( j = 0 ; j < number_of_words ; j ++ )
		{
			e.key = words_from_webpage [ j ];
			ep = hsearch ( e , FIND );
			if ( ep == NULL )
			{
				* ( word_array + word_counter++ ) = words_from_webpage [ j ];
			}
		}//search for words

		memset ( file_name , 0 , strlen ( file_name ) );
		file_name = file_names ( i );
		output_pointer = fopen ( file_name , "w" );

		j = 0;

		while ( j < ( word_counter ) )
		{
			fprintf ( output_pointer , "%s" , word_array [ j ] );//write the array to output file
			j ++;
		}
		fclose ( output_pointer );

		memset ( word_array , 0 , HIGHEST_WORDS );

		for ( j = 0 ; j < number_of_words ; j ++ )
		{
			free ( words_from_webpage [j] );
		}
			free ( words_from_webpage );

		memset ( file_name , 0 , strlen ( file_name ) );
		word_counter = 0;
		i ++;
	}
	
	free ( file_name );

return ( EXIT_SUCCESS );
}


char *file_names ( unsigned int i )//generate file names for writing
{
	char *str = NULL;
	str = ( char * ) calloc ( 7 , sizeof ( char ) );
	if ( str == NULL )
	{
		perror ( "malloc() memory allocation error" );
		//return ( EXIT_FAILURE );
	}
	
	char *file_path = NULL;
	file_path = ( char * ) calloc ( 300 , sizeof ( char ) );
	if ( file_path == NULL )
	{
		perror ( "malloc() memory allocation failure" );
	}

	char common_path[] = {"/test/remove_one_word/"};
	//static unsigned long int i = 1;
	
	str = itoa ( i , str );
	strcat ( file_path , common_path );
	strcat ( str , ".dat" );
	strcat ( file_path , str );
	i ++;

	return ( file_path );
}


char *file_name_generator ( void )//generate input file names
{
	char *str = NULL;
	str = ( char * ) calloc ( 7 , sizeof ( char ) );
	if ( str == NULL )
	{
		perror ( "malloc() memory allocation error" );
		//return ( EXIT_FAILURE );
	}
	
	char *file_path = NULL;
	file_path = ( char * ) calloc ( 300 , sizeof ( char ) );
	if ( file_path == NULL )
	{
		perror ( "malloc() memory allocation failure" );
	}

	char common_path[] = {"/test/remove_one_word/"};//CHANGE THIS
	static unsigned long int i = 1;
	
	str = itoa ( i , str );
	strcat ( file_path , common_path );
	strcat ( str , ".dat" );
	strcat ( file_path , str );
	i ++;

	return ( file_path );
}


char * itoa ( int n , char * s )
{
     unsigned long int i, sign;
 
     if ( ( sign = n ) < 0 )  /* record sign */
         n = -n;          /* make n positive */
     i = 0;
     do {       /* generate digits in reverse order */
         s [ i++ ] = n % 10 + '0';   /* get next digit */
     } while ( ( n /= 10 ) > 0 );     /* delete it */
     if ( sign < 0 )
         s [ i++ ] = '-';
     s [ i ] = '\0';
     reverse ( s );
	return ( s );
}


char * reverse ( char s [ ] )
 {
     unsigned long int i, j;
     char c;
 
     for ( i = 0, j = strlen ( s ) - 1; i < j; i ++, j -- ) 
	{
         	c = s [ i ];
         	s [ i ] = s [ j ];
         	s [ j ] = c;
     	}
	return ( s );
 }


char ** read_file ( char *path , ssize_t *number_of_words )
{
	char ch;
	char *line = NULL;

	size_t len = 0;
	ssize_t read;

	(*number_of_words) = 0;
	unsigned long int i = 0;
	unsigned long int j = 0;

	FILE *pointer = NULL;
	char **word_array = NULL;

	pointer = fopen ( path , "r" );
	if ( pointer == NULL)
	{
		perror ( "File read error " );
	}

	//counting the number of words...
	while ( !feof ( pointer ) )
	{
		ch = fgetc ( pointer );
		if ( ch == '\n' && ch != EOF )
		{
			(*number_of_words) ++;
		}
	}

	rewind ( pointer );

	word_array = malloc ( (*number_of_words) * sizeof ( char * ) );
	if ( word_array == NULL )
	{
		perror ( "malloc() memory allocation failure" );
	}

	for ( i = 0 ; i < (*number_of_words) ; i ++ )
	{
		word_array[i] = malloc ( 100 * sizeof ( char ) );
		if ( word_array[i] == NULL )
		{
			perror ( "malloc() memory allocation failure" );
		}
	}

	i = 0;
	j = 0;

	while ( !feof ( pointer ) )
	{
		while ( ( read = getline ( &line , &len , pointer ) ) != -1 )
		{
			strcpy ( word_array[i] , line );
			if ( i <= (*number_of_words) )
			{
				i++;
			}
		}
	}

	fclose ( pointer );
	return ( word_array );
}

I am using gcc and working on Linux.

shoaibjameel123

View Public Profile for shoaibjameel123

Find all posts by shoaibjameel123

10-03-2011

Registered User

1,015, 157

Join Date: Jun 2009

Last Activity: 25 June 2018, 8:15 AM EDT

Posts: 1,015

Thanks Given: 3

Thanked 157 Times in 149 Posts

1. Stop using malloc()/calloc() and free() every time you need memory. Get ONE chunk of memory and reuse it. For example, pass a character buffer into a method instead of using calloc() to allocate a new one each and every time.
2. Fix your memory leaks - I spotted at least two, on in file_name_generator(), one caused by the return value of file_name_generator() overwriting a malloc()'d pointer.
3. Don't EVER use fgetc().
4. Don't read files TWICE. Use something like fgets() and process each word as you read it. If you're using rewind(), you've done something wrong.

This User Gave Thanks to achenle For This Post:

achenle

View Public Profile for achenle

Find all posts by achenle

Programming

Fast string removal from large text collection

9 More Discussions You Might Find Interesting

1. Shell Programming and Scripting

Removing string from CSV file by provide removal string from other file

Discussion started by: ketanraut

2. UNIX for Dummies Questions & Answers

awk string removal

Discussion started by: shruthidwh

3. Shell Programming and Scripting

String removal from file

Discussion started by: jaydeep_sadaria

4. Shell Programming and Scripting

Large XML to MySQL - fast way

Discussion started by: stendelis

5. UNIX for Dummies Questions & Answers

selective removal of blank spaces in string

Discussion started by: Cpt_Cell

6. Shell Programming and Scripting

Help with splitting a large text file into smaller ones

Discussion started by: lord_butler

7. Programming

Read/Write a fairly large amount of data to a file as fast as possible

Discussion started by: emitrax

8. Programming

fopen() + reading in large text files

Discussion started by: JamesGoh

9. Shell Programming and Scripting

Large Text Files

Discussion started by: caddyjoe77