Fast string removal from large text collection


 
Thread Tools Search this Thread
Top Forums Programming Fast string removal from large text collection
# 1  
Old 10-03-2011
Fast string removal from large text collection

Hi All,

I don't want any codes for this problem. Just suggestions:

I have a huge collection of text files (around 300,000) which look like this:

1.fil
Code:
orange
apple
dskjdsk
computer
skjks

The entire text collection (referenced above) has about 1 billion words.

I have created another text file which contains some words like these:

junk.dat
Code:
dskjdsk
skjks

The above text file has about 300,000 such words.

As you can see the words that I have filtered out are some junk words which I want to remove from my text collection.

I have already written a code for this in C but the problem is when the code runs it seems that it will take days and months to finally clean up the entire text data.

My algorithm:
1. Read the junk.dat file and make a hash table of it.
2. Read each of the *.fil files one by one, search for words in the hash table.
3. If words present in junk.dat, then leave out that word and move onto the next word.
4. Keep doing this for the entire collection of 300000 files.

I have even tried to minimize the disk read-writes. Still it is very slow.

Below is my C code:

Code:
//This will remove one word from the entire collection.

#define _GNU_SOURCE
#define _BSD_SOURCE

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <inttypes.h>
#include <search.h>

#define FILE_LIST 3000000
#define UNIQUE_WORDS 400000 //change this based on the number of words in junk.dat
#define HIGHEST_WORDS 999999 //this is the maximum number of words that
//one document has...obtained using wc -l and then sort


char ** read_file ( char * , ssize_t * );
char *file_name_generator ( void );
char * itoa( int , char * );
char * reverse ( char [] );
char *file_names ( unsigned int );


int32_t main ( int32_t argc , char **argv )
{
	unsigned long int number_of_words = 0;
	unsigned long int i = 0;
	unsigned long int word_counter = 0;
	unsigned long  int j = 0;

	FILE *output_pointer = NULL;
	FILE *input_pointer = NULL;

	ENTRY e , *ep;

	char **words_from_dictionary = NULL;
	char **words_from_webpage = NULL;
	char **word_array = NULL;


	char *file_name = NULL;
	file_name = ( char * ) calloc ( 300 , sizeof ( char ) );
	if ( file_name == NULL )
	{
		perror ( "malloc() memory allocation failure" );
		return ( EXIT_FAILURE );
	}

	words_from_dictionary = read_file ( argv [ 1 ] , &number_of_words );
	word_array = ( char * ) malloc ( HIGHEST_WORDS * sizeof ( char ) );
	if ( word_array == NULL )
	{
		fprintf ( stderr , "malloc() memory allocation failure in word_array\n" );
	}


	hcreate ( number_of_words );

        //creating hash table for junk.dat
	for ( i = 0 ; i < number_of_words ; i++ ) 
	{
		e.key = words_from_dictionary [ i ];
		e.data = ( void * ) i;
		ep = hsearch ( e, ENTER );
		if ( ep == NULL )
		{
			fprintf(stderr, "entry failed\n");
			exit(1);
		}
	}
	
	i = 1;

	while ( i <= FILE_LIST )
	{
		file_name = file_name_generator ();
		input_pointer = fopen ( file_name , "r" );
		if ( input_pointer == NULL )
		{
			i++;
			memset ( file_name , 0 , strlen ( file_name ) );
			continue;//if file not found move on to next file
		}
		fclose ( input_pointer );
		words_from_webpage = read_file ( file_name , &number_of_words );//read entire words from file in an array
		for ( j = 0 ; j < number_of_words ; j ++ )
		{
			e.key = words_from_webpage [ j ];
			ep = hsearch ( e , FIND );
			if ( ep == NULL )
			{
				* ( word_array + word_counter++ ) = words_from_webpage [ j ];
			}
		}//search for words

		memset ( file_name , 0 , strlen ( file_name ) );
		file_name = file_names ( i );
		output_pointer = fopen ( file_name , "w" );

		j = 0;

		while ( j < ( word_counter ) )
		{
			fprintf ( output_pointer , "%s" , word_array [ j ] );//write the array to output file
			j ++;
		}
		fclose ( output_pointer );

		memset ( word_array , 0 , HIGHEST_WORDS );

		for ( j = 0 ; j < number_of_words ; j ++ )
		{
			free ( words_from_webpage [j] );
		}
			free ( words_from_webpage );

		memset ( file_name , 0 , strlen ( file_name ) );
		word_counter = 0;
		i ++;
	}
	
	free ( file_name );

return ( EXIT_SUCCESS );
}


char *file_names ( unsigned int i )//generate file names for writing
{
	char *str = NULL;
	str = ( char * ) calloc ( 7 , sizeof ( char ) );
	if ( str == NULL )
	{
		perror ( "malloc() memory allocation error" );
		//return ( EXIT_FAILURE );
	}
	
	char *file_path = NULL;
	file_path = ( char * ) calloc ( 300 , sizeof ( char ) );
	if ( file_path == NULL )
	{
		perror ( "malloc() memory allocation failure" );
	}

	char common_path[] = {"/test/remove_one_word/"};
	//static unsigned long int i = 1;
	
	str = itoa ( i , str );
	strcat ( file_path , common_path );
	strcat ( str , ".dat" );
	strcat ( file_path , str );
	i ++;

	return ( file_path );
}


char *file_name_generator ( void )//generate input file names
{
	char *str = NULL;
	str = ( char * ) calloc ( 7 , sizeof ( char ) );
	if ( str == NULL )
	{
		perror ( "malloc() memory allocation error" );
		//return ( EXIT_FAILURE );
	}
	
	char *file_path = NULL;
	file_path = ( char * ) calloc ( 300 , sizeof ( char ) );
	if ( file_path == NULL )
	{
		perror ( "malloc() memory allocation failure" );
	}

	char common_path[] = {"/test/remove_one_word/"};//CHANGE THIS
	static unsigned long int i = 1;
	
	str = itoa ( i , str );
	strcat ( file_path , common_path );
	strcat ( str , ".dat" );
	strcat ( file_path , str );
	i ++;

	return ( file_path );
}


char * itoa ( int n , char * s )
{
     unsigned long int i, sign;
 
     if ( ( sign = n ) < 0 )  /* record sign */
         n = -n;          /* make n positive */
     i = 0;
     do {       /* generate digits in reverse order */
         s [ i++ ] = n % 10 + '0';   /* get next digit */
     } while ( ( n /= 10 ) > 0 );     /* delete it */
     if ( sign < 0 )
         s [ i++ ] = '-';
     s [ i ] = '\0';
     reverse ( s );
	return ( s );
}


char * reverse ( char s [ ] )
 {
     unsigned long int i, j;
     char c;
 
     for ( i = 0, j = strlen ( s ) - 1; i < j; i ++, j -- ) 
	{
         	c = s [ i ];
         	s [ i ] = s [ j ];
         	s [ j ] = c;
     	}
	return ( s );
 }


char ** read_file ( char *path , ssize_t *number_of_words )
{
	char ch;
	char *line = NULL;

	size_t len = 0;
	ssize_t read;

	(*number_of_words) = 0;
	unsigned long int i = 0;
	unsigned long int j = 0;

	FILE *pointer = NULL;
	char **word_array = NULL;

	pointer = fopen ( path , "r" );
	if ( pointer == NULL)
	{
		perror ( "File read error " );
	}

	//counting the number of words...
	while ( !feof ( pointer ) )
	{
		ch = fgetc ( pointer );
		if ( ch == '\n' && ch != EOF )
		{
			(*number_of_words) ++;
		}
	}

	rewind ( pointer );

	word_array = malloc ( (*number_of_words) * sizeof ( char * ) );
	if ( word_array == NULL )
	{
		perror ( "malloc() memory allocation failure" );
	}

	for ( i = 0 ; i < (*number_of_words) ; i ++ )
	{
		word_array[i] = malloc ( 100 * sizeof ( char ) );
		if ( word_array[i] == NULL )
		{
			perror ( "malloc() memory allocation failure" );
		}
	}

	i = 0;
	j = 0;

	while ( !feof ( pointer ) )
	{
		while ( ( read = getline ( &line , &len , pointer ) ) != -1 )
		{
			strcpy ( word_array[i] , line );
			if ( i <= (*number_of_words) )
			{
				i++;
			}
		}
	}

	fclose ( pointer );
	return ( word_array );
}

I am using gcc and working on Linux.
# 2  
Old 10-03-2011
1. Stop using malloc()/calloc() and free() every time you need memory. Get ONE chunk of memory and reuse it. For example, pass a character buffer into a method instead of using calloc() to allocate a new one each and every time.
2. Fix your memory leaks - I spotted at least two, on in file_name_generator(), one caused by the return value of file_name_generator() overwriting a malloc()'d pointer.
3. Don't EVER use fgetc().
4. Don't read files TWICE. Use something like fgets() and process each word as you read it. If you're using rewind(), you've done something wrong.
This User Gave Thanks to achenle For This Post:
Login or Register to Ask a Question

Previous Thread | Next Thread

9 More Discussions You Might Find Interesting

1. Shell Programming and Scripting

Removing string from CSV file by provide removal string from other file

What I need is to remove the text from Location_file.txt from each line matching all entries from Remove_location.txt Location_file.txt FlowPrePaid, h3nmg1cm2,Jamaica_MTAImageFileFlowPrePaid,h0nmg1cm1, Flow_BeatTest,FlowRockTest FlowNewTest,FlowNewTest,h0nmg1cm1 PartiallySubscribed,... (3 Replies)
Discussion started by: ketanraut
3 Replies

2. UNIX for Dummies Questions & Answers

awk string removal

Hi, I am trying to remove a string ".var" using the below command but it's not working as expected, when I execute this in the command prompt using the echo it's working fine , please let me know where I am doing it wrong. UYRD=$FILE_$timestamp.csv | awk '{gsub(".var", "");print}' # this is... (6 Replies)
Discussion started by: shruthidwh
6 Replies

3. Shell Programming and Scripting

String removal from file

Dear all From below mention input file I needed op file as show below. I am using below code but not worked. I/p file BSCBCH1 EXAL-1-4 WO* SMPS MAINS FAIL BSCBCH1 EXAL-1-5 WO* SMPS RECTIFIER FAIL BSCBCH1 EXAL-1-6 WO* SMPS MAJOR ALARM BSCBCH2 EXAL-1-10 WO* ... (5 Replies)
Discussion started by: jaydeep_sadaria
5 Replies

4. Shell Programming and Scripting

Large XML to MySQL - fast way

Hello, Sorry for my bad english. I need to improve performance in project managing large data, these data are exported to a MySql from XML. Now I use PHP (XMLReader ()) to do this job. I need a faster way to do this process. Which do you think is the best way? Example: (the item... (2 Replies)
Discussion started by: stendelis
2 Replies

5. UNIX for Dummies Questions & Answers

selective removal of blank spaces in string

Hi, I'm a newbie to shell scripting and I have the following problem: I need all spaces between two letters or a letter and a number exchanged for an underscore, but all spaces between a letter and other characters need to remain. Searching forums didn't help... One example for clarity: ... (3 Replies)
Discussion started by: Cpt_Cell
3 Replies

6. Shell Programming and Scripting

Help with splitting a large text file into smaller ones

Hi Everyone, I am using a centos 5.2 server as an sflow log collector on my network. Currently I am using inmons free sflowtool to collect the packets sent by my switches. I have a bash script running on an infinate loop to stop and start the log collection at set intervals - currently one... (2 Replies)
Discussion started by: lord_butler
2 Replies

7. Programming

Read/Write a fairly large amount of data to a file as fast as possible

Hi, I'm trying to figure out the best solution to the following problem, and I'm not yet that much experienced like you. :-) Basically I have to read a fairly large file, composed of "messages" , in order to display all of them through an user interface (made with QT). The messages that... (3 Replies)
Discussion started by: emitrax
3 Replies

8. Programming

fopen() + reading in large text files

For reading in large text files (say files over 1kB in size) are there any issues with fopen() that I should be aware of ? cheers (2 Replies)
Discussion started by: JamesGoh
2 Replies

9. Shell Programming and Scripting

Large Text Files

Hi All I have approximately 10 files that are at least 100+ MB in size. I am importing them into a DB to output them to the web. What i need to do first is clean the files up so i dont have un necessary rows in the DB. Below is what the file looks like: Ignore the <TAB> annotations as that... (4 Replies)
Discussion started by: caddyjoe77
4 Replies
Login or Register to Ask a Question