I don't want any codes for this problem. Just suggestions:
I have a huge collection of text files (around 300,000) which look like this:
1.fil
Code:
orange
apple
dskjdsk
computer
skjks
The entire text collection (referenced above) has about 1 billion words.
I have created another text file which contains some words like these:
junk.dat
Code:
dskjdsk
skjks
The above text file has about 300,000 such words.
As you can see the words that I have filtered out are some junk words which I want to remove from my text collection.
I have already written a code for this in C but the problem is when the code runs it seems that it will take days and months to finally clean up the entire text data.
My algorithm:
1. Read the junk.dat file and make a hash table of it.
2. Read each of the *.fil files one by one, search for words in the hash table.
3. If words present in junk.dat, then leave out that word and move onto the next word.
4. Keep doing this for the entire collection of 300000 files.
I have even tried to minimize the disk read-writes. Still it is very slow.
Below is my C code:
Code:
//This will remove one word from the entire collection.
#define _GNU_SOURCE
#define _BSD_SOURCE
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <inttypes.h>
#include <search.h>
#define FILE_LIST 3000000
#define UNIQUE_WORDS 400000 //change this based on the number of words in junk.dat
#define HIGHEST_WORDS 999999 //this is the maximum number of words that
//one document has...obtained using wc -l and then sort
char ** read_file ( char * , ssize_t * );
char *file_name_generator ( void );
char * itoa( int , char * );
char * reverse ( char [] );
char *file_names ( unsigned int );
int32_t main ( int32_t argc , char **argv )
{
unsigned long int number_of_words = 0;
unsigned long int i = 0;
unsigned long int word_counter = 0;
unsigned long int j = 0;
FILE *output_pointer = NULL;
FILE *input_pointer = NULL;
ENTRY e , *ep;
char **words_from_dictionary = NULL;
char **words_from_webpage = NULL;
char **word_array = NULL;
char *file_name = NULL;
file_name = ( char * ) calloc ( 300 , sizeof ( char ) );
if ( file_name == NULL )
{
perror ( "malloc() memory allocation failure" );
return ( EXIT_FAILURE );
}
words_from_dictionary = read_file ( argv [ 1 ] , &number_of_words );
word_array = ( char * ) malloc ( HIGHEST_WORDS * sizeof ( char ) );
if ( word_array == NULL )
{
fprintf ( stderr , "malloc() memory allocation failure in word_array\n" );
}
hcreate ( number_of_words );
//creating hash table for junk.dat
for ( i = 0 ; i < number_of_words ; i++ )
{
e.key = words_from_dictionary [ i ];
e.data = ( void * ) i;
ep = hsearch ( e, ENTER );
if ( ep == NULL )
{
fprintf(stderr, "entry failed\n");
exit(1);
}
}
i = 1;
while ( i <= FILE_LIST )
{
file_name = file_name_generator ();
input_pointer = fopen ( file_name , "r" );
if ( input_pointer == NULL )
{
i++;
memset ( file_name , 0 , strlen ( file_name ) );
continue;//if file not found move on to next file
}
fclose ( input_pointer );
words_from_webpage = read_file ( file_name , &number_of_words );//read entire words from file in an array
for ( j = 0 ; j < number_of_words ; j ++ )
{
e.key = words_from_webpage [ j ];
ep = hsearch ( e , FIND );
if ( ep == NULL )
{
* ( word_array + word_counter++ ) = words_from_webpage [ j ];
}
}//search for words
memset ( file_name , 0 , strlen ( file_name ) );
file_name = file_names ( i );
output_pointer = fopen ( file_name , "w" );
j = 0;
while ( j < ( word_counter ) )
{
fprintf ( output_pointer , "%s" , word_array [ j ] );//write the array to output file
j ++;
}
fclose ( output_pointer );
memset ( word_array , 0 , HIGHEST_WORDS );
for ( j = 0 ; j < number_of_words ; j ++ )
{
free ( words_from_webpage [j] );
}
free ( words_from_webpage );
memset ( file_name , 0 , strlen ( file_name ) );
word_counter = 0;
i ++;
}
free ( file_name );
return ( EXIT_SUCCESS );
}
char *file_names ( unsigned int i )//generate file names for writing
{
char *str = NULL;
str = ( char * ) calloc ( 7 , sizeof ( char ) );
if ( str == NULL )
{
perror ( "malloc() memory allocation error" );
//return ( EXIT_FAILURE );
}
char *file_path = NULL;
file_path = ( char * ) calloc ( 300 , sizeof ( char ) );
if ( file_path == NULL )
{
perror ( "malloc() memory allocation failure" );
}
char common_path[] = {"/test/remove_one_word/"};
//static unsigned long int i = 1;
str = itoa ( i , str );
strcat ( file_path , common_path );
strcat ( str , ".dat" );
strcat ( file_path , str );
i ++;
return ( file_path );
}
char *file_name_generator ( void )//generate input file names
{
char *str = NULL;
str = ( char * ) calloc ( 7 , sizeof ( char ) );
if ( str == NULL )
{
perror ( "malloc() memory allocation error" );
//return ( EXIT_FAILURE );
}
char *file_path = NULL;
file_path = ( char * ) calloc ( 300 , sizeof ( char ) );
if ( file_path == NULL )
{
perror ( "malloc() memory allocation failure" );
}
char common_path[] = {"/test/remove_one_word/"};//CHANGE THIS
static unsigned long int i = 1;
str = itoa ( i , str );
strcat ( file_path , common_path );
strcat ( str , ".dat" );
strcat ( file_path , str );
i ++;
return ( file_path );
}
char * itoa ( int n , char * s )
{
unsigned long int i, sign;
if ( ( sign = n ) < 0 ) /* record sign */
n = -n; /* make n positive */
i = 0;
do { /* generate digits in reverse order */
s [ i++ ] = n % 10 + '0'; /* get next digit */
} while ( ( n /= 10 ) > 0 ); /* delete it */
if ( sign < 0 )
s [ i++ ] = '-';
s [ i ] = '\0';
reverse ( s );
return ( s );
}
char * reverse ( char s [ ] )
{
unsigned long int i, j;
char c;
for ( i = 0, j = strlen ( s ) - 1; i < j; i ++, j -- )
{
c = s [ i ];
s [ i ] = s [ j ];
s [ j ] = c;
}
return ( s );
}
char ** read_file ( char *path , ssize_t *number_of_words )
{
char ch;
char *line = NULL;
size_t len = 0;
ssize_t read;
(*number_of_words) = 0;
unsigned long int i = 0;
unsigned long int j = 0;
FILE *pointer = NULL;
char **word_array = NULL;
pointer = fopen ( path , "r" );
if ( pointer == NULL)
{
perror ( "File read error " );
}
//counting the number of words...
while ( !feof ( pointer ) )
{
ch = fgetc ( pointer );
if ( ch == '\n' && ch != EOF )
{
(*number_of_words) ++;
}
}
rewind ( pointer );
word_array = malloc ( (*number_of_words) * sizeof ( char * ) );
if ( word_array == NULL )
{
perror ( "malloc() memory allocation failure" );
}
for ( i = 0 ; i < (*number_of_words) ; i ++ )
{
word_array[i] = malloc ( 100 * sizeof ( char ) );
if ( word_array[i] == NULL )
{
perror ( "malloc() memory allocation failure" );
}
}
i = 0;
j = 0;
while ( !feof ( pointer ) )
{
while ( ( read = getline ( &line , &len , pointer ) ) != -1 )
{
strcpy ( word_array[i] , line );
if ( i <= (*number_of_words) )
{
i++;
}
}
}
fclose ( pointer );
return ( word_array );
}
1. Stop using malloc()/calloc() and free() every time you need memory. Get ONE chunk of memory and reuse it. For example, pass a character buffer into a method instead of using calloc() to allocate a new one each and every time.
2. Fix your memory leaks - I spotted at least two, on in file_name_generator(), one caused by the return value of file_name_generator() overwriting a malloc()'d pointer.
3. Don't EVER use fgetc().
4. Don't read files TWICE. Use something like fgets() and process each word as you read it. If you're using rewind(), you've done something wrong.
What I need is to remove the text from Location_file.txt from each line matching all entries from Remove_location.txt
Location_file.txt
FlowPrePaid, h3nmg1cm2,Jamaica_MTAImageFileFlowPrePaid,h0nmg1cm1, Flow_BeatTest,FlowRockTest
FlowNewTest,FlowNewTest,h0nmg1cm1
PartiallySubscribed,... (3 Replies)
Hi,
I am trying to remove a string ".var" using the below command but it's not working as expected, when I execute this in the command prompt using the echo it's working fine , please let me know where I am doing it wrong.
UYRD=$FILE_$timestamp.csv | awk '{gsub(".var", "");print}' # this is... (6 Replies)
Dear all
From below mention input file I needed op file as show below. I am using below code but not worked.
I/p file
BSCBCH1 EXAL-1-4 WO* SMPS MAINS FAIL
BSCBCH1 EXAL-1-5 WO* SMPS RECTIFIER FAIL
BSCBCH1 EXAL-1-6 WO* SMPS MAJOR ALARM
BSCBCH2 EXAL-1-10 WO* ... (5 Replies)
Hello,
Sorry for my bad english.
I need to improve performance in project managing large data, these data are exported to a MySql from XML.
Now I use PHP (XMLReader ()) to do this job.
I need a faster way to do this process.
Which do you think is the best way?
Example:
(the item... (2 Replies)
Hi,
I'm a newbie to shell scripting and I have the following problem:
I need all spaces between two letters or a letter and a number exchanged for an underscore, but all spaces between a letter and other characters need to remain. Searching forums didn't help...
One example for clarity:
... (3 Replies)
Hi Everyone,
I am using a centos 5.2 server as an sflow log collector on my network. Currently I am using inmons free sflowtool to collect the packets sent by my switches. I have a bash script running on an infinate loop to stop and start the log collection at set intervals - currently one... (2 Replies)
Hi,
I'm trying to figure out the best solution to the following problem, and I'm not
yet that much experienced like you. :-)
Basically I have to read a fairly large file, composed of "messages" , in order
to display all of them through an user interface (made with QT).
The messages that... (3 Replies)
Hi All
I have approximately 10 files that are at least 100+ MB in size. I am importing them into a DB to output them to the web. What i need to do first is clean the files up so i dont have un necessary rows in the DB. Below is what the file looks like:
Ignore the <TAB> annotations as that... (4 Replies)