Code:
#define _GNU_SOURCE
#define _BSD_SOURCE
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <inttypes.h>
#include <search.h>
#define NUMBER_OF_FILES 300000
#define LENGTH_OF_FILE 7617230 //number of records in the file
unsigned long int number_of_lines ( FILE * );
void file_name_generator ( unsigned long int , char * , char * );
char * itoa ( int , char * );
char * reverse ( char [] );
void file_names ( unsigned int ,char * , char * );
char * chomp ( char * );
/* Size of the buffer that receives each generated input/output path. */
enum { PATH_BUFFER_SIZE = 300 };
/* Room for the decimal file number plus the ".dat" suffix that file_names()
 * appends to the number buffer. The previous 4-byte buffer was overrun. */
enum { NUMBER_BUFFER_SIZE = 32 };

/*
 * Read at most `capacity` lines from `path` and store a heap-allocated copy
 * of each one (trailing newline removed) in `words`.
 *
 * On return *count holds the number of entries of `words` that were filled,
 * even on failure, so the caller can release them.
 * Returns 0 on success, -1 (after printing a diagnostic) if the file cannot
 * be opened or a copy cannot be allocated. Lines beyond `capacity` are ignored.
 */
static int load_bigram_words(const char *path, char **words, size_t capacity,
                             size_t *count)
{
    FILE *fp;
    char *line = NULL;
    size_t line_capacity = 0;
    size_t filled = 0;
    int rc = 0;

    *count = 0;
    fp = fopen(path, "r");
    if (fp == NULL) {
        fprintf(stderr, "file read error\n");
        return -1;
    }

    /* getline() reports end-of-file itself; no feof() polling is needed. */
    while (filled < capacity && getline(&line, &line_capacity, fp) != -1) {
        /* Copy exactly as many bytes as the line needs instead of strcpy()
         * into a fixed-size slot. */
        words[filled] = strdup(chomp(line));
        if (words[filled] == NULL) {
            fprintf(stderr, "malloc() memory allocation failure\n");
            rc = -1;
            break;
        }
        filled++;
    }

    free(line);
    fclose(fp);
    *count = filled;
    return rc;
}

/*
 * Read at most `capacity` unsigned decimal values from `path` into `counts`.
 *
 * Reading stops at end-of-file, at the first token that is not a number, or
 * when `capacity` values have been stored. *count receives the number of
 * values stored. Returns 0 on success, -1 (after printing a diagnostic) if
 * the file cannot be opened.
 */
static int load_bigram_counts(const char *path, unsigned long int *counts,
                              size_t capacity, size_t *count)
{
    FILE *fp;
    size_t filled = 0;

    *count = 0;
    fp = fopen(path, "r");
    if (fp == NULL) {
        fprintf(stderr, "file read error\n");
        return -1;
    }

    /* "%lu" matches unsigned long and skips leading whitespace/newlines;
     * testing the return value prevents spinning on malformed input. */
    while (filled < capacity && fscanf(fp, "%lu", &counts[filled]) == 1) {
        filled++;
    }

    fclose(fp);
    *count = filled;
    return 0;
}

/*
 * Create the process-wide hsearch() table and map every word in
 * words[0..count) to its index in that array.
 *
 * Returns 0 on success, -1 (after printing a diagnostic) on failure; on
 * failure no table is left allocated.
 */
static int build_bigram_index(char **words, size_t count)
{
    size_t i;

    /* Leave about 25% headroom: a completely full table makes probing slow. */
    if (hcreate(count + count / 4 + 1) == 0) {
        fprintf(stderr, "hash table creation failed\n");
        return -1;
    }

    for (i = 0; i < count; i++) {
        ENTRY item;

        item.key = words[i];
        item.data = (void *)(uintptr_t)i; /* the index travels in the data pointer */
        if (hsearch(item, ENTER) == NULL) {
            fprintf(stderr, "hash table entry failed\n");
            hdestroy();
            return -1;
        }
    }
    return 0;
}

/*
 * For every line of `input`, look the word up in the hsearch() table and
 * write "<count + 1>\n" to `output`, where count is counts[index of word].
 *
 * A word that is not in the table is treated as having count 0, so 1 is
 * written for it; the earlier code dereferenced a NULL lookup result here.
 * NOTE(review): confirm that 0 is the desired count for unknown words.
 *
 * Returns 0 on success, -1 if writing to `output` failed.
 */
static int score_file(FILE *input, FILE *output, const unsigned long int *counts)
{
    char *line = NULL;
    size_t line_capacity = 0;
    int rc = 0;

    while (getline(&line, &line_capacity, input) != -1) {
        ENTRY probe;
        ENTRY *hit;
        unsigned long int count = 0;

        probe.key = chomp(line);
        probe.data = NULL;
        hit = hsearch(probe, FIND);
        if (hit != NULL) {
            count = counts[(uintptr_t)hit->data];
        }
        if (fprintf(output, "%lu\n", count + 1) < 0) {
            rc = -1;
            break;
        }
    }

    free(line);
    return rc;
}

/*
 * Program entry point.
 *
 * 1. Loads the word list "bigrams.dat" and the parallel list of counts
 *    "counts.dat" (line k of one corresponds to line k of the other).
 * 2. Indexes the words in an hsearch() table.
 * 3. For every N in 1..NUMBER_OF_FILES, reads the words of the file named by
 *    file_name_generator() and writes one line per word, containing that
 *    word's count plus one, to the file named by file_names(). Input files
 *    that cannot be opened are skipped.
 *
 * Each input file is processed as a stream, so no per-file arrays are
 * allocated and nothing is leaked between files.
 *
 * Returns EXIT_SUCCESS when the run completes, EXIT_FAILURE if the lookup
 * data cannot be loaded or indexed.
 */
int main(int argc, char **argv)
{
    char **bigram_words_array = NULL;
    unsigned long int *bigram_counts_array = NULL;
    char *file_name = NULL;
    char *str = NULL;
    size_t number_of_bigrams = 0;
    size_t number_of_counts = 0;
    size_t j;
    unsigned long int i;
    int index_built = 0;
    int status = EXIT_FAILURE;

    (void)argc;
    (void)argv;

    /* calloc() so that unread slots are NULL / 0 and both text buffers start
     * out as empty strings. */
    bigram_words_array = calloc(LENGTH_OF_FILE, sizeof *bigram_words_array);
    bigram_counts_array = calloc(LENGTH_OF_FILE, sizeof *bigram_counts_array);
    file_name = calloc(PATH_BUFFER_SIZE, sizeof *file_name);
    str = calloc(NUMBER_BUFFER_SIZE, sizeof *str);
    if (bigram_words_array == NULL || bigram_counts_array == NULL
        || file_name == NULL || str == NULL) {
        fprintf(stderr, "malloc() memory allocation failure\n");
        goto cleanup;
    }

    if (load_bigram_words("bigrams.dat", bigram_words_array, LENGTH_OF_FILE,
                          &number_of_bigrams) != 0) {
        goto cleanup;
    }
    if (load_bigram_counts("counts.dat", bigram_counts_array, LENGTH_OF_FILE,
                           &number_of_counts) != 0) {
        goto cleanup;
    }
    if (number_of_counts != number_of_bigrams) {
        /* Words without a matching count keep the zero from calloc(). */
        fprintf(stderr, "warning: %lu words but %lu counts were read\n",
                (unsigned long int)number_of_bigrams,
                (unsigned long int)number_of_counts);
    }

    /* Only the words actually read are indexed. */
    if (build_bigram_index(bigram_words_array, number_of_bigrams) != 0) {
        goto cleanup;
    }
    index_built = 1;

    for (i = 1; i <= NUMBER_OF_FILES; i++) {
        FILE *input_file;
        FILE *output_pointer;

        /* Both name helpers append with strcat(), so the buffer must hold
         * an empty string before each call. */
        file_name[0] = '\0';
        file_name_generator(i, file_name, str);
        input_file = fopen(file_name, "r");
        if (input_file == NULL) {
            continue; /* nothing to close: the stream was never opened */
        }

        file_name[0] = '\0';
        file_names((unsigned int)i, file_name, str);
        output_pointer = fopen(file_name, "w");
        if (output_pointer == NULL) {
            fprintf(stderr, "file write error\n");
            fclose(input_file);
            continue;
        }

        if (score_file(input_file, output_pointer, bigram_counts_array) != 0) {
            fprintf(stderr, "file write error\n");
        }
        /* fclose() flushes buffered output, so it can fail as well. */
        if (fclose(output_pointer) != 0) {
            fprintf(stderr, "file write error\n");
        }
        fclose(input_file);
    }
    status = EXIT_SUCCESS;

cleanup:
    if (index_built) {
        /* Destroy the table before releasing the keys it points to.
         * glibc's hdestroy() does not free keys, so they are freed below. */
        hdestroy();
    }
    if (bigram_words_array != NULL) {
        for (j = 0; j < number_of_bigrams; j++) {
            free(bigram_words_array[j]);
        }
    }
    free(bigram_words_array);
    free(bigram_counts_array);
    free(file_name);
    free(str);
    return status;
}
/*
 * Remove one trailing newline from `word`, in place.
 *
 * word: NUL-terminated, writable string; must not be NULL.
 * Returns `word` itself. A string that does not end in '\n' (including the
 * empty string) is left untouched.
 */
char *chomp ( char * word )
{
    size_t word_length = strlen ( word );

    /* Guard the empty string: indexing word[-1] would read, and possibly
     * write, the byte in front of the buffer. */
    if ( word_length > 0 && word [ word_length - 1 ] == '\n' )
    {
        word [ word_length - 1 ] = '\0';
    }
    return ( word );
}
/*
 * Append the output path for file number `i` to `file_name`.
 *
 * The text "/data/scores/" followed by "<i>.dat" is concatenated onto
 * whatever string `file_name` already holds, so callers that want just the
 * path must pass an empty string.
 *
 * str: scratch buffer; on return it holds "<i>.dat".
 * Neither buffer is bounds-checked: both must be large enough.
 */
void file_names ( unsigned int i , char * file_name , char * str )
{
    static const char scores_dir[] = "/data/scores/";
    /* itoa() writes the decimal form of i into str and returns str. */
    char *leaf = itoa ( ( int ) i , str );

    strcat ( file_name , scores_dir );
    /* strcat() returns its destination, i.e. the completed "<i>.dat". */
    strcat ( file_name , strcat ( leaf , ".dat" ) );
}
/*
 * Count the lines remaining in `input_file`, reading from the current
 * position to end-of-file.
 *
 * A final line that is not terminated by '\n' is counted too, so the result
 * equals the number of lines a line-oriented reader such as getline() would
 * return for the same data. An empty stream yields 0.
 *
 * The stream is left at end-of-file (or at the point of a read error); the
 * caller must rewind() it before reading the data again.
 */
unsigned long int number_of_lines ( FILE *input_file )
{
    unsigned long int number_of_words = 0;
    int ch;              /* int, not char, so EOF is distinguishable from data */
    int previous = '\n'; /* treats an empty stream as already "terminated" */

    /* Testing fgetc() directly also ends the loop on a read error, where
     * feof() would never become true. */
    while ( ( ch = fgetc ( input_file ) ) != EOF )
    {
        if ( ch == '\n' )
        {
            number_of_words++;
        }
        previous = ch;
    }
    if ( previous != '\n' )
    {
        number_of_words++; /* last line had no trailing newline */
    }
    return ( number_of_words );
}
/*
 * Append the input path for file number `i` to `file_name`.
 *
 * The text "/data/files/<i>.dat" is concatenated onto whatever string
 * `file_name` already holds, so callers that want just the path must pass
 * an empty string.
 *
 * str: scratch buffer; on return it holds the decimal digits of `i`.
 * Neither buffer is bounds-checked: both must be large enough.
 */
void file_name_generator ( unsigned long int i , char * file_name , char *str )
{
    static const char input_dir[] = "/data/files/";
    static const char extension[] = ".dat";

    strcat ( file_name , input_dir );
    /* itoa() takes an int and returns str; the inner strcat() returns
     * file_name, which then receives the extension. */
    strcat ( strcat ( file_name , itoa ( ( int ) i , str ) ) , extension );
}
/*
 * Write the decimal representation of `n` into `s` and return `s`.
 *
 * s: caller-supplied buffer; it must have room for every digit, an optional
 *    leading '-', and the terminating NUL. No bounds checking is done.
 *
 * Every int value is handled, including INT_MIN, whose negation is not
 * representable as an int.
 */
char * itoa ( int n , char * s )
{
    /* Take the magnitude in unsigned arithmetic: 0u - (unsigned) n is well
     * defined for every negative n, whereas -n overflows for INT_MIN. */
    unsigned int magnitude = ( n < 0 ) ? 0u - ( unsigned int ) n : ( unsigned int ) n;
    size_t length = 0;
    size_t low;
    size_t high;

    do { /* generate digits least-significant first */
        s [ length++ ] = ( char ) ( '0' + magnitude % 10u );
        magnitude /= 10u;
    } while ( magnitude > 0 );
    if ( n < 0 )
    {
        s [ length++ ] = '-';
    }
    s [ length ] = '\0';

    /* Put the characters in reading order; length is at least 1 here. */
    for ( low = 0 , high = length - 1 ; low < high ; low++ , high-- )
    {
        char swapped = s [ low ];
        s [ low ] = s [ high ];
        s [ high ] = swapped;
    }
    return ( s );
}
/*
 * Reverse the NUL-terminated string `s` in place and return `s`.
 *
 * Two indices start at the ends of the string and move toward each other,
 * exchanging characters until they meet. Strings of length 0 or 1 are
 * returned unchanged.
 */
char * reverse ( char s [ ] )
{
    int left = 0;
    int right = ( int ) ( strlen ( s ) - 1 );

    while ( left < right )
    {
        char held = s [ left ];

        s [ left ] = s [ right ];
        s [ right ] = held;
        left++;
        right--;
    }
    return ( s );
}