Code:
#include<stdio.h>
#include<stdlib.h>
#include<inttypes.h>
#include<math.h>
#include<search.h>
#include<string.h>
#include<unistd.h>
// Number of dictionary word slots main() allocates; main() also uses it to size
// the hash table and the cluster-membership array.
#define DICTIONARY_FILE_LENGTH 819558
// Number of rows in the centroid matrix (one row per cluster).
#define NUMBER_OF_CLUSTERS 150
// Number of float components in each centroid row.
#define DIMENSIONS 200
// Upper bound of the file-index loop in main(); indices whose input file cannot
// be opened are skipped.
#define NUMBER_OF_FILES 297180 //Always give a larger number than actual
// Capacity of the per-page word array and of the similarity score buffer.
#define MAXIMUM_WORDS 2227449
// Reads the lines of a text file into an array of string slots and closes the file.
void read_text_file ( FILE * , char ** , unsigned long int * );
// Reads whitespace-separated integers (cluster ids) into an array and closes the file.
void read_cluster_file ( FILE * , int * , unsigned long int * );
// Fills a NUMBER_OF_CLUSTERS x DIMENSIONS float matrix from a file and closes the file.
void read_centroid_matrix ( FILE * , float ** );
// Appends "<common path><index>.dat" to the path buffer and returns that buffer.
char *file_name_generator ( unsigned long int , char * , char * , char * );
// Writes the decimal text of an integer into the caller's buffer.
// NOTE(review): declared with int32_t here but defined with int below; the two
// only agree where int32_t is a typedef for int.
char * itoa( int32_t , char * );
// Reverses a NUL-terminated string in place.
char * reverse ( char [] );
// Cosine of the angle between two rows of the centroid matrix.
float cosine_similarity ( float ** , unsigned long int , unsigned long int );
/*
 * Entry point: for every input page, scores each pair of adjacent words by the
 * cosine similarity of the centroids of the clusters the two words belong to.
 *
 * Inputs (relative to the working directory unless absolute):
 *   dictionary.dit            one dictionary word per line
 *   cluster_membership.txt    one cluster id per dictionary word, same order
 *   cluster_centroids.txt     NUMBER_OF_CLUSTERS x DIMENSIONS floats
 *   /science/original_files/<i>.dat   pages, one word per line
 * Output:
 *   segmentation_scores/<i>.dat   one "%f" score per line for each adjacent
 *   pair whose words are in different clusters, or the single character "1"
 *   when every adjacent pair shares a cluster (this includes pages with fewer
 *   than two words).
 *
 * Indices i in [0, NUMBER_OF_FILES) whose input file cannot be opened are
 * skipped silently. Pairs containing a word that is not in the dictionary, or
 * whose cluster id is outside 1..NUMBER_OF_CLUSTERS, are skipped and reported
 * once per page on stderr.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE on allocation failure, a missing
 * model file, a hash-table failure or an unwritable output file. Early error
 * returns rely on process exit to release resources.
 */
int32_t main ( int32_t argc , char ** argv )
{
    /* LEAF_CAPACITY must hold "<index>.dat": up to 20 digits for a 64-bit
       index, four characters of extension and the terminator. */
    enum { PATH_CAPACITY = 300 , LEAF_CAPACITY = 32 };

    FILE *cluster_membership_file = NULL;
    FILE *cluster_centroid_file = NULL;
    FILE *dictionary_file = NULL;
    FILE *input_file = NULL;
    FILE *output_file = NULL;
    ENTRY e;
    ENTRY *ep = NULL;
    ENTRY *first_entry = NULL;
    ENTRY *second_entry = NULL;
    unsigned long int i = 0;
    unsigned long int j = 0;
    unsigned long int number_of_words = 0;
    unsigned long int dictionary_words = 0;
    unsigned long int pair_count = 0;
    unsigned long int same_cluster_pairs = 0;
    unsigned long int skipped_pairs = 0;
    unsigned long int counter = 0;
    unsigned long int first_word_coordinate = 0;
    unsigned long int second_word_coordinate = 0;
    int32_t first_word_cluster_member = 0;
    int32_t second_word_cluster_member = 0;
    int32_t *cluster_data = NULL;
    float *store_centroids = NULL;
    float **centroid_matrix = NULL;
    char *file_extension = NULL;
    char *file_path = NULL;
    char *file_name = NULL; /* alias of file_path returned by the generator; never owned */
    char **words_from_dictionary = NULL;
    char **words_from_web_page = NULL;

    ( void ) argc;
    ( void ) argv;
    e.key = NULL;
    e.data = NULL;

    store_centroids = malloc ( MAXIMUM_WORDS * sizeof *store_centroids );
    if ( store_centroids == NULL )
    {
        fprintf ( stderr , "malloc() memory allocation failure in store_centroids\n" );
        return ( EXIT_FAILURE );
    }
    centroid_matrix = malloc ( NUMBER_OF_CLUSTERS * sizeof *centroid_matrix );
    if ( centroid_matrix == NULL )
    {
        fprintf ( stderr , "malloc() memory allocation error in centroid matrix\n" );
        return ( EXIT_FAILURE );
    }
    for ( i = 0 ; i < NUMBER_OF_CLUSTERS ; i ++ )
    {
        /* Zeroed so a short centroid file cannot leave indeterminate floats. */
        centroid_matrix [ i ] = calloc ( DIMENSIONS , sizeof **centroid_matrix );
        if ( centroid_matrix [ i ] == NULL )
        {
            fprintf ( stderr , "malloc() memory allocation error in centroid matrix\n" );
            return ( EXIT_FAILURE );
        }
    }
    file_extension = calloc ( LEAF_CAPACITY , sizeof *file_extension );
    if ( file_extension == NULL )
    {
        fprintf ( stderr , "malloc() memory allocation failure in file_extension\n" );
        return ( EXIT_FAILURE );
    }
    file_path = calloc ( PATH_CAPACITY , sizeof *file_path );
    if ( file_path == NULL )
    {
        fprintf ( stderr , "malloc() memory allocation failure in file_path\n" );
        return ( EXIT_FAILURE );
    }
    /* Word slots start out NULL: the reader allocates (and later resizes)
       each line buffer itself, so nothing is pre-allocated per slot. */
    words_from_dictionary = calloc ( DICTIONARY_FILE_LENGTH , sizeof *words_from_dictionary );
    if ( words_from_dictionary == NULL )
    {
        fprintf ( stderr , "malloc() memory allocation failure in creating words_from_dictionary space\n" );
        return ( EXIT_FAILURE );
    }
    words_from_web_page = calloc ( MAXIMUM_WORDS , sizeof *words_from_web_page );
    if ( words_from_web_page == NULL )
    {
        fprintf ( stderr , "malloc() memory allocation failure in creating words_from_web_page\n" );
        return ( EXIT_FAILURE );
    }
    /* Zeroed: 0 is not a valid cluster id, so dictionary words without a
       membership entry are detected by the range check below. */
    cluster_data = calloc ( DICTIONARY_FILE_LENGTH , sizeof *cluster_data );
    if ( cluster_data == NULL )
    {
        fprintf ( stderr , "malloc() memory allocation failure in cluster_data\n" );
        return ( EXIT_FAILURE );
    }

    cluster_membership_file = fopen ( "cluster_membership.txt" , "r" );
    if ( cluster_membership_file == NULL )
    {
        fprintf ( stderr, "cluster membership file read error\n" );
        return ( EXIT_FAILURE );
    }
    cluster_centroid_file = fopen ( "cluster_centroids.txt" , "r" );
    if ( cluster_centroid_file == NULL )
    {
        fprintf ( stderr , "cluster centroid file read error\n" );
        return ( EXIT_FAILURE );
    }
    dictionary_file = fopen ( "dictionary.dit" , "r" );
    if ( dictionary_file == NULL )
    {
        fprintf ( stderr , "dictionary file read error\n" );
        return ( EXIT_FAILURE );
    }

    /* Each reader closes the stream it is given. */
    read_text_file ( dictionary_file , words_from_dictionary , &number_of_words );
    dictionary_words = number_of_words;
    number_of_words = 0;
    read_cluster_file ( cluster_membership_file , cluster_data , &number_of_words );
    read_centroid_matrix ( cluster_centroid_file , centroid_matrix );

    /* Map each dictionary word to its line index. The table gets some
       headroom because hsearch() tables cannot grow after creation. */
    if ( hcreate ( DICTIONARY_FILE_LENGTH + DICTIONARY_FILE_LENGTH / 4 ) == 0 )
    {
        fprintf ( stderr , "hash table creation failure\n" );
        return ( EXIT_FAILURE );
    }
    /* Only slots that were actually read hold valid strings. */
    for ( i = 0 ; i < dictionary_words ; i ++ )
    {
        e.key = words_from_dictionary [ i ];
        e.data = ( void * ) ( uintptr_t ) i;
        ep = hsearch ( e , ENTER );
        if ( ep == NULL )
        {
            fprintf ( stderr , "entry failed\n" );
            return ( EXIT_FAILURE );
        }
    }
    e.data = NULL;

    for ( i = 0 ; i < NUMBER_OF_FILES ; i ++ )
    {
        /* The generator appends to file_path, so it must start out empty. */
        file_path [ 0 ] = '\0';
        file_name = file_name_generator ( i , file_extension , file_path , "/science/original_files/" );
        input_file = fopen ( file_name , "r" );
        if ( input_file == NULL )
        {
            continue;
        }
        number_of_words = 0;
        read_text_file ( input_file , words_from_web_page , &number_of_words );

        /* Per-page state; a page with 0 or 1 words has no pairs. */
        pair_count = ( number_of_words > 0 ) ? number_of_words - 1 : 0;
        same_cluster_pairs = 0;
        skipped_pairs = 0;
        counter = 0;

        for ( j = 0 ; j < pair_count ; j ++ )
        {
            e.key = words_from_web_page [ j ];
            first_entry = hsearch ( e , FIND );
            e.key = words_from_web_page [ j + 1 ];
            second_entry = hsearch ( e , FIND );
            if ( first_entry == NULL || second_entry == NULL )
            {
                skipped_pairs ++; /* at least one word is not in the dictionary */
                continue;
            }
            first_word_coordinate = ( unsigned long int ) ( uintptr_t ) first_entry->data;
            second_word_coordinate = ( unsigned long int ) ( uintptr_t ) second_entry->data;
            first_word_cluster_member = cluster_data [ first_word_coordinate ];
            second_word_cluster_member = cluster_data [ second_word_coordinate ];
            if ( first_word_cluster_member == second_word_cluster_member )
            {
                same_cluster_pairs ++;
                continue;
            }
            /* Cluster ids are 1-based row numbers of the centroid matrix. */
            if ( first_word_cluster_member < 1 || first_word_cluster_member > NUMBER_OF_CLUSTERS ||
                 second_word_cluster_member < 1 || second_word_cluster_member > NUMBER_OF_CLUSTERS )
            {
                skipped_pairs ++;
                continue;
            }
            store_centroids [ counter ++ ] =
                cosine_similarity ( centroid_matrix ,
                                    ( unsigned long int ) ( first_word_cluster_member - 1 ) ,
                                    ( unsigned long int ) ( second_word_cluster_member - 1 ) );
        }
        if ( skipped_pairs > 0 )
        {
            fprintf ( stderr , "%s: skipped %lu word pair(s) with an unknown word or cluster id\n" ,
                      file_name , skipped_pairs );
        }

        file_path [ 0 ] = '\0';
        file_name = file_name_generator ( i , file_extension , file_path , "segmentation_scores/" );
        output_file = fopen ( file_name , "w" );
        if ( output_file == NULL )
        {
            fprintf ( stderr , "file write error\n" );
            return ( EXIT_FAILURE );
        }
        if ( same_cluster_pairs != pair_count )
        {
            for ( j = 0 ; j < counter ; j ++ )
            {
                fprintf ( output_file , "%f\n" , store_centroids [ j ] );
            }
        }
        else
        {
            fprintf ( output_file , "%d" , 1 );
        }
        fclose ( output_file );
    }

    hdestroy ( );
    for ( i = 0 ; i < DICTIONARY_FILE_LENGTH ; i ++ )
    {
        free ( words_from_dictionary [ i ] );
    }
    free ( words_from_dictionary );
    for ( i = 0 ; i < NUMBER_OF_CLUSTERS ; i ++ )
    {
        free ( centroid_matrix [ i ] );
    }
    free ( centroid_matrix );
    for ( i = 0 ; i < MAXIMUM_WORDS ; i ++ )
    {
        free ( words_from_web_page [ i ] );
    }
    free ( words_from_web_page );
    free ( cluster_data );
    free ( file_extension );
    free ( file_path );
    free ( store_centroids );
    return ( EXIT_SUCCESS );
}
/*
 * Reads every line of file_pointer into consecutive slots of
 * words_to_be_read, starting at slot *number_of_words, then closes the file.
 *
 * file_pointer      open readable stream; always closed before returning.
 * words_to_be_read  array of slots; each slot used must be NULL or a pointer
 *                   obtained from malloc/calloc/realloc. Each slot is resized
 *                   with realloc() to fit its line exactly, so slots can be
 *                   reused across calls without leaking and without any
 *                   assumption about their previous capacity. The caller
 *                   owns (and eventually frees) every slot.
 * number_of_words   in: index of the first slot to fill; out: that index plus
 *                   the number of lines stored.
 *
 * Lines keep their trailing '\n' when the file has one; a final line without
 * a newline is stored as well. The function cannot see the length of
 * words_to_be_read, so the caller must provide at least as many slots as the
 * file has lines. On allocation failure reading stops, a message is written
 * to stderr and the lines stored so far remain valid.
 */
void read_text_file ( FILE *file_pointer , char ** words_to_be_read , unsigned long int *number_of_words )
{
    size_t capacity = 128;
    char *line = malloc ( capacity );
    int out_of_memory = ( line == NULL );

    while ( ! out_of_memory )
    {
        size_t length = 0;
        int c = EOF;
        char *slot = NULL;

        /* Collect one line, including its terminating newline if present. */
        while ( ( c = fgetc ( file_pointer ) ) != EOF )
        {
            if ( length + 2 > capacity ) /* room for this byte and the terminator */
            {
                char *grown = realloc ( line , capacity * 2 );
                if ( grown == NULL )
                {
                    out_of_memory = 1;
                    break;
                }
                line = grown;
                capacity *= 2;
            }
            line [ length ++ ] = ( char ) c;
            if ( c == '\n' )
            {
                break;
            }
        }
        if ( out_of_memory || length == 0 )
        {
            break; /* length == 0 means end of file with nothing left to store */
        }
        line [ length ] = '\0';

        slot = realloc ( words_to_be_read [ *number_of_words ] , length + 1 );
        if ( slot == NULL )
        {
            out_of_memory = 1;
            break;
        }
        memcpy ( slot , line , length + 1 );
        words_to_be_read [ *number_of_words ] = slot;
        ( *number_of_words ) ++;
    }
    if ( out_of_memory )
    {
        fprintf ( stderr , "memory allocation failure in read_text_file\n" );
    }
    free ( line );
    fclose ( file_pointer );
}
/*
 * Reads whitespace-separated decimal integers from file_pointer into
 * cluster_data, starting at index *number_of_words, then closes the file.
 *
 * file_pointer     open readable stream; always closed before returning.
 * cluster_data     destination array; the caller must make it large enough
 *                  for every integer in the file (its length is not visible
 *                  here).
 * number_of_words  in: index of the first element to fill; out: that index
 *                  plus the number of integers stored.
 *
 * Reading stops at end of file or at the first token that is not an integer,
 * so malformed input can neither loop forever nor be stored as garbage.
 */
void read_cluster_file ( FILE *file_pointer , int * cluster_data , unsigned long int *number_of_words )
{
    int value = 0;

    /* fscanf() returns 1 only when an integer was actually converted. */
    while ( fscanf ( file_pointer , "%d" , &value ) == 1 )
    {
        cluster_data [ ( *number_of_words ) ++ ] = value;
    }
    fclose ( file_pointer );
}
/*
 * Fills centroid_matrix, row by row, with NUMBER_OF_CLUSTERS x DIMENSIONS
 * floats read from file_pointer, then closes the file.
 *
 * file_pointer     open readable stream; always closed before returning.
 * centroid_matrix  array of NUMBER_OF_CLUSTERS row pointers, each pointing to
 *                  at least DIMENSIONS floats.
 *
 * If the file ends early or contains a token that is not a number, the
 * position is reported once on stderr and every remaining element is set to
 * 0.0f so the matrix never holds indeterminate values.
 */
void read_centroid_matrix ( FILE * file_pointer , float ** centroid_matrix )
{
    int i = 0;
    int j = 0;
    int readable = 1;

    for ( i = 0 ; i < NUMBER_OF_CLUSTERS ; i ++ )
    {
        for ( j = 0 ; j < DIMENSIONS ; j ++ )
        {
            if ( readable && fscanf ( file_pointer , "%f" , &centroid_matrix [ i ] [ j ] ) != 1 )
            {
                fprintf ( stderr , "cluster centroid file is short or malformed at row %d, column %d\n" , i , j );
                readable = 0;
            }
            if ( ! readable )
            {
                centroid_matrix [ i ] [ j ] = 0.0f;
            }
        }
    }
    fclose ( file_pointer );
}
/*
 * Builds the name "<common_path><i>.dat" by appending it to file_path.
 *
 * i            file index, formatted as an unsigned decimal number.
 * str          scratch/output buffer that receives "<i>.dat"; it must hold at
 *              least 25 bytes to be safe for any 64-bit index (20 digits,
 *              ".dat" and the terminator).
 * file_path    NUL-terminated buffer the name is appended to; pass an empty
 *              string to get just "<common_path><i>.dat". It must have room
 *              for its current content plus common_path plus "<i>.dat".
 * common_path  directory prefix, copied verbatim (include the trailing '/').
 *
 * Returns file_path. Neither buffer's capacity is visible here, so the
 * caller is responsible for the sizes stated above.
 */
char *file_name_generator ( unsigned long int i , char * str , char * file_path , char *common_path )
{
    char leaf [ 32 ];

    /* Format the full unsigned long range; bounded by the local buffer. */
    snprintf ( leaf , sizeof leaf , "%lu.dat" , i );

    strcpy ( str , leaf );
    strcat ( file_path , common_path );
    strcat ( file_path , leaf );
    return ( file_path );
}
/*
 * Writes the decimal representation of n into s and returns s.
 *
 * n  value to convert; every int is supported, including the most negative
 *    one (the magnitude is computed in unsigned arithmetic, so no signed
 *    negation can overflow).
 * s  destination buffer; must hold the digits, an optional leading '-' and
 *    the terminating NUL (12 bytes suffice for a 32-bit int).
 */
char * itoa ( int n , char * s )
{
    /* Three decimal digits per byte is always enough (256 < 1000). */
    char digits [ 3 * sizeof ( int ) ];
    size_t digit_count = 0;
    size_t out = 0;
    unsigned int magnitude = ( n < 0 ) ? 0u - ( unsigned int ) n : ( unsigned int ) n;

    /* Least-significant digit first; at least one digit, so 0 gives "0". */
    do
    {
        digits [ digit_count ++ ] = ( char ) ( '0' + magnitude % 10u );
        magnitude /= 10u;
    } while ( magnitude != 0u );

    if ( n < 0 )
    {
        s [ out ++ ] = '-';
    }
    while ( digit_count > 0 )
    {
        s [ out ++ ] = digits [ -- digit_count ];
    }
    s [ out ] = '\0';
    return ( s );
}
/*
 * Reverses the NUL-terminated string s in place and returns s.
 * Two indices walk towards each other from the ends, swapping as they go;
 * strings of length 0 or 1 are left unchanged.
 */
char * reverse ( char s [ ] )
{
    int left = 0;
    int right = ( int ) ( strlen ( s ) - 1 );

    while ( left < right )
    {
        char held = s [ left ];
        s [ left ++ ] = s [ right ];
        s [ right -- ] = held;
    }
    return ( s );
}
/*
 * Cosine of the angle between two rows of a centroid matrix.
 *
 * cluster_centroid   matrix whose rows each hold DIMENSIONS floats.
 * first_coordinate   zero-based index of the first row.
 * second_coordinate  zero-based index of the second row.
 *
 * Returns dot(a, b) / (|a| * |b|). If either row has zero length the cosine
 * is undefined; 0.0f is returned in that case instead of the NaN that 0/0
 * would produce. The caller must pass valid row indices.
 */
float cosine_similarity ( float ** cluster_centroid , unsigned long int first_coordinate , unsigned long int second_coordinate )
{
    const float *first = cluster_centroid [ first_coordinate ];
    const float *second = cluster_centroid [ second_coordinate ];
    float dot_product = 0;
    float first_norm_squared = 0;
    float second_norm_squared = 0;
    float denominator = 0;
    size_t i = 0;

    for ( i = 0 ; i < DIMENSIONS ; i ++ )
    {
        dot_product = dot_product + ( first [ i ] * second [ i ] );
        first_norm_squared = first_norm_squared + ( first [ i ] * first [ i ] );
        second_norm_squared = second_norm_squared + ( second [ i ] * second [ i ] );
    }
    denominator = sqrtf ( first_norm_squared ) * sqrtf ( second_norm_squared );
    if ( denominator == 0.0f )
    {
        return ( 0.0f );
    }
    return ( dot_product / denominator );
}