find_rules.c code
Dear Hitori
This is the code; please help me with any corrections. I would appreciate it.
/*----------------------------------------------------------------------------
File: find_rules.c
Date: March 7, 1997
Copyright (c) Risto Karjalainen. All rights reserved.
This program finds technical trading rules for a stock index.
----------------------------------------------------------------------------*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <errno.h>
#include <ctype.h>
#include <float.h>
#include "util.h"
#include "gp.h"
#include "gp_util.h"
#include "find_util.h"
#include "find_functions.h"
#include "find_template.h"
/* The population of candidate rules; main() sets its parameters and
   creates/disposes it once per trial. */
static pop_t population;
/* Data series handed to read_data() in main(); each holds up to MAX_DATA
   entries.  NOTE(review): presumably one entry per trading day -- confirm
   against read_data(). */
static int date[MAX_DATA];
static float price[MAX_DATA];
/* Output of normalize_price(price, norm_price, ...); used instead of price
   for rule evaluation when the normalize option is on. */
static float norm_price[MAX_DATA];
static float compounded[MAX_DATA];
static float riskfree[MAX_DATA];
/* Scratch buffer passed to eval_genome() together with &n_trades and then
   to compute_return(). */
static trade_t trades[MAX_TRADES];
main (argc, argv)
int argc;
char *argv[];
{
int generations;
stat_t statistics;
char data_file[256];
char log_file[256];
char rule_file[256];
FILE *df = NULL;
FILE *lf = NULL;
FILE *rf = NULL;
float cost;
int last_day[MAX_YEARS];
int delay;
int normalize;
int n_stagnated;
int max_stagnated;
int n_clusters;
cluster_t cluster[MAX_GENOMES];
float fitness;
float selection;
float target;
float max_fitness[MAX_TRIALS];
float max_selection[MAX_TRIALS];
float fitness_level[MAX_GENERATIONS];
float selection_level[MAX_GENERATIONS];
int count;
int size;
int n_collection[MAX_TRIALS];
int g_collection[MAX_TRIALS];
int max_rules_per_trial;
cluster_t collection[MAX_TRIALS][MAX_COLLECTION];
genome_t rule[MAX_TRIALS][MAX_COLLECTION];
int order[MAX_GENOMES];
int offspring;
float annual_return;
float cumul_return;
float raw;
float excess;
float range[2];
float best;
float breakpoint;
int n_trades;
int i,j,k;
int first_year;
int n_years;
int trial;
int max_trials;
int verbose;
int t1,t2;
int s1,s2;
int failed;
int dt;
int t;
int g;
long e;
int n;
max_trials = MAX_TRIALS;
max_rules_per_trial = 1;
target = 0.0;
breakpoint = FLT_MAX;
generations = MAX_GENERATIONS/2;
e = seed();
normalize = TRUE;
cost = 0.003;
delay = 0;
verbose = FALSE;
t1 = t2 = s1 = s2 = 0;
strcpy(data_file, "");
strcpy(log_file, "");
strcpy(rule_file, "");
population.size = MAX_GENOMES/4;
population.max_nodes = MAX_NODES;
population.max_depth = MAX_DEPTH;
population.min_depth = MIN_DEPTH;
population.pressure = PRESSURE;
population.prob_mutations = PROB_MUTATIONS;
population.type = Boolean;
population.env = range;
range[0] = 0.0;
range[1] = 2.0;
/*----------------------------------------------------------------------------
Parse the command line and check that the parameters are valid.
----------------------------------------------------------------------------*/
if (argc == 1)
{
print_help_message(stderr, argv[0]);
exit(1);
}
if (read_cline(argc, argv, &max_trials, &max_rules_per_trial, &target, &breakpoint,
&generations, &e, data_file, log_file, rule_file, &normalize,
&t1, &t2, &s1, &s2, &cost, &delay, &(population.type), &(population.size),
&(population.max_nodes), &(population.min_depth),
&(population.max_depth), range, &verbose))
{
if (failed = !(1 <= max_trials) && (max_trials <= MAX_TRIALS))
fprintf(stderr, "(%s) Invalid value for the number of trials (%d) -- expected an integer between %d and %d.\n",
argv[0], max_trials, 1, MAX_TRIALS);
else if (failed = !(1 <= max_rules_per_trial) &&
(max_rules_per_trial <= MAX_COLLECTION))
fprintf(stderr, "(%s) Invalid value for the number of rules per trial (%d) -- expected an integer between %d and %d.\n",
argv[0], max_rules_per_trial, 1, MAX_COLLECTION);
else if (failed = !(breakpoint > target))
fprintf(stderr, "(%s) Invalid value for the breakpoint (%f) -- expected a real number greater than %f.\n",
argv[0], breakpoint, target);
else if (failed = !(1 <= generations) && (generations <= MAX_GENERATIONS))
fprintf(stderr, "(%s) Invalid value for the number of generations (%d) -- expected an integer between %d and %d.\n",
argv[0], generations, 1, MAX_GENERATIONS);
else if (failed = !strcmp(data_file, ""))
fprintf(stderr, "(%s) No data file specified.\n", argv[0]);
else if (failed = !(cost >= 0.0))
fprintf(stderr, "(%s) Invalid value for the transaction cost (%f) -- expected a positive real number.\n",
argv[0], cost);
else if (failed = (delay < 0))
fprintf(stderr, "(%s) Invalid value for execution delay (%d) -- expected a positive number.\n",
argv[0], delay);
else if (failed =!(10 <= population.size) &&
(population.size <= MAX_GENOMES))
fprintf(stderr, "(%s) Invalid value for population size (%d) -- expected an integer between %d and %d.\n",
argv[0], population.size, 10, MAX_GENOMES);
else if (failed =!(1 <= population.max_nodes) &&
(population.max_nodes <= MAX_GENOMES))
fprintf(stderr, "(%s) Invalid value for node limit (%d) -- expected an integer between %d and %d.\n",
argv[0], population.max_nodes, 1, MAX_NODES);
else if (failed =!(1 <= population.max_depth) &&
(population.max_depth <= MAX_DEPTH))
fprintf(stderr, "(%s) Invalid value for max tree depth (%d) -- expected an integer between %d and %d.\n",
argv[0], population.max_depth, 1, MAX_DEPTH);
else if (failed =!(1 <= population.min_depth) &&
(population.min_depth <= population.max_depth))
fprintf(stderr, "(%s) Invalid value for min tree depth (%d) -- expected an integer between %d and %d.\n",
argv[0], population.max_depth, 1, population.min_depth);
if (failed)
exit(1);
}
else
{
printf("(%s) Parsing command line failed.\n", argv[0]);
exit(1);
}
max_stagnated = (generations+1)/2;
/*----------------------------------------------------------------------------
Define the function templates which determine the type of nodes that
can be part of the trees created during the evolution.
----------------------------------------------------------------------------*/
define_type(Boolean, type_label[Boolean]);
define_type(Real, type_label[Real]);
define_type(Variable, type_label[Variable]);
for (i = 0; i < sizeof(template)/sizeof(template_t); i++)
define_function(template[i]);
define_tree(population.max_nodes, sizeof(data_t));
/*----------------------------------------------------------------------------
Read in the data.
----------------------------------------------------------------------------*/
if (!(df = fopen(data_file, "r")))
{
fprintf(stderr, "(%s) Cannot open %s: %s\n", argv[0], data_file, strerror(errno));
exit(1);
}
if (!init_data(df, &first_year, &n_years))
exit(1);
printf("Data in file '%s' spans years %d-%d.\n\n", data_file, first_year, first_year+n_years-1);
if (n_years < 2)
{
fprintf(stderr, "(%s) Not enough data: At least %d years needed.\n", argv[0], 2);
exit(1);
}
read_data(df, first_year, n_years, date, price, compounded, riskfree, last_day);
normalize_price(price, norm_price, NORMALIZED, TRADING_YEAR, last_day[n_years-1]+1);
fclose(df);
/*----------------------------------------------------------------------------
Check that the training and selection periods are valid.
----------------------------------------------------------------------------*/
if (failed = !((first_year+1 <= t1) && (t1 <= first_year+n_years-1) &&
(t1 <= t2) && (t2 <= first_year+n_years-1)))
fprintf(stderr, "(%s) Invalid training period (%d-%d) -- a range within %d-%d expected.\n", argv[0], t1, t2, first_year+1, first_year+n_years-1);
else if (failed = !((first_year+1 <= s1) && (s1 <= first_year+n_years-1) &&
(s1 <= s2) && (s2 <= first_year+n_years-1)))
fprintf(stderr, "(%s) Invalid selection period (%d-%d) -- a range within %d-%d expected.\n", argv[0], s1, s2, first_year+1, first_year+n_years-1);
if (failed)
exit(1);
t1 -= first_year;
t2 -= first_year;
s1 -= first_year;
s2 -= first_year;
/*----------------------------------------------------------------------------
Open a log file and print out the parameters.
----------------------------------------------------------------------------*/
if (!strcmp(log_file, ""))
lf = NULL;
else if (!(lf = fopen(log_file, "w")))
fprintf(stderr, "(%s) Cannot open %s: %s\n", argv[0], log_file, strerror(errno));
else
{
fprintf(lf, "\nProgram: %s (%s)\n\n", __FILE__, __DATE__);
fprintf(lf, "Number of trials = %d\n", max_trials);
fprintf(lf, "Number of rules/trial = %d\n", max_rules_per_trial);
fprintf(lf, "Target fitness = %.4f\n", target);
fprintf(lf, "Breakpoint fitness = %.4f\n", breakpoint);
fprintf(lf, "Maximum number of generations/trial = %d\n", generations);
fprintf(lf, "Population size = %d\n", population.size);
fprintf(lf, "Max number of nodes = %d\n", population.max_nodes);
fprintf(lf, "Max tree depth = %d\n", population.max_depth);
fprintf(lf, "Min tree depth = %d\n", population.min_depth);
fprintf(lf, "Selection pressure = %.2f\n", population.pressure);
fprintf(lf, "Mutation probability = %.3f\n", population.prob_mutations);
fprintf(lf, "Population type: %s\n", type_label[population.type]);
fprintf(lf, "Seed for random number generator = %d\n", e);
fprintf(lf, "Data file = '%s'\n", data_file);
fprintf(lf, "Log file = '%s'\n", log_file);
fprintf(lf, "Rule file = '%s'\n", rule_file);
fprintf(lf, "Data are %s normalized.\n", normalize ? "" : "not ");
fprintf(lf, "One-way transaction cost = %.3f\n", cost);
fprintf(lf, "Range for random numbers = (%.3f,%.3f)\n", range[0], range[1]);
fprintf(lf, "First training year = %d\n", first_year+t1);
fprintf(lf, "Last training year = %d\n", first_year+t2);
fprintf(lf, "First selection year = %d\n", first_year+s1);
fprintf(lf, "Last selection year = %d\n", first_year+s2);
fprintf(lf, "Execution delay = %d\n", delay);
fprintf(lf, "\n");
fflush(lf);
}
/*----------------------------------------------------------------------------
Print the compounded returns for holding the index.
----------------------------------------------------------------------------*/
printf("Annual and cumulative compounded returns:\n");
for (t = t1-1; t < t2; t++)
{
annual_return = compounded_return(compounded, t+1, t+1, last_day, cost);
cumul_return = compounded_return(compounded, t1, t+1, last_day, cost);
printf(" Year %d:\t%+.4f\t\t%+.4f\n", first_year+t+1, annual_return, cumul_return);
}
printf("\n");
/*----------------------------------------------------------------------------
Allocate memory for the trading rules.
----------------------------------------------------------------------------*/
for (i = 0; i < max_trials; i++)
for (j = 0; j < max_rules_per_trial; j++)
{
rule[i][j].chromosome = new_tree(sizeof(data_t));
collection[i][j].genome = &(rule[i][j]);
}
/*----------------------------------------------------------------------------
Open a text file to hold the saved rules. If none specified, use stdout.
----------------------------------------------------------------------------*/
if (!strcmp(rule_file, ""))
{
rf = stdout;
fprintf(stderr, "(%s) Rules will be printed on stdout (use option -r to specify a rule file).\n", argv[0]);
}
else if (!(rf = fopen(rule_file, "w")))
{
fprintf(stderr, "(%s) Cannot open %s: %s\n", argv[0], rule_file, strerror(errno));
exit(1);
}
/*----------------------------------------------------------------------------
Loop over the trials.
----------------------------------------------------------------------------*/
for (trial = 0; trial < max_trials; trial++)
{
printf("Trial %d starts...\n\n", trial+1);
if (lf) fprintf(lf, "Trial %d starts...\n\n", trial+1);
max_fitness[trial] = max_selection[trial] = -FLT_MAX;
n_collection[trial] = 0;
n_stagnated = 0;
/*----------------------------------------------------------------------------
Create the initial population.
----------------------------------------------------------------------------*/
create_population(&population, statistics);
/*----------------------------------------------------------------------------
Evolve new generations. At each generation, evaluate each new trading
rule, with the fitness equal to the average annualized excess return.
----------------------------------------------------------------------------*/
for (g = 0; g < generations; g++)
{
for (i = 0; i < population.size; i++)
{
offspring = (g == 0) ? i : create_offspring(&population, statistics);
eval_genome(normalize ? norm_price : price, compounded, t1, t2,
last_day, delay, &(population.genome[offspring]),
&n_trades, trades);
raw = compute_return(compounded, riskfree, t1, t2, last_day,
n_trades, trades, cost);
excess = raw - compounded_return(compounded, t1, t2, last_day, cost);
population.genome[offspring].fitness = excess/(t2-t1+1);
}
/*----------------------------------------------------------------------------
Compute statistics about the evolution.
----------------------------------------------------------------------------*/
compute_statistics(&population, statistics);
/*----------------------------------------------------------------------------
Cluster the population, grouping together rules with identical fitness.
----------------------------------------------------------------------------*/
n_clusters = cluster_population(&population, cluster);
printf("Generation %d (%d clusters, average fitness = %.4f)\n\n",
g, n_clusters, statistics[g].avg_fitness);
if (lf) fprintf(lf, "Generation %d (%d clusters, average fitness = %.4f)\n\n",
g, n_clusters, statistics[g].avg_fitness);
n_stagnated++;
/*----------------------------------------------------------------------------
Break if fitness in the training period is too high.
----------------------------------------------------------------------------*/
if (statistics[g].avg_fitness > breakpoint)
{
printf("Population fitness exceeded the breakpoint (%.4f)\n\n", breakpoint);
if (lf) fprintf(lf, "Population fitness exceeded the breakpoint (%.4f)\n\n", breakpoint);
break;
}
else if (statistics[g].avg_fitness > target)
{
count = 0;
fitness_level[g] = selection_level[g] = 0.0;
/*----------------------------------------------------------------------------
Evaluate the rules in the selection period.
----------------------------------------------------------------------------*/
for (i = 0; i < n_clusters; i++)
{
if (cluster[i].genome->fitness < target)
continue;
eval_genome(normalize ? norm_price : price, compounded,
s1, s2, last_day, delay, cluster[i].genome,
&n_trades, trades);
raw = compute_return(compounded, riskfree, s1, s2,
last_day, n_trades, trades, cost);
excess = raw - compounded_return(compounded, s1, s2, last_day, cost);
selection = excess/(s2-s1+1);
if (verbose)
{
printf("\tCluster %d (%d rules): fitness = %.4f, selection = %.4f (%d trades)\n",
i+1, cluster[i].size, cluster[i].genome->fitness,
selection, n_trades/(s2-s1+1));
if (lf) fprintf(lf, "\tCluster %d (%d rules): fitness = %.4f, selection = %.4f (%d trades)\n",
i+1, cluster[i].size, cluster[i].genome->fitness,
selection, n_trades/(s2-s1+1));
}
/*----------------------------------------------------------------------------
Save a rule if it improves the excess returns in the selection period.
----------------------------------------------------------------------------*/
if ((count < max_rules_per_trial) && (selection > max_selection[trial]))
{
max_fitness[trial] = cluster[i].genome->fitness;
max_selection[trial] = selection;
fitness_level[g] += cluster[i].genome->fitness;
selection_level[g] += selection;
if (n_stagnated > 0)
{
n_stagnated = 0;
g_collection[trial] = g;
n_collection[trial] = 0;
}
collection[trial][n_collection[trial]].size = cluster[i].size;
copy_genome(*(cluster[i].genome), collection[trial][n_collection[trial]].genome);
n_collection[trial]++;
}
count++;
}
if (n_collection[trial] > 0)
{
fitness_level[g] /= n_collection[trial];
selection_level[g] /= n_collection[trial];
}
if (verbose)
{
printf("\n");
if (lf) fprintf(lf, "\n");
}
}
/*----------------------------------------------------------------------------
Break if the population remains stagnated for too long.
----------------------------------------------------------------------------*/
if (n_stagnated == 0)
{
printf("\tSaved %d rules: average fitness = %.4f, selection = %.4f\n\n",
n_collection[trial], fitness_level[g], selection_level[g]);
if (lf) fprintf(lf, "\tSaved %d rules: average fitness = %.4f, selection = %.4f\n\n",
n_collection[trial], fitness_level[g], selection_level[g]);
}
else if (n_stagnated == max_stagnated)
{
printf("Population stagnated at generation %d.\n\n", g);
if (lf) fprintf(lf, "Population stagnated at generation %d.\n\n", g);
break;
}
}
if (max_fitness[trial] > -FLT_MAX)
{
printf("Trial %d (%d rules): fitness = %.4f, selection = %.4f\n\n", trial+1,
n_collection[trial], max_fitness[trial], max_selection[trial]);
if (lf) fprintf(lf, "Trial %d (%d rules): fitness = %.4f, selection = %.4f\n\n", trial+1,
n_collection[trial], max_fitness[trial], max_selection[trial]);
}
else
{
printf("Trial %d (%d rules): fitness = -Infinity\n\n", trial+1, n_collection[trial]);
if (lf) fprintf(lf, "Trial %d (%d rules): fitness = -Infinity\n\n", trial+1, n_collection[trial]);
}
/*----------------------------------------------------------------------------
Save the rules in a file and test them after the selection period.
----------------------------------------------------------------------------*/
for (j = 0; j < n_collection[trial]; j++)
{
fprintf(lf ? lf :stdout, "Cluster %d (%d rules):\n", j+1, collection[trial][j].size);
if (rf)
{
save_genome(rf, collection[trial][j].genome);
fprintf(rf, "\n");
fflush(rf);
}
eval_genome(normalize ? norm_price : price, compounded, s2+1, n_years-1,
last_day, delay, collection[trial][j].genome, &n_trades, trades);
raw = compute_return(compounded, riskfree, s2+1, n_years-1, last_day,
n_trades, trades, cost);
excess = raw - compounded_return(compounded, s2+1, n_years-1, last_day, cost);
printf("\tTest (cluster %d) = %.4f (%d trades/year)\n\n", i+1, excess/(n_years-s2-1), n_trades/(n_years-s2-1));
if (lf) fprintf(lf, "\tTest (cluster %d) = %.4f (%d trades/year)\n\n", i+1, excess/(n_years-s2-1), n_trades/(n_years-s2-1));
}
if (lf) fflush(lf);
dispose_population(&population);
}
/*----------------------------------------------------------------------------
Report the ranking of trials according to the selection period.
----------------------------------------------------------------------------*/
printf("Trials ranked according to the selection period:\n\n");
if (lf) fprintf(lf, "Trials ranked according to the selection period:\n\n");
order_by_key(order, max_trials, max_selection, descending_order);
for (i = 0; i < max_trials; i++)
{
if (n_collection[order[i]] == 0) break;
printf("Trial %d (%d rules): fitness = %.4f, selection = %.4f\n\n", order[i]+1,
n_collection[order[i]], max_fitness[order[i]], max_selection[order[i]]);
if (lf) fprintf(lf, "Trial %d (%d rules): fitness = %.4f, selection = %.4f\n\n", order[i]+1,
n_collection[order[i]], max_fitness[order[i]], max_selection[order[i]]);
}
if (lf) fclose(lf);
}