#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <sys/time.h>
#include <math.h>
#include <unistd.h>
#include "tmp/scores.h"
#include "tmp/tests.h"
#ifdef ENTROPIC_ERROR
#ifdef LEAST_SQUARES_ERROR
#warn Both entropic error and least squares error chosen. Using least squares.
#endif
#endif
#ifndef ENTROPIC_ERROR
#ifndef LEAST_SQUARES_ERROR
#define LEAST_SQUARES_ERROR
#endif
#endif
#define OUTPUT_FILE "perceptron.scores"
void init_wheel ();
void destroy_wheel ();
int get_random_test ();
void init_weights();
void destroy_weights ();
void write_weights (FILE * fp);
void scale_scores (double old_threshold, double new_threshold);
double evaluate_test (int test);
double evaluate_test_nogain (int test);
void train (int num_epochs, double learning_rate);
void usage ();
#define weight_to_score(x) (-threshold*(x)/bias)
#define score_to_weight(x) (-(x)*bias/threshold)
int wheel_size;
int * roulette_wheel;
double ham_preference = 2.0;
#define DEFAULT_THRESHOLD 5.0
double threshold = DEFAULT_THRESHOLD;
double * weights;
double bias;
int num_epochs = 15;
double learning_rate = 2.0;
double weight_decay = 1.0;
void init_wheel () {
int i;
int spam = 0, ham = 0;
roulette_wheel = (int*)calloc(num_nondup, sizeof(int));
wheel_size = 0;
roulette_wheel[0] = 0;
for (i = 0; i < num_nondup - 1; i++) {
int slot_size = 1;
if ( ! is_spam[i] ) {
slot_size += (int)(num_tests_hit[i] * ham_preference * tests_count[i]);
} else {
slot_size = tests_count[i];
}
wheel_size += slot_size;
if ( ! is_spam[i] ) {
ham += slot_size;
} else {
spam++;
}
roulette_wheel[i+1] = roulette_wheel[i] + slot_size;
}
printf ("Modified training set statistics: %d spam, %d ham.\n", spam, ham);
}
void destroy_wheel () {
if ( roulette_wheel ) {
free (roulette_wheel);
roulette_wheel = 0;
wheel_size = 0;
}
}
int get_random_test () {
int r;
int bottom, middle, top;
r = lrand48() % wheel_size;
bottom = 0;
top = num_nondup - 1;
middle = top / 2;
while (1) {
if ( r < roulette_wheel[bottom+1] ) {
return bottom;
} else if ( r >= roulette_wheel[top] ) {
return top;
} else if ( r >= roulette_wheel[middle] && r < roulette_wheel[middle+1] ) {
return middle;
} else if ( r < roulette_wheel[middle] ) {
top = middle-1;
bottom++;
middle = bottom + (top-bottom)/2;
} else {
bottom = middle+1;
top--;
middle = bottom + (top-bottom)/2;
}
}
}
void init_weights () {
int i;
weights = (double*)calloc(num_scores, sizeof(double));
bias = drand48() - 0.5;
for (i = 0; i < num_scores; i++) {
weights[i] = range_lo[i] + drand48() * (range_hi[i] - range_lo[i]);
}
}
void destroy_weights () {
if ( weights ) {
free(weights);
weights = 0;
}
}
void write_weights (FILE * fp) {
int i;
int ga_nn, ga_yy, ga_ny, ga_yn;
double nnscore, yyscore, nyscore, ynscore;
ga_nn = ga_yy = ga_ny = ga_yn = 0;
nnscore = yyscore = nyscore = ynscore = 0;
for (i = 0; i < num_nondup; i++) {
double score = weight_to_score(evaluate_test_nogain(i)) + threshold;
if ( score >= threshold && is_spam[i] ) {
ga_yy += tests_count[i];
yyscore += tests_count[i] * score;
} else if ( score < threshold && !is_spam[i] ) {
ga_nn += tests_count[i];
nnscore += tests_count[i] * score;
} else if ( score >= threshold && !is_spam[i] ) {
ga_ny += tests_count[i];
nyscore += tests_count[i] * score;
} else {
ga_yn += tests_count[i];
ynscore += tests_count[i] * score;
}
}
fprintf (fp,"\n# SUMMARY for threshold %3.1f:\n", threshold);
fprintf (fp,
"# Correctly non-spam: %6d %4.2f%%\n",
ga_nn,
(ga_nn / (float) num_ham) * 100.0);
fprintf (fp,
"# Correctly spam: %6d %4.2f%%\n",
ga_yy,
(ga_yy / (float) num_spam) * 100.0);
fprintf (fp,
"# False positives: %6d %4.2f%%\n",
ga_ny,
(ga_ny / (float) num_ham) * 100.0);
fprintf (fp,
"# False negatives: %6d %4.2f%%\n",
ga_yn,
(ga_yn / (float) num_spam) * 100.0);
fprintf (fp,"# Average score for spam: %3.3f ham: %3.1f\n",(ynscore+yyscore)/((double)(ga_yn+ga_yy)),(nyscore+nnscore)/((double)(ga_nn+ga_ny)));
fprintf (fp,"# Average for false-pos: %3.3f false-neg: %3.1f\n",(nyscore/(double)ga_ny),(ynscore/(double)ga_yn));
fprintf (fp,"# TOTAL: %6d %3.2f%%\n\n", num_tests, 100.0);
for (i = 0; i < num_scores; i++) {
if ( is_mutable[i] ) {
fprintf(fp, "score %-30s %2.3f # [%2.3f..%2.3f]\n", score_names[i], weight_to_score(weights[i]), range_lo[i], range_hi[i]);
} else {
fprintf(fp, "score %-30s %2.3f # not mutable\n", score_names[i], range_lo[i]);
}
}
}
void scale_scores (double old_threshold, double new_threshold) {
int i;
if ( old_threshold == new_threshold ) {
return;
}
for (i = 0; i < num_scores; i++) {
if ( is_mutable[i] ) {
range_lo[i] = range_lo[i] * new_threshold / old_threshold;
range_hi[i] = range_hi[i] * new_threshold / old_threshold;
}
}
}
double evaluate_test (int test) {
#ifdef LEAST_SQUARES_ERROR
return 1/(1+exp(-evaluate_test_nogain(test)));
#else
#ifdef ENTROPIC_ERROR
return tanh(evaluate_test_nogain(test));
#endif
#endif
}
double evaluate_test_nogain (int test) {
double sum;
int i;
sum = bias;
for (i = 0; i < num_tests_hit[test]; i++) {
sum += weights[tests_hit[test][i]];
}
sum += score_to_weight(scores[test]);
return sum;
}
void train (int num_epochs, double learning_rate) {
int epoch, random_test;
int i, j;
int * tests;
double y_out, error, delta;
tests = (int*)calloc(wheel_size, sizeof(int));
for (i = 0, j = 0; i < num_nondup-1; i++) {
for (; j < roulette_wheel[i+1]; j++) {
tests[j] = i;
}
}
for (; j < wheel_size; j++) {
tests[j] = num_nondup-1;
}
for (epoch = 0; epoch < num_epochs; epoch++) {
if ( weight_decay != 1.0 ) {
bias *= weight_decay;
for (i = 0; i < num_mutable; i++) {
weights[i] *= weight_decay;
}
}
for (i = 0; i < wheel_size-1; i++) {
int tmp;
int r = lrand48 () % (wheel_size - i);
tmp = tests[i];
tests[i] = tests[r+i];
tests[r+i] = tmp;
}
for (j = 0; j < wheel_size; j++) {
random_test = tests[j];
y_out = evaluate_test(random_test);
#ifdef LEAST_SQUARES_ERROR
error = is_spam[random_test] - y_out;
delta = y_out * (1-y_out) * error / (num_tests_hit[random_test]+1) * learning_rate;
#else
#ifdef ENTROPIC_ERROR
error = (2.0*is_spam[random_test]-1) - y_out;
delta = error / (num_tests_hit[random_test]+1) * learning_rate;
#endif
#endif
if ( epoch + 1 < num_epochs ) {
bias += delta;
}
for (i = 0; i < num_tests_hit[random_test]; i++) {
int idx = tests_hit[random_test][i];
weights[idx] += delta;
#ifdef IGNORE_SCORE_RANGES
if ( range_lo[idx] >= 0 && weights[idx] < 0 ) {
weights[idx] = 0;
} else if ( range_hi[idx] <= 0 && weights[idx] > 0 ) {
weights[idx] = 0;
}
#else
if ( weights[idx] < score_to_weight(range_lo[idx]) ) {
weights[idx] = score_to_weight(range_lo[idx]);
} else if ( weights[idx] > score_to_weight(range_hi[idx]) ) {
weights[idx] = score_to_weight(range_hi[idx]);
}
#endif
}
}
}
free(tests);
}
void usage () {
printf ("usage: perceptron [args]\n"
"\n"
" -p ham_preference = adds extra ham to training set multiplied by number of\n"
" tests hit (2.0 default)\n"
" -e num_epochs = number of epochs to train (15 default)\n"
" -l learning_rate = learning rate for gradient descent (2.0 default)\n"
" -t threshold = minimum threshold for spam (5.0 default)\n"
" -w weight_decay = per-epoch decay of learned weight and bias (1.0 default)\n"
" -h = print this help\n"
"\n");
exit(30);
}
int main (int argc, char ** argv) {
struct timeval tv, tv_start, tv_end;
long long int t_usec;
FILE * fp;
int arg;
while ((arg = getopt (argc, argv, "p:e:l:t:w:h?")) != -1) {
switch (arg) {
case 'p':
ham_preference = atof(optarg);
break;
case 'e':
num_epochs = atoi(optarg);
break;
case 'l':
learning_rate = atof(optarg);
break;
case 't':
threshold = atof(optarg);
break;
case 'w':
weight_decay = atof(optarg);
break;
case 'h':
case '?':
usage();
break;
}
}
gettimeofday (&tv, 0);
t_usec = tv.tv_sec * 1000000 + tv.tv_usec;
srand48 ((int)t_usec);
loadtests();
loadscores();
scale_scores (DEFAULT_THRESHOLD, threshold);
init_wheel ();
init_weights ();
gettimeofday(&tv_start, 0);
train(num_epochs, learning_rate);
gettimeofday(&tv_end, 0);
t_usec = tv_end.tv_sec * 1000000 + tv_end.tv_usec -
(tv_start.tv_sec *1000000 + tv_start.tv_usec);
printf ("Training time = %fs.\n", t_usec / 1000000.0f);
fp = fopen (OUTPUT_FILE, "w");
if ( fp ) {
write_weights(fp);
fclose(fp);
} else {
perror (OUTPUT_FILE);
}
destroy_weights ();
destroy_wheel ();
return 0;
}