
📄 icsiboost.c

📁 Boosting is a meta-learning approach that aims at combining an ensemble of weak classifiers to form a strong classifier.
💻 C
📖 Page 1 / 5
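This excerpt contains the weak-learner training routines (text stumps and continuous-threshold stumps) in the AdaBoost.MH / BoosTexter style. As a reading aid, the criterion that train_text_stump below minimizes, and the per-class outputs it stores in c0, c1 and c2, can be restated from the code as follows (W_{+} / W_{-} denote the summed weights of examples that do / do not belong to class l, split by token presence; smoothing corresponds to the -E option):

Z(t) = 2 \sum_{l} \Big( \sqrt{W_{+}^{l,\mathrm{absent}}\, W_{-}^{l,\mathrm{absent}}} + \sqrt{W_{+}^{l,\mathrm{present}}\, W_{-}^{l,\mathrm{present}}} \Big)

c_2^{l} = \tfrac{1}{2}\ln\frac{W_{+}^{l,\mathrm{present}}+\epsilon}{W_{-}^{l,\mathrm{present}}+\epsilon},
\qquad
c_0^{l} = c_1^{l} = \tfrac{1}{2}\ln\frac{W_{+}^{l,\mathrm{absent}}+\epsilon}{W_{-}^{l,\mathrm{absent}}+\epsilon},
\qquad
\epsilon = \frac{\mathrm{smoothing}}{\mathrm{num\_classes}\cdot\mathrm{num\_examples}}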
/* Copyright (C) (2007) (Benoit Favre) <favre@icsi.berkeley.edu>

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */

#if HAVE_CONFIG_H
# include <config.h>
#endif

#define USE_THREADS
//#define USE_FLOATS

#include "utils/common.h"
#include "utils/debug.h"
#include "utils/vector.h"
#include "utils/string.h"
#include "utils/hashtable.h"
#include "utils/mapped.h"
#include "utils/array.h"

#ifdef USE_THREADS
#include "utils/threads.h"
#include <pthread.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>

#ifdef USE_FLOATS
#define double float
#endif

// declare vectors that are interesting for us
vector_implement_functions_for_type(float, 0.0);
vector_implement_functions_for_type(int32_t, 0);

/*int num_EXP=0;
double EXP(double number)
{
	num_EXP++;
	if(isnan(number))die("exp(): had a NAN");
	return exp(number);
}
int num_SQRT=0;
double SQRT(double number)
{
	num_SQRT++;
	if(isnan(number))die("sqrt(): had a NAN");
	return sqrt(number);
}
int num_LOG=0;
double LOG(double number)
{
	num_LOG++;
	if(isnan(number))die("log(): had a NAN");
	return log(number);
}*/

#define EXP(a) exp(a)
#define LOG(a) log(a)

inline double SQRT(double number)
{
	if(number<0)return 0;
	return sqrt(number);
}

/*WARNING:
- garbage collection is not working yet => do not activate it
TODO:
- free memory and remove leaks (and go through that again and again)
- frequency cutoff
- sampleing and ranking
- more COMMENTS !!!
NOTES ON THE ORIGINAL:
- text features are treated as bag-of-ngrams
- shyp file: boostexter saves non-text discrete features as their id
- shyp file: when combining the same classifiers, boostexter adds the alphas and averages the C_js
*/

#define FEATURE_TYPE_IGNORE 0
#define FEATURE_TYPE_CONTINUOUS 1
#define FEATURE_TYPE_TEXT 2
#define FEATURE_TYPE_SCORED_TEXT 3
#define FEATURE_TYPE_SET 4

typedef struct template { // a column definition
	int column;
	string_t* name;
	int type;
	hashtable_t* dictionary;       // dictionary for token based template
	vector_t* tokens;              // the actual tokens (if the feature is text; contains tokeninfo)
	vector_t* values;              // continuous values (if the feature is continuous)
	int32_t* ordered;              // ordered example ids according to value for continuous columns
	vector_t* classifiers;         // classifiers related to that template (in classification mode only)
} template_t;

typedef struct example { // an instance
	//vector_t* features;          // vector of (int or float according to templates)
	int class;
	double* weight;                // example weight by class
	double* score;                 // example score by class, updated iteratively
} example_t;

#define CLASSIFIER_TYPE_THRESHOLD 1
#define CLASSIFIER_TYPE_TEXT 2

typedef struct weakclassifier { // a weak learner
	template_t* template;          // the corresponding template
	int type;                      // type in CLASSIFIER_TYPE_TEXT or CLASSIFIER_TYPE_THRESHOLD
	int32_t token;                 // token for TEXT classifier
	int column;                    // redundant with template
	double threshold;              // threshold for THRESHOLD classifier
	double alpha;                  // alpha from the paper
	double objective;              // Z() value from minimization
	double *c0;                    // weight by class, unknown
	double *c1;                    // weight by class, token absent or below threshold
	double *c2;                    // weight by class, token present or above threshold
} weakclassifier_t;

typedef struct tokeninfo { // store info about a word (or token)
	int32_t id;
	char* key;
	size_t count;
	vector_t* examples;
} tokeninfo_t;

double smoothing=0.5;              // -E <number> in boostexter
int verbose=0;
int output_weights=0;
int output_scores=0;
//int *random_sequence;

#define y_l(x,y) (x->class==y?1.0:-1.0)    // y_l() from the paper
#define b(x,y) (x->class==y?1:0)           // b() from the paper (binary class match)

weakclassifier_t* train_text_stump(double min_objective, template_t* template, vector_t* examples, double** sum_of_weights, int num_classes)
{
	int i,l;
	int column=template->column;
	size_t num_tokens=template->tokens->length;
	int32_t t;
	double* weight[2][num_classes]; // weight[b][label][token] (=> D() in the paper), only for token presence (absence is infered from sum_of_weights)
	for(l=0;l<num_classes;l++)
	{
		weight[0][l]=MALLOC(sizeof(double)*num_tokens);
		weight[1][l]=MALLOC(sizeof(double)*num_tokens);
	}
	for(t=1;t<num_tokens;t++)  // initialize
	{
		for(l=0;l<num_classes;l++)
		{
			weight[0][l][t]=0.0;
			weight[1][l][t]=0.0;
		}
		tokeninfo_t* tokeninfo=(tokeninfo_t*)vector_get(template->tokens, t);
		//fprintf(stdout,"%s [%s] %d\n",template->name->data,tokeninfo->key,tokeninfo->examples->length);
		for(i=0;i<tokeninfo->examples->length;i++) // compute the presence weights
		{
			example_t* example=(example_t*)vector_get(examples,vector_get_int32_t(tokeninfo->examples,i));
			for(l=0;l<num_classes;l++)
			{
				weight[b(example,l)][l][t]+=example->weight[l];
			}
		}
	}
	weakclassifier_t* classifier=NULL; // init an empty classifier
	classifier=MALLOC(sizeof(weakclassifier_t));
	classifier->template=template;
	classifier->threshold=NAN;
	classifier->alpha=1.0;
	classifier->type=CLASSIFIER_TYPE_TEXT;
	classifier->token=0;
	classifier->column=column;
	classifier->objective=1.0;
	classifier->c0=MALLOC(sizeof(double)*num_classes);
	classifier->c1=MALLOC(sizeof(double)*num_classes);
	classifier->c2=MALLOC(sizeof(double)*num_classes);
	double epsilon=smoothing/(num_classes*examples->length);
	//min_objective=1;
	for(t=1;t<num_tokens;t++)
	{
		double objective=0;
		for(l=0;l<num_classes;l++) // compute the objective function Z()=sum_j(sum_l(SQRT(W+*W-))
		{
			/*if(weight[0][l][t]<0)weight[0][l][t]=0.0;
			if(weight[1][l][t]<0)weight[1][l][t]=0.0;*/
			objective+=SQRT((sum_of_weights[1][l]-weight[1][l][t])*(sum_of_weights[0][l]-weight[0][l][t]));
			objective+=SQRT(weight[1][l][t]*weight[0][l][t]);
		}
		objective*=2;
		//fprintf(stdout,"DEBUG: column=%d token=%d obj=%f\n",column,t,objective);
		if(objective-min_objective<-1e-11) // select the argmin()
		{
			min_objective=objective;
			classifier->token=t;
			classifier->objective=objective;
			for(l=0;l<num_classes;l++)  // update c0, c1 and c2 => c0 and c1 are the same for text stumps
			{
				classifier->c0[l]=0.5*LOG((sum_of_weights[1][l]-weight[1][l][t]+epsilon)/(sum_of_weights[0][l]-weight[0][l][t]+epsilon));
				classifier->c1[l]=classifier->c0[l];
				//classifier->c0[l]=0;
				//classifier->c1[l]=0.5*LOG((sum_of_weights[1][l]-weight[1][l][t]+epsilon)/(sum_of_weights[0][l]-weight[0][l][t]+epsilon));
				classifier->c2[l]=0.5*LOG((weight[1][l][t]+epsilon)/(weight[0][l][t]+epsilon));
			}
		}
	}
	for(l=0;l<num_classes;l++) // free memory
	{
		FREE(weight[0][l]);
		FREE(weight[1][l]);
	}
	//tokeninfo_t* info=vector_get(template->tokens,classifier->token);
	//fprintf(stdout,"DEBUG: column=%d token=%s obj=%f %s\n",column,info->key,classifier->objective,template->name->data);
	if(classifier->token==0) // no better classifier has been found
	{
		FREE(classifier->c0);
		FREE(classifier->c1);
		FREE(classifier->c2);
		FREE(classifier);
		return NULL;
	}
	return classifier;
}

weakclassifier_t* train_continuous_stump(double min_objective, template_t* template, vector_t* examples_vector, int num_classes)
{
	size_t i,j,l;
	int column=template->column;
	float* values=(float*)template->values->data;
	int32_t* ordered=template->ordered;
	example_t** examples=(example_t**)examples_vector->data;
	if(ordered==NULL) // only order examples once, then keep the result in the template
	{
		ordered=MALLOC(sizeof(int32_t)*examples_vector->length);
		int32_t index=0;
		for(index=0;index<examples_vector->length;index++)ordered[index]=index;
		//for(i=0;i<examples->length && i<4;i++)fprintf(stdout,"%d %f\n",i,vector_get_float(((example_t*)vector_get(examples,i))->features,column));
		int local_comparator(const void* a, const void* b)
		{
			int32_t aa=*(int32_t*)a;
			int32_t bb=*(int32_t*)b;
			float aa_value=values[aa];
			float bb_value=values[bb];
			//float aa_value=vector_get_float(template->values,(size_t)aa);
			//float bb_value=vector_get_float(template->values,(size_t)bb);
			//float aa_value=vector_get_float(((example_t*)vector_get(examples,(size_t)aa))->features,column);
			//float bb_value=vector_get_float(((example_t*)vector_get(examples,(size_t)bb))->features,column);
			//fprintf(stdout,"%d(%f) <=> %d(%f)\n",aa,aa_value,bb,bb_value);
			//if(aa_value<bb_value)return -1;
			//if(aa_value>bb_value)return 1;
			if(isnan(aa_value) || aa_value>bb_value)return 1; // put the NAN (unknown values) at the end of the list
			if(isnan(bb_value) || aa_value<bb_value)return -1;
			return 0;
		}
		qsort(ordered,examples_vector->length,sizeof(int32_t),local_comparator);
		//vector_sort(ordered,local_comparator);
		template->ordered=ordered;
	}
	double weight[3][2][num_classes]; // D(j,b,l)
	for(j=0;j<3;j++)
		for(l=0;l<num_classes;l++)
		{
			weight[j][0][l]=0.0;
			weight[j][1][l]=0.0;
		}
	for(i=0;i<examples_vector->length;i++) // compute the "unknown" weights and the weight of examples after threshold
	{
		int32_t example_id=ordered[i];
		example_t* example=examples[example_id];
		//fprintf(stdout,"%d %f\n",column,vector_get_float(example->features,column));
		for(l=0;l<num_classes;l++)
		{
			if(isnan(values[example_id]))
				weight[0][b(example,l)][l]+=example->weight[l];
			else
				weight[2][b(example,l)][l]+=example->weight[l];
		}
	}
	weakclassifier_t* classifier=NULL; // new classifier
	classifier=MALLOC(sizeof(weakclassifier_t));
	classifier->template=template;
	classifier->threshold=NAN;
	classifier->alpha=1.0;
	classifier->type=CLASSIFIER_TYPE_THRESHOLD;
	classifier->token=0;
	classifier->column=column;
	classifier->objective=1.0;
	classifier->c0=MALLOC(sizeof(double)*num_classes);
	classifier->c1=MALLOC(sizeof(double)*num_classes);
	classifier->c2=MALLOC(sizeof(double)*num_classes);
	double epsilon=smoothing/(num_classes*examples_vector->length);
	for(i=0;i<examples_vector->length-1;i++) // compute the objective function at every possible threshold (in between examples)
	{
		int32_t example_id=ordered[i];
		example_t* example=examples[example_id];
		//fprintf(stdout,"%zd %zd %f\n",i,vector_get_int32_t(ordered,i),vector_get_float(template->values,example_id));
		if(isnan(values[example_id]))break; // skip unknown values
		//example_t* next_example=(example_t*)vector_get(examples,(size_t)next_example_id);
		for(l=0;l<num_classes;l++) // update the objective function by putting the current example the other side of the threshold
		{
			weight[1][b(example,l)][l]+=example->weight[l];
			weight[2][b(example,l)][l]-=example->weight[l];
		}
		int next_example_id=ordered[i+1];
		if(values[example_id]==values[next_example_id])continue; // same value
		double objective=0;
		for(l=0;l<num_classes;l++) // compute objective Z()
		{
			/*if(weight[0][1][l]<0)weight[0][1][l]=0.0;
			if(weight[0][0][l]<0)weight[0][0][l]=0.0;
			if(weight[1][1][l]<0)weight[1][1][l]=0.0;
			if(weight[1][0][l]<0)weight[1][0][l]=0.0;
			if(weight[2][1][l]<0)weight[2][1][l]=0.0;
			if(weight[2][0][l]<0)weight[2][0][l]=0.0;*/
			objective+=SQRT(weight[0][1][l]*weight[0][0][l]);
			objective+=SQRT(weight[1][1][l]*weight[1][0][l]);
			objective+=SQRT(weight[2][1][l]*weight[2][0][l]);
		}
		objective*=2;
		//fprintf(stdout,"DEBUG: column=%d threshold=%f obj=%f\n",column,(vector_get_float(next_example->features,column)+vector_get_float(example->features,column))/2,objective);
		if(objective-min_objective<-1e-11) // get argmin
		{
			classifier->objective=objective;
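The listing stops here (page 1 of 5), before the threshold selection finishes and before the classification side of the program. As a reading aid only, below is a minimal, hypothetical sketch of how a trained text stump of the weakclassifier_t kind above would typically be applied to an example's per-class scores under the usual AdaBoost.MH accumulation; the function name apply_text_stump and the token_present/token_known flags are illustrative and not part of icsiboost's actual API, and the real update lives in the later pages of this file.

/* Hypothetical sketch: apply one text stump to an example's per-class scores.
 * Assumes score[l] += alpha * c_j[l], where c2 is used when the token occurs
 * in the example, c1 when it is absent, and c0 when the feature is unknown
 * (matching the field comments in weakclassifier_t above). */
void apply_text_stump(const weakclassifier_t* classifier, example_t* example,
                      int token_known, int token_present, int num_classes)
{
	int l;
	for(l=0;l<num_classes;l++)
	{
		double* c = token_known ? (token_present ? classifier->c2 : classifier->c1)
		                        : classifier->c0;
		example->score[l] += classifier->alpha * c[l];
	}
}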
