📄 icsiboost.c

📁 Boosting is a meta-learning approach that aims at combining an ensemble of weak classifiers to form
💻 C
📖 第 1 页 / 共 5 页
字号:
			classifier->threshold=((double)values[next_example_id]+(double)values[example_id])/2.0; // threshold between current and next example			if(isnan(classifier->threshold))die("threshold is nan, column=%d, objective=%f, i=%zd",column,objective,i); // should not happend			//fprintf(stdout," %d:%d:%f",column,i,classifier->threshold);			min_objective=objective;			for(l=0;l<num_classes;l++) // update class weight			{				classifier->c0[l]=0.5*LOG((weight[0][1][l]+epsilon)/(weight[0][0][l]+epsilon));				classifier->c1[l]=0.5*LOG((weight[1][1][l]+epsilon)/(weight[1][0][l]+epsilon));				classifier->c2[l]=0.5*LOG((weight[2][1][l]+epsilon)/(weight[2][0][l]+epsilon));			}		}	}	//fprintf(stdout,"DEBUG: column=%d threshold=%f obj=%f %s\n",column,classifier->threshold,classifier->objective,template->name->data);	if(isnan(classifier->threshold)) // not found a better classifier	{		FREE(classifier->c0);		FREE(classifier->c1);		FREE(classifier->c2);		FREE(classifier);		return NULL;	}	return classifier;}#ifdef USE_THREADS#define SHARED(type,name,value) pthread_mutex_t name ## _mutex = PTHREAD_MUTEX_INITIALIZER; type name=value;#define SHARED_NOINIT(type,name) pthread_mutex_t name ## _mutex; type name;#define LOCK(variable) pthread_mutex_lock(&(variable ## _mutex));#define UNLOCK(variable) pthread_mutex_unlock(&(variable ## _mutex));SHARED(int,finished,0);typedef struct workertoolbox {	vector_t* examples; // stuff that matter to the weak learner	vector_t* templates;	vector_t* classes;	double **sum_of_weights;	SHARED_NOINIT(int,next_column); // the next column to process	SHARED_NOINIT(weakclassifier_t*,best_classifier); // the result of a job	//sem_t* ready_to_process; // if available, a thread can start working	semaphore_t* ready_to_process;	//sem_t* result_available; // if available, a result is avaiable in best_classifier	semaphore_t* result_available;} workertoolbox_t;workertoolbox_t* toolbox=NULL;//SHARED(int,next_worker_num,0);void* threaded_worker(void* data){	int column;	/*LOCK(next_worker_num);	int worker_num=next_worker_num;	next_worker_num++;	UNLOCK(next_worker_num);*/	while(!finished)	{		//sem_wait(toolbox->ready_to_process);		//if(verbose)fprintf(stdout,"%d worker thread ready\n",worker_num);		semaphore_eat(toolbox->ready_to_process);		if(finished)pthread_exit(NULL);		LOCK(toolbox->next_column);		column=toolbox->next_column;		toolbox->next_column++;		UNLOCK(toolbox->next_column);		template_t* template=(template_t*)vector_get(toolbox->templates,column);		//if(verbose)fprintf(stdout,"%d worker thread processing: %s\n",worker_num,template->name->data);		//----------- do the job		weakclassifier_t* current=NULL;		if(template->type==FEATURE_TYPE_CONTINUOUS)		{			current=train_continuous_stump(1.0, template, toolbox->examples, toolbox->classes->length);		}		else if(template->type==FEATURE_TYPE_TEXT || template->type==FEATURE_TYPE_SET)		{			current=train_text_stump(1.0, template, toolbox->examples, toolbox->sum_of_weights, toolbox->classes->length);		}		//----------- return the result		if(current!=NULL)		{			//fprintf(stdout,"obj:%f %f\n",current->objective,toolbox->best_classifier->objective);			LOCK(toolbox->best_classifier);			if(current->objective-toolbox->best_classifier->objective<-1e-11)			{				toolbox->best_classifier->template=current->template;				toolbox->best_classifier->type=current->type;				toolbox->best_classifier->token=current->token;				toolbox->best_classifier->column=current->column;				toolbox->best_classifier->threshold=current->threshold;				toolbox->best_classifier->alpha=current->alpha;				toolbox->best_classifier->objective=current->objective;				memcpy(toolbox->best_classifier->c0,current->c0,sizeof(double)*toolbox->classes->length);				memcpy(toolbox->best_classifier->c1,current->c1,sizeof(double)*toolbox->classes->length);				memcpy(toolbox->best_classifier->c2,current->c2,sizeof(double)*toolbox->classes->length);			}			UNLOCK(toolbox->best_classifier);			FREE(current->c0);			FREE(current->c1);			FREE(current->c2);			FREE(current);		}		//sem_post(toolbox->result_available);		semaphore_feed(toolbox->result_available);	}	pthread_exit(NULL);}#endif/* compute error rate AND update weights   in testing conditions (dev or test set) sum_of_weights is NULL, so just compute error rate   => need to be parallelized*/double compute_classification_error(vector_t* classifiers, vector_t* examples, double** sum_of_weights, int num_classes){	int i=0;	int l=0;	double error=0;	double normalization=0;	weakclassifier_t* classifier=(weakclassifier_t*)vector_get(classifiers,classifiers->length-1);	if(classifier->type==CLASSIFIER_TYPE_THRESHOLD)	{		for(i=0;i<examples->length;i++)		{			example_t* example=(example_t*)vector_get(examples,i);			float value=vector_get_float(classifier->template->values,i);			if(isnan(value))			{				for(l=0;l<num_classes;l++)				{					example->score[l]+=classifier->alpha*classifier->c0[l];					example->weight[l]=example->weight[l]*EXP(-classifier->alpha*y_l(example,l)*classifier->c0[l]);				}			}			else if(value<classifier->threshold)			{				for(l=0;l<num_classes;l++)				{					example->score[l]+=classifier->alpha*classifier->c1[l];					example->weight[l]=example->weight[l]*EXP(-classifier->alpha*y_l(example,l)*classifier->c1[l]);				}			}			else			{				for(l=0;l<num_classes;l++)				{					example->score[l]+=classifier->alpha*classifier->c2[l];					example->weight[l]=example->weight[l]*EXP(-classifier->alpha*y_l(example,l)*classifier->c2[l]);				}			}		}	}	else if(classifier->type==CLASSIFIER_TYPE_TEXT)	{		tokeninfo_t* tokeninfo=(tokeninfo_t*)vector_get(classifier->template->tokens,classifier->token);		int* seen_examples=MALLOC(sizeof(int)*examples->length);		memset(seen_examples,0,examples->length*sizeof(int));		for(i=0;i<tokeninfo->examples->length;i++)		{			int32_t example_id=vector_get_int32_t(tokeninfo->examples,i);			seen_examples[example_id]=1;		}		for(i=0;i<examples->length;i++)		{			example_t* example=(example_t*)vector_get(examples,i);			if(seen_examples[i]==1)			{				for(l=0;l<num_classes;l++)				{					example->score[l]+=classifier->alpha*classifier->c2[l];					example->weight[l]=example->weight[l]*EXP(-classifier->alpha*y_l(example,l)*classifier->c2[l]);				}			}			else // unknown or absent (c1 = c0)			{				for(l=0;l<num_classes;l++)				{					example->score[l]+=classifier->alpha*classifier->c1[l];					example->weight[l]=example->weight[l]*EXP(-classifier->alpha*y_l(example,l)*classifier->c1[l]);				}			}		}		FREE(seen_examples);	}	for(i=0;i<examples->length;i++)	{		example_t* example=(example_t*)vector_get(examples,i);		double max=-1;		int argmax=0;		for(l=0;l<num_classes;l++) // selected class = class with highest score		{			normalization+=example->weight[l]; // update Z() normalization (not the same Z as in optimization)			if(example->score[l]>max)			{				max=example->score[l];				argmax=l;			}		}		if(!b(example,argmax))error++; // error if the class is not the real class	}	if(sum_of_weights!=NULL)	{		//double min_weight=examples->length*num_classes;		//double max_weight=0;		//normalization/=num_classes*examples->length;		for(l=0;l<num_classes;l++) // update the sum of weights by class		{			sum_of_weights[0][l]=0.0;			sum_of_weights[1][l]=0.0;		}		for(i=0;i<examples->length;i++) // normalize the weights and do some stats for debuging		{			example_t* example=(example_t*)vector_get(examples,i);			//fprintf(stdout,"%d",i);			if(output_weights)fprintf(stdout,"iteration=%zd example=%d weights:\n",classifiers->length, i);			for(l=0;l<num_classes;l++)			{				example->weight[l]/=normalization;				if(output_weights)fprintf(stdout," %f",example->weight[l]);				/*if(example->weight[l]<0)die("ERROR: negative weight: %d %d %f",i,l,example->weight[l]);				if(min_weight>example->weight[l]){min_weight=example->weight[l];}				if(max_weight<example->weight[l]){max_weight=example->weight[l];}*/				//fprintf(stdout," %f",example->weight[l]);				sum_of_weights[b(example,l)][l]+=example->weight[l];			}			if(output_weights)fprintf(stdout,"\n");			//if(output_scores)fprintf(stdout,"XXX %f %f \n",example->score[0]/classifiers->length,example->score[1]/classifiers->length);		}		//fprintf(stdout,"norm=%.12f min=%.12f max=%.12f\n",normalization,min_weight,max_weight);	}	return error/examples->length;}/* load a data file  if it is a test or dev file (in_test=1) then do not update dictionaries  note: real test files without a class in the end will fail (this is not classification mode)*/vector_t* load_examples(const char* filename, vector_t* templates, vector_t* classes, int feature_count_cutoff, int in_test){	int i,j;	mapped_t* input = mapped_load_readonly(filename);	if(input == NULL)	{		warn("can't load \"%s\"", filename);		return NULL;	}	vector_t* examples = vector_new(16);	int line_num = 0;	char* begining_of_line = (char*)input->data;	char* end_of_file = (char*)input->data+input->length;	while(begining_of_line < end_of_file)// && examples->length<10000)	{		line_num++;		while(begining_of_line<end_of_file && *begining_of_line==' ') begining_of_line++;		if(*begining_of_line=='|' || *begining_of_line=='\n') // skip comments and blank lines		{			while(begining_of_line<end_of_file && *begining_of_line!='\n') begining_of_line++;			begining_of_line++;			continue;		}		if(begining_of_line >= end_of_file) die("unexpected end of file, line %d in %s", line_num, filename);		example_t* example = MALLOC(sizeof(example_t)); // that's one new example per line		//example->features = vector_new_type(templates->length,sizeof(int32_t)); // we will store 32bit ints and floats		//example->features->length=templates->length;		example->weight = NULL;		char* current = begining_of_line;		int i;		for(i = 0; i < templates->length; i++) // get one feature per column		{			template_t* template = (template_t*)vector_get(templates,i);			while(current < end_of_file && *current == ' ') current++; // strip spaces at begining			if(current >= end_of_file) die("unexpected end of file, line %d, column %d (%s) in %s", line_num, i, template->name->data, filename);			char* token = current;			size_t length = 0;			while(current < end_of_file && *current != ',') // get up to coma			{				current++;				length++;			}			if(current >= end_of_file) die("unexpected end of file, line %d, column %d (%s) in %s", line_num, i, template->name->data, filename);			while(*(token+length-1) == ' ' && length > 0) length--; // strip spaces at end			char field[length+1];			memcpy(field, token, length);			field[length] = '\0';			if(template->type == FEATURE_TYPE_CONTINUOUS)			{				float value=NAN;// unknwon is represented by Not-A-Number (NAN)				char* error_location=NULL;				if(length==0 || strcmp(field,"?")) // if not unknown value				{					value = strtof(field, &error_location);					if(error_location==NULL || *error_location!='\0')					   die("could not convert \"%s\" to a number, line %d, char %td, column %d (%s) in %s", field, line_num, token-begining_of_line+1, i, template->name->data, filename);				}				vector_push_float(template->values,value);			}			else if(template->type == FEATURE_TYPE_TEXT || template->type==FEATURE_TYPE_SET)			{				if(length==0 || strcmp(field,"?")) // if not unknwon value				{					char* word=NULL;					hashtable_t* bag_of_words=hashtable_new();					for(word=strtok(field, " "); word != NULL; word=strtok(NULL," "))					{						tokeninfo_t* tokeninfo = hashtable_get(template->dictionary, word, strlen(word));						if(tokeninfo == NULL)						{							if(in_test)tokeninfo=vector_get(template->tokens,0); // default to the unknown token							else if(template->type == FEATURE_TYPE_TEXT) // update the dictionary with the new token							{								tokeninfo = (tokeninfo_t*)MALLOC(sizeof(tokeninfo_t));								tokeninfo->id = template->tokens->length;								tokeninfo->key = strdup(word);								tokeninfo->count=0;								tokeninfo->examples=vector_new_int32_t(16);								hashtable_set(template->dictionary, word, strlen(word), tokeninfo);								vector_push(template->tokens, tokeninfo);							}							else die("value \"%s\" was not described in the .names file, line %d, column %d (%s) in %s", word, line_num, i, template->name->data, filename);						}						//vector_set_int32(example->features,i,tokeninfo->id);						if(hashtable_get(bag_of_words, word, strlen(word))==NULL)						{							hashtable_set(bag_of_words, word, strlen(word), word);							tokeninfo->count++;							vector_push_int32_t(tokeninfo->examples,(int32_t)examples->length); // inverted index						}					}					hashtable_free(bag_of_words);				}				else				{					//vector_set_int32(example->features,i,0); // unknown token is 0 (aka NULL)				}			}			else			{				//=> FEATURE_TYPE_IGNORE			}			current++;		}		//fprintf(stdout,"%d %p\n",examples->length,vector_get(example->features,0));		char* class = current; // get class label		while(current < end_of_file && *current != '\n') current++; // up to end of line		if(current >= end_of_file) die("unexpected end of file, line %d in %s", line_num, filename);		while(class < current && *class == ' ') class++; // strip spaces at the begining		size_t length=0;		while(class < current && *(class+length) != ' ' && *(class+length) != '.' && *(class+length) != '\n') length++; // strip "." and spaces at the end		char class_token[length+1];		memcpy(class_token,class,length); // copy as a cstring		class_token[length]='\0';		int id = 0;
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -