📄 icsiboost.c
字号:
classifier->threshold=((double)values[next_example_id]+(double)values[example_id])/2.0; // threshold between current and next example if(isnan(classifier->threshold))die("threshold is nan, column=%d, objective=%f, i=%zd",column,objective,i); // should not happend //fprintf(stdout," %d:%d:%f",column,i,classifier->threshold); min_objective=objective; for(l=0;l<num_classes;l++) // update class weight { classifier->c0[l]=0.5*LOG((weight[0][1][l]+epsilon)/(weight[0][0][l]+epsilon)); classifier->c1[l]=0.5*LOG((weight[1][1][l]+epsilon)/(weight[1][0][l]+epsilon)); classifier->c2[l]=0.5*LOG((weight[2][1][l]+epsilon)/(weight[2][0][l]+epsilon)); } } } //fprintf(stdout,"DEBUG: column=%d threshold=%f obj=%f %s\n",column,classifier->threshold,classifier->objective,template->name->data); if(isnan(classifier->threshold)) // not found a better classifier { FREE(classifier->c0); FREE(classifier->c1); FREE(classifier->c2); FREE(classifier); return NULL; } return classifier;}#ifdef USE_THREADS#define SHARED(type,name,value) pthread_mutex_t name ## _mutex = PTHREAD_MUTEX_INITIALIZER; type name=value;#define SHARED_NOINIT(type,name) pthread_mutex_t name ## _mutex; type name;#define LOCK(variable) pthread_mutex_lock(&(variable ## _mutex));#define UNLOCK(variable) pthread_mutex_unlock(&(variable ## _mutex));SHARED(int,finished,0);typedef struct workertoolbox { vector_t* examples; // stuff that matter to the weak learner vector_t* templates; vector_t* classes; double **sum_of_weights; SHARED_NOINIT(int,next_column); // the next column to process SHARED_NOINIT(weakclassifier_t*,best_classifier); // the result of a job //sem_t* ready_to_process; // if available, a thread can start working semaphore_t* ready_to_process; //sem_t* result_available; // if available, a result is avaiable in best_classifier semaphore_t* result_available;} workertoolbox_t;workertoolbox_t* toolbox=NULL;//SHARED(int,next_worker_num,0);void* threaded_worker(void* data){ int column; /*LOCK(next_worker_num); int worker_num=next_worker_num; next_worker_num++; UNLOCK(next_worker_num);*/ while(!finished) { //sem_wait(toolbox->ready_to_process); //if(verbose)fprintf(stdout,"%d worker thread ready\n",worker_num); semaphore_eat(toolbox->ready_to_process); if(finished)pthread_exit(NULL); LOCK(toolbox->next_column); column=toolbox->next_column; toolbox->next_column++; UNLOCK(toolbox->next_column); template_t* template=(template_t*)vector_get(toolbox->templates,column); //if(verbose)fprintf(stdout,"%d worker thread processing: %s\n",worker_num,template->name->data); //----------- do the job weakclassifier_t* current=NULL; if(template->type==FEATURE_TYPE_CONTINUOUS) { current=train_continuous_stump(1.0, template, toolbox->examples, toolbox->classes->length); } else if(template->type==FEATURE_TYPE_TEXT || template->type==FEATURE_TYPE_SET) { current=train_text_stump(1.0, template, toolbox->examples, toolbox->sum_of_weights, toolbox->classes->length); } //----------- return the result if(current!=NULL) { //fprintf(stdout,"obj:%f %f\n",current->objective,toolbox->best_classifier->objective); LOCK(toolbox->best_classifier); if(current->objective-toolbox->best_classifier->objective<-1e-11) { toolbox->best_classifier->template=current->template; toolbox->best_classifier->type=current->type; toolbox->best_classifier->token=current->token; toolbox->best_classifier->column=current->column; toolbox->best_classifier->threshold=current->threshold; toolbox->best_classifier->alpha=current->alpha; toolbox->best_classifier->objective=current->objective; memcpy(toolbox->best_classifier->c0,current->c0,sizeof(double)*toolbox->classes->length); memcpy(toolbox->best_classifier->c1,current->c1,sizeof(double)*toolbox->classes->length); memcpy(toolbox->best_classifier->c2,current->c2,sizeof(double)*toolbox->classes->length); } UNLOCK(toolbox->best_classifier); FREE(current->c0); FREE(current->c1); FREE(current->c2); FREE(current); } //sem_post(toolbox->result_available); semaphore_feed(toolbox->result_available); } pthread_exit(NULL);}#endif/* compute error rate AND update weights in testing conditions (dev or test set) sum_of_weights is NULL, so just compute error rate => need to be parallelized*/double compute_classification_error(vector_t* classifiers, vector_t* examples, double** sum_of_weights, int num_classes){ int i=0; int l=0; double error=0; double normalization=0; weakclassifier_t* classifier=(weakclassifier_t*)vector_get(classifiers,classifiers->length-1); if(classifier->type==CLASSIFIER_TYPE_THRESHOLD) { for(i=0;i<examples->length;i++) { example_t* example=(example_t*)vector_get(examples,i); float value=vector_get_float(classifier->template->values,i); if(isnan(value)) { for(l=0;l<num_classes;l++) { example->score[l]+=classifier->alpha*classifier->c0[l]; example->weight[l]=example->weight[l]*EXP(-classifier->alpha*y_l(example,l)*classifier->c0[l]); } } else if(value<classifier->threshold) { for(l=0;l<num_classes;l++) { example->score[l]+=classifier->alpha*classifier->c1[l]; example->weight[l]=example->weight[l]*EXP(-classifier->alpha*y_l(example,l)*classifier->c1[l]); } } else { for(l=0;l<num_classes;l++) { example->score[l]+=classifier->alpha*classifier->c2[l]; example->weight[l]=example->weight[l]*EXP(-classifier->alpha*y_l(example,l)*classifier->c2[l]); } } } } else if(classifier->type==CLASSIFIER_TYPE_TEXT) { tokeninfo_t* tokeninfo=(tokeninfo_t*)vector_get(classifier->template->tokens,classifier->token); int* seen_examples=MALLOC(sizeof(int)*examples->length); memset(seen_examples,0,examples->length*sizeof(int)); for(i=0;i<tokeninfo->examples->length;i++) { int32_t example_id=vector_get_int32_t(tokeninfo->examples,i); seen_examples[example_id]=1; } for(i=0;i<examples->length;i++) { example_t* example=(example_t*)vector_get(examples,i); if(seen_examples[i]==1) { for(l=0;l<num_classes;l++) { example->score[l]+=classifier->alpha*classifier->c2[l]; example->weight[l]=example->weight[l]*EXP(-classifier->alpha*y_l(example,l)*classifier->c2[l]); } } else // unknown or absent (c1 = c0) { for(l=0;l<num_classes;l++) { example->score[l]+=classifier->alpha*classifier->c1[l]; example->weight[l]=example->weight[l]*EXP(-classifier->alpha*y_l(example,l)*classifier->c1[l]); } } } FREE(seen_examples); } for(i=0;i<examples->length;i++) { example_t* example=(example_t*)vector_get(examples,i); double max=-1; int argmax=0; for(l=0;l<num_classes;l++) // selected class = class with highest score { normalization+=example->weight[l]; // update Z() normalization (not the same Z as in optimization) if(example->score[l]>max) { max=example->score[l]; argmax=l; } } if(!b(example,argmax))error++; // error if the class is not the real class } if(sum_of_weights!=NULL) { //double min_weight=examples->length*num_classes; //double max_weight=0; //normalization/=num_classes*examples->length; for(l=0;l<num_classes;l++) // update the sum of weights by class { sum_of_weights[0][l]=0.0; sum_of_weights[1][l]=0.0; } for(i=0;i<examples->length;i++) // normalize the weights and do some stats for debuging { example_t* example=(example_t*)vector_get(examples,i); //fprintf(stdout,"%d",i); if(output_weights)fprintf(stdout,"iteration=%zd example=%d weights:\n",classifiers->length, i); for(l=0;l<num_classes;l++) { example->weight[l]/=normalization; if(output_weights)fprintf(stdout," %f",example->weight[l]); /*if(example->weight[l]<0)die("ERROR: negative weight: %d %d %f",i,l,example->weight[l]); if(min_weight>example->weight[l]){min_weight=example->weight[l];} if(max_weight<example->weight[l]){max_weight=example->weight[l];}*/ //fprintf(stdout," %f",example->weight[l]); sum_of_weights[b(example,l)][l]+=example->weight[l]; } if(output_weights)fprintf(stdout,"\n"); //if(output_scores)fprintf(stdout,"XXX %f %f \n",example->score[0]/classifiers->length,example->score[1]/classifiers->length); } //fprintf(stdout,"norm=%.12f min=%.12f max=%.12f\n",normalization,min_weight,max_weight); } return error/examples->length;}/* load a data file if it is a test or dev file (in_test=1) then do not update dictionaries note: real test files without a class in the end will fail (this is not classification mode)*/vector_t* load_examples(const char* filename, vector_t* templates, vector_t* classes, int feature_count_cutoff, int in_test){ int i,j; mapped_t* input = mapped_load_readonly(filename); if(input == NULL) { warn("can't load \"%s\"", filename); return NULL; } vector_t* examples = vector_new(16); int line_num = 0; char* begining_of_line = (char*)input->data; char* end_of_file = (char*)input->data+input->length; while(begining_of_line < end_of_file)// && examples->length<10000) { line_num++; while(begining_of_line<end_of_file && *begining_of_line==' ') begining_of_line++; if(*begining_of_line=='|' || *begining_of_line=='\n') // skip comments and blank lines { while(begining_of_line<end_of_file && *begining_of_line!='\n') begining_of_line++; begining_of_line++; continue; } if(begining_of_line >= end_of_file) die("unexpected end of file, line %d in %s", line_num, filename); example_t* example = MALLOC(sizeof(example_t)); // that's one new example per line //example->features = vector_new_type(templates->length,sizeof(int32_t)); // we will store 32bit ints and floats //example->features->length=templates->length; example->weight = NULL; char* current = begining_of_line; int i; for(i = 0; i < templates->length; i++) // get one feature per column { template_t* template = (template_t*)vector_get(templates,i); while(current < end_of_file && *current == ' ') current++; // strip spaces at begining if(current >= end_of_file) die("unexpected end of file, line %d, column %d (%s) in %s", line_num, i, template->name->data, filename); char* token = current; size_t length = 0; while(current < end_of_file && *current != ',') // get up to coma { current++; length++; } if(current >= end_of_file) die("unexpected end of file, line %d, column %d (%s) in %s", line_num, i, template->name->data, filename); while(*(token+length-1) == ' ' && length > 0) length--; // strip spaces at end char field[length+1]; memcpy(field, token, length); field[length] = '\0'; if(template->type == FEATURE_TYPE_CONTINUOUS) { float value=NAN;// unknwon is represented by Not-A-Number (NAN) char* error_location=NULL; if(length==0 || strcmp(field,"?")) // if not unknown value { value = strtof(field, &error_location); if(error_location==NULL || *error_location!='\0') die("could not convert \"%s\" to a number, line %d, char %td, column %d (%s) in %s", field, line_num, token-begining_of_line+1, i, template->name->data, filename); } vector_push_float(template->values,value); } else if(template->type == FEATURE_TYPE_TEXT || template->type==FEATURE_TYPE_SET) { if(length==0 || strcmp(field,"?")) // if not unknwon value { char* word=NULL; hashtable_t* bag_of_words=hashtable_new(); for(word=strtok(field, " "); word != NULL; word=strtok(NULL," ")) { tokeninfo_t* tokeninfo = hashtable_get(template->dictionary, word, strlen(word)); if(tokeninfo == NULL) { if(in_test)tokeninfo=vector_get(template->tokens,0); // default to the unknown token else if(template->type == FEATURE_TYPE_TEXT) // update the dictionary with the new token { tokeninfo = (tokeninfo_t*)MALLOC(sizeof(tokeninfo_t)); tokeninfo->id = template->tokens->length; tokeninfo->key = strdup(word); tokeninfo->count=0; tokeninfo->examples=vector_new_int32_t(16); hashtable_set(template->dictionary, word, strlen(word), tokeninfo); vector_push(template->tokens, tokeninfo); } else die("value \"%s\" was not described in the .names file, line %d, column %d (%s) in %s", word, line_num, i, template->name->data, filename); } //vector_set_int32(example->features,i,tokeninfo->id); if(hashtable_get(bag_of_words, word, strlen(word))==NULL) { hashtable_set(bag_of_words, word, strlen(word), word); tokeninfo->count++; vector_push_int32_t(tokeninfo->examples,(int32_t)examples->length); // inverted index } } hashtable_free(bag_of_words); } else { //vector_set_int32(example->features,i,0); // unknown token is 0 (aka NULL) } } else { //=> FEATURE_TYPE_IGNORE } current++; } //fprintf(stdout,"%d %p\n",examples->length,vector_get(example->features,0)); char* class = current; // get class label while(current < end_of_file && *current != '\n') current++; // up to end of line if(current >= end_of_file) die("unexpected end of file, line %d in %s", line_num, filename); while(class < current && *class == ' ') class++; // strip spaces at the begining size_t length=0; while(class < current && *(class+length) != ' ' && *(class+length) != '.' && *(class+length) != '\n') length++; // strip "." and spaces at the end char class_token[length+1]; memcpy(class_token,class,length); // copy as a cstring class_token[length]='\0'; int id = 0;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -