📄 icsiboost.c
字号:
{ string_t* class=vector_get(classes,l); fprintf(stdout,"%s%s % 5f : %s \n",string_cmp(true_class,class)==0?"*":" ", (score[l]>0?">":(string_cmp(true_class,class)==0?"*":" ")),score[l],class->data); if((score[l]<0 && string_cmp(true_class,class)==0))errors++; } } else { fprintf(stdout,"correct label = ?\n"); for(l=0; l<classes->length; l++) { string_t* class=vector_get(classes,l); fprintf(stdout," % 5f : %s\n", score[l],class->data); } } } } vector_free(tokens); string_array_free(array_of_tokens); string_free(line); num_examples++; } if(!dryrun_mode && classification_output!=0) { fprintf(stderr,"ERROR RATE: %f\n",(double)errors/(double)num_examples); } exit(0); } // load a train, dev and test sets (if available) if(data_filename==NULL) { data_filename = string_copy(stem); string_append_cstr(data_filename, ".data"); } vector_t* examples = load_examples(data_filename->data, templates, classes, feature_count_cutoff, 0); string_free(data_filename); // generate a simple random sequence of example ids for sampleing /*random_sequence=(int*)MALLOC(sizeof(int)*examples->length); srand(time(NULL)); for(i=0;i<examples->length;i++) { int index=rand()%examples->length; int tmp=random_sequence[i]; random_sequence[i]=random_sequence[index]; random_sequence[index]=tmp; }*/ vector_t* dev_examples=NULL; vector_t* test_examples=NULL;// deactivated dev/test that don't work with the new indexed features (need separate handleing) /*data_filename = string_copy(stem); string_append_cstr(data_filename, ".dev"); dev_examples = load_examples(data_filename->data, templates, classes, 0, 1); string_free(data_filename); data_filename = string_copy(stem); string_append_cstr(data_filename, ".test"); test_examples = load_examples(data_filename->data, templates, classes, 0, 1); string_free(data_filename);*/ if(dryrun_mode)exit(0); // initialize the sum of weights by classes (to infer the other side of the partition in binary classifiers) // sum_of_weights[b][l] double **sum_of_weights=(double**)MALLOC(sizeof(double*)*2); sum_of_weights[0]=(double*)MALLOC(sizeof(double)*classes->length); sum_of_weights[1]=(double*)MALLOC(sizeof(double)*classes->length); for(i=0;i<classes->length;i++) { sum_of_weights[0][i]=0.0; sum_of_weights[1][i]=0.0; } int l; for(i=0;i<examples->length;i++) { example_t* example=(example_t*)vector_get(examples,i); for(l=0;l<classes->length;l++) { sum_of_weights[b(example,l)][l]+=example->weight[l]; } }#ifdef USE_THREADS pthread_t workers[number_of_workers]; //workertoolbox_t* toolbox=MALLOC(sizeof(workertoolbox_t)); toolbox=MALLOC(sizeof(workertoolbox_t)); toolbox->examples=examples; toolbox->templates=templates; toolbox->classes=classes; toolbox->sum_of_weights=sum_of_weights; toolbox->next_column=0; pthread_mutex_init(&toolbox->next_column_mutex,NULL); pthread_mutex_init(&toolbox->best_classifier_mutex,NULL); toolbox->best_classifier=MALLOC(sizeof(weakclassifier_t)); toolbox->best_classifier->c0=MALLOC(sizeof(double)*classes->length); toolbox->best_classifier->c1=MALLOC(sizeof(double)*classes->length); toolbox->best_classifier->c2=MALLOC(sizeof(double)*classes->length); toolbox->best_classifier->objective=1.0; //toolbox->ready_to_process=MALLOC(sizeof(sem_t)); //sem_init(toolbox->ready_to_process,0,0); //sem_unlink("ready_to_process"); //toolbox->ready_to_process=sem_open("ready_to_process",O_CREAT,0700,0); //if(toolbox->ready_to_process==(sem_t*)SEM_FAILED)die("ready to process %d",SEM_FAILED); toolbox->ready_to_process=semaphore_new(0); //toolbox->result_available=MALLOC(sizeof(sem_t)); //sem_init(toolbox->result_available,0,0); //sem_unlink("result_available"); //toolbox->result_available=sem_open("result_avaiable",O_CREAT,0700,0); //if(toolbox->result_available==(sem_t*)SEM_FAILED)die("result_available"); toolbox->result_available=semaphore_new(0); semaphore_feed(toolbox->result_available); semaphore_eat(toolbox->result_available); for(i=0; i<number_of_workers; i++) { /*pthread_attr_t attributes; pthread_attr_init(&attributes); pthread_attr_setdetachstate(&attributes,PTHREAD_CREATE_JOINABLE);*/ //pthread_create(&workers[i],&attributes,threaded_worker,NULL); pthread_create(&workers[i],NULL,threaded_worker,NULL); //pthread_attr_destroy(&attributes); }#endif int iteration=0; vector_t* classifiers=vector_new(maximum_iterations); double theorical_error=1.0; for(iteration=0;iteration<maximum_iterations;iteration++) {#ifdef USE_THREADS LOCK(toolbox->next_column); toolbox->next_column=0; UNLOCK(toolbox->next_column); LOCK(toolbox->best_classifier); toolbox->best_classifier->objective=1.0; UNLOCK(toolbox->best_classifier);#else double min_objective=1.0;#endif weakclassifier_t* classifier=NULL;#ifdef USE_THREADS for(i=0;i<templates->length;i++) // find the best classifier { //sem_post(toolbox->ready_to_process); semaphore_feed(toolbox->ready_to_process); } // need to store and reorder potential classifiers to get a deterministic behaviour for(i=0;i<templates->length;i++) // wait for the results { //sem_wait(toolbox->result_available); semaphore_eat(toolbox->result_available); } // all results should be available classifier=MALLOC(sizeof(weakclassifier_t)); memcpy(classifier,toolbox->best_classifier,sizeof(weakclassifier_t)); classifier->c0=MALLOC(sizeof(double)*classes->length); classifier->c1=MALLOC(sizeof(double)*classes->length); classifier->c2=MALLOC(sizeof(double)*classes->length); memcpy(classifier->c0,toolbox->best_classifier->c0,sizeof(double)*classes->length); memcpy(classifier->c1,toolbox->best_classifier->c1,sizeof(double)*classes->length); memcpy(classifier->c2,toolbox->best_classifier->c2,sizeof(double)*classes->length);#else for(i=0;i<templates->length;i++) // fine the best classifier { template_t* template=(template_t*)vector_get(templates,i); weakclassifier_t* current=NULL; if(template->type==FEATURE_TYPE_CONTINUOUS) { current=train_continuous_stump(min_objective, template, examples, classes->length); } else if(template->type==FEATURE_TYPE_TEXT || template->type==FEATURE_TYPE_SET) { current=train_text_stump(min_objective, template, examples, sum_of_weights, classes->length); } // else => FEATURE_TYPE_IGNORE if(current==NULL)continue; if(current->objective-min_objective<-1e-11) { min_objective=current->objective; if(classifier!=NULL) // free previous classifier { FREE(classifier->c0); FREE(classifier->c1); FREE(classifier->c2); FREE(classifier); } classifier=current; } else { FREE(current->c0); FREE(current->c1); FREE(current->c2); FREE(current); } }#endif vector_push(classifiers,classifier); if(iteration==maximum_iterations-1) output_scores=1; else output_scores=0; double error=compute_classification_error(classifiers, examples, sum_of_weights, classes->length); // compute error rate and update weights double dev_error=NAN; if(dev_examples!=NULL)dev_error = compute_classification_error(classifiers, dev_examples, NULL, classes->length); // compute error rate on dev double test_error=NAN; if(test_examples!=NULL)test_error = compute_classification_error(classifiers, test_examples, NULL, classes->length); // compute error rate on test // display result "a la" boostexter if(verbose==1) { char* token=""; if(classifier->type==CLASSIFIER_TYPE_TEXT) { tokeninfo_t* tokeninfo=(tokeninfo_t*)vector_get(classifier->template->tokens,classifier->token); token=tokeninfo->key; } if(classifier->type==CLASSIFIER_TYPE_THRESHOLD) { fprintf(stdout,"\n%s:%s\n",classifier->template->name->data,token); fprintf(stdout,"Threshold: %7.3g\n",classifier->threshold); classifier->c0[0]=-1e-11; fprintf(stdout,"C0: ");for(i=0;i<classes->length;i++)fprintf(stdout," % 4.3f ",classifier->c0[i]); fprintf(stdout,"\n"); fprintf(stdout,"C1: ");for(i=0;i<classes->length;i++)fprintf(stdout," % 4.3f ",classifier->c1[i]); fprintf(stdout,"\n"); fprintf(stdout,"C2: ");for(i=0;i<classes->length;i++)fprintf(stdout," % 4.3f ",classifier->c2[i]); fprintf(stdout,"\n"); } else if(classifier->type==CLASSIFIER_TYPE_TEXT) { fprintf(stdout,"\n%s:%s \n",classifier->template->name->data,token); fprintf(stdout,"C0: ");for(i=0;i<classes->length;i++)fprintf(stdout," % 4.3f ",classifier->c1[i]); fprintf(stdout,"\n"); fprintf(stdout,"C1: ");for(i=0;i<classes->length;i++)fprintf(stdout," % 4.3f ",classifier->c2[i]); fprintf(stdout,"\n"); } } theorical_error*=classifier->objective; fprintf(stdout,"rnd %4d: wh-err= %.6f th-err= %.6f test= %7f train= %7f\n",iteration+1,classifier->objective,theorical_error,test_error,error); //fprintf(stdout,"rnd %4d: wh-err= %.6f th-err= %.6f dev= %.7f test= %.7f train= %.7f\n",iteration+1,classifier->objective,theorical_error,dev_error,test_error,error); // unlike boostexter, C0 is always unk, C1 below or absent, C2 above or present } if(model_name==NULL) { model_name = string_copy(stem); string_append_cstr(model_name, ".shyp"); } save_model(classifiers,classes,model_name->data,pack_model); string_free(model_name); // release data structures (this is why we need a garbage collector ;) FREE(sum_of_weights[0]); FREE(sum_of_weights[1]); FREE(sum_of_weights); for(i = 0; i < classifiers->length; i++) { weakclassifier_t* classifier=(weakclassifier_t*)vector_get(classifiers,i); if(classifier==NULL)continue; FREE(classifier->c0); FREE(classifier->c1); FREE(classifier->c2); FREE(classifier); } vector_free(classifiers); for(i=0; i<classes->length; i++) string_free((string_t*)vector_get(classes,i)); vector_free(classes); for(i=0; i<templates->length; i++) { template_t* template=(template_t*)vector_get(templates,i); string_free(template->name); hashtable_free(template->dictionary); int j; for(j=0; j<template->tokens->length; j++) { tokeninfo_t* tokeninfo = (tokeninfo_t*) vector_get(template->tokens, j); if(tokeninfo->examples!=NULL)vector_free(tokeninfo->examples); FREE(tokeninfo->key); FREE(tokeninfo); } vector_free(template->tokens); vector_free(template->values); if(template->ordered!=NULL)FREE(template->ordered); if(template->classifiers!=NULL)vector_free(template->classifiers); FREE(template); } vector_free(templates); if(dev_examples!=NULL) { for(i=0; i<dev_examples->length; i++) { example_t* example=(example_t*)vector_get(dev_examples,i); //vector_free(example->features); FREE(example->weight); FREE(example->score); FREE(example); } vector_free(dev_examples); } if(test_examples!=NULL) { for(i=0; i<test_examples->length; i++) { example_t* example=(example_t*)vector_get(test_examples,i); //vector_free(example->features); FREE(example->weight); FREE(example->score); FREE(example); } vector_free(test_examples); } for(i=0; i<examples->length; i++) { example_t* example=(example_t*)vector_get(examples,i); //vector_free(example->features); FREE(example->weight); FREE(example->score); FREE(example); } vector_free(examples); string_free(stem); //if(verbose)fprintf(stdout,"FINISHED!!!\n");#ifdef USE_THREADS LOCK(finished); finished=1; UNLOCK(finished); for(i=0;i<number_of_workers;i++) { int j; for(j=0;j<number_of_workers;j++) semaphore_feed(toolbox->ready_to_process); //sem_post(toolbox->ready_to_process); // need more, because you cannot unlock all at a time void* output; pthread_join(workers[i],&output); //if(verbose)fprintf(stdout,"worker joined with return value: %p\n",output); } /*sem_unlink("ready_to_process"); sem_close(toolbox->ready_to_process); sem_unlink("results_available"); sem_close(toolbox->result_available);*/ semaphore_free(toolbox->ready_to_process); semaphore_free(toolbox->result_available); FREE(toolbox->best_classifier->c0); FREE(toolbox->best_classifier->c1); FREE(toolbox->best_classifier->c2); FREE(toolbox->best_classifier); FREE(toolbox);#endif return 0;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -