📄 icsiboost.c
字号:
for(id = 0; id < classes->length; id++) // find class linearly as we usually have a few classes { if(!strncmp(((string_t*)vector_get(classes,id))->data, class, length)) break; } if(id == classes->length) die("unknown class \"%s\", line %d, char %td in %s", class_token, line_num, class-begining_of_line+1, filename); example->class = id; vector_push(examples, example); // store example begining_of_line = current+1; } vector_optimize(examples); // reduce memory consumption if(!in_test) { for(i=0;i<templates->length;i++) { template_t* template=(template_t*)vector_get(templates,i); if(template->tokens->length>1) { for(j=1; j<template->tokens->length; j++) // remove unfrequent features { tokeninfo_t* tokeninfo=vector_get(template->tokens,j); tokeninfo->id=j; if(tokeninfo->count<feature_count_cutoff) { hashtable_remove(template->dictionary, tokeninfo->key, strlen(tokeninfo->key)); if(tokeninfo->examples!=NULL)vector_free(tokeninfo->examples); FREE(tokeninfo->key); FREE(tokeninfo); memcpy(template->tokens->data+j*sizeof(void*),template->tokens->data+(template->tokens->length-1)*sizeof(void*),sizeof(void*)); template->tokens->length--; j--; } } } vector_optimize(template->tokens); vector_optimize(template->values); } } // initalize weights and score for(i=0;i<examples->length;i++) { example_t* example=(example_t*)vector_get(examples,i); //fprintf(stdout,"%d %d %p\n",i,(int)vector_get(example->features,0), example->weight); example->weight=(double*)MALLOC(classes->length*sizeof(double)); example->score=(double*)MALLOC(classes->length*sizeof(double)); for(j=0;j<classes->length;j++) { example->weight[j]=1.0/(classes->length*examples->length); // 1/(m*k) example->score[j]=0.0; } } //if(verbose)fprintf(stdout,"EXAMPLES: %s %zd\n",filename, examples->length); mapped_free(input); return examples;}vector_t* load_model(vector_t* templates, vector_t* classes, char* filename){ int i; vector_t* classifiers=vector_new(16); hashtable_t* templates_by_name=hashtable_new(); for(i=0; i<templates->length; i++) { template_t* template=(template_t*)vector_get(templates,i); hashtable_set(templates_by_name, template->name->data, template->name->length, template); template->classifiers=vector_new(16); } mapped_t* input=mapped_load_readonly(filename); if(input==NULL) die("can't load model \"%s\"", filename); string_t* line=NULL; int line_num=0; int num_classifiers=-1; weakclassifier_t* current=NULL; while((line=mapped_readline(input))!=NULL) { line_num++; if(string_match(line,"^ *$","n")) // skip blank lines { string_free(line); continue; } if(num_classifiers==-1) { num_classifiers=string_to_int32(line); } else if(current==NULL) { vector_t* groups=string_match(line," *([^ ]+) Text:(SGRAM|THRESHOLD):([^:]+):(.*)",NULL); if(groups) { current=(weakclassifier_t*)MALLOC(sizeof(weakclassifier_t)); string_t* template_name=vector_get(groups,3); template_t* template=hashtable_get(templates_by_name, template_name->data, template_name->length); if(template==NULL)die("invalid column template name \"%s\", line %d in %s", template_name->data, line_num, filename); string_t* alpha=vector_get(groups,1); current->alpha=string_to_double(alpha); if(isnan(current->alpha))die("invalid alpha value \"%s\", line %d in %s", alpha->data, line_num, filename); current->template=template; vector_push(classifiers, current); vector_push(current->template->classifiers,current); current->column=template->column; current->threshold=NAN; current->token=0; current->type=0; current->c0=NULL; current->c1=NULL; current->c2=NULL; string_t* word=vector_get(groups,4); if(string_eq_cstr(vector_get(groups,2),"SGRAM")) { current->type=CLASSIFIER_TYPE_TEXT; if(template->type==FEATURE_TYPE_SET) { int32_t token_num=string_to_int32(word)+1; if(token_num>template->tokens->length)die("invalid token number \"%s\", line %d in %s", word->data, line_num, filename); current->token=token_num; } else if(template->type==FEATURE_TYPE_TEXT) { tokeninfo_t* tokeninfo = hashtable_get(template->dictionary, word->data, word->length); if(tokeninfo==NULL) { tokeninfo = (tokeninfo_t*)MALLOC(sizeof(tokeninfo_t)); tokeninfo->id = template->tokens->length; tokeninfo->key = strdup(word->data); tokeninfo->count=0; tokeninfo->examples=NULL; hashtable_set(template->dictionary, word->data, word->length, tokeninfo); vector_push(template->tokens, tokeninfo); } current->token = tokeninfo->id; } } else if(string_eq_cstr(vector_get(groups,2),"THRESHOLD")) { current->type=CLASSIFIER_TYPE_THRESHOLD; } else die("invalid classifier definition \"%s\", line %d in %s", line->data, line_num, filename); string_vector_free(groups); } else die("invalid classifier definition \"%s\", line %d in %s", line->data, line_num, filename); } else if(current->c0==NULL) { current->c0=MALLOC(sizeof(double)*classes->length); array_t* values=string_split(line," ",NULL); if(values==NULL || values->length!=classes->length)die("invalid weight distribution \"%s\", line %d in %s",line->data,line_num,filename); for(i=0; i<values->length; i++) { string_t* value=array_get(values,i); current->c0[i]=string_to_double(value); if(isnan(current->c0[i]))die("invalid value in distribution \"%s\", line %d in %s", value->data, line_num, filename); } string_array_free(values); if(current->type==CLASSIFIER_TYPE_TEXT) { current->c1=MALLOC(sizeof(double)*classes->length); memcpy(current->c1, current->c0, sizeof(double)*classes->length); } } else if(current->c1==NULL) { current->c1=MALLOC(sizeof(double)*classes->length); array_t* values=string_split(line," ",NULL); if(values==NULL || values->length!=classes->length)die("invalid weight distribution \"%s\", line %d in %s",line->data,line_num,filename); for(i=0; i<values->length; i++) { string_t* value=array_get(values,i); current->c1[i]=string_to_double(value); if(isnan(current->c1[i]))die("invalid value in distribution \"%s\", line %d in %s", value->data, line_num, filename); } string_array_free(values); } else if(current->c2==NULL) { current->c2=MALLOC(sizeof(double)*classes->length); array_t* values=string_split(line," ",NULL); if(values==NULL || values->length!=classes->length)die("invalid weight distribution \"%s\", line %d in %s",line->data,line_num,filename); for(i=0; i<values->length; i++) { string_t* value=array_get(values,i); current->c2[i]=string_to_double(value); if(isnan(current->c2[i]))die("invalid value in distribution \"%s\", line %d in %s", value->data, line_num, filename); } string_array_free(values); if(current->type==CLASSIFIER_TYPE_TEXT) { current=NULL; } } else if(current->type==CLASSIFIER_TYPE_THRESHOLD) { current->threshold=string_to_double(line); if(isnan(current->threshold))die("invalid threshold \"%s\", line %d in %s", line->data, line_num, filename); current=NULL; } else die("invalid classifier definition \"%s\", line %d in %s", line->data, line_num, filename); string_free(line); } //if(verbose)fprintf(stdout,"LOADED_CLASSIFIERS %zd\n",classifiers->length); mapped_free(input); hashtable_free(templates_by_name); for(i=0; i<templates->length; i++) { template_t* template=vector_get(templates, i); vector_optimize(template->classifiers); } vector_optimize(classifiers); return classifiers;}void save_model(vector_t* classifiers, vector_t* classes, char* filename, int pack_model){ FILE* output=fopen(filename,"w"); if(output==NULL)die("could not output model in \"%s\"",filename); int i; int num_classifiers=classifiers->length; if(pack_model) { hashtable_t* packed_classifiers=hashtable_new(); for(i=0; i<classifiers->length; i++) { weakclassifier_t* classifier=vector_get(classifiers, i); string_t* identifier=string_sprintf("%d:%f:%d",classifier->column, classifier->threshold, classifier->token); weakclassifier_t* previous_classifier=hashtable_get(packed_classifiers, identifier->data, identifier->length); if(previous_classifier!=NULL) { int l; for(l=0;l<classes->length;l++) { previous_classifier->c0[l]+=classifier->c0[l]; previous_classifier->c1[l]+=classifier->c1[l]; previous_classifier->c2[l]+=classifier->c2[l]; } previous_classifier->alpha+=classifier->alpha; FREE(classifier->c0); FREE(classifier->c1); FREE(classifier->c2); FREE(classifier); vector_set(classifiers, i ,NULL); num_classifiers--; } else { hashtable_set(packed_classifiers, identifier->data, identifier->length, classifier); } string_free(identifier); } hashtable_free(packed_classifiers); } fprintf(output,"%d\n\n",num_classifiers); for(i=0; i<classifiers->length; i++) { weakclassifier_t* classifier=(weakclassifier_t*)vector_get(classifiers,i); if(classifier==NULL)continue; fprintf(output," %.12f Text:",classifier->alpha); if(classifier->type==CLASSIFIER_TYPE_THRESHOLD) { fprintf(output,"THRESHOLD:%s:\n\n",classifier->template->name->data); int l=0; for(l=0;l<classes->length;l++) fprintf(output,"%.10f ",classifier->c0[l]/classifier->alpha); fprintf(output,"\n\n"); for(l=0;l<classes->length;l++) fprintf(output,"%.10f ",classifier->c1[l]/classifier->alpha); fprintf(output,"\n\n"); for(l=0;l<classes->length;l++) fprintf(output,"%.10f ",classifier->c2[l]/classifier->alpha); fprintf(output,"\n\n"); fprintf(output,"%.10f\n\n\n",classifier->threshold); } else if(classifier->type==CLASSIFIER_TYPE_TEXT && classifier->template->type==FEATURE_TYPE_TEXT) { tokeninfo_t* tokeninfo=(tokeninfo_t*) vector_get(classifier->template->tokens,classifier->token); fprintf(output,"SGRAM:%s:%s\n\n",classifier->template->name->data,tokeninfo->key); int l=0; for(l=0;l<classes->length;l++) fprintf(output,"%.10f ",classifier->c1[l]/classifier->alpha); fprintf(output,"\n\n"); for(l=0;l<classes->length;l++) fprintf(output,"%.10f ",classifier->c2[l]/classifier->alpha); fprintf(output,"\n\n"); fprintf(output,"\n"); } else if(classifier->type==CLASSIFIER_TYPE_TEXT && classifier->template->type==FEATURE_TYPE_SET) { tokeninfo_t* tokeninfo=(tokeninfo_t*) vector_get(classifier->template->tokens,classifier->token); fprintf(output,"SGRAM:%s:%d\n\n",classifier->template->name->data,tokeninfo->id-1); // 0 is unknown (?), so skip it int l=0; for(l=0;l<classes->length;l++) fprintf(output,"%.10f ",classifier->c1[l]/classifier->alpha); fprintf(output,"\n\n"); for(l=0;l<classes->length;l++) fprintf(output,"%.10f ",classifier->c2[l]/classifier->alpha); fprintf(output,"\n\n"); fprintf(output,"\n"); } else die("unknown classifier type \"%d\"",classifier->type); } fclose(output);}void usage(char* program_name){ fprintf(stderr,"USAGE: %s [options] -S <stem>\n",program_name); fprintf(stderr," --version print version info\n"); fprintf(stderr," -S <stem> defines model/data/names stem\n"); fprintf(stderr," -n <iterations> number of boosting iterations\n"); fprintf(stderr," -E <smoothing> set smoothing value (default=0.5)\n"); fprintf(stderr," -V verbose mode\n"); fprintf(stderr," -C classification mode -- reads examples from <stdin>\n"); fprintf(stderr," -o long output in classification mode\n"); fprintf(stderr," --dryrun only parse the names file and the data file to check for errors\n"); fprintf(stderr," --cutoff <freq> ignore nominal features occuring unfrequently\n"); fprintf(stderr," --jobs <threads> number of threaded weak learners\n"); fprintf(stderr," --do-not-pack-model do not pack model (to get individual training steps)\n"); fprintf(stderr," --output-weights output training examples weights at each iteration\n"); fprintf(stderr," --model <model> save/load the model to/from this file instead of <stem>.shyp\n"); fprintf(stderr," --train <file> bypass the <stem>.data filename to specify training examples\n"); fprintf(stderr," --test <file> output additional error rate from an other file during training (can be used multiple times, not implemented)\n"); fprintf(stderr," --names <file> use this column description file instead of <stem>.names\n"); fprintf(stderr," --ignore <columns> ignore a comma separated list of columns (synonym with \"ignore\" in name file)\n"); exit(1);}void print_version(char* program_name){ fprintf(stdout,"%s v%s, Boosting decision stumps.\n", PACKAGE, VERSION); fprintf(stdout,"Written by Benoit Favre.\n\n"); fprintf(stdout,"Copyright (C) 2007 International Computer Science Institute.\n"); fprintf(stdout,"This is free software; see the source for copying conditions. There is NO\n"); fprintf(stdout,"warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n\n"); fprintf(stdout,"Build: %s at %s"#ifdef __VERSION__ ", gcc %s\n",#endif __DATE__,__TIME__,#ifdef __VERSION__ __VERSION__);#endif fprintf(stdout,"Subversion info:\n"); fprintf(stdout,"$URL: https://icsiboost.googlecode.com/svn/trunk/icsiboost/src/icsiboost.c $\n"); fprintf(stdout,"$Date: 2007-08-03 18:24:59 -0700 (Fri, 03 Aug 2007) $\n"); fprintf(stdout,"$Revision: 34 $\n"); fprintf(stdout,"$Author: benoit.favre $\n");}int main(int argc, char** argv){#ifdef DEBUG init_debugging(argv[0],DEBUG_NON_INTERACTIVE);#endif#ifdef USE_GC GC_INIT();#endif int maximum_iterations=10; int feature_count_cutoff=0; int classification_mode=0; int classification_output=0; int dryrun_mode=0; int pack_model=1; string_t* model_name=NULL; string_t* data_filename=NULL; string_t* names_filename=NULL; array_t* ignore_columns=NULL;#ifdef USE_THREADS
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -