📄 multiclass.c
字号:
exclude_cis, exclude_cis_size, exclude_cis_capacity); } return 0;}/* Greedily add classes to CIS in the order of their mixture weight. */intmulticlass_explore_cis_greedy1 (crossbow_doc *doc, multiclass_score *scores, int *scores_count, int scores_capacity, const int *exclude_cis, int exclude_cis_size, int exclude_cis_capacity){ int nc = crossbow_root->children_count; int nm = nc + 2; double *mixture; int *cis, cis_size, *local_exclude_cis, local_exclude_cis_size; int cisi, mi; double max_mixture; int max_mi = -1; int top_mi = -1; mixture = alloca ((nm) * sizeof(double)); cis = alloca ((nc+2) * sizeof(int)); local_exclude_cis = alloca (exclude_cis_capacity * sizeof(int)); for (cisi = 0; cisi < nc; cisi++) cis[cisi] = -1; cis_size = 0; local_exclude_cis_size = exclude_cis_size; for (cisi = 0; cisi < exclude_cis_size; cisi++) local_exclude_cis[cisi] = exclude_cis[cisi]; multiclass_mixture_given_doc (doc, mixture); for (cisi = 0; cisi < exclude_cis_size; cisi++) mixture[exclude_cis[cisi]] = -1; /* Exclude the root and uniform mixtures */ mixture[nc] = mixture[nc+1] = -1; while (cis_size < MAX_NUM_MIXTURE_CLASSES && cis_size < nc) { max_mixture = -1; max_mi = -1; for (mi = 0; mi < nc; mi++) /* Make sure we don't pick a class that has no training data */ if (mixture[mi] > max_mixture && crossbow_root->children[mi]->prior) { max_mixture = mixture[mi]; max_mi = mi; } /* If MAX_MI is still -1, then we didn't find any positive mixture weight; they were all excluded or zero from multiclass_mixture_given_doc(). Nothing more to be done; pop out of this loop. */ if (max_mi < 0) break; for (cisi = 0; cisi < cis_size; cisi++) assert (cis[cisi] != max_mi); if (cis_size == 0) top_mi = max_mi; assert (mixture[max_mi] > 0); /* Drive this mixture low so that it will never be max_mi again */ mixture[max_mi] = -1; assert (max_mi < crossbow_root->children_count); cis[cis_size++] = max_mi; qsort (cis, cis_size, sizeof (int), compare_ints); if (multiclass_cis_scores_index (cis, cis_size, scores, *scores_count) == -1 && !multiclass_artificially_prune_cis (cis, cis_size)) { for (cisi = 0; cisi < MAX_NUM_MIXTURE_CLASSES; cisi++) scores[*scores_count].c[cisi] = cis[cisi]; scores[*scores_count].score = multiclass_log_prob_of_classes_given_doc (cis, cis_size, doc); (*scores_count)++; assert (*scores_count < scores_capacity); } } if (exclude_cis_size < 5 && exclude_cis_size < exclude_cis_capacity) { assert (top_mi >= 0); local_exclude_cis[local_exclude_cis_size++] = top_mi; multiclass_explore_cis_greedy1 (doc, scores, scores_count, scores_capacity, local_exclude_cis, local_exclude_cis_size, exclude_cis_capacity); } return 0;}intmulticlass_explore_cis_greedy2 (crossbow_doc *doc, multiclass_score *scores, int *scores_count, int scores_capacity, const int *exclude_cis, int exclude_cis_size, int exclude_cis_capacity){ int nc = crossbow_root->children_count; int nm = nc + 2; double *mixture; int *cis, cis_size, *local_exclude_cis; int cisi, mi; double max_mixture; double max_mi; mixture = alloca ((nm) * sizeof(double)); cis = alloca ((nc+2) * sizeof(int)); local_exclude_cis = alloca ((nc+2) * sizeof(int)); for (cisi = 0; cisi < nc; cisi++) cis[cisi] = -1; cis_size = 0; /* Greedily add classes to the mixture one at a time. */ while (cis_size < MAX_NUM_MIXTURE_CLASSES) { multiclass_mixture_given_doc_and_partial_cis (doc, cis, cis_size, exclude_cis, exclude_cis_size, mixture); max_mixture = -FLT_MAX; max_mi = -1; for (mi = 0; mi < nm; mi++) if (mixture[mi] > max_mixture && mi < nc) { max_mixture = mixture[mi]; max_mi = mi; } assert (max_mi >= 0); cis[cis_size++] = max_mi; qsort (cis, cis_size, sizeof (int), compare_ints); if (multiclass_cis_scores_index (cis, cis_size, scores, *scores_count) == -1) { for (cisi = 0; cisi < MAX_NUM_MIXTURE_CLASSES; cisi++) scores[*scores_count].c[cisi] = cis[cisi]; scores[*scores_count].score = multiclass_log_prob_of_classes_given_doc (cis, cis_size, doc); (*scores_count)++; assert (*scores_count < scores_capacity); } } return 0;}intmulticlass_classify_doc (crossbow_doc *doc, int verbose, FILE *out){ int nc = crossbow_root->children_count; int *exclude_cis, exclude_cis_size, exclude_cis_capacity; int *cis, cis_size, cis_capacity, cisi, si, scores_count; int scores_capacity = nc * nc * nc/2; multiclass_score *scores, top_score; exclude_cis_capacity = nc+2; exclude_cis = alloca (exclude_cis_capacity * sizeof(int)); scores = bow_malloc (scores_capacity * sizeof (multiclass_score)); cis_capacity = nc; cis = alloca (nc * sizeof(int)); si = 0; for (cisi = 0; cisi < nc; cisi++) cis[cisi] = exclude_cis[cisi] = -1; exclude_cis_size = 0; cis_size = 0; scores_count = 0; multiclass_explore_cis_greedy0 (doc, scores, &scores_count, scores_capacity, cis, cis_size, cis_capacity, exclude_cis, exclude_cis_size, exclude_cis_capacity); multiclass_explore_cis_greedy1 (doc, scores, &scores_count, scores_capacity, exclude_cis, exclude_cis_size, exclude_cis_capacity); assert (scores_count < scores_capacity); qsort (scores, scores_count, sizeof (multiclass_score), compare_multiclass_scores); if (verbose) { double *mixture = alloca (MAX_NUM_MIXTURES * sizeof (double)); fprintf (out, "%s ", doc->filename); if (doc->cis_size >= 0) { for (cisi = 0; cisi < doc->cis_size-1; cisi++) fprintf (out, "%s,", bow_int2str (crossbow_classnames, doc->cis[cisi])); fprintf (out, "%s ", bow_int2str (crossbow_classnames, doc->cis[cisi])); } else fprintf (out, "<unknown> "); /* Artificially print only the top scores */ for (si = 0; si < 30; si++) { fprintf (out,"%s",bow_int2str(crossbow_classnames,scores[si].c[0])); for (cisi = 1; cisi < MAX_NUM_MIXTURE_CLASSES && scores[si].c[cisi] >= 0; cisi++) fprintf (out, ",%s", bow_int2str (crossbow_classnames, scores[si].c[cisi])); fprintf (out, ":%g ", scores[si].score); if (verbose < 2) break; } fprintf (out, "\n"); /* Print the mixture of the winner */ for (cis_size = 0; cis_size < MAX_NUM_MIXTURE_CLASSES; cis_size++) if (scores[0].c[cis_size] == -1) break; multiclass_mixture_given_cis (scores[0].c, cis_size, mixture); for (cisi = 0; cisi < cis_size+2; cisi++) fprintf (out, "%g ", mixture[cisi]); fprintf (out, "\n"); /* Print a message if the correct mixture was never even tested. */ if (multiclass_cis_scores_index (doc->cis, doc->cis_size, scores, scores_count) == -1) fprintf (out, "Correct class set was not explored in %d sets.\n", scores_count); } /* Return 1 iff the classification is completely correct. */ top_score = scores[0]; bow_free (scores); for (cisi = 0; cisi < crossbow_root->children_count; cisi++) { if (top_score.c[cisi] == -1 && doc->cis[cisi] == -1) return 1; if (top_score.c[cisi] != doc->cis[cisi]) return 0; } return 1;}intold2_multiclass_classify_doc (crossbow_doc *doc, int verbose, FILE *out){ int nc = crossbow_root->children_count; int c1, c2, c3, c4, c5, si, scores_count, cisi, correct; int this_cis[MAX_NUM_MIXTURE_CLASSES]; int scores_capacity = nc * nc * nc; multiclass_score *scores, top_score; /* xxx More than enough space */ scores = bow_malloc (scores_capacity * sizeof (multiclass_score)); si = 0; for (c1 = 0; c1 < nc; c1++) { this_cis[0] = scores[si].c[0] = c1; this_cis[1] = scores[si].c[1] = -1; this_cis[2] = scores[si].c[2] = -1; this_cis[3] = scores[si].c[3] = -1; this_cis[4] = scores[si].c[4] = -1; scores[si].score = multiclass_log_prob_of_classes_given_doc (this_cis, 1, doc); si++; } /* Sort the single-class results, and use these rankings to prune the number of class combinations we evaluate. */ qsort (scores, si, sizeof (multiclass_score), compare_multiclass_scores); for (c1 = 0; c1 < nc; c1++) for (c2 = c1+1; c2 < nc; c2++) { this_cis[0] = scores[si].c[0] = c1; this_cis[1] = scores[si].c[1] = c2; this_cis[2] = scores[si].c[2] = -1; this_cis[3] = scores[si].c[3] = -1; this_cis[4] = scores[si].c[4] = -1; if (multiclass_cis_is_in_top (this_cis, scores, MAX(nc/3,10))) scores[si].score = multiclass_log_prob_of_classes_given_doc (this_cis, 2, doc); else scores[si].score = -FLT_MAX; si++; } for (c1 = 0; c1 < nc; c1++) for (c2 = c1+1; c2 < nc; c2++) for (c3 = c2+1; c3 < nc; c3++) { this_cis[0] = scores[si].c[0] = c1; this_cis[1] = scores[si].c[1] = c2; this_cis[2] = scores[si].c[2] = c3; this_cis[3] = scores[si].c[3] = -1; this_cis[4] = scores[si].c[4] = -1; if (multiclass_cis_is_in_top (this_cis, scores, MAX(nc/3,10))) scores[si].score = multiclass_log_prob_of_classes_given_doc (this_cis, 3, doc); else scores[si].score = -FLT_MAX; si++; } if (MAX_NUM_MIXTURE_CLASSES < 4) goto done_scoring; for (c1 = 0; c1 < nc; c1++) for (c2 = c1+1; c2 < nc; c2++) for (c3 = c2+1; c3 < nc; c3++) for (c4 = c3+1; c4 < nc; c4++) { this_cis[0] = scores[si].c[0] = c1; this_cis[1] = scores[si].c[1] = c2; this_cis[2] = scores[si].c[2] = c3; this_cis[3] = scores[si].c[3] = c4; this_cis[4] = scores[si].c[4] = -1; if (multiclass_cis_is_in_top (this_cis, scores, MAX(nc/3,10))) scores[si].score = multiclass_log_prob_of_classes_given_doc (this_cis, 4, doc); else scores[si].score = -FLT_MAX; si++; } if (MAX_NUM_MIXTURE_CLASSES < 5) goto done_scoring; for (c1 = 0; c1 < nc; c1++) for (c2 = c1+1; c2 < nc; c2++) for (c3 = c2+1; c3 < nc; c3++) for (c4 = c3+1; c4 < nc; c4++) for (c5 = c4+1; c5 < nc; c5++) { this_cis[0] = scores[si].c[0] = c1; this_cis[1] = scores[si].c[1] = c2; this_cis[2] = scores[si].c[2] = c3; this_cis[3] = scores[si].c[3] = c4; this_cis[4] = scores[si].c[4] = c5; if (multiclass_cis_is_in_top (this_cis, scores, MAX(nc/3,10))) scores[si].score = multiclass_log_prob_of_classes_given_doc (this_cis, 5, doc); else scores[si].score = -FLT_MAX; si++; } done_scoring: scores_count = si; assert (scores_count < scores_capacity); qsort (scores, scores_count, sizeof (multiclass_score), compare_multiclass_scores); if (verbose) { fprintf (out, "%s ", doc->filename); if (doc->cis_size >= 0) { for (cisi = 0; cisi < doc->cis_size-1; cisi++) fprintf (out, "%s,", bow_int2str (crossbow_classnames, doc->cis[cisi])); fprintf (out, "%s ", bow_int2str (crossbow_classnames, doc->cis[cisi])); } else fprintf (out, "<unknown> "); /* Artificially print only the top 20 scores */ scores_count = 20; for (si = 0; si < scores_count; si++) { fprintf (out, "%s", bow_int2str (crossbow_classnames, scores[si].c[0])); if (scores[si].c[1] >= 0) fprintf (out, ",%s", bow_int2str (crossbow_classnames, scores[si].c[1])); if (scores[si].c[2] >= 0) fprintf (out, ",%s", bow_int2str (crossbow_classnames, scores[si].c[2])); fprintf (out, ":%g ", scores[si].score); if (verbose < 2) break; } fprintf (out, "\n"); } top_score = scores[0]; bow_free (scores); correct = 1; assert (doc->cis_size <= 3); if (top_score.c[0] != doc->cis[0]) return 0; if (doc->cis_size == 1) return 1; if (top_score.c[1] != doc->cis[1]) return 0; if (doc->cis_size == 2) return 1; if (top_score.c[2] != doc->cis[2]) return 0; return 1;}intmulticlass_old_classify_doc (crossbow_doc *doc, int verbose, FILE *out){ bow_wv *wv; int cisi, ni; double node_data_prob[crossbow_root->children_count]; double node_data_prob_sum; double node_membership[crossbow_root->children_count]; bow_score scores[crossbow_root->children_count]; double score_sum; int wvi; treenode *node; score_sum = 0; for (ni = 0; ni < crossbow_root->children_count; ni++) scores[ni].weight = 0; wv = crossbow_wv_at_di (doc->di); for (wvi = 0; wvi < wv->num_entries; wvi++) { node_data_prob_sum = 0; for (ni = 0; ni < crossbow_root->children_count; ni++) { node = crossbow_root->children[ni]; node_data_prob[ni] = /* node->prior * */ node->words[wv->entry[wvi].wi]; node_data_prob_sum += node_data_prob[ni]; } /* Skip over words with zero probability everywhere */ if (node_data_prob_sum == 0) continue; /* Normalize the node data probs, so they are membership probabilities, and increment the classification score of each node */ for (ni = 0; ni < crossbow_root->children_count; ni++) { node_membership[ni] = node_data_prob[ni] / node_data_prob_sum; scores[ni].weight += node_membership[ni] * wv->entry[wvi].count; score_sum += scores[ni].weight; } } /* Normalize the class scores, and fill in DI's */ for (ni = 0; ni < crossbow_root->children_count; ni++) { scores[ni].weight /= score_sum; scores[ni].di = ni; } bow_sort_scores (scores, crossbow_root->children_count); fprintf (out, "%s ", doc->filename); for (cisi = 0; cisi < doc->cis_size-1; cisi++) fprintf (out, "%s,", bow_int2str (crossbow_classnames, doc->cis[cisi])); fprintf (out, "%s ", bow_int2str (crossbow_classnames, doc->cis[cisi])); for (ni = 0; ni < crossbow_root->children_count; ni++) { if (scores[ni].weight <= 0) break; fprintf (out, "%s:%g ", bow_int2str (crossbow_classnames, scores[ni].di), scores[ni].weight); } fprintf (out, "\n"); /* Return non-zero if the top ranked class is among the CIS of this doc */ for (cisi = 0; cisi < doc->cis_size; cisi++) if (doc->cis[cisi] == scores[0].di) return 1; return 0;}crossbow_method multiclass_method ={ "multiclass", NULL, multiclass_train, NULL, multiclass_classify_doc};void _register_method_multiclass () __attribute__ ((constructor));void _register_method_multiclass (){ bow_method_register_with_name ((bow_method*)&multiclass_method, "multiclass", sizeof (crossbow_method), NULL);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -