📄 active.c
字号:
} } if (active_binary_pos_ci == -1) { bow_error("No such class %s.", active_binary_pos_classname); } } /* print out the model docs */ for (di=0; di < doc_barrel->cdocs->length; di++) { bow_cdoc *cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di); if (cdoc->type == bow_doc_train) bow_verbosify (bow_progress, "Initial %s\n", cdoc->filename); } /* count the number of unlabeled docs */ for (di=0; di < doc_barrel->cdocs->length; di++) { bow_cdoc *cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di); if (cdoc->type == bow_doc_unlabeled) num_unlabeled_docs++; } orig_num_unlabeled_docs = num_unlabeled_docs; /* allocate the correct amount of space for unlabeled scoring */ scores = bow_malloc (sizeof(active_scores) * num_unlabeled_docs); for (di = 0; di < num_unlabeled_docs; di++) { scores[di].scores = bow_malloc (sizeof(bow_score *) * active_committee_size); for (mi = 0; mi < active_committee_size; mi++) { scores[di].scores[mi] = bow_malloc (sizeof (bow_score) * bow_barrel_num_classes(doc_barrel)); } } /* make the class barrel */ vpc_barrel = bow_barrel_new (doc_barrel->wi2dvf->size, doc_barrel->cdocs->length-1, doc_barrel->cdocs->entry_size, doc_barrel->cdocs->free_func); vpc_barrel->method = doc_barrel->method; vpc_barrel->classnames = bow_int4str_new (0); /* And, we're off */ for (round_num = 0; round_num < active_num_rounds; round_num++) { int hiti = 0; /* Re-create the vector-per-class barrel in accordance with the new train/test settings. */ /* if we're pruning the vocab, do that now - fix for unlabeled percent */ if (bow_prune_vocab_by_infogain_n) { /* Change barrel by removing words with small info gain, if requested. */ bow_barrel_keep_top_words_by_infogain (bow_prune_vocab_by_infogain_n, doc_barrel, bow_barrel_num_classes (doc_barrel)); } /* Set word_count set correctly and set the entropy of each document in the normalizer of the cdoc; do this after all vocab changing */ { query_wv = NULL; /* Create the heap from which we'll get WV's. */ test_heap = bow_test_new_heap (doc_barrel); /* Loop once for each document. */ while ((di = bow_heap_next_wv (test_heap, doc_barrel, &query_wv, bow_cdoc_yes)) != -1) { int word_count = 0; int wvi; doc_cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di); bow_wv_set_weights (query_wv, vpc_barrel); bow_wv_normalize_weights (query_wv, vpc_barrel); for (wvi = 0; wvi < query_wv->num_entries; wvi++) { word_count += query_wv->entry[wvi].count; } doc_cdoc->word_count = word_count; doc_cdoc->normalizer = active_document_entropy(query_wv); } } /* generate test stats for the step in active learning */ if (active_test_stats) { bow_em_perturb_method reset_perturb_start = -1; int reset_num_em_runs = -1; int reset_em_cross_entropy = -1; /* turn variance off for test stats */ if (bow_em_perturb_starting_point != bow_em_perturb_none) { reset_perturb_start = bow_em_perturb_starting_point; bow_em_perturb_starting_point = bow_em_perturb_none; } /* make a real number of EM rounds just for printing tests */ if (active_final_em) { reset_num_em_runs = bow_em_num_em_runs; bow_em_num_em_runs = 7; } /* Do no EM for stats-reporting */ if (active_no_final_em) { reset_num_em_runs = bow_em_num_em_runs; bow_em_num_em_runs = 1; } /* turn cross entropy off if just testing docs */ if (active_selection_method == wkl || active_selection_method == dkl) { reset_em_cross_entropy = em_cross_entropy; em_cross_entropy = 0; } if (vpc_barrel != NULL) bow_free_barrel (vpc_barrel); vpc_barrel = (*(secondary_method->vpc_with_weights))(doc_barrel); active_test(stdout, doc_barrel, vpc_barrel); /* turn variance back on for committee members */ if (reset_perturb_start != -1) bow_em_perturb_starting_point = reset_perturb_start; /* turn EM off for committee members */ if (reset_num_em_runs != -1) bow_em_num_em_runs = reset_num_em_runs; /* turn cross entropy back on if necessary */ if (reset_em_cross_entropy != -1) em_cross_entropy = reset_em_cross_entropy; } if (active_perturb_after_em) { if (vpc_barrel) bow_barrel_free(vpc_barrel); vpc_barrel = (*(secondary_method->vpc_with_weights))(doc_barrel); } for (mi = 0; mi < active_committee_size; mi++) { bow_barrel *comm_barrel = NULL; hiti = 0; if (active_perturb_after_em) { comm_barrel = bow_barrel_copy(vpc_barrel); bow_em_perturb_starting_point = bow_em_perturb_with_dirichlet; bow_em_perturb_weights(comm_barrel, doc_barrel); bow_em_perturb_starting_point = bow_em_perturb_none; } else { comm_barrel = (*(secondary_method->vpc_with_weights))(doc_barrel); } if (active_print_committee_matrices) { active_test(stdout, doc_barrel, comm_barrel); } /* score all the unlabeled docs */ /* Create the heap from which we'll get WV's. */ test_heap = bow_test_new_heap (doc_barrel); /* Initialize QUERY_WV so BOW_TEST_NEXT_WV() knows not to try to free */ query_wv = NULL; /* Loop once for each unlabeled document. */ while ((di = bow_heap_next_wv (test_heap, doc_barrel, &query_wv, bow_cdoc_is_unlabeled)) != -1) { doc_cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di); class_cdoc = bow_array_entry_at_index (comm_barrel->cdocs, doc_cdoc->class); bow_wv_set_weights (query_wv, comm_barrel); bow_wv_normalize_weights (query_wv, comm_barrel); actual_num_hits = bow_barrel_score (comm_barrel, query_wv, scores[hiti].scores[mi], bow_barrel_num_classes(comm_barrel), -1); assert (actual_num_hits == bow_barrel_num_classes(comm_barrel)); if (mi == 0) scores[hiti].di = di; else assert (di == scores[hiti].di); hiti++; } bow_barrel_free (comm_barrel); } num_unlabeled_docs = hiti; /* remap the scores if desired */ if (active_remap_scores_pr) active_remap_scores(doc_barrel, scores, num_unlabeled_docs, active_committee_size); /* choose docs to convert to model */ active_select_docs(doc_barrel, scores, active_add_per_round, num_unlabeled_docs, active_committee_size); } /* turn off perturbing for building final barrel */ if (bow_em_perturb_starting_point != bow_em_perturb_none) { bow_em_perturb_starting_point = bow_em_perturb_none; } /* make a real number of EM rounds if final em run */ if (active_final_em) { bow_em_num_em_runs = 7; } /* Do no EM for stats-reporting */ if (active_no_final_em) { bow_em_num_em_runs = 1; } /* turn cross entropy off if just testing docs */ if (active_selection_method == wkl || active_selection_method == dkl) { em_cross_entropy = 0; } if (vpc_barrel != NULL) bow_free_barrel(vpc_barrel); vpc_barrel = (*(secondary_method->vpc_with_weights))(doc_barrel); /* free scores */ for (di = 0; di < orig_num_unlabeled_docs ; di++) { for (mi = 0; mi < active_committee_size; mi ++) { bow_free (scores[di].scores[mi]); } bow_free(scores[di].scores); } bow_free(scores); return vpc_barrel;}voidactive_undef_prior (bow_barrel *vpc_barrel, bow_barrel *doc_barrel){ bow_error("Active priors depends on secondary method."); return;}intactive_undef_score (bow_barrel *barrel, bow_wv *query_wv, bow_score *bscores, int bscores_len, int loo_class){ bow_error("Active scoring depends on secondary method."); return -1;}/* Run test trials, outputing results to TEST_FP. The results are indended to be read and processed by the Perl script ./rainbow-stats. */voidactive_test (FILE *test_fp, bow_barrel *rainbow_doc_barrel, bow_barrel *rainbow_class_barrel){ bow_dv_heap *test_heap; /* we'll extract test WV's from here */ bow_wv *query_wv; int di; /* a document index */ bow_score *hits = NULL; int num_hits_to_retrieve=0; int actual_num_hits; int hi; /* hit index */ bow_cdoc *doc_cdoc; bow_cdoc *class_cdoc; fprintf (test_fp, "#0\n"); num_hits_to_retrieve = bow_barrel_num_classes (rainbow_class_barrel); hits = alloca (sizeof (bow_score) * num_hits_to_retrieve); /* Create the heap from which we'll get WV's. */ test_heap = bow_test_new_heap (rainbow_doc_barrel); /* Initialize QUERY_WV so BOW_TEST_NEXT_WV() knows not to try to free */ query_wv = NULL; /* Loop once for each test document. NOTE: This will skip documents that don't have any words that are in the vocabulary. */ while ((di = bow_heap_next_wv (test_heap, rainbow_doc_barrel, &query_wv, bow_cdoc_is_test)) != -1) { doc_cdoc = bow_array_entry_at_index (rainbow_doc_barrel->cdocs, di); class_cdoc = bow_array_entry_at_index (rainbow_class_barrel->cdocs, doc_cdoc->class); bow_wv_set_weights (query_wv, rainbow_class_barrel); bow_wv_normalize_weights (query_wv, rainbow_class_barrel); actual_num_hits = bow_barrel_score (rainbow_class_barrel, query_wv, hits, num_hits_to_retrieve, -1); assert (actual_num_hits == num_hits_to_retrieve);#if 0 printf ("%8.6f %d %8.6f %8.6f %d ", class_cdoc->normalizer, class_cdoc->word_count, class_cdoc->normalizer / class_cdoc->word_count, class_cdoc->prior, doc_cdoc->class); if (hits[0].di == doc_cdoc->class) printf ("1\n"); else printf ("0\n");#endif fprintf (test_fp, "%s %s ", doc_cdoc->filename, filename_to_classname(class_cdoc->filename)); for (hi = 0; hi < actual_num_hits; hi++) { class_cdoc = bow_array_entry_at_index (rainbow_class_barrel->cdocs, hits[hi].di); fprintf (test_fp, "%s:%.*g ", bow_barrel_classname_at_index (rainbow_class_barrel, hits[hi].di), bow_score_print_precision, hits[hi].weight); } fprintf (test_fp, "\n"); }}rainbow_method bow_method_active = { "active", NULL, /* bow_leave_weights_alone_since_theyre_really_counts */ 0, /* no weight scaling function */ NULL, /* bow_barrel_normalize_weights_by_summing, */ active_learn, active_undef_prior, active_undef_score, bow_wv_set_weights_to_count, NULL, /* no need for extra weight normalization */ NULL };void _register_method_active () __attribute__ ((constructor));void _register_method_active (){ static int done = 0; if (done) return; bow_method_register_with_name ((bow_method*)&bow_method_active, "active", sizeof (rainbow_method), &active_argp_child); bow_argp_add_child (&active_argp_child); done = 1;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -