📄 maxent.c
字号:
bow_wi2dvf_hide_wi (barrel->wi2dvf, wi); else dv->length = new_dvi; } for (ci = 0; ci < max_ci; ci++) bow_free (mis[ci]); bow_free (mis); return;}/* Newton's method. max_fi is one more than the index of the largest coefficient given */doublemaxent_newton (maxent_polynomial *poly){ double root; double fun_val = 100; double deriv_val; double low = 0; double high = 1000000; double dxold = 1000000; double dx = 1000000; static double epsilon=1.0E-8; int fi; int rounds = 0; /* initial guess of root */ root = 1.0001; while (fabs(fun_val) > epsilon) { /* calculate function at new root */ fun_val = 0.0; for (fi = 0; fi < poly->length; fi++) fun_val += poly->entry[fi].coeff * pow (root, poly->entry[fi].power); if (maxent_gaussian_prior) fun_val += poly->entry[poly->length].coeff * log (root); /* calculate derivative at root */ deriv_val = 0.0; for (fi = 1; fi < poly->length; fi++) deriv_val += poly->entry[fi].power * poly->entry[fi].coeff * pow (root, poly->entry[fi].power - 1); if (maxent_gaussian_prior) deriv_val += poly->entry[poly->length].coeff / root; assert (fun_val == fun_val && deriv_val == deriv_val); if (fun_val < 0) low = root; else high = root; dxold = dx; /* if fun_val is infinity, bisect the region. otherwise, use newton. */ if (fun_val == HUGE_VAL || 2 * fabs(fun_val) > fabs(dxold * deriv_val)) { dx = (high - low) / 2.0; root = low + dx; } else { dx = fun_val / deriv_val; root -= dx; /* if newton thinks to go outside the region, bisect the region */ if (root < low || root > high) { dx = (high - low) / 2.0; root = low + dx; } } rounds++; if (rounds > 100) bow_error ("Newton's method did not converge.\n"); } return (root);}/* Calculate the accuracy or the model prob of the barrel on a set of docs. accuracy_or_logprob 1 for accuracy, else for logprob */floatmaxent_calculate_accuracy (bow_barrel *doc_barrel, bow_barrel *class_barrel, int (*test_docs)(bow_cdoc *), int accuracy_or_logprob){ bow_dv_heap *test_heap; /* we'll extract test WV's from here */ bow_wv *query_wv; int di; /* a document index */ bow_score *hits; int num_hits_to_retrieve = bow_barrel_num_classes (class_barrel); int actual_num_hits; bow_cdoc *doc_cdoc; int num_tested = 0; int num_correct = 0; int old_model_building = 0; double log_prob = 0; if (accuracy_or_logprob == 1) { old_model_building = bow_maxent_model_building; bow_maxent_model_building = 0; } /* Create the heap from which we'll get WV's. Initialize QUERY_WV so BOW_TEST_NEXT_WV() knows not to try to free. */ hits = alloca (sizeof (bow_score) * num_hits_to_retrieve); test_heap = bow_test_new_heap (doc_barrel); query_wv = NULL; /* Loop once for each test document. */ while ((di = bow_heap_next_wv (test_heap, doc_barrel, &query_wv, test_docs)) != -1) { doc_cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di); bow_wv_set_weights (query_wv, class_barrel); bow_wv_normalize_weights (query_wv, class_barrel); actual_num_hits = bow_barrel_score (class_barrel, query_wv, hits, num_hits_to_retrieve, -1); assert (actual_num_hits == num_hits_to_retrieve); if (doc_cdoc->class == hits[0].di) num_correct++; log_prob += log(hits[doc_cdoc->class].weight); num_tested++; } if (accuracy_or_logprob == 1) { bow_maxent_model_building = old_model_building; return (((float) num_correct) / ((float) num_tested)); } else return (log_prob);}bow_barrel *bow_maxent_new_vpc_with_weights_doc_then_word (bow_barrel *doc_barrel){ bow_barrel *vpc_barrel; /* the vector-per-class barrel */ int wi; /* word index */ int wvi; int max_wi; /* max word index */ int dvi; /* document vector index */ int ci; /* class index */ bow_dv *dv; /* document vector */ int di; /* document index */ bow_dv_heap *test_heap=NULL; /* we'll extract test WV's from here */ bow_wv *query_wv; bow_score *hits; int actual_num_hits; bow_cdoc *doc_cdoc; bow_wi2dvf *constraint_wi2dvf; int max_ci; int rounds = 0; double total_count_per_ci[200]; int total_num_docs = 0; float old_log_prob = -FLT_MAX; float new_log_prob = -FLT_MAX / 2; float old_accuracy = -1; float new_accuracy = 0; maxent_polynomial *newton_poly; int num_unlabeled = 0; int *unlabeled_dis = NULL; bow_maxent_model_building = 1; /* some sanity checks first */ assert (200 > bow_barrel_num_classes(doc_barrel)); assert (doc_barrel->classnames); assert (bow_event_model == bow_event_document_then_word); assert (!maxent_scoring_hack); assert (!maxent_logprob_constraints); assert (!maxent_words_per_class); assert (!maxent_prune_features_by_count); max_wi = MIN (doc_barrel->wi2dvf->size, bow_num_words ()); max_ci = bow_barrel_num_classes (doc_barrel); newton_poly = bow_malloc (sizeof (maxent_polynomial) + sizeof (maxent_coefficient) * 3); newton_poly->size = 3; newton_poly->length = 2; newton_poly->entry[0].power = 0; newton_poly->entry[1].power = bow_event_document_then_word_document_length; newton_poly->entry[2].power = -1; /* if we're using unlabeled data to set the constraints, then we need to temporarily convert these into training documents. */ if (maxent_constraint_use_unlabeled) { unlabeled_dis = bow_malloc (sizeof (int) * doc_barrel->cdocs->length); for (di = 0; di < doc_barrel->cdocs->length; di++) { bow_cdoc *cdoc = bow_array_entry_at_index(doc_barrel->cdocs, di); if (cdoc->type == bow_doc_unlabeled) { unlabeled_dis[num_unlabeled] = di; num_unlabeled++; cdoc->type = bow_doc_train; } } } /* get a barrel where the weights and counts are set to word counts based on the labeled data only */ vpc_barrel = bow_barrel_new_vpc (doc_barrel); /* switch back the unlabeled documents to have their original tag */ if (maxent_constraint_use_unlabeled) { int ui; for (ui = 0; ui < num_unlabeled; ui++) { bow_cdoc *cdoc = bow_array_entry_at_index(doc_barrel->cdocs, unlabeled_dis[ui]); cdoc->type = bow_doc_unlabeled; } bow_free (unlabeled_dis); } /* re-initialize the weights to 0. Set the constraint wi2dvf to the counts. */ for (ci = 0; ci < max_ci; ci++) { bow_cdoc *cdoc = bow_array_entry_at_index (vpc_barrel->cdocs, ci); total_num_docs += cdoc->word_count; } /* Count how many training documents there are. Exclude documents that have no features, as we need to ignore them for doc_then_word */ query_wv = NULL; test_heap = bow_test_new_heap (doc_barrel); /* Iterate over each document. */ while (-1 != (di = bow_heap_next_wv (test_heap, doc_barrel, &query_wv, bow_cdoc_is_train))) { if (query_wv->num_entries == 0) total_num_docs--; } constraint_wi2dvf = bow_wi2dvf_new (doc_barrel->wi2dvf->size); for (wi = 0; wi < max_wi; wi++) { dv = bow_wi2dvf_dv (vpc_barrel->wi2dvf, wi); if (!dv) continue; if (maxent_smooth_counts) { dvi = 0; for (ci = 0; ci < max_ci; ci++) { while (dv->entry[dvi].di < ci && dvi < dv->length) dvi++; /* set contraint to smoothed empirical average */ if (dvi < dv->length && dv->entry[dvi].di == ci) bow_wi2dvf_set_wi_di_count_weight(&constraint_wi2dvf, wi, ci, dv->entry[dvi].count + 1, (dv->entry[dvi].weight + 1.0) / (double) total_num_docs); else bow_wi2dvf_set_wi_di_count_weight(&constraint_wi2dvf, wi, ci, 1, 1.0 / (double) total_num_docs); /* initialize the lambda to 0 */ bow_wi2dvf_set_wi_di_count_weight (&(vpc_barrel->wi2dvf), wi, ci, 1, 0); } } else if (maxent_gaussian_prior) { dvi = 0; for (ci = 0; ci < max_ci; ci++) { while (dv->entry[dvi].di < ci && dvi < dv->length) dvi++; /* set contraint to smoothed empirical average */ if (dvi < dv->length && dv->entry[dvi].di == ci) { bow_wi2dvf_set_wi_di_count_weight(&constraint_wi2dvf, wi, ci, dv->entry[dvi].count, dv->entry[dvi].weight / (double) total_num_docs); /* initialize the lambda to 0 */ bow_wi2dvf_set_wi_di_count_weight (&(vpc_barrel->wi2dvf), wi, ci, 1, 0); } else if (maxent_gaussian_prior_zero_constraints) { bow_wi2dvf_set_wi_di_count_weight(&constraint_wi2dvf, wi, ci, 1, 0); /* initialize the lambda to 0 */ bow_wi2dvf_set_wi_di_count_weight (&(vpc_barrel->wi2dvf), wi, ci, 1, 0); } } } else { for (dvi = 0; dvi < dv->length; dvi++) { bow_cdoc *cdoc; ci = dv->entry[dvi].di; cdoc = bow_array_entry_at_index (vpc_barrel->cdocs, ci); assert (dv->entry[dvi].weight > 0); /* */ bow_wi2dvf_set_wi_di_count_weight(&constraint_wi2dvf, wi, ci, dv->entry[dvi].count, dv->entry[dvi].weight / (double) total_num_docs); bow_wi2dvf_set_wi_di_count_weight (&(vpc_barrel->wi2dvf), wi, ci, dv->entry[dvi].count, 0); } } }#if 0 if (maxent_print_constraints) { bow_verbosify (bow_progress, "foo"); for (ci = 0; ci < max_ci; ci++) bow_verbosify (bow_progress, " %s", bow_barrel_classname_at_index (doc_barrel, ci)); bow_verbosify (bow_progress, "\n"); for (wi = 0; wi < max_wi; wi++) { bow_verbosify (bow_progress, "%s", bow_int2word (wi)); dv = bow_wi2dvf_dv (constraint_wi2dvf, wi); dvi = 0; for (ci = 0; ci < max_ci; ci++) { while ((ci > dv->entry[dvi].di) && (dvi < dv->length)) dvi++; if ((ci == dv->entry[dvi].di) && (dvi < dv->length)) bow_verbosify (bow_progress, " %f", dv->entry[dvi].weight); else bow_verbosify (bow_progress, " 0"); } bow_verbosify (bow_progress, "\n"); } }#endif /* Lets start some maximum entropy iteration */ while (maxent_logprob_docs ? new_log_prob > old_log_prob : (maxent_halt_accuracy_docs ? new_accuracy > old_accuracy : rounds < maxent_num_iterations)) { bow_wi2dvf *exp_wi2dvf = bow_wi2dvf_new (doc_barrel->wi2dvf->size); int num_tested = 0; for (ci = 0; ci < max_ci; ci++) total_count_per_ci[ci] = 0.0; rounds++; /* classify all the documents, and put their contribution into the different lambdas */ query_wv = NULL; hits = alloca (sizeof (bow_score) * max_ci); num_tested = 0; test_heap = bow_test_new_heap (doc_barrel); /* Calculate accuracy of the validation set for halting check */ if (maxent_accuracy_docs) bow_verbosify (bow_progress, "%4d Correct: %f\n", rounds, maxent_calculate_accuracy(doc_barrel, vpc_barrel, maxent_accuracy_docs, 1)); /* Loop once for each training document, scoring it and recording its contribution to all the E[f_{w,c}] */ while ((di = bow_heap_next_wv (test_heap, doc_barrel, &query_wv, maxent_iteration_docs)) != -1) { doc_cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di); bow_wv_set_weights (query_wv, vpc_barrel); bow_wv_normalize_weights (query_wv, vpc_barrel); // skip documents with no words if (query_wv->num_entries == 0) continue; num_tested++; actual_num_hits = bow_barrel_score (vpc_barrel,
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -