📄 maxent.c
                                          query_wv, hits, max_ci, -1);
          assert (actual_num_hits == max_ci);
          for (ci = 0; ci < max_ci; ci++)
            total_count_per_ci[ci] += hits[ci].weight;

          /* Now loop over the words in the document and all the classes,
             adding the contribution to E[f_{w,c}]. */
          for (wvi = 0; wvi < query_wv->num_entries; wvi++)
            {
              wi = query_wv->entry[wvi].wi;
              for (ci = 0; ci < bow_barrel_num_classes (vpc_barrel); ci++)
                bow_wi2dvf_add_wi_di_count_weight
                  (&exp_wi2dvf, wi, ci, 1,
                   hits[ci].weight * query_wv->entry[wvi].weight);
            }
        }

      /* Now update the lambdas.  Ignore zero constraints? */
      for (wi = 0; wi < max_wi; wi++)
        {
          bow_dv *vpc_dv;
          bow_dv *constraint_dv;
          bow_dv *exp_dv;
          int exp_dvi = 0;

          vpc_dv = bow_wi2dvf_dv (vpc_barrel->wi2dvf, wi);
          constraint_dv = bow_wi2dvf_dv (constraint_wi2dvf, wi);
          exp_dv = bow_wi2dvf_dv (exp_wi2dvf, wi);

          /* The exp_dv can be NULL if we're using only some of the
             documents for the iteration step.  If there are no iteration
             docs that have this word, then we don't need to worry about
             its weight; leave it at zero. */
          if (!constraint_dv || !exp_dv)
            continue;

          /* The dvi goes over the constraint and the vpc; the constraint
             and vpc wi2dvf should have exactly corresponding entries.
             The exp wi2dvf can have a superset of the entries. */
          for (dvi = 0; dvi < vpc_dv->length; dvi++)
            {
              ci = vpc_dv->entry[dvi].di;

              /* Get the corresponding exp_dvi. */
              while (exp_dvi < exp_dv->length
                     && ci > exp_dv->entry[exp_dvi].di)
                exp_dvi++;
              assert (exp_dvi < exp_dv->length);
              assert (ci == constraint_dv->entry[dvi].di
                      && ci == exp_dv->entry[exp_dvi].di);

              /* Need to normalize this delta with M? */
#if 1
              if (exp_dv->entry[exp_dvi].weight == 0)
                assert (constraint_dv->entry[dvi].weight == 0);
              else
#endif
                {
                  double delta = 0;

                  if (maxent_gaussian_prior)
                    {
                      double variance = maxent_prior_variance;

                      if (maxent_prior_vary_by_count == 1)
                        variance = maxent_prior_variance
                          * log (1 + constraint_dv->entry[dvi].count);
                      else if (maxent_prior_vary_by_count == 2)
                        variance = maxent_prior_variance
                          * constraint_dv->entry[dvi].count;

                      newton_poly->entry[0].coeff =
                        -constraint_dv->entry[dvi].weight
                        + vpc_dv->entry[dvi].weight / variance;
                      newton_poly->entry[1].coeff =
                        exp_dv->entry[exp_dvi].weight / (double) num_tested;
                      newton_poly->entry[2].coeff = 1.0 / variance;
                      delta = maxent_newton (newton_poly);
                      delta = log (delta);
                    }
                  else
                    {
                      if (exp_dv->entry[exp_dvi].weight != 0)
                        delta = log (((double) num_tested)
                                     * constraint_dv->entry[dvi].weight
                                     / (exp_dv->entry[exp_dvi].weight))
                          / (double) bow_event_document_then_word_document_length;
                      else
                        delta = 0;

                      /* Check that delta is not NaN. */
                      assert (delta == delta);
                      assert (constraint_dv->entry[dvi].weight);
                    }

                  bow_wi2dvf_set_wi_di_count_weight
                    (&(vpc_barrel->wi2dvf), wi, ci,
                     vpc_dv->entry[dvi].count,
                     (vpc_dv->entry[dvi].weight + delta));
                }
            }
        }

      if (maxent_logprob_docs)
        {
          old_log_prob = new_log_prob;
          new_log_prob = maxent_calculate_accuracy (doc_barrel, vpc_barrel,
                                                    maxent_logprob_docs, 2);
          bow_verbosify (bow_progress, "Halting Log Prob: %f\n",
                         new_log_prob);
        }
      else if (maxent_halt_accuracy_docs)
        {
          old_accuracy = new_accuracy;
          new_accuracy = maxent_calculate_accuracy (doc_barrel, vpc_barrel,
                                                    maxent_halt_accuracy_docs,
                                                    1);
          bow_verbosify (bow_progress, "Halting Accuracy: %f\n",
                         new_accuracy);
        }

      bow_wi2dvf_free (exp_wi2dvf);
    }

  bow_free (newton_poly);
  bow_wi2dvf_free (constraint_wi2dvf);
  bow_maxent_model_building = 0;

#if 0
  if (maxent_print_lambdas)
    {
      bow_verbosify (bow_progress, "foo");
      for (ci = 0; ci < max_ci; ci++)
        bow_verbosify (bow_progress, " %s",
                       bow_barrel_classname_at_index (doc_barrel, ci));
      bow_verbosify (bow_progress, "\n");
      for (wi = 0; wi < max_wi; wi++)
        {
          bow_verbosify (bow_progress, "%s", bow_int2word (wi));
          dv = bow_wi2dvf_dv (vpc_barrel->wi2dvf, wi);
          dvi = 0;
          for (ci = 0; ci < max_ci; ci++)
            {
              while ((ci > dv->entry[dvi].di) && (dvi < dv->length))
                dvi++;
              if ((ci == dv->entry[dvi].di) && (dvi < dv->length))
                bow_verbosify (bow_progress, " %f", dv->entry[dvi].weight);
              else
                bow_verbosify (bow_progress, " 0");
            }
          bow_verbosify (bow_progress, "\n");
        }
    }
#endif

  return (vpc_barrel);
}
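/* A minimal sketch of the iterative-scaling update applied above, for
   illustration only; `empirical', `expected_sum', `num_tested', and `m'
   here are hypothetical stand-ins, not libbow identifiers.  Each lambda
   moves by (1/M) * log (empirical / expected), where the expected count
   is first rescaled from a sum over the `num_tested' documents to a
   per-document average, and M bounds the total feature count of any one
   document (the role played by
   bow_event_document_then_word_document_length above). */
static double
maxent_sketch_gis_delta (double empirical, double expected_sum,
                         int num_tested, double m)
{
  /* Mirror the zero-expectation case above: leave the lambda alone. */
  if (expected_sum == 0)
    return 0;
  return log (empirical / (expected_sum / (double) num_tested)) / m;
}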
bow_barrel *
bow_maxent_new_vpc_with_weights (bow_barrel *doc_barrel)
{
  bow_barrel *vpc_barrel;       /* the vector-per-class barrel */
  int wi;                       /* word index */
  int max_wi;                   /* max word index */
  int dvi;                      /* document vector index */
  int ci;                       /* class index */
  bow_dv *dv;                   /* document vector */
  int di;                       /* document index */
  bow_dv_heap *test_heap = NULL; /* we'll extract test WV's from here */
  bow_wv *query_wv;
  bow_score *hits;
  int actual_num_hits;
  bow_cdoc *doc_cdoc;
  bow_cdoc *cdoc;
  bow_wi2dvf *constraint_wi2dvf;
  int max_ci;
  int rounds = 0;
  int total_num_docs = 0;
  int **f_sharp;
  int max_f_sharp = 0;
  double *coefficients[200];
  bow_dv *doc_dv;
  bow_dv *constraint_dv;
  bow_dv *lambda_dv;
  int constraint_dvi;
  int doc_dvi;
  int fi;
  maxent_polynomial *newton_poly;
  double log_prob_model;
  double beta;
  double num_words_per_ci[200];
  int num_unique_words_per_ci[200];
  float old_log_prob = -FLT_MAX;
  float new_log_prob = -FLT_MAX / 2;
  float old_accuracy = -1;
  float new_accuracy = 0;

  if (bow_event_model == bow_event_document_then_word)
    return (bow_maxent_new_vpc_with_weights_doc_then_word (doc_barrel));

  bow_maxent_model_building = 1;

  /* Some sanity checks first. */
  assert (200 > bow_barrel_num_classes (doc_barrel));
  assert (doc_barrel->classnames);
  assert (bow_event_model == bow_event_word);
  assert (!maxent_words_per_class || !maxent_scoring_hack);
  assert (!(maxent_smooth_counts && maxent_gaussian_prior));
  assert (!maxent_words_per_class || !maxent_logprob_constraints);
  assert (!maxent_logprob_constraints);
  assert (!maxent_prior_vary_by_count);
  assert (!maxent_constraint_use_unlabeled);

  max_wi = MIN (doc_barrel->wi2dvf->size, bow_num_words ());
  max_ci = bow_barrel_num_classes (doc_barrel);

  f_sharp = bow_malloc (sizeof (int *) * doc_barrel->cdocs->length);
  for (di = 0; di < doc_barrel->cdocs->length; di++)
    f_sharp[di] = bow_malloc (sizeof (int) * max_ci);

  /* Initialize f_sharp. */
  for (di = 0; di < doc_barrel->cdocs->length; di++)
    for (ci = 0; ci < max_ci; ci++)
      f_sharp[di][ci] = 0;

  /* If we're doing log counts, set the document weights appropriately.
     Otherwise, set the weights to the counts for each document. */
  if (maxent_logprob_constraints)
    {
      for (wi = 0; wi < max_wi; wi++)
        {
          dv = bow_wi2dvf_dv (doc_barrel->wi2dvf, wi);
          if (dv == NULL)
            continue;
          for (dvi = 0; dvi < dv->length; dvi++)
            dv->entry[dvi].weight = log (dv->entry[dvi].count + 1);
        }
    }
  else
    {
      for (wi = 0; wi < max_wi; wi++)
        {
          dv = bow_wi2dvf_dv (doc_barrel->wi2dvf, wi);
          if (dv == NULL)
            continue;
          for (dvi = 0; dvi < dv->length; dvi++)
            dv->entry[dvi].weight = (float) dv->entry[dvi].count;
        }
    }
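  /* A minimal sketch (disabled, illustrative only) of the two weighting
     schemes chosen just above; `my_count' is a hypothetical stand-in for
     dv->entry[dvi].count.  With log-count constraints a word occurring c
     times in a document contributes log (1 + c) rather than c, damping
     heavy within-document repeats. */
#if 0
  {
    int my_count = 3;                        /* hypothetical word count */
    double raw_weight = (double) my_count;   /* 3.0 */
    double log_weight = log (my_count + 1);  /* log 4, about 1.386 */
  }
#endif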
  /* Get a barrel where the counts are set to word counts and the weights
     are set to normalized or unnormalized counts as appropriate for the
     event model. */
  vpc_barrel = bow_barrel_new_vpc (doc_barrel);

  /* If doing occurrence-count pruning of features, do that now. */
  if (maxent_prune_features_by_count)
    maxent_prune_features_by_occurrence_count (vpc_barrel,
                                               maxent_prune_features_by_count);

  /* Set the word count and normalizer of each class cdoc correctly.  Use
     the weight here, b/c maybe doing logprob_constraints.  The word
     counts and normalizer are used by mutual information feature
     pruning. */
  for (ci = 0; ci < max_ci; ci++)
    {
      num_words_per_ci[ci] = 0;
      num_unique_words_per_ci[ci] = 0;
    }
  for (wi = 0; wi < max_wi; wi++)
    {
      dv = bow_wi2dvf_dv (vpc_barrel->wi2dvf, wi);
      if (dv == NULL)
        continue;
      for (dvi = 0; dvi < dv->length; dvi++)
        {
          num_words_per_ci[dv->entry[dvi].di] += dv->entry[dvi].weight;
          num_unique_words_per_ci[dv->entry[dvi].di]++;
        }
    }
  for (ci = 0; ci < vpc_barrel->cdocs->length; ci++)
    {
      cdoc = bow_array_entry_at_index (vpc_barrel->cdocs, ci);
      cdoc->word_count = (int) rint (num_words_per_ci[ci]);
      cdoc->normalizer = num_unique_words_per_ci[ci];
    }

  /* If doing feature selection by mutual information, do that now.
     Ensure that cdoc->word_count is set correctly beforehand.  It should
     be OK to do both kinds of feature selection pruning. */
  if (maxent_words_per_class > 0)
    maxent_prune_vocab_by_mutual_information (vpc_barrel,
                                              maxent_words_per_class);

  /* Initialize cdoc->class_probs for all the docs, and initialize
     total_num_docs to the number of training docs. */
  for (di = 0; di < doc_barrel->cdocs->length; di++)
    {
      bow_cdoc *cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);
      double *double_class_probs;

      if (cdoc->type == bow_doc_train)
        total_num_docs++;
      if (!cdoc->class_probs)
        cdoc->class_probs = (float *) bow_malloc (sizeof (double) * max_ci);
      double_class_probs = (double *) cdoc->class_probs;

      /* Initialize the class_probs to all zeros. */
      for (ci = 0; ci < max_ci; ci++)
        double_class_probs[ci] = 0.0;
    }

  /* Set the constraint wi2dvf to be the (vpc weight / number of
     documents).  Re-initialize the vpc weights to 0 (initialize the
     lambdas to be zero). */
  constraint_wi2dvf = bow_wi2dvf_new (doc_barrel->wi2dvf->size);
  for (wi = 0; wi < max_wi; wi++)
    {
      dv = bow_wi2dvf_dv (vpc_barrel->wi2dvf, wi);
      if (!dv)
        continue;
      if (maxent_smooth_counts)
        {
          dvi = 0;
          for (ci = 0; ci < max_ci; ci++)
            {
              while (dv->entry[dvi].di < ci && dvi < dv->length)
                dvi++;
              /* Set constraint to the smoothed empirical average. */
              if (dvi < dv->length && dv->entry[dvi].di == ci)
                bow_wi2dvf_set_wi_di_count_weight
                  (&constraint_wi2dvf, wi, ci, dv->entry[dvi].count + 1,
                   (dv->entry[dvi].weight + 1.0) / (double) total_num_docs);
              else
                bow_wi2dvf_set_wi_di_count_weight
                  (&constraint_wi2dvf, wi, ci, 1,
                   1.0 / (double) total_num_docs);
              /* Initialize the lambda to 0. */
              bow_wi2dvf_set_wi_di_count_weight (&(vpc_barrel->wi2dvf),
                                                 wi, ci, 1, 0);
            }
        }
      else if (maxent_gaussian_prior)
        {
          dvi = 0;
          for (ci = 0; ci < max_ci; ci++)
            {
              while (dv->entry[dvi].di < ci && dvi < dv->length)
                dvi++;
              /* Set constraint to the empirical average. */
              if (dvi < dv->length && dv->entry[dvi].di == ci)
                {
                  bow_wi2dvf_set_wi_di_count_weight
                    (&constraint_wi2dvf, wi, ci, dv->entry[dvi].count,
                     dv->entry[dvi].weight / (double) total_num_docs);
                  /* Initialize the lambda to 0. */
                  bow_wi2dvf_set_wi_di_count_weight (&(vpc_barrel->wi2dvf),
                                                     wi, ci, 1, 0);
                }
              else if (maxent_gaussian_prior_zero_constraints)
                {
                  bow_wi2dvf_set_wi_di_count_weight (&constraint_wi2dvf,
                                                     wi, ci, 1, 0);
                  /* Initialize the lambda to 0. */
                  bow_wi2dvf_set_wi_di_count_weight (&(vpc_barrel->wi2dvf),
                                                     wi, ci, 1, 0);
                }
            }
        }
      else
        {
          for (dvi = 0; dvi < dv->length; dvi++)
            {
              ci = dv->entry[dvi].di;
              assert (dv->entry[dvi].weight > 0);
              /* Set constraint to the empirical average. */
              bow_wi2dvf_set_wi_di_count_weight
                (&constraint_wi2dvf, wi, ci, dv->entry[dvi].count,
                 dv->entry[dvi].weight / (double) total_num_docs);
              /* Initialize the lambda to 0. */
              bow_wi2dvf_set_wi_di_count_weight (&(vpc_barrel->wi2dvf),
                                                 wi, ci,
                                                 dv->entry[dvi].count, 0);
            }
        }
    }
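  /* A minimal sketch (disabled, illustrative only) of the constraint
     value set above; `class_word_weight' is a hypothetical stand-in for
     the per-class weight sum dv->entry[dvi].weight.  The constraint is
     the empirical per-document expectation of the feature, and the
     add-one smoothed form keeps a nonzero target for unseen word/class
     pairs. */
#if 0
  {
    double class_word_weight = 5.0;     /* hypothetical weight sum */
    double plain = class_word_weight / (double) total_num_docs;
    double smoothed = (class_word_weight + 1.0) / (double) total_num_docs;
  }
#endif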
  /* Set f_sharp of each document/class combination to be the sum of all
     the feature weights for that class for that doc.  Set max_f_sharp to
     be the maximum of all the f_sharp values.  Note that we're summing
     document word counts here, and not document word weights.  We'll
     have to do something more sneaky for logprob constraints when we
     implement it.  For now, though, this should be OK. */
  /* Walk the document wi2dvf with the constraint wi2dvf and increment. */
  for (wi = 0; wi < max_wi; wi++)
    {
      doc_dv = bow_wi2dvf_dv (doc_barrel->wi2dvf, wi);
      constraint_dv = bow_wi2dvf_dv (constraint_wi2dvf, wi);
      if (!constraint_dv || !doc_dv)
        continue;
      for (doc_dvi = 0; doc_dvi < doc_dv->length; doc_dvi++)
        for (constraint_dvi = 0; constraint_dvi < constraint_dv->length;
             constraint_dvi++)
          f_sharp[doc_dv->entry[doc_dvi].di][constraint_dv->entry[constraint_dvi].di] +=