maxent.c
          doc_dv->entry[doc_dvi].count;
    }

  /* Find the maximum f#, which bounds the degree of the update polynomial */
  for (di = 0; di < doc_barrel->cdocs->length; di++)
    {
      bow_cdoc *doc_cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);

      if (doc_cdoc->type != bow_doc_train)
        continue;
      for (ci = 0; ci < max_ci; ci++)
        if (f_sharp[di][ci] > max_f_sharp)
          max_f_sharp = f_sharp[di][ci];
    }
  max_f_sharp++;

  /* Allocate space for the coefficients */
  for (ci = 0; ci < max_ci; ci++)
    coefficients[ci] = bow_malloc (sizeof (double) * max_f_sharp);

  /* Initialize the coefficients */
  for (ci = 0; ci < max_ci; ci++)
    for (fi = 0; fi < max_f_sharp; fi++)
      coefficients[ci][fi] = 0.0;

  /* Initialize the newton_poly structure */
  newton_poly = bow_malloc (sizeof (maxent_polynomial)
                            + sizeof (maxent_coefficient) * max_f_sharp);
  newton_poly->size = max_f_sharp;

  /* Let's start some maximum entropy iteration */
  while (maxent_logprob_docs
         ? new_log_prob > old_log_prob
         : (maxent_halt_accuracy_docs
            ? new_accuracy > old_accuracy
            : rounds < maxent_num_iterations))
    {
      int num_tested;

      rounds++;

      /* Classify all the training documents, and store the class
         membership probs in each document's cdoc->class_probs. */
      query_wv = NULL;
      hits = alloca (sizeof (bow_score) * max_ci);
      num_tested = 0;
      test_heap = bow_test_new_heap (doc_barrel);
      log_prob_model = 0;

      /* Loop once for each training document, scoring it and
         recording its class probs. */
      while ((di = bow_heap_next_wv (test_heap, doc_barrel, &query_wv,
                                     maxent_iteration_docs))
             != -1)
        {
          double *double_class_probs;

          num_tested++;
          doc_cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);
          bow_wv_set_weights (query_wv, vpc_barrel);
          bow_wv_normalize_weights (query_wv, vpc_barrel);
          actual_num_hits = bow_barrel_score (vpc_barrel, query_wv, hits,
                                              max_ci, -1);
          assert (actual_num_hits == max_ci);
          log_prob_model += log (hits[doc_cdoc->class].weight);
          double_class_probs = (double *) doc_cdoc->class_probs;

          /* Record the scores in class_probs */
          for (ci = 0; ci < max_ci; ci++)
            {
              double_class_probs[ci] = hits[ci].weight;
              assert (double_class_probs[ci]);
            }
        }

      /* Report the training log prob and the accuracy on the
         accuracy-reporting documents */
      if (maxent_accuracy_docs)
        bow_verbosify (bow_progress,
                       "%4d Training Log Prob: %f Selected Correct: %f\n",
                       rounds, log_prob_model,
                       maxent_calculate_accuracy (doc_barrel, vpc_barrel,
                                                  maxent_accuracy_docs, 1));

      bow_verbosify (bow_progress, "Updating lambdas : ");

      /* Now calculate a new lambda for each word feature. */
      for (wi = 0; wi < max_wi; wi++)
        {
          constraint_dv = bow_wi2dvf_dv (constraint_wi2dvf, wi);
          if (wi % 100 == 0)
            bow_verbosify (bow_progress, "\b\b\b\b\b\b\b%7d", wi);
          if (!constraint_dv)
            continue;
          dv = bow_wi2dvf_dv (doc_barrel->wi2dvf, wi);
          lambda_dv = bow_wi2dvf_dv (vpc_barrel->wi2dvf, wi);
          assert (dv && lambda_dv);

          /* Collect the coefficients for all classes simultaneously */
          for (dvi = 0; dvi < dv->length; dvi++)
            {
              bow_cdoc *cdoc;
              double *double_class_probs;

              di = dv->entry[dvi].di;
              cdoc = bow_array_entry_at_index (doc_barrel->cdocs, di);
              if (cdoc->type != bow_doc_train)
                continue;
              double_class_probs = (double *) cdoc->class_probs;
              for (constraint_dvi = 0;
                   constraint_dvi < constraint_dv->length;
                   constraint_dvi++)
                {
                  ci = constraint_dv->entry[constraint_dvi].di;
                  coefficients[ci][f_sharp[di][ci]] +=
                    double_class_probs[ci] * dv->entry[dvi].weight
                    / (double) total_num_docs;
                }
            }

          /* Now update the lambdas */
          for (dvi = 0; dvi < constraint_dv->length; dvi++)
            {
              bow_cdoc *cdoc;

              /* Set the zeroth coefficient to -constraint */
              ci = constraint_dv->entry[dvi].di;
              cdoc = bow_array_entry_at_index (vpc_barrel->cdocs, ci);

              /* Skip classes for which there is no training data */
              if (!cdoc->word_count)
                continue;

              coefficients[ci][0] -= constraint_dv->entry[dvi].weight;

              /* Compile down the class coefficients into newton_poly */
              newton_poly->length = 0;
              for (fi = 0; fi < max_f_sharp; fi++)
                {
                  if (coefficients[ci][fi] != 0)
                    {
                      newton_poly->entry[newton_poly->length].coeff =
                        coefficients[ci][fi];
                      newton_poly->entry[newton_poly->length].power = fi;
                      newton_poly->length++;
                    }
                }
              assert (newton_poly->length > 1);

              if (maxent_gaussian_prior)
                {
                  newton_poly->entry[0].coeff +=
                    lambda_dv->entry[dvi].weight / maxent_prior_variance;
                  /* The prior term (power -1) is stored one entry past
                     `length', which is not incremented here. */
                  newton_poly->entry[newton_poly->length].power = -1;
                  newton_poly->entry[newton_poly->length].coeff =
                    1.0 / maxent_prior_variance;
                }

              /* Solve for beta using Newton's method on the
                 coefficients (see the sketch after this listing) */
              beta = maxent_newton (newton_poly);

              /* Update the right lambda */
              assert (lambda_dv->entry[dvi].di == ci);
              lambda_dv->entry[dvi].weight += log (beta);
              /* NaN check: a NaN never compares equal to itself */
              assert (lambda_dv->entry[dvi].weight
                      == lambda_dv->entry[dvi].weight);

              /* Clear out the coefficients used */
              for (fi = 0; fi < newton_poly->length; fi++)
                coefficients[ci][newton_poly->entry[fi].power] = 0.0;
            }
        }
      bow_verbosify (bow_progress, "\b\b\b\b\b\b\b%7d\n", wi);

      /* Calculate the new accuracy or log prob for the halting check */
      if (maxent_logprob_docs)
        {
          old_log_prob = new_log_prob;
          new_log_prob = maxent_calculate_accuracy (doc_barrel, vpc_barrel,
                                                    maxent_logprob_docs, 2);
          bow_verbosify (bow_progress, "Halting Log Prob: %f\n",
                         new_log_prob);
        }
      else if (maxent_halt_accuracy_docs)
        {
          old_accuracy = new_accuracy;
          new_accuracy = maxent_calculate_accuracy (doc_barrel, vpc_barrel,
                                                    maxent_halt_accuracy_docs,
                                                    1);
          bow_verbosify (bow_progress, "Halting Accuracy: %f\n",
                         new_accuracy);
        }
    }

  for (di = 0; di < doc_barrel->cdocs->length; di++)
    bow_free (f_sharp[di]);
  bow_wi2dvf_free (constraint_wi2dvf);
  bow_free (f_sharp);
  bow_free (newton_poly);
  for (ci = 0; ci < max_ci; ci++)
    bow_free (coefficients[ci]);

  bow_maxent_model_building = 0;
  return vpc_barrel;
}

int
bow_maxent_score (bow_barrel *barrel, bow_wv *query_wv,
                  bow_score *bscores, int bscores_len, int loo_class)
{
  double *scores;      /* will become prob(class), indexed over CI */
  int ci;              /* a "class index" (document index) */
  int wvi;             /* an index into the entries of QUERY_WV */
  int dvi;             /* an index into a "document vector" */
  double rescaler;     /* rescale SCORES by this after each word */
  int num_scores;      /* number of entries placed in BSCORES */
  int wi;              /* word index */
  int max_wi;
  int max_ci;

  assert (bow_event_model == bow_event_word
          || bow_event_model == bow_event_document_then_word);
  assert (loo_class == -1);
  assert (bscores_len <= bow_barrel_num_classes (barrel));
  assert (!bow_maxent_model_building
          || bow_barrel_num_classes (barrel) == bscores_len);

  /* Allocate space to store scores for *all* classes (documents) */
  scores = alloca (bow_barrel_num_classes (barrel) * sizeof (double));
  max_wi = MIN (barrel->wi2dvf->size, bow_num_words ());
  max_ci = bow_barrel_num_classes (barrel);

  /* Remove words not in the class barrel */
  bow_wv_prune_words_not_in_wi2dvf (query_wv, barrel->wi2dvf);

  /* Yipes, now we are doing this twice, but we have to make sure that
     it is re-done after any pruning of words in the QUERY_WV */
  bow_wv_set_weights (query_wv, barrel);
  bow_wv_normalize_weights (query_wv, barrel);

#if 0
  /* WhizBang: print the WV, just for debugging */
  bow_wv_fprintf (stderr, query_wv);
  fflush (stderr);
#endif

  /* Initialize the SCORES to 0 */
  for (ci = 0; ci < bow_barrel_num_classes (barrel); ci++)
    scores[ci] = 0;

  for (wvi = 0; wvi < query_wv->num_entries; wvi++)
    {
      bow_dv *dv;      /* the "document vector" for the word WI */

      wi = query_wv->entry[wvi].wi;
      dv = bow_wi2dvf_dv (barrel->wi2dvf, wi);

      /* If the model doesn't know about this word, skip it.  Is this
         really the right thing to do for maximum entropy? */
      if (!dv)
        continue;

      rescaler = DBL_MAX;

      if (maxent_scoring_hack)
        {
          /* Loop over all classes, using the lambda, or the naive
             Bayes probability if there is no lambda for this
             word/class pair. */
          dvi = 0;
          for (ci = 0; ci < max_ci; ci++)
            {
              /* Advance DVI to this class's entry, staying in bounds */
              while (dvi < dv->length && dv->entry[dvi].di < ci)
                dvi++;
              if (dvi < dv->length && dv->entry[dvi].di == ci)
                scores[ci] +=
                  dv->entry[dvi].weight * query_wv->entry[wvi].weight;
              else
                {
                  bow_cdoc *cdoc =
                    bow_array_entry_at_index (barrel->cdocs, ci);
                  if (cdoc->word_count && cdoc->normalizer)
                    scores[ci] +=
                      (1.0 / (double) (cdoc->word_count + cdoc->normalizer))
                      * query_wv->entry[wvi].weight;
                  else
                    scores[ci] -= 10;
                }
              /* Keep track of the minimum score updated for this word */
              if (rescaler > scores[ci])
                rescaler = scores[ci];
            }
        }
      else
        {
          /* Loop over all features using this word, putting this
             word's (WI's) contribution into SCORES. */
          for (dvi = 0; dvi < dv->length; dvi++)
            {
              double lambda;

              ci = dv->entry[dvi].di;
              lambda = dv->entry[dvi].weight;
              /* Update the score for this word/class combination */
              scores[ci] += lambda * query_wv->entry[wvi].weight;
              /* Keep track of the minimum score updated for this word */
              if (rescaler > scores[ci])
                rescaler = scores[ci];
            }
        }

      /* Loop over all classes, re-scaling SCORES so that they don't
         get so small that we lose floating-point resolution.  This
         scaling always keeps all SCORES positive. */
      if (rescaler < 0)
        {
          for (ci = 0; ci < barrel->cdocs->length; ci++)
            {
              /* Add to SCORES to bring them close to zero.  RESCALER
                 is expected to often be less than zero here. */
              /* xxx If this doesn't work, we could keep track of the
                 min and the max, and shift by their average. */
              scores[ci] += -rescaler;
              assert (scores[ci] > -DBL_MAX + 1.0e5
                      && scores[ci] < DBL_MAX - 1.0e5);
            }
        }
    }

  /* Rescale the SCORES one last time, this time making them all zero
     or negative, so that exp() will work well, especially around the
     higher-probability classes. */
  rescaler = -DBL_MAX;
  for (ci = 0; ci < barrel->cdocs->length; ci++)
    if (scores[ci] > rescaler)
      rescaler = scores[ci];
  /* RESCALER is now the maximum of the SCORES */
  for (ci = 0; ci < barrel->cdocs->length; ci++)
    scores[ci] -= rescaler;

  /* Use exp() on the SCORES */
  for (ci = 0; ci < barrel->cdocs->length; ci++)
    scores[ci] = exp (scores[ci]);

  /* Now normalize to make them probabilities */
  {
    double scores_sum = 0;
    for (ci = 0; ci < barrel->cdocs->length; ci++)
      scores_sum += scores[ci];
    for (ci = 0; ci < barrel->cdocs->length; ci++)
      {
        scores[ci] /= scores_sum;
        /* assert (scores[ci] > 0); */
      }
  }

  /* Return the SCORES by putting them (and the `class indices') into
     BSCORES in sorted order. */
  if (!bow_maxent_model_building)
    {
      num_scores = 0;
      for (ci = 0; ci < barrel->cdocs->length; ci++)
        {
          if (num_scores < bscores_len
              || bscores[num_scores - 1].weight < scores[ci])
            {
              /* We are going to put this score and CI into BSCORES
                 because either (1) there is empty space in BSCORES, or
                 (2) SCORES[CI] is larger than the smallest score
                 currently there. */
              int dsi;         /* an index into BSCORES */

              if (num_scores < bscores_len)
                num_scores++;
              dsi = num_scores - 1;
              /* Shift down all the entries that are smaller than
                 SCORES[CI] */
              for (; dsi > 0 && bscores[dsi - 1].weight < scores[ci]; dsi--)
                bscores[dsi] = bscores[dsi - 1];
              /* Insert the new score */
              bscores[dsi].weight = scores[ci];
              bscores[dsi].di = ci;
            }
        }
    }
  else
    {
      for (ci = 0; ci < barrel->cdocs->length; ci++)
        {
          bscores[ci].weight = scores[ci];
          bscores[ci].di = ci;
        }
      num_scores = bscores_len;
    }

  return num_scores;
}

rainbow_method bow_method_maxent =
{
  "maxent",
  NULL,                         /* no weight-setting function */
  NULL,                         /* no weight-scaling function */
  NULL,                         /* bow_barrel_normalize_weights_by_summing */
  bow_maxent_new_vpc_with_weights,
  bow_barrel_set_vpc_priors_by_counting,
  bow_maxent_score,
  bow_wv_set_weights_by_event_model,
  NULL,                         /* no need for extra wv weight normalization */
  bow_barrel_free,              /* is this really the right one? */
  NULL                          /* no parameters for the method */
};

void _register_method_maxent () __attribute__ ((constructor));
void
_register_method_maxent ()
{
  bow_method_register_with_name ((bow_method *) &bow_method_maxent,
                                 "maxent",
                                 sizeof (rainbow_method),
                                 &maxent_argp_child);
  bow_argp_add_child (&maxent_argp_child);
}
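
/* maxent_newton itself is not part of this excerpt.  For reference
   only, here is a minimal sketch of a Newton's-method root finder
   over the sparse (coeff, power) polynomial built above: it solves
   sum_j coeff_j * beta^power_j = 0 for beta > 0, starting from
   beta = 1 (i.e. delta = 0, since lambda is updated by log(beta)).
   The names newton_solve and coef_t are illustrative, not the
   library's; the real routine must additionally handle the
   power == -1 term appended under the Gaussian prior, bracketing,
   and convergence failures. */

#include <math.h>

typedef struct
{
  double coeff;
  int power;
} coef_t;                      /* hypothetical stand-in for maxent_coefficient */

static double
newton_solve (const coef_t *entry, int length)
{
  double beta = 1.0;           /* beta = exp(delta); start at delta = 0 */
  int iter, j;

  for (iter = 0; iter < 100; iter++)
    {
      double g = 0.0;          /* polynomial value at beta */
      double dg = 0.0;         /* its derivative at beta */

      for (j = 0; j < length; j++)
        {
          g += entry[j].coeff * pow (beta, entry[j].power);
          dg += entry[j].coeff * entry[j].power
                * pow (beta, entry[j].power - 1);
        }
      if (fabs (g) < 1e-10 || dg == 0.0)
        break;
      beta -= g / dg;          /* Newton step; assumes beta stays positive */
    }
  return beta;
}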
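
/* The tail of bow_maxent_score above is a max-shifted softmax:
   subtract the maximum score before exp() so the exponentials cannot
   overflow, then normalize to probabilities.  A self-contained sketch
   of just that normalization (the name softmax_in_place is
   illustrative): */

static void
softmax_in_place (double *scores, int n)
{
  double max = scores[0];
  double sum = 0.0;
  int i;

  for (i = 1; i < n; i++)      /* find the maximum score */
    if (scores[i] > max)
      max = scores[i];
  for (i = 0; i < n; i++)
    {
      scores[i] = exp (scores[i] - max);  /* now every argument is <= 0 */
      sum += scores[i];
    }
  for (i = 0; i < n; i++)      /* normalize to probabilities */
    scores[i] /= sum;
}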