📄 multiclass.c
字号:
estimate. */ for (cisi = 0; cisi < cis_size + 2; cisi++) mixture[cisi] = 1.0; normalizer = (cis_size + 2) * 1.0; for (cmi = 0; cmi < cm_length; cmi++) { for (cisi = cisi2 = 0; cisi2 < cm[cmi]->cis_size; cisi2++) { while (cisi < cisi2) cisi++; if (cm[cmi]->cis[cisi2] == cis[cisi]) { mixture[cisi] += cm[cmi]->cis[cisi2]; normalizer += mixture[cisi]; } } } for (cisi = 0; cisi < cis_size; cisi++) mixture[cisi] /= normalizer; return;#endif#if 0 /* Another (unused) option is to use a completely factored representation */ /* Calculcate normalized mixture weights just from the treenode priors, i.e., not using the CMIXTURE. These may not actually get used. */ /* Plus one for the root, plus one for the uniform */ double mixture_prior_sum; mixture_weights = alloca ((cis_size + 1 + 1) * sizeof (double)); mixture_prior_sum = 0; for (cisi = 0; cisi < cis_size; cisi++) { assert (cis[cisi] >= 0); mixture_prior_sum += crossbow_root->children[cis[cisi]]->prior; } mixture_prior_sum += crossbow_root->prior + multiclass_uniform_prior; for (cisi = 0; cisi < cis_size; cisi++) if (cis[cisi] >= 0) mixture_weights[cisi] = crossbow_root->children[cis[cisi]]->prior / mixture_prior_sum; mixture_weights[cis_size] = crossbow_root->prior / mixture_prior_sum; mixture_weights[cis_size+1] = multiclass_uniform_prior / mixture_prior_sum;#endif}/* MIXTURE must be as large as CIS_SIZE+2 */voidmulticlass_mixture_given_doc_and_cis (crossbow_doc *doc, int *cis, int cis_size, double *mixture){ bow_wv *wv; double *cis_mixture; double mixture_sum; treenode *node; int cisi, wvi; int num_nodes; double *node_data_prob; double node_data_prob_sum; double *node_membership; wv = crossbow_wv_at_di (doc->di); cis_mixture = alloca (sizeof (double) * (cis_size + 2)); multiclass_mixture_given_cis (cis, cis_size, cis_mixture); num_nodes = crossbow_root->children_count + 1 + 1; node_membership = alloca (num_nodes * sizeof (double)); node_data_prob = alloca (num_nodes * sizeof (double)); for (cisi = 0; cisi <= cis_size+1; cisi++) mixture[cisi] = 0; mixture_sum = 0; for (wvi = 0; wvi < wv->num_entries; wvi++) { /* Analagous to the per-word E-step */ node_data_prob_sum = 0; for (cisi = 0; cisi <= cis_size; cisi++) { if (cisi == cis_size) node = crossbow_root; else node = crossbow_root->children[cis[cisi]]; node_data_prob[cisi] = cis_mixture[cisi] * bow_treenode_pr_wi_loo_local(node,wv->entry[wvi].wi,doc->di,wvi); assert (node_data_prob[cisi] >= 0); node_data_prob_sum += node_data_prob[cisi]; } /* For the uniform distribution */ node_data_prob[cis_size+1] = cis_mixture[cis_size+1] * (1.0 / bow_num_words ()); assert (node_data_prob[cis_size+1] >= 0); node_data_prob_sum += node_data_prob[cis_size+1]; assert (node_data_prob_sum != 0); /* Normalize the node data probs, so they are membership probabilities. */ for (cisi = 0; cisi <= cis_size+1; cisi++) node_membership[cisi] = node_data_prob[cisi] / node_data_prob_sum; /* Analagous to the per-word M-step */ for (cisi = 0; cisi <= cis_size+1; cisi++) { mixture[cisi] += wv->entry[wvi].count * node_membership[cisi]; mixture_sum += mixture[cisi]; } } /* Normalize the mixture to be returned */ for (cisi = 0; cisi <= cis_size+1; cisi++) mixture[cisi] /= mixture_sum;}/* MIXTURE must be as large as CIS_SIZE+2 */voidmulticlass_iterated_mixture_given_doc_and_cis (crossbow_doc *doc, int *cis, int cis_size, double *mixture){ bow_wv *wv; double *cis_mixture; double mixture_sum; treenode *node; int cisi, wvi; int num_nodes; double *node_data_prob; double node_data_prob_sum; double *node_membership; //double pp, old_pp; wv = crossbow_wv_at_di (doc->di); cis_mixture = alloca (sizeof (double) * (cis_size + 2)); multiclass_mixture_given_cis (cis, cis_size, cis_mixture); num_nodes = crossbow_root->children_count + 1 + 1; node_membership = alloca (num_nodes * sizeof (double)); node_data_prob = alloca (num_nodes * sizeof (double)); for (cisi = 0; cisi <= cis_size+1; cisi++) mixture[cisi] = 0; mixture_sum = 0; for (wvi = 0; wvi < wv->num_entries; wvi++) { /* Analagous to the per-word E-step */ node_data_prob_sum = 0; for (cisi = 0; cisi <= cis_size; cisi++) { if (cisi == cis_size) node = crossbow_root; else node = crossbow_root->children[cis[cisi]]; node_data_prob[cisi] = cis_mixture[cisi] * bow_treenode_pr_wi_loo_local(node,wv->entry[wvi].wi,doc->di,wvi); assert (node_data_prob[cisi] >= 0); node_data_prob_sum += node_data_prob[cisi]; } /* For the uniform distribution */ node_data_prob[cis_size+1] = cis_mixture[cis_size+1] * (1.0 / bow_num_words ()); assert (node_data_prob[cis_size+1] >= 0); node_data_prob_sum += node_data_prob[cis_size+1]; assert (node_data_prob_sum != 0); /* Normalize the node data probs, so they are membership probabilities. */ for (cisi = 0; cisi <= cis_size+1; cisi++) node_membership[cisi] = node_data_prob[cisi] / node_data_prob_sum; /* Analagous to the per-word M-step */ for (cisi = 0; cisi <= cis_size+1; cisi++) { mixture[cisi] += wv->entry[wvi].count * node_membership[cisi]; mixture_sum += mixture[cisi]; } } /* Normalize the mixture to be returned */ for (cisi = 0; cisi <= cis_size+1; cisi++) mixture[cisi] /= mixture_sum;}/* MIXTURE must be as large as CROSSBOW_ROOT->CHILDREN_COUNT+2 */voidmulticlass_mixture_given_doc (crossbow_doc *doc, double *mixture){ int mixture_count = crossbow_root->children_count + 2; bow_wv *wv; double mixture_sum; treenode *node; int mi, wvi; double node_membership_sum; double *node_membership; wv = crossbow_wv_at_di (doc->di); node_membership = alloca (mixture_count * sizeof (double)); for (mi = 0; mi < mixture_count; mi++) mixture[mi] = 0; mixture_sum = 0; for (wvi = 0; wvi < wv->num_entries; wvi++) { /* Analagous to the per-word E-step */ node_membership_sum = 0; for (mi = 0; mi <= mixture_count-2; mi++) { if (mi == mixture_count-2) node = crossbow_root; else node = crossbow_root->children[mi]; if (doc->tag == bow_doc_train || doc->tag == bow_doc_unlabeled) node_membership[mi] = bow_treenode_pr_wi_loo_local (node,wv->entry[wvi].wi, doc->di,wvi); else node_membership[mi] = node->words[wv->entry[wvi].wi]; assert (node_membership[mi] >= 0); node_membership_sum += node_membership[mi]; } /* For the uniform distribution */ node_membership[mixture_count-1] = 1.0 / bow_num_words (); node_membership_sum += node_membership[mixture_count-1]; assert (node_membership_sum != 0); /* Normalize the node data probs, so they are membership probabilities. */ for (mi = 0; mi < mixture_count; mi++) node_membership[mi] = node_membership[mi] / node_membership_sum; /* Analagous to the per-word M-step */ for (mi = 0; mi < mixture_count; mi++) { mixture[mi] += wv->entry[wvi].count * node_membership[mi]; mixture_sum += mixture[mi]; } } /* Normalize the mixture to be returned */ assert (mixture_sum); for (mi = 0; mi < mixture_count; mi++) { mixture[mi] /= mixture_sum; //assert (mixture[mi] > 0); }}/* Return the most likely mixture over mixture components, assuming that we are already committed to including the classes in CIS, and that we probabilistically remove the words that they account for. MIXTURE must be as large as CROSSBOW_ROOT->CHILDREN_COUNT+2 */voidmulticlass_mixture_given_doc_and_partial_cis (crossbow_doc *doc, const int *cis, int cis_size, const int *exclude_cis, int exclude_cis_size, double *mixture){ int mixture_count = crossbow_root->children_count + 2; bow_wv *wv; double mixture_sum; treenode *node; int mi, wvi, cisi; double node_membership_sum; double *node_membership; double *node_word_prob; double average_word_prob_cis, incr; wv = crossbow_wv_at_di (doc->di); node_membership = alloca (mixture_count * sizeof (double)); node_word_prob = alloca (mixture_count * sizeof (double)); for (mi = 0; mi < mixture_count; mi++) mixture[mi] = 0; mixture_sum = 0; for (wvi = 0; wvi < wv->num_entries; wvi++) { /* Analagous to the per-word E-step */ node_membership_sum = 0; for (mi = 0; mi <= mixture_count-2; mi++) { if (mi == mixture_count-2) node = crossbow_root; else node = crossbow_root->children[mi]; node_word_prob[mi] = bow_treenode_pr_wi_loo_local (node,wv->entry[wvi].wi,doc->di,wvi); node_membership[mi] = node_word_prob[mi]; assert (node_membership[mi] >= 0); } /* For the uniform distribution */ node_membership[mixture_count-1] = 1.0 / bow_num_words (); /* Calculate the average word probability of the classes explicitly included with CIS, and the always-included root and uniform distribution. Zero the mixture probabilities for those mixtures. */ average_word_prob_cis = 0; for (cisi = 0; cisi < cis_size; cisi++) { average_word_prob_cis += node_membership[cis[cisi]]; node_membership[cis[cisi]] = 0; } average_word_prob_cis += node_membership[mixture_count-2]; node_membership[mixture_count-2] = 0; average_word_prob_cis += node_membership[mixture_count-1]; node_membership[mixture_count-1] = 0; average_word_prob_cis /= cis_size + 2; /* Zero the probabilities of the classes explicitly excluded */ for (cisi = 0; cisi < exclude_cis_size; cisi++) node_membership[exclude_cis[cisi]] = 0; /* Subtract the average */ for (mi = 0; mi < mixture_count; mi++) { node_membership[mi] -= average_word_prob_cis; if (node_membership[mi] < 0) node_membership[mi] = 0; node_membership_sum += node_membership[mi]; }#if 1 /* If any of the NODE_MEMBERSHIP's are non-zero, normalize the node data probs, so they are membership probabilities. */ if (node_membership_sum != 0) for (mi = 0; mi < mixture_count; mi++) node_membership[mi] = node_membership[mi] / node_membership_sum;#endif /* Analagous to the per-word M-step */ for (mi = 0; mi < mixture_count; mi++) { if (node_membership[mi] == 0) continue; incr= (wv->entry[wvi].count * node_membership[mi] * log (node_word_prob[mi]/average_word_prob_cis)); assert (incr >= 0); mixture[mi] += incr; mixture_sum += mixture[mi]; } } /* Normalize the mixture to be returned */ for (mi = 0; mi < mixture_count; mi++) mixture[mi] /= mixture_sum;}/* Return the perplexity */doublemulticlass_em_one_iteration (){ int di; crossbow_doc *doc; bow_wv *wv; treenode *node; int cisi, wvi; int num_nodes; double *node_word_prob, log_prob_of_data2; double node_membership_sum, word_prob, log_prob_of_data, deposit; int num_data_words = 0; /* the number of word occurrences */ double *node_membership; cmixture *m; int cis_size; double *mixture_all; /* One node for each topic, plus one for all-english, plus one for uniform */ num_nodes = crossbow_root->children_count + 1 + 1; node_membership = alloca (num_nodes * sizeof (double)); node_word_prob = alloca (num_nodes * sizeof (double)); mixture_all = alloca ((crossbow_root->children_count+2) * sizeof(double)); log_prob_of_data = log_prob_of_data2 = 0; for (di = 0; di < crossbow_docs->length; di++) { doc = bow_array_entry_at_index (crossbow_docs, di); if (doc->tag != bow_doc_train && doc->tag != bow_doc_unlabeled) continue; /* Temporary fix */ if (strstr (doc->filename, ".include") || strstr (doc->filename, ".exclude")) continue; multiclass_mixture_given_doc (doc, mixture_all); bow_verbosify (bow_verbose, "%s ", doc->filename); for (cisi = 0; cisi < crossbow_root->children_count+2; cisi++) { bow_verbosify (bow_verbose, "%s=%g,", (cisi < crossbow_root->children_count ? bow_int2str (crossbow_classnames, cisi) : (cisi == crossbow_root->children_count ? "root" : "uniform")), mixture_all[cisi]); } bow_verbosify (bow_verbose, "\n"); /* Get the word vector for this document, and for each word, estimate its membership probability in each of its classes (and the root class), and then gather stats for the M-step */ wv = crossbow_wv_at_di (di); m = cmixture_for_cis (doc->cis, doc->cis_size, 0, &cis_size); assert (m); assert (m->doc_count > 0); /* Zero the document-specific mixture in preparation for incrementing */ for (cisi = 0; cisi < cis_size + 2; cisi++) doc->cis_mixture[cisi] = 0; for (wvi = 0; wvi < wv->num_entries; wvi++) { num_data_words += wv->entry[wvi].count; /* Per-word E-step */ node_membership_sum = 0; for (cisi = 0; cisi <= doc->cis_size; cisi++) { if (cisi == doc->cis_size) node = crossbow_root; else node = crossbow_root->children[doc->cis[cisi]]; node_word_prob[cisi] = bow_treenode_pr_wi_loo_local (node, wv->entry[wvi].wi, di, wvi); node_membership[cisi] = m->m[cisi] * node_word_prob[cisi]; assert (node_word_prob[cisi] >= 0); node_membership_sum += node_membership[cisi]; } /* For the uniform distribution */ node_word_prob[doc->cis_size+1] = (1.0 / bow_num_words ()); node_membership[doc->cis_size+1] = m->m[doc->cis_size+1] * node_word_prob[doc->cis_size+1]; node_membership_sum += node_membership[doc->cis_size+1]; assert (node_membership_sum != 0); /* Normalize the node membership probs. Also increment perplexity */ word_prob = 0; for (cisi = 0; cisi <= doc->cis_size+1; cisi++) { node_membership[cisi] /= node_membership_sum; word_prob += node_membership[cisi] * node_word_prob[cisi]; if (node_membership[cisi]) log_prob_of_data2 += (node_membership[cisi] * wv->entry[wvi].count * log (node_word_prob[cisi])); } log_prob_of_data += wv->entry[wvi].count * log (word_prob); /* Per-word M-step */ for (cisi = 0; cisi <= doc->cis_size; cisi++) { if (cisi == doc->cis_size) node = crossbow_root; else node = crossbow_root->children[doc->cis[cisi]]; deposit = wv->entry[wvi].count * node_membership[cisi]; node->new_words[wv->entry[wvi].wi] += deposit; bow_treenode_add_new_loo_for_di_wvi (node, deposit, di, wvi, wv->num_entries, crossbow_docs->length); /* For non-combo version */ node->new_prior += deposit; /* For combo version */ m->new_m[cisi] += deposit; doc->cis_mixture[cisi] += deposit; } /* For the uniform distribution */ deposit = wv->entry[wvi].count * node_membership[doc->cis_size+1]; multiclass_uniform_new_prior += deposit; m->new_m[doc->cis_size+1] += deposit; doc->cis_mixture[cis_size+1] += deposit; } /* Normalize the document-specific CIS_MIXTURE, (and print it out) */ { double max = -FLT_MAX; double cis_mixture_sum; for (cisi = 0; cisi < cis_size+2; cisi++) if (doc->cis_mixture[cisi] > max) max = doc->cis_mixture[cisi]; cis_mixture_sum = 0;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -