multiclass.c

来自「卡内基梅隆大学McCallum开发的文本分类系统」· C语言 代码 · 共 1,954 行 · 第 1/4 页

C
1,954
字号
     estimate. */
  /* NOTE(review): this is the tail of a function whose head (and the
     matching #if for the #endif below) is outside this view.  It smooths
     MIXTURE with a +1 prior over CIS_SIZE+2 components, then folds in
     counts from the CM array of cmixtures. */
  for (cisi = 0; cisi < cis_size + 2; cisi++)
    mixture[cisi] = 1.0;
  normalizer = (cis_size + 2) * 1.0;
  for (cmi = 0; cmi < cm_length; cmi++)
    {
      for (cisi = cisi2 = 0; cisi2 < cm[cmi]->cis_size; cisi2++)
	{
	  /* Advance CISI to keep pace with CISI2 (both lists appear to be
	     index-aligned; confirm CIS and CM[]->CIS are sorted alike). */
	  while (cisi < cisi2)
	    cisi++;
	  if (cm[cmi]->cis[cisi2] == cis[cisi])
	    {
	      /* NOTE(review): this adds the class *index* value
		 cm[cmi]->cis[cisi2] into the mixture weight, not a count
		 or probability — looks like a bug; confirm against the
		 original sources.  Also NORMALIZER accumulates the
		 running value of mixture[cisi], not the increment. */
	      mixture[cisi] += cm[cmi]->cis[cisi2];
	      normalizer += mixture[cisi];
	    }
	}
    }
  for (cisi = 0; cisi < cis_size; cisi++)
    mixture[cisi] /= normalizer;
  return;
#endif
#if 0
  /* Another (unused) option is to use a completely factored
     representation */
  /* Calculate normalized mixture weights just from the treenode priors,
     i.e., not using the CMIXTURE.  These may not actually get used.  */
  /* Plus one for the root, plus one for the uniform */
  double mixture_prior_sum;
  mixture_weights = alloca ((cis_size + 1 + 1) * sizeof (double));
  mixture_prior_sum = 0;
  for (cisi = 0; cisi < cis_size; cisi++)
    {
      assert (cis[cisi] >= 0);
      mixture_prior_sum += crossbow_root->children[cis[cisi]]->prior;
    }
  mixture_prior_sum += crossbow_root->prior + multiclass_uniform_prior;
  for (cisi = 0; cisi < cis_size; cisi++)
    if (cis[cisi] >= 0)
      mixture_weights[cisi] =
	crossbow_root->children[cis[cisi]]->prior / mixture_prior_sum;
  mixture_weights[cis_size] = crossbow_root->prior / mixture_prior_sum;
  mixture_weights[cis_size+1] = multiclass_uniform_prior / mixture_prior_sum;
#endif
}

/* Estimate DOC's posterior mixture weights over the classes listed in
   CIS, plus the root node and a uniform word distribution, using one
   E-step/M-step pass over the document's words.  Entry CISI of MIXTURE
   corresponds to class CIS[CISI] for CISI < CIS_SIZE; entry CIS_SIZE is
   the root; entry CIS_SIZE+1 is the uniform distribution.
   MIXTURE must be as large as CIS_SIZE+2 */
void
multiclass_mixture_given_doc_and_cis (crossbow_doc *doc,
				      int *cis, int cis_size,
				      double *mixture)
{
  bow_wv *wv;
  double *cis_mixture;		/* prior mixture from the cmixtures */
  double mixture_sum;		/* normalizer for the returned MIXTURE */
  treenode *node;
  int cisi, wvi;
  int num_nodes;
  double *node_data_prob;	/* per-word unnormalized memberships */
  double node_data_prob_sum;
  double *node_membership;	/* per-word normalized memberships */

  wv = crossbow_wv_at_di (doc->di);
  cis_mixture = alloca (sizeof (double) * (cis_size + 2));
  multiclass_mixture_given_cis (cis, cis_size, cis_mixture);
  /* One slot per child of the root, plus the root, plus the uniform.
     Assumes CIS_SIZE <= CROSSBOW_ROOT->CHILDREN_COUNT so the arrays
     below are large enough — TODO confirm that invariant. */
  num_nodes = crossbow_root->children_count + 1 + 1;
  node_membership = alloca (num_nodes * sizeof (double));
  node_data_prob = alloca (num_nodes * sizeof (double));
  for (cisi = 0; cisi <= cis_size+1; cisi++)
    mixture[cisi] = 0;
  mixture_sum = 0;
  for (wvi = 0; wvi < wv->num_entries; wvi++)
    {
      /* Analogous to the per-word E-step: score this word under each
	 class in CIS and under the root, weighted by the prior mixture.
	 The leave-one-out estimate excludes this document's own counts. */
      node_data_prob_sum = 0;
      for (cisi = 0; cisi <= cis_size; cisi++)
	{
	  if (cisi == cis_size)
	    node = crossbow_root;
	  else
	    node = crossbow_root->children[cis[cisi]];
	  node_data_prob[cisi] = cis_mixture[cisi] *
	    bow_treenode_pr_wi_loo_local(node,wv->entry[wvi].wi,doc->di,wvi);
	  assert (node_data_prob[cisi] >= 0);
	  node_data_prob_sum += node_data_prob[cisi];
	}
      /* For the uniform distribution */
      node_data_prob[cis_size+1] = cis_mixture[cis_size+1] *
	(1.0 / bow_num_words ());
      assert (node_data_prob[cis_size+1] >= 0);
      node_data_prob_sum += node_data_prob[cis_size+1];
      assert (node_data_prob_sum != 0);
      /* Normalize the node data probs, so they are membership
	 probabilities. */
      for (cisi = 0; cisi <= cis_size+1; cisi++)
	node_membership[cisi] =
	  node_data_prob[cisi] / node_data_prob_sum;
      /* Analogous to the per-word M-step: deposit this word's count,
	 split by membership, into the accumulating mixture. */
      for (cisi = 0; cisi <= cis_size+1; cisi++)
	{
	  mixture[cisi] += wv->entry[wvi].count * node_membership[cisi];
	  /* NOTE(review): this adds the *running total* mixture[cisi]
	     each word, not just this word's increment, so MIXTURE_SUM
	     is not the plain sum of MIXTURE and the normalization below
	     does not make MIXTURE sum to 1 — confirm intended. */
	  mixture_sum += mixture[cisi];
	}
    }
  /* Normalize the mixture to be returned */
  for (cisi = 0; cisi <= cis_size+1; cisi++)
    mixture[cisi] /= mixture_sum;
}

/* Same contract as multiclass_mixture_given_doc_and_cis above.
   NOTE(review): despite the name, the body is currently identical to
   the non-iterated version — the iteration loop (see the commented-out
   pp/old_pp perplexity locals) appears to be unimplemented.
   MIXTURE must be as large as CIS_SIZE+2 */
void
multiclass_iterated_mixture_given_doc_and_cis (crossbow_doc *doc,
					       int *cis, int cis_size,
					       double *mixture)
{
  bow_wv *wv;
  double *cis_mixture;
  double mixture_sum;
  treenode *node;
  int cisi, wvi;
  int num_nodes;
  double *node_data_prob;
  double node_data_prob_sum;
  double *node_membership;
  //double pp, old_pp;

  wv = crossbow_wv_at_di (doc->di);
  cis_mixture = alloca (sizeof (double) * (cis_size + 2));
  multiclass_mixture_given_cis (cis, cis_size, cis_mixture);
  /* One slot per root child, plus the root, plus the uniform. */
  num_nodes = crossbow_root->children_count + 1 + 1;
  node_membership = alloca (num_nodes * sizeof (double));
  node_data_prob = alloca (num_nodes * sizeof (double));
  for (cisi = 0; cisi <= cis_size+1; cisi++)
    mixture[cisi] = 0;
  mixture_sum = 0;
  for (wvi = 0; wvi < wv->num_entries; wvi++)
    {
      /* Analogous to the per-word E-step */
      node_data_prob_sum = 0;
      for (cisi = 0; cisi <= cis_size; cisi++)
	{
	  if (cisi == cis_size)
	    node = crossbow_root;
	  else
	    node = crossbow_root->children[cis[cisi]];
	  node_data_prob[cisi] = cis_mixture[cisi] *
	    bow_treenode_pr_wi_loo_local(node,wv->entry[wvi].wi,doc->di,wvi);
	  assert (node_data_prob[cisi] >= 0);
	  node_data_prob_sum += node_data_prob[cisi];
	}
      /* For the uniform distribution */
      node_data_prob[cis_size+1] = cis_mixture[cis_size+1] *
	(1.0 / bow_num_words ());
      assert (node_data_prob[cis_size+1] >= 0);
      node_data_prob_sum += node_data_prob[cis_size+1];
      assert (node_data_prob_sum != 0);
      /* Normalize the node data probs, so they are membership
	 probabilities. */
      for (cisi = 0; cisi <= cis_size+1; cisi++)
	node_membership[cisi] =
	  node_data_prob[cisi] / node_data_prob_sum;
      /* Analogous to the per-word M-step */
      for (cisi = 0; cisi <= cis_size+1; cisi++)
	{
	  mixture[cisi] += wv->entry[wvi].count * node_membership[cisi];
	  /* NOTE(review): accumulates the running total, as in the
	     non-iterated version above — confirm intended. */
	  mixture_sum += mixture[cisi];
	}
    }
  /* Normalize the mixture to be returned */
  for (cisi = 0; cisi <= cis_size+1; cisi++)
    mixture[cisi] /= mixture_sum;
}

/* Estimate DOC's posterior mixture weights over ALL of the root's
   children, plus the root and a uniform distribution, with one E/M pass
   over the document's words.  Uses leave-one-out word probabilities for
   train/unlabeled documents and plain smoothed estimates otherwise.
   MIXTURE must be as large as CROSSBOW_ROOT->CHILDREN_COUNT+2 */
void
multiclass_mixture_given_doc (crossbow_doc *doc,
			      double *mixture)
{
  /* Children + root + uniform. */
  int mixture_count = crossbow_root->children_count + 2;
  bow_wv *wv;
  double mixture_sum;
  treenode *node;
  int mi, wvi;
  double node_membership_sum;
  double *node_membership;

  wv = crossbow_wv_at_di (doc->di);
  node_membership = alloca (mixture_count * sizeof (double));
  for (mi = 0; mi < mixture_count; mi++)
    mixture[mi] = 0;
  mixture_sum = 0;
  for (wvi = 0; wvi < wv->num_entries; wvi++)
    {
      /* Analogous to the per-word E-step */
      node_membership_sum = 0;
      for (mi = 0; mi <= mixture_count-2; mi++)
	{
	  /* Slot MIXTURE_COUNT-2 is the root; 0..MIXTURE_COUNT-3 are
	     its children in index order. */
	  if (mi == mixture_count-2)
	    node = crossbow_root;
	  else
	    node = crossbow_root->children[mi];
	  /* Leave-one-out only for documents that contributed counts
	     to the model (train/unlabeled); otherwise read the word
	     distribution directly. */
	  if (doc->tag == bow_doc_train || doc->tag == bow_doc_unlabeled)
	    node_membership[mi] =
	      bow_treenode_pr_wi_loo_local (node,wv->entry[wvi].wi,
					    doc->di,wvi);
	  else
	    node_membership[mi] = node->words[wv->entry[wvi].wi];
	  assert (node_membership[mi] >= 0);
	  node_membership_sum += node_membership[mi];
	}
      /* For the uniform distribution */
      node_membership[mixture_count-1] = 1.0 / bow_num_words ();
      node_membership_sum += node_membership[mixture_count-1];
      assert (node_membership_sum != 0);
      /* Normalize the node data probs, so they are membership
	 probabilities. */
      for (mi = 0; mi < mixture_count; mi++)
	node_membership[mi] = node_membership[mi] / node_membership_sum;
      /* Analogous to the per-word M-step */
      for (mi = 0; mi < mixture_count; mi++)
	{
	  mixture[mi] += wv->entry[wvi].count * node_membership[mi];
	  /* NOTE(review): running-total accumulation, as in the
	     functions above — confirm intended. */
	  mixture_sum += mixture[mi];
	}
    }
  /* Normalize the mixture to be returned */
  assert (mixture_sum);
  for (mi = 0; mi < mixture_count; mi++)
    {
      mixture[mi] /= mixture_sum;
      //assert (mixture[mi] > 0);
    }
}

/* Return the most likely mixture over mixture components, assuming
   that we are already committed to including the classes in CIS, and
   that we probabilistically remove the words that they account for.
   EXCLUDE_CIS lists classes whose membership is forced to zero.
   MIXTURE must be as large as CROSSBOW_ROOT->CHILDREN_COUNT+2 */
void
multiclass_mixture_given_doc_and_partial_cis (crossbow_doc *doc,
					      const int *cis, int cis_size,
					      const int *exclude_cis,
					      int exclude_cis_size,
					      double *mixture)
{
  /* Children + root + uniform. */
  int mixture_count = crossbow_root->children_count + 2;
  bow_wv *wv;
  double mixture_sum;
  treenode *node;
  int mi, wvi, cisi;
  double node_membership_sum;
  double *node_membership;
  double *node_word_prob;	/* raw LOO word probs, kept for the log-ratio */
  double average_word_prob_cis, incr;

  wv = crossbow_wv_at_di (doc->di);
  node_membership = alloca (mixture_count * sizeof (double));
  node_word_prob = alloca (mixture_count * sizeof (double));
  for (mi = 0; mi < mixture_count; mi++)
    mixture[mi] = 0;
  mixture_sum = 0;
  for (wvi = 0; wvi < wv->num_entries; wvi++)
    {
      /* Analogous to the per-word E-step.
	 NOTE(review): NODE_MEMBERSHIP_SUM is only reset here and first
	 incremented after the subtract-the-average loop below — verify
	 it is not read before then. */
      node_membership_sum = 0;
      for (mi = 0; mi <= mixture_count-2; mi++)
	{
	  if (mi == mixture_count-2)
	    node = crossbow_root;
	  else
	    node = crossbow_root->children[mi];
	  node_word_prob[mi] =
	    bow_treenode_pr_wi_loo_local (node,wv->entry[wvi].wi,doc->di,wvi);
	  node_membership[mi] = node_word_prob[mi];
	  assert (node_membership[mi] >= 0);
	}
      /* For the uniform distribution */
      node_membership[mixture_count-1] = 1.0 / bow_num_words ();
      /* Calculate the average word probability of the classes
	 explicitly included with CIS, and the always-included root
	 and uniform distribution.  Zero the mixture probabilities for
	 those mixtures. */
      average_word_prob_cis = 0;
      for (cisi = 0; cisi < cis_size; cisi++)
	{
	  average_word_prob_cis += node_membership[cis[cisi]];
	  node_membership[cis[cisi]] = 0;
	}
      average_word_prob_cis += node_membership[mixture_count-2];
      node_membership[mixture_count-2] = 0;
      average_word_prob_cis += node_membership[mixture_count-1];
      node_membership[mixture_count-1] = 0;
      average_word_prob_cis /= cis_size + 2;
      /* Zero the probabilities of the classes explicitly excluded */
      for (cisi = 0; cisi < exclude_cis_size; cisi++)
	node_membership[exclude_cis[cisi]] = 0;
      /* Subtract the average; clamp at zero so only classes that explain
	 this word better than the committed classes survive. */
      for (mi = 0; mi < mixture_count; mi++)
	{
	  node_membership[mi] -= average_word_prob_cis;
	  if (node_membership[mi] < 0)
	    node_membership[mi] = 0;
	  node_membership_sum += node_membership[mi];
	}
#if 1
      /* If any of the NODE_MEMBERSHIP's are non-zero, normalize the
	 node data probs, so they are membership probabilities. */
      if (node_membership_sum != 0)
	for (mi = 0; mi < mixture_count; mi++)
	  node_membership[mi] = node_membership[mi] / node_membership_sum;
#endif
      /* Analogous to the per-word M-step: weight by the log-odds of the
	 class's word prob against the committed-class average. */
      for (mi = 0; mi < mixture_count; mi++)
	{
	  if (node_membership[mi] == 0)
	    continue;
	  incr= (wv->entry[wvi].count * node_membership[mi]
		 * log (node_word_prob[mi]/average_word_prob_cis));
	  assert (incr >= 0);
	  mixture[mi] += incr;
	  /* NOTE(review): running-total accumulation again — confirm. */
	  mixture_sum += mixture[mi];
	}
    }
  /* Normalize the mixture to be returned */
  for (mi = 0; mi < mixture_count; mi++)
    mixture[mi] /= mixture_sum;
}

/* One EM iteration over all train/unlabeled documents: E-step word
   memberships per document, M-step deposits into the treenodes'
   NEW_WORDS/NEW_PRIOR and the cmixtures' NEW_M.
   Return the perplexity.
   NOTE(review): this function continues on the next page of this
   extract and is truncated here. */
double
multiclass_em_one_iteration ()
{
  int di;
  crossbow_doc *doc;
  bow_wv *wv;
  treenode *node;
  int cisi, wvi;
  int num_nodes;
  double *node_word_prob, log_prob_of_data2;
  double node_membership_sum, word_prob, log_prob_of_data, deposit;
  int num_data_words = 0;	/* the number of word occurrences */
  double *node_membership;
  cmixture *m;
  int cis_size;
  double *mixture_all;

  /* One node for each topic, plus one for all-english, plus one for uniform */
  num_nodes = crossbow_root->children_count + 1 + 1;
  node_membership = alloca (num_nodes * sizeof (double));
  node_word_prob = alloca (num_nodes * sizeof (double));
  mixture_all = alloca ((crossbow_root->children_count+2) * sizeof(double));
  log_prob_of_data = log_prob_of_data2 = 0;
  for (di = 0; di < crossbow_docs->length; di++)
    {
      doc = bow_array_entry_at_index (crossbow_docs, di);
      /* Only train/unlabeled docs contribute to the EM statistics. */
      if (doc->tag != bow_doc_train && doc->tag != bow_doc_unlabeled)
	continue;
      /* Temporary fix */
      if (strstr (doc->filename, ".include")
	  || strstr (doc->filename, ".exclude"))
	continue;
      /* Diagnostic: print the unconstrained mixture for this doc. */
      multiclass_mixture_given_doc (doc, mixture_all);
      bow_verbosify (bow_verbose, "%s ", doc->filename);
      for (cisi = 0; cisi < crossbow_root->children_count+2; cisi++)
	{
	  bow_verbosify (bow_verbose, "%s=%g,",
			 (cisi < crossbow_root->children_count
			  ? bow_int2str (crossbow_classnames, cisi)
			  : (cisi == crossbow_root->children_count
			     ? "root"
			     : "uniform")),
			 mixture_all[cisi]);
	}
      bow_verbosify (bow_verbose, "\n");
      /* Get the word vector for this document, and for each word,
         estimate its membership probability in each of its classes
         (and the root class), and then gather stats for the M-step */
      wv = crossbow_wv_at_di (di);
      m = cmixture_for_cis (doc->cis, doc->cis_size, 0, &cis_size);
      assert (m);
      assert (m->doc_count > 0);
      /* Zero the document-specific mixture in preparation for incrementing */
      for (cisi = 0; cisi < cis_size + 2; cisi++)
	doc->cis_mixture[cisi] = 0;
      for (wvi = 0; wvi < wv->num_entries; wvi++)
	{
	  num_data_words += wv->entry[wvi].count;
	  /* Per-word E-step */
	  node_membership_sum = 0;
	  for (cisi = 0; cisi <= doc->cis_size; cisi++)
	    {
	      if (cisi == doc->cis_size)
		node = crossbow_root;
	      else
		node = crossbow_root->children[doc->cis[cisi]];
	      node_word_prob[cisi] =
		bow_treenode_pr_wi_loo_local (node, wv->entry[wvi].wi,
					      di, wvi);
	      node_membership[cisi] = m->m[cisi] * node_word_prob[cisi];
	      assert (node_word_prob[cisi] >= 0);
	      node_membership_sum += node_membership[cisi];
	    }
	  /* For the uniform distribution */
	  node_word_prob[doc->cis_size+1] = (1.0 / bow_num_words ());
	  node_membership[doc->cis_size+1] = m->m[doc->cis_size+1] *
	    node_word_prob[doc->cis_size+1];
	  node_membership_sum += node_membership[doc->cis_size+1];
	  assert (node_membership_sum != 0);
	  /* Normalize the node membership probs.  Also increment
             perplexity */
	  word_prob = 0;
	  for (cisi = 0; cisi <= doc->cis_size+1; cisi++)
	    {
	      node_membership[cisi] /= node_membership_sum;
	      word_prob += node_membership[cisi] * node_word_prob[cisi];
	      if (node_membership[cisi])
		log_prob_of_data2 += (node_membership[cisi]
				      * wv->entry[wvi].count
				      * log (node_word_prob[cisi]));
	    }
	  log_prob_of_data += wv->entry[wvi].count * log (word_prob);
	  /* Per-word M-step */
	  for (cisi = 0; cisi <= doc->cis_size; cisi++)
	    {
	      if (cisi == doc->cis_size)
		node = crossbow_root;
	      else
		node = crossbow_root->children[doc->cis[cisi]];
	      deposit = wv->entry[wvi].count * node_membership[cisi];
	      node->new_words[wv->entry[wvi].wi] += deposit;
	      bow_treenode_add_new_loo_for_di_wvi
		(node, deposit, di, wvi,
		 wv->num_entries, crossbow_docs->length);
	      /* For non-combo version */
	      node->new_prior += deposit;
	      /* For combo version */
	      m->new_m[cisi] += deposit;
	      doc->cis_mixture[cisi] += deposit;
	    }
	  /* For the uniform distribution.
	     NOTE(review): the loop above indexes DOC->CIS_MIXTURE with
	     DOC->CIS_SIZE-relative CISI, but this line uses CIS_SIZE
	     from cmixture_for_cis — verify the two sizes always agree. */
	  deposit = wv->entry[wvi].count * node_membership[doc->cis_size+1];
	  multiclass_uniform_new_prior += deposit;
	  m->new_m[doc->cis_size+1] += deposit;
	  doc->cis_mixture[cis_size+1] += deposit;
	}
      /* Normalize the document-specific CIS_MIXTURE, (and print it out) */
      {
	double max = -FLT_MAX;
	double cis_mixture_sum;
	for (cisi = 0; cisi < cis_size+2; cisi++)
	  if (doc->cis_mixture[cisi] > max)
	    max = doc->cis_mixture[cisi];
	cis_mixture_sum = 0;

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?