📄 multiclass.c
字号:
#include <bow/libbow.h>#include <argp.h>#include <bow/crossbow.h>/* Thursday am - changed total_num_mixtures_possible calculation changed palpha from 1.0 to 0.01 changed malpha from 0 to 1 changed pruning class set size from 4 to 3 */extern bow_int4str *crossbow_classnames;static double multiclass_uniform_prior;static double multiclass_uniform_new_prior;static double multiclass_mixture_prior_alpha;static double multiclass_mixture_prior_normalizer;static double **cached_mixture = NULL;void multiclass_mixture_clear_cache ();int compare_ints (const void *x, const void *y){ if (*(int*)x > *(int*)y) return 1; else if (*(int*)x == *(int*)y) return 0; else return -1;}#define CCC crossbow_classes_count#define DOING_COMBO 1/* Plus 1 for no class */#define MAX_NUM_CLASSES (100 + 1)/* Plus one for root, and plus one for uniform */#define MAX_NUM_MIXTURE_CLASSES 20#define MAX_NUM_MIXTURES (MAX_NUM_MIXTURE_CLASSES + 1 + 1)typedef struct _mcombo { double prior; double new_prior; int doc_count; /* The class indicies for this multi-label combination */ /* Indexed by cisi, up to cis_size-1; + root + uniform */ int cis[MAX_NUM_MIXTURE_CLASSES]; int cis_size; /* Indexed by cisi, up to cis_size-1; + root + uniform */ /* The mixture weights for this multi-label combination */ double m[MAX_NUM_MIXTURES]; double new_m[MAX_NUM_MIXTURES]; /* each dimension indexed by CI+1 */} cmixture;/* Info on all class mixtures */static cmixture *cm = NULL;/* The number of entries in the above */static int cm_length = 0;typedef struct _multiclass_score { double score; int c[MAX_NUM_MIXTURE_CLASSES];} multiclass_score;static intcompare_multiclass_scores (const void *x, const void *y){ if (((multiclass_score*)x)->score > ((multiclass_score*)y)->score) return -1; else if (((multiclass_score*)x)->score == ((multiclass_score*)y)->score) return 0; else return 1;}/* Return a pointer to the cmixture structure for the specific class set specified by CIS. If CREATE_NEW is non-zero, then create a cmixture entry if one doesn't already exist. The actual number of classes in the mixture is returned in ACTUAL_SIZE */cmixture *cmixture_for_cis (const int *cis, int cis_size, int create_new, int *actual_size){ static bow_int4str *cmi_map = NULL; static int cm_size = 0; int cmi, cisi, num_chars, real_size; static const int cis_name_size = 512; char cis_name[cis_name_size], *cis_name_p; assert (cis_size <= MAX_NUM_MIXTURE_CLASSES); cis_name_p = cis_name; real_size = 0; for (cisi = 0; cisi < cis_size && cis[cisi] >= 0; cisi++) { num_chars = sprintf (cis_name_p, "%d,", cis[cisi]); cis_name_p += num_chars; assert (cis_name_p - cis_name <= cis_name_size); real_size++; } if (actual_size) *actual_size = real_size; if (!cmi_map) cmi_map = bow_int4str_new (0); if (create_new) cmi = bow_str2int (cmi_map, cis_name); else cmi = bow_str2int_no_add (cmi_map, cis_name); if (cmi < 0) return NULL; if (cmi >= cm_length) { /* Add a new entry for this class mixture combination */ cm_length++; if (cm == NULL) { cm_size = 128; cm = bow_malloc (cm_size * sizeof (cmixture)); } if (cmi >= cm_size) { cm_size *= 2; cm = bow_realloc (cm, cm_size * sizeof (cmixture)); } bow_verbosify (bow_verbose, "New entry for "); for (cisi = 0; cisi < real_size; cisi++) bow_verbosify (bow_verbose, "%s,", bow_int2str (crossbow_classnames, cis[cisi])); bow_verbosify (bow_verbose, "\n"); /* Initialize the new CM entry */ cm[cmi].prior = 0; cm[cmi].new_prior = 0; cm[cmi].doc_count = 0; cm[cmi].cis_size = real_size; for (cisi = 0; cisi < real_size; cisi++) { cm[cmi].cis[cisi] = cis[cisi]; cm[cmi].m[cisi] = 1.0 / real_size; cm[cmi].new_m[cisi] = 0; } for (cisi = real_size; cisi < MAX_NUM_MIXTURE_CLASSES; cisi++) cm[cmi].cis[cisi] = -1; for (cisi = real_size; cisi < MAX_NUM_MIXTURES; cisi++) { cm[cmi].m[cisi] = 0; cm[cmi].new_m[cisi] = 0; } } return &(cm[cmi]);}voidcmixture_set_from_new (int set_p_flag, double p_alpha, double m_alpha){ double p_sum; double m_sum; int cmi, l, total_num_mixtures_possible; cmixture *m; /* Get normalization constants */ assert (MAX_NUM_CLASSES > crossbow_classes_count); p_sum = 0; for (cmi = 0; cmi < cm_length; cmi++) { m = &(cm[cmi]); p_sum += m->new_prior + p_alpha; /* Don't touch the mixtures cached at test time. */ if (m->doc_count <= 0) continue; m_sum = 0; assert (m->cis_size+2 <= MAX_NUM_MIXTURES); for (l = 0; l < m->cis_size+2; l++) m_sum += m->new_m[l] + m_alpha; assert (m_sum); for (l = 0; l < m->cis_size+2; l++) { m->m[l] = (m->new_m[l] + m_alpha) / m_sum; assert (m->m[l] > 0); m->new_m[l] = 0; assert (m->m[l] <= 1.0 && m->m[l] >= 0.0); } } /* xxx This number possible is an over-estimate? */ total_num_mixtures_possible = 1; for (l = crossbow_classes_count; (l > (crossbow_classes_count - MAX_NUM_MIXTURE_CLASSES) && l >= 1); l--) total_num_mixtures_possible *= l; p_sum += (total_num_mixtures_possible - cm_length) * p_alpha; assert (p_sum > 0); multiclass_mixture_prior_alpha = p_alpha; multiclass_mixture_prior_normalizer = p_sum; /* Set p and m's from normalized new data, and zero the new data */ for (cmi = 0; cmi < cm_length; cmi++) { m = &(cm[cmi]); if (set_p_flag) { m->prior = (m->new_prior + p_alpha) / p_sum; assert (m->prior <= 1.0 && m->prior >= 0.0); } m->new_prior = 0; } /* Clear the mixture cache so it will get reset */ multiclass_mixture_clear_cache ();}voidcmixture_print_diagnostics (FILE *out){ int i, l, cmi; cmixture *m; for (cmi = 0; cmi < cm_length; cmi++) { m = &(cm[cmi]); /* Skip over class mixtures that have no training data */ if (m->doc_count <= 0) continue; /* Print the list of classes */ for (i = 0; i < MAX_NUM_MIXTURE_CLASSES; i++) if (m->cis[i] >= 0) fprintf (out, "%s,", bow_int2str (crossbow_classnames, m->cis[i])); fprintf (out, " prior=%g ", m->prior); for (l = 0; l < m->cis_size+2; l++) fprintf (out, "%g,", m->m[l]); fprintf (out, "\n"); }}voidmulticlass_place_labeled_data (){ int di, wvi; crossbow_doc *doc; treenode *node; bow_wv *wv; int cmi, cisi; cmixture *m; int l, cis_size; /* Clear all previous information. */ bow_treenode_set_new_words_to_zero_all (crossbow_root); bow_treenode_free_loo_and_new_loo_all (crossbow_root, crossbow_docs->length); bow_treenode_set_prior_from_new_prior_all (crossbow_root, 0); multiclass_uniform_new_prior = 0; /* Clear MC */ for (cmi = 0; cmi < cm_length; cmi++) { cm[cmi].doc_count = 0; cm[cmi].prior = 0; cm[cmi].new_prior = 0; for (l = 0; l < MAX_NUM_MIXTURES; l++) { cm[cmi].m[l] = 0; cm[cmi].new_m[l] = 0; } } /* Initialize the word distributions and LOO entries with the data and initialize lambdas to uniform */ for (di = 0; di < crossbow_docs->length; di++) { doc = bow_array_entry_at_index (crossbow_docs, di); /* Make sure that the CIS are in sorted order */ if (doc->cis_size > 1) qsort (doc->cis, doc->cis_size, sizeof (int), compare_ints); /* If space for this document's mixture hasn't already been allocated, do that now. */ if (doc->cis_mixture == NULL) doc->cis_mixture = bow_malloc ((doc->cis_size + 2) * sizeof (double)); wv = crossbow_wv_at_di (di); if (doc->tag != bow_doc_train) continue; /* Temporary fix */ if (strstr (doc->filename, ".include") || strstr (doc->filename, ".exclude")) continue; /* Put the data in each of the leaf classes to which the document belongs, and lastly the root. */ for (cisi = 0; cisi <= doc->cis_size; cisi++) { if (cisi == doc->cis_size) node = crossbow_root; else { assert (crossbow_root->children_count > doc->cis[cisi]); node = crossbow_root->children[doc->cis[cisi]]; } node->new_prior++; multiclass_uniform_new_prior++; for (wvi = 0; wvi < wv->num_entries; wvi++) { node->new_words[wv->entry[wvi].wi] += wv->entry[wvi].count; bow_treenode_add_new_loo_for_di_wvi (node, wv->entry[wvi].count, di, wvi, wv->num_entries, crossbow_docs->length); } } /* Put data into MC */ m = cmixture_for_cis (doc->cis, doc->cis_size, 1, &cis_size); assert (cis_size == doc->cis_size); m->doc_count++; m->new_prior += 1.0; for (cisi = 0; cisi < doc->cis_size+2; cisi++) m->new_m[cisi] += 1.0; } bow_treenode_set_prior_and_extra_from_new_prior_all (crossbow_root, &multiclass_uniform_new_prior, &multiclass_uniform_prior, 0); bow_treenode_set_words_from_new_words_all (crossbow_root, 0); cmixture_set_from_new (1, 0.01, 1);}doublemulticlass_cis_overlap (int *cis1, int cis1_size, int *cis2, int cis2_size){ int cisi1, cisi2; double overlap = 0;#if 0 if (cis1_size == cis2_size) overlap++;#endif for (cisi1 = cisi2 = 0; cisi2 < cis2_size; cisi2++) { while (cisi1 < cis1_size && cis1[cisi1] < cis2[cisi2]) cisi1++; if (cis1[cisi1] == cis2[cisi2]) overlap++; } return 2 * overlap / (cis1_size + cis2_size);}/* Erase the cached information used by MULTICLASS_MIXTURE_GIVEN_CIS(), forcing it to be re-calculated. */voidmulticlass_mixture_clear_cache (){ int cisi, cmi; if (cached_mixture) { for (cisi = 0; cisi < MAX_NUM_MIXTURE_CLASSES; cisi++) if (cached_mixture[cisi]) bow_free (cached_mixture[cisi]); bow_free (cached_mixture); cached_mixture = NULL; } /* Clear the CMIXTURE cache by changing the special "has cached average mixture" flag of -1 back to the "simply has no data, no cached mixture" of 0. */ for (cmi = 0; cmi < cm_length; cmi++) { if (cm[cmi].doc_count == -1) cm[cmi].doc_count = 0; }}/* Place into MIXTURE the mixture weights for the class set specified by CIS. When this class set appeared in the training data, this is simply a matter of copying the mixtures from the global CM structure. When it didn't, various forms of backoff are used. This function caches its backoff calculations. The above function clears the cache, which should happen any time mixtures in CM are changed. */voidmulticlass_mixture_given_cis (int *cis, int cis_size, double *mixture){ cmixture *m; int cisi; assert (cis_size <= MAX_NUM_MIXTURE_CLASSES); if (cached_mixture == NULL) { cached_mixture = bow_malloc((MAX_NUM_MIXTURE_CLASSES+1)*sizeof(double*)); /* Entry 0 never gets used. */ for (cisi = 0; cisi < MAX_NUM_MIXTURE_CLASSES+1; cisi++) cached_mixture[cisi] = NULL; } m = cmixture_for_cis (cis, cis_size, 0, 0); if (m && !m->doc_count == 0) { /* This set of classes exists in the training data, use the MAP-calculated mixture weights. */ for (cisi = 0; cisi < cis_size + 2; cisi++) mixture[cisi] = m->m[cisi]; } else { /* This set of classes appeared nowhere in the training data, backoff to an average of related mixtures, and cache the results in a (possibly) new CMIXTURE extry. */ int cmi, cisimb; double bmixture_sum, *mixture_count, similarity; cmixture *mb; /* Make sure that there is at least one training document with this label. */ for (cisi = 0; cisi < cis_size; cisi++) assert (crossbow_root->children[cis[cisi]]->prior); /* Get a (possibly new) CMIXTURE extry; We will set DOC_COUNT==-1 to indicate that it has a mixture cached from the following calculation. */ m = cmixture_for_cis (cis, cis_size, 1, 0); assert (m->doc_count == 0); m->doc_count = -1; mixture_count = alloca (MAX_NUM_MIXTURES * sizeof (double)); for (cisi = 0; cisi < cis_size+2; cisi++) { m->m[cisi] = 0; mixture_count[cisi] = 0; } /* Go through all mixtures for which there is training data */ for (cmi = 0; cmi < cm_length; cmi++) { mb = &(cm[cmi]); if (mb->doc_count <= 0) continue; similarity = multiclass_cis_overlap (mb->cis, mb->cis_size, cis, cis_size); if (similarity == 0) continue; for (cisimb = cisi = 0; cisimb < mb->cis_size; cisimb++) { while (cisi < cis_size && cis[cisi] < mb->cis[cisimb]) cisi++; if (mb->cis[cisimb] == cis[cisi]) { m->m[cisi] += mb->m[cisimb] * similarity; assert (m->m[cisi] == m->m[cisi]); mixture_count[cisi] += similarity; } } /* Likewise for the root and uniform mixtures */ m->m[cis_size] += mb->m[mb->cis_size] * similarity; mixture_count[cis_size] += similarity; m->m[cis_size+1] += mb->m[mb->cis_size+1] * similarity; mixture_count[cis_size+1] += similarity; } /* Take the average of each column */ for (cisi = 0; cisi < cis_size+2; cisi++) { assert (mixture_count[cisi]); m->m[cisi] /= mixture_count[cisi]; assert (m->m[cisi] == m->m[cisi]); } /* Normalize the mixture to sum to one */ bmixture_sum = 0; for (cisi = 0; cisi < cis_size+2; cisi++) bmixture_sum += m->m[cisi]; assert (bmixture_sum > 0); /* Normalize and put into MIXTURE for return */ for (cisi = 0; cisi < cis_size+2; cisi++) { m->m[cisi] /= bmixture_sum; mixture[cisi] = m->m[cisi]; } }#if 0 double normalizer = 0; int cisi2; /* Another (unused) estimate based on adding all mixtures */ /* This mixture did not occur in the training data, used a smoothed
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -