📄 cluster.c

📁 隐马尔科夫模型工具箱
💻 C
📖 第 1 页 / 共 5 页
字号:
/* Return the number of classes used by default (or currently set) */int classes_get_default(void){   return N;}/* Set the number of classes used */void classes_set_number(int numb){   if (clCnt) {      HError(17099, "classes_set_number: must be called prior to initialisation");   }   if (numb<1) {      HError(17099, "classes_set_number: number of classes must be 1 or more");   }      N = numb;}/* Initialise this module */void classes_init(int numb_words){   int i, j;   if (W>0 && W!=numb_words) {      /* Could only get here if code to do with loading classmap is broken */      HError(17098, "classes_init: internal inconsistency - number of words has changed");   }   W = numb_words;   /* Create empty storage table */   if (trace & T_MEM) {      printf("Allocating memory for %d classes\n", N);   }   class_sort = CNew(&global_stack, W * sizeof(UInt));   clSum = CNew(&global_stack, N * sizeof(int));   tmp_c1 = CNew(&global_stack, N * sizeof(int));   tmp_c2 = CNew(&global_stack, N * sizeof(int));   tmp_c3 = CNew(&global_stack, N * sizeof(int));   tmp_c4 = CNew(&global_stack, N * sizeof(int));   tmp_sum1 = CNew(&global_stack, N * sizeof(int));   tmp_sum2 = CNew(&global_stack, N * sizeof(int));   mlv = CNew(&global_stack, N * sizeof(double));   sort_uni = CNew(&global_stack, W * sizeof(int));   if (!clMemb)      clMemb = CNew(&global_stack, W * sizeof(int)); /* May have been setup by existing map */   clCnt = CNew(&global_stack, N * sizeof(int *));   for (i=0; i<N; i++) {      clCnt[i] = CNew(&global_stack, N * sizeof(int));   }   if (trace & T_MEM) {      printf("Class memory allocated\n");   }   /* Create array of bigram (w,w) pair counts (ie. word followed by itself) */   bipair = CNew(&global_stack, W * sizeof(int));   for (i=0; i<W; i++) {      bipair[i] = 0;      for (j=0; j<forward[i].size; j++) {         if (forward[i].bi[j].id == i) {            bipair[i] = forward[i].bi[j].count;            break;         }      }   }}/* See what change results when word 'w' moved to class 'g' */static void classes_change(UInt w, int g){   register int i;   /* tmp_c1[] stores the set of class counts C(G(w),*)      tmp_c2[] stores the set of class counts C(*,G(w))      tmp_c3[] stores the set of class counts C(g,*)      tmp_c4[] stores the set of class counts C(*,g)      tmp_sum1[] stores the bigram class counts C(w,*)      tmp_sum2[] stores the bigram class counts C(*,w) */      /* Loop over all classes */   for (i=0; i<N; i++) {      if (i!=curr_class && i!=g) {         if (tmp_sum1[i]) {            /* (G(w),gi) => -C(w,gi) */            tmp_c1[i] = clCnt[curr_class][i] - tmp_sum1[i];            /* (g,gi)    => +C(w,gi) */            tmp_c3[i] = clCnt[g][i] + tmp_sum1[i];         }         else {            tmp_c1[i] = clCnt[curr_class][i];            tmp_c3[i] = clCnt[g][i];         }         if (tmp_sum2[i]) {            /* (gi,G(w)) => -C(gi,w) */            tmp_c2[i] = clCnt[i][curr_class] - tmp_sum2[i];            /* (gi,g)    => +C(gi,w) */            tmp_c4[i] = clCnt[i][g] + tmp_sum2[i];         }         else {            tmp_c2[i] = clCnt[i][curr_class];            tmp_c4[i] = clCnt[i][g];         }      }   }     /* Calculate correct values for class-to-or-from-only pairs */   /* (G(w),G(w)) => -C(w,G(x)) - C(G(x),w) + C(w,w) */   GwGw =   clCnt[curr_class][curr_class]          - tmp_sum1[curr_class] - tmp_sum2[curr_class] + bipair[w];   /* (G(w),g)    => -C(w,g) + C(G(x),w) - C(w,w) */   Gwg  =   clCnt[curr_class][g]          - tmp_sum1[g] + tmp_sum2[curr_class] - bipair[w];   /* (g,G(w))    => -C(g,w) + C(w,G(x)) - C(w,w) */   gGw  =   clCnt[g][curr_class]          - tmp_sum2[g] + tmp_sum1[curr_class] - bipair[w];   /* (g,g)       => +C(w,g) + C(g,w) + C(w,w) */   gg   =   clCnt[g][g]          + tmp_sum1[g] + tmp_sum2[g] + bipair[w];}/* Decide on a class to move word 'w' to. Returns class index. */static int choose_class(UInt w){   register int i;   register int j;   double d;             /* Change in optimisation value */   int uniGx, unig;   int best_class;   double best_change;   double start_value;      best_class = curr_class;   best_change = 0;   /* Create set of forward bigram class counts, C(w,*) and C(*,w)      (for * = any class) */   for (i=0; i<N; i++) {      tmp_sum1[i] = 0;      tmp_sum2[i] = 0;   }   for (i=0; i<forward[w].size; i++) {      tmp_sum1[clMemb[forward[w].bi[i].id]] += forward[w].bi[i].count;   }   for (i=0; i<backward[w].size; i++) {      tmp_sum2[clMemb[backward[w].bi[i].id]] += backward[w].bi[i].count;   }  /* Try all classes */   for (i=start_class; i<N; i++) {      if (i==curr_class || uni[w]==0) {         /* If we have no information about this word, or its a self-move, don't            bother (self-move gives zero change) */         continue;      }      d = 0;      classes_change(w, i);      /* Word has moved to class i, so see how this would change our         optimisation equation */      uniGx = clSum[curr_class] - uni[w];      unig  = clSum[i] + uni[w];      /* Counts involving original class and a new class */      for (j=0; j<N; j++) {         if ((j!=curr_class) && (j!=i)) {            if (tmp_c1[j]) {               d += ((double)tmp_c1[j]) * log(tmp_c1[j]);            }            if (tmp_c2[j]) {               d += ((double)tmp_c2[j]) * log(tmp_c2[j]);            }            if (tmp_c3[j]) {               d += ((double)tmp_c3[j]) * log(tmp_c3[j]);            }            if (tmp_c4[j]) {               d += ((double)tmp_c4[j]) * log(tmp_c4[j]);            }         }      }      /* Unigram part of summation */      if (uniGx) {         d -= 2*(((double)uniGx)*log(uniGx));      }      if (unig) {         d -= 2*(((double)unig)*log(unig));      }      /* Exceptions */      if (GwGw) {         d += ((double)GwGw) * log(GwGw);      }      if (Gwg) {         d += ((double)Gwg) * log(Gwg);      }      if (gGw) {         d += ((double)gGw) * log(gGw);      }      if (gg) {         d += ((double)gg) * log(gg);      }      /* Now make 'd' into a difference: */      start_value = mlv[curr_class] + mlv[i];      /* Subtract off the two values we added twice by using mlv[] */      if (clCnt[curr_class][i])         start_value -= clCnt[curr_class][i]*log(clCnt[curr_class][i]);      if (clCnt[i][curr_class])         start_value -= clCnt[i][curr_class]*log(clCnt[i][curr_class]);      /* And calculate 'd': */      d = d - start_value;      if (verbose && logfile) {         fprintf(logfile, "...moving word %d to class %d from class %d gives %f change\n",                 w, i, curr_class, d);      }      if (d>best_change) {         /* Accuracy check - this is a bit of a dodgy hack for gcc 3 */         sprintf(tmp, "%f", d); /* This will flush d from the higher-precision maths register */         /* No doubt a much faster way of doing this, but never mind */         if (d>best_change) {            if (verbose && logfile) {               fprintf(logfile, "   \\- which is better than the current best\n");            }            best_change = d;            best_class = i;         }         else {            HError(-17059, "Noticed a comparison accuracy difference [you may safely ignore this warning]");         }      }   }   if (show_MLV) {      curr_MLV += best_change;   }   return best_class;}/* Define sort order for word unigrams */static int freq_sort_order(int *in1, int *in2){  return ( (uni[*in1] < uni[*in2]) | (-(uni[*in1] > uni[*in2])));}/* Perform one iteration of the clustering algorithm */static void do_one_iteration(int w_period, int start_word){   UInt w, j, w_index;   int to;   FILE *file;   Boolean pipe_status;   int total_warnings=0;   for (w=0; w<W; w++) {      sort_uni[w] = w;   }   if (sort_order == SORT_FREQ) {      qsort(sort_uni, W, sizeof(int), (int (*) (const void *, const void *)) &freq_sort_order);   }   for (w_index=start_word; w_index<W; w_index++) {      w = sort_uni[w_index];      if (w_period && w%w_period==0) {         /* Write recovery file */         export_classes(1);         sprintf(tmp, "%.150s.recovery", export_prefix);         file = FOpen(tmp, NoOFilter, &pipe_status);         check_file(file, tmp, "do_one_iteration");         fprintf(file, "Clustering automatic recovery status file\n");         fprintf(file, "Clustered up to (excluding) word: %d\n", w_index);         fprintf(file, "Clusters are stored in: %.150s.recovery.cm\n", export_prefix);         fprintf(file, "Keep unknown word token separate: %d\n", unk_sep?1:0);         fprintf(file, "Sort order: %s\n", (sort_order==SORT_WMAP)?"WMAP":"FREQ");         FClose(file, pipe_status);      }      if ((w==start_id) || (w==end_id) || (unk_sep && (w==unk_id))) {         /* We don't want to move this special token, so skip it */         continue;      }      if (uni[w]==0) {         /* Word is in wordlist but not used, so warn */         if (total_warnings<10) {            HError(-17053, "Word '%s' is in word map but not in any gram files", what_is_word(w));         }         else if (total_warnings==10) {            HError(-17053, "Suppressing further word 'x' not in gram file warnings");         }         total_warnings++;         continue;      }      if (logfile) {         if (verbose) {            fprintf(logfile, "...deciding whether/where to move word %d (of %d - %2.2f%% done) [id=%d]\n",                    w_index, W, ((float)w_index/(float)W)*100.0, w);         }         else {            fprintf(logfile, "%d [%d] (%2.2f%%):\t", w_index, w, ((float)w_index/(float)W)*100.0);         }      }      curr_class = clMemb[w]; /* Find out what class word is currently in */      to = choose_class(w);   /* Work out where to move it to */      if (curr_class != to) {         if (logfile) {            if (verbose) {               fprintf(logfile, "...moving word id %d from class %d to class %d\n", w, curr_class, to);            }            else {               fprintf(logfile, "-> %d\n", to);            }            fflush(logfile);         }         classes_change(w, to); /* Calculate new unigram and bigram values */         /* Remove influence of these two classes from MLV values */         for (j=0; j<N; j++) {            if (j!=to && j!=curr_class) {               if (clCnt[to][j])                  mlv[j] -= ((double)clCnt[to][j]) * log(clCnt[to][j]);               if (clCnt[j][to])                  mlv[j] -= ((double)clCnt[j][to]) * log(clCnt[j][to]);               if (clCnt[curr_class][j])                  mlv[j] -= ((double)clCnt[curr_class][j]) * log(clCnt[curr_class][j]);               if (clCnt[j][curr_class])                  mlv[j] -= ((double)clCnt[j][curr_class]) * log(clCnt[j][curr_class]);            }         }         /* Make change permanent */         /* Class map */         clMemb[w] = to;         /* Class unigram counts */         clSum[curr_class] -= uni[w];         clSum[to] += uni[w];         /* Class bigram counts */         for (j=0; j<N; j++) {            if ((j!=curr_class) && (j!=to)) {               /* (Gw, *) */               clCnt[curr_class][j] = tmp_c1[j];               /* (*, Gw) */               clCnt[j][curr_class] = tmp_c2[j];               /* (g, *) */               clCnt[to][j] = tmp_c3[j];               /* (*, g) */               clCnt[j][to] = tmp_c4[j];            }         }         /* Exceptions */         clCnt[curr_class][curr_class] = GwGw;         clCnt[curr_class][to] = Gwg;         clCnt[to][curr_class] = gGw;         clCnt[to][to] = gg;         /* Recalculate maximum-likelihood values involving this class */         mlv[to] = 0;         for (j=0; j<N; j++) {            if (clCnt[to][j])               mlv[to] += ((double)clCnt[to][j]) * log(clCnt[to][j]);            if (to!=j) {               if (clCnt[j][to])                  mlv[to] += ((double)clCnt[j][to]) * log(clCnt[j][to]);            }         }         if (clSum[to])            mlv[to] -= 2*(((double)clSum[to]) * log(clSum[to]));         mlv[curr_class] = 0;         for (j=0; j<N; j++) {            if (clCnt[curr_class][j])               mlv[curr_class] += ((double)clCnt[curr_class][j]) * log(clCnt[curr_class][j]);            if (curr_class!=j) {               if (clCnt[j][curr_class])                  mlv[curr_class] += ((double)clCnt[j][curr_class]) * log(clCnt[j][curr_class]);            }         }         if (clSum[curr_class])            mlv[curr_class] -= 2*(((double)clSum[curr_class]) * log(clSum[curr_class]));         /* Update MLV values for other classes */         for (j=0; j<N; j++)  {            if (j!=to && j!=curr_class) {               if (clCnt[to][j])                  mlv[j] += ((double)clCnt[to][j]) * log(clCnt[to][j]);               if (clCnt[j][to])                  mlv[j] += ((double)clCnt[j][to]) * log(clCnt[j][to]);
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -