⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 cluster.c

📁 隐马尔科夫模型工具箱
💻 C
📖 第 1 页 / 共 5 页
字号:
               /* Word w was moved: fold the moved word's bigram counts back
                  into the per-class maximum-likelihood running totals.
                  (This fragment continues the per-word loop of
                  do_one_iteration; the start of the function lies before
                  this chunk.) */
               if (clCnt[curr_class][j])
                  mlv[j] += ((double)clCnt[curr_class][j]) * log(clCnt[curr_class][j]);
               if (clCnt[j][curr_class])
                  mlv[j] += ((double)clCnt[j][curr_class]) * log(clCnt[j][curr_class]);
            }
         }
      }
      else {
         /* Word stays in its current class - just log the decision */
         if (logfile) {
            if (verbose) {
               fprintf(logfile, "...decided not to move word %d from class %d\n", w, curr_class);
            }
            else {
               fprintf(logfile, "--\n");
            }
         }
         fflush(stdout);
      }
      if (show_MLV && logfile) {
         fprintf(logfile, "   MLV = %f\n", curr_MLV);
      }
#ifdef INTEGRITY_CHECK
      /* Debug: Check our counts still sum correctly */
      check_counts_sum();
      /* Debug: Check our updated MLV counts */
      max_likelihood_check();
#endif
   }
   if (w_period) {
      /* Make sure recovery file reflects end of iteration */
      export_classes(1);
      sprintf(tmp, "%.150s.recovery", export_prefix);
      file = FOpen(tmp, NoOFilter, &pipe_status);
      check_file(file, tmp, "do_one_iteration");
      fprintf(file, "Clustering automatic recovery status file\n");
      fprintf(file, "Clustered up to (excluding) word: all\n");
      fprintf(file, "Clusters are stored in: %.150s.recovery.cm\n", export_prefix);
      fprintf(file, "Keep unknown word token separate: %d\n", unk_sep?1:0);
      fprintf(file, "Sort order: %s\n", (sort_order==SORT_WMAP)?"WMAP":"FREQ");
      FClose(file, pipe_status);
   }
   if (total_warnings>=10) {
      HError(-17053, "A total of %d words were found in the wordmap but not in the gram files", total_warnings);
   }
}

/* Recover from a given recovery file */
/* Reads the recovery status file written during clustering, restores the
   saved state (class map, unknown-word separation, word sort order) and,
   if the interrupted iteration had not finished, completes it from the
   stored word index before exporting the resulting class map.
   fname - name of the recovery status file;  words - number of words.
   NOTE(review): fgets() return values are unchecked, so a truncated
   status file is only caught indirectly by the format tests below. */
void do_recovery(char *fname, int words)
{
   FILE *file;
   char *ptr;
   int   from;
   Boolean pipe_status;

   file = FOpen(fname, NoFilter, &pipe_status);
   check_file(file, fname, "do_recovery");
   /* Line 1: fixed header identifying the file type */
   fgets(tmp, 256, file);
   if (strncmp(tmp, "Clustering automatic", 20)!=0) {
      HError(17013, "This is not a recovery status file");
   }
   /* Line 2: word index the interrupted iteration had reached
      ("all" means the iteration had run to completion) */
   fgets(tmp, 256, file);
   ptr = strchr(tmp, ':');
   if (!ptr) {
      HError(17013, "Failure to read current word point from status file");
   }
   ptr++;
   ptr += strspn(ptr, " \t");
   if (strncmp(ptr, "all", 3)==0) {
      from = -1;   /* -1 flags that there is nothing left to redo */
   }
   else {
      from = atoi(ptr);
   }
   /* Line 3: name of the class map file holding the recovered clusters */
   fgets(tmp, 256, file);
   ptr = strchr(tmp, ':');
   if (!ptr) {
      HError(17013, "Failure to read recovery class map file name from status file");
   }
   ptr++;
   ptr = strtok(ptr, " \t\n");
   import_classmap(ptr, words);
   /* Line 4: whether the unknown word token is kept in its own class */
   fgets(tmp, 256, file);
   ptr = strchr(tmp, ':');
   if (!ptr) {
      HError(17013, "Failure to read recovery unknown word status from status file");
   }
   ptr++;
   ptr += strspn(ptr, " \t");
   unk_sep = (*ptr=='1');
   start_class = unk_sep?3:2;
   /* Line 5: word sort order used during clustering (WMAP or FREQ) */
   fgets(tmp, 256, file);
   ptr = strchr(tmp, ':');
   if (!ptr) {
      HError(17013, "Failure to read recovery word sort order status from status file");
   }
   ptr++;
   ptr += strspn(ptr, " \t");
   sort_order = (*ptr=='W')?SORT_WMAP:SORT_FREQ;
   FClose(file, pipe_status);
   if (trace & T_TOP) {
      printf("Continuing from recovered state\n");
   }
   /* Rebuild all class/bigram counts from the recovered class map */
   classes_init(words);
   setup_all_counts();
   if (trace & T_TOP) {
      printf("Iterations that had been completed: %d\n", export_index);
   }
   export_index++;
   /* Open output log file */
   if (write_logfile) {
      sprintf(tmp, "%.150s.%d.log", export_prefix, export_index);
      logfile = FOpen(tmp, NoOFilter, &pipe_logfile);
      check_file(logfile, tmp, "do_recovery");
   }
   else
      logfile = NULL;
   /* Finish the interrupted iteration from the stored word index */
   if (from>=0) {
      do_one_iteration(rec_freq, from);
   }
   if (logfile)
      FClose(logfile, pipe_logfile);
   export_classes(0);
   if (trace & T_EXTRA) {
      printf("Completed iteration which started from recovered state\n");
      if (from == -1) {
         printf("   (no change since recovery state was stored at end of iteration)\n");
      }
   }
}

/* Initialise the values used when calculating the current value of the
   maximum likelihood equation used when 
clustering */static void max_likelihood_init(void){   int i, j;   if (show_MLV)      curr_MLV=0;   /* We store all those values from the summation which involve a      particular class in a value specifically for that class */   for (i=0; i<N; i++) {      mlv[i] = 0;      for (j=0; j<N; j++) {         if (clCnt[i][j]) {            mlv[i] += ((double)clCnt[i][j]) * log(clCnt[i][j]);            if (show_MLV) {               curr_MLV += ((double)clCnt[i][j]) * log(clCnt[i][j]);            }         }         if (i!=j) {            if (clCnt[j][i])               mlv[i] += ((double)clCnt[j][i]) * log(clCnt[j][i]);         }      }      if (clSum[i]) {         mlv[i] -= 2*(((double)clSum[i]) * log(clSum[i]));         if (show_MLV) {            curr_MLV -= 2*(((double)clSum[i]) * log(clSum[i]));         }      }   }}#ifdef INTEGRITY_CHECK/* Check the contents of the maximum likelihood running totals store */static void max_likelihood_check(void){   int i, j;   double a;   char s1[50], s2[50];  /* We store all those values from the summation which involve a     particular class in a value specifically for that class */   for (i=0; i<N; i++) {      a = 0;      for (j=0; j<N; j++) {         if (clCnt[i][j])            a += ((double)clCnt[i][j]) * log(clCnt[i][j]);         if (i!=j) {            if (clCnt[j][i])               a += ((double)clCnt[j][i]) * log(clCnt[j][i]);         }      }      if (clSum[i])         a -= 2*(((double)clSum[i]) * log(clSum[i]));      /* Compare strings, to ignore minor precision differences */      sprintf(s1, "%f", a);      sprintf(s2, "%f", mlv[i]);      if (strcmp(s1, s2)) {         HError(17097, "max_likelihood_check: MLV for class %d is wrong - %f instead of %f", i, mlv[i], a);      }   }}#endif/* Perform a given number of iterations of the clustering algorithm */void cluster_words(int iterations){   int i;   for (i=0; i<iterations; i++) {      /* Also keep a separate iteration count - we do this because it's         possible to call 
cluster_words() multiple times from a host         program, or to continue from an existing classmap */      export_index++;            if (trace & T_TOP) {         printf("Beginning iteration %d\n", export_index);      }            /* Open output log file */      if (write_logfile) {         sprintf(tmp, "%.150s.%d.log", export_prefix, export_index);         logfile = FOpen(tmp, NoOFilter, &pipe_logfile);         check_file(logfile, tmp, "cluster_words");      }      do_one_iteration(rec_freq, 0);      if (logfile)         FClose(logfile, pipe_logfile);      if (trace & T_TOP) {         printf("Iteration complete\n");      }   }}/* Setup all class counts, given existing class word map */void setup_all_counts(void){   register int i, j;   for (i=0; i<N; i++) {      clSum[i] = 0;      for (j=0; j<N; j++) {         clCnt[i][j] = 0;      }   }   for (i=0; i<W; i++) {      /* Class unigram counts */      clSum[clMemb[i]] += uni[i];      /* Class bigram counts */      for (j=0; j<forward[i].size; j++) {         clCnt[clMemb[i]][clMemb[forward[i].bi[j].id]] += forward[i].bi[j].count;      }   }   /* Now initialise the maximisation function class values */   max_likelihood_init();}/* Perform some initial clustering - currently just puts all in one class */void initial_cluster(void){   register int i;   for (i=0; i<W; i++) {      if (unk_sep) {         clMemb[i] = 3;     /* Put everything in class 3 */      }      else {         clMemb[i] = 2;     /* Put everything in class 2 */      }   }   clMemb[start_id] = 0;   clMemb[end_id] = 1;   if (unk_sep) {      clMemb[unk_id] = 2;   }   /* Note that external class numbers are all +1 relative to internal (HLM can't cope      with class 0 in a class map) */   if (trace & T_EXTRA) {      printf("Initial clustering performed: all words in class %d (total count=%d)\n", unk_sep?4:3, sum_of_all_bigram_counts);      printf ("   (sentence start in class 1; sentence end in class 2%s)\n", unk_sep?"; unknown in class 3":"");   }}/* Define 
sorting order of words alphabetically, given id */int id_sort(UInt *in1, UInt *in2){   return strcmp(what_is_word(*in1), what_is_word(*in2));}/* Write out a HLM class map file (pass non-zero to write recovery file) */void export_classes(int recovery){   FILE *out;   int   i, j, index;   Boolean pipe_status;   /* %.150s limits the length of the filename prefix to 150 characters */   if (recovery) {      sprintf(tmp, "%.150s.recovery.cm", export_prefix);   }   else {      sprintf(tmp, "%.150s.%d.cm", export_prefix, export_index);   }   out = FOpen(tmp, LCMapOFilter, &pipe_status);   check_file(out, tmp, "export_classes");   /* Write header */   if (recovery) {      fprintf(out, "Name=Classmap_%s_iteration%d\n", export_prefix, export_index-1);      fprintf(out, "Entries=%d\n", N);      fprintf(out, "Iterations=%d\n", export_index-1);   }   else {      fprintf(out, "Name=Classmap_%s_iteration%d\n", export_prefix, export_index);      fprintf(out, "Entries=%d\n", N);      fprintf(out, "Iterations=%d\n", export_index);   }   if (outCMapRaw) {      fprintf(out, "EscMode=Raw\n");   }   else {      fprintf(out, "EscMode=HTK\n");   }   fprintf(out, "\\Classes\\\n");   for (i=0; i<N; i++) {      index = 0;      for (j=0; j<W; j++) {         if (clMemb[j] == i) {            class_sort[index] = j;            index++;         }      }      qsort(class_sort, index, sizeof(UInt),            (int (*) (const void *, const void *)) &id_sort);      fprintf(out, "CLASS%d %d %d IN\n", i+1, i+1, index);      if (outCMapRaw) {         for (j=0; j<index; j++) {            fprintf(out, " %s\n", what_is_word(class_sort[j]));         }      }      else {         for (j=0; j<index; j++) {            fprintf(out, " %s\n", ReWriteString(what_is_word(class_sort[j]), NULL, ESCAPE_CHAR));         }      }   }   FClose(out, pipe_status);}#ifdef INTEGRITY_CHECK/* Debugging: Do integrity check on counts - ensure they sum   to the same value after each loop! 
*/void check_counts_sum(void){   register int i, j;   register int a, b;   a = 0;   b = 0;   for (i=0; i<N; i++) {      a += clSum[i];      for (j=0; j<N; j++) {         b += clCnt[i][j];      }   }   if (a != sum_of_all_uni_counts) {      HError(17096, "check_counts_sum: unigrams now sum to %d, not %d", a, sum_of_all_uni_counts);   }   if (b != sum_of_all_bigram_counts) {      HError(17096, "check_counts_sum: bigrams now sum to %d, not %d", a, sum_of_all_bigram_counts);   }   if (a != b) {      HError(17096, "check_counts_sum: uni and bi totals differ - %d v %d", a, b);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -