📄 cluster.c

📁 隐马尔科夫模型工具箱
💻 C
📖 第 1 页 / 共 5 页
字号:
   }}#endif/* Complain about a broken header */static void invalid_header(void){   HError(17013, "Classmap has broken header - missing '='");}/* Import a HLM classmap file. Currently ignores contents of 'EscMode' field. GLM Also non-IN?*/void import_classmap(char *fname, int numb_words){#define max_line_len 500    FILE     *file;               /* Input file handle */   char      line[max_line_len]; /* Line read buffer */   int       C;                  /* Current class index */   int       size;               /* Size of current class */   int       i;                  /* Loop counter */   char     *ptr;                /* Text pointer */   UInt      id;                 /* Word id */   int       reassigned = 0;     /* Number of reassigned classes */   int       unexpected = 0;     /* Number of unexpected lines trailing class descriptions */   Boolean   pipe_status;   if (trace & T_FILE) {      printf("Importing classmap '%s'\n", fname);   }   W = numb_words;   clMemb = CNew(&global_stack, W * sizeof(int));     /* Set impossible classmap in order to do integrity check after import */   for (i=0; i<W; i++) {      clMemb[i] = -1;   }      N = 0;   file = FOpen(fname, LCMapFilter, &pipe_status);   check_file(file, fname, "import_classmap");   while (fgets(line, max_line_len, file)) {      if (strncmp(line, "Entries", 7)==0) {         ptr = strchr(line, '=');         if (!ptr) invalid_header();         ptr++;         ptr = strtok(ptr, " \t\n");         N = atoi(ptr);         if (trace & T_EXTRA) {            printf("Number of classes = %d\n", N);         }      }      else if (strncmp(line, "Iterations", 10)==0) {         ptr = strchr(line, '=');         if (!ptr) invalid_header();         ptr++;         ptr = strtok(ptr, " \t\n");         export_index = atoi(ptr);      }      else if (strncmp(line, "EscMode", 7)==0) {         ptr = strchr(line, '=');         if (!ptr) invalid_header();         ptr++;         ptr = strtok(ptr, " \t\n");         if (strcmp(ptr, "HTK")==0) {            if (inCMapRawTrap && inCMapRaw) {               HError(-17013, "Class map specifies HTK escaping on input but configuration file specifies Raw escaping -- using HTK escaping for input");            }            inCMapRaw = FALSE;         }         else if (strcmp(ptr, "Raw")==0) {            if (inCMapRawTrap && !inCMapRaw) {               HError(-17013, "Class map specifies Raw escaping on input but configuration file specifies HTK escaping -- using Raw escaping for input");            }            inCMapRaw = TRUE;         }         else {            HError(17013, "Classmap has unknown escaping of type '%s'", ptr);         }         if (!outCMapRawTrap) {            if (outCMapRaw != inCMapRaw) {               HError(-17013, "Setting output class map escaping to same format as input class map (%s)", inCMapRaw?"Raw":"HTK");            }            outCMapRaw = inCMapRaw;  /* This is common sense */         }         if (inCMapRaw != outCMapRaw) {            HError(-17013, "Input class map escaping and output class map escaping differ (this is not a problem -- this warning is to alert you in case you meant them to be the same)");         }      }      else if (strncmp(line, "\\Classes\\", 9)==0) {         break;      }   }   if (feof(file)) {      HError(17013, "Classmap file is corrupt/contains no classes!");   }   if (!N) {      HError(17013, "Corrupt classmap header - must specify number of classes!");   }   if (trace & T_EXTRA) {      printf("Iterations = %d\n", export_index);   }   C = 0;   while (fgets(line, max_line_len, file)) {      if (C>=N) {         if (strstr(line, " IN")) {            HError(17013, "More classes are described than are specified in the header!");         }         else {            ptr = strtok(line, " \t\n");            if (!ptr)               continue;            else {               HError(-17013, "Warning: ignoring '%s' at end of classmap file", ptr);            }         }      }      if (strstr(line, " IN")) {         /* Start of a new class */         /* Make this class 'C' */         strtok(line, " \t");         ptr = strtok(NULL, " \t");         if (!ptr) {            HError(17013, "Failure reading class header %d in classmap (no id)", C);         }         if (atoi(ptr) != C+1) {            /* We'll renumber this class */            reassigned++;         }         ptr = strtok(NULL, " \t");         if (!ptr) {            HError(17013, "Failure reading class header %d in classmap (no size)", C);         }         size = atoi(ptr); /* Read number of words in class */         for (i=0; i<size; i++) {            fgets(line, max_line_len, file);            ptr = strtok(line, " \t\n");            if (!ptr) {               /* Warn about the blank line */               HError(-17013, "Found empty line inside class %d definition", C);               i--;               continue;            }            /* Unescape word if necessary */            if (!inCMapRaw) {               if (strlen(ptr)>255) {                  HError(17013, "Cannot handle words longer than 255 characters when using HTK escaping (recompile with higher tmp[] buffer size in Cluster.c)");               }               ParseString(ptr, tmp);               /* Put word in class */               id = get_id_from_word(tmp);            }            else {               /* Put word in class */               id = get_id_from_word(ptr);            }            if (inCMapRaw && strcmp(what_is_word(id), ptr)!=0) {               HError(17095, "import_classmap: word '%s' is id '%d'; id is '%s'!", ptr, id, what_is_word(id));            }            else if (!inCMapRaw && strcmp(what_is_word(id), tmp)!=0) {               HError(17095, "import_classmap: word '%s' is id '%d'; id is '%s'!", tmp, id, what_is_word(id));            }            if (clMemb[id] != -1) {               HError(17094, "Word '%s' occurs more than once in classmap!", ptr);            }            clMemb[id] = C;         }         C++;      }      else {         /* Where is class header? It's gone missing! */         if (strlen(line)>0) {            if (strchr(line, '\n'))               *strchr(line, '\n')='\0'; /* Strip linefeed */            if (strlen(line)==0)               continue;            HError(-17013, "Unexpected line '%s' in classmap", line);            unexpected++;         }         if (unexpected>9) {            HError(17013, "Too many unexpected lines in classmap - aborting now");         }         /* Loop round to see if it's coming up next */      }   }   if (C<N) {      HError(17013, "Less classes are described than are specified in the header!");   }   if (trace & T_TOP) {      if (reassigned) {         if (reassigned>1) {            printf("%d class ids were reassigned\n", reassigned);         }         else {            printf("1 class id was reassigned\n");         }      }      else {         printf("No class ids were reassigned\n");      }   }   /* Check all words were assigned */   for (i=0; i<W; i++) {      if (clMemb[i]==-1) {         HError(17052,"import_classmap: Not all words were assigned to classes");      }   }   FClose(file, pipe_status);   if (trace & T_FILE) {      printf("Class map import successful\n");   }}/* Write out p(word | class) probabilities */void write_word_probs(char *filename){   FILE *out;   int   i;   /* Loop counter */   double probability;   Boolean pipe_status;   /* These files never use HTK escaping */      out = FOpen(filename, NoOFilter, &pipe_status);   check_file(out, filename, "write_word_probs");   /* Write header */   fprintf(out, "Word|Class probabilities\n");   fprintf(out, "\n");   fprintf(out, "Derived from: %s\n", export_prefix);   fprintf(out, "Number of classes: %d\n", N);   fprintf(out, "Number of words: %d\n", W);   fprintf(out, "Iterations: %d\n", export_index);   fprintf(out, "\n");   fprintf(out, "%-15s\tClass name\tProbability (log)\n", "Word");   for (i=0; i<W; i++) {      if (uni[i]==0) uni[i]=1;   }   /* Use tmp_sum1[] to save having to allocate a new array (so can't call      this from within a class change calculation, but this isn't a problem!)   */   for (i=0; i<N; i++) {      tmp_sum1[i] = 0;   }   for (i=0; i<W; i++) {      tmp_sum1[clMemb[i]] += uni[i];   }   for (i=0; i<W; i++) {      probability = (double)uni[i]/((double)tmp_sum1[clMemb[i]]);      fprintf(out, "%-15s\tCLASS%-4d\t%f\n", what_is_word(i), clMemb[i]+1,              LOG_NATURAL(probability));      if (LOG_NATURAL(probability)<-90) {         printf("prob is %f, discount is %f, uni is %d\n", LOG_NATURAL((double)uni[i]/((double)tmp_sum1[clMemb[i]])), mlv[clMemb[i]], uni[i]);      }   }   FClose(out, pipe_status);   if (trace & T_FILE) {      printf("Wrote word|class probabilities to '%s'\n", filename);   }}/* Write out p(word | class) counts */void write_word_counts(char *filename){   FILE *out;   int   i;   /* Loop counter */   Boolean pipe_status;   /* These files never use HTK escaping */   /* Open output file */   out = FOpen(filename, NoOFilter, &pipe_status);   check_file(out, filename, "write_word_counts");   /* Write header */   fprintf(out, "Word|Class counts\n");   fprintf(out, "\n");   fprintf(out, "Derived from: %s\n", export_prefix);   fprintf(out, "Number of classes: %d\n", N);   fprintf(out, "Number of words: %d\n", W);   fprintf(out, "Iterations: %d\n", export_index);   fprintf(out, "\n");   fprintf(out, "%-15s\tClass name\tCount\n", "Word");      for (i=0; i<W; i++) {      fprintf(out, "%-15s\tCLASS%-4d\t%d\n", what_is_word(i), clMemb[i]+1, uni[i]);   }   FClose(out, pipe_status);   if (trace & T_FILE) {      printf("Wrote word|class counts to '%s'\n", filename);   }}/* Specify whether to keep the unknown word in its own solo-member class or not */void classes_keep_unk_separate(int keep_separate){   unk_sep = (Boolean) keep_separate;   start_class = unk_sep?3:2;}/* Pass in start, end and unknown word ids */void set_ids(int start, int end, int unk){   start_id = start;   end_id = end;   unk_id = unk;}/* This set of functions takes unigram counts, stores them, and   then allows them to be retrieved. It simply allocates a count   for each possible word id, since they are allocated in a   continuous block.*//* Initialise this unigram storage module */void unigram_init(int words){   max_words = words;   uni = CNew(&global_stack, words * sizeof(unigram));   sum_of_all_uni_counts = 0;}/* Add a unigram */void unigram_add(NGram ng, int count){   if (ng[0]>=max_words) {      /* Something's gone wrong */      HError(17093, "unigram_add: Found a word id higher than the base+number of words - word ids are expected to be allocated in an unbroken chunk\n[Current unigram is (%d); number of words is %d]", ng[0], max_words);   }   uni[ng[0]] += count;   sum_of_all_uni_counts += count;   return;}/* Read a unigram */UInt unigram_read(UInt id){#ifdef INTEGRITY_CHECK      if ((id<0) || (id>=max_words)) {         HError(17092, "unigram_read: attempt to read unigram outside bounds (%d; %d words)", id, max_words);      }#endif   return uni[id];}/* This section contains functions to store a sequence of bigrams - they must   be sequenced before passing to this code, since it relies on the input   being sorted.   Both forward and backward word to all bigrams look-up tables are built,   so given either u or v from a bigram (u,v) then the set of all (u,*) or   (*,v) can be found. *//* Grab some space from our current local storage block */static void *get_space(int size){   static void* ptr;   /* Test against our not-worth-using cut-off point */   if (size>block_cut_off)      return New(&global_stack, size);   /* Use New() again if necessary to get a new block */   if (((int)block+(int)size) >= (int)block_end) {      block = New(&global_heap, block_grab_size);      block_end = (void *) ((int)block+(int)block_grab_size);   }   /* Hand back the next free space */   ptr = block;   block = (void*) ((int) block + (int) size);     /* Next free byte */   block = (void*) ((((int)block)+3) & (~(int)3)); /* Word-align */   return ptr;}/* Add a bigram */void bigram_add(NGram ng, int count)
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -