📄 cluster.c

📁 隐马尔科夫模型工具箱
💻 C
📖 第 1 页 / 共 5 页
字号:
上一页 1 2 3 45
{   bi_count *ptr;   int       space_used;   if ((ng[0]>=max_words) || (ng[1]>=max_words)) {      /* Something's gone wrong */      HError(17093, "bigram_add: Found a word id higher than the base+number of words - all word ids are expected to be allocated in an unbroken chunk.\n[Current bigram is (%d,%d). Number of words is %d]", ng[0], ng[1], max_words);   }   /* Keep backward count */   backward[ng[1]].size++;   if (ng[0] == last_word) {      /* Make sure there's room in the buffer */      if (store_idx >= curr_bistore_size) {         /* Expand bigram buffer store to cope */         curr_bistore_size += bigram_buffer_grow;         if (trace & T_MEM) {            printf("Expanding bigram read buffer to %d entries\n", curr_bistore_size);         }         store = realloc(store, curr_bistore_size*sizeof(bi_count));      }      /* Store in buffer */      store[store_idx].id = ng[1];      store[store_idx].count = count;      store_idx++;      return;   }   /* Otherwise we must have just gone on to a new word, so keep the old      details */   forward[last_word].size = store_idx;   space_used = store_idx*sizeof(bi_count);   ptr = get_space(space_used);   memcpy(ptr, store, space_used);   forward[last_word].bi = ptr;   /* And go on to the next entry */   last_word = ng[0];   store[0].id = ng[1];   store[0].count = count;   store_idx = 1;}/* Call when all bigrams have been passed in */void bigram_added_all(void){   bi_count *ptr;   int       space_used;   int       i, j, backward_id;   /* Store last set of details */   forward[last_word].size = store_idx;   space_used = store_idx*sizeof(bi_count);   ptr = get_space(space_used);   memcpy(ptr, store, space_used);   forward[last_word].bi = ptr;   free(store);   sum_of_all_bigram_counts = 0;   /* Generate backward lookup table */   if (trace & T_EXTRA) {      printf("Building bigram backward lookup table...");      fflush(stdout);   }   /* Allocate required storage space */   for (i=0; i<max_words; i++) {      backward[i].bi = get_space(backward[i].size * sizeof(bi_count));      backward[i].size = 0; /* Reset to use as counter when building data */   }   /* Run through all forward data, copying into backward array */   for (i=0; i<max_words; i++) {      for (j=0; j<forward[i].size; j++) {         backward_id = forward[i].bi[j].id;         backward[backward_id].bi[backward[backward_id].size].id = i;         backward[backward_id].bi[backward[backward_id].size].count                                            = forward[i].bi[j].count;         backward[backward_id].size++;         sum_of_all_bigram_counts += forward[i].bi[j].count;      }   }   if (trace & T_EXTRA) {      printf(" done\n");   }}/* Must be called before almost any other function in this file will work */void bigram_init(int words) /* Pass ->used field from word-map */{   max_words = words;   forward = CNew(&global_stack, words * sizeof(bigrams));   backward = CNew(&global_stack, words * sizeof(bigrams));   if (trace & T_MEM) {      printf("Bigram store for %d words created\n", words);   }   last_word = 0;   store_idx = 0;   curr_bistore_size = initial_bigram_buffer;   store = calloc(initial_bigram_buffer, sizeof(bi_count));   if (trace & T_MEM) {      printf("Bigram read buffer of %d entries created\n", initial_bigram_buffer);   }}/* Main program control function */int main(int argc, char *argv[]){   char *s;   float weight;   /* used when loading gram files */   char *filename; /* used when loading gram files */   int   iterations=1, loop;   char *init_cmap = NULL;   char *recover_from = NULL;   char *write_classprobs = NULL;   char *write_classcounts = NULL;   Boolean read_gram_files=FALSE; /* Has the user passed any gram files? */   Boolean set_classes = FALSE, loaded_map = FALSE; /* Check for -c and -l */   Boolean keep_unk_sep = FALSE; /* Was -k passed? */   Boolean passed_unk = FALSE; /* Unknown word was passed in */   int start_word_id, end_word_id, unknown_word_id;   int numb_classes, min_classes;   char *ptr, *ptr2; /* temp results */   /* Initialise HTK/HLM modules */   InitShell(argc, argv, Cluster_version, Cluster_vc_id);   InitMem();   InitMath();   InitWave();   InitLabel();   InitLUtil();   InitWMap();   InitGBase();   SetConfParms();   /* Default start, end and unknown words */   strcpy(sent_start, DEF_STARTWORD);   strcpy(sent_end, DEF_ENDWORD);   strcpy(unknown_w, DEF_UNKNOWNNAME);   /* Default number of classes */   numb_classes = classes_get_default();   /* Parse command line */   if (!InfoPrinted() && NumArgs() == 0)      ReportUsage();   if (NumArgs() == 0)      Exit(EXIT_FAILURE);   /* Create a global stack and heap */   CreateHeap(&global_stack, "Clusterer stack", MSTAK, 1, 0.0, 8192, 8192);   CreateHeap(&global_heap, "Clusterer heap", MHEAP, block_grab_size, 0.0, 1, 1);   while (NextArg() == SWITCHARG) {      s = GetSwtArg();      if (strlen(s) !=1 )         HError(17019, "Cluster: Bad switch %s; must be single letter",s);      switch(s[0]) {         case 'c':            if (NextArg()!=INTARG)               HError(17019,"Cluster: number of categories expected for -c");	    numb_classes = GetIntArg();            classes_set_number(numb_classes);	    set_classes = TRUE;            break;         case 'i':            if (NextArg()!=INTARG)               HError(17019,"Cluster: number of iterations expected for -i");            iterations = GetIntArg();            break;          case 'r':            if (NextArg()!=INTARG)               HError(17019,"Cluster: recovery export frequency expected for -r");            rec_freq = GetIntArg();            break;         case 'm':	    classes_showMLV(1);	    break;         case 'o':            if (NextArg()!=STRINGARG)               HError(17019,"Cluster: output filename prefix expected for -o");            set_output_prefix(GetStrArg());            break;         case 'p':            if (NextArg()!=STRINGARG)               HError(17019,"Cluster: output filename expected for -p");            write_classprobs = GetStrArg();            break;         case 'q':            if (NextArg()!=STRINGARG)               HError(17019,"Cluster: output filename expected for -q");            write_classcounts = GetStrArg();            break;         case 'l':            if (NextArg()!=STRINGARG)               HError(17019,"Cluster: output filename prefix expected for -l");            init_cmap = GetStrArg();	    loaded_map = TRUE;            break;         case 's':            if (NextArg()!=STRINGARG)               HError(17019,"Cluster: sentence start word expected for -s");            strcpy(sent_start, GetStrArg());            break;         case 't':            if (NextArg()!=STRINGARG)               HError(17019,"Cluster: sentence end word expected for -t");            strcpy(sent_end, GetStrArg());            break;         case 'u':            if (NextArg()!=STRINGARG)               HError(17019,"Cluster: unknown word token expected for -u");            strcpy(unknown_w, GetStrArg());	    passed_unk = TRUE;            break;         case 'x':            if (NextArg()!=STRINGARG)               HError(17019,"Cluster: recovery filename expected for -x");            recover_from = GetStrArg();            break;         case 'w':            if (NextArg()!=STRINGARG)               HError(17019, "Cluster: wordmap sort order expected for -w");            strcpy(tmp, GetStrArg());            for (ptr=tmp; *ptr!=0; *ptr=toupper(*ptr), ptr++);            if (strcmp(tmp, "WMAP")==0) {               sort_order = SORT_WMAP;            }            else if (strcmp(tmp, "FREQ")==0) {               sort_order = SORT_FREQ;            }            else {               HError(17019, "Cluster: -w expects either WMAP or FREQ");            }            break;         case 'k':	    classes_keep_unk_separate(TRUE);	    keep_unk_sep = TRUE;            break;         case 'v':	    verbose = TRUE;            break;         case 'n':	    write_logfile = !write_logfile;            break;         case 'T':            trace = GetChkedInt(0,017,s); break;         default:            HError(17019,"Cluster: Unknown switch %s",s);      }   }   if (NextArg()!=STRINGARG)      HError(17019, "Cluster: word map file name expected");   CreateWordMap(GetStrArg(), &wmap, 0);   min_classes = 4 + (keep_unk_sep?1:0); /* Minimum number of classes */   if (loaded_map && set_classes) {      HError(-17019, "Ignoring -c option: when combined with -l the number of classes in the existing map must be used");   }   else if (numb_classes < min_classes) {      HError(17019, "It doesn't make sense to specify less than %d classes -\n    %d classes are reserved, and you need at least 2 more", min_classes, min_classes-2);   }   /* See if start and end word occur in the data */   if (!GetLabId(sent_start, FALSE)) {      HError(17051, "Sentence start token '%s' not in word list");   }   if (!GetLabId(sent_end, FALSE)) {      HError(17051, "Sentence end token '%s' not in word list");   }   /* We can't keep the unknown word in its own class if one wasn't passed */   if (!GetLabId(unknown_w, FALSE) && keep_unk_sep) {      HError(17051, "Unknown word token '%s' not in word list and -k passed", unknown_w);   }   /* And generate a sensible warning if necessary: */   if (!GetLabId(unknown_w, FALSE) && passed_unk) {      HError(-17051, "Unknown word token '%s' was explicitly given with -u, but does not occur in the word map", unknown_w);   }   start_word_id = GetMEIndex(&wmap, (((MapEntry *)(GetLabId(sent_start, FALSE)->aux))->ndx));   end_word_id = GetMEIndex(&wmap, (((MapEntry *)(GetLabId(sent_end, FALSE)->aux))->ndx));   if (keep_unk_sep) {      unknown_word_id = GetMEIndex(&wmap, (((MapEntry *)(GetLabId(unknown_w, FALSE)->aux))->ndx));   }   else {      unknown_word_id = 0;   }   set_ids(start_word_id, end_word_id, unknown_word_id);   /* If we're doing no iterations we want to ignore the given filename      prefix and use the one from the classmap - this way we'll write the      correct information into the saved probabilities file header */   if (iterations==0 && init_cmap) {      ptr = strrchr(init_cmap, '.');      if (ptr) {         *ptr = '\0';         ptr2 = strrchr(init_cmap, '.');         if (ptr2) {            *ptr2 = '\0';            set_output_prefix(init_cmap);            *ptr2 = '.';         }         else            set_output_prefix(init_cmap);         *ptr = '.';      }      else {         set_output_prefix(init_cmap);      }   }   if (trace & T_FILE) {      printf("Wordmap loaded - %d words\n", wmap.used);   }   unigram_init(wmap.used);   bigram_init(wmap.used);   /* Add input gram files to input set */   if (trace & T_TOP)      printf("Preparing input gram set\n");   CreateHeap(&imem, "inputset", MSTAK, 1, 0.0, 1000, 1000);   CreateHeap(&imem2, "inputset2", MSTAK, 1, 0.0, 1000, 1000);   CreateInputSet(&imem, &wmap, &inset);   CreateInputSet(&imem2, &wmap, &inset2);   weight = 1.0;   while (NextArg() == STRINGARG || NextArg() == FLOATARG) {      if (NextArg() == FLOATARG) {         weight = GetFltArg();      }      if (weight==0.0 || weight<-10000.0 || weight>10000.0) {         HError(17019, "Improbable gram file weight (%.4f)", weight);      }      if (NextArg()!=STRINGARG) {         HError(17019,"Gram file name expected");      }      filename = GetStrArg();      AddInputGFile(&inset, filename, weight);      AddInputGFile(&inset2, filename, weight);      read_gram_files = TRUE;      if (trace & T_TOP)         printf("Input gram file %s added (weight=%f)\n", filename, weight);   }   if (!read_gram_files) {      HError(17019, "No gram files passed");   }   LoadBiGrams();   LoadUniGrams();   bigram_added_all();   DeleteHeap(&imem);   DeleteHeap(&imem2);   if (init_cmap) {      import_classmap(init_cmap, wmap.used);   }   else if (recover_from) {      do_recovery(recover_from, wmap.used);   }   /* Allocate memory and compute bigram pair arrays */   if (!recover_from) {      classes_init(wmap.used);      /* Perform default initial clustering */      if (!init_cmap) {         initial_cluster();      }      /* Calculate initial counts required */      setup_all_counts();   }   /* Run clustering algorithm */   for (loop=0; loop<iterations; loop++) {      cluster_words(1);      export_classes(0);   }   if (write_classprobs) {      write_word_probs(write_classprobs);   }   if (write_classcounts) {      write_word_counts(write_classcounts);   }   if (trace &
上一页 1 2 3 45
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -