📄 cluster.c

📁 该压缩包为最新版htk的源代码,htk是现在比较流行的语音处理软件,请有兴趣的朋友下载使用
💻 C
📖 第 1 页 / 共 5 页
字号:
12 3 4 5 下一页
/* ----------------------------------------------------------- *//*                                                             *//*                          ___                                *//*                       |_| | |_/   SPEECH                    *//*                       | | | | \   RECOGNITION               *//*                       =========   SOFTWARE                  */ /*                                                             *//*                                                             *//* ----------------------------------------------------------- *//* developed at:                                               *//*                                                             *//*      Speech Vision and Robotics group                       *//*      Cambridge University Engineering Department            *//*      http://svr-www.eng.cam.ac.uk/                          *//*                                                             *//* author: Gareth Moore <glm20@eng.cam.ac.uk>                  *//*                                                             *//* ----------------------------------------------------------- *//*         Copyright:                                          *//*                                                             *//*          1999-2002 Cambridge University                     *//*                    Engineering Department                   *//*                                                             *//*   Use of this software is governed by a License Agreement   *//*    ** See the file License for the Conditions of Use  **    *//*    **     This banner notice must not be removed      **    *//*                                                             *//* ----------------------------------------------------------- *//*            Cluster.c: Cluster words into classes            */char *Cluster_version = "!HVER!Cluster:   3.3 [CUED 28/04/05]";char *Cluster_vc_id = "$Id: Cluster.c,v 1.1.1.1 2005/05/12 10:52:19 jal58 Exp $";/* HTK/HLM libraries: */#include "HShell.h"#include "HMem.h"#include "HMath.h"#include "HWave.h"#include "HLabel.h"#include "LUtil.h"#include "LWMap.h"#include "LGBase.h"#include "LModel.h"#include "LCMap.h"/* Uncomment the following line to run integrity checks on each iteration   to ensure that:   * The class counts all add up correctly   * The maximum-likelihood values have all been updated correctly     *//*#define INTEGRITY_CHECK *//* -------------------------- Trace Flags ------------------------ */#define T_TOP   00001              /* Basic tracing */#define T_FILE  00002              /* Report major file operations */#define T_EXTRA 00004              /* Extra tracing */#define T_BOND    007              /* Undercover tracing */#define T_MEM   00010              /* Trace memory usage *//* Constants *//* Size of blocks we grab from New() and then allocate internally   (This is done to avoid grabbing around 100,000 small blocks!) #bytes */#define block_grab_size        1048576/* Cut-off point at which we decide not to use the internal block and   just use New() - number of bigrams using a given word */#define block_cut_off          (200*sizeof(bi_count))/* Initial size of bigram read buffer (large enough to hold maximum number   of bigrams featuring a single word in a single position) - can grow */#define initial_bigram_buffer  10000/* Granularity of growth of above buffer, if required */#define bigram_buffer_grow     1000/* Identifiers for word clustering sort orders */#define SORT_WMAP 1#define SORT_FREQ 2/* Type definitions *//* Bigram count */typedef struct {   UInt id;	  /* Word id */   int  count;	  /* Bigram count */}bi_count;/* All bigrams which start or end with a certain word */typedef struct {   bi_count *bi;   /* Array of counts */   int       size; /* Number of bigrams with this word in */}bigrams;typedef UInt unigram;   /* Occurrence count *//* ---------------------- Global Variables ----------------------- *//* DEFAULTS *//* Global variables - defaults */static int         N = 1000;                /* Default number of classes */static Boolean     show_MLV=FALSE;          /* Show MLV after each change */static char       *export_prefix="cluster"; /* Prefix of export filenames */static Boolean     unk_sep = FALSE;         /* Keep unknown word in its own class? */static Boolean     outCMapRaw = FALSE;      /* Output classes in raw mode */static Boolean     inCMapRaw = FALSE;       /* Input classes in raw mode *//* Global variables - others *//* Used by core clusterer */static int       **clCnt=NULL;              /* Array of arrays; index with count[c1][c2]                                               (clCnt = 'class count') */static int        *tmp_c1=NULL;             /* Temporary set of bigrams (1) */static int        *tmp_c2=NULL;             /* Temporary set of bigrams (2) */static int        *tmp_c3=NULL;             /* Temporary set of bigrams (3) */static int        *tmp_c4=NULL;             /* Temporary set of bigrams (4) */static int        *tmp_sum1=NULL;           /* Temporary word-class counts (1) */static int        *tmp_sum2=NULL;           /* Temporary word-class counts (2) */static int        *clSum=NULL;              /* Class unigram [classes]                                               returns word unigram sum */static int	  *clMemb=NULL;             /* Class membership [words]                                               returns class given a word */static int         GwGw, gGw, Gwg, gg;      /* Special-case class counts */static double     *mlv;                     /* ML values involving class [N] */static int        *bipair;                  /* Array of word bigrams (w,w) */static int         sum_of_all_bigram_counts;/* Sum of all bigram counts */static int         sum_of_all_uni_counts;   /* Sum of all unigram counts */static int         curr_class;              /* Temporary value, saves passing */static int         start_class = 2;         /* Which is the first 'real' class? */static double      curr_MLV=0;              /* ...and its current value */static int         W = 0;     		    /* Number of words */static bigrams    *forward, *backward;      /* Forward and backward bigram tables */static int         export_index=0;          /* What iteration is this? */static FILE       *logfile=NULL;            /* Log progress to this file */static char        tmp[256];                /* Scrap array */static int         start_id=-1, end_id=-1;  /* Start and end word ids */static int         unk_id=-1;               /* Unknown word token id */static MemHeap     global_heap;             /* Claim fixed block memory from here */static MemHeap     global_stack;            /* Claim other memory from here *//* Used by uni/bigram storage */static unigram     *uni;                    /* Unigram store */static int          max_words;              /* Maximum number of words */static bigrams     *forward=0, *backward;    /* Forward and backward bigram tables */static void        *block=0;                /* First word of free memory we have */static void        *block_end=0;            /* First byte after current block */static UInt         last_word;              /* ID of last word (w,?) read in */static int          store_idx;              /* Next index of second word in bigram */static bi_count    *store;                  /* Store of current word w (w,*) pairs */static int          curr_bistore_size;      /* Current size of bigram buffer store *//* Front-end code */static WordMap      wmap;                   /* HTK word map */static MemHeap      imem;                   /* memory for input gram file set */static MemHeap      imem2;                  /* memory for input gram file set (copy) */static NGInputSet   inset;                  /* input gram file set */static NGInputSet   inset2;                 /* input gram file set (copy) */static char         sent_start[256];        /* sentence start word */static char         sent_end[256];          /* sentence end word */static char         unknown_w[256];         /* unknown word token */static ConfParam   *cParm[MAXGLOBS];        /* configuration script parameters */static int          nParm = 0;              /* total num params */static int          trace = 0;              /* trace setting */static UInt        *class_sort;	            /* Used to sort output alphabetically */static Boolean      pipe_logfile;           /* HShell file handling - using pipe? */static int          rec_freq = 1000;        /* Frequency we write recovery files (0 = off) */static Boolean      verbose = FALSE;        /* Verbose file logging */static Boolean      write_logfile = TRUE;   /* Write a log file during execution */static int          sort_order = SORT_WMAP; /* Order words are considered in */static int         *sort_uni;               /* Sort unigrams by count */static Boolean     outCMapRawTrap = FALSE;  /* Has this been changed by config file? */static Boolean     inCMapRawTrap = FALSE;   /* Has this been changed by config file? *//* ---------------- Function Prototypes -------------------------- */#ifdef INTEGRITY_CHECKstatic void check_counts_sum(void);static void max_likelihood_check(void);#endifstatic void max_likelihood_init(void);/* Add a bigram */void bigram_add(NGram ng, int count);/* Call when all bigrams have been passed in */void bigram_added_all(void);/* Must be called before almost any other function in this file will work */void bigram_init(int words);/* Initialise this unigram storage module */void unigram_init(int numb_words);/* Add a unigram */void unigram_add(NGram ng, int count);/* Read a unigram */UInt unigram_read(UInt id);/* Set whether to show MLV or not (non-zero = on) */void classes_showMLV(int on);/* Set prefix for all output files */void set_output_prefix(char *name);/* Return the number of classes used by default */int classes_get_default(void);/* Set the number of classes used */void classes_set_number(int numb);/* Initialise this module - MUST have initialised bigrams first */void classes_init(int numb_words);/* Perform a given number of iterations of the clustering algorithm */void cluster_words(int iterations);/* Setup all class counts, given existing class word map */void setup_all_counts(void);/* Perform some initial clustering - currently just puts all in one class,   except for given start, end and unknown (if -k passed) ids */void initial_cluster(void);/* Write out class sets (pass non-zero to write recovery file) */void export_classes(int recovery);/* Import existing HLM classmap */void import_classmap(char *fname, int numb_words);/* Recover from a given recovery file */void do_recovery(char *fname, int words);/* Write out p(word | class) probabilities */void write_word_probs(char *filename);/* Write out p(word | class) counts */void write_word_counts(char *filename);/* Specify whether to keep the unknown word in its own solo-member   class or not (non-zero = keep separate) */void classes_keep_unk_separate(int keep_separate);/* Pass in start, end and unknown word ids */void set_ids(int start_id, int end_id, int unk_id);/* Report an error message to stderr */void report_error(char *text);/* TEMP? */char *what_is_word(UInt id); /* In Cluster.c *//* ---------------- Process Command Line ------------------------- *//* See if any configuration parameters have been set for this tool */void SetConfParms(void){   char b[256];   int  i;   nParm = GetConfig("CLUSTER", TRUE, cParm, MAXGLOBS);   if (nParm>0) {      if (GetConfInt(cParm,nParm,"TRACE", &i))      trace = i;      if (GetConfStr(cParm,nParm,"STARTWORD", b))   strcpy(sent_start, b);      if (GetConfStr(cParm,nParm,"ENDWORD", b))     strcpy(sent_end, b);      if (GetConfStr(cParm,nParm,"UNKNOWNNAME", b)) strcpy(unknown_w, b);      if (GetConfBool(cParm,nParm,"INCMAPRAW", &inCMapRaw)) {         inCMapRawTrap = TRUE;      }      if (GetConfBool(cParm,nParm,"OUTCMAPRAW", &outCMapRaw)) {         outCMapRawTrap = TRUE;      }   }}/* Provide skeleton help */void ReportUsage(void){   printf("\nUSAGE: Cluster [options] mapfile gramfile ...\n\n");   printf(" Option                                       Default\n");   printf(" -c n    use n classes                        %d\n", classes_get_default());   printf(" -i n    perform n iterations                 1\n");   printf(" -k      put unknown word in a separate class off\n");   printf(" -l f    start from existing classmap 'f'     off\n");   printf(" -m      add running ML values to logfile     %s\n", show_MLV?"on":"off");   printf(" -n      do not produce any logfile output    %s\n", write_logfile?"off":"on");   printf(" -o f    set prefix of output files           %s\n", export_prefix);   printf(" -p f    write word|class probs to file 'f'   off\n");   printf(" -q f    write word|class counts to file 'f'  off\n");   printf(" -r n    write recovery file freq (0=off)     %d\n", rec_freq);   printf(" -s t    specify sentence start word as 't'   %s\n", DEF_STARTWORD);   printf(" -t t    specify sentence end word as 't'     %s\n", DEF_ENDWORD);   printf(" -u t    specify unknown word token as 't'    %s\n", DEF_UNKNOWNNAME);   printf(" -v      use verbose log file format          %s\n", verbose?"on":"off");   printf(" -w t    specify word sort order - WMAP/FREQ  %s\n", sort_order==SORT_WMAP?"WMAP":"FREQ");   printf(" -x f    continue from recovery file 'f'      off\n");   printf(" Standard options:\n");   PrintStdOpts("");   printf("\n");}void check_file(FILE *file, char *fname, char *function){   if (!file)      HError(17011, "%s: Can't open file '%s'", function, fname);}/* --------------------- Import N-grams ----------------- *//* LoadBiGrams: load in N-gram files, keeping only bigrams */static void LoadBiGrams(){   UInt   ng[2];   float  cnt; /* Occurrence count */   int    added=0;   if (trace & T_FILE) {      printf("Loading bigrams from N-gram files\n");   }   OpenInputSet(&inset);   if (trace & T_FILE) {      printf("Opened input set of %d entries\n", inset.nFiles);   }   while (GetNextNGram(&inset, ng, &cnt, 2)) {      /* ng stores ngram in format [0],[1]...[N]; count is separate */      ng[0] = GetMEIndex(&wmap, ng[0]);      ng[1] = GetMEIndex(&wmap, ng[1]);      bigram_add(ng, (int) cnt);      added++;   }   CloseInputSet(&inset);   if (trace & T_FILE) {      printf("Bigram load complete - %d bigrams imported\n", added);   }}/* LoadUniGrams: load in N-gram files - we want unigrams */static void LoadUniGrams(){   UInt   ng[1];   float  cnt; /* Occurrence count */   int    added=0;   if (trace & T_FILE) {      printf("Loading unigrams from N-gram files\n");   }   OpenInputSet(&inset2);   while (GetNextNGram(&inset2, ng, &cnt, 1)) {      /* ng stores ngram in format [0],[1]...[N]; count is separate */      ng[0] = GetMEIndex(&wmap, ng[0]); /* convert into value indexed from 0 */      unigram_add(ng, (int) cnt);      added++;   }   CloseInputSet(&inset2);   if (trace & T_FILE) {      printf("Unigram load complete - %d unigrams imported\n", added);   }}/* Return word text given an internal id */char *what_is_word(UInt id){   return wmap.id[id]->name;}/* Return a word id given a word */UInt get_id_from_word(char *word){   if (!(GetLabId(word, FALSE))) {      HError(17050, "Word '%s' found in class map but not in word map", word);   }   return GetMEIndex(&wmap, (((MapEntry *)(GetLabId(word, FALSE)->aux))->ndx));}/* Class functions *//* Set whether to show MLV or not */void classes_showMLV(int on){   show_MLV = on ? TRUE : FALSE;}/* Set prefix for all output files */void set_output_prefix(char *name){   if (clCnt) {      HError(-17099, "set_output_prefix(): this function must be called before initialisation");      /* No need to abort - it will just affect future files opened */   }   export_prefix = New(&global_stack, strlen(name)+1);   strcpy(export_prefix, name);}
12 3 4 5 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -