lgprep.c

来自「该压缩包为最新版htk的源代码,htk是现在比较流行的语音处理软件,请有兴趣的朋友下载使用」· C语言代码 · 共 792 行 · 第 1/2 页
792 行
/* ----------------------------------------------------------- *//*                                                             *//*                          ___                                *//*                       |_| | |_/   SPEECH                    *//*                       | | | | \   RECOGNITION               *//*                       =========   SOFTWARE                  */ /*                                                             *//*                                                             *//* ----------------------------------------------------------- *//* developed at:                                               *//*                                                             *//*      Speech Vision and Robotics group                       *//*      Cambridge University Engineering Department            *//*      http://svr-www.eng.cam.ac.uk/                          *//*                                                             *//* main authors: Valtcho Valtchev, Steve Young,                *//*               Julian Odell, Gareth Moore                    *//* ----------------------------------------------------------- *//*         Copyright:                                          *//*                                                             *//*          1994-2002 Cambridge University                     *//*                    Engineering Department                   *//*                                                             *//*   Use of this software is governed by a License Agreement   *//*    ** See the file License for the Conditions of Use  **    *//*    **     This banner notice must not be removed      **    *//*                                                             *//* ----------------------------------------------------------- *//*         File: LGPrep - prepare a sorted GramBase           *//* ----------------------------------------------------------- */char *lgprep_version = "!HVER!LGPrep:   3.3 [CUED 
28/04/05]";char *lgprep_vc_id = "$Id: LGPrep.c,v 1.2 2005/05/12 15:51:21 jal58 Exp $";#include "HShell.h"#include "HMem.h"#include "HMath.h"#include "HWave.h"#include "HLabel.h"#include "LUtil.h"#include "LWMap.h"#include "LGBase.h"#include "LModel.h"/*    This tool processes source texts and saves sorted n-gram counts.   Text passes thru a window word by word and each n-gram is recorded.   The text in the window can also be modified by match and replace rules    and in this case, the n-grams in the original matched text are stored    in a set of 'negative' gram files and the n-grams in the modified text    are stored in a set of 'positive' gram files.*/   /* -------------------------- Trace Flags ------------------------ */static int trace = 0;#define T_TOP  0001     /* Top Level tracing */#define T_SAV  0002     /* Monitor Buffer Saving */#define T_INP  0004     /* Trace word input stream */#define T_SHR  0010     /* Trace shift register input */#define T_RIN  0020     /* Rule input monitoring */#define T_RUL  0040     /* Print rule set */#define T_MEM  0100     /* Print heap stats *//* ---------------- Configuration Parameters --------------------- */static ConfParam *cParm[MAXGLOBS];static int nParm = 0;            /* total num params *//* ------------------- Edit Rule Definitions --------------------- */#define MAX_FIELDS   256      /* max number of fields in a rule */#define MAX_ITEMS    256      /* max number of items in any one set */#define MAX_SETS     256      /* max number sets */typedef enum {             /* tags for rule fields */   f_WORD,                    /* literal word */   f_WILD,                    /* wildcard */   f_WSET,                    /* in word set */   f_NWSET,                   /* not in word set */   f_FIELD,                   /* slot contents */   f_NONE} FieldOp;typedef union {            /* contents of a field */   int   flid;                /* field index */   int   setid;               /* set index */   LabId wdid;              
  /* word literal */} FieldItem;typedef struct {           /* set definition */   int nItem;                 /* num words in set */   LabId item[MAX_ITEMS];     /* list of words */} SetDef;typedef struct {           /* field list (match or replace part) */   int n;                     /* number of fields in list */   FieldOp   fop[MAX_FIELDS]; /* tag/operation for each field */   FieldItem fdt[MAX_FIELDS]; /* actual data in each field */} FieldVec;typedef struct ruledef{    /* rule definition */   float pact;                /* % applic factor */   float psum;                /* accumulator */   FieldVec src;              /* match part of rule */   FieldVec tgt;              /* replacement part of rule */   struct ruledef *next;} RuleDef;typedef struct {           /* rule set */   MemHeap mem;               /* Memory for this ruleset */   int nRules;                /* number of rules */   RuleDef * head;            /* head of list of rules */   RuleDef * tail;            /* tail of list of rules */   int nSets;                 /* number of word sets actually defined */   SetDef **setlist;          /* array[0..MAX_SETS-1] of -> SetDef */} RuleSet;/* ------------------- Word Shift Registers ----------------------- */typedef struct {   int used;                  /* actual words in register */   UInt ng[MAXNG+1];          /* ng[0] is oldest word */   NGBuffer *ngb;             /* output ngram buffer */} ShiftReg;/* ---------------------- Global Variables ----------------------- */static int nSize     = 3;           /* ngram size */static int ngbSize   = 2000000;     /* ngram buffer size */static int egbSize   =  100000;     /* edited ngram buffer size */static int newWords  =  100000;     /* max new words to accommodate */static char *rootFN  = "gram";      /* gbase root filename */static int  dumpOfs  = 0;           /* initial numeric ext of gbase files */static char *dbsDir  = NULL;        /* directory to store gbase files */static char *ruleFN  = NULL;        /* file 
containing edit rules */static char *omapFN  = NULL;        /* output word map filename */static char *imapFN  = NULL;        /* input word map filename */static char *txtsrc  = NULL;        /* gram file text source descriptor */static Boolean gbGen = TRUE;        /* flag to enable GBase generation */static Boolean forceCnts = FALSE;   /* force the output of word counts */static Boolean htkEscape = TRUE;    /* default escaping */static Boolean tagSentStart = FALSE;/* tag senetence start words with _ */static WordMap wmap;                /* word map for this corpus */static Boolean mapUpdated;          /* used optimise sort/saving */static RuleSet rset;                /* rule set if any */static ShiftReg stdBuf;             /* used for normal N-gram processing */static ShiftReg posBuf;             /* N-grams from edited text */static ShiftReg negBuf;             /* N-grams from matched source text */static MemHeap ngbHeap;             /* memory for NGBuffers */static int editWinSize = 0;         /* size of edit window */static int editUsed;                /* number of words in editPipe */static LabId sstId = NULL;          /* sentence start id */static LabId editBuf[MAX_FIELDS];   /* edit buffer for input text */static int wordnum = 0;/* ---------------- Prototype functions -------------------------- */void Initialise(void);void ProcessText(char *fn, Boolean lastFile);/* ---------------- Process Command Line ------------------------- *//* SetConfParms: set configuration parameters relevant to this tool */void SetConfParms(void){   int i;   static char b[100];   sstId = GetLabId(DEF_STARTWORD,TRUE);   nParm = GetConfig("LGPREP", TRUE, cParm, MAXGLOBS);   if (nParm>0) {      if (GetConfInt(cParm,nParm,"TRACE",&i)) trace = i;      if (GetConfStr(cParm,nParm,"STARTWORD",b)) sstId = GetLabId(b, TRUE);   }}/* ReportUsage: Tool help */void ReportUsage(void){   printf("\nUSAGE: LGPrep [options] wmap [txtfile] ....\n\n");   printf(" Option                                     
  Default\n\n");   printf(" -a n    allow n new words in input text      %d\n", newWords);   printf(" -b n    set gram buffer size                 %d\n", ngbSize);   printf(" -c      force counts into wordmap on update  same as input\n");   printf(" -d s    output gram file directory           current directory\n");   printf(" -e n    edited gram buffer size              %d\n", egbSize);   printf(" -f s    fix text source using rules in s     off\n");   printf(" -h      disable HTK escaping on output       %s\n", htkEscape?"off":"on");   printf(" -i n    set output gram file start index     %d\n", dumpOfs);   printf(" -n n    set n-gram size                      %d\n", nSize);   printf(" -q      tag sentence start words with '_'    %s\n", tagSentStart?"on":"off");   printf(" -r s    set root gram filename               %s\n", rootFN);   printf(" -s s    store s in gram header source fields none\n");   printf(" -w s    write output map to s                wmap [or -d dir/wmap]\n");   printf(" -z      suppress gram file generation        %s\n", gbGen?"off":"on");   printf(" -Q      print rule summary help              off\n");   PrintStdOpts("");   printf("\n\n");}void RuleSummary(void){   printf("\nEdit Rule Syntax:\n");   printf("    <set-def>     = '#'<number> <word1> <word2> ... 
<wordN>.\n");   printf("    <rule-def>    = <app-factor> <match-def> : <repl-def>\n");   printf("    <match-def>   = { <word> | '*' | !<set> | %%<set> }\n");   printf("    <repl-def>    = { '$'<field> | string }\n");   printf("where\n");   printf("      <app-factor>   = float defining %% of matches to change\n");   printf("      <string>       => exact match of specified word\n");   printf("      *              => matches anything\n");   printf("      !<set>         => not in set\n");   printf("      %%<set>         => in set \n");   printf("      $<field>       => the value in field <number>\n");   printf("sets and fields are identified by a zero-based index.\n");   printf("Each set or rule def must be on a single line. \n\n");   Exit(0);}int main(int argc, char *argv[]){   char *s,*fn;   InitShell(argc,argv,lgprep_version,lgprep_vc_id);   InitMem();   InitMath();   InitWave();   InitLabel();   InitLUtil();   InitWMap();   InitGBase();   if (!InfoPrinted() && NumArgs() == 0)      ReportUsage();   if (NumArgs() == 0) Exit(EXIT_SUCCESS);   SetConfParms();   while (NextArg() == SWITCHARG) {      s = GetSwtArg();      if (strlen(s)!=1)          HError(16019,"Bad switch '%s' - must be single letter",s);      switch(s[0]){         case 'a':            newWords = GetChkedInt(10, 10000000, s); break;         case 'b':            ngbSize = GetChkedInt(10, 100000000, s); break;         case 'c':            forceCnts = TRUE; break;         case 'd':            if (NextArg() != STRINGARG)               HError(16019,"Output gram file directory expected with -d");            dbsDir = GetStrArg(); break;         case 'e':            egbSize = GetChkedInt(10, 100000000, s); break;         case 'f':            if (NextArg()!=STRINGARG)               HError(16019,"Rule filename expected with -f");            ruleFN = GetStrArg(); break;         case 'h':            htkEscape = FALSE; break;         case 'i':            dumpOfs = GetChkedInt(0, 100000, s); break;         case 'n':   
         nSize = GetChkedInt(1, MAXNG, s); break;         case 'q':	    tagSentStart=TRUE; break;         case 'r':            if (NextArg()!=STRINGARG)               HError(16019,"Gram base root filename expected with -r");            rootFN = GetStrArg(); break;         case 's':            if (NextArg()!=STRINGARG)               HError(16019,"Gram file text source descriptor expected with -s");            txtsrc = GetStrArg(); break;         case 'w':            if (NextArg()!=STRINGARG)               HError(16019,"Output word map filename expected with -w");            omapFN = GetStrArg(); break;         case 'z':            gbGen = FALSE; break;         case 'Q':            RuleSummary(); break;         case 'T':            trace = GetChkedInt(0,077,s); break;         default:            HError(16019,"LGPrep: Unknown switch '%s'",s);      }   }   if (NextArg() != STRINGARG)      HError(16019,"LGPrep: word map filename expected");   imapFN = GetStrArg();   if (omapFN == NULL) {     char path[256];     MakeFN("wmap",dbsDir,NULL,path);     omapFN = CopyString(&gstack,path);   }   Initialise();   if (NextArg() != STRINGARG)      ProcessText(NULL,TRUE);       /* input from stdin */   else      while (NextArg() == STRINGARG) {	/* Copy the string argument since it gets overwritten 	   by NextArg() when reading from script file */	fn = CopyString(&gstack,GetStrArg());	ProcessText(fn,NextArg() != STRINGARG);      }   if (NumArgs() != 0)      HError(-16019,"LGPrep: unused arguments left on command line");   if (trace&T_TOP) {      printf(" %d words processed\n",wordnum);   }   Exit(EXIT_SUCCESS);   return EXIT_SUCCESS; /* never reached -- make compiler happy */}/* ------------------------ Text Processing Routines -------------- *//* SkipToWord: scan string till first word or end of string */char *SkipToWord(char *s){   while (isspace((int) *s) && (*s != '\0')) ++s;   return s;}/* NextWord: extract next word from given string, returning ptr to next */char * 
NextWord(char *s, char *word){   while (isspace((int) *s) && (*s != '\0')) ++s;   if (*s == '\0') return NULL;   while (!isspace((int) *s) && (*s != '\0')) *word++ = *s++;   *word = '\0';   return s;}/* ----------------- Pattern Match/Replace Data Routines -------------- *//* InSet: return TRUE if wdid is in wset */Boolean InSet(SetDef *wset, LabId wdid){   int i;   for (i=0; i<wset->nItem; i++)      if (wdid == wset->item[i]) return TRUE;   return FALSE;}/* CreateRuleSet: initialise a rule set */void CreateRuleSet(RuleSet *rset){   int i;      CreateHeap(&(rset->mem),"ruleHeap",MSTAK,1,0.5,1000,10000);   rset->nRules = 0;   rset->head = rset->tail = NULL;   rset->nSets = 0;   rset->setlist = (SetDef **)New(&(rset->mem),sizeof(SetDef *)*MAX_SETS);   for (i=0; i<MAX_SETS; i++) rset->setlist[i] = NULL;}/* ReadSetDef: read set definition from s and add it to rule set */void ReadSetDef(char *s, RuleSet *rset){   SetDef *x;   char buf[256];   int n;      s = NextWord(s,buf);   n = atoi(buf);   if (s==NULL)      HError(16020,"ReadSetDef: no item in def for set %d",n);   if (trace&T_RIN) printf("  reading set %d: ",n);   if (n<0 || n>=MAX_SETS)      HError(16020,"ReadSetDef: set index %d out of range 0..%d",n,MAX_SETS);   if (rset->setlist[n] != NULL)      HError(16020,"ReadSetDef: set index %d already defined",n);   rset->setlist[n] = x = (SetDef *)New(&(rset->mem),sizeof(SetDef));   x->nItem = 0;   s = NextWord(s,buf);   while (s != NULL) {      x->item[x->nItem++] = GetLabId(buf,TRUE);      s = NextWord(s,buf);   }   rset->nSets++;   if (trace&T_RIN) {      printf("  %d elements read\n",x->nItem);      fflush(stdout);   }
lgprep.c - 源码说明

本页面展示了「该压缩包为最新版htk的源代码,htk是现在比较流行的语音处理软件,请有兴趣的朋友下载使用」中的 lgprep.c 源码文件，采用 C 语言编写，共 792 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与htk相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?