📄 ladapt.c
字号:
/* ----------------------------------------------------------- *//* *//* ___ *//* |_| | |_/ SPEECH *//* | | | | \ RECOGNITION *//* ========= SOFTWARE */ /* *//* *//* ----------------------------------------------------------- *//* developed at: *//* *//* Speech Vision and Robotics group *//* Cambridge University Engineering Department *//* http://svr-www.eng.cam.ac.uk/ *//* *//* main authors: Valtcho Valtchev, Steve Young, *//* Julian Odell, Gareth Moore *//* ----------------------------------------------------------- *//* Copyright: *//* *//* 1994-2002 Cambridge University *//* Engineering Department *//* *//* Use of this software is governed by a License Agreement *//* ** See the file License for the Conditions of Use ** *//* ** This banner notice must not be removed ** *//* *//* ----------------------------------------------------------- *//* File: LAdapt.c - adapt LM with new text *//* ----------------------------------------------------------- */char *ladapt_version = "!HVER!LAdapt: 3.3 [CUED 28/04/05]";char *ladapt_vc_id = "$Id: LAdapt.c,v 1.1.1.1 2005/05/12 10:52:19 jal58 Exp $";#include "HShell.h"#include "HMem.h"#include "HMath.h"#include "HWave.h"#include "HLabel.h"#include "LWMap.h"#include "LCMap.h"#include "LGBase.h"#include "LUtil.h"#include "LModel.h"#include "LPCalc.h"#include "LPMerge.h"/* This tool processes source texts and updates an existing LM. Text passes through a window word by word and each n-gram is recorded. The text in the window can also be modified by match and replace rules and in this case the ngrams in the original matched text are stored in a set of 'negative' gram files and the ngrams in the modified text are stored in a set of 'positive' gram files.*/ /* -------------------------- Trace Flags ------------------------ */static int trace = 0;#define T_TOP 0001 /* Top Level tracing */#define T_SAV 0002 /* Monitor Buffer Saving */#define T_INP 0004 /* Trace word input stream */#define T_SHR 0010 /* Trace shift register input *//* ---------------- Configuration Parameters --------------------- */static ConfParam *cParm[MAXGLOBS];static int nParm = 0; /* total num params *//* ------------------- Word Shift Registers ----------------------- */typedef struct { int used; /* actual words in register */ UInt ng[MAXNG+1]; /* ng[0] is oldest word */ NGBuffer *ngb; /* output ngram buffer */} ShiftReg;/* ---------------------- Global Variables ----------------------- */static int nSize = 3; /* ngram size */static int ngbSize = 2000000; /* ngram buffer size */static int newWords = 100000; /* max new words to accommodate */static char *rootFN = "gram"; /* gbase root file name */static char *outFN = NULL; /* output LM filename */static char *dbsDir = NULL; /* directory to store gbase files */static char *wlistFN = NULL; /* file containing edit rules */static char *omapFN = "wmap"; /* output word map file name */static char *txtSrc = NULL; /* gram file text source descriptor */static MemHeap langHeap; /* memory for NGBuffers and LMs*/static BackOffLM *newLM; /* the generated LM */static BackOffLM *adpLM; /* the adapted final LM */static WordMap *tgtVoc = NULL; /* target vocabulary */ static WordMap wlist; /* restricting the word list */static ShiftReg stdBuf; /* used for normal N-gram processing */static Boolean pruneWords = FALSE; /* prune input text according to word list */static Boolean saveFiles = TRUE; /* save intermediate files */ static Boolean htkEscape = TRUE; /* string escaping for output word map */static Boolean mapUpdated; /* used optimise sort/saving */static Boolean processText = TRUE; /* generate model from raw text data */static char *defMapName = "LAdapt"; /* map name */static LabId unkId = NULL; /* OOV marker */static char unkStr[256] = DEF_UNKNOWNNAME; /* OOV class string *//* This MAX_NGRAM_FILES limit is arbitrary and can be removed */#define MAX_NGRAM_FILES 4096static int nLModel; /* number of loaded LMs */static LMInfo lmInfo[MAX_LMODEL]; /* array of loaded LMs */static WordMap wmap; /* word map for this corpus */static NGInputSet inSet; /* input set of files */static BuildInfo binfo; /* build parameters *//* ---------------- Process Command Line ------------------------- *//* SetConfParms: set conf parms relevant to this tool */void SetConfParms(void){ int i; char s[256]; nParm = GetConfig("LADAPT", TRUE, cParm, MAXGLOBS); if (nParm>0){ if (GetConfInt(cParm,nParm, "TRACE",&i)) trace = i; if (GetConfStr(cParm,nParm, "UNKNOWNNAME",s)) strcpy(unkStr,s); } }char *ReturnLMName(int fmt){ switch(fmt) { case LMF_TEXT: return LM_TXT_TEXT; case LMF_BINARY: return LM_TXT_BINARY; case LMF_ULTRA: return LM_TXT_ULTRA; default: return LM_TXT_OTHER; } }void ReportUsage(void){ printf("\nUSAGE: LAdapt [options] langModel txtfile ....\n\n"); printf(" Option Default\n\n"); printf(" -a n allow n new words in input text %d\n", newWords); printf(" -b n set ngram buffer size %d\n", ngbSize); printf(" -c n c set pruning for n-gram to c %d\n", DEF_CUTOFF); printf(" -d s set root n-gram data file name %s\n", rootFN); printf(" -f s set output LM format to s %s\n", ReturnLMName(DEF_SAVEFMT)); printf(" -g use existing n-gram files off\n"); printf(" -i f s interpolate with model s, weight f off\n"); printf(" -j n c set weighted discount pruning to c off\n"); printf(" -n n set n-gram size %d\n", nSize);#ifndef HTK_TRANSCRIBER printf(" -s s store s in gram header source flds none\n"); printf(" -t use Turing-Good discounting off\n");#endif printf(" -w fn load word list from fn none\n");#ifndef HTK_TRANSCRIBER printf(" -x save model with counts off\n");#endif PrintStdOpts(""); printf("\n\n");}int main(int argc, char *argv[]){ int i; char *c,*s,*fn; char sBuf[256],fmt[256]; void Initialise(void); void ProcessText(char *fn,Boolean lastFile); Boolean Exists(char *fn); BackOffLM *CombineModels(MemHeap *heap,LMInfo *lmi,int nLModel,int nSize,WordMap *wl) ; InitShell(argc,argv,ladapt_version,ladapt_vc_id); InitMem(); InitMath(); InitWave(); InitLabel(); InitLUtil(); InitWMap(); InitGBase(); InitLModel(); InitPCalc(); InitPMerge(); SetConfParms(); if (!InfoPrinted() && NumArgs() == 0) ReportUsage(); if (NumArgs() == 0) Exit(EXIT_SUCCESS); InitBuildInfo(&binfo); binfo.dctype = DC_ABSOLUTE; nLModel = 1; while (NextArg() == SWITCHARG) { s = GetSwtArg(); if (strlen(s)!=1) HError(16419,"Bad switch %s; must be single letter",s); switch(s[0]){ case 'a': newWords = GetChkedInt(10,10000000,s); break; case 'b': ngbSize = GetChkedInt(10,10000000,s); break; case 'c': i = GetChkedInt(2,LM_NSIZE,s); binfo.cutOff[i] = GetChkedInt(0,1000,s); break; case 'd': if (NextArg()!=STRINGARG) HError(16419,"Gram base root file name expected"); rootFN = GetStrArg(); break; case 'f': strcpy(fmt, GetStrArg()); for (c=fmt; *c; *c=toupper(*c), c++); /* To uppercase */ if (strcmp(fmt, LM_TXT_TEXT)==0) binfo.saveFmt = LMF_TEXT; else if (strcmp(fmt, LM_TXT_BINARY)==0) binfo.saveFmt = LMF_BINARY; else if (strcmp(fmt, LM_TXT_ULTRA)==0) binfo.saveFmt = LMF_ULTRA; else HError(16419,"Unrecognised LM format, should be one of [%s, %s, %s]", LM_TXT_TEXT, LM_TXT_BINARY, LM_TXT_ULTRA); break; case 'g': processText = FALSE; break; case 'i': if (NextArg()!=FLOATARG) HError(16419,"Interpolation weight expected"); lmInfo[nLModel].weight = GetChkedFlt(0.0,1.0,s); if (NextArg()!=STRINGARG) HError(16419,"Interpolation LM filename expected"); lmInfo[nLModel].fn = GetStrArg(); nLModel++; break; case 'j': i = GetChkedInt(2,LM_NSIZE,s); binfo.wdThresh[i] = GetChkedFlt(0.0,1E10,s); break; case 'n': nSize = GetChkedInt(1, MAXNG, s); break;#ifdef HTK_TRANSCRIBER case 's': if (NextArg()!=STRINGARG) HError(16419,"Gram file text source descriptor expected"); txtSrc = GetStrArg(); break; case 't': binfo.dctype = DC_KATZ; break;#endif case 'w': if (NextArg()!=STRINGARG) HError(16419,"Word list file name expected"); wlistFN = GetStrArg(); break;#ifndef HTK_TRANSCRIBER case 'x': binfo.ptype = LMP_COUNT; break;#endif case 'T': trace = GetChkedInt(0,077,s); break; default: HError(16419,"LAdapt: Unknown switch %s",s); } }#ifdef HTK_TRANSCRIBER if (nLModel==1) { /* must interpolate with at least one model */ HError(16419,"LAdapt: at least one model must be specified with -i option"); } if (binfo.saveFmt==LMF_TEXT) { /* save fomat cannot be TEXT */ binfo.saveFmt=LMF_BINARY; }#endif if (NextArg() != STRINGARG)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -