📄 hbuild.c
字号:
/* ----------------------------------------------------------- *//* *//* ___ *//* |_| | |_/ SPEECH *//* | | | | \ RECOGNITION *//* ========= SOFTWARE */ /* *//* *//* ----------------------------------------------------------- *//* Copyright: Microsoft Corporation *//* 1995-2000 Redmond, Washington USA *//* http://www.microsoft.com *//* *//* Use of this software is governed by a License Agreement *//* ** See the file License for the Conditions of Use ** *//* ** This banner notice must not be removed ** *//* *//* ----------------------------------------------------------- *//* File: HBuild.c: Word-Lattice Building *//* ----------------------------------------------------------- */char *hbuild_version = "!HVER!HBuild: 3.2 [CUED 09/12/02]";char *hbuild_vc_id = "$Id: HBuild.c,v 1.9 2002/12/19 16:37:40 ge204 Exp $";/* The HBuild program takes input files in a number of different formats and constructs suitable HTK word lattice files. The formats currently supported by HBuild include: a) Bigrams in either ARPA/Lincol-Labs format or HTK matrix format b) HTK Multi-Level lattices c) Word Lists for simple loops d) ARPA word-pair grammars (Resource Management style)*//* Trace Flags */#define T_TOP 0001 /* Top Level tracing */#include "HShell.h" /* HMM ToolKit Modules */#include "HMem.h"#include "HMath.h"#include "HSigP.h"#include "HAudio.h"#include "HWave.h"#include "HVQ.h"#include "HParm.h" #include "HLabel.h"#include "HModel.h"#include "HUtil.h" #include "HDict.h"#include "HNet.h"#include "HLM.h"typedef enum {unknown, wordLoop, boBiGram, matBiGram, multiLat, wordPair} BuildType;static int trace = 0; /* Trace flags */static LabId enterId; /* id of !ENTRY label in ngram */static LabId exitId; /* id of !EXIT label in ngram */static LabId bStartId=NULL; /* id of start bracket */static LabId bEndId=NULL; /* id of end bracket */static LabId unknownId; /* id of unknown label in ngram */static Boolean zapUnknown = FALSE; /* zap unknown symbols from bigram */MemHeap buildStack;/* ---------------- Configuration Parameters --------------------- */static ConfParam *cParm[MAXGLOBS];static int nParm = 0; /* total num params *//* ---------------- Process Command Line ------------------------- *//* SetConfParms: set conf parms relevant to this tool */void SetConfParms(void){ int i; nParm = GetConfig("HBUILD", TRUE, cParm, MAXGLOBS); if (nParm>0){ if (GetConfInt(cParm,nParm,"TRACE",&i)) trace = i; }}void ReportUsage(void){ printf("\nUSAGE: HBuild [options] wordList latFile\n\n"); printf(" Option Default\n\n"); printf(" -b binary lattice output ASCII\n"); printf(" -m s load matrix bigram from s off\n"); printf(" -n s load back-off bigram from s off\n"); printf(" -s s1 s2 s1/s2 are bigram start/end labels !ENTER !EXIT\n"); printf(" -t s1 s2 bracket word-loop/pair with s1 s2 off\n"); printf(" -u s set unknown symbol to s !NULL\n"); printf(" -w s load word-pair grammar from s off\n"); printf(" -x s load multi-level lattice from s off\n"); printf(" -z ignore ngrams with unknown symbol off\n"); PrintStdOpts(""); printf("\n\n");}int main(int argc, char *argv[]){ char *wordListFn,*latFn,*ipFn; LModel *bigramLm; BuildType bType = unknown; Boolean saveLatBin = FALSE; LatFormat format = HLAT_LMLIKE; Lattice *lat,*ipLat; Vocab voc; char *s; Lattice *ProcessWordLoop(MemHeap *latHeap, Vocab *voc); Lattice *ProcessBiGram(MemHeap *latHeap, Vocab *voc, LModel *biLM); void SaveLattice(Lattice *lat, char *latFn, LatFormat format); Lattice *LoadLattice(MemHeap *latHeap, char *latFn, Vocab *voc, Boolean shortArc); Lattice *ProcessWordPair(MemHeap *latHeap, Vocab *voc, char *fn); if(InitShell(argc,argv,hbuild_version,hbuild_vc_id)<SUCCESS) HError(3000,"HBuild: InitShell failed"); InitMem(); InitLabel(); InitMath(); InitDict(); InitNet(); InitLM(); CreateHeap(&buildStack, "HBuild Stack", MSTAK, 1, 0.0, 100000, LONG_MAX ); if (!InfoPrinted() && NumArgs() == 0) ReportUsage(); if (NumArgs() == 0) Exit(0); SetConfParms(); enterId=GetLabId("!ENTER",TRUE); /* All sentences should or are coerced */ exitId=GetLabId("!EXIT",TRUE); /* to start enterId and end exitId */ unknownId=GetLabId("!NULL",TRUE); /* Name for words not in list */ while (NextArg() == SWITCHARG) { s = GetSwtArg(); if (strlen(s)!=1) HError(3019,"HBuild: Bad switch %s; must be single letter",s); switch(s[0]){ case 'b': saveLatBin = TRUE; break; case 'm': if (bType != unknown) HError(3019,"HBuild: Can only specifiy one of -m, -n, -w, -x"); bType = matBiGram; if (NextArg()!=STRINGARG) HError(3019,"HBuild: Matrix Bigram file name expected"); ipFn = GetStrArg(); break; case 'n': if (bType != unknown) HError(3019,"HBuild: Can only specifiy one of -m, -n, -w, -x"); bType = boBiGram; if (NextArg()!=STRINGARG) HError(3019,"HBuild: Back-off Bigram file name expected"); ipFn = GetStrArg(); break; case 's': if (NextArg() != STRINGARG) HError(3019,"HBuild: Bigram ENTER label name expected"); enterId=GetLabId(GetStrArg(),TRUE); if (NextArg() != STRINGARG) HError(3019,"HBuild: Bigram EXIT label name expected"); exitId=GetLabId(GetStrArg(),TRUE); break; case 't': if (NextArg() != STRINGARG) HError(3019,"HBuild: Bracket start label name expected"); bStartId=GetLabId(GetStrArg(),TRUE); if (NextArg() != STRINGARG) HError(3019,"HBuild: Bracket end label name expected"); bEndId=GetLabId(GetStrArg(),TRUE); break; case 'u': if (NextArg() != STRINGARG) HError(3019,"HBuild: Unknown label name expected"); unknownId=GetLabId(GetStrArg(),TRUE); break; case 'w': if (bType != unknown) HError(3019,"HBuild: Can only specifiy one of -m, -n, -w, -x"); bType = wordPair; if (NextArg()!=STRINGARG) HError(3019,"HBuild: Word pair grammar file name expected"); ipFn = GetStrArg(); break; case 'x': if (bType != unknown) HError(3019,"HBuild: Can only specifiy one of -m, -n, -w, -x"); bType = multiLat; if (NextArg()!=STRINGARG) HError(3019,"HBuild: Multi-level lattice file name expected"); ipFn = GetStrArg(); break; case 'z': zapUnknown = TRUE; break; case 'T': trace = GetChkedInt(0,511,s); break; default: HError(3019,"HBuild: Unknown switch %s",s); } } if (NextArg()!=STRINGARG) HError(3019,"HBuild: Word List file name expected"); wordListFn = GetStrArg(); if (NextArg()!=STRINGARG) HError(3019,"HBuild: output lattice file name expected"); latFn = GetStrArg(); if (bType == unknown) bType = wordLoop; if (saveLatBin) format |= HLAT_LBIN; /* Read the word-list into a Vocab data structure */ InitVocab(&voc); if(ReadDict(wordListFn, &voc)<SUCCESS) HError(3013,"HBuild: ReadDict failed"); switch (bType) { case matBiGram: if (trace & T_TOP) printf("Reading bigram from file %s\n",ipFn); bigramLm = ReadLModel(&gstack, ipFn); if (bigramLm->type != matBigram) HError(3030,"HBuild: File specified is not a matrix bigram"); lat = ProcessBiGram(&gstack,&voc,bigramLm); SaveLattice(lat,latFn,format); break; case boBiGram: if (trace & T_TOP) printf("Reading bigram from file %s\n",ipFn); bigramLm = ReadLModel(&gstack, ipFn); if (bigramLm->type != boNGram) HError(3030,"HBuild: File specified is not a back-off bigram"); lat = ProcessBiGram(&gstack,&voc,bigramLm); SaveLattice(lat,latFn,format); break; case multiLat: if (trace & T_TOP) printf("Reading input lattice from file %s\n",ipFn); ipLat = LoadLattice(&buildStack,ipFn,&voc,FALSE); if (ipLat->subList!=NULL) { if (trace & T_TOP) printf("Expanding multi-level lattice\n"); lat = ExpandMultiLevelLattice(&buildStack,ipLat,&voc); } else lat = ipLat; SaveLattice(lat,latFn,format); break; case wordLoop: if (trace & T_TOP) printf("Building word loop\n"); lat = ProcessWordLoop(&gstack,&voc); SaveLattice(lat,latFn,format); break; case wordPair: lat = ProcessWordPair(&gstack,&voc,ipFn); SaveLattice(lat,latFn,format); break; default: HError(3001,"Only Bigram LMs / multiLats currently implemented"); } Exit(0); return (0); /* never reached -- make compiler happy */}/* Save a lattice to a file latFn */void SaveLattice(Lattice *lat, char *latFn, LatFormat format){ FILE *latf; Boolean isPipe; if (trace & T_TOP) printf("Saving lattice to file %s\n",latFn); if ( (latf = FOpen(latFn,NetOFilter,&isPipe)) == NULL) HError(3011,"SaveLattice : Cannot create new lattice file %s",latFn); if(WriteLattice(lat,latf,format)<SUCCESS) HError(3011,"SaveLattice : Cannot create new lattice file %s",latFn); FClose(latf,isPipe);}/* Load a lattice from file latFn */Lattice *LoadLattice(MemHeap *latHeap, char *latFn, Vocab *voc, Boolean shortArc){ FILE *latf; Boolean isPipe; Lattice *lat; if ( (latf = FOpen(latFn,NetFilter,&isPipe)) == NULL) HError(3010,"LoadLattice : Cannot open lattice file %s",latFn); if((lat = ReadLattice(latf,latHeap,voc,shortArc,FALSE))==NULL) HError(3010,"LoadLattice : ReadLattice failed"); FClose(latf,isPipe); return lat;}Lattice *ProcessWordLoop(MemHeap *latHeap, Vocab *voc){ int nNode,nArc; LNode *ln; LArc *la; Word wd; Lattice *lat; int i; nNode = voc->nwords+4; nArc = voc->nwords*2 + 3; lat = NewLattice(latHeap,nNode,nArc); lat->voc = voc; lat->lmscale = 1.0; lat->wdpenalty = 0.0; /* fill in start/end/loop word entries with !NULL */ wd = voc->nullWord; ln = lat->lnodes; ln->word = wd; ln->n=0; ln->v=0; ln = lat->lnodes+1; ln->word = wd; ln->n=0; ln->v=0; ln = lat->lnodes+nNode-1; ln->word = wd; ln->n=0; ln->v=0; ln = lat->lnodes+nNode-2; ln->word = wd; ln->n=0; ln->v=0; ln = lat->lnodes+2; for (i = 0; i< VHASHSIZE; i++) for ( wd = voc->wtab[i]; wd != NULL; wd = wd->next ) if ((wd != voc->nullWord) && (wd != voc->subLatWord)) { ln->word = wd; ln++; } la =lat->larcs; la->start = lat->lnodes; la->end = lat->lnodes+1; la->lmlike = 0.0; la = lat->larcs+1; la->start = lat->lnodes+nNode-2; la->end = lat->lnodes+nNode-1; la->lmlike = 0.0; la = lat->larcs+2; la->start = lat->lnodes+nNode-2; la->end = lat->lnodes+1; la->lmlike = 0.0; la = lat->larcs+3; for (i = 0; i < voc->nwords; i++) { la->start = lat->lnodes+1; la->end = lat->lnodes+2+i; la->lmlike = log(1.0/(float) (voc->nwords)); la++; } for (i = 0; i < voc->nwords; i++) { la->start = lat->lnodes+2+i; la->end = lat->lnodes+nNode-2; la->lmlike = 0.0; la++; } /* finally overwrite start/end !NULL words if sil at start/end */ if (bStartId != NULL) { wd = GetWord(voc,bStartId,TRUE); ln = lat->lnodes; ln->word = wd; wd = GetWord(voc,bEndId,TRUE); ln = lat->lnodes+nNode-1; ln->word = wd; } return lat;}/*ProcessBoBiGram: Convert back-off bigram in nLM into lattice */ Lattice *ProcessBoBiGram(MemHeap *latHeap, Vocab *voc, NGramLM *nLM){ int nNode,nArc; NEntry *ne; SEntry *se; Word wd,fromWd,toWd; LNode *ln,*fromNode,*toNode; LArc *la; lmId ndx[NSIZE+1]; int i,j,k;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -