📄 hvite.c
字号:
HError(3219,"HVite: HMM file extension expected"); hmmExt = GetStrArg(); break; case 'y': if (NextArg()!=STRINGARG) HError(3219,"HVite: Output label file extension expected"); labExt = GetStrArg(); break; case 'z': if (NextArg()!=STRINGARG) HError(3219,"HVite: Lattice output file extension expected"); latExt = GetStrArg(); break; case 'F': if (NextArg() != STRINGARG) HError(3219,"HVite: Data File format expected"); if((dfmt = Str2Format(GetStrArg())) == ALIEN) HError(-3289,"HVite: Warning ALIEN Input file format set"); break; case 'G': if (NextArg() != STRINGARG) HError(3219,"HVite: Source Label File format expected"); if((ifmt = Str2Format(GetStrArg())) == ALIEN) HError(-3289,"HVite: Warning ALIEN Input file format set"); break; case 'H':/*已经用*/ if (NextArg() != STRINGARG) HError(3219,"HVite: MMF File name expected"); AddMMF(&hset,GetStrArg()); break; case 'I': if (NextArg() != STRINGARG) HError(3219,"HVite: MLF file name expected"); LoadMasterFile(GetStrArg()); break; case 'L': if (NextArg()!=STRINGARG) HError(3219,"HVite: Label/network file directory expected"); labInDir = GetStrArg(); break; case 'P': if (NextArg() != STRINGARG) HError(3219,"HVite: Target Label File format expected"); if((ofmt = Str2Format(GetStrArg())) == ALIEN) HError(-3289,"HVite: Warning ALIEN Label output file format set"); break; case 'B': saveBinary = TRUE; break; case 'T': trace = GetChkedInt(0,511,s); break; case 'X': if (NextArg()!=STRINGARG) HError(3219,"HVite: Input label/network file extension expected"); labInExt = GetStrArg(); break; case 'h': if (NextArg()!=STRINGARG) HError(1,"Speaker name pattern expected"); xfInfo.outSpkrPat = GetStrArg(); if (NextArg()==STRINGARG) { xfInfo.inSpkrPat = GetStrArg(); if (NextArg()==STRINGARG) xfInfo.paSpkrPat = GetStrArg(); } if (NextArg() != SWITCHARG) HError(2319,"HERest: cannot have -h as the last option"); break; case 'E': if (NextArg()!=STRINGARG) HError(2319,"HERest: parent transform directory expected"); xfInfo.usePaXForm = TRUE; xfInfo.paXFormDir = GetStrArg(); if (NextArg()==STRINGARG) xfInfo.paXFormExt = GetStrArg(); if (NextArg() != SWITCHARG) HError(2319,"HVite: cannot have -E as the last option"); break; case 'J': if (NextArg()!=STRINGARG) HError(2319,"HERest: input transform directory expected"); AddInXFormDir(&hset,GetStrArg()); if (NextArg()==STRINGARG) xfInfo.inXFormExt = GetStrArg(); if (NextArg() != SWITCHARG) HError(2319,"HVite: cannot have -J as the last option"); break; case 'K': if (NextArg()!=STRINGARG) HError(2319,"HVite: output transform directory expected"); xfInfo.outXFormDir = GetStrArg(); xfInfo.useOutXForm = TRUE; if (NextArg()==STRINGARG) xfInfo.outXFormExt = GetStrArg(); if (NextArg() != SWITCHARG) HError(2319,"HVite: cannot have -K as the last option"); break; default: HError(3219,"HVite: Unknown switch %s",s); } } if (NextArg()!=STRINGARG) HError(3219,"HVite: Dictionary file name expected"); dictFn = GetStrArg();/*获得字典文件名*/ if (NextArg()!=STRINGARG) HError(3219,"HVite: HMM list file name expected"); hmmListFn = GetStrArg();/*获得模型列表文件名*/ if ((states || models) && nToks>1) HError(3230,"HVite: Alignment using multiple tokens is not supported"); if (NumArgs()==0 && wdNetFn==NULL) HError(3230,"HVite: Network must be specified for recognition from audio"); if (loadNetworks && loadLabels) HError(3230,"HVite: Must choose either alignment from network or labels"); if (nToks>1 && latExt==NULL && nTrans==1) HError(-3230,"HVite: Performing nbest recognition with no nbest output"); if ((update>0) && (!xfInfo.useOutXForm)) HError(3230,"HVite: Must use -K option with incremental adaptation"); Initialise(); /* Process the data */ if (wdNetFn==NULL) DoAlignment(); else DoRecognition(); /* Free up and we are done */ if (trace & T_MEM) { printf("Memory State on Completion\n"); PrintAllHeapStats(); } DeleteVRecInfo(vri); ResetHeap(&netHeap); FreePSetInfo(psi); UpdateSpkrStats(&hset,&xfInfo, NULL); ResetHeap(®Heap); ResetHeap(&modelHeap); Exit(0); return (0); /* never reached -- make compiler happy */}/* --------------------------- Initialisation ----------------------- *//* Initialise: set up global data structures */void Initialise(void){ Boolean eSep; int s; /* Load hmms, convert to inverse DiagC */ if(MakeHMMSet(&hset,hmmListFn)<SUCCESS)/*从音素列表hmmListFn建立HMMSet*/ HError(3228,"Initialise: MakeHMMSet failed"); if(LoadHMMSet(&hset,hmmDir,hmmExt)<SUCCESS) /*导入模型数据,当前hmmDir=null,hmmExt=null*/ HError(3228,"Initialise: LoadHMMSet failed"); ConvDiagC(&hset,TRUE);/*将模型的协方差矩阵转换成对角阵*/ /* Create observation and storage for input buffer */ SetStreamWidths(hset.pkind,hset.vecSize,hset.swidth,&eSep); obs=MakeObservation(&gstack,hset.swidth,hset.pkind, hset.hsKind==DISCRETEHS,eSep); /* sort out masks just in case using adaptation */ if (xfInfo.inSpkrPat == NULL) xfInfo.inSpkrPat = xfInfo.outSpkrPat; /*当前执行此处*/ if (xfInfo.paSpkrPat == NULL) xfInfo.paSpkrPat = xfInfo.outSpkrPat; /*当前执行此处*/ if (xfInfo.useOutXForm || (update>0)) {/*当前HVite.c做识别时,此判断条件为假*/ CreateHeap(®Heap, "regClassStore", MSTAK, 1, 0.5, 1000, 8000 ); /* This initialises things - temporary hack - THINK!! */ CreateAdaptXForm(&hset, "tmp"); /* initialise structures for the f-b frame-state alignment pass */ utt = (UttInfo *) New(®Heap, sizeof(UttInfo)); fbInfo = (FBInfo *) New(®Heap, sizeof(FBInfo)); /* initialise a recogniser for frame/state alignment purposes */ alignpsi=InitPSetInfo(&hset); alignvri=InitVRecInfo(alignpsi,1,TRUE,FALSE); SetPruningLevels(alignvri,0,genBeam,-LZERO,0.0,tmBeam); InitUttInfo(utt, FALSE); InitialiseForBack(fbInfo, ®Heap, &hset, (UPDSet) (UPXFORM), genBeam*2.0, genBeam*2.0, genBeam*4.0+1.0, 10.0); utt->twoDataFiles = FALSE; utt->S = hset.swidth[0]; AttachPreComps(&hset,hset.hmem); }/*当前HVite.c做识别时,此判断条件为假*/ /* Create observation and storage for input buffer */ SetStreamWidths(hset.pkind,hset.vecSize,hset.swidth,&eSep); obs=MakeObservation(&gstack,hset.swidth,hset.pkind, hset.hsKind==DISCRETEHS,eSep); CreateHeap(&bufHeap,"Input Buffer heap",MSTAK,1,0.0,50000,50000); CreateHeap(&repHeap,"Replay Buffer heap",MSTAK,1,0.0,50000,50000); maxM = MaxMixInSet(&hset);/*当前maxM=*/ for (s=1; s<=hset.swidth[0]; s++)/*当前hset.swidth[0]=1*/ maxMixInS[s] = MaxMixInSetS(&hset, s);/*当前MaxMixInSetS(&hset, s)=1*/ if (trace&T_TOP) { printf("Read %d physical / %d logical HMMs\n", hset.numPhyHMM,hset.numLogHMM); fflush(stdout); } /* Initialise recogniser */ if (nToks>1) nBeam=genBeam;/*当前HVite.c做识别时,此判断条件为假*//*static int nToks = 0; Number of tokens for N best */
/*PSetInfo *psi: Private data used by HRec ;VRecInfo *vri: Visible HRec Info */ psi=InitPSetInfo(&hset);/* Prepare HMMSet for recognition. Allocates seIndex and preComp from *//* hmmset heap.*/ vri=InitVRecInfo(psi,nToks,models,states);/* 当前nToks=0;states和model真值为假,static Boolean states = FALSE; Keep track of state alignment ,static Boolean models = FALSE; Keep track of model alignment */ /* Read dictionary and create storage for lattice */ InitVocab(&vocab); if(ReadDict(dictFn,&vocab)<SUCCESS) HError(3213, "Main: ReadDict failed"); CreateHeap(&ansHeap,"Lattice heap",MSTAK,1,0.0,4000,4000); if (trace & T_MEM){ printf("Memory State After Initialisation\n"); PrintAllHeapStats(); }}/* ------------------ Utterance Level Recognition ----------------------- *//* ReplayAudio: replay the last audio input */void ReplayAudio(BufferInfo info){ AudioOut ao; if (info.a != NULL) { ao = OpenAudioOutput(&repHeap,&(info.srcSampRate)); PlayReplayBuffer(ao, info.a); while (SamplesToPlay(ao) > 0 ); CloseAudioOutput(ao); }}/* DoOnlineAdaptation: Perform unsupervised online adaptation using the recognition hypothesis as the transcription */int DoOnlineAdaptation(Lattice *lat, ParmBuf pbuf, int nFrames){ Transcription *modelTrans, *trans; BufferInfo pbinfo; Lattice *alignLat, *wordNet; Network *alignNet; int i; GetBufferInfo(pbuf,&pbinfo); trans=TranscriptionFromLattice(&netHeap,lat,1); wordNet=LatticeFromLabels(GetLabelList(trans,1),bndId, &vocab,&netHeap); alignNet=ExpandWordNet(&netHeap,wordNet,&vocab,&hset); StartRecognition(alignvri,alignNet,0.0,0.0,0.0); /* do forced alignment */ for (i = 0; i < nFrames; i++) { ReadAsTable(pbuf, i, &obs); ProcessObservation(alignvri,&obs,-1,xfInfo.inXForm); } alignLat=CompleteRecognition(alignvri, pbinfo.tgtSampRate/10000000.0, &netHeap); if (alignvri->noTokenSurvived) { Dispose(&netHeap, trans); /* Return value 0 to indicate zero frames process failed */ return 0; } modelTrans=TranscriptionFromLattice(&netHeap,alignLat,1); /* format the transcription so that it contains just the models */ FormatTranscription(modelTrans,pbinfo.tgtSampRate,FALSE,TRUE, FALSE,FALSE,TRUE,FALSE,TRUE,TRUE, FALSE); /* Now do the frame/state alignment accumulating MLLR statistics */ /* set the various values in the utterance storage */ utt->tr = modelTrans; utt->pbuf = pbuf; utt->Q = CountLabs(utt->tr->head); utt->T = nFrames; utt->ot = obs; /* do frame state alignment and accumulate statistics */ fbInfo->inXForm = xfInfo.inXForm; fbInfo->al_inXForm = xfInfo.inXForm; fbInfo->paXForm = xfInfo.paXForm; if (!FBFile(fbInfo, utt, NULL)) nFrames = 0; Dispose(&netHeap, trans); if (trace&T_TOP) { printf("Accumulated statistics...\n"); fflush(stdout); } return nFrames;} /* ProcessFile: process given file. If fn=NULL then direct audio */Boolean ProcessFile(char *fn, Network *net, int utterNum, LogDouble currGenBeam, Boolean restartable){ FILE *file; ParmBuf pbuf; BufferInfo pbinfo; NetNode *d; Lattice *lat; LArc *arc,*cur; LNode *node; Transcription *trans; MLink m; LogFloat lmlk,aclk; int s,j,tact,nFrames; LatFormat form; char *p,lfn[255],buf1[80],buf2[80],thisFN[MAXSTRLEN]; Boolean enableOutput = TRUE, isPipe; /*printf("\n当前正在处理文件%s\n",fn);*/ if (fn!=NULL) strcpy(thisFN,fn); else if (fn==NULL && saveAudioOut) CounterFN(roPrefix,roSuffix,++roCounter,4,thisFN); else enableOutput = FALSE; if((pbuf = OpenBuffer(&bufHeap,fn,50,dfmt,TRI_UNDEF,TRI_UNDEF))==NULL)/*当前文件格式参数dfmt=UNDEFF,TRI_UNDEF=-1*/ HError(3250,"ProcessFile: Config parameters invalid"); /* Check pbuf same as hset */ GetBufferInfo(pbuf,&pbinfo); if (pbinfo.tgtPK!=hset.pkind) HError(3231,"ProcessFile: Incompatible sample kind %s vs %s", ParmKind2Str(pbinfo.tgtPK,buf1), ParmKind2Str(hset.pkind,buf2)); if (pbinfo.a != NULL && replay) AttachReplayBuf(pbinfo.a, (int) (3*(1.0E+07/pbinfo.srcSampRate)));/*当前此判断条件为假*/ StartRecognition(vri,net,lmScale,wordPen,prScale);/* 根据已有的网络net初始化可见的识别信息vri,当前lmScale=5.000000,wordPen=0.000000,prScale=1.000000,lmScale 是bigram and log(1/NSucc) scale factor,wordPen是 inter model propagation log prob,prScale是 pronunciation scale factor */ SetPruningLevels(vri,maxActive,currGenBeam,wordBeam,nBeam,tmBeam);/*当前maxActive=0,currGenBeam=10000000000.000000,wordBeam=10000000000.000000,nBeam=0.000000,tmBeam=10.000000*/ tact=0;nFrames=0; StartBuffer(pbuf); while(BufferStatus(pbuf)!=PB_CLEARED) { ReadAsBuffer(pbuf,&obs);/*从缓冲中读出一个观察给obs*/
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -