📄 ladapt.c
字号:
HError(16419,"LAdapt: language model file name expected"); outFN = CopyString(&gstack,GetStrArg()); Initialise(); if (processText) { if (NextArg() != STRINGARG) ProcessText(NULL,TRUE); /* input from stdin */ else while (NextArg() == STRINGARG) { /* !! copy string argument since it gets overwritten by NextArg() when reading from script file */ fn = CopyString(&gstack,GetStrArg()); ProcessText(fn,NextArg() != STRINGARG); } if (NumArgs() != 0) HError(-16419,"LAdapt: unused args left on cmd line"); for (i=0; i<stdBuf.ngb->fndx; i++) { sprintf(sBuf,"%s.%d",stdBuf.ngb->fn,i); AddInputGFile(&inSet,sBuf,1.0); } ResetHeap(&langHeap); } else { for (i=0; i<MAX_NGRAM_FILES; i++) { sprintf(sBuf,"%s.%d",rootFN,i); if (!Exists(sBuf)) break; AddInputGFile(&inSet,sBuf,1.0); } if (i==MAX_NGRAM_FILES) { HError(-16419, "LAdapt: Only %d n-gram files read (recompile with different setting\nof MAX_NGRAM_FILES"); } } if (nLModel==1) { adpLM = GenerateModel(&langHeap,&binfo); } else { if (binfo.ptype==LMP_COUNT) binfo.ptype = LMP_FLOAT; newLM = GenerateModel(&langHeap,&binfo); lmInfo[0].lm = newLM; lmInfo[0].fn = "unknown"; /* combine all models into one */ adpLM = CombineModels(&langHeap,lmInfo,nLModel,nSize,tgtVoc); }#ifdef HTK_TRANSCRIBER#ifdef HTK_CRYPT adpLM->encrypt = TRUE; /* force to write encrypted model */#endif#endif SaveLangModel(outFN,adpLM); Exit(EXIT_SUCCESS); return EXIT_SUCCESS; /* never reached -- make compiler happy */}/* ------------------------ Initialisation ----------------------- *//* Exists: return true if given file exists */Boolean Exists(char *fn){ FILE *f; if ((f=fopen(fn,"r")) == NULL) return FALSE; fclose(f); return TRUE;}/* Initialise: initialise global data structures */void Initialise(void){ int i; char path[256]; CreateHeap(&langHeap,"LModel mem",MSTAK,1,0.5,1000,20000); if (wlistFN!=NULL) { tgtVoc = &wlist; CreateWordList(wlistFN,tgtVoc,10); } if (processText) { /* init empty buffer */ CreateWordMap(NULL,&wmap,newWords); wmap.hasCnts = TRUE; wmap.name = defMapName; wmap.htkEsc = htkEscape; ++wmap.seqno; mapUpdated = FALSE; if (tgtVoc!=NULL) { /* add words from word list to the map */ pruneWords = TRUE; for (i=0; i<tgtVoc->used; i++) { AddWordToMap(&wmap,tgtVoc->id[i]); } SortWordMap(&wmap); unkId = GetLabId(unkStr,FALSE); } /* init ngram buffer */ MakeFN(rootFN,dbsDir,NULL,path); stdBuf.used = 0; stdBuf.ng[nSize] = 1; /* count = 1 */ stdBuf.ngb = CreateNGBuffer(&langHeap,nSize,ngbSize,path,&wmap); } else { CreateWordMap(omapFN,&wmap,1); } CreateInputSet(&gstack,&wmap,&inSet); binfo.wmap = &wmap; binfo.inSet = &inSet; binfo.nSize = nSize;}/* ----------------- NGram Counting Routines -------------------- *//* CompressBuffer: and save if necessary or mustSave is TRUE */void CompressBuffer(NGBuffer *ngb, Boolean mustSave){ float compx; if (ngb->used == 0) return; SortNGBuffer(ngb); compx = 100.0 * (float)ngb->used / (float)ngb->poolsize; if (trace&T_SAV) printf(" buffer %s.%d compressed%s to %.1f%%\n", ngb->fn, ngb->fndx, mustSave?"[must save]":"",compx); if (compx > 75.0 || mustSave) { if (saveFiles && mustSave && mapUpdated) { SaveWordMap(omapFN,&wmap,FALSE); mapUpdated = FALSE; if (trace&T_TOP) printf(" word map saved to %s\n",omapFN); } if (trace&T_TOP) { printf(" saving %d ngrams to file %s.%d\n", ngb->used, ngb->fn, ngb->fndx); } WriteNGBuffer(ngb,txtSrc); }}/* PutShiftRegister: push word into shift register and extract ngram */void PutShiftRegister(LabId id, ShiftReg *sr){ int i; MapEntry *me; if (trace&T_SHR){ printf(" %12s --> %s\n",id->name,sr->ngb->fn); fflush(stdout); } AddWordToMap(&wmap,id); mapUpdated = TRUE; me = (MapEntry *)id->aux; sr->ng[sr->used++] = me->ndx; if (sr->used == nSize) { /* record ngram */ StoreNGram(sr->ngb,sr->ng); /* shift words */ sr->used--; for (i=0; i<sr->used; i++) sr->ng[i] = sr->ng[i+1]; /* compress buffer if full */ if (sr->ngb->used == sr->ngb->poolsize) { CompressBuffer(sr->ngb,FALSE); } }}/* ProcessText: read text files line by line and count ngrams */void ProcessText(char *fn, Boolean lastFile){ FILE *f; LabId id; Boolean isPipe; char word[256]; if (trace&T_TOP) printf("Reading source text file %s\n",(fn==NULL) ? "<stdin>" : fn); if ((fn!=NULL) && (strcmp(fn,"-")!=0)) { if ((f = FOpen(fn,LMTextFilter,&isPipe))==NULL) HError(16410,"ProcessText: unable to open text file %s", fn); } else { f = stdin; } while (fscanf(f,"%255s",word)==1) { if (pruneWords) { if ((id = GetLabId(word,FALSE))==NULL && (id = unkId)==NULL) { stdBuf.used=0; continue; } } else { id = GetLabId(word,TRUE); } if (trace&T_INP) printf("[%s]\n",id->name); PutShiftRegister(id,&stdBuf); } if (fn!=NULL) { FClose(f,isPipe); if (lastFile) CompressBuffer(stdBuf.ngb,TRUE); } else { CompressBuffer(stdBuf.ngb,TRUE); } }/* CombineModels: load models and combine with the one in memory */BackOffLM *CombineModels(MemHeap *heap,LMInfo *lmi,int nLModel,int nSize,WordMap *wl) { int i,j,nw; float x; LMInfo *li; BackOffLM *tgtLM; WordMap wordList; LabId lab; NameId *na; /* normalise weights */ for (x=0.0, i=1; i<nLModel; i++) x += lmInfo[i].weight; lmInfo[0].weight = 1.0-x; /* load all models except the first one*/ for (li=lmInfo+1, i=1; i<nLModel; i++, li++) { if (trace&T_TOP) printf("Loading language model from %s\n",li->fn); li->lm = LoadLangModel(li->fn,wl,1.0,LMP_FLOAT,heap); } if (wl==NULL) { wl = &wordList; /* derive word list from LMs */ for (li=lmInfo, i=0; i<nLModel; i++, li++) { na = li->lm->binMap; for (j=0; j<li->lm->vocSize; j++) { lab = GetLabId(na[j+1]->name,TRUE); lab->aux=NULL; } } for (nw=0,li=lmInfo, i=0; i<nLModel; i++, li++) { na = li->lm->binMap; for (j=0; j<li->lm->vocSize; j++) { lab = GetLabId(na[j+1]->name,FALSE); if (lab->aux==NULL) { nw++; lab->aux = (Ptr) wl; } } } CreateWordList(NULL,wl,nw+10); for (nw=0,li=lmInfo, i=0; i<nLModel; i++, li++) { na = li->lm->binMap; for (j=0; j<li->lm->vocSize; j++) { lab = GetLabId(na[j+1]->name,FALSE); if (lab->aux==(Ptr) wl) { wl->id[nw++]=lab; lab->aux = NULL; } } } wl->used = nw; } if (trace&T_TOP) { printf("Using language model(s): \n"); for (li=lmInfo,i=0; i<nLModel; i++,li++) printf(" %d-gram %s, weight %.2f\n",li->lm->nSize,li->fn,li->weight); } if (trace&T_TOP) { printf("Generating %d-gram model %s\n",nSize,outFN); fflush(stdout); } tgtLM = MergeModels(heap,lmInfo,nLModel,nSize,wl);#ifdef HTK_CRYPT if (tgtLM->encrypt && binfo.saveFmt==LMF_TEXT) binfo.saveFmt = LMF_BINARY;#endif for (i=1; i<=nSize; i++) { tgtLM->gInfo[i].fmt = (i==1) ? LMF_TEXT : binfo.saveFmt; } return tgtLM;}/* ---------------------- End of LAdapt.c ----------------------- */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -