📄 lplex.c
字号:
if (nid==NULL) HError(-16625,"Unable to find word %s in model %s\n",(*wid)->name,li->fn);#endif } l2nId[i] = na; } /* ensure words present at least in one model */ for (wid = wList.id, j=0; j<nWords; j++, wid++) { for (inLM=FALSE,i=0; i<nLModel; i++, li++) if (l2nId[i][(int) ((*wid)->aux)]!=NULL) inLM = TRUE; if (!inLM) HError(16625,"Unable to find word %s in any model\n",(*wid)->name); } /* create equivalence class lookup array */ eqId = (LabId *) New(&permHeap,(nWords+NumEquiv()+2)*sizeof(NameId)); for (wid = wList.id, i=0; i<nWords; i++, wid++) { eqId[(int) ((*wid)->aux)] = NULL; } /* link equivalence classes */ LinkEquiv(); /* open output stream */ if (outStreamFN != NULL) if ((outStream = FOpen(outStreamFN,NoOFilter,&isPipe)) == NULL) HError(16610,"Initialise: unable to open output file %s",outStreamFN);}/* -------------------- OOV calculation/statistics ---------------------*//* CmpOOVE: qsort comparison for oov entries */static int CmpOOVE(const void *p1, const void *p2){ return strcmp(((OOVEntry *)p1)->wdid->name, ((OOVEntry *)p2)->wdid->name);}/* SortOOV: sort/unique OOV array of ps */static int SortOOV(PStats *ps){ int i, c; OOVEntry *ove; if (ps->uniqOOV==0) return 0; qsort(ps->oov, ps->uniqOOV, sizeof(OOVEntry), CmpOOVE); c = 0; ove = ps->oov; for (i=c+1; i<ps->uniqOOV; i++) if (CmpOOVE(ove+c, ove+i)==0) ove[c].count += ove[i].count; else { c++; ove[c] = ove[i]; } return (ps->uniqOOV = c+1);}/* StoreOOV: store OOV wdid/count into ps */static void StoreOOV(PStats *ps, LabId wdid, int count){ int n; ps->oov[ps->uniqOOV].wdid = wdid; ps->oov[ps->uniqOOV].count = count; ps->uniqOOV++; ps->nOOV++; if (ps->uniqOOV==MAX_OOV) { n = SortOOV(ps); printf("StoreOOV: sorting OOVs, compacting %d -> %d\n",MAX_OOV,n); if (n==MAX_OOV) HError(16630,"Maximum number of unique OOV's [%d] reached\n",MAX_OOV); }}/* ZeroStats: zero counts in PStats */static void ZeroStats(PStats *ps){ ps->nOOV = 0; ps->nUtt = 0; ps->nWrd = 0; ps->nTok = 0; ps->uniqOOV = 0; ps->logpp = ps->logpp2 = 0.0;}/* AddStats: add statistics from ps1 to ps2 */static void AddStats(PStats *ps1, PStats *ps2){ int i; for (i=0; i<ps1->uniqOOV; i++) StoreOOV(ps2, ps1->oov[i].wdid, ps1->oov[i].count); ps2->nTok += ps1->nTok; ps2->nUtt += ps1->nUtt; ps2->nWrd += ps1->nWrd; ps2->logpp += ps1->logpp; ps2->logpp2 += ps1->logpp2;}/* PrintInfo: print statistics from ps */static void PrintInfo(PStats *ps, Boolean showOOV){ int i; float ovr; LMInfo *li; OOVEntry *ove; double a, b, ppl, stdev; /* print perplexity measures */ a = (ps->logpp)/(double) (ps->nWrd); b = (ps->logpp2)/(double) (ps->nWrd); ppl = exp(-a); stdev = b - a*a; printf("perplexity %.4f, var %.4f, utterances %d, words predicted %d\n", ppl, stdev, ps->nUtt, ps->nWrd); fflush(stdout); /* calculate OOV rate and statistics */ ovr = ((float) (ps->nOOV) / (float) (ps->nTok - ps->nUtt))*100.0; printf("num tokens %d, OOV %d, OOV rate %.2f%% (excl. %s)\n", ps->nTok, ps->nOOV, ovr, senId->name); if (showOOV && (ps->uniqOOV > 0)) { SortOOV(ps); printf("unique OOVs [%d]\n", ps->uniqOOV); for (ove=ps->oov, i=0; i<ps->uniqOOV; i++, ove++) printf("%s \t%d\n", ove->wdid->name, ove->count); fflush(stdout); } for (li=lmInfo, i=0; i<nLModel; i++, li++) {#ifndef HTK_TRANSCRIBER printf("\nAccess statistics for %s:\n", li->fn); PrintTotalAccessStats(stdout,li->lm);#endif ResetAccessInfo(li->lm); }}/*-------------------------- Perplexity calculation --------------------------*/#define IS_UNK(id) (id==unkId || id->aux==NULL)#define IS_SST(id) (id==sstId)#define IS_SEN(id) (id==senId)static LabId GetEQLab(LabId id){ LabId cl; if (id->aux==NULL) return id; if ((cl = (LabId) eqId[(int) (id->aux)])==NULL) return id; return cl;}/* GetProb: return nSize-gram probability for ngram in wlab */static double GetProb(LabId *wlab, int nSize){ /* this routine will return the interpolated nSize-gram probability for the words in wlab. Note that the context maybe shortened in the case of multiple LMs and words which do not occur in some of them. */ int i,j; LMInfo *li; Boolean inThisLM,inAnyLM; double x,prob,psum; NameId nGram[LM_NSIZE]; if (nLModel==1) { inThisLM = TRUE; for (j=0; j<nSize; j++) { if ((nGram[j] = l2nId[0][(int) (wlab[j]->aux)])==NULL) inThisLM = FALSE; } if (inThisLM) { prob = GetNGramProb(lmInfo[0].lm, nGram, nSize); } else if (nSize > 1) prob = GetProb(wlab+1,nSize-1); else { prob = LZERO; HError(-16690,"GetProb: assigning zero probability"); } } else { psum = 0.0; inAnyLM = FALSE; for (li=lmInfo, i=0; i<nLModel; i++, li++) { for (inThisLM=TRUE, j=0; j<nSize; j++) { if ((nGram[j] = l2nId[i][(int) (wlab[j]->aux)])==NULL) inThisLM = FALSE; } if (!inThisLM) continue; x = GetNGramProb(li->lm, nGram, nSize);#ifdef INTERPOLATE_MAX if ((x = exp(x)) > psum) psum = x;#else psum += li->weight*exp(x);#endif inAnyLM = TRUE; } if (inAnyLM) prob = log(psum); else if (nSize > 1) prob = GetProb(wlab+1,nSize-1); else { prob = LZERO; HError(-16690,"GetProb: assigning zero probability"); } } return prob;}/* CalcPerplexity: compute perplexity and other statistics */static void CalcPerplexity(PStats *sent, LabId *pLab, int numPLabs, int nSize){ int i,j; LabId *p; float prob; Boolean hasOOV; for (p=pLab, i=nSize-1; i<numPLabs; i++, p++) { if (pLab[i]==unkId) continue; /* cannot predict OOVs */ if (skipOOV) { hasOOV = FALSE; for (j=1; j<nSize; j++) { if (pLab[i-j]==unkId) { hasOOV=TRUE; break; } } if (hasOOV) continue; /* skip to next label since context contains OOV */ } prob = GetProb(p, nSize); sent->nWrd++; sent->logpp += prob; sent->logpp2 += prob*prob; if (outStreamFN != NULL) fprintf(outStream,"%e\n",exp(prob)); if (trace&T_PROB) { printf("logP(%s |", pLab[i]->name); for (j=1; j<nSize; j++) { printf(" %s%s", (j==1)?"":",", pLab[i-j]->name); } printf(") = %.4f\n", prob); /* if (trace&T_INST_INFO) PrintInstStats(nSize); */ fflush(stdout); } } if (trace&T_SENT) PrintInfo(sent,FALSE);}/* ProcessLabelFile: compute perplexity and related statistics from labels */static void ProcessLabelFile(char *fn, int nSize){ LLink ll; double ppl; LabList *ref; LabId lab; Transcription *tr; int i,numPLabs,nLabel; tr = LOpen(&tempHeap, fn, lff); if (tr->numLists < 1) { HError(-16635,"ProcessLabelFile: transcription file %s is Empty",fn); return; } ref = GetLabelList(tr, 1); if (ref->head->succ == ref->tail) { HError(-16635,"ProcessLabelFile: transcription file %s is Empty",fn); return; } if (trace>0) { printf("Processing label file: %s\n", fn); fflush(stdout); } nLabel = CountLabs(ref); ZeroStats(&sent); sent.nTok = nLabel + 2; sent.nUtt = 1; /* copy labels into pLab, mapping OOVs */ numPLabs = 0; if (sstId!=NULL) /* add sentence start marker(s) */ for (i=0; i<(nSize-1); i++) pLab[numPLabs++] = sstId; for (i=0,ll=ref->head->succ; i<nLabel; i++,ll=ll->succ) { lab = GetEQLab(ll->labid); if ((i==0) && IS_SST(lab)) { sent.nTok--; continue; } if ((i==(nLabel-1)) && IS_SEN(lab)) { sent.nTok--; continue; } if (IS_UNK(lab)) { if (trace&T_OOV) printf("mapping OOV: %s\n", lab->name); StoreOOV(&sent,lab,1); lab = unkId; } pLab[numPLabs++] = lab; if (numPLabs>=LBUF_SIZE) { HError(16650, "Maximum utterance length in a label file exceeded (limit is compiled to be %d tokens)", LBUF_SIZE); } } if (senId!=NULL) /* add sentence end marker */ pLab[numPLabs++] = senId; CalcPerplexity(&sent, pLab, numPLabs, nSize); AddStats(&sent, &totl); if (trace&T_SEL) { /* compact info for sentence selection */ ppl = exp(-(sent.logpp)/(double) (sent.nWrd)); printf("#! %.4f", ppl); for (i=0, ll=ref->head->succ; i<nLabel; i++, ll=ll->succ) printf(" %s", ll->labid->name); printf("\n"); fflush(stdout); }}/* PPlexStream: compute perplexity and related statistics */static void ProcessTextStream(char *fn, int nSize){ int i; FILE *f; LabId lab=0; double ppl; int numPLabs; Boolean isPipe; char word[256]; if (fn!=NULL) { if ((f=FOpen(fn, LMTextFilter, &isPipe))==NULL) HError(16610,"ProcessTextStream: unable to open file %s", fn); } else { f = stdin; } if (trace>0) { printf("Processing text stream: %s\n", (fn==NULL)?"<stdin>":fn); fflush(stdout); } numPLabs = 0; ZeroStats(&sent); sent.nUtt = 1; sent.nTok = 0; while ((fscanf(f, "%200s", word))==1) { if (strlen(word)>=200) HError(-16640, "ProcessTextStream: word too long, will be split: %s\n", word); lab = GetEQLab(GetLabId(word, TRUE)); if (IS_SST(lab)) { numPLabs = 0; for (i=0; i<(nSize-1); i++) pLab[numPLabs++] = sstId; ZeroStats(&sent); sent.nUtt = 1; sent.nTok = 1; continue; } if (IS_UNK(lab)) { if (trace&T_OOV) printf("mapping OOV: %s\n", lab->name); StoreOOV(&sent,lab,1); lab = unkId; } pLab[numPLabs++] = lab; sent.nTok++; if (numPLabs>=LBUF_SIZE) { HError(16645,"ProcessTextStream: word buffer size exceeded - too many words without a sentence end (%d)",LBUF_SIZE); CalcPerplexity(&sent,pLab,numPLabs,nSize); numPLabs = 0; } if (IS_SEN(lab)) { CalcPerplexity(&sent,pLab,numPLabs,nSize); AddStats(&sent, &totl); if (trace&T_SEL) { /* compact info for sentence selection */ ppl = exp(-(sent.logpp)/(double) (sent.nWrd)); printf("#! %.4f", ppl); for (i=nSize-1; i<numPLabs; i++) printf(" %s", pLab[i]->name); printf("\n"); fflush(stdout); } ZeroStats(&sent); } } AddStats(&sent,&totl); if (fn!=NULL) FClose(f,isPipe);}/* ProcessFiles: process label files */static void ProcessFiles(){ int nSize; char *labFn; MLFEntry *me; char *inpfn[MAX_FILES]; int i,t,numFiles,fidx; numFiles = 0; while (NumArgs()>0){ if (NextArg()!=STRINGARG) HError(16619,"ProcessFiles: label file (MLF) name expected"); inpfn[numFiles++] = CopyString(&gstack, GetStrArg()); if (numFiles == MAX_FILES) { HError(-16619,"Processing only the first %d files",MAX_FILES); } } for (t=0; t<numTests; t++) { ZeroStats(&totl); nSize = testInfo[t]; printf("LPlex test #%d: %d-gram\n", t, nSize); if (numFiles==0) { ProcessTextStream(NULL, nSize); continue; } for (i=0; i<numFiles; i++) { labFn = inpfn[i]; if (streamMode) { ProcessTextStream(labFn,nSize); } else { if (IsMLFFile(labFn)) { if (trace>0) { printf("Processing MLF: %s\n", labFn); fflush(stdout); } fidx = NumMLFFiles(); if ((me=GetMLFTable()) != NULL) { while(me->next != NULL) me=me->next; LoadMasterFile(labFn); me=me->next; } else{ LoadMasterFile(labFn); me=GetMLFTable(); } while (me != NULL) { if (me->type == MLF_IMMEDIATE && me->def.immed.fidx == fidx) { ProcessLabelFile(me->pattern,nSize); } me = me->next; } } else { ProcessLabelFile(labFn,nSize); } } } PrintInfo(&totl, printOOV); }}/* --------------------- End of LPlex.c ------------------------ */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -