📄 hlstats.c
字号:
for (ae=aetab[hash];ae!=NULL;ae=ae->link) if (ae->word[0]==in[0] && ae->word[1]==in[1]) break; if (ae==NULL && create) { nae++; ae=(AEntry*)New(&statHeap,sizeof(AEntry)); for (i=0;i<ASIZE;i++) ae->word[i]=in[i]; ae->count=0; ae->link=aetab[hash]; aetab[hash]=ae; } return(ae);}/* GatherStats: update stats using given label file */void GatherStats(Transcription *t){ LLink l; LabList *ll; WordInfo *lt; int i,j,st,en,lab,in[ASIZE]; float dur; AEntry *ae; ll=GetLabelList(t,1); st=1; en=CountLabs(ll); /* If first label is enterId then we need to skip it */ l = GetLabN(ll,1); if (l->labid==enterId) st++; /* If the final label is exitId then it should be skipped */ l = GetLabN(ll,en); if (l->labid==exitId) en--; /* Coerce previous labels to be enterId */ for (i=0; i<ASIZE; i++) in[i]=(int)enterId->aux; lt = lTab+(int)enterId->aux; ++lt->count; /* Process actual labels in list */ for (i=st; i<=en; i++) { l = GetLabN(ll,i); lab=(int)l->labid->aux; dur = (float)(l->end - l->start)/10000.0; lt=lTab+lab; /* increment stats */ lt->count++; lt->sumDur += dur; if (dur < lt->minDur) lt->minDur=dur; if (dur > lt->maxDur) lt->maxDur=dur; lt->pCntr->count++; if (doBigram) { /* We ignore all transitions into enterId and exitId */ /* May wish to warn user about badly formed sentences */ if (!(lab==(int)enterId->aux || (lab==(int)exitId->aux))) { for (j=ASIZE-1;j>0;j--) in[j]=in[j-1]; in[0]=lab; ae = GetAEntry(in,TRUE); ae->count++; } } } /* Deal with transition into EXIT */ if (doBigram) { for (j=ASIZE-1;j>0;j--) in[j]=in[j-1]; in[0]=(int)exitId->aux; ae = GetAEntry(in,TRUE); ae->count++; } lt = lTab+(int)exitId->aux; ++lt->count;}/* ----------------------- Output Results -------------------- *//* CmpCntr: return sign(c1->count - c2->count) , if equal then use same ordering as in lTab */int CmpCntr(const void *p1, const void *p2){ Cntr *c1, *c2; int diff; c1=(Cntr *)p1; c2=(Cntr *)p2; diff=c1->count-c2->count; if (diff==0) return((int)c2->name->aux-(int)c1->name->aux); else return(diff);}/* CmpWordInfo: return sign(c1->count - c2->count) , if equal then use same ordering as in lTab */int CmpWordInfo(const void *p1, const void *p2){ WordInfo *c1, *c2; int diff; c1=(WordInfo *)p1; c2=(WordInfo *)p2; diff=c1->count-c2->count; if (diff==0) return((int)c2->name->aux-(int)c1->name->aux); else return(diff);}/* OutputCounts: output logical/physical counters */void OutputCounts(void){ int i; WordInfo *l; Cntr *p; if (doLCount){ qsort(lTab+1,lSize,sizeof(WordInfo),CmpWordInfo); printf("\nLogical Model Counts:\n"); printf(" Label LCount PCount\n"); for (i=0,l=lTab+1; i<lSize; i++,l++){ if (l->count > lCountLimit) break; printf("%12s %8d %8d\n",l->name->name,l->count,l->pCntr->count); } } if (doPCount){ /* Breaks log->phy relation */ qsort(pTab+1,pSize,sizeof(Cntr),CmpCntr); printf("\nPhysical Model Counts:\n"); printf(" Label PCount\n"); for (i=0,p=pTab+1; i<pSize; i++,p++){ if (p->count > pCountLimit) break; printf("%12s %8d\n",p->name->name,p->count); } } printf("\n"); fflush(stdout);}/* OutputDurs: output duration stats */void OutputDurs(void){ int i; WordInfo *l; printf("\nDuration Statistics:\n"); printf(" Label Count AveDur MinDur MaxDur\n"); for (i=0,l=lTab+1; i<lSize; i++,l++){ printf("%12s %7d",l->name->name,l->count); if (l->count>0 && l->name != enterId && l->name != exitId) { printf("%8.1f",l->sumDur/l->count); if (l->minDur < 1E30) printf("%8.1f",l->minDur); else printf("%8s","---"); printf("%8.1f",l->maxDur); } printf("\n"); } printf("\n"); fflush(stdout);}/* OutputList: output a list of all labels that occurred at least once */void OutputList(void){ int i; FILE *f; WordInfo *l; if ((f=fopen(listFile,"w"))==NULL) HError(1311,"OutputList: Cannot create label list file %s",listFile); for (i=0,l=lTab+1; i<lSize; i++,l++) if (l->count>0) fprintf(f,"%s\n",l->name->name); fclose(f);}/* ------------------- Bigram Handling ---------------------- */#define log2(x) (log(x)/log(2.0))#define ent2(x) ((x)>0.0?((x)*log2(x)):0.0)/* RebuildAETab: rebuild the aetab in aelists such that all ngrams (n,x) are stored in the list aelists[n]. */void RebuildAETab(AEntry **aelists){ AEntry *ae,*nx; int h; for (h=0; h<aetabsize; h++) { for (ae=aetab[h]; ae!=NULL; ae=nx) { nx=ae->link; if (ae->word[1]==0) continue; ae->link=aelists[ae->word[1]]; aelists[ae->word[1]]=ae; } aetab[h]=NULL; }}/* se_cmp: ordering relation for SEntrys based on word id */int se_cmp(const void *v1,const void *v2){ SEntry *s1,*s2; s1=(SEntry*)v1; s2=(SEntry*)v2; return((int)(s1->word-s2->word));}/* Simple calculation of backoff weights - 0.5 subtracted from each count */static float BuildNEntry(NEntry *ne,Vector boff,float bent){ SEntry *cse; AEntry *ae; double bowt,bsum,cnt,tot,ent,prob; ne->nse=0; tot=cnt=0.0; bsum=1.0; if (ne->word[0]!=(int)exitId->aux) for (ae=(AEntry *) ne->user; ae!=NULL; ae=ae->link) { tot+=ae->count; if (ae->word[0]!=0 && ae->word[0]!=(int)enterId->aux && ae->count>bigThresh) cnt+=(ae->count-disCount),ne->nse++,bsum-=boff[ae->word[0]]; } if (ne->nse==0) { ne->se=NULL; ne->bowt=0.0; ent=bent; } else { ne->se=(SEntry*)New(&statHeap,sizeof(SEntry)*ne->nse); bowt = (bsum>0.0) ? (1.0-cnt/tot)/bsum : 0.0; ent = (bowt>0.0) ? bowt*(bent-log2(bowt)) : 0.0; for (cse=ne->se,ae=(AEntry *) ne->user; ae!=NULL; ae=ae->link) if (ae->word[0]!=0 && ae->word[0]!=(int)enterId->aux && ae->count>bigThresh) { prob=((double)ae->count-disCount)/tot; cse->word=ae->word[0]; cse->prob=log(prob); ent -= ent2(prob); prob = bowt*boff[cse->word]; ent += ent2(prob); cse++; } if (bowt>0.0) ne->bowt=log(bowt); else ne->bowt=LZERO; qsort(ne->se,ne->nse,sizeof(SEntry),se_cmp); } return(ent);}/* OutputBoBigram: output ARPA/MIL-LL style back off bigram */void OutputBoBigram(void){ LModel lm; NGramLM *nglm; NEntry *ne; SEntry *se; AEntry **aelists; lmId ndx[NSIZE]; int i,tot,counts[NSIZE+1]; double uent,ent,bent; lm.heap=&statHeap; lm.type=boNGram; counts[1]=lSize;counts[2]=nae; for(i=3;i<NSIZE+1;i++) counts[i]=0; nglm=CreateBoNGram(&lm,lSize,counts); /* Give max size at creation */ for (i=1;i<=lSize;i++) nglm->wdlist[i]=lTab[i].name; aelists=(AEntry**)New(&tmpHeap,sizeof(AEntry*)*(lSize+1)); for (i=1;i<=lSize;i++) aelists[i]=NULL; RebuildAETab(aelists); /* Un-hash hashtable */ for (i=1,tot=0.0;i<=lSize;i++) { /* Calculate unigrams first */ if (i==(int)enterId->aux) nglm->unigrams[i]=0.0; else if (lTab[i].count<uniFloor) nglm->unigrams[i]=uniFloor; else nglm->unigrams[i]=lTab[i].count; tot+=nglm->unigrams[i]; } for (i=1,uent=0.0;i<=lSize;i++,se++) { nglm->unigrams[i]=nglm->unigrams[i]/tot; uent-=ent2(nglm->unigrams[i]); } nglm->counts[1]=lSize; /* Calculate real sizes during build */ nglm->counts[2]=0; for (i=0; i<NSIZE; i++) ndx[i]=0; if (trace&T_BIG) { printf("\n UNIGRAM NEntry - %4d foll, ent %.3f [= %.3f]\n\n", lSize,uent,pow(2.0,uent)); printf(" BIGRAMS NEntries\n"); fflush(stdout); } for (i=1,bent=0.0;i<=lSize;i++) { ndx[0]=i; ne=GetNEntry(nglm,ndx,TRUE); ne->user=aelists[i]; ent = BuildNEntry(ne,nglm->unigrams,uent); nglm->counts[2]+=ne->nse; if (trace&T_BIG) if (i!=(int)exitId->aux){ if (i==(int)enterId->aux) bent+=nglm->unigrams[(int)exitId->aux]*ent; else bent+=nglm->unigrams[i]*ent; printf(" %-20s - %4d foll, ent %6.3f [= %6.2f]\n", lTab[i].name->name,ne->nse,ent,pow(2.0,ent)); fflush(stdout); } } Dispose(&tmpHeap,aelists); if (trace&T_BIG) { printf("\n BIGRAM: training data entropy %.3f (perplexity %.2f)\n", bent,pow(2.0,bent)); fflush(stdout); } ndx[0]=0; /* Set up unigram nentry separately */ ne=GetNEntry(nglm,ndx,TRUE); ne->nse=lSize; se=ne->se=(SEntry*)New(nglm->heap,sizeof(SEntry)*lSize); for (i=1;i<=lSize;i++,se++) { se->word=i; if (nglm->unigrams[i]>0) se->prob=nglm->unigrams[i]=log(nglm->unigrams[i]); else se->prob=nglm->unigrams[i]=LZERO; } lm.name=CopyString(lm.heap,bigFile); /* Name and write to disk */ WriteLModel(&lm,bigFile,0);}/* OutputMatBigram: output matrix style bigram */void OutputMatBigram(void){ LModel lm; MatBiLM *matbi; AEntry **aelists,*ae; Vector vec; double vsum,fsum,tot,scale; double ent,bent,prob,fent; int i,j,nf,tf=0,nu,tu=0,np,tp=0,tn=0; lm.heap=&statHeap; lm.type=matBigram; matbi=CreateMatBigram(&lm,lSize); for (i=1;i<=lSize;i++) matbi->wdlist[i]=lTab[i].name; aelists=(AEntry**)New(&tmpHeap,sizeof(AEntry*)*(lSize+1)); for (i=1;i<=lSize;i++) aelists[i]=NULL; RebuildAETab(aelists); /* Un-hash hashtable */ if (trace&T_BIG) { printf("\n BIGRAMS from MatBigram\n"); fflush(stdout); } bent=0.0; fent = ent2(bigFloor); for (i=1;i<=lSize;i++) { vec=matbi->bigMat[i]; for (ae=aelists[i],tot=0.0; ae!=NULL; ae=ae->link) if (ae->word[0]!=0) tot += ae->count; fsum = (lSize-1)*bigFloor; vsum=0.0; for (ae=aelists[i];ae!=NULL;ae=ae->link) if (ae->count/tot > bigFloor && ae->word[0]!=0) fsum -= bigFloor, vsum += ae->count; else ae->count=0; scale = (1.0 - fsum) / vsum; for (j=1;j<=lSize;j++) { if (j==(int)enterId->aux) vec[j]=0.0; else if (tot==0.0) vec[j]=1.0/(lSize-1); else vec[j]=bigFloor; } for (ae=aelists[i];ae!=NULL;ae=ae->link) if (ae->count>0) vec[ae->word[0]]=ae->count*scale; if (trace&T_BIG) { nf=nu=np=0; if (tot==0.0) ent=-log2(1.0/(lSize-1)),prob=1.0,nu=lSize-1; else ent=-(lSize-1)*fent, prob=bigFloor*(lSize-1), nf+=lSize-1; for (ae=aelists[i];ae!=NULL;ae=ae->link) if (ae->count>0) { prob += vec[ae->word[0]]-bigFloor; ent -= ent2(vec[ae->word[0]]); ent += fent; nf--; np++; } if (i!=(int)exitId->aux){ j=lTab[i].count; bent+=j*ent;tn+=j; if (tot==0.0) printf(" %-20s - %4d unis, ent %6.3f [= %6.2f] (P=%7.5f)\n", lTab[i].name->name,nu,ent,pow(2.0,ent),prob); else printf(" %-20s - %4d foll, ent %6.3f [= %6.2f] (P=%7.5f)\n", lTab[i].name->name,np,ent,pow(2.0,ent),prob); fflush(stdout); } tf+=nf;tu+=nu;tp+=np; } } if (trace&T_BIG) { bent/=tn; printf("\n BIGRAM: training data entropy %.3f (perplexity %.2f)\n", bent,pow(2.0,bent)); printf(" Estimated %d, floored %d, unigrammed %d for %d\n", tp,tf,tu,lSize); fflush(stdout); } Dispose(&tmpHeap,aelists); /* convert probabilities to logs */ for (i=1;i<=matbi->numWords;i++) { vec = matbi->bigMat[i]; for (j=1; j<=matbi->numWords; j++){ vec[j] = ((vec[j]<MINLARG)?LZERO:log(vec[j])); } } lm.name=CopyString(lm.heap,bigFile); /* Name and write to disk */ WriteLModel(&lm,bigFile,0);}/* OutputStats: print desired stats on standard output */void OutputStats(void){ if (doDurs) OutputDurs(); if (doBigram) { if (doBOff) OutputBoBigram(); else OutputMatBigram(); } if (doList) OutputList(); if (doPCount || doLCount) OutputCounts(); /* Breaks log->phy links */}/* ------------------------------------------------------------ *//* END: HLStats.c *//* ------------------------------------------------------------ */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -