📄 hlstats.c
字号:
/* ----------------------------------------------------------- *//* *//* ___ *//* |_| | |_/ SPEECH *//* | | | | \ RECOGNITION *//* ========= SOFTWARE */ /* *//* *//* ----------------------------------------------------------- *//* Copyright: Microsoft Corporation *//* 1995-2000 Redmond, Washington USA *//* http://www.microsoft.com *//* *//* Use of this software is governed by a License Agreement *//* ** See the file License for the Conditions of Use ** *//* ** This banner notice must not be removed ** *//* *//* ----------------------------------------------------------- *//* File: HLStats.c: gather statistics from transcriptions *//* ----------------------------------------------------------- */char *hlstats_version = "!HVER!HLStats: 3.3 [CUED 28/04/05]";char *hlstats_vc_id = "$Id: HLStats.c,v 1.1.1.1 2005/05/12 10:52:54 jal58 Exp $";#include "HShell.h"#include "HMem.h"#include "HMath.h"#include "HSigP.h"#include "HAudio.h"#include "HWave.h"#include "HVQ.h"#include "HParm.h"#include "HLabel.h"#include "HModel.h"#include "HDict.h"#include "HLM.h"#include "HUtil.h"/* This program collects statistics (such as number of occurrences, min, max and average duration) from a set of label files. It is also able to generate simple backoff and matrix bigram language models.*//* -------------------------- Trace Flags & Vars ------------------------ */#define T_BAS 0x0001 /* Trace basic progress information */#define T_MEM 0x0002 /* Trace memory usage */#define T_BIG 0x0004 /* Trace bigram statistics */#define T_FIL 0x0008 /* Trace each file name */static int trace = 0; /* trace level *//* -------------------------- Global Variables etc ---------------------- */static Boolean doBigram = FALSE; /* do what? */static Boolean doDurs = FALSE;static Boolean doList = FALSE;static Boolean doPCount = FALSE; static Boolean doLCount = FALSE;static Boolean doBOff = FALSE;static char *listFile = NULL; /* file for label list */static char *bigFile = NULL; /* file for bigram */static float uniFloor = 1.0; /* min count for unigram probs */static float bigFloor = 0.0; /* floor for matrix bigram probs */static int bigThresh = 0; /* threshold for including bigram probs */static int pCountLimit = -1; /* max occurrences to list for pCount */static int lCountLimit = -1; /* max occurrences to list for lCount */static int hSize = 0; /* hash table size, small(0), med(1), large(2) */static LabId enterId; /* id of ENTRY label in ngram */static LabId exitId; /* id of EXIT label in ngram */static LabId nullId; /* id of !NULL label in ngram */static FileFormat ff=UNDEFF; /* Label file format */static MemHeap tmpHeap; /* Temporary storage */static MemHeap statHeap; /* Permenant stats storage *//* ---------------- Configuration Parameters --------------------- */static ConfParam *cParm[MAXGLOBS];static int nParm = 0; /* total num params */static float disCount = 0.5; /* discount for backoff *//* ------------------ Process Command Line ------------------------- *//* SetConfParms: set conf parms relevant to this tool */void SetConfParms(void){ double d; int i; nParm = GetConfig("HLSTATS", TRUE, cParm, MAXGLOBS); if (nParm>0){ if (GetConfFlt(cParm,nParm,"DISCOUNT",&d)) disCount = d; if (GetConfInt(cParm,nParm,"TRACE",&i)) trace = i; }}void ReportUsage(void){ printf("\nUSAGE: HLStats [options] hmmList labFile...\n\n"); printf(" Option Default\n\n"); printf(" -b fn output bigram to file fn off\n"); printf(" -c N count num logical occs upto N none\n"); printf(" -d compute duration statistics off\n"); printf(" -f f set matrix bigram floor prob f 0.0\n"); printf(" -h N set hashsize: medium(1), large(2) small(0)\n"); printf(" -l s output covering list of models to s off\n"); printf(" -o generate wsj style back-off files matrix\n"); printf(" -p N count num physical occs upto N none\n"); printf(" -s s1 s2 select start s1 and end s2 labels !ENTER !EXIT\n"); printf(" -t n set threshold for including bigram 0\n"); printf(" -u f set back off unigram floor prob f 1.0\n"); PrintStdOpts("GIX"); printf("\n\n");}int main(int argc, char *argv[]){ char * labFn, *listfn, *s; int i,fidx; MLFEntry *me = NULL; Transcription *t; void InitStats(char *listfn); void GatherStats(Transcription *t); void OutputStats(void); if(InitShell(argc,argv,hlstats_version,hlstats_vc_id)<SUCCESS) HError(1300,"HLStats: InitShell failed"); InitMem(); InitMath(); InitWave(); InitLabel(); InitLM(); if (!InfoPrinted() && NumArgs() == 0) ReportUsage(); if (NumArgs() == 0) Exit(0); SetConfParms(); enterId=GetLabId("!ENTER",TRUE); /* All sentences should or are coerced */ exitId=GetLabId("!EXIT",TRUE); /* to start enterId and end exitId */ nullId=GetLabId("!NULL",TRUE); /* Name for words not in list */ while (NextArg() == SWITCHARG) { s = GetSwtArg(); if (strlen(s)!=1) HError(1319,"HLStats: Bad switch %s; must be single letter",s); switch(s[0]){ case 'b': doBigram = TRUE; if (NextArg() != STRINGARG) HError(1319,"HLStats: Ngram output file name expected"); bigFile = GetStrArg(); break; case 'c': doLCount = TRUE; lCountLimit = GetChkedInt(0,100000,s); break; case 'd': doDurs = TRUE; break; case 'f': bigFloor = GetChkedFlt(0.0,1000.0,s); break; case 'h': hSize = GetChkedInt(1,2,s); break; case 'l': doList = TRUE; if (NextArg() != STRINGARG) HError(1319,"HLStats: Output label list file name expected"); listFile = GetStrArg(); break; case 'o': doBOff = TRUE; break; case 'p': doPCount = TRUE; pCountLimit = GetChkedInt(0,100000,s); break; case 's': if (NextArg() != STRINGARG) HError(1319,"HLStats: ENTER label name expected"); enterId=GetLabId(GetStrArg(),TRUE); if (NextArg() != STRINGARG) HError(1319,"HLStats: EXIT label name expected"); exitId=GetLabId(GetStrArg(),TRUE); break; case 't': bigThresh = GetChkedInt(0,100,s); break; case 'u': uniFloor = GetChkedFlt(0.0,1000.0,s); break; case 'G': if (NextArg() != STRINGARG) HError(1319,"HLStats: Input label File format expected"); if((ff = Str2Format(GetStrArg())) == ALIEN) HError(-1389,"HLStats: Warning ALIEN Label file format set"); break; case 'I': if (NextArg() != STRINGARG) HError(1319,"HLStats: Input MLF file name expected"); LoadMasterFile(GetStrArg()); break; case 'T': if (NextArg() != INTARG) HError(1319,"HLStats: Trace value expected"); trace = GetChkedInt(0,017,s); break; default: HError(1319,"HLStats: Unknown switch %s",s); } } if (NextArg()!=STRINGARG) HError(1319,"HLStats: Label list file name expected"); listfn = GetStrArg(); if (!(doDurs || doBigram || doList || doLCount || doPCount)) HError(1330,"HLStats: Nothing to do!"); InitStats(listfn); i=0; while (NumArgs()>0) { if (NextArg()!=STRINGARG) HError(1319,"HLStats: Input label file name expected"); labFn = GetStrArg(); if (IsMLFFile(labFn)) { fidx = NumMLFFiles(); if ((me=GetMLFTable()) != NULL) { while(me->next != NULL) me=me->next; LoadMasterFile(labFn); me=me->next; } else { LoadMasterFile(labFn); me=GetMLFTable(); } while (me != NULL) { if (me->type == MLF_IMMEDIATE && me->def.immed.fidx == fidx) { if (trace&T_FIL) { printf(" Processing file %s\n",me->pattern); fflush(stdout); } t = LOpen(&tmpHeap,me->pattern,ff); if (t->numLists<1) HError(-1330,"HLStats: Empty file %s",me->pattern); else GatherStats(t),i++; Dispose(&tmpHeap,t); } me = me->next; if ((trace&T_BAS) && !(trace&T_FIL) && NumMLFEntries()>5000 && i%1000==0) printf(". "),fflush(stdout); } if ((trace&T_BAS) && !(trace&T_FIL) && NumMLFEntries()>5000) printf("\n"); } else { if (trace&T_FIL) { printf(" Processing file %s\n",labFn); fflush(stdout); } t = LOpen(&tmpHeap,labFn,ff); if (t->numLists<1) HError(-1330,"HLStats: Empty file %s",me->pattern); else GatherStats(t),i++; Dispose(&tmpHeap,t); } } if (trace&T_MEM) PrintAllHeapStats(); OutputStats(); if (trace&T_MEM) PrintAllHeapStats(); Exit(0); return (0); /* never reached -- make compiler happy */}/* PrintSettings: print info on stats requested */void PrintSettings(void){ if (doLCount || doPCount){ printf("Computing Label Occurrence Statistics\n"); if (doPCount) printf(" upto %d physical\n",pCountLimit); if (doLCount) printf(" upto %d logical\n",lCountLimit); } if (doBigram) { printf("Computing Bigram Statistics\n"); if (doBOff){ printf(" unifloor = %f\n",uniFloor); printf(" bgthresh = %d\n",bigThresh); printf(" discount = %f\n",disCount); } else printf(" bigfloor = %f\n",bigFloor); } if (doDurs) printf("Computing Label Duration Statistics\n"); fflush(stdout);}/* -------------------- Gather Statistics -------------------- */typedef struct cntr{ /* Physical Label Occurrence Counters */ LabId name; /* Name */ int count; /* Times seen */} Cntr;typedef struct wordinfo{ /* Label Occurrence Counters */ LabId name; /* Name */ int count; /* Times seen */ Cntr *pCntr; /* Physical counter */ float minDur; /* Min duration */ float maxDur; /* Max duration */ float sumDur; /* Total duration */} WordInfo;#define ASIZE 2 /* Need two words to id a bigram */typedef struct aentry { /* Storage for counts */ unsigned short word[ASIZE]; /* Bigram id */ int count; /* Count */ struct aentry *link; /* Next entry in hash table */} AEntry;static int lSize; /* Number of logical labels */static int pSize; /* Number of physical labels */static WordInfo *lTab; /* Table of logical counts/durations */static Cntr *pTab; /* Table of physical counts */static AEntry **aetab; /* Hash table for bigram accumulators */static int aetabsize=0; /* Size of hash table selected from .. */static int hashsizes[4]={ 87793, 188281, 715249 };static int nae=0; /* Number of accumulators created *//* wd_cmp: word order relation used to sort lTab */static int wd_cmp(const void *v1,const void *v2){ WordInfo *w1,*w2; w1=(WordInfo*)v1;w2=(WordInfo*)v2; if (w1->name==enterId) return(-1); else if (w2->name==enterId) return(1); else if (w1->name==exitId) return(1); else if (w2->name==exitId) return(-1); return(strcmp(w1->name->name,w2->name->name));}/* InitWordInfo: Initialise contents of WordInfo rec */void InitWordInfo(WordInfo *w, LabId id, Cntr *pCntr){ w->name = id; w->pCntr = pCntr; w->minDur = 1E30; w->maxDur = 0.0; w->sumDur = 0.0; w->count = 0;}/* InitStats: Create and init all necessary global accumulators */void InitStats(char *listFn){ int h,p,l; MLink q,hm; HLink hmm; HMMSet *hset; CreateHeap(&tmpHeap,"TempHeap",MSTAK,1,1.0,8000,80000); CreateHeap(&statHeap,"StatHeap",MSTAK,1,1.0,8000,240000); hset=(HMMSet*)New(&tmpHeap,sizeof(HMMSet)); CreateHMMSet(hset,&tmpHeap,FALSE); if(MakeHMMSet(hset,listFn)<SUCCESS) HError(1328,"Initstats: MakeHMMSet failed"); /* Make sure we have entries for ENTER / EXIT labels */ if (FindMacroName(hset,'l',enterId)==NULL) { hmm=(HMMDef*)New(&tmpHeap,sizeof(HMMDef)); NewMacro(hset,0,'l',enterId,hmm); NewMacro(hset,0,'h',enterId,hmm); } if (FindMacroName(hset,'l',exitId)==NULL) { hmm=(HMMDef*)New(&tmpHeap,sizeof(HMMDef)); NewMacro(hset,0,'l',exitId,hmm); NewMacro(hset,0,'h',exitId,hmm); } pSize=hset->numPhyHMM; pTab=(Cntr*)New(&statHeap,(pSize+1)*sizeof(Cntr)); p=1; pTab[0].name=nullId; for (h=0; h<MACHASHSIZE; h++) for (q=hset->mtab[h]; q!=NULL; q=q->next) { if (q->type=='h') { hmm=(HLink) q->structure; hmm->hook=(Ptr)p; pTab[p].name=q->id; pTab[p].count=0; p++; } } lSize=hset->numLogHMM; lTab=(WordInfo*)New(&statHeap,(lSize+1)*sizeof(WordInfo)); l=1; InitWordInfo(lTab,nullId,pTab); for (h=0; h<MACHASHSIZE; h++) for (q=hset->mtab[h]; q!=NULL; q=q->next) if (q->type=='l') { hmm=(HLink) q->structure; hm=FindMacroStruct(hset,'h',q->structure); if (hm==NULL || hmm->hook==0) HError(1390,"InitStats: No physical name found for %s", q->id->name); InitWordInfo(lTab+l,q->id,pTab+(int)hmm->hook); l++; } qsort(lTab+1,lSize,sizeof(WordInfo),wd_cmp); for (l=1; l<=lSize; l++) lTab[l].name->aux=(Ptr)l; Dispose(&tmpHeap,hset); if (doBigram) { /* create aetab */ aetabsize=hashsizes[hSize]; aetab=(AEntry**)New(&statHeap,aetabsize*sizeof(AEntry*)); for (l=0;l<aetabsize;l++) aetab[l]=NULL; } if (trace&T_BAS) { PrintSettings(); printf("\n\nRead Label list - %d/%d labels\n",lSize,pSize); }}/* GetAEntry: find ngram in in aetab. If not found and create is set, then add new entry */AEntry *GetAEntry(int in[ASIZE],Boolean create){ AEntry *ae; int i; unsigned int hash; hash=0; for (i=0,hash=0;i<ASIZE;i++) hash=((hash<<16)+in[i])%aetabsize;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -