📄 lpcalc.c
字号:
/* ----------------------------------------------------------- *//* *//* ___ *//* |_| | |_/ SPEECH *//* | | | | \ RECOGNITION *//* ========= SOFTWARE */ /* *//* *//* ----------------------------------------------------------- *//* developed at: *//* *//* Speech Vision and Robotics group *//* Cambridge University Engineering Department *//* http://svr-www.eng.cam.ac.uk/ *//* *//* main authors: Valtcho Valtchev, Steve Young, *//* Julian Odell, Gareth Moore *//* ----------------------------------------------------------- *//* Copyright: *//* *//* 1994-2002 Cambridge University *//* Engineering Department *//* *//* Use of this software is governed by a License Agreement *//* ** See the file License for the Conditions of Use ** *//* ** This banner notice must not be removed ** *//* *//* ----------------------------------------------------------- *//* File: LPCalc: probability calculation *//* ----------------------------------------------------------- */char *lpcalc_version = "!HVER!LPCalc: 3.3 [CUED 28/04/05]";char *lpcalc_vc_id = "$Id: LPCalc.c,v 1.1.1.1 2005/05/12 10:52:18 jal58 Exp $";#include "HShell.h" /* HMM ToolKit Modules */#include "HMem.h"#include "HMath.h"#include "HWave.h"#include "HLabel.h"#include "LWMap.h" #include "LGBase.h" /* LM ToolKit Modules */#include "LUtil.h"#include "LModel.h"#include "LPCalc.h"#define T_TOP 0001 /* top level tracing */#define T_FOF 0002 /* FoF table tracing *//* -------------------------- Trace Flags ------------------------ */static int trace = 0;/* ---------------- Configuration Parameters --------------------- */static ConfParam *cParm[MAXGLOBS];static int nParm = 0; /* total num params *//* ---------------------- Global Variables ----------------------- */static LabId sstId; /* sentence start marker */static char sstStr[256] = DEF_STARTWORD; /* sentence start marker */static float uniFloor = 1.0; /* unigram floor *//* SetConfParms: set conf parms relevant to this tool */void InitPCalc(void){ int i; char s[256]; Register(lpcalc_version,lpcalc_vc_id); nParm = GetConfig("LPCALC", TRUE, cParm, MAXGLOBS); if (nParm>0){#ifndef HTK_TRANSCRIBER if (GetConfInt(cParm,nParm, "TRACE",&i)) trace = i;#endif if (GetConfStr(cParm,nParm, "STARTWORD",s)) strcpy(sstStr,s); } sstId = GetLabId(sstStr,TRUE);}/* EXPORT->InitBuildInfo: initialise build parameters */void InitBuildInfo(BuildInfo *bi){ int i, j; /* Temporary values */ char s[256]; /* Temporary string */ bi->nSize = 0; bi->ftab = NULL; bi->saveFmt = DEF_SAVEFMT; bi->ptype = DEF_LMPTYPE; bi->uniFloor = DEF_UNIFLOOR; bi->kRange = DEF_KRANGE; bi->dctype = DEF_DCTYPE; for (i=1; i<=LM_NSIZE; i++) bi->cutOff[i] = DEF_CUTOFF; bi->wmap = NULL; bi->inSet = NULL; if (GetConfInt(cParm,nParm, "UNIFLOOR",&i)) bi->uniFloor = i; if (GetConfInt(cParm,nParm, "KRANGE",&i)) bi->kRange = i; if (GetConfStr(cParm,nParm, "DCTYPE",s)) { if (!strcmp(s,"TG")) bi->dctype = DC_KATZ; else if (!strcmp(s,"ABS")) bi->dctype = DC_ABSOLUTE; else if (!strcmp(s,"LIN")) bi->dctype = DC_LINEAR; } /* See if any config file settings for n-gram cut-offs */ for (i=2; i<=LM_NSIZE; i++) { sprintf(s,"%dG_CUTOFF",i); if (GetConfInt(cParm,nParm,s,&j)) bi->cutOff[i] = j; }}#define LMNDX(wm,i) wm->me[i].sort+1/* EXPORT->FilterNGram: read n-grams and map them to LM IDs */Boolean FilterNGram(NGInputSet *inSet, UInt *gram, float *count, int nSize){ int i; UInt gbuf[LM_NSIZE]; if (!GetNextNGram(inSet,gbuf,count,nSize)) return FALSE;#ifdef SANITY for (i=0; i<nSize; i++) { if (GetMEIndex(inSet->wm,gbuf[i]) < 0) { HError(15590,"FilterNGram: Read n-gram contains out of map words"); } }#endif for (i=0; i<nSize; i++) { gram[i] = LMNDX(inSet->wm,GetMEIndex(inSet->wm,gbuf[i])); } return TRUE;}/* EXPORT->CalcUniProbs: calculate unigram */static int CalcUniProbs(BackOffLM *lm, FLEntry *tgtFE, Boolean rebuild){ NameId nid; double tMass; SMEntry *se,*unigram; int i, numFloored; if (rebuild) { memcpy(lm->se_buff,tgtFE->sea,tgtFE->nse*sizeof(SMEntry)); unigram = tgtFE->sea; tgtFE->sea = lm->se_buff; } else { unigram = (SMEntry *) New(lm->heap,lm->vocSize*sizeof(SMEntry)); } for (se=unigram, i=0; i<lm->vocSize; i++,se++) { /* initialise array */ se->prob = 0.0; se->ndx=i+1; } for (se=tgtFE->sea,i=0; i<tgtFE->nse; i++,se++) { /* copy all entries across */ unigram[se->ndx-1].prob = se->prob; } if (tgtFE->nse!=lm->vocSize) { printf("%d distinct unigrams found in data, %d in word list\n",tgtFE->nse,lm->vocSize); fflush(stdout); } tMass = 0.0; numFloored = 0; for (se=unigram, i=0; i<lm->vocSize; i++,se++) { if (se->prob < uniFloor) { se->prob = uniFloor; numFloored++; } tMass += se->prob; } if (numFloored>0) { printf("%d unigrams floored to %.1f\n",numFloored,uniFloor); fflush(stdout); } if (lm->probType!=LMP_COUNT) { /* clamp sentence start symbol prob */ if ((nid = GetNameId(lm->htab,sstStr,FALSE))!=NULL) { if ((se = FindSE(unigram,0,lm->vocSize,LM_INDEX(nid)))!=NULL) { tMass = tMass - se->prob; se->prob = 0.0; } } for (se=unigram, i=0; i<lm->vocSize; i++, se++) { se->prob = se->prob/tMass; } } tgtFE->sea = unigram; tgtFE->nse = lm->vocSize; tgtFE->ndx = 0; tgtFE->bowt = 0.0; if (!rebuild) { /* initialise root FE if building from scratch */ tgtFE->fea = NULL; tgtFE->nfe = 0; } return lm->vocSize;} static double ApplyTG(BackOffInfo *boi, FLEntry *tgtFE, double tMass, int nSize){ int i,k,r; SMEntry *se; double uMass; TuringGoodInfo *tgi; tgi = &boi->dcInfo.tgInfo; /* apply TG discounting */ for (se=tgtFE->sea,i=0; i<tgtFE->nse; i++,se++) { if ((r = (int) se->prob) <= tgi->kRange) { se->prob = tgi->coef[r] * se->prob; } } /* accumulate unseen probability mass */ uMass = 0.0; for (se=tgtFE->sea,i=0; i<tgtFE->nse; i++,se++) uMass += se->prob; uMass = tMass - uMass; if (uMass==0.0) { /* unable to accumulate unseen count, try alternative */ k = boi->cutOff+1; for (se=tgtFE->sea,i=0; i<tgtFE->nse; i++,se++) { uMass += (1.0 - tgi->coef[k]) * se->prob; se->prob *= tgi->coef[k]; if ((k++)==tgi->kRange) break; } } return uMass;}static double ApplyABS(BackOffInfo *boi, FLEntry *tgtFE, double tMass) { int i; SMEntry *se; double b,uMass; /* apply Absolute discounting */ b = boi->dcInfo.bCoef; for (se=tgtFE->sea,i=0; i<tgtFE->nse; i++,se++) { se->prob = se->prob - b; if (se->prob < 0.0) se->prob = 0.0; } /* accumulate unseen probability mass */ uMass = 0.0; for (se=tgtFE->sea,i=0; i<tgtFE->nse; i++,se++) uMass += se->prob; uMass = tMass - uMass; return uMass;}/* EXPORT->CalcNGramProbs: calculate and write n-gram entries lm - target language model (1..nSize-1)-grams should be in place feId - array[0..nSize-2] of LM IDs representing context nSize - n-gram to calculate tgtFE - target FLEntry rebuld - TRUE if converting LMP_COUNT -> LMP_FLOAT */static int CalcNGramProbs(BackOffLM *lm, UInt *feId, int nSize, FLEntry *tgtFE, Boolean rebuild){ int i, j, r; int nse, nItem; double uMass=0, tMass, boSum, prob; LMProbType ptype; BackOffInfo *boi; SMEntry *se,*bo_se,*se_perm,*tse; FLEntry *fe; /* se_perm -> permanent SE storage, tgtFE->sea -> lm->se_buff */ if (nSize==1) { return CalcUniProbs(lm,tgtFE,rebuild); } if ((ptype = lm->probType)==LMP_LOG) HError(15590,"CalcNGramProbs: Incompatible prob kind (%d)",ptype); if ((boi = lm->gInfo[nSize].boInfo)==NULL) HError(15590,"CalcNGramProbs: Back-off info not present for %d grams",nSize); if (boi->dcType!=DC_KATZ && boi->dcType!=DC_ABSOLUTE) HError(15590,"CalcNGramProbs: Unsupported LM type (%d)",boi->dcType); if (rebuild) { /* rebuilding model - no need to allocate storage */ se_perm = lm->se_buff; memcpy(lm->se_buff,tgtFE->sea,tgtFE->nse*sizeof(SMEntry)); se_perm = tgtFE->sea; tgtFE->sea = lm->se_buff; /* swap them round */ tMass = tgtFE->bowt; } else { se_perm = NULL; tMass = 0.0; } /* first, accumulate total count and apply cutoff */ nse = 0; for (se=tgtFE->sea,i=0; i<tgtFE->nse; i++,se++) { tMass += se->prob; if ((r = (int) se->prob) <= boi->cutOff) se->prob = 0.0; if (se->prob > 0.0) nse++; } if (se_perm==NULL) /* allocate permanent SE storage */ se_perm = (SMEntry *) New(lm->heap,nse*sizeof(SMEntry)); /* copy entries with non-zero probabilities to se_perm */ for (tse=se_perm,se=tgtFE->sea,i=0; i<tgtFE->nse; i++,se++) if (se->prob>0.0) *tse++=*se; /* then copy back to tgtFE->sea */ memcpy(tgtFE->sea,se_perm,nse*sizeof(SMEntry)); tgtFE->nse = nse; qsort(tgtFE->sea,tgtFE->nse,sizeof(SMEntry),CmpSE); if (ptype==LMP_COUNT) { /* building COUNT model */ /* accumulate unseen probability mass */ uMass = 0.0; for (se=tgtFE->sea,i=0; i<tgtFE->nse; i++,se++) uMass += se->prob; uMass = tMass - uMass; boSum = 1.0; } else { /* building probabilistic model */ switch(boi->dcType) { case DC_KATZ: uMass = ApplyTG(boi,tgtFE,tMass,nSize); break;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -