📄 lpcalc.c
字号:
case DC_ABSOLUTE: uMass = ApplyABS(boi,tgtFE,tMass); break; default : HError(15590,"CalcNGramProbs: Unsupported LM type (%d)",boi->dcType); break; } /* calculate sum of (n-1)-gram probs for unseen entries */ boSum = 0.0; GetNGramProbs(lm, feId+1, nSize-1, se_perm, nse); if (boi->wdThresh>0) { for (bo_se=se_perm,se=tgtFE->sea,fe=tgtFE->fea,i=j=0; i<tgtFE->nse; i++,bo_se++,se++){ prob=se->prob/tMass; if (fabs(se->prob*(log(prob)-log(bo_se->prob)))<boi->wdThresh && (fe==NULL || j>=tgtFE->nfe || fe->ndx!=se->ndx)) { uMass+=se->prob,se->prob=0.0; /* oh my goodness - who wrote that?! */ if (j<tgtFE->nfe && fe->ndx==se->ndx) /* PRUNE FE AS WELL */ fe->nse=0,fe->nfe=0; } if (j<tgtFE->nfe && fe->ndx<=se->ndx) fe++,j++; } } for (bo_se=se_perm,se=tgtFE->sea,i=0; i<tgtFE->nse; i++,bo_se++,se++) if (se->prob > 0.0) boSum += bo_se->prob; boSum = 1.0 - boSum; } nItem = 0; if (uMass!=tMass) { /* some real n-grams still left after discounting */ if (ptype == LMP_COUNT) { tMass = 1.0; } tse = se_perm; for (se=tgtFE->sea,i=0; i<tgtFE->nse; i++,se++) { if (se->prob>0.0) { tse->prob = se->prob / tMass; tse->ndx = se->ndx; tse++; nItem++; } } tgtFE->sea = se_perm; tgtFE->nse = nItem; tgtFE->ndx = feId[nSize-2]; tgtFE->bowt = (boSum <= 0.0) ? MIN_BOWT : (uMass / (tMass * boSum)); if (!rebuild) { tgtFE->fea = NULL; tgtFE->nfe = 0; } } else { tgtFE->sea = se_perm; tgtFE->nse = nItem; tgtFE->nfe = 0; } return nItem;}#define GRAM2TEXT() { \ for (s = sbuf, *sbuf='\0', j=0; j<nSize; j++) { \ sprintf(s," %s",wmap->id[gramKey[j]]->name); s+=strlen(s); \ } \}/* EXPORT CalculateNGram: calculate nSize-grams from gram files in inSet */static int CalculateNGram(BackOffLM *lm, NGInputSet *inSet, int nSize){ float count; WordMap *wmap; char *s, sbuf[256]; int i, j, nse, nfe, nItem; SMEntry *se, *se_buff; FLEntry *fe, *fe_buff, *feptr; UInt *ge, gram[LM_NSIZE]; UInt gramKey[LM_NSIZE]; Boolean newCX1, newCX2; if ((se = se_buff = lm->se_buff)==NULL) HError(15590,"CalculateNGram: se_buff not initialised"); if ((fe = fe_buff = lm->fe_buff)==NULL) HError(15590,"CalculateNGram: fe_buff not initialised"); if ((wmap = inSet->wm)==NULL) HError(15590,"CalculateNGram: Word map not set"); if (nSize < 1 || nSize > inSet->N) HError(15590,"CalculateNGram: Invalid nSize (%d)",nSize); nse = 0; nfe = 0; nItem = 0; OpenInputSet(inSet); if (!FilterNGram(inSet,gram,&count,nSize)) HError(15513,"CalculateNGram: Unable to read first n-gram"); memcpy(gramKey,gram,nSize*sizeof(UInt)); do {#ifdef SANITY for (i=0; i<nSize; i++) if (gram[i] < 1 || gram[i] > lm->vocSize) HError(15590,"CalculateNGram: LM index out of range (%d)",gram[i]);#endif for (newCX1=FALSE, ge=gram, i=0; i<nSize-2; i++, ge++) if (gramKey[i]!=*ge) { newCX1 = TRUE; break; } newCX2 = (nSize==1) ? newCX1 : newCX1 || (gramKey[nSize-2]!=gram[nSize-2]); if (newCX2) { fe->nse = nse; fe->sea = se_buff; if ((nse = CalcNGramProbs(lm,gramKey,nSize,fe,FALSE)) > 0) { fe->fea = NULL; fe->nfe = 0; nItem += nse; fe++; nfe++; } if (newCX1) { if (nfe>0) { for (feptr=&lm->root, i=0; i<nSize-2; i++) { FLEntry *feptr2; if ((feptr2 = FindFE(feptr->fea,0,feptr->nfe,gramKey[i]))==NULL) { GRAM2TEXT(); HError(15520,"CalculateNGram: Unable to find FLEntry to attach (%s)",sbuf); } feptr=feptr2; } if (feptr->nfe > 0 || feptr->fea!=NULL) { GRAM2TEXT(); HError(15525,"CalculateNGram: Attempt to overwrite entries when attaching (%s)",sbuf); } feptr->fea = fe_buff; feptr->nfe = nfe; StoreFEA(feptr,lm->heap); } fe = fe_buff; nfe = 0; for (ge=gram,i=0; i<nSize-2; i++,ge++) gramKey[i] = *ge; } gramKey[nSize-2] = gram[nSize-2]; se = se_buff; nse=0; } se->ndx = gram[nSize-1]; se->prob = count; se++; nse++;#ifdef SANITY if (nse>lm->vocSize) HError(15590,"CalculateNGram: SE buffer limit reached (%d)",nse);#endif } while(FilterNGram(inSet,gram,&count,nSize)); /* finish off the remaining n-grams accumulated */ if (nSize > 1) { /* (n>1)-grams */ fe->nse = nse; fe->sea = se_buff; if ((nse = CalcNGramProbs(lm,gramKey,nSize,fe,FALSE)) > 0) { fe->fea = NULL; fe->nfe = 0; nItem += nse; fe++; nfe++; } if (nfe > 0) { for (feptr=&lm->root, i=0; i<nSize-2; i++) { if ((feptr = FindFE(feptr->fea,0,feptr->nfe,gramKey[i]))==NULL) { GRAM2TEXT(); HError(15520,"CalculateNGram: Unable to find FLEntry for (%s)",sbuf); } } if (feptr->nfe > 0 || feptr->fea!=NULL) { GRAM2TEXT(); HError(15525,"CalculateNGram: Attempt to ovewrite entries when attaching (%s)",sbuf); } feptr->fea = fe_buff; feptr->nfe = nfe; StoreFEA(feptr,lm->heap); } } else { /* unigrams */ lm->root.nse = nse; lm->root.sea = se_buff; nItem = CalcUniProbs(lm,&lm->root,FALSE); } CloseInputSet(inSet); return nItem;}#define DEF_ABS_COEF 0.5static double CalcABSCoef(int nSize, FoFTab *ftab) { UInt **fof; double coef; fof = ftab->fof; if (fof[1][nSize]==0 || fof[2][nSize]==0) coef = DEF_ABS_COEF; else coef = (double) fof[1][nSize] / (double) (fof[1][nSize] + 2.0*fof[2][nSize]); if (trace&T_TOP) printf("Absolute discounting term %e\n",coef); return coef;}#define DEF_TG_COEF 0.99static void CalcTGCoefs(MemHeap *heap, BackOffInfo *boi, int nSize, FoFTab *ftab) { int r,K; UInt **fof; double kTerm,gTerm; TuringGoodInfo *tgi; Boolean ok,allPositive; fof = ftab->fof; tgi = &boi->dcInfo.tgInfo; K = tgi->kRange; tgi->coef = (float *) New(heap,(K+1)*sizeof(float)); for (r=0; r<=K; r++) tgi->coef[r] = 0.0; /* check for singularities */ for (ok = (fof[1][nSize]>0),r=1; ok && r<K; r++) ok = ok && (fof[r][nSize]>0); if (ok) { do { if (K <= 1) { HError(-15560, "CalcTGCoefs: Invalid K=%d - setting default K and coefficients", K); K = tgi->kRange; for (r=1; r <=K; r++) tgi->coef[r] = (r <= boi->cutOff) ? 0.0 : DEF_TG_COEF; return; } kTerm = (double) ((K+1) * fof[K+1][nSize]) / (double) fof[1][nSize]; /* if (kTerm>DEF_TG_COEF) { kTerm = DEF_TG_COEF; if (trace&T_TOP) printf("CalcTGCoefs: clamping kTerm to %f\n", DEF_TG_COEF); }*/ /* Further check that kTerm > (r+1).c[r+1]/c[r] for 1<=r<k */ allPositive = TRUE; for (r=(boi->cutOff?boi->cutOff:1); r<K; r++) { gTerm = (double) ((r+1) * fof[r+1][nSize])/(double) (r*fof[r][nSize]); if (((kTerm<=1.0) && (kTerm>=gTerm)) || ((kTerm>1.0) && (kTerm<gTerm))) allPositive = FALSE; printf("g[%d]=%f\n", r, gTerm); } if (allPositive) break; K--; if (trace&T_TOP) printf("CalcTGCoefs: lowering K to %d\n", K); } while(TRUE); for (r=1; r<=K; r++) { gTerm = (double) ((r+1) * fof[r+1][nSize])/(double) (r*fof[r][nSize]); if (r <= boi->cutOff) { tgi->coef[r] = 0.0; } else { tgi->coef[r] = ((gTerm - kTerm) / (1.0 - kTerm)); if (tgi->coef[r] < 1E-03) { HError(-15560, "CalcTGCoefs: Invalid coefficient detected in Turing-Good discounting (%f) - clamped to 1E-03 [gTerm=%f, kTerm=%f, r=%d, cutoff=%d]", tgi->coef[r], gTerm, kTerm, r, boi->cutOff); tgi->coef[r] = 1E-03; } } } if (trace&T_TOP) printf("%d-gram coefs:\n",nSize); } else { for (r=1; r <=K; r++) tgi->coef[r] = (r <= boi->cutOff) ? 0.0 : DEF_TG_COEF; } for (r=1; r<=K; r++) { if (trace&T_TOP) printf("coef[%d]=%e",r,tgi->coef[r]); if (tgi->coef[r]>1.0) { tgi->coef[r] = DEF_TG_COEF; if (trace&T_TOP) printf(", clamped to %.4f",tgi->coef[r]); } if (trace&T_TOP) printf("\n"); } tgi->kRange = K;}/* EXPORT->CalcDiscountCoefs: calculate discount coefs from fof table */static void CalcDiscountCoefs(BackOffLM *lm, FoFTab *ftab){ int ns; BackOffInfo *boi; for (ns=2; ns<=lm->nSize; ns++) { if ((boi = lm->gInfo[ns].boInfo)==NULL) HError(15590,"CalcDiscountCoefs: Back-off info not available for %d-gram",ns); switch (boi->dcType) { case DC_KATZ: CalcTGCoefs(lm->heap,boi,ns,ftab); break; case DC_ABSOLUTE: boi->dcInfo.bCoef = CalcABSCoef(ns,ftab); break; default: HError(15590,"CalcDiscountCoefs: Unsupported LM type (%d)",boi->dcType); } }}/* CheckCutoffs: check n-gram cutoffs and discounting range */static void CheckCutoffs(BackOffLM *lm){ BackOffInfo *boi; int ns,kRange,lastCutOff; lastCutOff=0; for (ns=2; ns<=lm->nSize; ns++) { if ((boi = lm->gInfo[ns].boInfo)==NULL) HError(15590,"CheckCutoffs: Back-off info not available for %d-gram",ns); if (boi->cutOff < lastCutOff) { HError(15540,"CheckCutoffs: %d-gram cutoff = %d, %d-gram cutoff = %d", ns,boi->cutOff,ns-1,lastCutOff); } if (boi->dcType!=DC_KATZ && boi->dcType!=DC_ABSOLUTE) HError(15590,"CheckCutoffs: Unsupported LM type (%d)",boi->dcType); if (boi->dcType==DC_KATZ) { kRange = boi->dcInfo.tgInfo.kRange; if (boi->cutOff > kRange) HError(-15540,"CheckCutoffs: %d-gram cutoff out of range (%d)",ns,boi->cutOff); } lastCutOff = boi->cutOff; }}/* InitTargetModel: initialise target LM structure */static BackOffLM *InitTargetModel(MemHeap *heap, BuildInfo *bi){ int i,ndx,N; NameId nId; BackOffLM *lm; BackOffInfo *boi; if (bi->nSize<1) HError(15590,"GenerateLM: Invalid n-gram size (%d)",bi->nSize); if (bi->ptype!=LMP_FLOAT && bi->ptype!=LMP_COUNT) HError(15590,"GenerateLM: Invalid probability kind (%d)",bi->ptype); lm = (BackOffLM *) New(heap,sizeof(BackOffLM)); lm->heap = heap; lm->gScale = 1.0; lm->nSize = bi->nSize; lm->probType = bi->ptype;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -