📄 lgprep.c
字号:
/* ReadRuleDef: read rule definition from s and it add to rule set */void ReadRuleDef(char *s, RuleSet *rset){ RuleDef *x; char buf[256]; float f; Boolean inPat = TRUE; x = (RuleDef *)New(&(rset->mem),sizeof(RuleDef)); x->next = NULL; if (rset->nRules++ == 0) rset->head = x; else rset->tail->next = x; rset->tail = x; s = NextWord(s,buf); f = atof(buf); if (f<0.0 || f>1.0) HError(16020,"ReadRuleDef: appl. factor %f out of range 0..1",f); x->pact = f; x->psum = 0.0; x->src.n = 0; s = NextWord(s,buf); while (s != NULL && inPat) { if (x->src.n >= MAX_FIELDS) HError(16020,"ReadRuleDef: too many fields in pattern"); switch(buf[0]) { case ':': inPat = FALSE; break; case '*': x->src.fop[x->src.n++] = f_WILD; break; case '%': x->src.fop[x->src.n] = f_WSET; x->src.fdt[x->src.n++].setid = atoi(buf+1); break; case '!': x->src.fop[x->src.n] = f_NWSET; x->src.fdt[x->src.n++].setid = atoi(buf+1); break; default: x->src.fop[x->src.n] = f_WORD; x->src.fdt[x->src.n++].wdid = GetLabId(buf,TRUE); break; } s = NextWord(s,buf); } x->tgt.n = 0; while (s != NULL) { if (x->tgt.n >= MAX_FIELDS) HError(16020,"ReadRuleDef: too many fields in replace"); switch(buf[0]) { case '$': x->tgt.fop[x->tgt.n] = f_FIELD; x->tgt.fdt[x->tgt.n++].setid = atoi(buf+1); break; default: x->tgt.fop[x->tgt.n] = f_WORD; x->tgt.fdt[x->tgt.n++].wdid = GetLabId(buf,TRUE); break; } s = NextWord(s,buf); } if (x->src.n > editWinSize) editWinSize = x->src.n; if (trace&T_RIN) printf(" read rule: %5.2f [%d : %d]\n",x->pact,x->src.n,x->tgt.n);}/* ReadRuleSet: read rule set from file */void ReadRuleSet(char *fn, RuleSet *rset){ Source src; char buf[1024], *s; Boolean infile; if (InitSource(fn,&src,NoFilter)==FAIL) { HError(16010, "ReadRuleSet: Can't read rule set from '%s'", fn); } do { infile = ReadLine(&src,buf); s = SkipToWord(buf); if (*s != '\0') { if (*s == '#') ReadSetDef(s+1,rset); else ReadRuleDef(s,rset); } } while (infile); if (trace&T_RIN) { printf("Loaded %d sets and %d rules from file %s\n", rset->nSets,rset->nRules,fn); fflush(stdout); } CloseSource(&src);}/* PrintFields: print a list of rule fields */void PrintFields(FieldVec *fl){ int i; for (i=0; i<fl->n; i++) switch(fl->fop[i]){ case f_FIELD: printf(" $%d",fl->fdt[i].flid); break; case f_WILD: printf(" *"); break; case f_WORD: printf(" %s",fl->fdt[i].wdid->name); break; case f_WSET: printf(" %%%d",fl->fdt[i].setid); break; case f_NWSET: printf(" !%d",fl->fdt[i].setid); break; default: printf(" <error; unknown type>"); break; }}/* PrintRuleSet: print rule set */void PrintRuleSet(RuleSet *rset){ int i,j; SetDef *x; RuleDef *r; printf("Rule Set [%d sets, %d rules]:\n",rset->nSets,rset->nRules); for (i=0; i<MAX_SETS; i++) { if ((x = rset->setlist[i]) != NULL) { printf(" #%3d ",i); for (j=0; j<x->nItem; j++) printf(" %s",x->item[j]->name); printf("\n"); } } for (r=rset->head; r != NULL; r = r->next) { printf("%5.2f ",r->pact); PrintFields(&r->src); printf(" : "); PrintFields(&r->tgt); printf("\n"); }}/* ------------------------ Initialisation ----------------------- *//* Exists: return true if given file exists */Boolean Exists(char *fn){ FILE *f; if ((f=fopen(fn,"r")) == NULL) return FALSE; fclose(f); return TRUE;}/* InitWordMap: load and initialise wordmap */void InitWordMap(void){ CreateWordMap(imapFN, &wmap, newWords); if (forceCnts) wmap.hasCnts = TRUE; if (!htkEscape) wmap.htkEsc = FALSE; /* default is TRUE */ ++wmap.seqno; mapUpdated = FALSE;}/* InitShiftReg: initialise a shift register */void InitShiftReg(ShiftReg *sr, int size, char *fn){ char path[256]; MakeFN(fn,dbsDir,NULL,path); sr->used = 0; sr->ng[nSize] = 1; /* count = 1 */ sr->ngb = CreateNGBuffer(&ngbHeap,nSize,size,path,&wmap); sr->ngb->fndx += dumpOfs;}/* Initialise: initialise global data structures */void Initialise(void){ char buf[256]; if (ruleFN != NULL) { if (trace&T_TOP) printf(" creating rule set %s\n",ruleFN); CreateRuleSet(&rset); ReadRuleSet(ruleFN,&rset); if (trace&T_RUL) PrintRuleSet(&rset); } InitWordMap(); CreateHeap(&ngbHeap,"NGB mem",MSTAK,1,0.0,1000,1000); if (gbGen) InitShiftReg(&stdBuf,ngbSize,rootFN); if (ruleFN != NULL) { sprintf(buf,"%s_pos",rootFN); InitShiftReg(&posBuf,egbSize,buf); sprintf(buf,"%s_neg",rootFN); InitShiftReg(&negBuf,egbSize,buf); }}/* ----------------- NGram Counting Routines -------------------- *//* CompressBuffer: and save if necessary or mustSave is TRUE */void CompressBuffer(NGBuffer *ngb, Boolean mustSave){ float compx; if (ngb->used == 0) return; if (trace&T_MEM) { printf("** before buffer sort\n"); PrintAllHeapStats(); } SortNGBuffer(ngb); if (trace&T_MEM) { printf("** after buffer sort\n"); PrintAllHeapStats(); } compx = 100.0 * (float)ngb->used / (float)ngb->poolsize; if (trace&T_SAV) { printf(" buffer %s.%d compressed%s to %.1f%% at word %d\n", ngb->fn, ngb->fndx, mustSave?"[must save]":"",compx,wordnum); } if (compx > 75.0 || mustSave) { if (mustSave && mapUpdated) { SaveWordMap(omapFN,&wmap,FALSE); mapUpdated = FALSE; if (trace&T_TOP) printf(" word map saved to %s\n",omapFN); } if (trace&T_TOP) { printf(" saving %d ngrams to file %s.%d\n", ngb->used, ngb->fn, ngb->fndx); } WriteNGBuffer(ngb,txtsrc); }}/* PutShiftRegister: push word into shift register and extract ngram */void PutShiftRegister(LabId id, ShiftReg *sr){ int i; MapEntry *me; if (trace&T_SHR){ printf(" %12s --> %s\n",id->name,sr->ngb->fn); fflush(stdout); } AddWordToMap(&wmap,id); mapUpdated = TRUE; me = (MapEntry *)id->aux; sr->ng[sr->used++] = me->ndx; if (sr->used == nSize) { /* record ngram */ StoreNGram(sr->ngb,sr->ng); /* shift words */ sr->used--; for (i=0; i<sr->used; i++) sr->ng[i] = sr->ng[i+1]; /* compress buffer if full */ if (sr->ngb->used == sr->ngb->poolsize) { CompressBuffer(sr->ngb,FALSE); } }}/* -------------------- Editing Routines ------------------------- *//* MatchRule: return true if given rule matches editBuf */Boolean MatchRule(RuleDef *r){ int i,j; for (i=0; i<r->src.n; i++) { switch(r->src.fop[i]) { case f_WORD: if (editBuf[i] != r->src.fdt[i].wdid) return FALSE; break; case f_WSET: j = r->src.fdt[i].setid; if (!InSet(rset.setlist[j],editBuf[i])) return FALSE; break; case f_NWSET: j = r->src.fdt[i].setid; if (InSet(rset.setlist[j],editBuf[i])) return FALSE; break; case f_WILD: break; default: HError(16090,": bad op [%d] in field %d of replace", r->tgt.fop[i],i); } } return TRUE;}/* ApplyRule: put replace part of rule r into buf */void ApplyRule(RuleDef *r, LabId *buf){ int i; for (i=0; i<r->tgt.n; i++) { switch(r->tgt.fop[i]) { case f_WORD: buf[i] = r->tgt.fdt[i].wdid; break; case f_FIELD: buf[i] = editBuf[r->tgt.fdt[i].flid]; break; default: HError(16090,": bad op [%d] in field %d of replace", r->tgt.fop[i],i); } }}/* SendToEditBuffer: insert word into edit buffer and apply rules */void SendToEditBuffer(LabId id){ RuleDef *r; LabId replBuf[MAX_FIELDS]; int i; editBuf[editUsed++] = id; if (editUsed == editWinSize) { /* buffer is filled */ /* try each rule in turn */ for (r=rset.head; r != NULL; r = r->next) if (MatchRule(r)) { r->psum += r->pact; if (r->psum>1.0) { ApplyRule(r,replBuf); r->psum -= 1.0; for (i=0; i< r->src.n; i++) PutShiftRegister(editBuf[i],&negBuf); negBuf.used = 0; for (i=0; i< r->tgt.n; i++) PutShiftRegister(replBuf[i],&posBuf); posBuf.used = 0; } } /* Shift words */ editUsed--; for (i=0; i<editUsed; i++) editBuf[i] = editBuf[i+1]; }}/* -------------------------- Input Text ------------------------- *//* ProcessText: read text files line by line and count ngrams */void ProcessText(char *fn, Boolean lastFile){ FILE *f; LabId id; char sbuf[1024],*word; Boolean isPipe,wasSentStart; if (trace&T_TOP) printf("Reading source text file %s\n",(fn==NULL) ? "<stdin>" : fn); if ((fn!=NULL) && (strcmp(fn,"-")!=0)) { if ((f = FOpen(fn,LMTextFilter,&isPipe))==NULL) HError(16010,"ProcessText: unable to open text file %s", fn); } else { f = stdin; } wasSentStart = FALSE; word = sbuf+1; sbuf[0]='_'; while (fscanf(f,"%255s",word)==1) { wordnum++; if (tagSentStart) { id = GetLabId(wasSentStart ? sbuf : word,TRUE); wasSentStart = (id==sstId); } else { id = GetLabId(word,TRUE); } if (trace&T_INP) printf("[%s]\n",id->name); if (ruleFN == NULL && !gbGen) { AddWordToMap(&wmap,id); } else { id = GetLabId(word,TRUE); if (ruleFN != NULL) SendToEditBuffer(id); if (gbGen) PutShiftRegister(id,&stdBuf); } } if (fn!=NULL) FClose(f,isPipe); if (lastFile) { if (ruleFN == NULL && !gbGen) { SortWordMap(&wmap); SaveWordMap(omapFN,&wmap,FALSE); } else { if (gbGen) CompressBuffer(stdBuf.ngb,TRUE); if (ruleFN != NULL){ CompressBuffer(negBuf.ngb,TRUE); CompressBuffer(posBuf.ngb,TRUE); } } }}/* ---------------------- End of LGPrep.c ----------------------- */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -