📄 lgbase.c
字号:
int i, count, isize, N; UInt *p, *q; char fn[256]; if (trace&T_SRT) { sprintf(fn,"%s.%d",ngb->fn,ngb->fndx); printf(" Sorting %d N-grams (next write to %s)\n", ngb->used,fn); } SortWordMap(ngb->wm); qs_cmpSize = N = ngb->info.N; qs_wmap = ngb->wm; usort(ngb->pool,ngb->used,ngb->info.ng_full,qs_CmpNGram); p = ngb->pool; count = 1; isize = N + 1; for (q = ngb->pool + isize, i=1; i < ngb->used; i++, q += isize) { if (CmpNGram(ngb->wm,ngb->info.N,p,q)==0) {#ifdef LM_FLOAT_COUNT pp = (float *) p+N; qq = (float *) p+N; *pp += *qq;#else p[N] += q[N];#endif } else { p += isize; count++; if (p != q) memcpy(p, q, ngb->info.ng_full); } } ngb->used = count; ngb->next = p+isize; if (trace&T_SRT) { printf(" N-grams sorted %d remaining\n", ngb->used); fflush(stdout); }}/* WriteNGHeader: write a header for given NG Buffer */static void WriteNGHeader(FILE *f, NGBuffer *ngb, char *source){ int N = ngb->info.N,chkndx; LabId chkid; if (ngb->used == 0) HError(15390,"WriteNGHeader: Ngram buffer is empty"); fprintf(f,"NGram = %d\n",N); fprintf(f,"WMap = %s\n",ngb->wm->name); fprintf(f,"SeqNo = %d\n",ngb->wm->seqno); fprintf(f,"Entries = %d\n",ngb->used); WriteRawHGram(f,"Gram1",N,ngb->pool,ngb->wm); WriteRawHGram(f,"GramN",N,ngb->next-(N+1),ngb->wm); chkid = ngb->wm->id[ngb->wm->used / 2]; chkndx = WordLMIndex(chkid); fprintf(f,"WMCheck = %s %d\n",chkid->name,chkndx); if (source != NULL) fprintf(f,"Source = %s\n",source); fprintf(f,"\\Grams\\\n");}/* EXPORT->WriteNGBuffer: write ngb in compressed format to f */void WriteNGBuffer(NGBuffer *ngb, char *source){ int i, N; UInt *p; FILE *f; char fn[256]; Boolean isPipe; N = ngb->info.N; sprintf(fn,"%s.%d",ngb->fn,ngb->fndx); f = FOpen(fn, LGramOFilter, &isPipe); WriteNGHeader(f,ngb,source); for (i=0,p = ngb->pool; i<ngb->used; i++, p += N + 1) WriteNGram(f, N, p); ngb->used = 0; ngb->next = ngb->pool; ++ngb->fndx; FClose(f,isPipe);}/* EXPORT->PrintNGBuffer: print given buffer */void PrintNGBuffer(NGBuffer *ngb){ UInt *p; int i,N; N = ngb->info.N; printf("NGram Buffer: out file %s.%d\n",ngb->fn,ngb->fndx); WriteNGHeader(stdout,ngb,NULL); for (i=0,p=ngb->pool; i<ngb->used; i++,p+=N+1) PrintNGram(N,p,ngb->wm); printf("%d entries\n",ngb->used);}/* ------------- Multiple N-Gram Input File Handling --------- *//* ShowAbbrTxtGram: show ngram with fixed field width for each entry */static void ShowAbbrTxtGram(int N, LabId *ng){ int i; for (i=0; i<N; i++) { printf(" %-5.5s",(ng[i]==NULL)?"?????":ng[i]->name); }}/* ShowAbbrRawGram: show ngram with fixed field width for each entry */static void ShowAbbrRawGram(int N, NGram ng, WordMap *wm){ int i; LabId id; for (i=0; i<N; i++) { id = WordLMName(ng[i],wm); printf(" %-5.5s",(id==NULL)?"?????":id->name); }}#define LASTN(s,n) (strlen(s)>n) ? s+strlen(s)-n : s/* ShowGFSons: print offspring gramfiles */static void ShowGFSons(int N, GFLink gf, char * parent, WordMap *wm){ GFLink p; if (gf==NULL) HError(15390,"ShowGFSons: Unexpected null gram file list"); for (p=gf; p!=NULL; p=p->alt) { printf("%-12.12s %-12.12s ",LASTN(p->fn,12),LASTN(parent,12)); ShowAbbrTxtGram(N,p->firstGram); printf("-> "); ShowAbbrTxtGram(N,p->lastGram); printf("\n"); if (p->next != NULL) ShowGFSons(N, p->next,p->fn,wm); }}/* ShowInputSetTree: print input tree of ordering dependencies */static void ShowInputSetTree(NGInputSet *inset){ printf("Input Set Tree: %d files\n",inset->nFiles); printf("%-12s %-12s %s\n","File","Parent", " First N-gram -> Last N-gram"); ShowGFSons(inset->N, inset->head.next,"Root",inset->wm);}/* EXPORT->CreateInputSet: create input file set */void CreateInputSet(MemHeap *mem, WordMap *wm, NGInputSet *inset){ int i; SortWordMap(wm); inset->mem = mem; inset->N = 0; inset->wm = wm; inset->nFiles = inset->nOpen = inset->maxNOpen = 0; inset->nextValid = FALSE; inset->head.next = inset->head.alt = inset->head.chain = NULL; for (i=0; i<MAXINF; i++) inset->gf[i] = NULL; /* make head gram file a sentinel */ for (i=0; i<MAXNG; i++) inset->head.lastGram[i] = wm->id[wm->firstNdx];}/* EXPORT->AddInputGFile: add file fn to input set */void AddInputGFile(NGInputSet *inset, char *fn, float weight){ int i,N; GFLink p; NGSource ngs; OpenNGramFile(&ngs,fn,inset->wm); if (ngs.nItems > 0) { N = ngs.info.N; if (inset->N > 0) { if (N != inset->N) HError(15340,"AddInputGFile: File %s is %d-gram but inset is %d-gram", fn,N,inset->N); } else { inset->N = N; } p = (GFLink) New(inset->mem,sizeof(GramFile)); p->alt = p->next = NULL; p->weight = weight; p->chain = inset->head.chain; inset->head.chain = p; strcpy(p->fn,fn); for (i=0; i<inset->N; i++) { p->firstGram[i] = ngs.firstGram[i]; p->lastGram[i] = ngs.lastGram[i]; } ++inset->nFiles; } CloseNGramFile(&ngs);}/* ShowInputState: show current open input streams */static void ShowInputState(char *mess, NGInputSet *inset){ int i,j; printf("%s: %d files open\n",mess,inset->nOpen); for (i=0; i<inset->nOpen; i++) { j = inset->gfsort[i]; printf(" %2d[%2d] %-10s",i,j,inset->gf[j]->fn); ShowAbbrRawGram(inset->N,inset->ngs[j].nxt,inset->wm); printf("\n"); }}/* SortGFList: sort the list of open files into order of next available N-Gram. Sort order is defined by gfsort array */static void SortGFList(NGInputSet *inset){ qs_inset = inset; usort(inset->gfsort,inset->nOpen,sizeof(int),qs_CmpGFile); if (trace&T_SRT) ShowInputState("Full sort",inset);}/* ReSortGFList: resort after reading topmost N-Gram */static void ReSortGFList(NGInputSet *inset){ int i,j,n,this; NGram p,q; Boolean found = FALSE; n = inset->nOpen; this = inset->gfsort[0]; p = inset->ngs[this].nxt; i = 1; while ( i<n && !found){ q = inset->ngs[inset->gfsort[i]].nxt; found = (CmpNGram(inset->wm,inset->N,p,q) <= 0 ); if (!found) ++i; } if (i==1) return; --i; /* i = index of new position for updated file */ for (j=0; j<i; j++) inset->gfsort[j] = inset->gfsort[j+1]; inset->gfsort[i] = this; if (trace&T_SRT) ShowInputState("Re-sorted",inset);}/* EXPORT->OpenInputSet: sort and open input streams */void OpenInputSet(NGInputSet *inset){ GFLink p,q,parent; for (p=inset->head.chain; p!=NULL; p=p->chain) { /* find parent of p */ parent = &(inset->head); for (q=inset->head.chain; q!=NULL; q=q->chain) { if (CmpTxtNGram(inset->N,q->lastGram,parent->lastGram) > 0 && CmpTxtNGram(inset->N,q->lastGram,p->firstGram) < 0 ) parent = q; } /* link p to its parent */ p->alt = parent->next; parent->next = p; } if (trace&T_ITR) ShowInputSetTree(inset); /* open all root file's offspring */ for (p=inset->head.next; p!=NULL; p=p->alt,inset->nOpen++){ OpenNGramFile(&(inset->ngs[inset->nOpen]),p->fn,inset->wm); inset->gf[inset->nOpen] = p; inset->gfsort[inset->nOpen] = inset->nOpen; } inset->maxNOpen = inset->nOpen; SortGFList(inset);}/* GetInsetGram: get next ngram and weight from the input set */static void GetInsetGram(NGInputSet *inset, NGram ng, float *wt){ NGSource *ngs; int i,cur; GFLink p,next,alt; UInt nextng[MAXNG]; if (inset->nOpen == 0 ) HError(15390,"GetInsetGram: No grams left");; cur = inset->gfsort[0]; ngs = &(inset->ngs[cur]); ReadNGram(ngs,ng); *wt = inset->gf[cur]->weight; if (ngs->nItems == 0) { /* close this source and open all its successors */ CloseNGramFile(ngs); if (trace&T_IST) printf(" closing file %s\n", inset->gf[cur]->fn); next = inset->gf[cur]->next; alt = (next == NULL) ? NULL : next->alt; if (next != NULL && alt == NULL) { /* single successor */ inset->gf[cur] = next; /* so just replace it */ OpenNGramFile(ngs,next->fn,inset->wm); if (trace&T_IST) printf(" replaced by file %s\n", next->fn); } else { /* zero or multiple successors */ /* delete exhausted input stream */ --inset->nOpen; for (i=cur; i<inset->nOpen; i++) { inset->ngs[i] = inset->ngs[i+1]; inset->gf[i] = inset->gf[i+1]; } for (i=0; i<inset->nOpen; i++) inset->gfsort[i] = i; if (next != NULL) { /* add multiple successors */ if (trace&T_IST) printf(" replaced by files ...\n"); for (p=next; p!=NULL; p=p->alt,inset->nOpen++){ OpenNGramFile(&(inset->ngs[inset->nOpen]),p->fn,inset->wm); inset->gf[inset->nOpen] = p; inset->gfsort[inset->nOpen] = inset->nOpen; if (trace&T_IST) printf(" %s\n", p->fn); } if (inset->maxNOpen < inset->nOpen) inset->maxNOpen = inset->nOpen; } } if (inset->nOpen > 1) SortGFList(inset); } else /* no change, just resort the parallel input streams */ if (inset->nOpen > 1) ReSortGFList(inset); if (checkOrder && inset->nOpen > 0){ /* check ordering is consistent */ cur = inset->gfsort[0]; ngs = &(inset->ngs[cur]); NGramExpand(inset->N,ngs->buf,nextng); if (CmpNGram(inset->wm,inset->N,ng,nextng) > 0) HError(15345,"GetInsetGram: n-grams out of order"); }}/* EXPORT->GetNextNGram: get next ngram from parallel input streams */Boolean GetNextNGram(NGInputSet *inset, NGram ng, float *count, int N){ float sum; UInt thisGram[MAXNG]; int i; Boolean same; if (N > inset->N) HError(15341,"GetNextNGram: Requested N[%d] > gram size [%d]",N,inset->N); if(!inset->nextValid) { /* either first or last */ if (inset->nOpen == 0) return FALSE; GetInsetGram(inset,inset->nextGram,&(inset->nextWt)); } /* pick up the last read gram from this inset */ for (i=0; i<N; i++) thisGram[i] = inset->nextGram[i]; sum = inset->nextWt * inset->nextGram[inset->N]; inset->nextValid = FALSE; /* read new grams whilst same as current */ if (inset->nOpen > 0) do { GetInsetGram(inset,inset->nextGram,&(inset->nextWt)); inset->nextValid = TRUE; same = TRUE; for (i=0; same && i<N; i++) same = thisGram[i] == inset->nextGram[i]; if (same) { sum += inset->nextWt * inset->nextGram[inset->N]; inset->nextValid = FALSE; } } while (same && inset->nOpen > 0); /* copy accumulated/truncated ngram to caller */ for (i=0; i<N; i++) ng[i] = thisGram[i]; *count = sum; return TRUE;}/* EXPORT->CloseInputSet: close input set */void CloseInputSet(NGInputSet *inset){ int i; if (trace&T_MOP) printf("Max parallel input streams = %d\n",inset->maxNOpen); for (i=0; i<inset->nOpen; i++) CloseNGramFile (&(inset->ngs[i])); inset->nOpen = 0;}/* ------------------- FoF Table Handling -------------- *//* EXPORT-> Create a FoF table with size rows */FoFTab *CreateFoFTab(MemHeap *mem, int size, int N){ FoFTab *p; int i,j; UInt *u; p = (FoFTab *)New(mem,sizeof(FoFTab)); p->size = size; p->N = N; p->fof = (UInt **)New(mem,sizeof(UInt *)*size); p->fof--; for (i=1; i<=size; i++){ u = (UInt *)New(mem,sizeof(UInt)*N); --u; p->fof[i] = u; } for (i=1; i<=size; i++) for (j=1; j<=N; j++) p->fof[i][j] = 0; return p;}/* WriteFoFHeader: write a header for given NG Buffer */static void WriteFoFHeader(FILE *f, FoFTab *tab, char *source){ if (tab->size == 0) HError(15390,"WriteFoFHeader: FoF table is empty"); fprintf(f,"NGram = %d\n",tab->N); fprintf(f,"Entries = %d\n",tab->size); if (source != NULL) fprintf(f,"Source = %s\n",source); fprintf(f,"\\Fofs\\\n");}/* EXPORT-> WriteFoFTab: Write given table to fn */void WriteFoFTab(char *fn, FoFTab *tab, char *source){ FILE *f; int i,j; f = (fn==NULL)?stdout:fopen(fn,"w"); if (f==NULL) HError(15311,"WriteFoFTab: Can't create output file %s", (fn==NULL)?"stdout":fn); WriteFoFHeader(f,tab,source); for (i = 1; i<=tab->size; i++){ for (j=1; j<=tab->N; j++) fprintf(f,"%10d ",tab->fof[i][j]); fprintf(f,"\n"); } if (trace&T_FOF) printf("FoF written to file %s - N=%d,size=%d\n", (fn==NULL)?"stdout":fn,tab->N,tab->size); if (fn!=NULL)fclose(f);}/* EXPORT-> Create a FoF table holding contents of file fn */FoFTab *ReadFoFTab(MemHeap *mem, char *fn){ LMFileHdr hdr; MemHeap hmem; int ibuf[MAXNG],size,N,i,j; FoFTab *p; Source src; CreateHeap(&hmem,"FoFheader",MSTAK,1,0.0,1000,1000); if (InitSource(fn,&src,NoFilter)==FAIL) { HError(15310,"ReadFofTab: Require FoF file to continue"); } if (ReadLMHeader(&hmem, &src, NoFilter, &hdr, &size) != LFOF_HDR) HError(15313,"ReadFoFTab: Bad header in file %s",fn); if (size<0 || size>=1000) HError(15313,"ReadFoFTab: Bad FoF table size %d in file %s",size,fn); if (!GetLMHdrInt("NGRAM",&N,hdr)) HError(15313,"ReadFoFTab: No NGRAM field in %s",fn); if (N<0 || N>=MAXNG) HError(15313,"ReadFoFTab: Bad ngram size %d in %s",N, fn); p = CreateFoFTab(mem,size,N); for (i = 1; i<=size; i++) { if (!ReadInt(&src,ibuf,N,FALSE)) HError(15313,"ReadFoFTab: Cannot read row %d from file %s",i,fn); for (j=1; j<=N; j++) p->fof[i][j] = ibuf[j-1]; } if (trace&T_FOF) printf("FoF created from file %s - N=%d,size=%d\n",fn,N,size); CloseSource(&src); DeleteHeap(&hmem); return p;}/* -------------------- End of LGBase.c ---------------------- */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -