📄 ngram.c
字号:
return NULL; } if (len > ng->ngram_len) { fprintf(stderr,"SLMReadLM Error: You can't specify longer n-gram length (%d) than the original model length (%d)\n",len,ng->ngram_len); return NULL; } ng2 = SLMNewLM(); memcpy(ng2,ng,sizeof(SLMNgram)); ng2->context_len = len-1; ng2->delegate = ng; ng2->next_lm = ng; return ng2;}#ifdef ENABLE_REMOTE_MODEL/* * openRemoteModel() connects to the specified host * and returns a pointer to SLMNgram structure that points * the host. */static SLMNgram *openRemoteModel(char *hostname, int portnum, int verbosity){ SLMNgram *ng; char buf[1024]; int sock; char cmd; unsigned char par1; unsigned short par2,id; unsigned char size; int i; sock = openSocket(hostname,portnum); if (sock < 0) return NULL; ng = SLMNewLM(); sprintf(buf,"%s:%d",hostname,portnum); ng->filename = strdup(buf); ng->type = SLM_REMOTE_MODEL; SLM_SOCK(ng) = sock; /* retrieve basic info */ cmd = SLM_NGD_BASIC_INFO; write(SLM_SOCK(ng),&cmd,1); read(SLM_SOCK(ng),&par2,2); ng->type |= ntohs(par2); read(SLM_SOCK(ng),&par1,1); ng->first_id = par1; read(SLM_SOCK(ng),&par1,1); ng->first_class_id = par1; read(SLM_SOCK(ng),&par1,1); ng->ngram_len = par1; read(SLM_SOCK(ng),&par1,1); ng->context_len = par1; read(SLM_SOCK(ng),&par2,2); ng->n_unigram = ng->n_word = ntohs(par2); /* retrieve word info */ ng->vocab_ht = SLMHashCreateSI(ng->n_word*3/2); ng->vocab = New_N(char*,ng->n_word); ng->vocab[0] = strdup("<UNK>"); cmd = SLM_NGD_ID2WORD; for (i = ng->first_id; i <= ng->n_word; i++) { write(SLM_SOCK(ng),&cmd,1); id = htons((unsigned short)i); write(SLM_SOCK(ng),&id,2); read(SLM_SOCK(ng),&size,1); read(SLM_SOCK(ng),buf,size); buf[size] = '\0'; ng->vocab[i] = strdup(buf); SLMIntHashInsert(ng->vocab_ht,ng->vocab[i],i); if (verbosity > 1) { fprintf(stderr,"%d:%s\n",i,buf); if (i % 100 == 0) { fprintf(stderr,"."); fflush(stderr); } } } if (verbosity > 1) { fprintf(stderr,"Remote model read.\n"); } return ng;}#endif/* * SLMReadLM() invokes SLMReadLM0() to read an LM. If filename is * "lmfile1.arpa[;length]*weight,lmfile2.arpa[;length]*weight,..." * then all LMs are read and combined with the specified weight. */SLMNgram *SLMReadLM(char *filename,int format,int verbosity){ SLMNgram *ng = NULL; char buf1[256],buf2[256]; char *p,*q; double w; int len;#ifdef ENABLE_REMOTE_MODEL if ((p = strchr(filename,':')) != NULL) { /* the filename is hostname:portnum */ strncpy(buf1,filename,p-filename); buf1[p-filename] = '\0'; return openRemoteModel(buf1,atoi(p+1),verbosity); }#endif q = buf1; for (p = filename; *p; p++) { if (*p == '*' || *p == ';' ) { *q = '\0'; if (*buf1 == '\0') { /* no filename is specified */ fprintf(stderr,"SLMReadLM: %s: no filename part\n",filename); exit(1); } len = 0; /* no length specified */ w = 1.0; /* no weight specified */ if (*p == ';') { /* length follows */ q = buf2; p++; while (*p && *p != '*' && *p != ',') *(q++) = *(p++); *q = '\0'; len = atoi(buf2); } if (*p == '*') { q = buf2; p++; while (*p && *p != ',') *(q++) = *(p++); *q = '\0'; w = atof(buf2); } if (verbosity > 1) { fprintf(stderr,"Reading LM file %s \n",buf1); if (len > 0) fprintf(stderr,"length=%d ",len); if (w != 1.0) fprintf(stderr,"weight=%f",w); fprintf(stderr,"\n"); } if (ng == NULL) { ng = SLMReadLM0(buf1,format,verbosity); if (len > 0 && ng->ngram_len != len) { ng->weight = 0.0; ng = create_delegate(ng,len); if (ng == NULL) { /* error */ return ng; } } ng->weight = w; } else { SLMAddLM(ng,len,w,buf1,format,verbosity); } q = buf1; } else { *(q++) = *p; } } if (ng == NULL) { *q = '\0'; if (verbosity > 1) { fprintf(stderr,"Reading LM file %s\n",buf1); } ng = SLMReadLM0(buf1,format,verbosity); } return ng;}voidSLMAddLM(SLMNgram *ng, int len, double weight, char *filename,int format,int verbosity){ SLMNgram *next_ng; if (ng == NULL) { fprintf(stderr,"SLMAddLM Warning: base LM == NULL\n"); return; } next_ng = check_ngram_filename(ng,filename); if (next_ng != NULL) { SLMNgram *ng2 = create_delegate(next_ng,len); if (ng2 == NULL) { /* error */ return; } ng2->weight = weight; ng2->next_lm = ng->next_lm; ng->next_lm = ng2; return; } next_ng = SLMReadLM0(filename,format,verbosity); if (len > 0 && next_ng->ngram_len != len) { SLMNgram *ng2 = create_delegate(next_ng,len); if (ng2 == NULL) { /* error */ return; } next_ng->weight = 0.0; next_ng = ng2; } next_ng->weight = weight; next_ng->next_lm = ng->next_lm; ng->next_lm = next_ng;}voidSLMFreeLM(SLMNgram *ng){ int i;#ifdef ENABLE_REMOTE_MODEL if (ng->type & SLM_REMOTE_MODEL) { Free(ng->filename); close(SLM_SOCK(ng)); return; }#endif if (ng->next_lm != NULL) SLMFreeLM(ng->next_lm); if (ng->delegate == NULL) { for (i = 0; i < ng->ngram_len-1; i++) Free(ng->node[i]); Free(ng->node); Free(ng->leaf); Free(ng->vocab); Free(ng->filename);#ifdef NG_CACHE Free(ng->hist);#endif SLMHashDestroy(ng->vocab_ht); if (SLM_NgramType(ng->type) == SLM_ClassNgram) { Free(ng->class_sym); SLMHashDestroy(ng->class_ht); } } Free(ng);}static SLMNgramNode*search_node(SLMNgram *ng, SLMNgramNode *base, int nelem, int level, int len, SLMWordID *idarray, int cache_ok){ SLMNgramNode ref,*nd; ref.id = idarray[level]; if (level == ng->ngram_len-1) { return bsearch(&ref,base,nelem,sizeof(SLMNgramLeaf), SLMNgramLeafCompare); } else {#ifdef NG_CACHE /* check cache */ if (cache_ok && ng->hist[level].id == idarray[level]) nd = ng->hist[level].node; else { nd = bsearch(&ref,base,nelem,sizeof(SLMNgramNode),SLMNgramNodeCompare); ng->hist[level].id = idarray[level]; ng->hist[level].node = nd; cache_ok = 0; }#else nd = bsearch(&ref,base,nelem,sizeof(SLMNgramNode),SLMNgramNodeCompare);#endif if (level == len-1 || nd == NULL) return nd; else { if (level == ng->ngram_len-2) { return search_node(ng, (SLMNgramNode*)&ng->leaf[nd->nextpos], nd->nelem, level+1,len,idarray, cache_ok); } else { return search_node(ng, &ng->node[level+1][nd->nextpos], nd->nelem, level+1,len,idarray, cache_ok); } } }}static double SLMGetBOProb0(SLMNgram *ng, int len, SLMWordID *idarray, SLMBOStatus *status){ SLMNgramNode *nn1,*nn2; double prob; int i; nn1 = search_node(ng,ng->node[0],ng->n_unigram,0,len,idarray,1); if (nn1 != NULL) { if (status) { for (i = 0; i < len; i++) status->hit[i] = SLM_STAT_HIT; } prob = nn1->prob; } else { if (len == 1) { /* unigram search failed */ prob = 0; } else { nn2 = search_node(ng,ng->node[0],ng->n_unigram, 0,len-1,idarray,1); if (nn2 != NULL) { if (status) status->hit[len-1] = SLM_STAT_BO_WITH_ALPHA; prob = nn2->alpha*SLMGetBOProb0(ng,len-1,idarray+1,status); } else prob = SLMGetBOProb0(ng,len-1,idarray+1,status); } } if (status) { status->ng_prob = prob; status->ug_prob = 1; } return prob;}SLMWordIDSLMWord2ID(SLMNgram *ng, char *word){ int id; char *q,buf[256]; if (ng->delegate != NULL) return SLMWord2ID(ng->delegate,word); id = SLMIntHashSearch(ng->vocab_ht,word); if (id == 0 && SLM_NgramType(ng->type) == SLM_ClassNgram) { /* if ng is class ngram, UNK is class by class */ for (q = word+strlen(word)-1; q >= word; q--) { if (*q == ng->delimiter) break; } q++; sprintf(buf,"<UNK>%c%s",ng->delimiter,q); id = SLMIntHashSearch(ng->vocab_ht,buf); } return id;}intSLMVocabSize(SLMNgram *ng){ if (SLM_NgramType(ng->type) == SLM_WordNgram) return ng->n_unigram; else return ng->n_word;}const char*SLMID2Word(SLMNgram *ng, SLMWordID id){ if (id == 0) return "<UNK>"; if (id >= SLMVocabSize(ng)) return "<ERROR>"; return ng->vocab[id];}intSLMContextLength(SLMNgram *ng){ int context_len = 0; while (ng) { if (ng->weight == 0) { /* dummy model */ ng = ng->next_lm; continue; } if (ng->context_len > context_len) context_len = ng->context_len; ng = ng->next_lm; } return context_len;} intSLMNgramLength(SLMNgram *ng){ int len = 0,x; while (ng) { if (IS_DISTANT_BIGRAM(ng)) x = 2; else x = ng->context_len+1; if (ng->weight > 0 && x > len) len = x; ng = ng->next_lm; } return len;} double SLMGetBOProb(SLMNgram *ng, int len, SLMWordID *idarray, SLMBOStatus *status){ SLMWordID cidarray[MAX_GRAM]; double prob_array[MAX_GRAM]; double weight_array[MAX_GRAM]; int i,j; double prob; double sum_weight = 0.0; int reallen; SLMBOStatus my_stat; if (len > MAX_GRAM) { fprintf(stderr,"SLMGetBOProb: n-gram length %d too big (limit is %d\n", len, MAX_GRAM); return 0.0; }#ifdef ENABLE_REMOTE_MODEL if (ng->type & SLM_REMOTE_MODEL) { char cmd = SLM_NGD_PROB; unsigned char len1 = len; SLMWordID id; int4 iprob; write(SLM_SOCK(ng),&cmd,1); write(SLM_SOCK(ng),&len1,1); for (i = 0; i < len; i++) { id = SLMhtonID(idarray[i]); write(SLM_SOCK(ng),&id,sizeof(SLMWordID)); } read(SLM_SOCK(ng),&len1,1); my_stat.len = len1; read(SLM_SOCK(ng),&iprob,4); my_stat.ng_prob = exp(SLMl2d(ntohl(iprob))); read(SLM_SOCK(ng),&iprob,4); my_stat.ug_prob = exp(SLMl2d(ntohl(iprob))); read(SLM_SOCK(ng),my_stat.hit,len1); if (status) { status->len = my_stat.len; status->ng_prob = my_stat.ng_prob; status->ug_prob = my_stat.ug_prob; for (i = 0; i < my_stat.len; i++) status->hit[i] = my_stat.hit[i]; } return my_stat.ng_prob*my_stat.ug_prob; }#endif j = 0; for (; ng != NULL; ng = ng->next_lm) { if (ng->weight == 0.0) continue; /* set reallen; reallen is set to n-gram length */ reallen = len; if (len > ng->context_len+1) reallen = ng->context_len+1; my_stat.len = len; for (i = 0; i < reallen; i++) my_stat.hit[i] = 0; if (SLM_NgramType(ng->type) == SLM_WordNgram) { if (IS_DISTANT_BIGRAM(ng)) { /* distant bigram */ if (len < ng->context_len+1) { reallen = 1; /* back to unigram */ cidarray[len-1] = idarray[len-1]; } else { reallen = 2; cidarray[len-2] = idarray[len-ng->context_len-1]; cidarray[len-1] = idarray[len-1]; } } else { for (i = 0; i < len; i++) { cidarray[i] = idarray[i]; } } } else { if (IS_DISTANT_BIGRAM(ng)) { if (len < ng->context_len+1) { reallen = 1; /* back to unigram */ cidarray[0] = ng->class_id[idarray[len-1]]; } else { reallen = 2; cidarray[0] = ng->class_id[idarray[len-ng->context_len-1]]; cidarray[1] = ng->class_id[idarray[len-1]]; } } else { for (i = 0; i < len; i++) { cidarray[i] = ng->class_id[idarray[i]]; } } } if (ng->delegate != NULL) prob_array[j] = SLMGetBOProb0(ng->delegate,reallen,cidarray+len-reallen,&my_stat); else prob_array[j] = SLMGetBOProb0(ng,reallen,cidarray+len-reallen,&my_stat); if (SLM_NgramType(ng->type) == SLM_ClassNgram) { my_stat.ug_prob = ng->c_uniprob[idarray[len-1]]; prob_array[j] *= ng->c_uniprob[idarray[len-1]]; } weight_array[j] = ng->weight; sum_weight += ng->weight; j++; } /* combine probs using weight*/ if (sum_weight < 0.99999) { /* Illegal weight; in this case weight is re-normalized */ for (i = 0; i < j; i++) { weight_array[i] /= sum_weight; } } prob = 0; for (i = 0; i < j; i++) { prob += weight_array[i]*prob_array[i]; } if (status) { status->len = my_stat.len; status->ng_prob = my_stat.ng_prob; status->ug_prob = my_stat.ug_prob; for (i = 0; i < my_stat.len; i++) status->hit[i] = my_stat.hit[i]; } return prob;}voidSLMBOStatusString(SLMBOStatus *status, char *buf){ int i; for (i = 0; i < status->len; i++) { if (status->hit[i] == SLM_STAT_HIT) { buf[i] = 'H'; } else if (status->hit[i] == SLM_STAT_BO_WITH_ALPHA) buf[i] = 'b'; else buf[i] = '-'; } buf[i] = '\0';}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -