📄 ngram.c
字号:
#include <stdlib.h>#include <math.h>#include <sys/types.h>#include <netinet/in.h>#include <unistd.h>#include "libs.h"#include "io.h"#include "misc.h"#include "hash.h"#include "vocab.h"#include "ngram.h"#include "net.h"#include "ngramd.h"#define SLM_SOCK(ng) ((ng)->socket)#define IS_DISTANT_BIGRAM(ng) ((ng)->ngram_len == 2 && (ng)->context_len > 1)static SLMNgram*format_error(int n){ fprintf(stderr,"SLMReadARPA ERROR: LM format error while reading %d-gram\n",n); return NULL;}static intSLMNgramNodeCompare(const void *n1, const void *n2){ return ((SLMNgramNode*)n1)->id - ((SLMNgramNode*)n2)->id;}static intSLMNgramLeafCompare(const void *n1, const void *n2){ return ((SLMNgramLeaf*)n1)->id - ((SLMNgramLeaf*)n2)->id;}static voidputNgramNodeData(SLMNgram *ng, SLMNgramNode *nn, int nelem, SLMWordID *idarray, int level, int n, int pos){ SLMNgramNode ref,*nd; int i; ref.id = idarray[level-1]; nd = bsearch(&ref,nn,nelem,sizeof(SLMNgramNode),SLMNgramNodeCompare); if (nd == NULL) { fprintf(stderr,"SLMReadLM ERROR: Inconsistent n-gram found: "); for (i = 0; i < n; i++) fprintf(stderr,SLMWordID_FMT " ",idarray[i]); fprintf(stderr,"\n"); /* exit(2); */ return; } if (level == n-1) { /* register new node */ if (nd->nextpos < 0) nd->nextpos = pos; nd->nelem++; } else { putNgramNodeData(ng,&ng->node[level][nd->nextpos],nd->nelem, idarray,level+1,n,pos); }}static SLMNgram *SLMNewLM(){ SLMNgram *ng; ng = New(SLMNgram); memset(ng,0,sizeof(SLMNgram)); ng->type = (SLM_WordNgram|SLM_ONE_UNK|SLM_UNK_IN_TRAIN|SLM_WORD_VOCAB_OPEN); ng->first_id = 0; ng->weight = 1.0; ng->next_lm = NULL; ng->delegate = NULL; ng->delimiter = SLM_DEFAULT_DELIMITER; ng->socket = -1; return ng;}static SLMNgram *SLMReadLM_arpa(char *filename,int verbosity){ FILEHANDLE f = z_open(filename,"r"); char buf[256],buf2[256],*a; SLMNgram *ng; int4 ngram_size[MAX_GRAM]; int i,j,m,n; int ngram_len; int distance = 0; int word_num; SLMNgramNode **nodes; SLMNgramLeaf *leaves; float prob,alpha; char **vocab; SLMHashTable *vocab_ht; SLMWordID idarray[MAX_GRAM]; ng = SLMNewLM(); ng->filename = strdup(filename); while ((a = z_gets(buf,256,f)) != NULL) { if (strncmp(buf,"\\distance=",10) == 0) sscanf(buf+10,"%d",&distance); if (strncmp(buf,"\\data\\",6) == 0) break; } if (a == NULL) { fprintf(stderr,"SLMReadLM ERROR: no \\data\\ found\n"); return NULL; } ngram_len = 0; while ((a = z_gets(buf,256,f)) != NULL) { if (strncmp(buf,"ngram ",6) == 0) { sscanf(buf+6,"%d=%d",&n,&m); if (n < 1 || n > MAX_GRAM) { fprintf(stderr,"SLMReadLM ERROR: Can't handle this LM's n-gram length; limit = %d\n", MAX_GRAM); return NULL; } ngram_size[n-1] = m; if (n > ngram_len) ngram_len = n; } else if (buf[0] == '\\') break; } if (a == NULL) { fprintf(stderr,"SLMReadLM ERROR: file ended while scanning \"ngram n=size\" section\n"); return NULL; } /* check for distant-bigram */ if (distance > 0 && ngram_len != 2) { fprintf(stderr,"SLMReadLM ERROR: distance=%d and n=%d: ngram size must be 2\n",distance,ngram_len); return NULL; } nodes = New_N(SLMNgramNode*,ngram_len-1); for (i = 0; i < ngram_len-1; i++) nodes[i] = New_N(SLMNgramNode,ngram_size[i]+1); leaves = New_N(SLMNgramLeaf,ngram_size[ngram_len-1]); vocab = New_N(char*,ngram_size[0]+1); vocab_ht = SLMHashCreateSI(ngram_size[0]*3/2); ng->ngram_len = ngram_len; if (distance == 0) ng->context_len = ngram_len-1; else ng->context_len = distance+1; ng->n_unigram = ngram_size[0]; ng->node = nodes; ng->leaf = leaves; ng->vocab = vocab; ng->vocab_ht = vocab_ht;#ifdef NG_CACHE ng->hist = New_N(SLMNgramSearchHist,ngram_len-1); for (i = 0; i < ngram_len-1; i++) { ng->hist[i].id = SLM_NONWORD; ng->hist[i].node = NULL; }#endif /* ngram start */ n = 1; word_num = 0; do { sprintf(buf2,"\\%d-grams:",n); if (strncmp(buf,buf2,strlen(buf2)) != 0) { fprintf(stderr,"SLMReadLM ERROR: \"\\%d-grams:\" not found\n",n); z_close(f); return NULL; } if (verbosity > 0) { fputs(buf2+1,stderr); fflush(stderr); } for (i = 0; i < ngram_size[n-1]; i++) { if (verbosity > 0 && i%20000 == 0) { fputc('.',stderr); fflush(stderr); } if (n < ngram_len) { if (z_getfloat(f,&prob) != 0) { z_close(f); return format_error(n); } if (n == 1) { z_getstr(f,buf2,256); if (word_num == 0) { if (strcmp(buf2,"<UNK>") != 0) { vocab[word_num] = strdup("<UNK>"); vocab[++word_num] = strdup(buf2); SLMIntHashInsert(vocab_ht,vocab[word_num],word_num); SLM_Set_WORD_VOCAB(ng->type,SLM_WORD_VOCAB_CLOSED); ng->first_id = 1; if (verbosity > 1) { fprintf(stderr,"<UNK> not found: vocab_type is set to %s\n", vocab_type_name(ng->type)); } } else vocab[word_num] = strdup(buf2); } else { vocab[word_num] = strdup(buf2); SLMIntHashInsert(vocab_ht,vocab[word_num],word_num); } z_getfloat(f,&alpha); nodes[0][i].id = word_num; nodes[0][i].prob = pow(10.0,prob); nodes[0][i].alpha = pow(10.0,alpha); nodes[0][i].nextpos = -1; nodes[0][i].nelem = 0; word_num++; } else { /* 1 < n < ngram_len */ for (j = 0; j < n; j++) { z_getstr(f,buf2,256); idarray[j] = SLMIntHashSearch(vocab_ht,buf2); } z_getfloat(f,&alpha); nodes[n-1][i].id = idarray[n-1]; nodes[n-1][i].prob = pow(10.0,prob); nodes[n-1][i].alpha = pow(10.0,alpha); nodes[n-1][i].nextpos = -1; nodes[n-1][i].nelem = 0; putNgramNodeData(ng,ng->node[0],ng->n_unigram,idarray,1,n,i); } } else { /* n == ngram_len */ if (z_getfloat(f,&prob) != 0) { z_close(f); return format_error(n); } for (j = 0; j < n; j++) { z_getstr(f,buf2,256); idarray[j] = SLMIntHashSearch(vocab_ht,buf2); } leaves[i].id = idarray[n-1]; leaves[i].prob = pow(10.0,prob); putNgramNodeData(ng,ng->node[0],ng->n_unigram,idarray,1,n,i); } } while ((a = z_gets(buf,256,f)) != NULL) { if (buf[0] == '\\') break; } if (a == NULL) { fprintf(stderr,"SLMReadLM ERROR: file ended while scanning \"\\%d-grams:\" or \"\\end\\\"\n",n+1); z_close(f); return NULL; } n++; } while (strncmp(buf,"\\end\\",5) != 0); if (verbosity > 0) fputc('\n',stderr); /* read additional class info */ while (z_gets(buf,256,f) != NULL) { if (strncmp(buf,"\\class\\",7) == 0) { if (verbosity > 1) fprintf(stderr,"Reading additional class ngram info...\n"); SLM_Set_NgramType(ng->type,SLM_ClassNgram); break; } } if (SLM_NgramType(ng->type) == SLM_WordNgram) { z_close(f); return ng; } ng->class_ht = ng->vocab_ht; ng->class_sym= ng->vocab; ng->first_class_id = ng->first_id; ng->first_id = 0; if (SLM_WORD_VOCAB(ng->type) == SLM_WORD_VOCAB_OPEN) SLM_Set_CLASS_VOCAB(ng->type,SLM_CLASS_VOCAB_OPEN); else SLM_Set_CLASS_VOCAB(ng->type,SLM_CLASS_VOCAB_CLOSED); SLM_Set_WORD_VOCAB(ng->type,SLM_WORD_VOCAB_OPEN); SLM_Set_N_UNK(ng->type,SLM_CLASS_UNK); z_getint(f,&ng->n_word); ng->vocab_ht = SLMHashCreateSI(ng->n_word*3/2); ng->vocab = New_N(char*,ng->n_word); ng->c_uniprob = New_N(float,ng->n_word); ng->class_id = New_N(SLMWordID,ng->n_word); for (i = 0; i < ng->n_word; i++) { z_getstr(f,buf,256); z_getstr(f,buf2,256); z_getfloat(f,&prob); if (i == 0 && strcmp(buf2,"<UNK>") != 0) { /* first word is not UNK */ SLM_Set_N_UNK(ng->type,SLM_NO_UNK); SLM_Set_WORD_VOCAB(ng->type,SLM_WORD_VOCAB_CLOSED); ng->first_id = 1; ng->vocab[0] = strdup("<UNK>"); i++; if (verbosity > 2) { fprintf(stderr,"\n<UNK> not found: vocab_type is set to <%s>\n", vocab_type_name(ng->type)); } } ng->class_id[i] = SLMIntHashSearch(ng->class_ht,buf); ng->vocab[i] = strdup(buf2); SLMIntHashInsert(ng->vocab_ht,ng->vocab[i],i); ng->c_uniprob[i] = pow(10.0,prob); } z_close(f); /* 13,Jul,2005 kato */ return ng;}int4SLMd2l(double x){ double scale = (double)0x7fffffffL/1000; double z = x*scale; int4 r; if (z > 0x7fffffff) r = 0x80000000; else r = z; return r;}doubleSLMl2d(int4 x){ double scale = (double)1000/0x7fffffffL; return (double)x*scale;}#define l2d(x) SLMl2d(x)static SLMNgram *SLMReadLM_binary(char *filename,int verbosity){ FILEHANDLE f = z_open(filename,"r"); char buf[SLM_BINLM_HEADER_SIZE]; SLMNgram *ng; int4 ngram_size[MAX_GRAM]; int i,j,m,n; int ngram_len; int word_num; SLMNgramNode **nodes; SLMNgramLeaf *leaves; float prob,alpha; char **vocab; SLMHashTable *vocab_ht; SLMWordID idarray[MAX_GRAM]; int4 l; uint2 s; uint2 distance = 0; ng = SLMNewLM(); ng->filename = strdup(filename); z_read(buf,sizeof(char),SLM_BINLM_HEADER_SIZE_V2,f); /* read header */ if (strcmp(buf,SLM_BINLM_HEADER_MSG_V1) == 0) { /* version 1 header */ z_read(buf,sizeof(char),SLM_BINLM_HEADER_SIZE_V1-SLM_BINLM_HEADER_SIZE_V2,f); } else if (strcmp(buf,SLM_BINLM_HEADER_MSG_V2) == 0) { /* version 2 header */ z_read(&s,sizeof(uint2),1,f); distance = ntohs(s); } else { fprintf(stderr,"SLMReadLM: Can't handle this version: %s\n",buf); z_close(f); return NULL; } z_read(&s,sizeof(short),1,f); /* ngram length */ ngram_len = ntohs(s); if (distance > 0 && ngram_len != 2) { fprintf(stderr,"SLMReadLM ERROR: distance=%d and n=%d: ngram size must be 2\n",distance,ngram_len); z_close(f); return NULL; } /* number of ngram */ for (i = 0; i < ngram_len; i++) { z_read(&l,sizeof(int4),1,f); ngram_size[i] = ntohl(l); } nodes = New_N(SLMNgramNode*,ngram_len-1); for (i = 0; i < ngram_len-1; i++) nodes[i] = New_N(SLMNgramNode,ngram_size[i]+1); leaves = New_N(SLMNgramLeaf,ngram_size[ngram_len-1]); vocab = New_N(char*,ngram_size[0]+1); vocab_ht = SLMHashCreateSI(ngram_size[0]*3/2); ng->ngram_len = ngram_len; if (distance == 0) ng->context_len = ngram_len-1; else ng->context_len = distance+1; ng->n_unigram = ngram_size[0]; ng->node = nodes; ng->leaf = leaves; ng->vocab = vocab; ng->vocab_ht = vocab_ht;#ifdef NG_CACHE ng->hist = New_N(SLMNgramSearchHist,ngram_len-1); for (i = 0; i < ngram_len-1; i++) { ng->hist[i].id = SLM_NONWORD; ng->hist[i].node = NULL; }#endif /* read vocab */ n = read_WordID(f,0); for (i = 1; i <= n; i++) { m = read_ushort(f,0); vocab[i] = New_N(char,m+1); z_read(vocab[i],sizeof(char),m,f); vocab[i][m] = '\0'; SLMIntHashInsert(vocab_ht,vocab[i],i); } /* ngram start */ n = 1; word_num = 0; do { if (verbosity > 0) { fprintf(stderr,"%d-grams",n); fflush(stderr); } for (i = 0; i < ngram_size[n-1]; i++) { if (verbosity > 0 && i%20000 == 0) { fputc('.',stderr); fflush(stderr); } if (n < ngram_len) { z_read(&l,sizeof(int4),1,f); prob = l2d(ntohl(l)); for (j = 0; j < n; j++) { idarray[j] = read_WordID(f,0); } z_read(&l,sizeof(int4),1,f); alpha = l2d(ntohl(l)); nodes[n-1][i].id = idarray[n-1]; nodes[n-1][i].prob = pow(10.0,prob); nodes[n-1][i].alpha = pow(10.0,alpha); nodes[n-1][i].nextpos = -1; nodes[n-1][i].nelem = 0; if (n > 1) putNgramNodeData(ng,ng->node[0],ng->n_unigram,idarray,1,n,i); } else { /* n == ngram_len */ z_read(&l,sizeof(int4),1,f); prob = l2d(ntohl(l)); for (j = 0; j < n; j++) { idarray[j] = read_WordID(f,0); } leaves[i].id = idarray[n-1]; leaves[i].prob = pow(10.0,prob); putNgramNodeData(ng,ng->node[0],ng->n_unigram,idarray,1,n,i); } } n++; } while (n <= ngram_len); if (verbosity > 0) fputc('\n',stderr); /* read additional class info */ if (z_read(buf,sizeof(char),16,f) == 16) { if (verbosity > 1) fprintf(stderr,"Reading additional class ngram info...\n"); SLM_Set_NgramType(ng->type,SLM_ClassNgram); } if (SLM_NgramType(ng->type) == SLM_WordNgram) { z_close(f); return ng; } ng->class_ht = ng->vocab_ht; ng->class_sym = ng->vocab; ng->first_class_id = ng->first_id; if (SLM_WORD_VOCAB(ng->type) == SLM_WORD_VOCAB_OPEN) SLM_Set_CLASS_VOCAB(ng->type,SLM_CLASS_VOCAB_OPEN); else SLM_Set_CLASS_VOCAB(ng->type,SLM_CLASS_VOCAB_CLOSED); SLM_Set_WORD_VOCAB(ng->type,SLM_WORD_VOCAB_OPEN); SLM_Set_N_UNK(ng->type,SLM_CLASS_UNK); z_read(&l,sizeof(int4),1,f); ng->n_word = ntohl(l); z_read(&s,sizeof(short),1,f); ng->first_id = ntohs(s); if (ng->first_id != 0) { /* first word is not UNK */ SLM_Set_N_UNK(ng->type,SLM_NO_UNK); SLM_Set_WORD_VOCAB(ng->type,SLM_WORD_VOCAB_CLOSED); ng->vocab[0] = strdup("<UNK>"); if (verbosity > 2) { fprintf(stderr,"\nvocab_type is set to <%s>\n", vocab_type_name(ng->type)); } } ng->vocab_ht = SLMHashCreateSI(ng->n_word*3/2); ng->vocab = New_N(char*,ng->n_word); ng->c_uniprob = New_N(float,ng->n_word); ng->class_id = New_N(SLMWordID,ng->n_word); for (i = ng->first_id; i <= ng->n_word; i++) { ng->class_id[i] = read_WordID(f,0); z_read(&l,sizeof(int4),1,f); ng->c_uniprob[i] = pow(10.0,l2d(ntohl(l))); if (i != 0) { z_read(&s,sizeof(short),1,f); j = ntohs(s); ng->vocab[i] = New_N(char,j+1); z_read(ng->vocab[i],sizeof(char),j,f); SLMIntHashInsert(ng->vocab_ht,ng->vocab[i],i); } } z_close(f); /* 13,Jul,2005 kato */ return ng;}static SLMNgram *check_ngram_filename(SLMNgram *ng, char *filename){ while (ng != NULL) { if (strcmp(ng->filename, filename) == 0) return ng; ng = ng->next_lm; } return NULL;}static SLMNgram *SLMReadLM0(char *filename,int format,int verbosity){ if (format == SLM_LM_ARPA) return SLMReadLM_arpa(filename,verbosity); else if (format == SLM_LM_BINARY) return SLMReadLM_binary(filename,verbosity); else { fprintf(stderr,"SLMReadLM Error: unsupported format %d\n",format); return NULL; }}static SLMNgram *create_delegate(SLMNgram *ng, int len){ SLMNgram *ng2; if (IS_DISTANT_BIGRAM(ng)) { fprintf(stderr,"SLMReadLM Error: Can't create delegate for a distant bigram\n");
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -