⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 ngram.c

📁 About sound recognition. I want to download
💻 C
📖 第 1 页 / 共 2 页
字号:
/*
 * ngram.c (page 1 of 2) -- loaders for SLM n-gram language models.
 *
 * Provides readers for two serialized LM formats:
 *   - SLMReadLM_arpa():   ARPA text format ("\data\", "\n-grams:", "\end\"),
 *                         optionally followed by a "\class\" section.
 *   - SLMReadLM_binary(): a versioned binary format (network byte order),
 *                         optionally followed by class-n-gram info.
 * Both build the same in-memory trie: per-order SLMNgramNode arrays for
 * orders 1..n-1 plus an SLMNgramLeaf array for the highest order.
 * NOTE: this chunk is truncated inside create_delegate(); the function
 * continues on the next page of the original file.
 */
#include <stdlib.h>
#include <math.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <unistd.h>
#include "libs.h"
#include "io.h"
#include "misc.h"
#include "hash.h"
#include "vocab.h"
#include "ngram.h"
#include "net.h"
#include "ngramd.h"

/* Accessor for the LM's server socket (used by the ngramd client code). */
#define SLM_SOCK(ng) ((ng)->socket)
/* A distant bigram is a 2-gram model whose context reaches back more than
 * one word (context_len > 1, set from the "distance" header field). */
#define IS_DISTANT_BIGRAM(ng) ((ng)->ngram_len == 2 && (ng)->context_len > 1)

/* Report an ARPA format error for the n-gram section being read and return
 * NULL, so callers can write "return format_error(n);". */
static SLMNgram*
format_error(int n)
{
    fprintf(stderr,"SLMReadARPA ERROR: LM format error while reading %d-gram\n",n);
    return NULL;
}

/* qsort/bsearch comparator: order SLMNgramNode entries by word id. */
static int
SLMNgramNodeCompare(const void *n1, const void *n2)
{
    return ((SLMNgramNode*)n1)->id - ((SLMNgramNode*)n2)->id;
}

/* qsort/bsearch comparator: order SLMNgramLeaf entries by word id.
 * Not referenced in this chunk; presumably used later in the file. */
static int
SLMNgramLeafCompare(const void *n1, const void *n2)
{
    return ((SLMNgramLeaf*)n1)->id - ((SLMNgramLeaf*)n2)->id;
}

/* Link a newly read n-gram (word ids in idarray[0..n-1]) into the trie.
 * Recursively binary-searches the context word idarray[level-1] among the
 * 'nelem' nodes starting at 'nn' (which must already be sorted by id).
 * At the last context level (level == n-1) it records the child range of
 * that context node: nextpos is set to the index 'pos' of the first child
 * in the next level's array and nelem counts the children -- this relies
 * on the input listing n-grams for one context contiguously.
 * An n-gram whose context was never seen at a lower order is reported and
 * skipped (the exit(2) was deliberately disabled). */
static void
putNgramNodeData(SLMNgram *ng, SLMNgramNode *nn, int nelem, SLMWordID *idarray, int level, int n, int pos)
{
    SLMNgramNode ref,*nd;
    int i;
    ref.id = idarray[level-1];
    nd = bsearch(&ref,nn,nelem,sizeof(SLMNgramNode),SLMNgramNodeCompare);
    if (nd == NULL) {
        fprintf(stderr,"SLMReadLM ERROR: Inconsistent n-gram found: ");
        for (i = 0; i < n; i++)
            fprintf(stderr,SLMWordID_FMT " ",idarray[i]);
        fprintf(stderr,"\n");
        /*	exit(2); */
        return;
    }
    if (level == n-1) {
        /* register new node */
        if (nd->nextpos < 0)
            nd->nextpos = pos;  /* first child of this context */
        nd->nelem++;            /* one more child in the range */
    }
    else {
        /* descend into this context's child range at the next level */
        putNgramNodeData(ng,&ng->node[level][nd->nextpos],nd->nelem,
                         idarray,level+1,n,pos);
    }
}

/* Allocate an SLMNgram and initialize it to defaults: a plain word n-gram
 * with a single <UNK>, open word vocabulary, weight 1.0, no chained or
 * delegate LM, and no server socket (-1). */
static SLMNgram *
SLMNewLM()
{
    SLMNgram *ng;
    ng = New(SLMNgram);
    memset(ng,0,sizeof(SLMNgram));
    ng->type = (SLM_WordNgram|SLM_ONE_UNK|SLM_UNK_IN_TRAIN|SLM_WORD_VOCAB_OPEN);
    ng->first_id = 0;
    ng->weight = 1.0;
    ng->next_lm = NULL;
    ng->delegate = NULL;
    ng->delimiter = SLM_DEFAULT_DELIMITER;
    ng->socket = -1;
    return ng;
}

/* Read an ARPA-format language model from 'filename' (possibly compressed,
 * via z_open). Returns a fully built SLMNgram, or NULL on a format error.
 * Recognizes an optional "\distance=<d>" header before "\data\" (distant
 * bigram), and an optional "\class\" section after "\end\" that turns the
 * model into a class n-gram with a separate word vocabulary.
 * NOTE(review): on several early-error returns the file handle is not
 * z_close()d, and partially built structures leak -- verify whether the
 * process exits on load failure before tightening this. */
static SLMNgram *
SLMReadLM_arpa(char *filename,int verbosity)
{
    FILEHANDLE f = z_open(filename,"r");
    char buf[256],buf2[256],*a;
    SLMNgram *ng;
    int4 ngram_size[MAX_GRAM];          /* entry count per n-gram order */
    int i,j,m,n;
    int ngram_len;
    int distance = 0;
    int word_num;
    SLMNgramNode **nodes;
    SLMNgramLeaf *leaves;
    float prob,alpha;
    char **vocab;
    SLMHashTable *vocab_ht;
    SLMWordID idarray[MAX_GRAM];

    ng = SLMNewLM();
    ng->filename = strdup(filename);

    /* scan the preamble for "\distance=" and the mandatory "\data\" */
    while ((a = z_gets(buf,256,f)) != NULL) {
        if (strncmp(buf,"\\distance=",10) == 0)
            sscanf(buf+10,"%d",&distance);
        if (strncmp(buf,"\\data\\",6) == 0)
            break;
    }
    if (a == NULL) {
        fprintf(stderr,"SLMReadLM ERROR: no \\data\\ found\n");
        return NULL;
    }

    /* read the "ngram n=size" lines; ngram_len becomes the highest order */
    ngram_len = 0;
    while ((a = z_gets(buf,256,f)) != NULL) {
        if (strncmp(buf,"ngram ",6) == 0) {
            sscanf(buf+6,"%d=%d",&n,&m);
            if (n < 1 || n > MAX_GRAM) {
                fprintf(stderr,"SLMReadLM ERROR: Can't handle this LM's n-gram length; limit = %d\n",
                        MAX_GRAM);
                return NULL;
            }
            ngram_size[n-1] = m;
            if (n > ngram_len)
                ngram_len = n;
        }
        else if (buf[0] == '\\')
            break;
    }
    if (a == NULL) {
        fprintf(stderr,"SLMReadLM ERROR: file ended while scanning \"ngram n=size\" section\n");
        return NULL;
    }
    /* check for distant-bigram */
    if (distance > 0 && ngram_len != 2) {
      fprintf(stderr,"SLMReadLM ERROR: distance=%d and n=%d: ngram size must be 2\n",distance,ngram_len);
      return NULL;
    }

    /* allocate the trie: node arrays for orders 1..n-1, leaves for order n.
     * The +1 slots presumably leave room for an injected <UNK> entry --
     * TODO confirm against the rest of the file. */
    nodes = New_N(SLMNgramNode*,ngram_len-1);
    for (i = 0; i < ngram_len-1; i++)
        nodes[i] = New_N(SLMNgramNode,ngram_size[i]+1);
    leaves = New_N(SLMNgramLeaf,ngram_size[ngram_len-1]);
    vocab = New_N(char*,ngram_size[0]+1);
    vocab_ht = SLMHashCreateSI(ngram_size[0]*3/2);
    ng->ngram_len = ngram_len;
    if (distance == 0)
      ng->context_len = ngram_len-1;
    else
      ng->context_len = distance+1;
    ng->n_unigram = ngram_size[0];
    ng->node = nodes;
    ng->leaf = leaves;
    ng->vocab = vocab;
    ng->vocab_ht = vocab_ht;
#ifdef NG_CACHE
    /* per-level lookup cache for repeated context searches */
    ng->hist = New_N(SLMNgramSearchHist,ngram_len-1);
    for (i = 0; i < ngram_len-1; i++) {
        ng->hist[i].id = SLM_NONWORD;
        ng->hist[i].node = NULL;
    }
#endif
    /* ngram start */
    n = 1;
    word_num = 0;
    do {
        sprintf(buf2,"\\%d-grams:",n);
        if (strncmp(buf,buf2,strlen(buf2)) != 0) {
            fprintf(stderr,"SLMReadLM ERROR: \"\\%d-grams:\" not found\n",n);
            z_close(f);
            return NULL;
        }
        if (verbosity > 0) {
            fputs(buf2+1,stderr);
            fflush(stderr);
        }
        for (i = 0; i < ngram_size[n-1]; i++) {
            if (verbosity > 0 && i%20000 == 0) {
                fputc('.',stderr);   /* progress dot every 20000 entries */
                fflush(stderr);
            }
            if (n < ngram_len) {
                /* non-leaf order: entry is "logprob w1..wn backoff" */
                if (z_getfloat(f,&prob) != 0) {
                    z_close(f);
                    return format_error(n);
                }
                if (n == 1) {
                    z_getstr(f,buf2,256);
                    if (word_num == 0) {
                        if (strcmp(buf2,"<UNK>") != 0) {
                            /* model has no <UNK>: synthesize one at id 0 and
                             * shift real words to start at id 1; <UNK> is kept
                             * out of the hash so unknown lookups fall to 0 */
                            vocab[word_num] = strdup("<UNK>");
                            vocab[++word_num] = strdup(buf2);
                            SLMIntHashInsert(vocab_ht,vocab[word_num],word_num);
                            SLM_Set_WORD_VOCAB(ng->type,SLM_WORD_VOCAB_CLOSED);
                            ng->first_id = 1;
                            if (verbosity > 1) {
                                fprintf(stderr,"<UNK> not found: vocab_type is set to %s\n",
                                        vocab_type_name(ng->type));
                            }
                        }
                        else
                            vocab[word_num] = strdup(buf2);
                    }
                    else {
                        vocab[word_num] = strdup(buf2);
                        SLMIntHashInsert(vocab_ht,vocab[word_num],word_num);
                    }
                    z_getfloat(f,&alpha);
                    /* probs stored linear: 10^(ARPA log10 value) */
                    nodes[0][i].id = word_num;
                    nodes[0][i].prob = pow(10.0,prob);
                    nodes[0][i].alpha = pow(10.0,alpha);
                    nodes[0][i].nextpos = -1;
                    nodes[0][i].nelem = 0;
                    word_num++;
                }
                else { /* 1 < n < ngram_len */
                    for (j = 0; j < n; j++) {
                        z_getstr(f,buf2,256);
                        idarray[j] = SLMIntHashSearch(vocab_ht,buf2);
                    }
                    z_getfloat(f,&alpha);
                    nodes[n-1][i].id = idarray[n-1];
                    nodes[n-1][i].prob = pow(10.0,prob);
                    nodes[n-1][i].alpha = pow(10.0,alpha);
                    nodes[n-1][i].nextpos = -1;
                    nodes[n-1][i].nelem = 0;
                    putNgramNodeData(ng,ng->node[0],ng->n_unigram,idarray,1,n,i);
                }
            }
            else { /* n == ngram_len */
                /* leaf order: "logprob w1..wn", no backoff weight */
                if (z_getfloat(f,&prob) != 0) {
                    z_close(f);
                    return format_error(n);
                }
                for (j = 0; j < n; j++) {
                    z_getstr(f,buf2,256);
                    idarray[j] = SLMIntHashSearch(vocab_ht,buf2);
                }
                leaves[i].id = idarray[n-1];
                leaves[i].prob = pow(10.0,prob);
                putNgramNodeData(ng,ng->node[0],ng->n_unigram,idarray,1,n,i);
            }
        }
        /* skip to the next "\..." section header */
        while ((a = z_gets(buf,256,f)) != NULL) {
            if (buf[0] == '\\')
                break;
        }
        if (a == NULL) {
            fprintf(stderr,"SLMReadLM ERROR: file ended while scanning \"\\%d-grams:\" or \"\\end\\\"\n",n+1);
            z_close(f);
            return NULL;
        }
        n++;
    } while (strncmp(buf,"\\end\\",5) != 0);
    if (verbosity > 0)
        fputc('\n',stderr);

    /* read additional class info */
    while (z_gets(buf,256,f) != NULL) {
        if (strncmp(buf,"\\class\\",7) == 0) {
            if (verbosity > 1)
                fprintf(stderr,"Reading additional class ngram info...\n");
            SLM_Set_NgramType(ng->type,SLM_ClassNgram);
            break;
        }
    }
    if (SLM_NgramType(ng->type) == SLM_WordNgram) {
        z_close(f);
        return ng;
    }

    /* class n-gram: what was read so far is the class model; repurpose the
     * vocab fields for the word list that follows */
    ng->class_ht = ng->vocab_ht;
    ng->class_sym= ng->vocab;
    ng->first_class_id = ng->first_id;
    ng->first_id = 0;
    if (SLM_WORD_VOCAB(ng->type) == SLM_WORD_VOCAB_OPEN)
        SLM_Set_CLASS_VOCAB(ng->type,SLM_CLASS_VOCAB_OPEN);
    else
        SLM_Set_CLASS_VOCAB(ng->type,SLM_CLASS_VOCAB_CLOSED);
    SLM_Set_WORD_VOCAB(ng->type,SLM_WORD_VOCAB_OPEN);
    SLM_Set_N_UNK(ng->type,SLM_CLASS_UNK);
    z_getint(f,&ng->n_word);
    ng->vocab_ht = SLMHashCreateSI(ng->n_word*3/2);
    ng->vocab = New_N(char*,ng->n_word);
    ng->c_uniprob = New_N(float,ng->n_word);
    ng->class_id = New_N(SLMWordID,ng->n_word);
    /* each entry: "class word log10(P(word|class))" */
    for (i = 0; i < ng->n_word; i++) {
        z_getstr(f,buf,256);
        z_getstr(f,buf2,256);
        z_getfloat(f,&prob);
        if (i == 0 && strcmp(buf2,"<UNK>") != 0) {
            /* first word is not UNK */
            SLM_Set_N_UNK(ng->type,SLM_NO_UNK);
            SLM_Set_WORD_VOCAB(ng->type,SLM_WORD_VOCAB_CLOSED);
            ng->first_id = 1;
            ng->vocab[0] = strdup("<UNK>");
            i++;  /* current entry goes to slot 1; note this means only
                     n_word-1 entries are read in this case -- presumably
                     intended, TODO confirm against the writer */
            if (verbosity > 2) {
                fprintf(stderr,"\n<UNK> not found: vocab_type is set to <%s>\n",
                        vocab_type_name(ng->type));
            }
        }
        ng->class_id[i] = SLMIntHashSearch(ng->class_ht,buf);
        ng->vocab[i] = strdup(buf2);
        SLMIntHashInsert(ng->vocab_ht,ng->vocab[i],i);
        ng->c_uniprob[i] = pow(10.0,prob);
    }
    z_close(f); /* 13,Jul,2005 kato */
    return ng;
}

/* Convert a double log-prob to the fixed-point int4 wire format:
 * scaled so that 1000.0 maps to INT32_MAX.
 * NOTE(review): only positive overflow is clamped, and the clamp value
 * 0x80000000 assigned to int4 is implementation-defined; negative overflow
 * falls through unchecked -- verify the expected value range. */
int4
SLMd2l(double x)
{
    double scale = (double)0x7fffffffL/1000;
    double z = x*scale;
    int4 r;
    if (z > 0x7fffffff)
        r = 0x80000000;
    else
        r = z;
    return r;
}

/* Inverse of SLMd2l: fixed-point int4 back to double. */
double
SLMl2d(int4 x)
{
    double scale = (double)1000/0x7fffffffL;
    return (double)x*scale;
}
#define l2d(x) SLMl2d(x)

/* Read a binary-format language model from 'filename'. Multi-byte fields
 * are stored in network byte order (ntohs/ntohl on read). Supports a v1
 * header and a v2 header that adds a distant-bigram distance field, and an
 * optional trailing class-n-gram section (detected by a successful 16-byte
 * read after the n-gram data). Returns NULL on an unrecognized header. */
static SLMNgram *
SLMReadLM_binary(char *filename,int verbosity)
{
    FILEHANDLE f = z_open(filename,"r");
    char buf[SLM_BINLM_HEADER_SIZE];
    SLMNgram *ng;
    int4 ngram_size[MAX_GRAM];          /* entry count per n-gram order */
    int i,j,m,n;
    int ngram_len;
    int word_num;
    SLMNgramNode **nodes;
    SLMNgramLeaf *leaves;
    float prob,alpha;
    char **vocab;
    SLMHashTable *vocab_ht;
    SLMWordID idarray[MAX_GRAM];
    int4 l;
    uint2 s;
    uint2 distance = 0;

    ng = SLMNewLM();
    ng->filename = strdup(filename);
    z_read(buf,sizeof(char),SLM_BINLM_HEADER_SIZE_V2,f); /* read header */
    if (strcmp(buf,SLM_BINLM_HEADER_MSG_V1) == 0) {
        /* version 1 header: consume the remaining (longer) v1 header bytes */
        z_read(buf,sizeof(char),SLM_BINLM_HEADER_SIZE_V1-SLM_BINLM_HEADER_SIZE_V2,f);
    }
    else if (strcmp(buf,SLM_BINLM_HEADER_MSG_V2) == 0) {
        /* version 2 header: followed by a uint2 distant-bigram distance */
        z_read(&s,sizeof(uint2),1,f);
        distance = ntohs(s);
    }
    else {
        fprintf(stderr,"SLMReadLM: Can't handle this version: %s\n",buf);
        z_close(f);
        return NULL;
    }
    z_read(&s,sizeof(short),1,f); /* ngram length */
    ngram_len = ntohs(s);
    if (distance > 0 && ngram_len != 2) {
      fprintf(stderr,"SLMReadLM ERROR: distance=%d and n=%d: ngram size must be 2\n",distance,ngram_len);
      z_close(f);
      return NULL;
    }
    /* number of ngram */
    for (i = 0; i < ngram_len; i++) {
        z_read(&l,sizeof(int4),1,f);
        ngram_size[i] = ntohl(l);
    }

    /* allocate the trie, same layout as the ARPA reader */
    nodes = New_N(SLMNgramNode*,ngram_len-1);
    for (i = 0; i < ngram_len-1; i++)
        nodes[i] = New_N(SLMNgramNode,ngram_size[i]+1);
    leaves = New_N(SLMNgramLeaf,ngram_size[ngram_len-1]);
    vocab = New_N(char*,ngram_size[0]+1);
    vocab_ht = SLMHashCreateSI(ngram_size[0]*3/2);
    ng->ngram_len = ngram_len;
    if (distance == 0)
        ng->context_len = ngram_len-1;
    else
        ng->context_len = distance+1;
    ng->n_unigram = ngram_size[0];
    ng->node = nodes;
    ng->leaf = leaves;
    ng->vocab = vocab;
    ng->vocab_ht = vocab_ht;
#ifdef NG_CACHE
    ng->hist = New_N(SLMNgramSearchHist,ngram_len-1);
    for (i = 0; i < ngram_len-1; i++) {
        ng->hist[i].id = SLM_NONWORD;
        ng->hist[i].node = NULL;
    }
#endif
    /* read vocab: count, then length-prefixed strings; ids start at 1
     * (slot 0 is presumably reserved for <UNK> -- TODO confirm) */
    n = read_WordID(f,0);
    for (i = 1; i <= n; i++) {
        m = read_ushort(f,0);
        vocab[i] = New_N(char,m+1);
        z_read(vocab[i],sizeof(char),m,f);
        vocab[i][m] = '\0';
        SLMIntHashInsert(vocab_ht,vocab[i],i);
    }
    /* ngram start */
    n = 1;
    word_num = 0;
    do {
        if (verbosity > 0) {
            fprintf(stderr,"%d-grams",n);
            fflush(stderr);
        }
        for (i = 0; i < ngram_size[n-1]; i++) {
            if (verbosity > 0 && i%20000 == 0) {
                fputc('.',stderr);
                fflush(stderr);
            }
            if (n < ngram_len) {
                /* non-leaf record: fixed-point prob, n word ids, backoff */
                z_read(&l,sizeof(int4),1,f);
                prob = l2d(ntohl(l));
                for (j = 0; j < n; j++) {
                    idarray[j] = read_WordID(f,0);
                }
                z_read(&l,sizeof(int4),1,f);
                alpha = l2d(ntohl(l));
                nodes[n-1][i].id = idarray[n-1];
                nodes[n-1][i].prob = pow(10.0,prob);
                nodes[n-1][i].alpha = pow(10.0,alpha);
                nodes[n-1][i].nextpos = -1;
                nodes[n-1][i].nelem = 0;
                if (n > 1)
                    putNgramNodeData(ng,ng->node[0],ng->n_unigram,idarray,1,n,i);
            }
            else { /* n == ngram_len */
                /* leaf record: fixed-point prob, n word ids, no backoff */
                z_read(&l,sizeof(int4),1,f);
                prob = l2d(ntohl(l));
                for (j = 0; j < n; j++) {
                    idarray[j] = read_WordID(f,0);
                }
                leaves[i].id = idarray[n-1];
                leaves[i].prob = pow(10.0,prob);
                putNgramNodeData(ng,ng->node[0],ng->n_unigram,idarray,1,n,i);
            }
        }
        n++;
    } while (n <= ngram_len);
    if (verbosity > 0)
        fputc('\n',stderr);

    /* read additional class info */
    if (z_read(buf,sizeof(char),16,f) == 16) {
        if (verbosity > 1)
            fprintf(stderr,"Reading additional class ngram info...\n");
        SLM_Set_NgramType(ng->type,SLM_ClassNgram);
    }
    if (SLM_NgramType(ng->type) == SLM_WordNgram) {
        z_close(f);
        return ng;
    }

    /* class n-gram: move the class-level tables aside and read the word
     * vocabulary with per-word class ids and unigram probs */
    ng->class_ht = ng->vocab_ht;
    ng->class_sym = ng->vocab;
    ng->first_class_id = ng->first_id;
    if (SLM_WORD_VOCAB(ng->type) == SLM_WORD_VOCAB_OPEN)
        SLM_Set_CLASS_VOCAB(ng->type,SLM_CLASS_VOCAB_OPEN);
    else
        SLM_Set_CLASS_VOCAB(ng->type,SLM_CLASS_VOCAB_CLOSED);
    SLM_Set_WORD_VOCAB(ng->type,SLM_WORD_VOCAB_OPEN);
    SLM_Set_N_UNK(ng->type,SLM_CLASS_UNK);
    z_read(&l,sizeof(int4),1,f);
    ng->n_word = ntohl(l);
    z_read(&s,sizeof(short),1,f);
    ng->first_id = ntohs(s);
    if (ng->first_id != 0) {
        /* first word is not UNK */
        SLM_Set_N_UNK(ng->type,SLM_NO_UNK);
        SLM_Set_WORD_VOCAB(ng->type,SLM_WORD_VOCAB_CLOSED);
        ng->vocab[0] = strdup("<UNK>");
        if (verbosity > 2) {
            fprintf(stderr,"\nvocab_type is set to <%s>\n",
                    vocab_type_name(ng->type));
        }
    }
    ng->vocab_ht = SLMHashCreateSI(ng->n_word*3/2);
    ng->vocab = New_N(char*,ng->n_word);
    ng->c_uniprob = New_N(float,ng->n_word);
    ng->class_id = New_N(SLMWordID,ng->n_word);
    /* NOTE(review): loop bound "i <= ng->n_word" writes index n_word into
     * arrays allocated with n_word elements -- looks like an off-by-one
     * overflow unless n_word excludes one entry in this format; verify
     * against the binary writer before changing. */
    for (i = ng->first_id; i <= ng->n_word; i++) {
        ng->class_id[i] = read_WordID(f,0);
        z_read(&l,sizeof(int4),1,f);
        ng->c_uniprob[i] = pow(10.0,l2d(ntohl(l)));
        if (i != 0) {
            /* slot 0 (<UNK>) has no stored spelling */
            z_read(&s,sizeof(short),1,f);
            j = ntohs(s);
            ng->vocab[i] = New_N(char,j+1);
            z_read(ng->vocab[i],sizeof(char),j,f);
            SLMIntHashInsert(ng->vocab_ht,ng->vocab[i],i);
        }
    }
    z_close(f); /* 13,Jul,2005 kato */
    return ng;
}

/* Walk the chain of already-loaded LMs (linked via next_lm) and return the
 * one whose filename matches, or NULL if none does. */
static SLMNgram *
check_ngram_filename(SLMNgram *ng, char *filename)
{
    while (ng != NULL) {
        if (strcmp(ng->filename, filename) == 0)
            return ng;
        ng = ng->next_lm;
    }
    return NULL;
}

/* Dispatch to the reader for the requested LM file format. */
static SLMNgram *
SLMReadLM0(char *filename,int format,int verbosity)
{
    if (format == SLM_LM_ARPA)
        return SLMReadLM_arpa(filename,verbosity);
    else if (format == SLM_LM_BINARY)
        return SLMReadLM_binary(filename,verbosity);
    else {
        fprintf(stderr,"SLMReadLM Error: unsupported format %d\n",format);
        return NULL;
    }
}

/* Create a delegate LM of context length 'len'; rejected for distant
 * bigrams. TRUNCATED HERE -- the body continues on the next page of the
 * original source. */
static SLMNgram *
create_delegate(SLMNgram *ng, int len)
{
    SLMNgram *ng2;
    if (IS_DISTANT_BIGRAM(ng)) {
        fprintf(stderr,"SLMReadLM Error: Can't create delegate for a distant bigram\n");

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -