⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 ngram_read_bin.c

📁 julius version 4.12.about sound recognition.
💻 C
📖 第 1 页 / 共 2 页
字号:
/**
 * @file   ngram_read_bin.c
 *
 * <JA>
 * @brief  バイナリ形式のN-gramファイルを読み込む
 *
 * バイナリ形式では 2-gram と逆向き 3-gram が1つのファイルに
 * 収められています．バイナリ形式はJulius独自形式のみをサポートしており，
 * 他のバイナリ形式と互換性はありませんので注意して下さい．
 *
 * rev.3.5 より，バイナリN-gramのファイル形式の一部が変更されました．
 * バイトオーダーが Big endian 固定からマシン依存に変更され(ヘッダに
 * 変換時の情報を記述), またインデックスの 24bit 化
 * および 2-gram のバックオフデータの圧縮も行われました．
 * これにより，3.5 以降の mkbingram で生成したバイナリN-gramは,
 * 3.4.2以前の Julius では使えませんので注意してください．
 * (ヘッダチェックでエラーとなる)
 *
 * なお 3.5 以降の Julius では従来のモデルも問題なく読める．この場合,
 * インデックスの 24bit 化とバックオフの圧縮はモデル読み込み時に
 * その都度行われる．またバイトオーダーはヘッダを見て適宜変換するので，
 * 異なるバイトオーダーのマシンで生成した
 * バイナリN-gramでも問題なく読める．もちろん従来のモデルもそのまま
 * 読み込める．
 *
 * </JA>
 *
 * <EN>
 * @brief  Read binary format N-gram file
 *
 * In binary format, both 2-gram and reverse 3-gram are stored
 * together in one file.  This binary format is not
 * compatible with other binary formats of language models.
 *
 * From 3.5, the internal format of binary N-gram has changed to use
 * machine-dependent natural byte order (previously fixed to big endian),
 * 24bit index and 2-gram backoff compression.  So, binary N-gram
 * generated by mkbingram of 3.5 and later will not work on 3.4.2 and
 * earlier versions.
 *
 * There is full upward- and cross-machine compatibility in 3.5.  Old
 * binary N-gram files still can be read directly, in which case the conversion
 * to 24bit index will be performed just after the model has been read.
 * Byte order will also be handled according to the header information, so
 * binary N-gram still can be used among different machines.
* </EN> *  * @author Akinobu LEE * @date   Wed Feb 16 17:12:08 2005 * * $Revision: 1.6 $ *  *//* * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology * All rights reserved */#include <sent/stddefs.h>#include <sent/ngram2.h>static int file_version;  ///< N-gram format version of the filestatic boolean need_swap; ///< TRUE if need byte swap#ifdef WORDS_INTstatic boolean need_conv;	///< TRUE if need conversion of word ID from 2 bytes to 4 bytesstatic boolean words_int_retry = FALSE; ///< TRUE if retrying with conversion#endif/** *  *  */#define rdn(A,B,C,D) if (rdnfunc(A,B,C,D) == FALSE) return FALSE#define rdn_wordid(A,B,C,D) if (rdn_wordid_func(A,B,C,D) == FALSE) return FALSE/**  * Binary read function with byte swap *  * @param fp [in] file pointer * @param buf [out] data buffer * @param unitbyte [in] unit size in bytes * @param unitnum [in] number of unit to read. */static booleanrdnfunc(FILE *fp, void *buf, size_t unitbyte, size_t unitnum){  size_t tmp;  if ((tmp = myfread(buf, unitbyte, unitnum, fp)) < unitnum) {    jlog("Error: ngram_read_bin: failed to read %d bytes\n", unitbyte*unitnum);    return FALSE;  }  if (need_swap) {    if (unitbyte != 1) {      swap_bytes(buf, unitbyte, unitnum);    }  }  return TRUE;}#ifdef WORDS_INT/**  * Binary read function with byte swap and word id conversion *  * @param fp [in] file pointer * @param buf [out] data buffer * @param unitnum [in] number of unit to read. 
* @param need_conv [in] TRUE if need conversion from 2byte to 4byte */static booleanrdn_wordid_func(FILE *fp, void *buf, int unitnum, boolean need_conv){  int i;  unsigned short *s;  WORD_ID *t;  WORD_ID d;  if (need_conv) {    /* read unsigned short units */    rdn(fp, buf, sizeof(unsigned short), unitnum);    /* convert them to WORD_ID (integer) */    for(i=unitnum-1;i>=0;i--) {      s = (unsigned short *)buf + i;      t = (WORD_ID *)buf + i;      d = *s;      *t = d;    }  } else {    /* read as usual */    rdn(fp, buf, sizeof(WORD_ID), unitnum);  }  return TRUE;}#endif/**  * Check header to see whether the version matches. *  * @param fp [in] file pointer */static booleancheck_header(FILE *fp){  char buf[BINGRAM_HDSIZE], *p;  rdn(fp, buf, 1, BINGRAM_HDSIZE);  p = buf;#ifdef WORDS_INT  need_conv = FALSE;#endif  /* version check */  if (strnmatch(p, BINGRAM_IDSTR, strlen(BINGRAM_IDSTR))) {    /* bingram file made by mkbingram before 3.4.2 */    file_version = 3;    p += strlen(BINGRAM_IDSTR) + 1;  } else if (strnmatch(p, BINGRAM_IDSTR_V4, strlen(BINGRAM_IDSTR_V4))) {    /* bingram file made by mkbingram later than 3.5 */    file_version = 4;    p += strlen(BINGRAM_IDSTR_V4) + 1;  } else if (strnmatch(p, BINGRAM_IDSTR_V5, strlen(BINGRAM_IDSTR_V5))) {    /* bingram file made by JuliusLib-4 and later */    file_version = 5;    p += strlen(BINGRAM_IDSTR_V5) + 1;  } else {    /* not a bingram file */    jlog("Error: ngram_read_bin: invalid header\n");    return FALSE;  }  /* word size check (for bingram build by mkbingram 3.3p5 and later */  if (strnmatch(p, BINGRAM_SIZESTR_HEAD, strlen(BINGRAM_SIZESTR_HEAD))) {    p += strlen(BINGRAM_SIZESTR_HEAD);    if (! 
strnmatch(p, BINGRAM_SIZESTR_BODY, strlen(BINGRAM_SIZESTR_BODY))) {      /* word size does not match (int / short) */#ifdef WORDS_INT      if (strnmatch(p, BINGRAM_SIZESTR_BODY_2BYTE, strlen(BINGRAM_SIZESTR_BODY_2BYTE))) {	/* this is 2-byte word ID, will convert while reading */	jlog("Warning: ngram_read_bin: 2-bytes bingram, converting to 4 bytes\n");	need_conv = TRUE;	p += strlen(BINGRAM_SIZESTR_BODY_2BYTE) + 1;      } else {	jlog("Error: ngram_read_bin: unknown word byte size!\n");	return FALSE;      }#else      if (strnmatch(p, BINGRAM_SIZESTR_BODY_4BYTE, strlen(BINGRAM_SIZESTR_BODY_4BYTE))) {	/*** 4bytes to 2bytes not implemented, just terminate here... ***/	jlog("Error: ngram_read_bin: cannot handle 4-bytes bingram\n");	jlog("Error: ngram_read_bin: please use Julius compiled with --enable-words-int\n");	return FALSE;	//p += strlen(BINGRAM_SIZESTR_BODY_4BYTE) + 1;      } else {	jlog("Error: ngram_read_bin: unknown word byte size!\n");	return FALSE;      }#endif    } else {      p += strlen(BINGRAM_SIZESTR_BODY) + 1;    }    /* byte order check (v4 (rev.3.5) and later) */    if (file_version >= 4) {      if (!strnmatch(p, BINGRAM_BYTEORDER_HEAD, strlen(BINGRAM_BYTEORDER_HEAD))) {	jlog("Error: ngram_read_bin: no information for byte order??\n");	return FALSE;      }      p += strlen(BINGRAM_BYTEORDER_HEAD);      if (! strnmatch(p, BINGRAM_NATURAL_BYTEORDER, strlen(BINGRAM_NATURAL_BYTEORDER))) {	/* file endian and running endian is different, need swapping */	need_swap = TRUE;      } else {	need_swap = FALSE;      }      p += strlen(BINGRAM_NATURAL_BYTEORDER) + 1;    }  } /* if no BINGRAM_SIZESTR_HEAD found, just pass it */  /* in case of V3 bingram file, the unit size of word_id and its byte order     cannot be determined from the header.  In that case, we assume      byteorder to be a BIG ENDIAN.  The word_id unit size (2byte in normal,     or 4byte if bingram generated with mkbingram with --enable-words-int)     will be automagically detected.     
*/  if (file_version < 4) {    /* assume input as big endian */#ifdef WORDS_BIGENDIAN    need_swap = FALSE;#else    need_swap = TRUE;#endif  }      /*jlog("%s",buf);*/  return TRUE;}static booleanngram_read_bin_v5(FILE *fp, NGRAM_INFO *ndata){  int i,n,len;  char *w, *p;#ifdef WORDS_INT  unsigned short *buf;#endif  NGRAM_TUPLE_INFO *t;  /* read some info extended from version 5 */  rdn(fp, &(ndata->n), sizeof(int), 1);  rdn(fp, &(ndata->dir), sizeof(int), 1);  rdn(fp, &(ndata->bigram_index_reversed), sizeof(boolean), 1);  jlog("Stat: ngram_read_bin_v5: this is %s %d-gram file\n", (ndata->dir == DIR_LR) ? "forward" : "backward", ndata->n);  if (ndata->n > MAX_N) {    jlog("Error: ngram_read_bin_v5: too long N-gram (N=%d)\n", n);    jlog("Error: ngram_read_bin_v5: current maximum length of N-gram is set to %d\n", MAX_N);    jlog("Error: ngram_read_bin_v5: you can expand the limit by setting MAX_N in \"sent/ngram.h\"\n");    return FALSE;  }  /* read total info and set max_word_num */  for(n=0;n<ndata->n;n++) {    rdn(fp, &(ndata->d[n].totalnum), sizeof(NNID), 1);  }  ndata->max_word_num = ndata->d[0].totalnum;  /* read wname */  rdn(fp, &len, sizeof(int), 1);  w = mymalloc(len);  rdn(fp, w, 1, len);  /* assign... 
*/  ndata->wname = (char **)mymalloc(sizeof(char *) * ndata->max_word_num);  p = w; i = 0;  while (p < w + len) {    ndata->wname[i++] = p;    while(*p != '\0') p++;    p++;  }  if (i != ndata->max_word_num) {    jlog("Error: ngram_read_bin_v5: wname error??\n");    return FALSE;  }  /* read N-gram */  for(n=0;n<ndata->n;n++) {    jlog("stat: ngram_read_bin_v5: reading %d-gram\n", n+1);    t = &(ndata->d[n]);        rdn(fp, &(t->is24bit), sizeof(boolean), 1);    rdn(fp, &(t->ct_compaction), sizeof(boolean), 1);    rdn(fp, &(t->bgnlistlen), sizeof(NNID), 1);    rdn(fp, &(t->context_num), sizeof(NNID), 1);    if (n > 0) {      if (t->is24bit) {	t->bgn_upper = (NNID_UPPER *)mymalloc_big(sizeof(NNID_UPPER), t->bgnlistlen);	rdn(fp, t->bgn_upper, sizeof(NNID_UPPER), t->bgnlistlen);	t->bgn_lower = (NNID_LOWER *)mymalloc_big(sizeof(NNID_LOWER), t->bgnlistlen);	rdn(fp, t->bgn_lower, sizeof(NNID_LOWER), t->bgnlistlen);      } else {	t->bgn = (NNID *)mymalloc_big(sizeof(NNID), t->bgnlistlen);	rdn(fp, t->bgn, sizeof(NNID), t->bgnlistlen);      }      t->num = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), t->bgnlistlen);      rdn(fp, t->num, sizeof(WORD_ID), t->bgnlistlen);      t->nnid2wid = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), t->totalnum);      rdn(fp, t->nnid2wid, sizeof(WORD_ID), t->totalnum);    } else {      t->bgn_upper = NULL;      t->bgn_lower = NULL;      t->bgn = NULL;      t->num = NULL;      t->bgnlistlen = 0;      t->nnid2wid = NULL;    }    t->prob = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->totalnum);    rdn(fp, t->prob, sizeof(LOGPROB), t->totalnum);    rdn(fp, &i, sizeof(int), 1);    if (i == 1) {      t->bo_wt = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->context_num);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -