📄 ngram_read_bin.c
字号:
/**
 * @file   ngram_read_bin.c
 *
 * <JA>
 * @brief  バイナリ形式のN-gramファイルを読み込む
 *
 * バイナリ形式では 2-gram と逆向き 3-gram が1つのファイルに
 * 収められています．バイナリ形式はJulius独自形式のみをサポートしており，
 * 他のバイナリ形式と互換性はありませんので注意して下さい．
 *
 * rev.3.5 より，バイナリN-gramのファイル形式の一部が変更されました．
 * バイトオーダーが Big endian 固定からマシン依存に変更され(ヘッダに
 * 変換時の条件を記述), またインデックスの 24bit 化
 * および 2-gram のバックオフデータの圧縮も行われました．
 * これにより，3.5 以降の mkbingram で生成したバイナリN-gramは,
 * 3.4.2以前の Julius では使えませんので注意してください．
 * (ヘッダチェックでエラーとなる)
 *
 * なお 3.5 以降の Julius では従来のモデルも問題なく読める．この場合,
 * インデックスの 24bit 化とバックオフの圧縮はモデル読み込み時に
 * その都度行われる．またバイトオーダーはヘッダを見て適宜変換するので，
 * 異なるバイトオーダーのマシンで生成した
 * バイナリN-gramでも問題なく読める．もちろん従来のモデルもそのまま
 * 読み込める．
 *
 * </JA>
 *
 * <EN>
 * @brief  Read binary format N-gram file
 *
 * In binary format, both 2-gram and reverse 3-gram are stored
 * together in one file.  This binary format is not
 * compatible with other binary formats of language models.
 *
 * From 3.5, the internal format of binary N-gram has changed to use
 * machine-dependent natural byte order (previously fixed to big endian),
 * 24bit index and 2-gram backoff compression.  So, binary N-gram
 * generated by mkbingram of 3.5 and later will not work on 3.4.2 and
 * earlier versions.
 *
 * There is full upward- and cross-machine compatibility in 3.5.  Old
 * binary N-gram files can still be read directly, in which case the conversion
 * to 24bit index will be performed just after the model has been read.
 * Byte order is also determined from the header information, so
 * binary N-gram can still be used among different machines.
* </EN> * * @author Akinobu LEE * @date Wed Feb 16 17:12:08 2005 * * $Revision: 1.6 $ * *//* * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology * All rights reserved */#include <sent/stddefs.h>#include <sent/ngram2.h>static int file_version; ///< N-gram format version of the filestatic boolean need_swap; ///< TRUE if need byte swap#ifdef WORDS_INTstatic boolean need_conv; ///< TRUE if need conversion of word ID from 2 bytes to 4 bytesstatic boolean words_int_retry = FALSE; ///< TRUE if retrying with conversion#endif/** * * */#define rdn(A,B,C,D) if (rdnfunc(A,B,C,D) == FALSE) return FALSE#define rdn_wordid(A,B,C,D) if (rdn_wordid_func(A,B,C,D) == FALSE) return FALSE/** * Binary read function with byte swap * * @param fp [in] file pointer * @param buf [out] data buffer * @param unitbyte [in] unit size in bytes * @param unitnum [in] number of unit to read. */static booleanrdnfunc(FILE *fp, void *buf, size_t unitbyte, size_t unitnum){ size_t tmp; if ((tmp = myfread(buf, unitbyte, unitnum, fp)) < unitnum) { jlog("Error: ngram_read_bin: failed to read %d bytes\n", unitbyte*unitnum); return FALSE; } if (need_swap) { if (unitbyte != 1) { swap_bytes(buf, unitbyte, unitnum); } } return TRUE;}#ifdef WORDS_INT/** * Binary read function with byte swap and word id conversion * * @param fp [in] file pointer * @param buf [out] data buffer * @param unitnum [in] number of unit to read. 
* @param need_conv [in] TRUE if need conversion from 2byte to 4byte */static booleanrdn_wordid_func(FILE *fp, void *buf, int unitnum, boolean need_conv){ int i; unsigned short *s; WORD_ID *t; WORD_ID d; if (need_conv) { /* read unsigned short units */ rdn(fp, buf, sizeof(unsigned short), unitnum); /* convert them to WORD_ID (integer) */ for(i=unitnum-1;i>=0;i--) { s = (unsigned short *)buf + i; t = (WORD_ID *)buf + i; d = *s; *t = d; } } else { /* read as usual */ rdn(fp, buf, sizeof(WORD_ID), unitnum); } return TRUE;}#endif/** * Check header to see whether the version matches. * * @param fp [in] file pointer */static booleancheck_header(FILE *fp){ char buf[BINGRAM_HDSIZE], *p; rdn(fp, buf, 1, BINGRAM_HDSIZE); p = buf;#ifdef WORDS_INT need_conv = FALSE;#endif /* version check */ if (strnmatch(p, BINGRAM_IDSTR, strlen(BINGRAM_IDSTR))) { /* bingram file made by mkbingram before 3.4.2 */ file_version = 3; p += strlen(BINGRAM_IDSTR) + 1; } else if (strnmatch(p, BINGRAM_IDSTR_V4, strlen(BINGRAM_IDSTR_V4))) { /* bingram file made by mkbingram later than 3.5 */ file_version = 4; p += strlen(BINGRAM_IDSTR_V4) + 1; } else if (strnmatch(p, BINGRAM_IDSTR_V5, strlen(BINGRAM_IDSTR_V5))) { /* bingram file made by JuliusLib-4 and later */ file_version = 5; p += strlen(BINGRAM_IDSTR_V5) + 1; } else { /* not a bingram file */ jlog("Error: ngram_read_bin: invalid header\n"); return FALSE; } /* word size check (for bingram build by mkbingram 3.3p5 and later */ if (strnmatch(p, BINGRAM_SIZESTR_HEAD, strlen(BINGRAM_SIZESTR_HEAD))) { p += strlen(BINGRAM_SIZESTR_HEAD); if (! 
strnmatch(p, BINGRAM_SIZESTR_BODY, strlen(BINGRAM_SIZESTR_BODY))) { /* word size does not match (int / short) */#ifdef WORDS_INT if (strnmatch(p, BINGRAM_SIZESTR_BODY_2BYTE, strlen(BINGRAM_SIZESTR_BODY_2BYTE))) { /* this is 2-byte word ID, will convert while reading */ jlog("Warning: ngram_read_bin: 2-bytes bingram, converting to 4 bytes\n"); need_conv = TRUE; p += strlen(BINGRAM_SIZESTR_BODY_2BYTE) + 1; } else { jlog("Error: ngram_read_bin: unknown word byte size!\n"); return FALSE; }#else if (strnmatch(p, BINGRAM_SIZESTR_BODY_4BYTE, strlen(BINGRAM_SIZESTR_BODY_4BYTE))) { /*** 4bytes to 2bytes not implemented, just terminate here... ***/ jlog("Error: ngram_read_bin: cannot handle 4-bytes bingram\n"); jlog("Error: ngram_read_bin: please use Julius compiled with --enable-words-int\n"); return FALSE; //p += strlen(BINGRAM_SIZESTR_BODY_4BYTE) + 1; } else { jlog("Error: ngram_read_bin: unknown word byte size!\n"); return FALSE; }#endif } else { p += strlen(BINGRAM_SIZESTR_BODY) + 1; } /* byte order check (v4 (rev.3.5) and later) */ if (file_version >= 4) { if (!strnmatch(p, BINGRAM_BYTEORDER_HEAD, strlen(BINGRAM_BYTEORDER_HEAD))) { jlog("Error: ngram_read_bin: no information for byte order??\n"); return FALSE; } p += strlen(BINGRAM_BYTEORDER_HEAD); if (! strnmatch(p, BINGRAM_NATURAL_BYTEORDER, strlen(BINGRAM_NATURAL_BYTEORDER))) { /* file endian and running endian is different, need swapping */ need_swap = TRUE; } else { need_swap = FALSE; } p += strlen(BINGRAM_NATURAL_BYTEORDER) + 1; } } /* if no BINGRAM_SIZESTR_HEAD found, just pass it */ /* in case of V3 bingram file, the unit size of word_id and its byte order cannot be determined from the header. In that case, we assume byteorder to be a BIG ENDIAN. The word_id unit size (2byte in normal, or 4byte if bingram generated with mkbingram with --enable-words-int) will be automagically detected. 
*/ if (file_version < 4) { /* assume input as big endian */#ifdef WORDS_BIGENDIAN need_swap = FALSE;#else need_swap = TRUE;#endif } /*jlog("%s",buf);*/ return TRUE;}static booleanngram_read_bin_v5(FILE *fp, NGRAM_INFO *ndata){ int i,n,len; char *w, *p;#ifdef WORDS_INT unsigned short *buf;#endif NGRAM_TUPLE_INFO *t; /* read some info extended from version 5 */ rdn(fp, &(ndata->n), sizeof(int), 1); rdn(fp, &(ndata->dir), sizeof(int), 1); rdn(fp, &(ndata->bigram_index_reversed), sizeof(boolean), 1); jlog("Stat: ngram_read_bin_v5: this is %s %d-gram file\n", (ndata->dir == DIR_LR) ? "forward" : "backward", ndata->n); if (ndata->n > MAX_N) { jlog("Error: ngram_read_bin_v5: too long N-gram (N=%d)\n", n); jlog("Error: ngram_read_bin_v5: current maximum length of N-gram is set to %d\n", MAX_N); jlog("Error: ngram_read_bin_v5: you can expand the limit by setting MAX_N in \"sent/ngram.h\"\n"); return FALSE; } /* read total info and set max_word_num */ for(n=0;n<ndata->n;n++) { rdn(fp, &(ndata->d[n].totalnum), sizeof(NNID), 1); } ndata->max_word_num = ndata->d[0].totalnum; /* read wname */ rdn(fp, &len, sizeof(int), 1); w = mymalloc(len); rdn(fp, w, 1, len); /* assign... 
*/ ndata->wname = (char **)mymalloc(sizeof(char *) * ndata->max_word_num); p = w; i = 0; while (p < w + len) { ndata->wname[i++] = p; while(*p != '\0') p++; p++; } if (i != ndata->max_word_num) { jlog("Error: ngram_read_bin_v5: wname error??\n"); return FALSE; } /* read N-gram */ for(n=0;n<ndata->n;n++) { jlog("stat: ngram_read_bin_v5: reading %d-gram\n", n+1); t = &(ndata->d[n]); rdn(fp, &(t->is24bit), sizeof(boolean), 1); rdn(fp, &(t->ct_compaction), sizeof(boolean), 1); rdn(fp, &(t->bgnlistlen), sizeof(NNID), 1); rdn(fp, &(t->context_num), sizeof(NNID), 1); if (n > 0) { if (t->is24bit) { t->bgn_upper = (NNID_UPPER *)mymalloc_big(sizeof(NNID_UPPER), t->bgnlistlen); rdn(fp, t->bgn_upper, sizeof(NNID_UPPER), t->bgnlistlen); t->bgn_lower = (NNID_LOWER *)mymalloc_big(sizeof(NNID_LOWER), t->bgnlistlen); rdn(fp, t->bgn_lower, sizeof(NNID_LOWER), t->bgnlistlen); } else { t->bgn = (NNID *)mymalloc_big(sizeof(NNID), t->bgnlistlen); rdn(fp, t->bgn, sizeof(NNID), t->bgnlistlen); } t->num = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), t->bgnlistlen); rdn(fp, t->num, sizeof(WORD_ID), t->bgnlistlen); t->nnid2wid = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), t->totalnum); rdn(fp, t->nnid2wid, sizeof(WORD_ID), t->totalnum); } else { t->bgn_upper = NULL; t->bgn_lower = NULL; t->bgn = NULL; t->num = NULL; t->bgnlistlen = 0; t->nnid2wid = NULL; } t->prob = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->totalnum); rdn(fp, t->prob, sizeof(LOGPROB), t->totalnum); rdn(fp, &i, sizeof(int), 1); if (i == 1) { t->bo_wt = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->context_num);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -