📄 ngram_read_bin.c
字号:
/** * @file ngram_read_bin.c * @author Akinobu LEE * @date Wed Feb 16 17:12:08 2005 * * <JA> * @brief バイナリ妨及のN-gramファイルを粕み哈む * * バイナリ妨及では 2-gram と嫡羹き 3-gram が1つのファイルに * 箭められていますˉバイナリ妨及はJuilus迫极妨及のみをサポ〖トしており· * 戮のバイナリ妨及と高垂拉はありませんので庙罢して布さいˉ * * rev.3.5 より·バイナリN-gramのファイル妨及の办婶が恃构されましたˉ * バイトオ〖ダ〖が Big endian 盖年からマシン巴赂に恃构され(ヘッダに * 恃垂箕の掘凤を淡揭), またインデックスの 24bit 步 * および 2-gram のバックオフデ〖タの暗教も乖われましたˉ * これにより·3.5 笆惯の mkbingram で栏喇したバイナリN-gramは, * 3.4.2笆涟の Julius では蝗えませんので庙罢してくださいˉ * (ヘッダチェックでエラ〖となる) * * なお 3.5 笆惯の Julius では骄丸のモデルも啼玛なく粕めるˉこの眷圭, * インデックスの 24bit 步とバックオフの暗教はモデル粕み哈み箕に * その旁刨乖われるˉまたバイトオ〖ダ〖はヘッダを斧て努倒恃垂するので· * 佰なるバイトオ〖ダ〖のマシンで栏喇した * バイナリN-gramでも啼玛なく粕めるˉもちろん骄丸のモデルもそのまま * 粕み哈めるˉ * * </JA> * * <EN> * @brief Read binary foramt N-gram file * * In binary format, both 2-gram and reverse 3-gram are stored * together in one file. This binary format is not * compatible with other binary format of language model. * * From 3.5, internal format of binary N-gram has changed for using * machine-dependent natural byte order (previously fixed to big endian), * 24bit index and 2-gram backoff compression. So, binary N-gram * generated by mkbingram of 3.5 and later will not work on 3.4.2 and * earlier versions. * * There is full upward- and cross-machine compatibility in 3.5. Old * binary N-gram files still can be read directly, in which case the conversion * to 24bit index will performed just after model has been read. * Byte order will also considered by header information, so * binary N-gram still can be used among different machines. * </EN> * * $Revision: 1.6 $ * *//* * Copyright (c) 1991-2006 Kawahara Lab., Kyoto University * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology * Copyright (c) 2005-2006 Julius project team, Nagoya Institute of Technology * All rights reserved */#include <sent/stddefs.h>#include <sent/ngram2.h>static int file_version; ///< N-gram format version of the filestatic boolean need_swap; ///< TRUE if need byte swap#ifdef WORDS_INTstatic boolean need_conv; ///< TRUE if need conversion of word ID from 2 bytes to 4 bytesstatic boolean words_int_retry = FALSE; ///< TRUE if retrying with conversion#endif/** * Binary read function with byte swap * * @param fp [in] file pointer * @param buf [out] data buffer * @param unitbyte [in] unit size in bytes * @param unitnum [in] number of unit to read. */static voidrdn(FILE *fp, void *buf, size_t unitbyte, int unitnum){ size_t tmp; if ((tmp = myfread(buf, unitbyte, unitnum, fp)) < (size_t)unitnum) { perror("ngram_read_bin"); j_error("read failed\n"); } if (need_swap) { if (unitbyte != 1) { swap_bytes(buf, unitbyte, unitnum); } }}#ifdef WORDS_INT/** * Binary read function with byte swap and word id conversion * * @param fp [in] file pointer * @param buf [out] data buffer * @param unitnum [in] number of unit to read. * @param need_conv [in] TRUE if need conversion from 2byte to 4byte */static voidrdn_wordid(FILE *fp, void *buf, int unitnum, boolean need_conv){ int i; unsigned short *s; WORD_ID *t; WORD_ID d; if (need_conv) { /* read unsigned short units */ rdn(fp, buf, sizeof(unsigned short), unitnum); /* convert them to WORD_ID (integer) */ for(i=unitnum-1;i>=0;i--) { s = (unsigned short *)buf + i; t = (WORD_ID *)buf + i; d = *s; *t = d; } } else { /* read as usual */ rdn(fp, buf, sizeof(WORD_ID), unitnum); }}#endif/** * Check header to see whether the version matches. * * @param fp [in] file pointer */static voidcheck_header(FILE *fp){ char buf[BINGRAM_HDSIZE], *p; rdn(fp, buf, 1, BINGRAM_HDSIZE); p = buf;#ifdef WORDS_INT need_conv = FALSE;#endif /* version check */ if (strnmatch(p, BINGRAM_IDSTR, strlen(BINGRAM_IDSTR))) { /* bingram file made by mkbingram before 3.4.2 */ file_version = 3; p += strlen(BINGRAM_IDSTR) + 1; } else if (strnmatch(p, BINGRAM_IDSTR_V4, strlen(BINGRAM_IDSTR_V4))) { /* bingram file made by mkbingram later than 3.5 */ file_version = 4; p += strlen(BINGRAM_IDSTR_V4) + 1; } else { /* not a bingram file */ j_printerr("Error: invalid header, you probably use an old bingram\n"); j_error("Error: if so, please re-make with newer mkbingram that comes with Julius-2.0 or later\n"); } /* word size check (for bingram build by mkbingram 3.3p5 and later */ if (strnmatch(p, BINGRAM_SIZESTR_HEAD, strlen(BINGRAM_SIZESTR_HEAD))) { p += strlen(BINGRAM_SIZESTR_HEAD); if (! strnmatch(p, BINGRAM_SIZESTR_BODY, strlen(BINGRAM_SIZESTR_BODY))) { /* word size does not match (int / short) */#ifdef WORDS_INT if (strnmatch(p, BINGRAM_SIZESTR_BODY_2BYTE, strlen(BINGRAM_SIZESTR_BODY_2BYTE))) { /* this is 2-byte word ID, will convert while reading */ j_printerr("\nWarning: 2-bytes bingram, converting to 4 bytes\n"); need_conv = TRUE; p += strlen(BINGRAM_SIZESTR_BODY_2BYTE) + 1; } else { j_error("\nError: unknown word byte size!\n"); }#else if (strnmatch(p, BINGRAM_SIZESTR_BODY_4BYTE, strlen(BINGRAM_SIZESTR_BODY_4BYTE))) { /*** 4bytes to 2bytes not implemented, just terminate here... ***/ j_printerr("\nError: cannot handle 4-bytes bingram\n"); j_error("Error: please use Julius compiled with --enable-words-int\n"); //p += strlen(BINGRAM_SIZESTR_BODY_4BYTE) + 1; } else { j_error("\nError: unknown word byte size!\n"); }#endif } else { p += strlen(BINGRAM_SIZESTR_BODY) + 1; } /* byte order check (v4 (rev.3.5) and later) */ if (file_version == 4) { if (!strnmatch(p, BINGRAM_BYTEORDER_HEAD, strlen(BINGRAM_BYTEORDER_HEAD))) { j_error("\nError: no information for byte order??\n"); } p += strlen(BINGRAM_BYTEORDER_HEAD); if (! strnmatch(p, BINGRAM_NATURAL_BYTEORDER, strlen(BINGRAM_NATURAL_BYTEORDER))) { /* file endian and running endian is different, need swapping */ need_swap = TRUE; } else { need_swap = FALSE; } p += strlen(BINGRAM_NATURAL_BYTEORDER) + 1; } } /* if no BINGRAM_SIZESTR_HEAD found, just pass it */ /* in case of V3 bingram file, the unit size of word_id and its byte order cannot be determined from the header. In that case, we assume byteorder to be a BIG ENDIAN. The word_id unit size (2byte in normal, or 4byte if bingram generated with mkbingram with --enable-words-int) will be automagically detected. */ if (file_version != 4) { /* assume input as big endian */#ifdef WORDS_BIGENDIAN need_swap = FALSE;#else need_swap = TRUE;#endif } /*j_printf("%s",buf);*/}/** * Read a N-gram binary file and store to data. * * @param fp [in] file pointer * @param ndata [out] N-gram data to store the read data * * @return TRUE on success, FALSE on failure. */booleanngram_read_bin(FILE *fp, NGRAM_INFO *ndata){ int i,n,len; char *w, *p; NNID *n3_bgn; NNID d, ntmp;#ifdef WORDS_INT unsigned short *buf;#endif#ifdef WORDS_INT /* reset retry flag */ words_int_retry = FALSE; /* when retrying, it restarts from here with words_int_retry = TRUE */ ngram_read_bin_start:#endif ndata->from_bin = TRUE; /* check initial header */ check_header(fp);#ifdef WORDS_INT /* in retry mode, force word_id conversion */ if (words_int_retry) need_conv = TRUE;#endif #ifdef WORDS_INT if (need_conv) j_printerr("(wordid conv)..");#endif /* read total info and set max_word_num */ for(n=0;n<MAX_N;n++) { rdn(fp, &(ndata->ngram_num[n]), sizeof(NNID), 1); if (file_version == 4 && ndata->ngram_num[n] >= NNIDMAX) { j_error("Error: too big %d-gram (%d, should be less than %d)\n", n+1, ndata->ngram_num[n], NNIDMAX); } } ndata->max_word_num = ndata->ngram_num[0]; if (file_version == 4) rdn(fp, &(ndata->bigram_bo_num), sizeof(NNID), 1); /* version requirement check */ switch(file_version) { case 4: ndata->version = 4; break; case 3: if (ndata->ngram_num[2] >= NNIDMAX) { j_printerr("Warning: more than %d 3-gram tuples, use old structure\n", NNIDMAX); ndata->version = 3; } else { ndata->version = 4; /* will be converted to v4 later */ } break; } /* read wname */ rdn(fp, &len, sizeof(int), 1); w = mymalloc(len); rdn(fp, w, 1, len); /* assign... */ ndata->wname = (char **)mymalloc(sizeof(char *)*ndata->ngram_num[0]); p = w; i = 0; while (p < w + len) { ndata->wname[i++] = p; while(*p != '\0') p++; p++; } if (i != ndata->ngram_num[0]) { j_error("wname error??\n"); } /* malloc 1-gram */ ndata->p = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[0]); ndata->bo_wt_lr = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[0]); ndata->bo_wt_rl = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[0]); ndata->n2_bgn = (NNID *)mymalloc(sizeof(NNID) * ndata->ngram_num[0]); ndata->n2_num = (WORD_ID *)mymalloc(sizeof(WORD_ID) * ndata->ngram_num[0]); /* read 1-gram */ j_printerr("1-gram."); rdn(fp, ndata->p, sizeof(LOGPROB), ndata->ngram_num[0]); j_printerr("."); rdn(fp, ndata->bo_wt_lr, sizeof(LOGPROB), ndata->ngram_num[0]); j_printerr("."); rdn(fp, ndata->bo_wt_rl, sizeof(LOGPROB), ndata->ngram_num[0]); j_printerr("."); rdn(fp, ndata->n2_bgn, sizeof(NNID), ndata->ngram_num[0]); j_printerr(".");#ifdef WORDS_INT rdn_wordid(fp, ndata->n2_num, ndata->ngram_num[0], need_conv);#else rdn(fp, ndata->n2_num, sizeof(WORD_ID), ndata->ngram_num[0]);#endif#ifdef WORDS_INT { /* check if we are wrongly reading word_id=2byte bingram (if bingram version >= 4, this should not be happen because header correctly tells the word_id byte size. This will occur only if matches all the conditions below: - you run Julius with --enable-words-int, - you use old bingram of version <= 3, and - you use bingram file converted without --enable-words-int */ WORD_ID w; for(w=0;w<ndata->ngram_num[0];w++) { if (ndata->n2_num[w] > ndata->ngram_num[0]) { if (words_int_retry) { j_error("\nError: retry failed, wrong bingram format\n"); } j_printerr("\nWarning: incorrect data, may be a 2-byte v3 bingram, retry with converion\n"); free(ndata->wname[0]); free(ndata->wname); free(ndata->p); free(ndata->bo_wt_lr); free(ndata->bo_wt_rl); free(ndata->n2_bgn); free(ndata->n2_num); myfrewind(fp); words_int_retry = TRUE; goto ngram_read_bin_start; } } }#endif /* malloc the rest */ ndata->n2tonid = (WORD_ID *)mymalloc(sizeof(WORD_ID) * ndata->ngram_num[1]); ndata->p_lr = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[1]); ndata->p_rl = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[1]); if (file_version == 4) { ndata->n2bo_upper = (NNID_UPPER *)mymalloc(sizeof(NNID_UPPER) * ndata->ngram_num[1]); ndata->n2bo_lower = (NNID_LOWER *)mymalloc(sizeof(NNID_LOWER) * ndata->ngram_num[1]); ndata->bo_wt_rrl = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->bigram_bo_num); ndata->n3_bgn_upper = (NNID_UPPER *)mymalloc(sizeof(NNID_UPPER) * ndata->bigram_bo_num); ndata->n3_bgn_lower = (NNID_LOWER *)mymalloc(sizeof(NNID_LOWER) * ndata->bigram_bo_num); ndata->n3_num = (WORD_ID *)mymalloc(sizeof(WORD_ID) * ndata->bigram_bo_num); } else { ndata->bo_wt_rrl = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[1]); ndata->n3_num = (WORD_ID *)mymalloc(sizeof(WORD_ID) * ndata->ngram_num[1]); if (ndata->version == 4) { ndata->n3_bgn_upper = (NNID_UPPER *)mymalloc(sizeof(NNID_UPPER) * ndata->ngram_num[1]); ndata->n3_bgn_lower = (NNID_LOWER *)mymalloc(sizeof(NNID_LOWER) * ndata->ngram_num[1]); n3_bgn = (NNID *)mymalloc(sizeof(NNID) * ndata->ngram_num[1]); } else { ndata->n3_bgn = (NNID *)mymalloc(sizeof(NNID) * ndata->ngram_num[1]); } } ndata->n3tonid = (WORD_ID *)mymalloc(sizeof(WORD_ID) * ndata->ngram_num[2]); ndata->p_rrl = (LOGPROB *)mymalloc(sizeof(LOGPROB) * ndata->ngram_num[2]); /* read 2-gram*/ j_printerr("2-gram.");#ifdef WORDS_INT rdn_wordid(fp, ndata->n2tonid, ndata->ngram_num[1], need_conv);#else rdn(fp, ndata->n2tonid, sizeof(WORD_ID), ndata->ngram_num[1]);#endif j_printerr("."); rdn(fp, ndata->p_lr, sizeof(LOGPROB), ndata->ngram_num[1]); j_printerr("."); rdn(fp, ndata->p_rl, sizeof(LOGPROB), ndata->ngram_num[1]); j_printerr("."); if (file_version == 4) { rdn(fp, ndata->n2bo_upper, sizeof(NNID_UPPER), ndata->ngram_num[1]); rdn(fp, ndata->n2bo_lower, sizeof(NNID_LOWER), ndata->ngram_num[1]); j_printerr("."); rdn(fp, ndata->bo_wt_rrl, sizeof(LOGPROB), ndata->bigram_bo_num); j_printerr("."); rdn(fp, ndata->n3_bgn_upper, sizeof(NNID_UPPER), ndata->bigram_bo_num); rdn(fp, ndata->n3_bgn_lower, sizeof(NNID_LOWER), ndata->bigram_bo_num); j_printerr(".");#ifdef WORDS_INT rdn_wordid(fp, ndata->n3_num, ndata->bigram_bo_num, need_conv);#else rdn(fp, ndata->n3_num, sizeof(WORD_ID), ndata->bigram_bo_num);#endif } else { rdn(fp, ndata->bo_wt_rrl, sizeof(LOGPROB), ndata->ngram_num[1]); j_printerr("."); if (ndata->version == 4) { rdn(fp, n3_bgn, sizeof(NNID), ndata->ngram_num[1]); for(d=0;d<ndata->ngram_num[1];d++) { if (n3_bgn[d] == NNID_INVALID) { ndata->n3_bgn_lower[d] = 0; ndata->n3_bgn_upper[d] = NNID_INVALID_UPPER; } else { ntmp = n3_bgn[d] & 0xffff; ndata->n3_bgn_lower[d] = ntmp; ntmp = n3_bgn[d] >> 16; ndata->n3_bgn_upper[d] = ntmp; } } } else { rdn(fp, ndata->n3_bgn, sizeof(NNID), ndata->ngram_num[1]); } j_printerr(".");#ifdef WORDS_INT rdn_wordid(fp, ndata->n3_num, ndata->ngram_num[1], need_conv);#else rdn(fp, ndata->n3_num, sizeof(WORD_ID), ndata->ngram_num[1]);#endif } /* read 3-gram*/ j_printerr("3-gram.");#ifdef WORDS_INT rdn_wordid(fp, ndata->n3tonid, ndata->ngram_num[2], need_conv);#else rdn(fp, ndata->n3tonid, sizeof(WORD_ID), ndata->ngram_num[2]);#endif j_printerr("."); rdn(fp, ndata->p_rrl, sizeof(LOGPROB), ndata->ngram_num[2]); /* make word search tree for later lookup */ j_printerr("indexing..."); ngram_make_lookup_tree(ndata); /* compact the 2-gram back-off and 3-gram links */ if (file_version != 4 && ndata->version == 4) { free(n3_bgn); ngram_compact_bigram_context(ndata); } /* set unknown id */ set_unknown_id(ndata); return TRUE;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -