📄 ngram_write_bin.c
字号:
/**
 * @file   ngram_write_bin.c
 * @author Akinobu LEE
 * @date   Wed Feb 16 17:23:16 2005
 *
 * <EN>
 * @brief  Write a whole N-gram data to a file in binary format
 *
 * From 3.5, the internal format of binary N-gram has changed to use the
 * machine-dependent natural byte order (previously fixed to big endian),
 * 24bit indexes and 2-gram back-off compression.  So, a binary N-gram
 * generated by mkbingram of 3.5 and later will not work on 3.4.2 and
 * earlier versions (they fail at the header check).
 *
 * There is full upward- and cross-machine compatibility in 3.5.  Old
 * binary N-gram files can still be read directly, in which case the
 * conversion to 24bit index and the back-off compression are performed
 * each time the model is read.  The byte order is recorded in the header
 * and examined at read time, so a binary N-gram can still be used among
 * machines of different byte order.
 * </EN>
 *
 * $Revision: 1.4 $
 *
 */
/*
 * Copyright (c) 1991-2006 Kawahara Lab., Kyoto University
 * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
 * Copyright (c) 2005-2006 Julius project team, Nagoya Institute of Technology
 * All rights reserved
 */

#include <sent/stddefs.h>
#include <sent/ngram2.h>

static boolean need_swap; ///< TRUE if need byte swap

/**
 * Binary write function, with byte swapping if needed.
 *
 * When swapping is enabled and the unit is wider than one byte, the
 * buffer is byte-swapped in place before the write and swapped again
 * afterwards (presumably restoring the caller's data to its original
 * order -- this relies on swap_bytes() being its own inverse).
 *
 * On a short write, the system error is reported through perror() and
 * j_error() is called.
 *
 * @param fp [in] file pointer
 * @param buf [in] data buffer to write
 * @param unitbyte [in] unit size in bytes
 * @param unitnum [in] number of unit to write
 */
static void
wrt(FILE *fp, void *buf, size_t unitbyte, int unitnum)
{
  /* single-byte units have no byte order, so they are never swapped */
  int swap_here = (need_swap == TRUE && unitbyte != 1);

  if (swap_here) {
    swap_bytes((char *)buf, unitbyte, unitnum);
  }
  if (myfwrite(buf, unitbyte, unitnum, fp) < (size_t)unitnum) {
    perror("write_ngram_bin: wrt");
    j_error("write failed\n");
  }
  if (swap_here) {
    /* swap again so that the in-memory data is left as it was given */
    swap_bytes((char *)buf, unitbyte, unitnum);
  }
}

/**
 * Write header information, with identifier string.
 *
 * The header is a fixed block of BINGRAM_HDSIZE bytes.  It holds a
 * NUL-terminated text made of the format identifier, the size string,
 * (for version 4) the byte order string, and finally the user header
 * string.  Every byte after the terminator keeps the initial fill
 * value (EOF converted to char).
 *
 * If the user string does not fit, a warning is printed and only its
 * leading part is written; one byte is always reserved for the string
 * terminator so that the text never runs past the header block.  The
 * caller's string itself is not modified.
 *
 * @param fp [in] file pointer
 * @param str [in] user header string (any string within BINGRAM_HDSIZE
 * bytes is allowed)
 * @param version [in] file format version id (3 or 4)
 */
static void
write_header(FILE *fp, char *str, int version)
{
  char buf[BINGRAM_HDSIZE];
  int i;
  size_t fixedlen;		/* length of the text placed before the user string */
  size_t room;			/* characters left for the user string */
  size_t userlen;		/* characters of the user string actually written */

  for (i = 0; i < BINGRAM_HDSIZE; i++) buf[i] = EOF;

  /* length of the fixed part, counting the separators of the formats below */
  switch (version) {
  case 4:
    fixedlen = strlen(BINGRAM_IDSTR_V4) + 1
      + strlen(BINGRAM_SIZESTR_HEAD) + strlen(BINGRAM_SIZESTR_BODY) + 1
      + strlen(BINGRAM_BYTEORDER_HEAD) + strlen(BINGRAM_NATURAL_BYTEORDER) + 1;
    break;
  case 3:
    fixedlen = strlen(BINGRAM_IDSTR) + 1
      + strlen(BINGRAM_SIZESTR_HEAD) + strlen(BINGRAM_SIZESTR_BODY) + 1;
    break;
  default:
    /* no header layout is defined for other versions */
    j_error("write_header: unsupported binary N-gram version\n");
    return;
  }
  if (fixedlen >= (size_t)BINGRAM_HDSIZE) {
    /* even the fixed part (plus terminator) would overflow the block */
    j_error("write_header: header size too small\n");
    return;
  }

  /* keep one byte for the '\0' that sprintf() appends */
  room = (size_t)BINGRAM_HDSIZE - 1 - fixedlen;
  userlen = strlen(str);
  if (userlen > room) {
    j_printerr("Warning: user header too long, last will be truncated\n");
    userlen = room;
  }

  /* "%.*s" limits the user string to userlen characters */
  if (version == 4) {
    sprintf(buf, "%s\n%s%s %s%s\n%.*s",
	    BINGRAM_IDSTR_V4,
	    BINGRAM_SIZESTR_HEAD, BINGRAM_SIZESTR_BODY,
	    BINGRAM_BYTEORDER_HEAD, BINGRAM_NATURAL_BYTEORDER,
	    (int)userlen, str);
  } else {
    sprintf(buf, "%s\n%s%s\n%.*s",
	    BINGRAM_IDSTR,
	    BINGRAM_SIZESTR_HEAD, BINGRAM_SIZESTR_BODY,
	    (int)userlen, str);
  }

  wrt(fp, buf, 1, BINGRAM_HDSIZE);
}

/**
 * Write a whole N-gram data in binary format.
 *
 * The data is written in this order: header, total counts, word names,
 * 1-gram tables, 2-gram tables (layout depends on the version), and
 * 3-gram tables.  Version 4 data is written as is in memory; version 3
 * data is byte-swapped on output unless WORDS_BIGENDIAN is defined.
 *
 * @param fp [in] file pointer
 * @param ndata [in] N-gram data to write
 * @param headerstr [in] user header string
 *
 * @return TRUE on success, FALSE on failure (unsupported format version;
 * nothing is written in that case)
 */
boolean
ngram_write_bin(FILE *fp, NGRAM_INFO *ndata, char *headerstr)
{
  int i,n,len;

  /* only formats 3 and 4 have a defined layout below */
  if (ndata->version != 3 && ndata->version != 4) {
    j_printerr("Error: ngram_write_bin: unsupported binary N-gram version\n");
    return FALSE;
  }

  /* write initial header */
  /* (written before need_swap is set: it is byte data, which wrt() never swaps) */
  write_header(fp, headerstr, ndata->version);

  /* set swap requirement */
  if (ndata->version == 4) {
    need_swap = FALSE;
  } else {
#ifdef WORDS_BIGENDIAN
    need_swap = FALSE;
#else
    need_swap = TRUE;
#endif
  }

  /* write total info */
  for(n=0;n<MAX_N;n++) {
    wrt(fp, &(ndata->ngram_num[n]), sizeof(NNID), 1);
    /*j_printf("ngram %d=%d\n",n+1,ndata->ngram_num[n]);*/
  }
  if (ndata->version == 4) {
    wrt(fp, &(ndata->bigram_bo_num), sizeof(NNID), 1);
  }
  j_printf("wrote total info\n");

  /* unk_*, isopen, max_word_num are set after read, so need not save */

  /* write wname */
  /* first the total byte length of all names (terminators included), then the names */
  len = 0;
  for(i=0;i<ndata->ngram_num[0];i++) {
    len += strlen(ndata->wname[i]) + 1;
  }
  wrt(fp, &len, sizeof(int), 1);
  for(i=0;i<ndata->ngram_num[0];i++) {
    wrt(fp, ndata->wname[i], 1, strlen(ndata->wname[i]) + 1); /* include \0 */
  }
  /* the size expressions below are size_t: convert explicitly to match "%d" */
  j_printf("wrote wnames (%d bytes)\n", (int)(len + sizeof(int)));

  /* write 1-gram */
  wrt(fp, ndata->p, sizeof(LOGPROB), ndata->ngram_num[0]);
  wrt(fp, ndata->bo_wt_lr, sizeof(LOGPROB), ndata->ngram_num[0]);
  wrt(fp, ndata->bo_wt_rl, sizeof(LOGPROB), ndata->ngram_num[0]);
  wrt(fp, ndata->n2_bgn, sizeof(NNID), ndata->ngram_num[0]);
  wrt(fp, ndata->n2_num, sizeof(WORD_ID), ndata->ngram_num[0]);
  j_printf("wrote 1-gram (%d KB)\n",
	   (int)(((sizeof(LOGPROB)*3 + sizeof(NNID) + sizeof(WORD_ID)) * ndata->ngram_num[0]) / 1024));

  /* write 2-gram*/
  wrt(fp, ndata->n2tonid, sizeof(WORD_ID), ndata->ngram_num[1]);
  wrt(fp, ndata->p_lr, sizeof(LOGPROB), ndata->ngram_num[1]);
  wrt(fp, ndata->p_rl, sizeof(LOGPROB), ndata->ngram_num[1]);
  switch (ndata->version) {
  case 4:
    /* per-2-gram back-off index split into upper/lower parts, followed by
       tables that have bigram_bo_num entries */
    wrt(fp, ndata->n2bo_upper, sizeof(NNID_UPPER), ndata->ngram_num[1]);
    wrt(fp, ndata->n2bo_lower, sizeof(NNID_LOWER), ndata->ngram_num[1]);
    wrt(fp, ndata->bo_wt_rrl, sizeof(LOGPROB), ndata->bigram_bo_num);
    wrt(fp, ndata->n3_bgn_upper, sizeof(NNID_UPPER), ndata->bigram_bo_num);
    wrt(fp, ndata->n3_bgn_lower, sizeof(NNID_LOWER), ndata->bigram_bo_num);
    wrt(fp, ndata->n3_num, sizeof(WORD_ID), ndata->bigram_bo_num);
    j_printf("wrote 2-gram (%d KB)\n",
	     (int)(((sizeof(LOGPROB)*2 + sizeof(NNID_UPPER) + sizeof(NNID_LOWER) + sizeof(WORD_ID)) * ndata->ngram_num[1]
		    + (sizeof(LOGPROB) + sizeof(NNID_UPPER) + sizeof(NNID_LOWER) + sizeof(WORD_ID)) * ndata->bigram_bo_num) / 1024));
    break;
  case 3:
    /* every table has one entry per 2-gram */
    wrt(fp, ndata->bo_wt_rrl, sizeof(LOGPROB), ndata->ngram_num[1]);
    wrt(fp, ndata->n3_bgn, sizeof(NNID), ndata->ngram_num[1]);
    wrt(fp, ndata->n3_num, sizeof(WORD_ID), ndata->ngram_num[1]);
    j_printf("wrote 2-gram (%d KB)\n",
	     (int)(((sizeof(LOGPROB)*3 + sizeof(NNID) + sizeof(WORD_ID)*2) * ndata->ngram_num[1]) / 1024));
    break;
  }

  /* write 3-gram*/
  wrt(fp, ndata->n3tonid, sizeof(WORD_ID), ndata->ngram_num[2]);
  wrt(fp, ndata->p_rrl, sizeof(LOGPROB), ndata->ngram_num[2]);
  j_printf("wrote 3-gram (%d KB)\n",
	   (int)(((sizeof(LOGPROB) + sizeof(WORD_ID)) * ndata->ngram_num[2]) / 1024));

  return TRUE;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -