📄 ngram_write_bin.c
字号:
/**
 * @file   ngram_write_bin.c
 *
 * <EN>
 * @brief  Write a whole N-gram data to a file in binary format
 *
 * From 3.5, the internal format of binary N-gram has changed to use
 * machine-dependent natural byte order (previously fixed to big endian),
 * 24bit index and 2-gram backoff compression.  So, binary N-gram
 * generated by mkbingram of 3.5 and later will not work on 3.4.2 and
 * earlier versions (they are rejected at the header check).
 *
 * There is full upward- and cross-machine compatibility in 3.5.  Old
 * binary N-gram files still can be read directly, in which case the
 * conversion to 24bit index will be performed just after the model has
 * been read.  The byte order is recorded in the header, so a binary
 * N-gram still can be used among different machines.
 * </EN>
 *
 * @author Akinobu LEE
 * @date   Wed Feb 16 17:23:16 2005
 *
 * $Revision: 1.4 $
 *
 */
/*
 * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University
 * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
 * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology
 * All rights reserved
 */

#include <sent/stddefs.h>
#include <sent/ngram2.h>

static boolean need_swap;	///< TRUE if need byte swap

/*
 * Write through wrtfunc() and make the enclosing function return FALSE
 * on failure.  Wrapped in do/while(0) so that it behaves as a single
 * statement inside unbraced if/else.
 */
#define wrt(A,B,C,D) do { if (wrtfunc(A,B,C,D) == FALSE) return FALSE; } while (0)

static unsigned int count;	///< Number of bytes written so far

/**
 * Reset the written-bytes counter to zero.
 */
void
reset_wrt_counter(void)
{
  count = 0;
}

/**
 * Get the number of bytes written since the last counter reset.
 *
 * @return the current value of the written-bytes counter.
 */
static unsigned int
get_wrt_counter(void)
{
  return count;
}

/**
 * Binary write function, with byte swapping if needed.
 *
 * When swapping is enabled the buffer is swapped in place for the write
 * and always swapped back afterwards, so the caller's data is left
 * unchanged whether or not the write succeeds.
 *
 * @param fp [in] file pointer
 * @param buf [in] data buffer to write
 * @param unitbyte [in] unit size in bytes
 * @param unitnum [in] number of unit to write
 *
 * @return TRUE on success, FALSE on failure
 */
static boolean
wrtfunc(FILE *fp, void *buf, size_t unitbyte, size_t unitnum)
{
  boolean swapped = FALSE;
  boolean failed;

  if (need_swap == TRUE && unitbyte != 1) {
    swap_bytes((char *)buf, unitbyte, unitnum);
    swapped = TRUE;
  }

  failed = (myfwrite(buf, unitbyte, unitnum, fp) < unitnum) ? TRUE : FALSE;

  /* restore the original byte order before any return path */
  if (swapped == TRUE) {
    swap_bytes((char *)buf, unitbyte, unitnum);
  }

  if (failed == TRUE) {
    jlog("Error: write_ngram_bin: failed to write %lu bytes\n",
	 (unsigned long)(unitbyte * unitnum));
    return FALSE;
  }

  count += (unsigned int)(unitbyte * unitnum);
  return TRUE;
}

/**
 * Write header information, with identifier string.
 *
 * The header is a fixed-size block of BINGRAM_HDSIZE bytes holding a
 * NUL-terminated text (identifier, size info, byte order info and the
 * user string); the bytes after the terminator are filled with EOF.
 * If the text does not fit, the tail of the user string is dropped and
 * a warning is logged.  The caller's string is never modified.
 *
 * @param fp [in] file pointer
 * @param str [in] user header string (any string within BINGRAM_HDSIZE
 * bytes is allowed); NULL is treated as an empty string
 *
 * @return TRUE on success, FALSE on failure
 */
static boolean
write_header(FILE *fp, char *str)
{
  char buf[BINGRAM_HDSIZE];
  const char *user = (str != NULL) ? str : "";
  int i;
  int written;

  for (i = 0; i < BINGRAM_HDSIZE; i++) buf[i] = EOF;

  /* snprintf() never writes past buf and always leaves room for the NUL */
  written = snprintf(buf, sizeof(buf), "%s\n%s%s %s%s\n%s",
		     BINGRAM_IDSTR_V5,
		     BINGRAM_SIZESTR_HEAD, BINGRAM_SIZESTR_BODY,
		     BINGRAM_BYTEORDER_HEAD, BINGRAM_NATURAL_BYTEORDER,
		     user);
  if (written < 0) {
    jlog("Error: write_bingram: failed to format header\n");
    return FALSE;
  }
  if ((size_t)written >= sizeof(buf)) {
    jlog("Warning: write_bingram: header too long, last will be truncated\n");
  }

  wrt(fp, buf, 1, BINGRAM_HDSIZE);

  return TRUE;
}

/**
 * Write a whole N-gram data in binary format.
 *
 * @param fp [in] file pointer
 * @param ndata [in] N-gram data to write
 * @param headerstr [in] user header string
 *
 * @return TRUE on success, FALSE on failure
 */
boolean
ngram_write_bin(FILE *fp, NGRAM_INFO *ndata, char *headerstr)
{
  int i, n;
  unsigned int len;
  int wlen;
  NGRAM_TUPLE_INFO *t;

  reset_wrt_counter();

  /* write initial header */
  if (write_header(fp, headerstr) == FALSE) return FALSE;

  /* swap not needed any more */
  need_swap = FALSE;

  /* write some header info */
  wrt(fp, &(ndata->n), sizeof(int), 1);
  wrt(fp, &(ndata->dir), sizeof(int), 1);
  wrt(fp, &(ndata->bigram_index_reversed), sizeof(boolean), 1);

  /* write total info */
  for (n = 0; n < ndata->n; n++) {
    wrt(fp, &(ndata->d[n].totalnum), sizeof(NNID), 1);
  }

  /* unk_*, isopen, max_word_num are set after read, so need not save */

  /* write wname: total byte length first, then each name with its '\0' */
  wlen = 0;
  for (i = 0; i < ndata->max_word_num; i++) {
    wlen += strlen(ndata->wname[i]) + 1;
  }
  wrt(fp, &wlen, sizeof(int), 1);
  for (i = 0; i < ndata->max_word_num; i++) {
    wrt(fp, ndata->wname[i], 1, strlen(ndata->wname[i]) + 1); /* include \0 */
  }

  /* write N-gram */
  for (n = 0; n < ndata->n; n++) {
    t = &(ndata->d[n]);

    wrt(fp, &(t->is24bit), sizeof(boolean), 1);
    wrt(fp, &(t->ct_compaction), sizeof(boolean), 1);
    wrt(fp, &(t->bgnlistlen), sizeof(NNID), 1);
    wrt(fp, &(t->context_num), sizeof(NNID), 1);

    /* index tables are written only for the second tuple level and above */
    if (n > 0) {
      if (t->is24bit) {
	wrt(fp, t->bgn_upper, sizeof(NNID_UPPER), t->bgnlistlen);
	wrt(fp, t->bgn_lower, sizeof(NNID_LOWER), t->bgnlistlen);
      } else {
	wrt(fp, t->bgn, sizeof(NNID), t->bgnlistlen);
      }
      wrt(fp, t->num, sizeof(WORD_ID), t->bgnlistlen);
      wrt(fp, t->nnid2wid, sizeof(WORD_ID), t->totalnum);
    }

    wrt(fp, t->prob, sizeof(LOGPROB), t->totalnum);

    /* each optional array is preceded by an int flag: 1 = present, 0 = absent */
    if (t->bo_wt) {
      i = 1;
      wrt(fp, &i, sizeof(int), 1);
      wrt(fp, t->bo_wt, sizeof(LOGPROB), t->context_num);
    } else {
      i = 0;
      wrt(fp, &i, sizeof(int), 1);
    }

    if (t->nnid2ctid_upper) {
      i = 1;
      wrt(fp, &i, sizeof(int), 1);
      wrt(fp, t->nnid2ctid_upper, sizeof(NNID_UPPER), t->totalnum);
      wrt(fp, t->nnid2ctid_lower, sizeof(NNID_LOWER), t->totalnum);
    } else {
      i = 0;
      wrt(fp, &i, sizeof(int), 1);
    }
  }

  /* write additional LR 2-gram */
  if (ndata->bo_wt_1) {
    i = 1;
    wrt(fp, &i, sizeof(int), 1);
    wrt(fp, ndata->bo_wt_1, sizeof(LOGPROB), ndata->d[0].context_num);
  } else {
    i = 0;
    wrt(fp, &i, sizeof(int), 1);
  }
  if (ndata->p_2) {
    i = 1;
    wrt(fp, &i, sizeof(int), 1);
    wrt(fp, ndata->p_2, sizeof(LOGPROB), ndata->d[1].totalnum);
  } else {
    i = 0;
    wrt(fp, &i, sizeof(int), 1);
  }

  len = get_wrt_counter();
  /* cast so that the argument really matches the %lu conversion */
  jlog("Stat: ngram_write_bin: wrote %lu bytes (%.1f MB)\n",
       (unsigned long)len, len / 1048576.0);

  return TRUE;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -