📄 ngram_read_bin.c
字号:
rdn(fp, t->bo_wt, sizeof(LOGPROB), t->context_num); } else { t->bo_wt = NULL; } rdn(fp, &i, sizeof(int), 1); if (i == 1) { t->nnid2ctid_upper = (NNID_UPPER *)mymalloc_big(sizeof(NNID_UPPER), t->totalnum); t->nnid2ctid_lower = (NNID_LOWER *)mymalloc_big(sizeof(NNID_LOWER), t->totalnum); rdn(fp, t->nnid2ctid_upper, sizeof(NNID_UPPER), t->totalnum); rdn(fp, t->nnid2ctid_lower, sizeof(NNID_LOWER), t->totalnum); } else { t->nnid2ctid_upper = NULL; t->nnid2ctid_lower = NULL; } } rdn(fp, &i, sizeof(int), 1); if (i == 1) { ndata->bo_wt_1 = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), ndata->d[0].context_num); rdn(fp, ndata->bo_wt_1, sizeof(LOGPROB), ndata->d[0].context_num); } else { ndata->bo_wt_1 = NULL; } rdn(fp, &i, sizeof(int), 1); if (i == 1) { ndata->p_2 = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), ndata->d[1].totalnum); rdn(fp, ndata->p_2, sizeof(LOGPROB), ndata->d[1].totalnum); } else { ndata->p_2 = NULL; } return TRUE;}static booleanngram_read_bin_compat(FILE *fp, NGRAM_INFO *ndata, int *retry_ret){ int i,n,len; char *w, *p; NNID *n3_bgn; NNID d, ntmp;#ifdef WORDS_INT unsigned short *buf;#endif NGRAM_TUPLE_INFO *t, *tt, *ttt; /* old binary N-gram assumes these types */ ndata->bigram_index_reversed = TRUE; ndata->n = 3; ndata->dir = DIR_RL; /* read total info and set max_word_num */ for(n=0;n<ndata->n;n++) { rdn(fp, &(ndata->d[n].totalnum), sizeof(NNID), 1); } ndata->max_word_num = ndata->d[0].totalnum; if (file_version == 4) { rdn(fp, &(ndata->d[1].context_num), sizeof(NNID), 1); } for(n=0;n<ndata->n;n++) { if (n < 2) { ndata->d[n].is24bit = FALSE; } else { if (ndata->d[n].totalnum >= NNID_MAX_24) { jlog("Warning: ngram_read_bin_compat: num of %d-gram exceeds 24bit, now switch to %dbit index\n", n+1, sizeof(NNID) * 8); ndata->d[n].is24bit = FALSE; } else { ndata->d[n].is24bit = TRUE; } } ndata->d[n].nnid2ctid_upper = NULL; ndata->d[n].nnid2ctid_lower = NULL; } /* always do back-off compaction for 3-gram and up */ /* mark 2-gram and up */ ndata->d[0].ct_compaction = FALSE; for(n=1;n<ndata->n;n++) { ndata->d[n].ct_compaction = TRUE; } /* read wname */ rdn(fp, &len, sizeof(int), 1); w = mymalloc(len); rdn(fp, w, 1, len); /* assign... */ ndata->wname = (char **)mymalloc(sizeof(char *) * ndata->max_word_num); p = w; i = 0; while (p < w + len) { ndata->wname[i++] = p; while(*p != '\0') p++; p++; } if (i != ndata->max_word_num) { jlog("Error: ngram_read_bin_compat: wname error??\n"); return FALSE; } /* malloc 1-gram */ t = &(ndata->d[0]); tt = &(ndata->d[1]); ttt = &(ndata->d[2]); t->bgn_upper = NULL; t->bgn_lower = NULL; t->bgn = NULL; t->num = NULL; t->bgnlistlen = 0; t->nnid2wid = NULL; t->nnid2ctid_upper = NULL; t->nnid2ctid_lower = NULL; t->context_num = t->totalnum; t->prob = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->totalnum); ndata->bo_wt_1 = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->context_num); t->bo_wt = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), t->context_num); tt->bgnlistlen = t->context_num; tt->bgn = (NNID *)mymalloc_big(sizeof(NNID), tt->bgnlistlen); tt->num = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), tt->bgnlistlen); /* read 1-gram */ jlog("stat: ngram_read_bin_compat: reading 1-gram\n"); rdn(fp, t->prob, sizeof(LOGPROB), t->totalnum); rdn(fp, ndata->bo_wt_1, sizeof(LOGPROB), t->context_num); rdn(fp, t->bo_wt, sizeof(LOGPROB), t->context_num); rdn(fp, tt->bgn, sizeof(NNID), tt->bgnlistlen);#ifdef WORDS_INT rdn_wordid(fp, tt->num, tt->bgnlistlen, need_conv);#else rdn(fp, tt->num, sizeof(WORD_ID), tt->bgnlistlen);#endif#ifdef WORDS_INT { /* check if we are wrongly reading word_id=2byte bingram (if bingram version >= 4, this should not be happen because header correctly tells the word_id byte size. This will occur only if matches all the conditions below: - you run Julius with --enable-words-int, - you use old bingram of version <= 3, and - you use bingram file converted without --enable-words-int */ WORD_ID w; for(w=0;w<ndata->max_word_num;w++) { if (ndata->d[1].num[w] > ndata->max_word_num) { if (words_int_retry) { jlog("Error: ngram_read_bin_compat: retry failed, wrong bingram format\n"); return FALSE; } jlog("Warning: ngram_read_bin_compat: incorrect data, may be a 2-byte v3 bingram, retry with conversion\n"); free(ndata->wname[0]); free(ndata->wname); free(t->prob); free(ndata->bo_wt_1); free(t->bo_wt); free(tt->bgn); free(tt->num); myfrewind(fp); words_int_retry = TRUE; *retry_ret = 1; return FALSE; } } }#endif /* malloc the rest */ tt->nnid2wid = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), tt->totalnum); tt->prob = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), tt->totalnum); ndata->p_2 = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), tt->totalnum); if (file_version == 4) { /* context compaction and 24bit */ tt->nnid2ctid_upper = (NNID_UPPER *)mymalloc_big(sizeof(NNID_UPPER), tt->totalnum); tt->nnid2ctid_lower = (NNID_LOWER *)mymalloc_big(sizeof(NNID_LOWER), tt->totalnum); tt->bo_wt = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), tt->context_num); ttt->bgnlistlen = tt->context_num; ttt->bgn_upper = (NNID_UPPER *)mymalloc_big(sizeof(NNID_UPPER), ttt->bgnlistlen); ttt->bgn_lower = (NNID_LOWER *)mymalloc_big(sizeof(NNID_LOWER), ttt->bgnlistlen); ttt->num = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), ttt->bgnlistlen); } else { tt->context_num = tt->totalnum; tt->bo_wt = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), tt->context_num); ttt->bgnlistlen = tt->context_num; ttt->num = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), ttt->bgnlistlen); if (ttt->is24bit) { ttt->bgn_upper = (NNID_UPPER *)mymalloc_big(sizeof(NNID_UPPER), ttt->bgnlistlen); ttt->bgn_lower = (NNID_LOWER *)mymalloc_big(sizeof(NNID_LOWER), ttt->bgnlistlen); n3_bgn = (NNID *)mymalloc_big(sizeof(NNID), ttt->bgnlistlen); } else { ttt->bgn = (NNID *)mymalloc_big(sizeof(NNID), ttt->bgnlistlen); } } ttt->nnid2wid = (WORD_ID *)mymalloc_big(sizeof(WORD_ID), ttt->totalnum); ttt->prob = (LOGPROB *)mymalloc_big(sizeof(LOGPROB), ttt->totalnum); ttt->bo_wt = NULL; /* read 2-gram*/ jlog("Stat: ngram_read_bin_compat: reading 2-gram\n");#ifdef WORDS_INT rdn_wordid(fp, tt->nnid2wid, tt->totalnum, need_conv);#else rdn(fp, tt->nnid2wid, sizeof(WORD_ID), tt->totalnum);#endif rdn(fp, ndata->p_2, sizeof(LOGPROB), tt->totalnum); rdn(fp, tt->prob, sizeof(LOGPROB), tt->totalnum); if (file_version == 4) { rdn(fp, tt->nnid2ctid_upper, sizeof(NNID_UPPER), tt->totalnum); rdn(fp, tt->nnid2ctid_lower, sizeof(NNID_LOWER), tt->totalnum); rdn(fp, tt->bo_wt, sizeof(LOGPROB), tt->context_num); rdn(fp, ttt->bgn_upper, sizeof(NNID_UPPER), ttt->bgnlistlen); rdn(fp, ttt->bgn_lower, sizeof(NNID_LOWER), ttt->bgnlistlen);#ifdef WORDS_INT rdn_wordid(fp, ttt->num, ttt->bgnlistlen, need_conv);#else rdn(fp, ttt->num, sizeof(WORD_ID), ttt->bgnlistlen);#endif } else { rdn(fp, tt->bo_wt, sizeof(LOGPROB), tt->context_num); if (ttt->is24bit) { rdn(fp, n3_bgn, sizeof(NNID), ttt->bgnlistlen); for(d=0;d<ttt->bgnlistlen;d++) { if (n3_bgn[d] == NNID_INVALID) { ttt->bgn_lower[d] = 0; ttt->bgn_upper[d] = NNID_INVALID_UPPER; } else { ntmp = n3_bgn[d] & 0xffff; ttt->bgn_lower[d] = ntmp; ntmp = n3_bgn[d] >> 16; ttt->bgn_upper[d] = ntmp; } } } else { rdn(fp, ttt->bgn, sizeof(NNID), ttt->bgnlistlen); }#ifdef WORDS_INT rdn_wordid(fp, ttt->num, ttt->bgnlistlen, need_conv);#else rdn(fp, ttt->num, sizeof(WORD_ID), ttt->bgnlistlen);#endif } /* read 3-gram*/ jlog("Stat: ngram_read_bin_compat: reading 3-gram\n");#ifdef WORDS_INT rdn_wordid(fp, ttt->nnid2wid, ttt->totalnum, need_conv);#else rdn(fp, ttt->nnid2wid, sizeof(WORD_ID), ttt->totalnum);#endif rdn(fp, ttt->prob, sizeof(LOGPROB), ttt->totalnum); /* compact the 2-gram back-off and 3-gram links */ if (file_version != 4) { if (ttt->is24bit) { free(n3_bgn); if (ngram_compact_context(ndata, 2) == FALSE) return FALSE; } } return TRUE;}/** * Read a N-gram binary file and store to data. * * @param fp [in] file pointer * @param ndata [out] N-gram data to store the read data * * @return TRUE on success, FALSE on failure. */booleanngram_read_bin(FILE *fp, NGRAM_INFO *ndata){ int retry;#ifdef WORDS_INT /* reset retry flag */ words_int_retry = FALSE; /* when retrying, it restarts from here with words_int_retry = TRUE */ ngram_read_bin_start:#endif ndata->from_bin = TRUE; /* check initial header */ if (check_header(fp) == FALSE) return FALSE; #ifdef WORDS_INT /* in retry mode, force word_id conversion */ if (words_int_retry) need_conv = TRUE;#endif #ifdef WORDS_INT if (need_conv) jlog("Stat: ngram_read_bin: word-id size conversion enabled\n");#endif if (file_version <= 4) { retry = 0; if (ngram_read_bin_compat(fp, ndata, &retry) == FALSE) {#ifdef WORDS_INT if (retry == 1) { goto ngram_read_bin_start; } else { return FALSE; }#else return FALSE;#endif } } else { if (ngram_read_bin_v5(fp, ndata) == FALSE) return FALSE; } /* make word search tree for later lookup */ jlog("Stat: ngram_read_bin: making entry name index\n"); ngram_make_lookup_tree(ndata); bi_prob_func_set(ndata); return TRUE;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -