📄 ngram_read_arpa.c
字号:
j_printerr(" 2-gram read %d (%d%%)\n", bi_count, bi_count * 100 / ndata->ngram_num[1]); } prob = (LOGPROB)atof(first_token(buf)); w_r = lookup_word(ndata, next_token()); w_l = lookup_word(ndata, next_token()); bo_wt = (LOGPROB)atof(next_token()); n2 = search_bigram(ndata, w_l, w_r); if (n2 == NNID_INVALID) { j_printerr("Warning: (%s,%s) not exist in LR 2-gram (ignored)\n", ndata->wname[w_l], ndata->wname[w_r]); } else { ndata->p_rl[n2] = prob; ndata->bo_wt_rrl[n2] = bo_wt; } } j_printerr(" 2-gram read %d end\n", bi_count); } /** * Read reverse 3-gram data from RL 3-gram file and store them. * * @param fp [in] file pointer * @param ndata [i/o] N-gram to set the read data. */static voidset_trigram(FILE *fp, NGRAM_INFO *ndata){ int w_l, w_m, w_r; LOGPROB p_rl; int w_r_last, w_m_last, w_l_last; NNID n2, n2_last; NNID n3; NNID ntmp; /* allocate pointer from 2gram to 3gram */ switch(ndata->version) { case 3: ndata->n3_bgn = (NNID *)mymalloc(sizeof(NNID)*ndata->ngram_num[1]); for(n2=0;n2<ndata->ngram_num[1];n2++) ndata->n3_bgn[n2] = NNID_INVALID; break; case 4: ndata->n3_bgn_upper = (NNID_UPPER *)mymalloc(sizeof(NNID_UPPER)*ndata->ngram_num[1]); ndata->n3_bgn_lower = (NNID_LOWER *)mymalloc(sizeof(NNID_LOWER)*ndata->ngram_num[1]); for(n2=0;n2<ndata->ngram_num[1];n2++) { ndata->n3_bgn_upper[n2] = NNID_INVALID_UPPER; ndata->n3_bgn_lower[n2] = 0; } break; } ndata->n3_num = (WORD_ID *)mymalloc(sizeof(WORD_ID)*ndata->ngram_num[1]); for(n2=0;n2<ndata->ngram_num[1];n2++) ndata->n3_num[n2] = 0; /* allocate data area for 3-gram */ ndata->n3tonid = (WORD_ID *)mymalloc(sizeof(WORD_ID)*ndata->ngram_num[2]); ndata->p_rrl = (LOGPROB *)mymalloc(sizeof(LOGPROB)*ndata->ngram_num[2]); n3 = 0; n2 = n2_last = NNID_INVALID; w_r_last = w_m_last = w_l_last = -1; for (;;) { if (getl(buf, sizeof(buf), fp) == NULL || buf[0] == '\\') break; strcpy(pbuf, buf); if (n3 % 100000 == 0) { j_printerr(" 3-gram read %d (%d%%)\n", n3, n3 * 100 / ndata->ngram_num[2]); } /* N-gram probability */ p_rl = (LOGPROB)atof(first_token(buf)); /* read in right (first) word and lookup its ID */ w_r = lookup_word(ndata, next_token()); /* read in middle word and lookup its ID */ w_m = lookup_word(ndata, next_token()); /* if context changed, create the next structure */ if (w_r != w_r_last || w_m != w_m_last) { n2 = search_bigram(ndata, (WORD_ID)w_m, (WORD_ID)w_r); if (n2 == NNID_INVALID) { /* no context */ j_printerr("Warning: context (%s,%s) not exist in LR 2-gram (ignored)\n", ndata->wname[w_m], ndata->wname[w_r]); continue; } switch(ndata->version) { case 3: ntmp = ndata->n3_bgn[n2_last]; break; case 4: ntmp = ((NNID)(ndata->n3_bgn_upper[n2_last]) << 16) + (NNID)(ndata->n3_bgn_lower[n2_last]); break; } if (n2_last != NNID_INVALID) ndata->n3_num[n2_last] = n3 - ntmp; /* check: the next 'n2' should be an new entry */ switch(ndata->version) { case 3: if (ndata->n3_bgn[n2] != NNID_INVALID) { j_printerr("Error: entry not sorted (same left context not sequenced)\n"); j_error("at 3-gram #%d: \"%s\"\n", n3+1, pbuf); } ndata->n3_bgn[n2] = n3; break; case 4: if (ndata->n3_bgn_upper[n2] != NNID_INVALID_UPPER) { j_printerr("Error: entry not sorted (same left context not sequenced)\n"); j_error("at 3-gram #%d: \"%s\"\n", n3+1, pbuf); } ntmp = n3 & 0xffff; ndata->n3_bgn_lower[n2] = ntmp; ntmp = n3 >> 16; ndata->n3_bgn_upper[n2] = ntmp; break; } n2_last = n2; w_l_last = -1; } else { if (n2 == NNID_INVALID) continue; } /* read in left (last) word and store */ w_l = lookup_word(ndata, next_token()); if (w_l == w_l_last) { j_printerr("Error: duplicated entry\n"); j_error("at 3-gram #%d: \"%s\"\n", n3+1, pbuf); } else if (w_l < w_l_last) { j_printerr("Error: entry not sorted downward\n"); j_error("at 3-gram #%d: \"%s\"\n", n3+1, pbuf); } ndata->n3tonid[n3] = w_l; ndata->p_rrl[n3] = p_rl; n3++; w_m_last = w_m; w_r_last = w_r; w_l_last = w_l; /* check the 3-gram num */ if (n3 > ndata->ngram_num[2]) { j_printerr("Error: actual 3-gram num not match the header value\n"); j_error("%d != %d ?\n", n3, ndata->ngram_num[2]); } } /* store the last n3_num */ switch(ndata->version) { case 3: ntmp = ndata->n3_bgn[n2_last]; break; case 4: ntmp = ((NNID)(ndata->n3_bgn_upper[n2_last]) << 16) + (NNID)(ndata->n3_bgn_lower[n2_last]); break; } ndata->n3_num[n2_last] = n3 - ntmp; j_printerr(" 3-gram read %d end\n", n3);}static boolean LR_2gram_read = FALSE; ///< TRUE if LR 2gram has already been read/** * Read in one ARPA N-gram file, either LR 2-gram or RL 3-gram. * * @param fp [in] file pointer * @param ndata [out] N-gram data to store the read data * @param direction [in] specify whether this is LR 2-gram or RL 3-gram * * @return TRUE on success, FALSE on failure. */booleanngram_read_arpa(FILE *fp, NGRAM_INFO *ndata, int direction){ int n; ndata->from_bin = FALSE; if (!LR_2gram_read && direction == DIR_RL) { j_printerr("you should first read LR 2-gram\n"); return FALSE; } if (direction == DIR_LR) { n = 2; } else { n = 3; } /* read until `\data\' found */ while (getl(buf, sizeof(buf), fp) != NULL && strncmp(buf,"\\data\\",6) != 0); /* read n-gram total info */ if (direction == DIR_LR) { set_total_info(fp, ndata); } else { set_and_check_total_info(fp, ndata); } if (ndata->ngram_num[0] > MAX_WORD_NUM) { j_error("Error: vocabulary size exceeded limit (%d)\n", MAX_WORD_NUM); } ndata->max_word_num = ndata->ngram_num[0]; /* version requirement check (determined by 3-gram entry limit) */ if (n >= 3) { if (ndata->ngram_num[2] >= NNIDMAX) { j_printerr("Warning: more than %d 3-gram tuples, use old structure\n", NNIDMAX); ndata->version = 3; } else { ndata->version = 4; } } /* read 1-gram data */ if (!strnmatch(buf,"\\1-grams",8)) { j_error("data format error: 1-gram not found\n"); } j_printerr(" reading 1-gram part...\n"); if (direction == DIR_LR) { set_unigram(fp, ndata); } else { add_unigram(fp, ndata); } if (n >= 2) { /* read 2-gram data */ if (!strnmatch(buf,"\\2-grams", 8)) { j_error("data format error: 2-gram not found\n"); } j_printerr(" reading 2-gram part...\n"); if (direction == DIR_LR) { set_bigram(fp, ndata); } else { add_bigram_rl(fp, ndata); } } if (n >= 3) { /* read 3-gram data */ if (!strnmatch(buf,"\\3-grams", 8)) { j_error("data format error: 3-gram not found\n"); } if ( direction == DIR_LR) { j_error("should not happen..\n"); } else { j_printerr(" reading 3-gram part...\n"); set_trigram(fp, ndata); } } /* finished */ if (!strnmatch(buf, "\\end", 4)) { j_error("data format error: data end marker \"\\end\" not found\n"); }#ifdef CLASS_NGRAM /* skip in-class word entries (they should be in word dictionary) */ if (getl(buf, sizeof(buf), fp) != NULL) { if (strnmatch(buf, "\\class", 6)) { j_printerr(" skipping in-class word entries...\n"); } }#endif if (n >= 3 && ndata->version == 4) { /* compact the 2-gram back-off and 3-gram links */ ngram_compact_bigram_context(ndata); } /* set unknown (=OOV) word id */ set_unknown_id(ndata); if (direction == DIR_LR) { LR_2gram_read = TRUE; } return TRUE;}/** * Compact the 2-gram context information. * * @param ndata [i/o] N-gram data */voidngram_compact_bigram_context(NGRAM_INFO *ndata){ NNID i; int c; int dst; NNID ntmp; /* version check */ if (ndata->version != 4) { j_error("InternalError: bigram context compaction called for version != 4\n"); } /* count number of valid bigram context */ c = 0; for(i=0;i<ndata->ngram_num[1];i++) { if (ndata->n3_bgn_upper[i] != NNID_INVALID_UPPER) { c++; } else { if (ndata->n3_num[i] != 0) { printf("bgn=%d|%d, num=%d, bo_wt_rrl=%f\n", ndata->n3_bgn_upper[i], ndata->n3_bgn_lower[i], ndata->n3_num[i], ndata->bo_wt_rrl[i]); j_error("Error: ngram_compact_bigram_context: internal error\n"); } if (ndata->bo_wt_rrl[i] != 0.0) { j_error("Error: 2-gram has no upper 3-gram, but not 0.0 back-off weight\n"); } } } ndata->bigram_bo_num = c; j_printerr("num: %d -> %d\n", ndata->ngram_num[1], ndata->bigram_bo_num); /* allocate index buffer */ ndata->n2bo_upper = (NNID_UPPER *)mymalloc(sizeof(NNID_UPPER) * ndata->ngram_num[1]); ndata->n2bo_lower = (NNID_LOWER *)mymalloc(sizeof(NNID_LOWER) * ndata->ngram_num[1]); /* make index and do compaction of context informations */ dst = 0; for(i=0;i<ndata->ngram_num[1];i++) { if (ndata->n3_bgn_upper[i] != NNID_INVALID_UPPER) { ndata->bo_wt_rrl[dst] = ndata->bo_wt_rrl[i]; ndata->n3_bgn_upper[dst] = ndata->n3_bgn_upper[i]; ndata->n3_bgn_lower[dst] = ndata->n3_bgn_lower[i]; ndata->n3_num[dst] = ndata->n3_num[i]; ntmp = dst & 0xffff; ndata->n2bo_lower[i] = ntmp; ntmp = dst >> 16; ndata->n2bo_upper[i] = ntmp; dst++; } else { ndata->n2bo_upper[i] = NNID_INVALID_UPPER; ndata->n2bo_lower[i] = 0; } } /* really shrink the memory area */ ndata->bo_wt_rrl = (LOGPROB *)myrealloc(ndata->bo_wt_rrl, sizeof(LOGPROB) * ndata->bigram_bo_num); ndata->n3_bgn_upper = (NNID_UPPER *)myrealloc(ndata->n3_bgn_upper, sizeof(NNID_UPPER) * ndata->bigram_bo_num); ndata->n3_bgn_lower = (NNID_LOWER *)myrealloc(ndata->n3_bgn_lower, sizeof(NNID_LOWER) * ndata->bigram_bo_num); ndata->n3_num = (WORD_ID *)myrealloc(ndata->n3_num, sizeof(WORD_ID) * ndata->bigram_bo_num);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -