fngramlm.cc
Boolean
FNgram::read(const unsigned int specNum, File &file)
{
    char *line;
    Array<unsigned int> numGrams;   /* the number of n-grams for each parent set */
    Array<unsigned int> numRead;    /* number of n-grams actually read */
    int state = -2;                 /* section of file being read:
                                     * -2 - pre-header, -1 - header,
                                     * 0 - unigrams, 1 - node-1 bigrams,
                                     * 2 - node-2 bigrams, etc. */
    unsigned int numBitsInState = 0;
    unsigned int state_order = 0;
    unsigned int max_state = 0;
    Boolean warnedAboutUnk = false; /* at most one warning about <unk> */
    const unsigned int numParents = fngs.fnSpecArray[specNum].numParents;
    const int numSubSets = fngs.fnSpecArray[specNum].numSubSets;

    for (int i = 0; i < numSubSets; i++) {
        numGrams[i] = 0;
        numRead[i] = 0;
    }

    clear(specNum);

    /*
     * The ARPA format implicitly assumes a zero-gram backoff weight of 0.
     * This has to be properly represented in the BOW trie so that various
     * recursive operations work correctly.
     */
    VocabIndex nullContext[1];
    nullContext[0] = Vocab_None;
    *fNgrams[specNum].parentSubsets[0].insertBOW(nullContext) = LogP_Zero;

    while ((line = file.getline())) {
        Boolean backslash = (line[0] == '\\');
        unsigned thisNode;
        int nFNgrams;

        switch (state) {

        case -2:    /* looking for start of header */
            if (backslash && strncmp(line, "\\data\\", 6) == 0) {
                state = -1;
                continue;
            }
            /*
             * Everything before "\data\" is ignored.
             */
            continue;

        case -1:    /* ngram header */
            // Expect this to be of the form "0xC-grams" where "C"
            // is a hex digit string indicating the BG node.
            if (backslash && sscanf(line, "\\%i-grams", &state) == 1) {
                /*
                 * start reading grams
                 */
                if (state >= numSubSets) {
                    file.position() << "invalid ngram order "
                                    << HEX << state << "\n" << DEC;
                    return false;
                }

                if (debug(DEBUG_READ_STATS)) {
                    dout() << "reading " << numGrams[state] << " "
                           << HEX << state << "-grams\n" << DEC;
                }
                numBitsInState = FNgramSpecsType::numBitsSet(state);
                state_order = numBitsInState + 1;
                continue;
            } else if (sscanf(line, "ngram %i=%d", &thisNode, &nFNgrams) == 2) {
                /*
                 * Scanned a line of the form
                 *     ngram <0xCCCC>=<howmany>
                 * now perform various sanity checks.
                 */
                if (thisNode >= (unsigned)numSubSets) {
                    file.position() << "gram node " << HEX << thisNode
                                    << " out of range\n" << DEC;
                    return false;
                }
                if (nFNgrams < 0) {
                    file.position() << "gram count number " << nFNgrams
                                    << " out of range\n";
                    return false;
                }
                if (thisNode > max_state) {
                    max_state = thisNode;
                }
                numGrams[thisNode] = nFNgrams;
                continue;
            } else {
                file.position() << "unexpected input\n";
                return false;
            }

        default:    /* reading n-grams, where n == state */
            if (backslash && sscanf(line, "\\%i-grams", &state) == 1) {
                if (state >= numSubSets) {
                    file.position() << "invalid ngram order "
                                    << HEX << state << "\n" << DEC;
                    return false;
                }

                if (debug(DEBUG_READ_STATS)) {
                    dout() << "reading " << numGrams[state] << " "
                           << HEX << state << "-grams\n" << DEC;
                }
                numBitsInState = FNgramSpecsType::numBitsSet(state);
                // state_order is the number of strings we expect to see
                // on a line, i.e., the parents + child.
                state_order = numBitsInState + 1;

                /*
                 * start reading more n-grams
                 */
                continue;
            } else if (backslash && strncmp(line, "\\end\\", 5) == 0) {
                /*
                 * Check that the total number of ngrams read matches
                 * that found in the header.
                 */
                for (int i = 0; i <= (int)max_state; i++) {
                    if (numGrams[i] != numRead[i]) {
                        file.position() << "warning: " << numRead[i] << " "
                                        << HEX << i << "-grams read, expected "
                                        << numGrams[i] << "\n" << DEC;
                    }
                }
                return true;
            } else if (state > numSubSets) {
                /*
                 * Save time and memory by skipping ngrams outside the
                 * order range of this model.  This is moot right now as
                 * we signal an error condition above.
                 * TODO: change so that the file can have higher orders
                 * than the model specifies, and we ignore those.
                 */
                continue;
            } else {
                VocabString words[1                       // probability
                                  + maxNumParentsPerChild // longest gram
                                  + 1                     // child
                                  + 1                     // backoff location
                                  + 1                     // optional bow
                                  + 1];                   // end marker
                                /* result of parsing an n-gram line;
                                 * some elements are actually numerical
                                 * parameters, but so what? */
                VocabIndex wids[maxNumParentsPerChild + 2];
                                /* gram + child + endmark as word indices */
                LogP prob, bow;

                /*
                 * Parse a line of the form
                 *     <prob> <w1> <w2> ... <wN> <c> [<hx_bg_location>] [<bow>]
                 *
                 * I.e., we can have zero or one <bow>s, and the hex bg
                 * location is present only when state != 0.  Note that,
                 * unlike the ARPA format, the <bow> here is associated
                 * with the backoff of <w1> <w2> ... <wN> <c> at
                 * hx_bg_location, rather than with a context consisting
                 * of <w1> <w2> ... <wN> <c>.
                 */
                unsigned int howmany =
                        Vocab::parseWords(line, words, maxNumParentsPerChild + 5);

                // unsigned int have_cnt = (numBitsInState > 1) ? 1 : 0;
                unsigned int have_cnt = 0;

                if (howmany < state_order + 1 + have_cnt ||
                    howmany > state_order + 2 + have_cnt)
                {
                    file.position() << "error, ngram line has invalid number ("
                                    << howmany << ") of fields, expecting either "
                                    << state_order + 1 + have_cnt << " or "
                                    << state_order + 2 + have_cnt << "\n";
                    return false;
                }

                /*
                 * Parse prob.
                 */
                if (!parseLogP(words[0], prob)) {
                    file.position() << "bad prob \"" << words[0] << "\"\n";
                    return false;
                } else if (prob > LogP_One || prob != prob) {
                    file.position() << "warning: questionable prob \""
                                    << words[0] << "\"\n";
                } else if (prob == LogP_PseudoZero) {
                    /*
                     * convert pseudo-zeros back into real zeros
                     */
                    prob = LogP_Zero;
                }

                /*
                 * Get backoff graph location.
                 */
                unsigned int gram_cnt = 0;
                // support for counts in the LM file; not finished
                if (have_cnt) {
                    char *endptr = (char *)words[state_order + 1];
                    gram_cnt = (unsigned)strtol(words[state_order + 1], &endptr, 0);
                    if (endptr == words[state_order + 1] || gram_cnt == ~0x0) {
                        file.position() << "warning: invalid backoff graph location \""
                                        << words[state_order + 1] << "\"\n";
                    }
                }

                /*
                 * Parse bow, if any.
                 */
                if (howmany == state_order + 2 + have_cnt) {
                    /*
                     * Parsing float strings is the most time-consuming part
                     * of reading in backoff models.  We therefore try to
                     * avoid parsing bows where they are useless, i.e., for
                     * contexts that are longer than what this model uses.
                     * We also do a quick sanity check to warn about
                     * non-zero bows in that position.
                     */
                    if (state == 0) {
                        // Unlike normal ARPA files, here we would not
                        // expect a bow for the unigram.
                        if (words[state_order + 1 + have_cnt][0] != '0') {
                            file.position() << "ignoring non-zero bow \""
                                            << words[state_order + 1 + have_cnt]
                                            << "\" for minimal ngram\n";
                        }
                    } else if (!parseLogP(words[state_order + 1 + have_cnt], bow)) {
                        file.position() << "bad bow \""
                                        << words[state_order + 1 + have_cnt]
                                        << "\"\n";
                        return false;
                    } else if (bow == LogP_Inf || bow != bow) {
                        file.position() << "warning: questionable bow \""
                                        << words[state_order + 1 + have_cnt]
                                        << "\"\n";
                    } else if (bow == LogP_PseudoZero) {
                        /*
                         * convert pseudo-zeros back into real zeros
                         */
                        bow = LogP_Zero;
                    }
                }

                /*
                 * Terminate the words array after the last word, then
                 * translate it to word indices.  We also reverse the
                 * ngram since that's how we'll need it to index the trie.
                 */
                words[state_order + 1] = 0;
                vocab.addWords(&words[1], wids, maxNumParentsPerChild + 2);
                Vocab::reverse(wids);

                /*
                 * Store bow, if any.
                 */
                if (howmany == state_order + 2 + have_cnt && state != 0) {
                    *(fNgrams[specNum].parentSubsets[state].insertBOW(&wids[1])) = bow;
                }

                /*
                 * Save the last word (which is now the first, due to the
                 * reversal), then use the first n-1 to index into the
                 * context trie, storing the prob.
                 */
                if (!warnedAboutUnk &&
                    wids[0] == vocab.unkIndex() &&
                    prob != LogP_Zero &&
                    !vocab.unkIsWord())
                {
                    file.position() << "warning: found non-zero LM probability for "
                                    << vocab.getWord(vocab.unkIndex())
                                    << " in closed-vocabulary LM\n";
                    warnedAboutUnk = true;
                }

                *(fNgrams[specNum].parentSubsets[state].
                        insertProbAndCNT(wids[0], &wids[1], gram_cnt)) = prob;

                /*
                 * Hey, we're done with this ngram!
                 */
                numRead[state]++;
            }
        }
    }

    /*
     * we reached a premature EOF
     */
    file.position() << "reached EOF before \\end\\\n";
    return false;
}
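/*
 * For orientation, a hypothetical input in the format read() above expects.
 * The node masks, tokens, and values here are invented for illustration;
 * real files are produced by write() below.  Note how the optional <bow>
 * rides on the first gram of its context instead of appearing on a separate
 * (n-1)-gram line as in a standard ARPA file:
 *
 *     \data\
 *     ngram 0x0=2
 *     ngram 0x1=2
 *
 *     \0x0-grams:
 *     -1.5    dog
 *     -0.9    cat
 *
 *     \0x1-grams:
 *     -0.7    the     dog     -0.3
 *     -1.1    the     cat
 *
 *     \end\
 */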
void
FNgram::write()
{
    for (unsigned specNum = 0; specNum < fNgramsSize; specNum++) {
        if (fngs.fnSpecArray[specNum].lmFileName &&
            !(*fngs.fnSpecArray[specNum].lmFileName == '_' &&
              !*(fngs.fnSpecArray[specNum].lmFileName + 1)))
        {
            File f(fngs.fnSpecArray[specNum].lmFileName, "w");
            if (debug(DEBUG_WRITE_STATS)) {
                dout() << "writing FLM to "
                       << fngs.fnSpecArray[specNum].lmFileName << "\n";
            }
            write(specNum, f);
        }
    }
}

Boolean
FNgram::read()
{
    for (unsigned specNum = 0; specNum < fNgramsSize; specNum++) {
        if (fngs.fnSpecArray[specNum].lmFileName &&
            !(*fngs.fnSpecArray[specNum].lmFileName == '_' &&
              !*(fngs.fnSpecArray[specNum].lmFileName + 1)))
        {
            File f(fngs.fnSpecArray[specNum].lmFileName, "r");
            if (debug(DEBUG_READ_STATS)) {
                dout() << "reading FLM from "
                       << fngs.fnSpecArray[specNum].lmFileName << "\n";
            }
            if (!read(specNum, f)) return false;
        }
    }
    if (debug(DEBUG_ESTIMATE_LM)) wordProbSum();
    return true;
}
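/*
 * Both wrappers above skip any spec whose lmFileName is the single
 * character "_", which evidently serves as a "no file" placeholder.
 * For reference, read() derives state_order from a backoff-graph node
 * mask via FNgramSpecsType::numBitsSet().  Below is a minimal popcount
 * sketch of what that helper must compute; the name numBitsSetSketch is
 * hypothetical and the real implementation lives in FNgramSpecs:
 */
static unsigned numBitsSetSketch(unsigned mask)
{
    unsigned count = 0;
    for (; mask != 0; mask >>= 1) {
        count += mask & 1;      // accumulate the low-order bit, then shift
    }
    return count;
}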
// Writes out in an ARPA-inspired file format:
// - there are node-grams rather than n-grams, where node is the bit
//   vector giving the LM parents in the LM graph;
// - the bow for a context is located with the ngram that could use that
//   bow (rather than with a separate entry for the backoff graph).
void
FNgram::write(unsigned int specNum, File &file)
{
    Array<unsigned> howmanyFNgrams;
    VocabIndex context[maxNumParentsPerChild + 2];
    VocabString scontext[maxNumParentsPerChild + 2];

    fprintf(file, "\n\\data\\\n");

    const unsigned numParents = fngs.fnSpecArray[specNum].numParents;

    // starting with the unigram, and moving up to the all-LM-parents case
    for (int level = 0; level <= (int)numParents; level++) {
        FNgramSpecsType::FNgramSpec::LevelIter liter(numParents, level);
        unsigned int node;
        while (liter.next(node)) {
            howmanyFNgrams[node] = numFNgrams(specNum, node);
            fprintf(file, "ngram 0x%X=%d\n", node, howmanyFNgrams[node]);
        }
    }

    for (int level = 0; level <= (int)numParents; level++) {
        FNgramSpecsType::FNgramSpec::LevelIter liter(numParents, level);
        unsigned int node;
        while (liter.next(node)) {
            fprintf(file, "\n\\0x%X-grams:\n", node);
            if (debug(DEBUG_WRITE_STATS)) {
                char buff[1024];
                sprintf(buff, "writing %d 0x%X-grams\n",
                        howmanyFNgrams[node], node);
                dout() << buff;
                // For some reason, gcc 3.1.1 produces spurious "2"s with
                // the following:
                // dout() << "writing " << DEC << howmanyFNgrams[node]
                //        << " 0x" << HEX << node << "-grams" << DEC << "\n";
            }

            BOsIter citer(*this, specNum, node, context, vocab.compareIndex());
            BOnode *tr_node;
            while ((tr_node = citer.next())) {
                // TODO: write out BOWs that have contexts but no words,
                // since for gen BW we spent time computing them.
                vocab.getWords(context, scontext, maxNumParentsPerChild + 1);
                Vocab::reverse(scontext);

                ProbsIter piter(*tr_node, vocab.compareIndex());
                VocabIndex pword;
                LogP *prob;
                unsigned *cnt;
                Boolean first_word = true;
                while ((prob = piter.next(pword, cnt))) {
                    if (file.error()) {
                        return;
                    }
                    fprintf(file, "%.*lg\t", LogP_Precision,
                            (double)(*prob == LogP_Zero ? LogP_PseudoZero : *prob));
                    Vocab::write(file, scontext);
                    fprintf(file, "%s%s", (node != 0 ? " " : ""),
                            vocab.getWord(pword));

                    // if (node > 0 && *cnt != ~0x0)
                    //     fprintf(file, "\t0x%X", *cnt);

                    if (first_word && level > 0) {
                        // Write the BOW for the context right here rather
                        // than (as in an ARPA file) with the (n-1)-gram,
                        // because in an FLM with different symbol sets for
                        // the random variables we are not even guaranteed
                        // that there will be an (n-1)-gram at which to
                        // store the BOW.
                        LogP *bow =
                            fNgrams[specNum].parentSubsets[node].findBOW(context);
                        if (bow) {
                            fprintf(file, "\t%.*lg", LogP_Precision,
                                    (double)(*bow == LogP_Zero ?
                                             LogP_PseudoZero : *bow));
                        } else {
                            // there should always be a bow in the
                            // structures for a real context
                            assert(0);
                        }
                        first_word = false;
                    }
                    fprintf(file, "\n");
                }
            }
        }
    }
    fprintf(file, "\n\\end\\\n");
}

unsigned int
FNgram::numFNgrams(const unsigned int specNum, const unsigned int node)
{
    VocabIndex context[maxNumParentsPerChild + 2];

    // NOTE: the listing is truncated at this point; the body below is a
    // reconstruction sketch only.  It counts the probability entries
    // stored under every context of this node, mirroring the iteration
    // pattern used in write() above.
    unsigned int howmany = 0;
    BOsIter citer(*this, specNum, node, context, vocab.compareIndex());
    BOnode *tr_node;
    while ((tr_node = citer.next())) {
        ProbsIter piter(*tr_node, vocab.compareIndex());
        VocabIndex pword;
        unsigned *cnt;
        while (piter.next(pword, cnt)) {
            howmany++;
        }
    }
    return howmany;
}
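/*
 * Worked example of the node-mask arithmetic used throughout this file
 * (the node value is illustrative): for node 0x5, bits 0 and 2 are set,
 * so numBitsSet(0x5) == 2 parents and state_order == 3 (the parents plus
 * the child).  A line in the \0x5-grams: section must therefore carry
 * either state_order + 1 == 4 fields (<prob> <w1> <w2> <c>) or
 * state_order + 2 == 5 fields (<prob> <w1> <w2> <c> <bow>), which is
 * exactly the range read() enforces.
 */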