📄 fngramstats.cc
字号:
}template <class CountT>static inline BooleanstringToCount(const char *str, CountT &count){ double x; if (sscanf(str, "%lf", &x) == 1) { count = x; return true; } else { return false; }}static inline BooleanstringToCount(const char *str, unsigned &count){ /* * scanf("%u") doesn't check for a positive sign, so we have to ourselves. */ return (*str != '-' && sscanf(str, "%u", &count) == 1);}static inline BooleanstringToGenInt(const char *str, unsigned &count){ /* * scanf("%u") doesn't do 0x hex numbers. */ char *endptr = (char*)str; int res = strtol(str,&endptr,0); if (endptr == str || res < 0) return false; count = (unsigned) res; return true;}static inline BooleanstringToCount(const char *str, int &count){ return (sscanf(str, "%d", &count) == 1);}/***************************************************************************************** ***************************************************************************************** ***************************************************************************************** */template <class CountT>unsigned intFNgramCounts<CountT>::parseFNgram(char *line, VocabString *words, unsigned int max, CountT &count, unsigned int &parSpec, Boolean &ok){ unsigned howmany = Vocab::parseWords(line, words, max); if (howmany == max) { ok = false; return 0; } /* * Parse the last word as a count */ if (!stringToCount(words[howmany - 1], count)) { ok = false; return 0; } /* * Parse the first word as a par spec bitmap */ if (!stringToGenInt(words[0], parSpec)) { ok=false; return 0; } words[howmany-1] = 0; howmany -=2; ok = true; return howmany;}template <class CountT>unsigned intFNgramCounts<CountT>::readFNgram(File &file, VocabString *words, unsigned int max, CountT &count, unsigned int &parSpec, Boolean& ok){ char *line; ok = true; /* * Read next ngram count from file, skipping blank lines */ line = file.getline(); if (line == 0) { return 0; } unsigned howmany = parseFNgram(line, words, max, count,parSpec,ok); if (howmany == 0) { ok = false; file.position() << "malformed N-gram count or more than " << max - 1 << " words per line\n"; return 0; } return howmany;}/* * * This reads an ngram counts file (i.e., word and word history followed * by a count of the occurance of that word and word history), and loads * it into the counts object. */template <class CountT>BooleanFNgramCounts<CountT>::read(unsigned int specNum,File &file){ static VocabString words[maxNumParentsPerChild+1]; static VocabIndex wids[maxNumParentsPerChild+1]; static Boolean tagsfound[maxNumParentsPerChild+1]; CountT count; unsigned int howmany; unsigned int parSpec; Boolean ok; if (specNum >= fnSpecs.fnSpecArray.size()) return false; while (howmany = readFNgram(file, words, maxFNgramOrder + 1, count, parSpec,ok)) { if (!ok) return false; /* * Map words to indices, skip over first word since it is parspec in string form */ Boolean skipNgramCount = false; if (openVocab) { vocab.addWords2(words+1, wids, maxFNgramOrder,tagsfound+1); for (int j = 0; j < maxFNgramOrder && words[j+1] != 0; j++) { if (tagsfound[j+1]==0) { skipNgramCount = true; } } } else { vocab.getIndices(words+1, wids, maxFNgramOrder, vocab.unkIndex()); } /* * Update the count */ if (!skipNgramCount) { if (fnSpecs.fnSpecArray[specNum].parentSubsets[parSpec].counts == NULL) { if (debug(DEBUG_WARNINGS)) { fprintf(stderr,"WARNING: counts file contains counts for backoff graph node that does not exist given backoff constraints, creading counts object anyway\n"); } fnSpecs.fnSpecArray[specNum].parentSubsets[parSpec].counts = new FNgramNode; // alternatively, we could just ignore the count, since it will // not be used. } *(fnSpecs.fnSpecArray[specNum].parentSubsets[parSpec].counts->insert(wids)) += count; } } return true;}template <class CountT>BooleanFNgramCounts<CountT>::read(){ for (int i=0;i<(int)fnSpecs.fnSpecArray.size();i++) { File f(fnSpecs.fnSpecArray[i].countFileName, "r"); if (!read(i,f)) return false; } return true;}/***************************************************************************************** ***************************************************************************************** ***************************************************************************************** */template <class CountT>unsigned intFNgramCounts<CountT>::writeFNgram(File &file, const VocabString *words, CountT count, unsigned int parSpec){ unsigned int i; if (words[0]) { fprintf(file,"0x%X\t",parSpec); fprintf(file, "%s", words[0]); for (i = 1; words[i]; i++) { fprintf(file, " %s", words[i]); } } // why could we have a count w/o any words? fprintf(file, "\t%s\n", countToString(count)); return i;}/* * For reasons of efficiency the write() method doesn't use * writeFNgram() (yet). Instead, it fills up a string buffer * as it descends the tree recursively. this avoid having to * lookup shared prefix words and buffer the corresponding strings * repeatedly. */template <class CountT>voidFNgramCounts<CountT>::writeNode( FNgramNode *node, /* the trie node we're at */ const unsigned int parSpec, /* parent specifier */ File &file, /* output file */ char *buffer, /* output buffer */ char *bptr, /* pointer into output buffer */ unsigned int level, /* current trie level */ unsigned int order, /* target trie level */ Boolean sorted) /* produce sorted output */{ FNgramNode *child; VocabIndex wid; TrieIter<VocabIndex,CountT> iter(*node, sorted ? vocab.compareIndex() : 0); /* * Iterate over the child nodes at the current level, * appending their word strings to the buffer */ while (!file.error() && (child = iter.next(wid))) { VocabString word = vocab.getWord(wid); if (word == 0) { cerr << "undefined word index " << wid << "\n"; continue; } unsigned wordLen = strlen(word); if (bptr + wordLen + 1 > buffer + maxLineLength) { *bptr = '0'; cerr << "ngram ["<< buffer << word << "] exceeds write buffer\n"; continue; } strcpy(bptr, word); /* * If this is the final level, print out the ngram and the count. * Otherwise set up another level of recursion. */ if (order == 0 || level == order) { fprintf(file, "0x%X\t%s\t%s\n",parSpec,buffer, countToString(child->value())); } if (order == 0 || level < order) { *(bptr + wordLen) = ' '; writeNode(child, parSpec, file, buffer, bptr + wordLen + 1, level + 1, order, sorted); } }}template <class CountT>voidFNgramCounts<CountT>::writeSpec(File &file, const unsigned int specNum, const Boolean sorted){ if (specNum >= fnSpecs.fnSpecArray.size()) return; static char buffer[maxLineLength]; const int numSubSets = 1<<fnSpecs.fnSpecArray[specNum].numParents; for (int i=0;i<numSubSets;i++) { if (fnSpecs.fnSpecArray[specNum].parentSubsets[i].counts != NULL) { writeNode(fnSpecs.fnSpecArray[specNum].parentSubsets[i].counts, i,file, buffer, buffer, 1, numBitsSet(i)+1, sorted); } // TODO: also write out individal BG node count file(s) if specified. }}template <class CountT>voidFNgramCounts<CountT>::write(const Boolean sorted){ for (int i=0;i<(int)fnSpecs.fnSpecArray.size();i++) { if (strcmp(fnSpecs.fnSpecArray[i].countFileName,FNGRAM_DEV_NULL_FILE) != 0) { File f(fnSpecs.fnSpecArray[i].countFileName, "w"); writeSpec(f,i,sorted); } }}template <class CountT>voidFNgramCounts<CountT>::estimateDiscounts(){ fnSpecs.estimateDiscounts(*((FactoredVocab*)&vocab));}template <class CountT>voidFNgramCounts<CountT>::computeCardinalityFunctions(){ fnSpecs.computeCardinalityFunctions(*((FactoredVocab*)&vocab));}template <class CountT>CountTFNgramCounts<CountT>::sumCounts(unsigned int specNum,unsigned int node){ return fnSpecs.fnSpecArray[specNum].parentSubsets[node].accumulateCounts();}template <class CountT>CountTFNgramCounts<CountT>::sumCounts(unsigned int specNum){ CountT sum=0; for (int node=0;node<(int)fnSpecs.fnSpecArray[specNum].numSubSets; node++) { sum+=sumCounts(specNum,node); } return sum;}template <class CountT>CountTFNgramCounts<CountT>::sumCounts(){ CountT sum=0; for (int i=0;i<(int)fnSpecs.fnSpecArray.size();i++) { sum+=sumCounts(i); } return sum;}// parse file into sentences and update statstemplate <class CountT>unsigned intFNgramCounts<CountT>::countFile(File &file){ int numWords = 0; char *line; while (line = file.getline()) { unsigned int howmany = countString(line); /* * Since getline() returns only non-empty lines, * a return value of 0 indicates some sort of problem. */ if (howmany == 0) { file.position() << "line too long?\n"; } else { numWords += howmany; } } if (debug(DEBUG_PRINT_TEXTSTATS)) { file.position(dout()) << endl; for (int i=0;i<(int)fnSpecs.fnSpecArray.size();i++) { dout() << "LM(" << i << ") " << fnSpecs.fnSpecArray[i].stats; } // file.position(dout()) << this -> stats; } return numWords;}#endif /* EXCLUDE_CONTRIB_END */#endif /* _FNgramStats_cc_ */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -