📄 fngramstats.cc

📁 这是一款很好用的工具包
💻 CC
📖 第 1 页 / 共 2 页
字号:
上一页 12
}template <class CountT>static inline BooleanstringToCount(const char *str, CountT &count){    double x;    if (sscanf(str, "%lf", &x) == 1) {	count = x;	return true;    } else {	return false;    }}static inline BooleanstringToCount(const char *str, unsigned &count){    /*     * scanf("%u") doesn't check for a positive sign, so we have to ourselves.     */    return (*str != '-' && sscanf(str, "%u", &count) == 1);}static inline BooleanstringToGenInt(const char *str, unsigned &count){  /*   * scanf("%u") doesn't do 0x hex numbers.   */  char *endptr = (char*)str;  int res = strtol(str,&endptr,0);  if (endptr == str || res < 0) return false;  count = (unsigned) res;  return true;}static inline BooleanstringToCount(const char *str, int &count){    return (sscanf(str, "%d", &count) == 1);}/***************************************************************************************** *****************************************************************************************  ***************************************************************************************** */template <class CountT>unsigned intFNgramCounts<CountT>::parseFNgram(char *line,		      VocabString *words,		      unsigned int max,		      CountT &count,		      unsigned int &parSpec,		      Boolean &ok){    unsigned howmany = Vocab::parseWords(line, words, max);    if (howmany == max) {      ok = false;      return 0;    }    /*     * Parse the last word as a count     */    if (!stringToCount(words[howmany - 1], count)) {      ok = false;      return 0;    }    /*     * Parse the first word as a par spec bitmap     */    if (!stringToGenInt(words[0], parSpec)) {      ok=false;      return 0;    }    words[howmany-1] = 0;    howmany -=2;    ok = true;    return howmany;}template <class CountT>unsigned intFNgramCounts<CountT>::readFNgram(File &file,				 VocabString *words,				 unsigned int max,				 CountT &count,				 unsigned int &parSpec,				 Boolean& ok){    char *line;    ok = true;    /*     * Read next ngram count from file, skipping blank lines     */    line = file.getline();    if (line == 0) {      return 0;    }    unsigned howmany = parseFNgram(line, words, max, count,parSpec,ok);    if (howmany == 0) {      ok = false;      file.position() << "malformed N-gram count or more than " << max - 1 << " words per line\n";      return 0;    }    return howmany;}/* * * This reads an ngram counts file (i.e., word and word history followed * by a count of the occurance of that word and word history), and loads * it into the counts object. */template <class CountT>BooleanFNgramCounts<CountT>::read(unsigned int specNum,File &file){    static VocabString words[maxNumParentsPerChild+1];    static VocabIndex wids[maxNumParentsPerChild+1];    static Boolean tagsfound[maxNumParentsPerChild+1];    CountT count;    unsigned int howmany;    unsigned int parSpec;    Boolean ok;    if (specNum >= fnSpecs.fnSpecArray.size())      return false;    while (howmany = readFNgram(file, words, maxFNgramOrder + 1, count, parSpec,ok)) {      if (!ok)	return false;      /*        * Map words to indices, skip over first word since it is parspec in string form       */      Boolean skipNgramCount = false;      if (openVocab) {	vocab.addWords2(words+1, wids, maxFNgramOrder,tagsfound+1);	for (int j = 0; j < maxFNgramOrder && words[j+1] != 0; j++) {	  if (tagsfound[j+1]==0) {	    skipNgramCount = true;	  }	}      } else {	vocab.getIndices(words+1, wids, maxFNgramOrder, vocab.unkIndex());      }      /*       *  Update the count       */      if (!skipNgramCount) {	if (fnSpecs.fnSpecArray[specNum].parentSubsets[parSpec].counts == NULL) {	  if (debug(DEBUG_WARNINGS)) {	    fprintf(stderr,"WARNING: counts file contains counts for backoff graph node that does not exist given backoff constraints, creading counts object anyway\n");	  }	  fnSpecs.fnSpecArray[specNum].parentSubsets[parSpec].counts = new FNgramNode;	  // alternatively, we could just ignore the count, since it will	  // not be used.	}	*(fnSpecs.fnSpecArray[specNum].parentSubsets[parSpec].counts->insert(wids))	  += count;      }    }    return true;}template <class CountT>BooleanFNgramCounts<CountT>::read(){  for (int i=0;i<(int)fnSpecs.fnSpecArray.size();i++) {    File f(fnSpecs.fnSpecArray[i].countFileName, "r");    if (!read(i,f))      return false;  }  return true;}/***************************************************************************************** *****************************************************************************************  ***************************************************************************************** */template <class CountT>unsigned intFNgramCounts<CountT>::writeFNgram(File &file,				  const VocabString *words,				  CountT count,				  unsigned int parSpec){    unsigned int i;    if (words[0]) {      fprintf(file,"0x%X\t",parSpec);      fprintf(file, "%s", words[0]);      for (i = 1; words[i]; i++) {	fprintf(file, " %s", words[i]);      }    }    // why could we have a count w/o any words?    fprintf(file, "\t%s\n", countToString(count));    return i;}/* * For reasons of efficiency the write() method doesn't use * writeFNgram()  (yet).  Instead, it fills up a string buffer  * as it descends the tree recursively.  this avoid having to * lookup shared prefix words and buffer the corresponding strings * repeatedly. */template <class CountT>voidFNgramCounts<CountT>::writeNode(    FNgramNode *node,		/* the trie node we're at */    const unsigned int parSpec, /* parent specifier */    File &file,			/* output file */    char *buffer,		/* output buffer */    char *bptr,			/* pointer into output buffer */    unsigned int level,		/* current trie level */    unsigned int order,		/* target trie level */    Boolean sorted)		/* produce sorted output */{    FNgramNode *child;    VocabIndex wid;    TrieIter<VocabIndex,CountT> iter(*node, sorted ? vocab.compareIndex() : 0);    /*     * Iterate over the child nodes at the current level,     * appending their word strings to the buffer     */    while (!file.error() && (child = iter.next(wid))) {	VocabString word = vocab.getWord(wid);	if (word == 0) {	   cerr << "undefined word index " << wid << "\n";	   continue;	}	unsigned wordLen = strlen(word);	if (bptr + wordLen + 1 > buffer + maxLineLength) {	   *bptr = '0';	   cerr << "ngram ["<< buffer << word		<< "] exceeds write buffer\n";	   continue;	}        	strcpy(bptr, word);	/*	 * If this is the final level, print out the ngram and the count.	 * Otherwise set up another level of recursion.	 */	if (order == 0 || level == order) {	   fprintf(file, "0x%X\t%s\t%s\n",parSpec,buffer, countToString(child->value()));	} 		if (order == 0 || level < order) {	   *(bptr + wordLen) = ' ';	   writeNode(child, parSpec, file, buffer, bptr + wordLen + 1, level + 1,			order, sorted);	}    }}template <class CountT>voidFNgramCounts<CountT>::writeSpec(File &file, 				const unsigned int specNum, 				const Boolean sorted){  if (specNum >= fnSpecs.fnSpecArray.size())    return;    static char buffer[maxLineLength];  const int numSubSets = 1<<fnSpecs.fnSpecArray[specNum].numParents;  for (int i=0;i<numSubSets;i++) {    if (fnSpecs.fnSpecArray[specNum].parentSubsets[i].counts != NULL) {      writeNode(fnSpecs.fnSpecArray[specNum].parentSubsets[i].counts,		i,file, buffer, buffer, 1, numBitsSet(i)+1, sorted);          }    // TODO: also write out individal BG node count file(s) if specified.  }}template <class CountT>voidFNgramCounts<CountT>::write(const Boolean sorted){  for (int i=0;i<(int)fnSpecs.fnSpecArray.size();i++) {    if (strcmp(fnSpecs.fnSpecArray[i].countFileName,FNGRAM_DEV_NULL_FILE) != 0) {      File f(fnSpecs.fnSpecArray[i].countFileName, "w");      writeSpec(f,i,sorted);    }  }}template <class CountT>voidFNgramCounts<CountT>::estimateDiscounts(){  fnSpecs.estimateDiscounts(*((FactoredVocab*)&vocab));}template <class CountT>voidFNgramCounts<CountT>::computeCardinalityFunctions(){  fnSpecs.computeCardinalityFunctions(*((FactoredVocab*)&vocab));}template <class CountT>CountTFNgramCounts<CountT>::sumCounts(unsigned int specNum,unsigned int node){  return fnSpecs.fnSpecArray[specNum].parentSubsets[node].accumulateCounts();}template <class CountT>CountTFNgramCounts<CountT>::sumCounts(unsigned int specNum){  CountT sum=0;  for (int node=0;node<(int)fnSpecs.fnSpecArray[specNum].numSubSets; node++) {    sum+=sumCounts(specNum,node);  }  return sum;}template <class CountT>CountTFNgramCounts<CountT>::sumCounts(){  CountT sum=0;  for (int i=0;i<(int)fnSpecs.fnSpecArray.size();i++) {    sum+=sumCounts(i);  }  return sum;}// parse file into sentences and update statstemplate <class CountT>unsigned intFNgramCounts<CountT>::countFile(File &file){    int numWords = 0;    char *line;    while (line = file.getline()) {	unsigned int howmany = countString(line);	/*	 * Since getline() returns only non-empty lines,	 * a return value of 0 indicates some sort of problem.	 */	if (howmany == 0) {	    file.position() << "line too long?\n";	} else {	    numWords += howmany;	}    }    if (debug(DEBUG_PRINT_TEXTSTATS)) {      file.position(dout()) << endl;      for (int i=0;i<(int)fnSpecs.fnSpecArray.size();i++) {	dout() << "LM(" << i << ") " << fnSpecs.fnSpecArray[i].stats;      }      // file.position(dout()) << this -> stats;    }    return numWords;}#endif /* EXCLUDE_CONTRIB_END */#endif /* _FNgramStats_cc_ */
上一页 12
💿 文件大小 3034 K
👤 上传用户 wanghaihah
📂 所属分类其他
🏷️ 相关标签

#工具包
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -