fngramlm.cc
Boolean
FNgram::read(const unsigned int specNum, File &file)
{
    char *line;
    Array<unsigned int> numGrams;   /* the number of n-grams for each parent set */
    Array<unsigned int> numRead;    /* number of n-grams actually read */
    int state = -2;                 /* section of file being read:
                                     * -2 - pre-header, -1 - header,
                                     * 0 - unigrams, 1 - node-1 bigrams,
                                     * 2 - node-2 bigrams, etc. */
    unsigned int numBitsInState = 0;
    unsigned int state_order = 0;
    unsigned int max_state = 0;
    Boolean warnedAboutUnk = false; /* at most one warning about <unk> */
    const unsigned int numParents = fngs.fnSpecArray[specNum].numParents;
    const int numSubSets = fngs.fnSpecArray[specNum].numSubSets;

    for (int i = 0; i < numSubSets; i++) {
        numGrams[i] = 0;
        numRead[i] = 0;
    }

    clear(specNum);

    /*
     * The ARPA format implicitly assumes a zero-gram backoff weight of 0.
     * This has to be properly represented in the BOW trie so that various
     * recursive operations work correctly.
     */
    VocabIndex nullContext[1];
    nullContext[0] = Vocab_None;
    *fNgrams[specNum].parentSubsets[0].insertBOW(nullContext) = LogP_Zero;

    while ((line = file.getline())) {
        Boolean backslash = (line[0] == '\\');
        unsigned thisNode;
        int nFNgrams;

        switch (state) {

        case -2:    /* looking for start of header */
            if (backslash && strncmp(line, "\\data\\", 6) == 0) {
                state = -1;
                continue;
            }
            /*
             * Everything before "\data\" is ignored.
             */
            continue;

        case -1:    /* ngram header */
            // Expect this to be of the form "0xC-grams" where "C"
            // is a hex digit string indicating the BG node.
            if (backslash && sscanf(line, "\\%i-grams", &state) == 1) {
                /*
                 * start reading grams
                 */
                if (state >= numSubSets) {
                    file.position() << "invalid ngram order "
                                    << HEX << state << "\n" << DEC;
                    return false;
                }

                if (debug(DEBUG_READ_STATS)) {
                    dout() << "reading " << numGrams[state] << " "
                           << HEX << state << "-grams\n" << DEC;
                }
                numBitsInState = FNgramSpecsType::numBitsSet(state);
                state_order = numBitsInState + 1;
                continue;
            } else if (sscanf(line, "ngram %i=%d", &thisNode, &nFNgrams) == 2) {
                /*
                 * Scanned a line of the form
                 *     ngram <0xCCCC>=<howmany>
                 * now perform various sanity checks.
                 */
                if (thisNode >= (unsigned)numSubSets) {
                    file.position() << "gram node " << HEX << thisNode
                                    << " out of range\n" << DEC;
                    return false;
                }
                if (nFNgrams < 0) {
                    file.position() << "gram count number " << nFNgrams
                                    << " out of range\n";
                    return false;
                }
                if (thisNode > max_state) {
                    max_state = thisNode;
                }
                numGrams[thisNode] = nFNgrams;
                continue;
            } else {
                file.position() << "unexpected input\n";
                return false;
            }

        default:    /* reading n-grams, where n == state */
            if (backslash && sscanf(line, "\\%i-grams", &state) == 1) {
                if (state >= numSubSets) {
                    file.position() << "invalid ngram order "
                                    << HEX << state << "\n" << DEC;
                    return false;
                }

                if (debug(DEBUG_READ_STATS)) {
                    dout() << "reading " << numGrams[state] << " "
                           << HEX << state << "-grams\n" << DEC;
                }
                numBitsInState = FNgramSpecsType::numBitsSet(state);
                // state_order is the number of strings we expect to see
                // on a line, i.e., the parents + child.
                state_order = numBitsInState + 1;

                /*
                 * start reading more n-grams
                 */
                continue;
            } else if (backslash && strncmp(line, "\\end\\", 5) == 0) {
                /*
                 * Check that the total number of ngrams read matches
                 * that found in the header.
                 */
                for (int i = 0; i <= (int)max_state; i++) {
                    if (numGrams[i] != numRead[i]) {
                        file.position() << "warning: " << numRead[i] << " "
                                        << HEX << i << "-grams read, expected "
                                        << numGrams[i] << "\n" << DEC;
                    }
                }
                return true;
            } else if (state > numSubSets) {
                /*
                 * Save time and memory by skipping ngrams outside the
                 * order range of this model.  This is moot right now as
                 * we signal an error condition above.
                 * TODO: change so that the file can have higher orders
                 * than the model specifies, and we ignore those.
                 */
                continue;
            } else {
                VocabString words[1                       // probability
                                  + maxNumParentsPerChild // longest gram
                                  + 1                     // child
                                  + 1                     // backoff location
                                  + 1                     // optional bow
                                  + 1];                   // end marker
                                /* result of parsing an n-gram line;
                                 * some elements are actually numerical
                                 * parameters, but so what? */
                VocabIndex wids[maxNumParentsPerChild + 2];
                                /* gram + child + endmark as word indices */
                LogP prob, bow;

                /*
                 * Parse a line of the form
                 *     <prob> <w1> <w2> ... <wN> <c> [<hx_bg_location>] [<bow>]
                 *
                 * I.e., we can have zero or one <bow>s, and the hex bg
                 * location is present only when state != 0.  Note that,
                 * unlike the ARPA format, the <bow> here is associated
                 * with the backoff of <w1> <w2> ... <wN> <c> at
                 * hx_bg_location, rather than with a context consisting
                 * of <w1> <w2> ... <wN> <c>.
                 */
                unsigned int howmany =
                        Vocab::parseWords(line, words, maxNumParentsPerChild + 5);

                // unsigned int have_cnt = (numBitsInState > 1) ? 1 : 0;
                unsigned int have_cnt = 0;

                if (howmany < state_order + 1 + have_cnt ||
                    howmany > state_order + 2 + have_cnt)
                {
                    file.position() << "error, ngram line has invalid number ("
                                    << howmany << ") of fields, expecting either "
                                    << state_order + 1 + have_cnt << " or "
                                    << state_order + 2 + have_cnt << "\n";
                    return false;
                }

                /*
                 * Parse prob.
                 */
                if (!parseLogP(words[0], prob)) {
                    file.position() << "bad prob \"" << words[0] << "\"\n";
                    return false;
                } else if (prob > LogP_One || prob != prob) {
                    file.position() << "warning: questionable prob \""
                                    << words[0] << "\"\n";
                } else if (prob == LogP_PseudoZero) {
                    /*
                     * convert pseudo-zeros back into real zeros
                     */
                    prob = LogP_Zero;
                }

                /*
                 * Get backoff graph location.
                 */
                unsigned int gram_cnt = 0;
                // support for counts in the LM file; not finished
                if (have_cnt) {
                    char *endptr = (char *)words[state_order + 1];
                    gram_cnt = (unsigned)strtol(words[state_order + 1], &endptr, 0);
                    if (endptr == words[state_order + 1] || gram_cnt == ~0x0) {
                        file.position() << "warning: invalid backoff graph location \""
                                        << words[state_order + 1] << "\"\n";
                    }
                }

                /*
                 * Parse bow, if any.
                 */
                if (howmany == state_order + 2 + have_cnt) {
                    /*
                     * Parsing float strings is the most time-consuming part
                     * of reading in backoff models.  We therefore try to
                     * avoid parsing bows where they are useless, i.e., for
                     * contexts that are longer than what this model uses.
                     * We also do a quick sanity check to warn about
                     * non-zero bows in that position.
                     */
                    if (state == 0) {
                        // Unlike normal ARPA files, here we would not
                        // expect a bow for the unigram.
                        if (words[state_order + 1 + have_cnt][0] != '0') {
                            file.position() << "ignoring non-zero bow \""
                                            << words[state_order + 1 + have_cnt]
                                            << "\" for minimal ngram\n";
                        }
                    } else if (!parseLogP(words[state_order + 1 + have_cnt], bow)) {
                        file.position() << "bad bow \""
                                        << words[state_order + 1 + have_cnt]
                                        << "\"\n";
                        return false;
                    } else if (bow == LogP_Inf || bow != bow) {
                        file.position() << "warning: questionable bow \""
                                        << words[state_order + 1 + have_cnt]
                                        << "\"\n";
                    } else if (bow == LogP_PseudoZero) {
                        /*
                         * convert pseudo-zeros back into real zeros
                         */
                        bow = LogP_Zero;
                    }
                }

                /*
                 * Terminate the words array after the last word, then
                 * translate it to word indices.  We also reverse the
                 * ngram since that's how we'll need it to index the trie.
                 */
                words[state_order + 1] = 0;
                vocab.addWords(&words[1], wids, maxNumParentsPerChild + 2);
                Vocab::reverse(wids);

                /*
                 * Store bow, if any.
                 */
                if (howmany == state_order + 2 + have_cnt && state != 0) {
                    *(fNgrams[specNum].parentSubsets[state].insertBOW(&wids[1])) = bow;
                }

                /*
                 * Save the last word (which is now the first, due to the
                 * reversal), then use the first n-1 to index into the
                 * context trie, storing the prob.
                 */
                if (!warnedAboutUnk &&
                    wids[0] == vocab.unkIndex() &&
                    prob != LogP_Zero &&
                    !vocab.unkIsWord())
                {
                    file.position() << "warning: found non-zero LM probability for "
                                    << vocab.getWord(vocab.unkIndex())
                                    << " in closed-vocabulary LM\n";
                    warnedAboutUnk = true;
                }

                *(fNgrams[specNum].parentSubsets[state].
                        insertProbAndCNT(wids[0], &wids[1], gram_cnt)) = prob;

                /*
                 * Hey, we're done with this ngram!
                 */
                numRead[state]++;
            }
        }
    }

    /*
     * we reached a premature EOF
     */
    file.position() << "reached EOF before \\end\\\n";
    return false;
}
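/*
 * For orientation, a hypothetical input in the format read() above expects.
 * The node masks, tokens, and values here are invented for illustration;
 * real files are produced by write() below.  Note how the optional <bow>
 * rides on the first gram of its context instead of appearing on a separate
 * (n-1)-gram line as in a standard ARPA file:
 *
 *     \data\
 *     ngram 0x0=2
 *     ngram 0x1=2
 *
 *     \0x0-grams:
 *     -1.5    dog
 *     -0.9    cat
 *
 *     \0x1-grams:
 *     -0.7    the     dog     -0.3
 *     -1.1    the     cat
 *
 *     \end\
 */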
void
FNgram::write()
{
    for (unsigned specNum = 0; specNum < fNgramsSize; specNum++) {
        if (fngs.fnSpecArray[specNum].lmFileName &&
            !(*fngs.fnSpecArray[specNum].lmFileName == '_' &&
              !*(fngs.fnSpecArray[specNum].lmFileName + 1)))
        {
            File f(fngs.fnSpecArray[specNum].lmFileName, "w");
            if (debug(DEBUG_WRITE_STATS)) {
                dout() << "writing FLM to "
                       << fngs.fnSpecArray[specNum].lmFileName << "\n";
            }
            write(specNum, f);
        }
    }
}

Boolean
FNgram::read()
{
    for (unsigned specNum = 0; specNum < fNgramsSize; specNum++) {
        if (fngs.fnSpecArray[specNum].lmFileName &&
            !(*fngs.fnSpecArray[specNum].lmFileName == '_' &&
              !*(fngs.fnSpecArray[specNum].lmFileName + 1)))
        {
            File f(fngs.fnSpecArray[specNum].lmFileName, "r");
            if (debug(DEBUG_READ_STATS)) {
                dout() << "reading FLM from "
                       << fngs.fnSpecArray[specNum].lmFileName << "\n";
            }
            if (!read(specNum, f)) return false;
        }
    }
    if (debug(DEBUG_ESTIMATE_LM)) wordProbSum();
    return true;
}
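/*
 * Both wrappers above skip any spec whose lmFileName is the single
 * character "_", which evidently serves as a "no file" placeholder.
 * For reference, read() derives state_order from a backoff-graph node
 * mask via FNgramSpecsType::numBitsSet().  Below is a minimal popcount
 * sketch of what that helper must compute; the name numBitsSetSketch is
 * hypothetical and the real implementation lives in FNgramSpecs:
 */
static unsigned numBitsSetSketch(unsigned mask)
{
    unsigned count = 0;
    for (; mask != 0; mask >>= 1) {
        count += mask & 1;      // accumulate the low-order bit, then shift
    }
    return count;
}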
// Writes out in an ARPA-inspired file format:
// - there are node-grams rather than n-grams, where node is the bit
//   vector giving the LM parents in the LM graph;
// - the bow for a context is located with the ngram that could use that
//   bow (rather than with a separate entry for the backoff graph).
void
FNgram::write(unsigned int specNum, File &file)
{
    Array<unsigned> howmanyFNgrams;
    VocabIndex context[maxNumParentsPerChild + 2];
    VocabString scontext[maxNumParentsPerChild + 2];

    fprintf(file, "\n\\data\\\n");

    const unsigned numParents = fngs.fnSpecArray[specNum].numParents;

    // starting with the unigram, and moving up to the all-LM-parents case
    for (int level = 0; level <= (int)numParents; level++) {
        FNgramSpecsType::FNgramSpec::LevelIter liter(numParents, level);
        unsigned int node;
        while (liter.next(node)) {
            howmanyFNgrams[node] = numFNgrams(specNum, node);
            fprintf(file, "ngram 0x%X=%d\n", node, howmanyFNgrams[node]);
        }
    }

    for (int level = 0; level <= (int)numParents; level++) {
        FNgramSpecsType::FNgramSpec::LevelIter liter(numParents, level);
        unsigned int node;
        while (liter.next(node)) {
            fprintf(file, "\n\\0x%X-grams:\n", node);
            if (debug(DEBUG_WRITE_STATS)) {
                char buff[1024];
                sprintf(buff, "writing %d 0x%X-grams\n",
                        howmanyFNgrams[node], node);
                dout() << buff;
                // For some reason, gcc 3.1.1 produces spurious "2"s with
                // the following:
                // dout() << "writing " << DEC << howmanyFNgrams[node]
                //        << " 0x" << HEX << node << "-grams" << DEC << "\n";
            }

            BOsIter citer(*this, specNum, node, context, vocab.compareIndex());
            BOnode *tr_node;
            while ((tr_node = citer.next())) {
                // TODO: write out BOWs that have contexts but no words,
                // since for gen BW we spent time computing them.
                vocab.getWords(context, scontext, maxNumParentsPerChild + 1);
                Vocab::reverse(scontext);

                ProbsIter piter(*tr_node, vocab.compareIndex());
                VocabIndex pword;
                LogP *prob;
                unsigned *cnt;
                Boolean first_word = true;
                while ((prob = piter.next(pword, cnt))) {
                    if (file.error()) {
                        return;
                    }
                    fprintf(file, "%.*lg\t", LogP_Precision,
                            (double)(*prob == LogP_Zero ? LogP_PseudoZero : *prob));
                    Vocab::write(file, scontext);
                    fprintf(file, "%s%s", (node != 0 ? " " : ""),
                            vocab.getWord(pword));

                    // if (node > 0 && *cnt != ~0x0)
                    //     fprintf(file, "\t0x%X", *cnt);

                    if (first_word && level > 0) {
                        // Write the BOW for the context right here rather
                        // than (as in an ARPA file) with the (n-1)-gram,
                        // because in an FLM with different symbol sets for
                        // the random variables we are not even guaranteed
                        // that there will be an (n-1)-gram at which to
                        // store the BOW.
                        LogP *bow =
                            fNgrams[specNum].parentSubsets[node].findBOW(context);
                        if (bow) {
                            fprintf(file, "\t%.*lg", LogP_Precision,
                                    (double)(*bow == LogP_Zero ?
                                             LogP_PseudoZero : *bow));
                        } else {
                            // there should always be a bow in the
                            // structures for a real context
                            assert(0);
                        }
                        first_word = false;
                    }
                    fprintf(file, "\n");
                }
            }
        }
    }
    fprintf(file, "\n\\end\\\n");
}

unsigned int
FNgram::numFNgrams(const unsigned int specNum, const unsigned int node)
{
    VocabIndex context[maxNumParentsPerChild + 2];

    // NOTE: the listing is truncated at this point; the body below is a
    // reconstruction sketch only.  It counts the probability entries
    // stored under every context of this node, mirroring the iteration
    // pattern used in write() above.
    unsigned int howmany = 0;
    BOsIter citer(*this, specNum, node, context, vocab.compareIndex());
    BOnode *tr_node;
    while ((tr_node = citer.next())) {
        ProbsIter piter(*tr_node, vocab.compareIndex());
        VocabIndex pword;
        unsigned *cnt;
        while (piter.next(pword, cnt)) {
            howmany++;
        }
    }
    return howmany;
}
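/*
 * Worked example of the node-mask arithmetic used throughout this file
 * (the node value is illustrative): for node 0x5, bits 0 and 2 are set,
 * so numBitsSet(0x5) == 2 parents and state_order == 3 (the parents plus
 * the child).  A line in the \0x5-grams: section must therefore carry
 * either state_order + 1 == 4 fields (<prob> <w1> <w2> <c>) or
 * state_order + 2 == 5 fields (<prob> <w1> <w2> <c> <bow>), which is
 * exactly the range read() enforces.
 */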