📄 fngramspecs.cc

📁 这是一款很好用的工具包
💻 CC
📖 第 1 页 / 共 4 页
字号:
上一页 1 2 34
template <class CountT>  unsigned intFNgramSpecs<CountT>::FNgramSpec::parseNodeString(char *str, Boolean &success) {  if (str == NULL)    return 0;  success = false;  char *endptr;  unsigned bits = 0;    endptr = str;  bits = (unsigned) strtolplusb(str,&endptr,0);  if (endptr != str) {    success = true;    return bits;  }  char *p = str;  const unsigned buflen = 2047;  char buff[buflen+1];  unsigned par = 1;  while (*p) {    // parse tokens of the form TAGNUMBER,TAGNUMBER,TAGNUMBER    // where TAG is one of the parent names, and number    // is the negative of the parent position. For example,    // given parents of the form    //    W : 3 M(-1) M(-2) M(0) S(-1) S(-2) S(+2)    // a valid string would be    //     M1,M0,S1,S2,S+2     // and which could correspond to parents    //    // M(-1), M(0), S(-1), S(-2), S(+2)    //    // and be bit vector    //      0b11101    // which is returned.        // get parent    char *parent = p;    while (*p && !isdigit(*p) && *p != '+' && *p != '-') {      p++;    }    char tmp = *p;    *p = '\0';    strncpy(buff,parent,buflen);    *p = tmp;    // get parent position    // NOTE: this is such that;    //   W1 == W-1, i.e., both are the previous word.    //   W+1 == the future word    // this means that W1 is really an index into the past, rather than the future,    // meaning that W1 is not the same as W+1. The reason for this is that    // people will much more commonly use language models with histories in the past rather    // than histories in the future, so the more common case uses fewer characters.    bool plusMinusPresent = (*p == '+' || *p == '-');    int parPos = strtol(p,&endptr,0);    if (endptr == p) {      fprintf(stderr,"Can't form integer at parent specifier %d in string (%s)\n",par,str);      return 0; // doesn't matter what we return here.    }    p = endptr;    // search for parent and position.    unsigned i;    for (i=0;i<numParents;i++) {      if (parentOffsets[i] == (!plusMinusPresent?(-1):(+1))*parPos &&	  strcmp(parents[i],buff) == 0) {	// found 	if (bits & (1<<i)) {	  // already set, might be an oversite or error by user, give warning	  fprintf(stderr,		  "WARNING: parent specifier (%s%d) at position %d given twice in string (%s)\n",		  buff,(!plusMinusPresent?(-1):(+1))*parPos,par,str);	}	bits |= (1<<i);	break;      }    }    if (i == numParents) {      fprintf(stderr,"Can't find a valid parent specifier with (%s%d) at position %d in string (%s)\n",	      buff,(!plusMinusPresent?(-1):(+1))*parPos,par,str);      return 0; // doesn't matter what we return here.    }     if (*p == ',')      p++;    par++;  }  success = true;  return bits;}// null for null stringstatic inline const char *nfns(const char *const str) {  return (str == NULL ? "NULL" : str);}static inline char bchar(const Boolean b) {  return (b?'T':'F');}template <class CountT> voidFNgramSpecs<CountT>::printFInfo(){  for (int i=0;i<(int)fnSpecArray.size();i++) {    fprintf(stderr, "----\nchild = [%s], %d parents\n",fnSpecArray[i].child,	   fnSpecArray[i].numParents);    for (int j=0;j<(int)fnSpecArray[i].numParents;j++) {      fprintf(stderr, "   parent %d = [%s(%d)] = [%d(%d)]\n",j,	     fnSpecArray[i].parents[j],	     fnSpecArray[i].parentOffsets[j],	     fnSpecArray[i].parentPositions[j],	     fnSpecArray[i].parentOffsets[j]);    }    fprintf(stderr, "   count filename = (%s)\n",fnSpecArray[i].countFileName);    for (int subset=0;subset<(1<<fnSpecArray[i].numParents);subset++) {      if (fnSpecArray[i].parentSubsets[subset].counts != NULL) {	fprintf(stderr,	       "   node 0x%X, constraint 0x%X, count object is %s\n",subset,	       fnSpecArray[i].parentSubsets[subset].backoffConstraint,	       (fnSpecArray[i].parentSubsets[subset].counts != NULL ? 		"allocated" : "unallocated"));	fprintf(stderr,	       "      gtmin=%d, gtmax=%d, gt=(%s), cdiscount=%f, ndiscount=%c, wbdiscount=%c,\n"	        "      kndiscount=%c, kn=(%s), interpolate=%c, write=(%s), backoffStrategy=%d\n",	       fnSpecArray[i].parentSubsets[subset].gtmin,	       fnSpecArray[i].parentSubsets[subset].gtmax,	       nfns(fnSpecArray[i].parentSubsets[subset].gtFile),	       fnSpecArray[i].parentSubsets[subset].cdiscount,	       bchar(fnSpecArray[i].parentSubsets[subset].ndiscount),	       bchar(fnSpecArray[i].parentSubsets[subset].wbdiscount),	       bchar(fnSpecArray[i].parentSubsets[subset].kndiscount),	       nfns(fnSpecArray[i].parentSubsets[subset].knFile),	       bchar(fnSpecArray[i].parentSubsets[subset].interpolate),	       nfns(fnSpecArray[i].parentSubsets[subset].writeFile),	       fnSpecArray[i].parentSubsets[subset].backoffStrategy);      }    }  }  fflush(stderr); // TODO: remove since always flushed}/* *  This breaks each word into factors or streams which presumably is formatted as: * *     <Tag1>-<factor1>:<Tag2>-<factor2>:...:<TagN>-<factorN> *  * Load them in the order that was constructed for the tags in FNgramSpecs() * constructor. That way, the position in the resulting matrix corresponds * to the tag position that several of the objects use such as FNgramStats. * * Duplicate tags for a word are ignored (the first one encountered on * the left is used, possibly issuing a warning). Missing tags for a * word are assumed to be <tag>-"<NULL>" I.e., for a morph class, a * missing morph tag would be assumed to be "M-<NULL>" where "<NULL>" * is the special null word, and "M" is the morph tag. This way, files * need not specify the nulls when they exist (and is the reason why NULL * is a special word). * * Any unknown tags are ignored (and a message is printed if the debug level is set * accordingly). That way, extra tags not used at the moment can stay in the file * without affecting anything. * * A word tag really should be there for the start/end sentence stuff * to work.  (if there is no word stream, a warning message will be * issued if debug > 0) * */template <class CountT>unsigned intFNgramSpecs<CountT>::loadWordFactors(const VocabString *words,				     WordMatrix& wm,				     unsigned int max){  if (!words)    return 0;  // parse words into factors  int i;  for (i=0; i<(int)max && words[i] != 0; i++) {    // TODO: call FactoredVocab::loadWordFactor() here instead of    // code below.    // assume we don't need to reclaim any of the word_factors[i][j], so    // just zero out our pointers here.    ::memset(wm.word_factors[i],0,(maxNumParentsPerChild+1)*sizeof(VocabString));    VocabString word = words[i];    VocabString word_p = word;    if (debug(DEBUG_EVERY_SENTENCE_INFO))      fprintf(stderr,"Processing word (%s)\n",word);    // word looks like:    // <Tag1>-<factor1>:<Tag2>-<factor2>:...:<TagN>-<factorN>    // if a tag is missing (i.e., just <factor_n>), then we    // assume it is a FNGRAM_WORD_TAG, which indicates     // it is a word.        Boolean tag_assigned = false;    // make a copy of word, for useful messages    char word_copy[2048];    strncpy(word_copy,word,2047);    Boolean last_factor = false;    while (!last_factor) {      char *end_p = (char *)strchr(word_p,FNGRAM_FACTOR_SEPARATOR);      if (end_p != NULL) {	// this is not last word	*end_p = '\0';      } else 	last_factor = true;      if (debug(DEBUG_EVERY_SENTENCE_INFO))	fprintf(stderr,"working on factor (%s)\n",word_p);      char *sep_p = (char *)strchr(word_p,FNGRAM_WORD_TAG_SEP);      if (sep_p == NULL) {	// no tag, assume word tag. Note, either all words must	// have a word tag "W-...", or no words can have a word tag. Otherwise,	// vocab object will assign two different wids for same word, one	// with wordtag and one without.	wm.word_factors[i][FNGRAM_WORD_TAG_POS] = word_p;	tag_assigned = true;      } else {	*sep_p = '\0';	unsigned* pos = tagPosition.find(word_p);	*sep_p = FNGRAM_WORD_TAG_SEP;	if (pos == NULL) {	  if (debug(DEBUG_TAG_WARNINGS)) {	    fprintf(stderr,"Warning: unknown tag in factor (%s) of word (%s) when parsing file\n",		    word_p,word_copy);	  }	  goto next_tag;	}	if (*pos == FNGRAM_WORD_TAG_POS) {	  // TODO: normalize word so that it either always uses a "W-" tag	  // or does not use a "W-" tag.	}	if (wm.word_factors[i][*pos] != NULL) {	  if (debug(DEBUG_WARN_DUP_TAG)) 	    fprintf(stderr,"Warning: tag given twice in word (%s) when parsing "		    "file. Using first instance.\n",word_copy);	} else	  wm.word_factors[i][*pos] = word_p;	tag_assigned = true;      }    next_tag:      word_p = end_p+1;    }    if (!tag_assigned) {      if (debug(DEBUG_TAG_WARNINGS)) {	fprintf(stderr,"Warning: no known tags in word (%s), treating all tags as NULLs",		word_copy);      }    }    // store any nulls    int j;    for (j=0;j<(int)tagPosition.numEntries();j++) {      if (wm.word_factors[i][j] == 0) {	wm.word_factors[i][j] = fvocab.tagNulls[j];      }    }    wm.word_factors[i][j] = 0;        }  if (debug(DEBUG_MISSING_FIRST_LAST_WORD)) {    // extra check for a word stream. If a word stream does not    // exist, there is no current way that the start/end sentence    // stuff will be added.    if (wm.word_factors[0][FNGRAM_WORD_TAG_POS] == fvocab.tagNulls[FNGRAM_WORD_TAG_POS])      fprintf(stderr,"Warning: using NULL for first word in sentence\n");    if (wm.word_factors[i-1][FNGRAM_WORD_TAG_POS] == fvocab.tagNulls[FNGRAM_WORD_TAG_POS])      fprintf(stderr,"Warning: using NULL for last word in sentence\n");  }  if (debug(DEBUG_EVERY_SENTENCE_INFO))    fprintf(stderr,"%d words in sentence\n",i);  if (i < (int)max) {    // zero out last one    ::memset(wm.word_factors[i],0,(maxNumParentsPerChild+1)*sizeof(VocabString));  }  if (debug(DEBUG_EVERY_SENTENCE_INFO))    wm.print(stderr);  return i;}template <class CountT>voidFNgramSpecs<CountT>::estimateDiscounts(FactoredVocab& vocab){  for (int i=0;i<(int)fnSpecArray.size();i++) {    // estimate the discounts in increasing level order in BG    // Change this for loop to:    //    for (int level=0;level<=fnSpecArray[i].numParents;level++) {    // to get meta, meta-meta, meta-meta-meta, (etc...) counts.    // And change this for loop to:    //    for (int level=fnSpecArray[i].numParents;level>=0;level--) {        // just to get meta level counts.#define METAMETAMETAETC 0#if METAMETAMETAETC    // fprintf(stderr,"Doing meta meta meta etc. counts\n");    for (int level=fnSpecArray[i].numParents;level>=0;level--) {    #else      // fprintf(stderr,"Doing just meta counts\n");    for (int level=0;level<=(int)fnSpecArray[i].numParents;level++) {#endif      typename FNgramSpec::LevelIter iter(fnSpecArray[i].numParents,level);      unsigned int subset;      while (iter.next(subset)) {	if (fnSpecArray[i].parentSubsets[subset].counts != NULL) {	  FDiscount *discount = 0;		  Boolean gt = false;	  if (fnSpecArray[i].parentSubsets[subset].ndiscount) {	    discount = 	      new FNaturalDiscount(fnSpecArray[i].parentSubsets[subset].gtmin);	    assert(discount);	  } else if (fnSpecArray[i].parentSubsets[subset].wbdiscount) {	    discount = new FWittenBell(fnSpecArray[i].parentSubsets[subset].gtmin);	    assert(discount);	  } else if (fnSpecArray[i].parentSubsets[subset].cdiscount != -1.0) {	    discount = 	      new FConstDiscount(fnSpecArray[i].parentSubsets[subset].cdiscount,				 fnSpecArray[i].parentSubsets[subset].gtmin);	    assert(discount);	  } else if (fnSpecArray[i].parentSubsets[subset].knFile ||		     fnSpecArray[i].parentSubsets[subset].kndiscount) {	    discount = 	      new FModKneserNey(fnSpecArray[i].parentSubsets[subset].gtmin, 			        fnSpecArray[i].parentSubsets[subset].knCountsModified,			        fnSpecArray[i].parentSubsets[subset].knCountsModifyAtEnd);	    assert(discount);	  } else if (fnSpecArray[i].parentSubsets[subset].knFile ||		     fnSpecArray[i].parentSubsets[subset].ukndiscount) {	    discount = 	      new FKneserNey(fnSpecArray[i].parentSubsets[subset].gtmin, 			     fnSpecArray[i].parentSubsets[subset].knCountsModified,			     fnSpecArray[i].parentSubsets[subset].knCountsModifyAtEnd);	    assert(discount);	  } else {	    gt = true;	    discount = new FGoodTuring(fnSpecArray[i].parentSubsets[subset].gtmin,				       fnSpecArray[i].parentSubsets[subset].gtmax);	    assert(discount);	  }	  discount->debugme(debuglevel());	  discount->interpolate = fnSpecArray[i].parentSubsets[subset].interpolate;	  Boolean estimated = false;	  if (fnSpecArray[i].parentSubsets[subset].knFile && 	      fnSpecArray[i].parentSubsets[subset].kndiscount) {	    File file(fnSpecArray[i].parentSubsets[subset].knFile,"r",0);	    if (!file.error()) {	      if (!discount->read(file)) {		fprintf(stderr,"error reading kn discount file (%s)\n",			fnSpecArray[i].parentSubsets[subset].knFile);		exit(-1);	      }	      estimated = true;	    }	  }	  if (!estimated && fnSpecArray[i].parentSubsets[subset].gtFile && gt) {	    File file(fnSpecArray[i].parentSubsets[subset].gtFile,"r",0);	    if (!file.error()) {	      if (!discount->read(file)) {		fprintf(stderr,"error reading gt discount file (%s)\n",			fnSpecArray[i].parentSubsets[subset].gtFile);		exit(-1);	      }	      estimated = true;	    }	  }	  if (!estimated) {	    vocab.setCurrentTagVocab(fnSpecArray[i].child);	    if (!discount->estimate(fnSpecArray[i],				    subset,				    vocab)) {	      // TODO: make better error message here.	      fprintf(stderr,"error in discount estimator\n");	      exit(-1);	    }	    estimated = true;	    if (fnSpecArray[i].parentSubsets[subset].kndiscount && 		fnSpecArray[i].parentSubsets[subset].knFile) {	      File file(fnSpecArray[i].parentSubsets[subset].knFile,"w");	      discount->write(file);	    } else if (gt && fnSpecArray[i].parentSubsets[subset].gtFile) {	      File file(fnSpecArray[i].parentSubsets[subset].gtFile,"w");	      discount->write(file);	    }	  }	  fnSpecArray[i].parentSubsets[subset].discount = discount;	}      }    }  }}template <class CountT>voidFNgramSpecs<CountT>::computeCardinalityFunctions(FactoredVocab& vocab){  for (int specNum=0;specNum<(int)fnSpecArray.size();specNum++) {    // child    vocab.setCurrentTagVocab(fnSpecArray[specNum].childPosition);    const unsigned numChildWords = vocab.currentTagVocabCardinality();    for (unsigned node=0;node<fnSpecArray[specNum].numSubSets;node++) {      fnSpecArray[specNum].parentSubsets[node].prodCardinalities 	= numChildWords;      fnSpecArray[specNum].parentSubsets[node].sumCardinalities 	= numChildWords;      fnSpecArray[specNum].parentSubsets[node].sumLogCardinalities 	= log10((double)numChildWords);      // parents      for (int par=0;par<(int)fnSpecArray[specNum].numParents;par++) {	if (node & (1<<par)) {	  vocab.setCurrentTagVocab(fnSpecArray[specNum].parentPositions[par]);	  const unsigned numParWords = vocab.currentTagVocabCardinality();	  fnSpecArray[specNum].parentSubsets[node].prodCardinalities 	    *= numParWords;	  fnSpecArray[specNum].parentSubsets[node].sumCardinalities 	    += numParWords;	  fnSpecArray[specNum].parentSubsets[node].sumLogCardinalities 	    += log((double)numParWords);	}      }    }  }} // return pointer to a static buff where// we've got the tag of a if any. template <class CountT>VocabStringFNgramSpecs<CountT>::getTag(VocabString a){  // TODO: this routine is a quick hack and should be redone properly.  if (!a)    return NULL;  static char buff[1024];  char* sep_p = (char *)strchr(a,FNGRAM_WORD_TAG_SEP);  if (sep_p == NULL)    return NULL;  *sep_p = '\0';  // make sure we don't overrun buffer and it's terminated  strncpy(buff,a,sizeof(buff)-1);  buff[sizeof(buff)-1] = '\0';  *sep_p = FNGRAM_WORD_TAG_SEP;  return &buff[0];}				       template <class CountT>VocabStringFNgramSpecs<CountT>::wordTag(){  return FNGRAM_WORD_TAG_STR;}#endif /* EXCLUDE_CONTRIB_END */#endif /* _FNgramSpecs_cc_ */
上一页 1 2 34
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -