📄 fngramspecs.cc
字号:
tokens[tok] << ")\n"; exit(-1); } if (nodeId < 0) { f.position() << "Error: couldn't form unsigned int in " << tokens[tok] << "for node specifier when reading factored spec file\n"; exit(-1); } if (nodeId >= numSubSets) { fprintf(stderr,"Error: node specifier must be between 0x0 and 0x%x inclusive\n", numSubSets-1); exit(-1); } tok++; // get backoff constraint for this node fnSpecArray[i].parentSubsets[nodeId].backoffConstraint = fnSpecArray[i].parseNodeString((char*)tokens[tok],success); if (!success) { f.position() << "Error: couldn't form BG node constraint in string (" << tokens[tok] << ")\n"; exit(-1); } tok++; // // Current set of Node Options // // gtmin [num] // gtmax [num] // gt [fileName string] // cdiscount [double] // ndiscount [] // wbdiscount [] // kndiscount [] // ukndiscount [] // kn-counts-modified [] // kn-counts-modify-at-end [] // kn [fileName string] // interpolate [] // write [fileName string] // strategy [option] // where [option] is one of: // counts_no_norm // counts_sum_counts_norm // counts_sum_num_words_norm // counts_prod_card_norm // counts_sum_card_norm // counts_sum_log_card_norm // bog_node_prob // startGetOptions: for (;tok<howmany;tok++) { if (strcmp(tokens[tok],"gtmin") == 0) { if (tok+1==howmany) { fprintf(stderr,"Error: gtmin argument needs a value" "reading factored spec file\n"); exit(-1); } tok++; if (sscanf(tokens[tok],"%d",&fnSpecArray[i].parentSubsets[nodeId].gtmin) != 1){ fprintf(stderr,"Error: gtmin argument needs integer value"); exit(-1); } } else if (strcmp(tokens[tok],"gtmax") == 0) { if (tok+1==howmany) { fprintf(stderr,"Error: gtmax argument needs a value" "reading factored spec file\n"); exit(-1); } tok++; if (sscanf(tokens[tok],"%d",&fnSpecArray[i].parentSubsets[nodeId].gtmax) != 1){ fprintf(stderr,"Error: gtmax argument needs integer value"); exit(-1); } } else if (strcmp(tokens[tok],"gt") == 0) { if (tok+1==howmany) { fprintf(stderr,"Error: gt argument needs a value" "reading factored spec file\n"); exit(-1); 
} tok++; delete [] fnSpecArray[i].parentSubsets[nodeId].gtFile; fnSpecArray[i].parentSubsets[nodeId].gtFile = strdup(tokens[tok]); } else if (strcmp(tokens[tok],"cdiscount") == 0) { if (tok+1==howmany) { fprintf(stderr,"Error: cdiscount argument needs a value"); exit(-1); } tok++; double tmp; char *endptr; tmp = strtod(tokens[tok],&endptr); if (endptr == tokens[tok]) { fprintf(stderr,"Error: cdiscount argument (%s) should be floating point value", tokens[tok]); exit(-1); } fnSpecArray[i].parentSubsets[nodeId].cdiscount = tmp; } else if (strcmp(tokens[tok],"ndiscount") == 0) { fnSpecArray[i].parentSubsets[nodeId].ndiscount = true; } else if (strcmp(tokens[tok],"wbdiscount") == 0) { fnSpecArray[i].parentSubsets[nodeId].wbdiscount = true; } else if (strcmp(tokens[tok],"kndiscount") == 0) { fnSpecArray[i].parentSubsets[nodeId].kndiscount = true; } else if (strcmp(tokens[tok],"ukndiscount") == 0) { fnSpecArray[i].parentSubsets[nodeId].ukndiscount = true; } else if (strcmp(tokens[tok],"kn-counts-modified") == 0) { fnSpecArray[i].parentSubsets[nodeId].knCountsModified = true; } else if (strcmp(tokens[tok],"kn-counts-modify-at-end") == 0) { fnSpecArray[i].parentSubsets[nodeId].knCountsModifyAtEnd= true; } else if (strcmp(tokens[tok],"kn-count-parent") == 0) { if (tok+1==howmany) { fprintf(stderr,"Error: kn-count-parent argument needs a parent specifier\n"); exit(-1); } tok++; unsigned par = fnSpecArray[i].parseNodeString((char*)tokens[tok],success); if (!success) { fprintf(stderr,"Error: kn-count-parent argument invalid\n"); exit(-1); } fnSpecArray[i].parentSubsets[nodeId].knCountParent = par; } else if (strcmp(tokens[tok],"kn") == 0) { if (tok+1==howmany) { fprintf(stderr,"Error: kn argument needs a value" "reading factored spec file\n"); exit(-1); } tok++; delete [] fnSpecArray[i].parentSubsets[nodeId].knFile; fnSpecArray[i].parentSubsets[nodeId].knFile = strdup(tokens[tok]); } else if (strcmp(tokens[tok],"interpolate") == 0) { 
fnSpecArray[i].parentSubsets[nodeId].interpolate = true; } else if (strcmp(tokens[tok],"write") == 0) { if (tok+1==howmany) { fprintf(stderr,"Error: write argument needs a value" "reading factored spec file\n"); exit(-1); } tok++; delete [] fnSpecArray[i].parentSubsets[nodeId].writeFile; fnSpecArray[i].parentSubsets[nodeId].writeFile = strdup(tokens[tok]); } else if (strcmp(tokens[tok],"strategy") == 0) { if (tok+1==howmany) { fprintf(stderr,"Error: strategy argument needs a value" "reading factored spec file\n"); exit(-1); } tok++; if (strcmp(tokens[tok],"counts_no_norm") == 0) { fnSpecArray[i].parentSubsets[nodeId].backoffStrategy = CountsNoNorm; } else if (strcmp(tokens[tok],"counts_sum_counts_norm") == 0) { fnSpecArray[i].parentSubsets[nodeId].backoffStrategy = CountsSumCountsNorm; } else if (strcmp(tokens[tok],"counts_sum_num_words_norm") == 0) { fnSpecArray[i].parentSubsets[nodeId].backoffStrategy = CountsSumNumWordsNorm; } else if (strcmp(tokens[tok],"counts_prod_card_norm") == 0) { fnSpecArray[i].parentSubsets[nodeId].backoffStrategy = CountsProdCardinalityNorm; } else if (strcmp(tokens[tok],"counts_sum_card_norm") == 0) { fnSpecArray[i].parentSubsets[nodeId].backoffStrategy = CountsSumCardinalityNorm; } else if (strcmp(tokens[tok],"counts_sum_log_card_norm") == 0) { fnSpecArray[i].parentSubsets[nodeId].backoffStrategy = CountsSumLogCardinalityNorm; } else if (strcmp(tokens[tok],"bog_node_prob") == 0) { fnSpecArray[i].parentSubsets[nodeId].backoffStrategy = BogNodeProb; } else { fprintf(stderr,"Error: unknown strategy argument (%s) when " "reading factored spec file\n",tokens[tok]); exit(-1); } } else if (strcmp(tokens[tok],"combine") == 0) { if (tok+1==howmany) { fprintf(stderr,"Error: combine argument needs a value " "reading factored spec file\n"); exit(-1); } tok++; if (strcmp(tokens[tok],"max") == 0) { fnSpecArray[i].parentSubsets[nodeId].backoffCombine = MaxBgChild; } else if (strcmp(tokens[tok],"min") == 0) { 
fnSpecArray[i].parentSubsets[nodeId].backoffCombine = MinBgChild; } else if ((strcmp(tokens[tok],"avg") == 0) || (strcmp(tokens[tok],"mean") == 0)) { fnSpecArray[i].parentSubsets[nodeId].backoffCombine = AvgBgChild; } else if (strcmp(tokens[tok],"wmean") == 0) { fnSpecArray[i].parentSubsets[nodeId].backoffCombine = WmeanBgChild; // next set of tokens must have a combination of (node_spec, weight) // for each child. // we compute this below, but we compute it here since we don't // have the quantity numBGchildren yet. typename FNgramSpec::BGChildIterCnstr citer(fnSpecArray[i].numParents,nodeId,fnSpecArray[i].parentSubsets[nodeId].backoffConstraint); unsigned int numChildrenUsed = 0; for (unsigned child;citer.next(child);) { numChildrenUsed++; } if (tok+2*numChildrenUsed >= (unsigned)howmany) { f.position() << "Error: combine wmean needs " << numChildrenUsed << " node & weight pairs, one for each child\n"; exit(-1); } // TODO: add to destructor LogP2 *wmean = new LogP2[numChildrenUsed]; for (unsigned cnum=0;cnum<numChildrenUsed;cnum++) { wmean[cnum] = 0.0; } tok++; for (unsigned cnum=0;cnum<numChildrenUsed;cnum++) { double value; unsigned int childSpec = fnSpecArray[i].parseNodeString((char*)tokens[tok],success); if (!success) { f.position() << "Error: combine wmean invalid node specifier\n"; exit(-1); } tok++; char *endptr; value = strtod(tokens[tok],&endptr); if (endptr == tokens[tok] || value < 0.0) { f.position() << "Error: combine wmean invalid weight value\n"; exit(-1); } citer.init(); unsigned int cpos = 0; for (unsigned child;citer.next(child);) { if (!(~child & (fnSpecArray[i].parentSubsets[nodeId].backoffConstraint & nodeId))) continue; if (child == childSpec) break; cpos++; } if (cpos == numChildrenUsed) { f.position() << "Error: combine wmean, invalid child node given\n"; exit(-1); } // load them in the array in the order that they will // be encountered when doing a child iter. 
wmean[cpos] = value; tok++; } double sum = 0; for (unsigned cnum=0;cnum<numChildrenUsed;cnum++) { sum += wmean[cnum]; } // normalize and convert to logp for (unsigned cnum=0;cnum<numChildrenUsed;cnum++) { wmean[cnum] = ProbToLogP(wmean[cnum]) - ProbToLogP(sum); } fnSpecArray[i].parentSubsets[nodeId].wmean = wmean; } else if (strcmp(tokens[tok],"sum") == 0) { fnSpecArray[i].parentSubsets[nodeId].backoffCombine = SumBgChild; } else if (strcmp(tokens[tok],"prod") == 0) { fnSpecArray[i].parentSubsets[nodeId].backoffCombine = ProdBgChild; } else if (strcmp(tokens[tok],"gmean") == 0) { fnSpecArray[i].parentSubsets[nodeId].backoffCombine = GmeanBgChild; } else { fprintf(stderr,"Error: unknown combine argument (%s) when " "reading factored spec file\n",tokens[tok]); exit(-1); } } else if ((strcmp(tokens[tok],"\\") == 0) && tok == (howmany-1)) { // do poor man's next line parsing. line = f.getline(); if (line == 0) { f.position() << "Error: File::getline() returned 0 when reading FLM spec file\n"; exit(-1); } howmany = Vocab::parseWords(line,tokens,128); tok = 0; goto startGetOptions; } else { fprintf(stderr,"Error: unknown argument (%s) when" "reading factored spec file\n",tokens[tok]); exit(-1); } } if (fnSpecArray[i].parentSubsets[nodeId].backoffCombine == SumBgChild && fnSpecArray[i].parentSubsets[nodeId].interpolate) { f.position() << "WARNING: using 'interpolate' and 'combine sum' together\n"; } } if (debug(DEBUG_EXTREME)) { // debug all the iterators. 
for (int level=fnSpecArray[i].numParents; level>=0; level--) { typename FNgramSpec::LevelIter iter(fnSpecArray[i].numParents,level); fprintf(stderr, "level 0x%X:",level); unsigned int node; while (iter.next(node)) { fprintf(stderr, " node 0x%X,",node); } fprintf(stderr, "\n"); } for (int node=0;node<numSubSets;node++) { fprintf(stderr, "node 0x%X\n",node); typename FNgramSpec::BGParentIter piter(fnSpecArray[i].numParents,node); for (unsigned parent=0;piter.next(parent);) { fprintf(stderr, "parent 0x%X,",parent); } fprintf(stderr, "\n"); typename FNgramSpec::BGAncestorIter aiter(fnSpecArray[i].numParents,node); for (unsigned ancestor;aiter.next(ancestor);) { fprintf(stderr, "ancestor 0x%X,",ancestor); } fprintf(stderr, "\n"); typename FNgramSpec::BGChildIter citer(fnSpecArray[i].numParents,node); for (unsigned child;citer.next(child);) { fprintf(stderr, "child 0x%X,",child); } fprintf(stderr, "\n"); typename FNgramSpec::BGDescendantIter diter(fnSpecArray[i].numParents,node); for (unsigned des;diter.next(des);) { fprintf(stderr, "descendant 0x%X,",des); } fprintf(stderr, "\n"); } fflush(stderr); } // only create counts objects for nodes that are to be used fnSpecArray[i].parentSubsets[numSubSets-1].counts = new FNgramNode; fnSpecArray[i].parentSubsets[numSubSets-1].order = numBitsSet(numSubSets-1)+1; // descend down the BG, level by level for (int level=fnSpecArray[i].numParents;level>=0;level--) { typename FNgramSpec::LevelIter liter(fnSpecArray[i].numParents,level); Boolean allAreNull = true; for (unsigned nodeAtLevel;liter.next(nodeAtLevel);) { if (fnSpecArray[i].parentSubsets[nodeAtLevel].counts == NULL) continue; allAreNull = false; typename FNgramSpec::BGChildIterCnstr citer(fnSpecArray[i].numParents,nodeAtLevel,fnSpecArray[i].parentSubsets[nodeAtLevel].backoffConstraint); unsigned int numChildrenUsed = 0; for (unsigned child;citer.next(child);) { if (fnSpecArray[i].parentSubsets[child].counts == NULL) { fnSpecArray[i].parentSubsets[child].counts = new 
FNgramNode; fnSpecArray[i].parentSubsets[child].order = numBitsSet(child)+1; } numChildrenUsed++; // make sure kn-count-parent has counts itself. if (fnSpecArray[i].parentSubsets[child].knCountParent != ~0x0) { const unsigned kncp = fnSpecArray[i].parentSubsets[child].knCountParent; if (kncp >= (unsigned)numSubSets || fnSpecArray[i].parentSubsets[kncp].counts == NULL) { f.position() << "Error: kn-counts-parent argument " << HEX << kncp << DEC << " must specify a parent that exists and is in use\n"; exit(-1); } } } // everybody must have a child. if (nodeAtLevel > 0 && numChildrenUsed == 0) { fprintf(stderr,"ERROR: backoff graph node 0x%X has no children with backoff constraint 0x%X. Must have at least one child.\n",nodeAtLevel,fnSpecArray[i].parentSubsets[nodeAtLevel].backoffConstraint); exit(-1); } fnSpecArray[i].parentSubsets[nodeAtLevel].numBGChildren = numChildrenUsed; } if (allAreNull) { // no count object was created // NOTE: we might not want to consider this an error, if we for example // want not to backoff to lower levels in backoff graph. In that case, // probabilities would become zero, however. 
fprintf(stderr,"ERROR: backoff constraints leave level %d of backoff graph " "entirely unexpanded, lower distribution order never reached\n",level); exit(-1); } } if (debug(DEBUG_BG_PRINT)) { fprintf(stderr, "Language Model %d --------------\n",i); for (int level=fnSpecArray[i].numParents;level>=0;level--) { fprintf(stderr, "-- Level %d\n",level); typename FNgramSpec::LevelIter liter(fnSpecArray[i].numParents,level); for (unsigned nodeAtLevel;liter.next(nodeAtLevel);) { if (fnSpecArray[i].parentSubsets[nodeAtLevel].counts == NULL) continue; fprintf(stderr, " Node: "); fnSpecArray[i].printNodeString(stderr,nodeAtLevel); fprintf(stderr, " (0x%X), Constraint: ",nodeAtLevel); fnSpecArray[i].printNodeString(stderr, fnSpecArray[i].parentSubsets[nodeAtLevel].backoffConstraint); fprintf(stderr, " (0x%X)\n",fnSpecArray[i].parentSubsets[nodeAtLevel].backoffConstraint); fprintf(stderr, " %d Children:",fnSpecArray[i].parentSubsets[nodeAtLevel].numBGChildren); typename FNgramSpec::BGChildIterCnstr citer(fnSpecArray[i].numParents,nodeAtLevel,fnSpecArray[i].parentSubsets[nodeAtLevel].backoffConstraint); Boolean do_comma = false; for (unsigned child;citer.next(child);) { if (fnSpecArray[i].parentSubsets[child].counts != NULL) { fprintf(stderr, (do_comma?"; ":" ")); fnSpecArray[i].printNodeString(stderr,child); fprintf(stderr, " (0x%X)",child); } do_comma = true; } fprintf(stderr, "\n"); } } } } if (nextPosition > maxNumParentsPerChild) { f.position() << "Error: may only have at most " << maxNumParentsPerChild << " distinct tags\n"; exit(-1); } LHashIter<VocabString,unsigned> tags(tagPosition); VocabString tag; unsigned *pos; char buff[2048]; while ((pos = tags.next(tag)) != NULL) { // TODO: WARNING: should use strncat here. 
// Still inside the tag-iteration loop opened above: for each declared tag,
// synthesize the tag-qualified variants of the special vocabulary words by
// concatenating "<tag><tag-sep><special-word>" into buff and strdup'ing the
// result.  NOTE(review): buff is a fixed 2048-byte stack buffer and the
// strcat chain is unchecked (the TODO just above already says strncat
// should be used) -- an overlong tag name would overflow it.
buff[0] = '\0';
fvocab.tagNulls[*pos] =		// tag-specific NULL-factor token
  strdup(strcat(strcat(strcat(buff,tag),FNGRAM_WORD_TAG_SEP_STR),FNGRAM_WORD_TAG_NULL_SPEC_STR));
buff[0] = '\0';
fvocab.tagUnks[*pos] =		// tag-specific unknown-word token
  strdup(strcat(strcat(strcat(buff,tag),FNGRAM_WORD_TAG_SEP_STR),Vocab_Unknown));
buff[0] = '\0';
// NOTE(review): tagSes is paired with Vocab_SentStart and tagSss with
// Vocab_SentEnd here; the member names suggest the opposite pairing --
// confirm against the FactoredVocab declarations before changing anything.
fvocab.tagSes[*pos] =
  strdup(strcat(strcat(strcat(buff,tag),FNGRAM_WORD_TAG_SEP_STR),Vocab_SentStart));
buff[0] = '\0';
fvocab.tagSss[*pos] =
  strdup(strcat(strcat(strcat(buff,tag),FNGRAM_WORD_TAG_SEP_STR),Vocab_SentEnd));
buff[0] = '\0';
fvocab.tagPauses[*pos] =	// tag-specific pause token
  strdup(strcat(strcat(strcat(buff,tag),FNGRAM_WORD_TAG_SEP_STR),Vocab_Pause));
}	// end of tag-iteration loop

// Optional dump of the fully parsed factored-spec state.
if (debug(DEBUG_VERY_VERBOSE))
  printFInfo();
}	// end of enclosing spec-reading function (its opening is outside this chunk)

/*
 * printNodeString --
 *	Print a human-readable label for a backoff-graph node to stream f.
 *
 *	`node` is a bit mask over this spec's numParents parent variables:
 *	for each bit i that is set, the parent factor name parents[i] and its
 *	signed time offset parentOffsets[i] are printed as "name+d"/"name-d"
 *	(%+d forces an explicit sign), comma-separated in ascending bit order.
 *
 *	Always returns true.
 */
template <class CountT>
Boolean
FNgramSpecs<CountT>::FNgramSpec::printNodeString(FILE *f, unsigned int node)
{
  Boolean do_comma = false;
  for (unsigned i = 0; i < numParents; i++) {
    if (node & (1<<i)) {
      fprintf(f, "%s%s%+d",
	      (do_comma ? "," : ""),	// separator only after the first factor
	      parents[i],
	      parentOffsets[i]);
      do_comma = true;
    }
  }
  return true;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -