📄 fngramspecs.cc

📁 这是一款很好用的工具包
💻 CC
📖 第 1 页 / 共 4 页
字号:
    }  }  return false;}/* * parent iter code */template <class CountT>FNgramSpecs<CountT>::FNgramSpec::BGParentIter::BGParentIter(const unsigned int _numParents,					    const unsigned int _homeNode)  : numParents(_numParents),numNodes(1<<_numParents),homeNode(_homeNode),    numBitsSetOfHomeNode(numBitsSet(homeNode)){  init();}template <class CountT> BooleanFNgramSpecs<CountT>::FNgramSpec::BGParentIter::next(unsigned int&node) {  for (;state<numNodes;state++) {    // all bits in homeNode must also be on in parent=state    if (((homeNode & state) == homeNode) &&	(numBitsSet(state) == (numBitsSetOfHomeNode+1))) {      node = state++;      return true;    }  }  return false;}/* * grandparent iter code */template <class CountT>FNgramSpecs<CountT>::FNgramSpec::BGGrandParentIter::BGGrandParentIter(const unsigned int _numParents,					    const unsigned int _homeNode,					    const unsigned int _great)  : numParents(_numParents),numNodes(1<<_numParents),homeNode(_homeNode),    numBitsSetOfHomeNode(numBitsSet(homeNode)),great(_great){  init();}template <class CountT> BooleanFNgramSpecs<CountT>::FNgramSpec::BGGrandParentIter::next(unsigned int&node) {  for (;state<numNodes;state++) {    // all bits in homeNode must also be on in parent=state    if (((homeNode & state) == homeNode) &&	(numBitsSet(state) == (numBitsSetOfHomeNode+2+great))) {      node = state++;      return true;    }  }  return false;}/* * ancestor iter code */template <class CountT>FNgramSpecs<CountT>::FNgramSpec::BGAncestorIter::BGAncestorIter(const unsigned int _numParents,					    const unsigned int _homeNode)  : numParents(_numParents),numNodes(1<<_numParents),homeNode(_homeNode),    numBitsSetOfHomeNode(numBitsSet(homeNode)){  init();}template <class CountT> BooleanFNgramSpecs<CountT>::FNgramSpec::BGAncestorIter::next(unsigned int&node) {  for (;state<numNodes;state++) {    // all bits in homeNode must also be on in parent=state    if ((homeNode & state) == homeNode) {      node = state++;      return true;    }  }  return false;}/* * child iter code */template <class CountT>FNgramSpecs<CountT>::FNgramSpec::BGChildIter::BGChildIter(const unsigned int _numParents,					  const unsigned int _homeNode)  : numParents(_numParents),numNodes(1<<_numParents),homeNode(_homeNode),    numBitsSetOfHomeNode(numBitsSet(homeNode)){  init();}template <class CountT>  BooleanFNgramSpecs<CountT>::FNgramSpec::BGChildIter::next(unsigned int&node) {  for (;state>=0;state--) {    // all bits in child=state must also be on in homeNode    if (((state & homeNode) == state) &&	((1+numBitsSet(state)) == numBitsSetOfHomeNode)) {      node = state--;      return true;    }  }  return false;}/* * child iter with constraints code */template <class CountT>FNgramSpecs<CountT>::FNgramSpec::BGChildIterCnstr::BGChildIterCnstr( const unsigned int _numParents, const unsigned int _homeNode, const unsigned int _bo_constraints )  : numParents(_numParents),numNodes(1<<_numParents),homeNode(_homeNode),    bo_constraints(_bo_constraints),    numBitsSetOfHomeNode(numBitsSet(homeNode)){  init();}template <class CountT>  BooleanFNgramSpecs<CountT>::FNgramSpec::BGChildIterCnstr::next(unsigned int&node) {  for (;state>=0;state--) {    if ((// all bits in child=state must also be on in homeNode	 ((state & homeNode) == state) &&	 ((1+numBitsSet(state)) == numBitsSetOfHomeNode))	&&	(// child can validly come from a parent with current BO constraints	 ~state & 	 (bo_constraints	  & homeNode)))	{	  node = state--;	  return true;	}  }  return false;}/* * grand child iter code */template <class CountT>FNgramSpecs<CountT>::FNgramSpec::BGGrandChildIter::BGGrandChildIter(const unsigned int _numParents,							       const unsigned int _homeNode,							       const unsigned int _great)  : numParents(_numParents),numNodes(1<<_numParents),homeNode(_homeNode),    numBitsSetOfHomeNode(numBitsSet(homeNode)),great(_great){  init();}template <class CountT>  BooleanFNgramSpecs<CountT>::FNgramSpec::BGGrandChildIter::next(unsigned int&node) {  for (;state>=0;state--) {    // all bits in child=state must also be on in homeNode    if (((state & homeNode) == state) &&	((great+2+numBitsSet(state)) == numBitsSetOfHomeNode)) {      node = state--;      return true;    }  }  return false;}/* * descendant iter code */template <class CountT>FNgramSpecs<CountT>::FNgramSpec::BGDescendantIter::BGDescendantIter(const unsigned int _numParents,					  const unsigned int _homeNode)  : numParents(_numParents),numNodes(1<<_numParents),homeNode(_homeNode),    numBitsSetOfHomeNode(numBitsSet(homeNode)){  init();}template <class CountT>  BooleanFNgramSpecs<CountT>::FNgramSpec::BGDescendantIter::next(unsigned int&node) {  for (;state>=0;state--) {    // all bits in child=state must also be on in homeNode    if ((state & homeNode) == state) {      node = state--;      return true;    }  }  return false;}/* **************************************** * main FNgramSpecs constructor         * **************************************** */template <class CountT>FNgramSpecs<CountT>::FNgramSpecs(File& f,				 FactoredVocab& fv,				 unsigned debuglevel)  : fvocab(fv), Debug(debuglevel){  if (f.error())    return;  // parse the input file which we now assume to be open and valid.  char *line;  // position of word is always zero  *tagPosition.insert(FNGRAM_WORD_TAG_STR) = FNGRAM_WORD_TAG_POS;  // this initialization assumes that FNGRAM_WORD_TAG_POS == 0  int nextPosition = 1;  // get number of CS Ngrams  line = f.getline();  // printf("line = (%s)\n",line);  if (line == 0) {    // not sure what to do here with these errors, so we just exit with a message.    f.position() << "Error: File::getline() returned 0 when reading FLM spec file\n";    exit(-1);  }  register char *p = line;  // skip space to next token  while (*p && isspace(*p)) p++;  //////////////////////////////////////////////////////////////  // get number of LM specs that are being given here  char* endptr = p;  int n_csngrams = (int) strtol(p,&endptr,0);  if (endptr == p) {    f.position() << "Error: couldn't form int for number of factored LMs in when reading FLM spec file\n";    exit(-1);  }  p = endptr;  // each FLM gram spec is on a diff line.  for (int i=0;i<n_csngrams;i++) {    // Parse a chunk of text of the form:     // W : 4 W(-1) M(0) S(0) R(0) count_filename lm_filename num_node_specs    //    <node_spec_1> <node_constraint_1> [optional_node_options]    //    <node_spec_2> <node_constraint_2> [optional_node_options]    //    ...    //    <node_spec_N> <node_constraint_N> [optional_node_options]    //    char *token;    char *parse_state;    char tmp;    line = f.getline();    // TODO: allow this to be multi-line    if (line == 0) {      f.position() << "Error: File::getline() returned 0 when reading FLM spec file\n";      exit(-1);    }    // skip blanks    p = line; while (*p && isspace(*p)) p++;    if (!isalnum(*p)) {      f.position() << "Error: expecting child spec in FLM in when reading FLM spec file\n";      exit(-1);    }    ////////////////////////////////////        // pull out the name of the child    if (!*p) {      f.position() << "Error: couldn't get child name when reading factor spec file\n";      exit(-1);          }    token = p;    do { p++; } while (*p && *p != ' ' && *p != '\t' && *p != '\r' && *p != ':');    tmp = *p; *p = '\0'; // C string parsing is beautiful, isn't it.    fnSpecArray[i].child = strdup(token); // TODO: finish destructor and free all strdups.    *p = tmp;    // insert the tag    Boolean found;    unsigned *pos_p = tagPosition.insert(fnSpecArray[i].child,found);    if (!found)      *pos_p = nextPosition++;    fnSpecArray[i].childPosition = *pos_p;    // skip to next token    while (*p && (*p == ' ' || *p == '\t' || *p == '\r' || *p == ':')) p++;    ////////////////////////////////////    // get num parents    if (!*p) {      f.position() << "Error: couldn't get number parents when reading factor spec file\n";      exit(-1);          }    endptr = p;    fnSpecArray[i].numParents = (int) strtol(p,&endptr,0);    if (endptr == p) {      f.position() << "Error: couldn't form int for number FN-grams in when reading FLM spec file\n";      exit(-1);    }    p = endptr;    if (fnSpecArray[i].numParents > maxNumParentsPerChild) {      f.position()	  << "Error: number parents must not be negative or greater than " 	  << maxNumParentsPerChild << "\n";     }    // skip space to next token    while (*p && isspace(*p)) p++;    fnSpecArray[i].numSubSets = 1<<fnSpecArray[i].numParents;    for (int j=0;j<(int)fnSpecArray[i].numParents;j++) {      ////////////////////////////////////      // get name of parent      if (!*p) {	f.position() << "Error: couldn't get parent name when reading factor spec file\n";	exit(-1);            }      token = p;      do { p++; } while (*p && *p != ' ' && *p != '\t' && *p != '\r' && *p != '(');      tmp = *p; *p = '\0';      fnSpecArray[i].parents[j] = strdup(token);      *p = tmp;      // insert the tag      Boolean found;      pos_p = tagPosition.insert(fnSpecArray[i].parents[j],found);      if (!found)	*pos_p = nextPosition++;      fnSpecArray[i].parentPositions[j] = *pos_p;      // skip to next token  (NOTE: this would accept multiple '(' chars.)      while (*p && (*p == ' ' || *p == '\t' || *p == '\r' || *p == '(')) p++;      ////////////////////////////////////      // get offset      if (!*p) {	f.position() << "Error: couldn't get parent offset when reading factor spec file\n";	exit(-1);            }      endptr = p;      fnSpecArray[i].parentOffsets[j] = (int) strtol(p,&endptr,0);      if (endptr == p) {	f.position() << "Error: couldn't form int for number FN-grams in when reading FLM spec file\n";	exit(-1);      }      // Future Language Model Support: allow offsets to come from the future as well as the past      // if (fnSpecArray[i].parentOffsets[j] > 0) {      //    f.position() << "Error: can't have positive parent offset in structure file\n";      //    exit(-1);      // }      if (fnSpecArray[i].parentOffsets[j] == 0 && fnSpecArray[i].parentPositions[j] == fnSpecArray[i].childPosition) {	f.position() << "Error: parent and child can not be the same\n";	exit(-1);      }      // make sure that the same parent is not specified twice.      for (int l=0;l<j;l++) {	if ((fnSpecArray[i].parentPositions[j] == fnSpecArray[i].parentPositions[l]) &&	    (fnSpecArray[i].parentOffsets[j] == fnSpecArray[i].parentOffsets[l])) {	  f.position() << "Error: cannot specify same parent more than once\n";	  exit(-1);	}      }      p = endptr;      // skip to next token (NOTE: this would accept multiple ')' chars.)      while (*p && (*p == ' ' || *p == '\t' || *p == '\r' || *p == ')')) p++;    }    ////////////////////////////////////    // get Count file name    while (*p && isspace(*p)) p++;    if (!(*p)) {      f.position() << "Error: couldn't get count file name when reading factor spec file\n";      exit(-1);          }    token = p;    do { p++; } while (*p && *p != ' ' && *p != '\t' && *p != '\r' && *p != '\n');    tmp = *p; *p = '\0';        fnSpecArray[i].countFileName = strdup(token);    *p = tmp;    ////////////////////////////////////    // get LM file name    // skip space to next token    while (*p && isspace(*p)) p++;    if (!*p) {      f.position() << "Error: couldn't get LM file name when reading factor spec file\n";      exit(-1);    }    token = p;    do { p++; } while (*p && *p != ' ' && *p != '\t' && *p != '\r' && *p != '\n');    tmp = *p; *p = '\0';        fnSpecArray[i].lmFileName = strdup(token);    *p = tmp;    ////////////////////////////////////    // get number of nodes that have node specs    // skip space to next token    while (*p && isspace(*p)) p++;    if (!*p) {      f.position() << "Error: couldn't get num node specs name when reading factor spec file\n";      exit(-1);          }    endptr = p;    int numNodeSpecs = 0;    numNodeSpecs = (int) strtol(p,&endptr,0);    if (endptr == p || numNodeSpecs < 0) {      f.position() << "Error: couldn't form unsigned int for number node specs when reading FLM spec file\n";      exit(-1);    }    // finally! done with this line, now get node specs    const int numSubSets = 1<<fnSpecArray[i].numParents;    // next set of numNodeSpecs lines contain node specs    for (int j=0;j<numNodeSpecs;j++) {      // line should have the form      // NODE_NUM BACKOFFCONSTRAINT <options>      // options include what ngram-count.cc uses on comand line for      // discount options. I've given up on any (even slighty) fancy       // C string parsing for now, so this is just a string of tokens which      // are parsed in a very simple way.      // TODO: do a proper multi-line tokenizer here.      line = f.getline();      // printf("line = (%s)\n",line);      if (line == 0) {	f.position() << "Error: File::getline() returned 0 when readin FLM spec file\n";	exit(-1);      }      VocabString tokens[128];      int howmany = Vocab::parseWords(line,tokens,128);      if (howmany < 2) {	f.position() << "Error: specifier must at least specify node id and back-off constraint\n";	exit(-1);      }      int tok = 0;            // get node id      int nodeId = 0x0;      Boolean success;      nodeId = (int) fnSpecArray[i].parseNodeString((char*)tokens[tok],success);      if (!success) {	f.position() << "Error: couldn't form BG node specifier in string (" <<
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -