📄 fngramspecs.cc
字号:
} } return false;}/* * parent iter code */template <class CountT>FNgramSpecs<CountT>::FNgramSpec::BGParentIter::BGParentIter(const unsigned int _numParents, const unsigned int _homeNode) : numParents(_numParents),numNodes(1<<_numParents),homeNode(_homeNode), numBitsSetOfHomeNode(numBitsSet(homeNode)){ init();}template <class CountT> BooleanFNgramSpecs<CountT>::FNgramSpec::BGParentIter::next(unsigned int&node) { for (;state<numNodes;state++) { // all bits in homeNode must also be on in parent=state if (((homeNode & state) == homeNode) && (numBitsSet(state) == (numBitsSetOfHomeNode+1))) { node = state++; return true; } } return false;}/* * grandparent iter code */template <class CountT>FNgramSpecs<CountT>::FNgramSpec::BGGrandParentIter::BGGrandParentIter(const unsigned int _numParents, const unsigned int _homeNode, const unsigned int _great) : numParents(_numParents),numNodes(1<<_numParents),homeNode(_homeNode), numBitsSetOfHomeNode(numBitsSet(homeNode)),great(_great){ init();}template <class CountT> BooleanFNgramSpecs<CountT>::FNgramSpec::BGGrandParentIter::next(unsigned int&node) { for (;state<numNodes;state++) { // all bits in homeNode must also be on in parent=state if (((homeNode & state) == homeNode) && (numBitsSet(state) == (numBitsSetOfHomeNode+2+great))) { node = state++; return true; } } return false;}/* * ancestor iter code */template <class CountT>FNgramSpecs<CountT>::FNgramSpec::BGAncestorIter::BGAncestorIter(const unsigned int _numParents, const unsigned int _homeNode) : numParents(_numParents),numNodes(1<<_numParents),homeNode(_homeNode), numBitsSetOfHomeNode(numBitsSet(homeNode)){ init();}template <class CountT> BooleanFNgramSpecs<CountT>::FNgramSpec::BGAncestorIter::next(unsigned int&node) { for (;state<numNodes;state++) { // all bits in homeNode must also be on in parent=state if ((homeNode & state) == homeNode) { node = state++; return true; } } return false;}/* * child iter code */template <class CountT>FNgramSpecs<CountT>::FNgramSpec::BGChildIter::BGChildIter(const unsigned int _numParents, const unsigned int _homeNode) : numParents(_numParents),numNodes(1<<_numParents),homeNode(_homeNode), numBitsSetOfHomeNode(numBitsSet(homeNode)){ init();}template <class CountT> BooleanFNgramSpecs<CountT>::FNgramSpec::BGChildIter::next(unsigned int&node) { for (;state>=0;state--) { // all bits in child=state must also be on in homeNode if (((state & homeNode) == state) && ((1+numBitsSet(state)) == numBitsSetOfHomeNode)) { node = state--; return true; } } return false;}/* * child iter with constraints code */template <class CountT>FNgramSpecs<CountT>::FNgramSpec::BGChildIterCnstr::BGChildIterCnstr( const unsigned int _numParents, const unsigned int _homeNode, const unsigned int _bo_constraints ) : numParents(_numParents),numNodes(1<<_numParents),homeNode(_homeNode), bo_constraints(_bo_constraints), numBitsSetOfHomeNode(numBitsSet(homeNode)){ init();}template <class CountT> BooleanFNgramSpecs<CountT>::FNgramSpec::BGChildIterCnstr::next(unsigned int&node) { for (;state>=0;state--) { if ((// all bits in child=state must also be on in homeNode ((state & homeNode) == state) && ((1+numBitsSet(state)) == numBitsSetOfHomeNode)) && (// child can validly come from a parent with current BO constraints ~state & (bo_constraints & homeNode))) { node = state--; return true; } } return false;}/* * grand child iter code */template <class CountT>FNgramSpecs<CountT>::FNgramSpec::BGGrandChildIter::BGGrandChildIter(const unsigned int _numParents, const unsigned int _homeNode, const unsigned int _great) : numParents(_numParents),numNodes(1<<_numParents),homeNode(_homeNode), numBitsSetOfHomeNode(numBitsSet(homeNode)),great(_great){ init();}template <class CountT> BooleanFNgramSpecs<CountT>::FNgramSpec::BGGrandChildIter::next(unsigned int&node) { for (;state>=0;state--) { // all bits in child=state must also be on in homeNode if (((state & homeNode) == state) && ((great+2+numBitsSet(state)) == numBitsSetOfHomeNode)) { node = state--; return true; } } return false;}/* * descendant iter code */template <class CountT>FNgramSpecs<CountT>::FNgramSpec::BGDescendantIter::BGDescendantIter(const unsigned int _numParents, const unsigned int _homeNode) : numParents(_numParents),numNodes(1<<_numParents),homeNode(_homeNode), numBitsSetOfHomeNode(numBitsSet(homeNode)){ init();}template <class CountT> BooleanFNgramSpecs<CountT>::FNgramSpec::BGDescendantIter::next(unsigned int&node) { for (;state>=0;state--) { // all bits in child=state must also be on in homeNode if ((state & homeNode) == state) { node = state--; return true; } } return false;}/* **************************************** * main FNgramSpecs constructor * **************************************** */template <class CountT>FNgramSpecs<CountT>::FNgramSpecs(File& f, FactoredVocab& fv, unsigned debuglevel) : fvocab(fv), Debug(debuglevel){ if (f.error()) return; // parse the input file which we now assume to be open and valid. char *line; // position of word is always zero *tagPosition.insert(FNGRAM_WORD_TAG_STR) = FNGRAM_WORD_TAG_POS; // this initialization assumes that FNGRAM_WORD_TAG_POS == 0 int nextPosition = 1; // get number of CS Ngrams line = f.getline(); // printf("line = (%s)\n",line); if (line == 0) { // not sure what to do here with these errors, so we just exit with a message. f.position() << "Error: File::getline() returned 0 when reading FLM spec file\n"; exit(-1); } register char *p = line; // skip space to next token while (*p && isspace(*p)) p++; ////////////////////////////////////////////////////////////// // get number of LM specs that are being given here char* endptr = p; int n_csngrams = (int) strtol(p,&endptr,0); if (endptr == p) { f.position() << "Error: couldn't form int for number of factored LMs in when reading FLM spec file\n"; exit(-1); } p = endptr; // each FLM gram spec is on a diff line. for (int i=0;i<n_csngrams;i++) { // Parse a chunk of text of the form: // W : 4 W(-1) M(0) S(0) R(0) count_filename lm_filename num_node_specs // <node_spec_1> <node_constraint_1> [optional_node_options] // <node_spec_2> <node_constraint_2> [optional_node_options] // ... // <node_spec_N> <node_constraint_N> [optional_node_options] // char *token; char *parse_state; char tmp; line = f.getline(); // TODO: allow this to be multi-line if (line == 0) { f.position() << "Error: File::getline() returned 0 when reading FLM spec file\n"; exit(-1); } // skip blanks p = line; while (*p && isspace(*p)) p++; if (!isalnum(*p)) { f.position() << "Error: expecting child spec in FLM in when reading FLM spec file\n"; exit(-1); } //////////////////////////////////// // pull out the name of the child if (!*p) { f.position() << "Error: couldn't get child name when reading factor spec file\n"; exit(-1); } token = p; do { p++; } while (*p && *p != ' ' && *p != '\t' && *p != '\r' && *p != ':'); tmp = *p; *p = '\0'; // C string parsing is beautiful, isn't it. fnSpecArray[i].child = strdup(token); // TODO: finish destructor and free all strdups. *p = tmp; // insert the tag Boolean found; unsigned *pos_p = tagPosition.insert(fnSpecArray[i].child,found); if (!found) *pos_p = nextPosition++; fnSpecArray[i].childPosition = *pos_p; // skip to next token while (*p && (*p == ' ' || *p == '\t' || *p == '\r' || *p == ':')) p++; //////////////////////////////////// // get num parents if (!*p) { f.position() << "Error: couldn't get number parents when reading factor spec file\n"; exit(-1); } endptr = p; fnSpecArray[i].numParents = (int) strtol(p,&endptr,0); if (endptr == p) { f.position() << "Error: couldn't form int for number FN-grams in when reading FLM spec file\n"; exit(-1); } p = endptr; if (fnSpecArray[i].numParents > maxNumParentsPerChild) { f.position() << "Error: number parents must not be negative or greater than " << maxNumParentsPerChild << "\n"; } // skip space to next token while (*p && isspace(*p)) p++; fnSpecArray[i].numSubSets = 1<<fnSpecArray[i].numParents; for (int j=0;j<(int)fnSpecArray[i].numParents;j++) { //////////////////////////////////// // get name of parent if (!*p) { f.position() << "Error: couldn't get parent name when reading factor spec file\n"; exit(-1); } token = p; do { p++; } while (*p && *p != ' ' && *p != '\t' && *p != '\r' && *p != '('); tmp = *p; *p = '\0'; fnSpecArray[i].parents[j] = strdup(token); *p = tmp; // insert the tag Boolean found; pos_p = tagPosition.insert(fnSpecArray[i].parents[j],found); if (!found) *pos_p = nextPosition++; fnSpecArray[i].parentPositions[j] = *pos_p; // skip to next token (NOTE: this would accept multiple '(' chars.) while (*p && (*p == ' ' || *p == '\t' || *p == '\r' || *p == '(')) p++; //////////////////////////////////// // get offset if (!*p) { f.position() << "Error: couldn't get parent offset when reading factor spec file\n"; exit(-1); } endptr = p; fnSpecArray[i].parentOffsets[j] = (int) strtol(p,&endptr,0); if (endptr == p) { f.position() << "Error: couldn't form int for number FN-grams in when reading FLM spec file\n"; exit(-1); } // Future Language Model Support: allow offsets to come from the future as well as the past // if (fnSpecArray[i].parentOffsets[j] > 0) { // f.position() << "Error: can't have positive parent offset in structure file\n"; // exit(-1); // } if (fnSpecArray[i].parentOffsets[j] == 0 && fnSpecArray[i].parentPositions[j] == fnSpecArray[i].childPosition) { f.position() << "Error: parent and child can not be the same\n"; exit(-1); } // make sure that the same parent is not specified twice. for (int l=0;l<j;l++) { if ((fnSpecArray[i].parentPositions[j] == fnSpecArray[i].parentPositions[l]) && (fnSpecArray[i].parentOffsets[j] == fnSpecArray[i].parentOffsets[l])) { f.position() << "Error: cannot specify same parent more than once\n"; exit(-1); } } p = endptr; // skip to next token (NOTE: this would accept multiple ')' chars.) while (*p && (*p == ' ' || *p == '\t' || *p == '\r' || *p == ')')) p++; } //////////////////////////////////// // get Count file name while (*p && isspace(*p)) p++; if (!(*p)) { f.position() << "Error: couldn't get count file name when reading factor spec file\n"; exit(-1); } token = p; do { p++; } while (*p && *p != ' ' && *p != '\t' && *p != '\r' && *p != '\n'); tmp = *p; *p = '\0'; fnSpecArray[i].countFileName = strdup(token); *p = tmp; //////////////////////////////////// // get LM file name // skip space to next token while (*p && isspace(*p)) p++; if (!*p) { f.position() << "Error: couldn't get LM file name when reading factor spec file\n"; exit(-1); } token = p; do { p++; } while (*p && *p != ' ' && *p != '\t' && *p != '\r' && *p != '\n'); tmp = *p; *p = '\0'; fnSpecArray[i].lmFileName = strdup(token); *p = tmp; //////////////////////////////////// // get number of nodes that have node specs // skip space to next token while (*p && isspace(*p)) p++; if (!*p) { f.position() << "Error: couldn't get num node specs name when reading factor spec file\n"; exit(-1); } endptr = p; int numNodeSpecs = 0; numNodeSpecs = (int) strtol(p,&endptr,0); if (endptr == p || numNodeSpecs < 0) { f.position() << "Error: couldn't form unsigned int for number node specs when reading FLM spec file\n"; exit(-1); } // finally! done with this line, now get node specs const int numSubSets = 1<<fnSpecArray[i].numParents; // next set of numNodeSpecs lines contain node specs for (int j=0;j<numNodeSpecs;j++) { // line should have the form // NODE_NUM BACKOFFCONSTRAINT <options> // options include what ngram-count.cc uses on comand line for // discount options. I've given up on any (even slighty) fancy // C string parsing for now, so this is just a string of tokens which // are parsed in a very simple way. // TODO: do a proper multi-line tokenizer here. line = f.getline(); // printf("line = (%s)\n",line); if (line == 0) { f.position() << "Error: File::getline() returned 0 when readin FLM spec file\n"; exit(-1); } VocabString tokens[128]; int howmany = Vocab::parseWords(line,tokens,128); if (howmany < 2) { f.position() << "Error: specifier must at least specify node id and back-off constraint\n"; exit(-1); } int tok = 0; // get node id int nodeId = 0x0; Boolean success; nodeId = (int) fnSpecArray[i].parseNodeString((char*)tokens[tok],success); if (!success) { f.position() << "Error: couldn't form BG node specifier in string (" <<
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -