📄 fngramspecs.cc
字号:
tokens[tok] << ")\n"; exit(-1); } if (nodeId < 0) { f.position() << "Error: couldn't form unsigned int in " << tokens[tok] << "for node specifier when reading factored spec file\n"; exit(-1); } if (nodeId >= numSubSets) { fprintf(stderr,"Error: node specifier must be between 0x0 and 0x%x inclusive\n", numSubSets-1); exit(-1); } tok++; // get backoff constraint for this node fnSpecArray[i].parentSubsets[nodeId].backoffConstraint = fnSpecArray[i].parseNodeString((char*)tokens[tok],success); if (!success) { f.position() << "Error: couldn't form BG node constraint in string (" << tokens[tok] << ")\n"; exit(-1); } tok++; // // Current set of Node Options // // gtmin [num] // gtmax [num] // gt [fileName string] // cdiscount [double] // ndiscount [] // wbdiscount [] // kndiscount [] // ukndiscount [] // kn-counts-modified [] // kn-counts-modify-at-end [] // kn [fileName string] // interpolate [] // write [fileName string] // strategy [option] // where [option] is one of: // counts_no_norm // counts_sum_counts_norm // counts_sum_num_words_norm // counts_prod_card_norm // counts_sum_card_norm // counts_sum_log_card_norm // bog_node_prob // startGetOptions: for (;tok<howmany;tok++) { if (strcmp(tokens[tok],"gtmin") == 0) { if (tok+1==howmany) { fprintf(stderr,"Error: gtmin argument needs a value" "reading factored spec file\n"); exit(-1); } tok++; if (sscanf(tokens[tok],"%d",&fnSpecArray[i].parentSubsets[nodeId].gtmin) != 1){ fprintf(stderr,"Error: gtmin argument needs integer value"); exit(-1); } } else if (strcmp(tokens[tok],"gtmax") == 0) { if (tok+1==howmany) { fprintf(stderr,"Error: gtmax argument needs a value" "reading factored spec file\n"); exit(-1); } tok++; if (sscanf(tokens[tok],"%d",&fnSpecArray[i].parentSubsets[nodeId].gtmax) != 1){ fprintf(stderr,"Error: gtmax argument needs integer value"); exit(-1); } } else if (strcmp(tokens[tok],"gt") == 0) { if (tok+1==howmany) { fprintf(stderr,"Error: gt argument needs a value" "reading factored spec file\n"); exit(-1); 
} tok++; delete [] fnSpecArray[i].parentSubsets[nodeId].gtFile; fnSpecArray[i].parentSubsets[nodeId].gtFile = strdup(tokens[tok]); } else if (strcmp(tokens[tok],"cdiscount") == 0) { if (tok+1==howmany) { fprintf(stderr,"Error: cdiscount argument needs a value"); exit(-1); } tok++; double tmp; char *endptr; tmp = strtod(tokens[tok],&endptr); if (endptr == tokens[tok]) { fprintf(stderr,"Error: cdiscount argument (%s) should be floating point value", tokens[tok]); exit(-1); } fnSpecArray[i].parentSubsets[nodeId].cdiscount = tmp; } else if (strcmp(tokens[tok],"ndiscount") == 0) { fnSpecArray[i].parentSubsets[nodeId].ndiscount = true; } else if (strcmp(tokens[tok],"wbdiscount") == 0) { fnSpecArray[i].parentSubsets[nodeId].wbdiscount = true; } else if (strcmp(tokens[tok],"kndiscount") == 0) { fnSpecArray[i].parentSubsets[nodeId].kndiscount = true; } else if (strcmp(tokens[tok],"ukndiscount") == 0) { fnSpecArray[i].parentSubsets[nodeId].ukndiscount = true; } else if (strcmp(tokens[tok],"kn-counts-modified") == 0) { fnSpecArray[i].parentSubsets[nodeId].knCountsModified = true; } else if (strcmp(tokens[tok],"kn-counts-modify-at-end") == 0) { fnSpecArray[i].parentSubsets[nodeId].knCountsModifyAtEnd= true; } else if (strcmp(tokens[tok],"kn-count-parent") == 0) { if (tok+1==howmany) { fprintf(stderr,"Error: kn-count-parent argument needs a parent specifier\n"); exit(-1); } tok++; unsigned par = fnSpecArray[i].parseNodeString((char*)tokens[tok],success); if (!success) { fprintf(stderr,"Error: kn-count-parent argument invalid\n"); exit(-1); } fnSpecArray[i].parentSubsets[nodeId].knCountParent = par; } else if (strcmp(tokens[tok],"kn") == 0) { if (tok+1==howmany) { fprintf(stderr,"Error: kn argument needs a value" "reading factored spec file\n"); exit(-1); } tok++; delete [] fnSpecArray[i].parentSubsets[nodeId].knFile; fnSpecArray[i].parentSubsets[nodeId].knFile = strdup(tokens[tok]); } else if (strcmp(tokens[tok],"interpolate") == 0) { 
fnSpecArray[i].parentSubsets[nodeId].interpolate = true; } else if (strcmp(tokens[tok],"write") == 0) { if (tok+1==howmany) { fprintf(stderr,"Error: write argument needs a value" "reading factored spec file\n"); exit(-1); } tok++; delete [] fnSpecArray[i].parentSubsets[nodeId].writeFile; fnSpecArray[i].parentSubsets[nodeId].writeFile = strdup(tokens[tok]); } else if (strcmp(tokens[tok],"strategy") == 0) { if (tok+1==howmany) { fprintf(stderr,"Error: strategy argument needs a value" "reading factored spec file\n"); exit(-1); } tok++; if (strcmp(tokens[tok],"counts_no_norm") == 0) { fnSpecArray[i].parentSubsets[nodeId].backoffStrategy = CountsNoNorm; } else if (strcmp(tokens[tok],"counts_sum_counts_norm") == 0) { fnSpecArray[i].parentSubsets[nodeId].backoffStrategy = CountsSumCountsNorm; } else if (strcmp(tokens[tok],"counts_sum_num_words_norm") == 0) { fnSpecArray[i].parentSubsets[nodeId].backoffStrategy = CountsSumNumWordsNorm; } else if (strcmp(tokens[tok],"counts_prod_card_norm") == 0) { fnSpecArray[i].parentSubsets[nodeId].backoffStrategy = CountsProdCardinalityNorm; } else if (strcmp(tokens[tok],"counts_sum_card_norm") == 0) { fnSpecArray[i].parentSubsets[nodeId].backoffStrategy = CountsSumCardinalityNorm; } else if (strcmp(tokens[tok],"counts_sum_log_card_norm") == 0) { fnSpecArray[i].parentSubsets[nodeId].backoffStrategy = CountsSumLogCardinalityNorm; } else if (strcmp(tokens[tok],"bog_node_prob") == 0) { fnSpecArray[i].parentSubsets[nodeId].backoffStrategy = BogNodeProb; } else { fprintf(stderr,"Error: unknown strategy argument (%s) when " "reading factored spec file\n",tokens[tok]); exit(-1); } } else if (strcmp(tokens[tok],"combine") == 0) { if (tok+1==howmany) { fprintf(stderr,"Error: combine argument needs a value " "reading factored spec file\n"); exit(-1); } tok++; if (strcmp(tokens[tok],"max") == 0) { fnSpecArray[i].parentSubsets[nodeId].backoffCombine = MaxBgChild; } else if (strcmp(tokens[tok],"min") == 0) { 
fnSpecArray[i].parentSubsets[nodeId].backoffCombine = MinBgChild; } else if ((strcmp(tokens[tok],"avg") == 0) || (strcmp(tokens[tok],"mean") == 0)) { fnSpecArray[i].parentSubsets[nodeId].backoffCombine = AvgBgChild; } else if (strcmp(tokens[tok],"wmean") == 0) { fnSpecArray[i].parentSubsets[nodeId].backoffCombine = WmeanBgChild; // next set of tokens must have a combination of (node_spec, weight) // for each child. // we compute this below, but we compute it here since we don't // have the quantity numBGchildren yet. typename FNgramSpec::BGChildIterCnstr citer(fnSpecArray[i].numParents,nodeId,fnSpecArray[i].parentSubsets[nodeId].backoffConstraint); unsigned int numChildrenUsed = 0; for (unsigned child;citer.next(child);) { numChildrenUsed++; } if (tok+2*numChildrenUsed >= (unsigned)howmany) { f.position() << "Error: combine wmean needs " << numChildrenUsed << " node & weight pairs, one for each child\n"; exit(-1); } // TODO: add to destructor LogP2 *wmean = new LogP2[numChildrenUsed]; for (unsigned cnum=0;cnum<numChildrenUsed;cnum++) { wmean[cnum] = 0.0; } tok++; for (unsigned cnum=0;cnum<numChildrenUsed;cnum++) { double value; unsigned int childSpec = fnSpecArray[i].parseNodeString((char*)tokens[tok],success); if (!success) { f.position() << "Error: combine wmean invalid node specifier\n"; exit(-1); } tok++; char *endptr; value = strtod(tokens[tok],&endptr); if (endptr == tokens[tok] || value < 0.0) { f.position() << "Error: combine wmean invalid weight value\n"; exit(-1); } citer.init(); unsigned int cpos = 0; for (unsigned child;citer.next(child);) { if (!(~child & (fnSpecArray[i].parentSubsets[nodeId].backoffConstraint & nodeId))) continue; if (child == childSpec) break; cpos++; } if (cpos == numChildrenUsed) { f.position() << "Error: combine wmean, invalid child node given\n"; exit(-1); } // load them in the array in the order that they will // be encountered when doing a child iter. 
wmean[cpos] = value; tok++; } double sum = 0; for (unsigned cnum=0;cnum<numChildrenUsed;cnum++) { sum += wmean[cnum]; } // normalize and convert to logp for (unsigned cnum=0;cnum<numChildrenUsed;cnum++) { wmean[cnum] = ProbToLogP(wmean[cnum]) - ProbToLogP(sum); } fnSpecArray[i].parentSubsets[nodeId].wmean = wmean; } else if (strcmp(tokens[tok],"sum") == 0) { fnSpecArray[i].parentSubsets[nodeId].backoffCombine = SumBgChild; } else if (strcmp(tokens[tok],"prod") == 0) { fnSpecArray[i].parentSubsets[nodeId].backoffCombine = ProdBgChild; } else if (strcmp(tokens[tok],"gmean") == 0) { fnSpecArray[i].parentSubsets[nodeId].backoffCombine = GmeanBgChild; } else { fprintf(stderr,"Error: unknown combine argument (%s) when " "reading factored spec file\n",tokens[tok]); exit(-1); } } else if ((strcmp(tokens[tok],"\\") == 0) && tok == (howmany-1)) { // do poor man's next line parsing. line = f.getline(); if (line == 0) { f.position() << "Error: File::getline() returned 0 when reading FLM spec file\n"; exit(-1); } howmany = Vocab::parseWords(line,tokens,128); tok = 0; goto startGetOptions; } else { fprintf(stderr,"Error: unknown argument (%s) when" "reading factored spec file\n",tokens[tok]); exit(-1); } } if (fnSpecArray[i].parentSubsets[nodeId].backoffCombine == SumBgChild && fnSpecArray[i].parentSubsets[nodeId].interpolate) { f.position() << "WARNING: using 'interpolate' and 'combine sum' together\n"; } } if (debug(DEBUG_EXTREME)) { // debug all the iterators. 
for (int level=fnSpecArray[i].numParents; level>=0; level--) { typename FNgramSpec::LevelIter iter(fnSpecArray[i].numParents,level); fprintf(stderr, "level 0x%X:",level); unsigned int node; while (iter.next(node)) { fprintf(stderr, " node 0x%X,",node); } fprintf(stderr, "\n"); } for (int node=0;node<numSubSets;node++) { fprintf(stderr, "node 0x%X\n",node); typename FNgramSpec::BGParentIter piter(fnSpecArray[i].numParents,node); for (unsigned parent=0;piter.next(parent);) { fprintf(stderr, "parent 0x%X,",parent); } fprintf(stderr, "\n"); typename FNgramSpec::BGAncestorIter aiter(fnSpecArray[i].numParents,node); for (unsigned ancestor;aiter.next(ancestor);) { fprintf(stderr, "ancestor 0x%X,",ancestor); } fprintf(stderr, "\n"); typename FNgramSpec::BGChildIter citer(fnSpecArray[i].numParents,node); for (unsigned child;citer.next(child);) { fprintf(stderr, "child 0x%X,",child); } fprintf(stderr, "\n"); typename FNgramSpec::BGDescendantIter diter(fnSpecArray[i].numParents,node); for (unsigned des;diter.next(des);) { fprintf(stderr, "descendant 0x%X,",des); } fprintf(stderr, "\n"); } fflush(stderr); } // only create counts objects for nodes that are to be used fnSpecArray[i].parentSubsets[numSubSets-1].counts = new FNgramNode; fnSpecArray[i].parentSubsets[numSubSets-1].order = numBitsSet(numSubSets-1)+1; // descend down the BG, level by level for (int level=fnSpecArray[i].numParents;level>=0;level--) { typename FNgramSpec::LevelIter liter(fnSpecArray[i].numParents,level); Boolean allAreNull = true; for (unsigned nodeAtLevel;liter.next(nodeAtLevel);) { if (fnSpecArray[i].parentSubsets[nodeAtLevel].counts == NULL) continue; allAreNull = false; typename FNgramSpec::BGChildIterCnstr citer(fnSpecArray[i].numParents,nodeAtLevel,fnSpecArray[i].parentSubsets[nodeAtLevel].backoffConstraint); unsigned int numChildrenUsed = 0; for (unsigned child;citer.next(child);) { if (fnSpecArray[i].parentSubsets[child].counts == NULL) { fnSpecArray[i].parentSubsets[child].counts = new 
FNgramNode; fnSpecArray[i].parentSubsets[child].order = numBitsSet(child)+1; } numChildrenUsed++; // make sure kn-count-parent has counts itself. if (fnSpecArray[i].parentSubsets[child].knCountParent != ~0x0) { const unsigned kncp = fnSpecArray[i].parentSubsets[child].knCountParent; if (kncp >= (unsigned)numSubSets || fnSpecArray[i].parentSubsets[kncp].counts == NULL) { f.position() << "Error: kn-counts-parent argument " << HEX << kncp << DEC << " must specify a parent that exists and is in use\n"; exit(-1); } } } // everybody must have a child. if (nodeAtLevel > 0 && numChildrenUsed == 0) { fprintf(stderr,"ERROR: backoff graph node 0x%X has no children with backoff constraint 0x%X. Must have at least one child.\n",nodeAtLevel,fnSpecArray[i].parentSubsets[nodeAtLevel].backoffConstraint); exit(-1); } fnSpecArray[i].parentSubsets[nodeAtLevel].numBGChildren = numChildrenUsed; } if (allAreNull) { // no count object was created // NOTE: we might not want to consider this an error, if we for example // want not to backoff to lower levels in backoff graph. In that case, // probabilities would become zero, however. 
fprintf(stderr,"ERROR: backoff constraints leave level %d of backoff graph " "entirely unexpanded, lower distribution order never reached\n",level); exit(-1); } } if (debug(DEBUG_BG_PRINT)) { fprintf(stderr, "Language Model %d --------------\n",i); for (int level=fnSpecArray[i].numParents;level>=0;level--) { fprintf(stderr, "-- Level %d\n",level); typename FNgramSpec::LevelIter liter(fnSpecArray[i].numParents,level); for (unsigned nodeAtLevel;liter.next(nodeAtLevel);) { if (fnSpecArray[i].parentSubsets[nodeAtLevel].counts == NULL) continue; fprintf(stderr, " Node: "); fnSpecArray[i].printNodeString(stderr,nodeAtLevel); fprintf(stderr, " (0x%X), Constraint: ",nodeAtLevel); fnSpecArray[i].printNodeString(stderr, fnSpecArray[i].parentSubsets[nodeAtLevel].backoffConstraint); fprintf(stderr, " (0x%X)\n",fnSpecArray[i].parentSubsets[nodeAtLevel].backoffConstraint); fprintf(stderr, " %d Children:",fnSpecArray[i].parentSubsets[nodeAtLevel].numBGChildren); typename FNgramSpec::BGChildIterCnstr citer(fnSpecArray[i].numParents,nodeAtLevel,fnSpecArray[i].parentSubsets[nodeAtLevel].backoffConstraint); Boolean do_comma = false; for (unsigned child;citer.next(child);) { if (fnSpecArray[i].parentSubsets[child].counts != NULL) { fprintf(stderr, (do_comma?"; ":" ")); fnSpecArray[i].printNodeString(stderr,child); fprintf(stderr, " (0x%X)",child); } do_comma = true; } fprintf(stderr, "\n"); } } } } if (nextPosition > maxNumParentsPerChild) { f.position() << "Error: may only have at most " << maxNumParentsPerChild << " distinct tags\n"; exit(-1); } LHashIter<VocabString,unsigned> tags(tagPosition); VocabString tag; unsigned *pos; char buff[2048]; while ((pos = tags.next(tag)) != NULL) { // TODO: WARNING: should use strncat here. 
// Still inside the tag-iteration loop opened above: for each declared tag,
// synthesize the tag-qualified variants of the special vocabulary words by
// concatenating "<tag><tag-sep><special-word>" into buff and strdup'ing the
// result.  NOTE(review): buff is a fixed 2048-byte stack buffer and the
// strcat chain is unchecked (the TODO just above already says strncat
// should be used) -- an overlong tag name would overflow it.
buff[0] = '\0';
fvocab.tagNulls[*pos] =		// tag-specific NULL-factor token
  strdup(strcat(strcat(strcat(buff,tag),FNGRAM_WORD_TAG_SEP_STR),FNGRAM_WORD_TAG_NULL_SPEC_STR));
buff[0] = '\0';
fvocab.tagUnks[*pos] =		// tag-specific unknown-word token
  strdup(strcat(strcat(strcat(buff,tag),FNGRAM_WORD_TAG_SEP_STR),Vocab_Unknown));
buff[0] = '\0';
// NOTE(review): tagSes is paired with Vocab_SentStart and tagSss with
// Vocab_SentEnd here; the member names suggest the opposite pairing --
// confirm against the FactoredVocab declarations before changing anything.
fvocab.tagSes[*pos] =
  strdup(strcat(strcat(strcat(buff,tag),FNGRAM_WORD_TAG_SEP_STR),Vocab_SentStart));
buff[0] = '\0';
fvocab.tagSss[*pos] =
  strdup(strcat(strcat(strcat(buff,tag),FNGRAM_WORD_TAG_SEP_STR),Vocab_SentEnd));
buff[0] = '\0';
fvocab.tagPauses[*pos] =	// tag-specific pause token
  strdup(strcat(strcat(strcat(buff,tag),FNGRAM_WORD_TAG_SEP_STR),Vocab_Pause));
}	// end of tag-iteration loop

// Optional dump of the fully parsed factored-spec state.
if (debug(DEBUG_VERY_VERBOSE))
  printFInfo();
}	// end of enclosing spec-reading function (its opening is outside this chunk)

/*
 * printNodeString --
 *	Print a human-readable label for a backoff-graph node to stream f.
 *
 *	`node` is a bit mask over this spec's numParents parent variables:
 *	for each bit i that is set, the parent factor name parents[i] and its
 *	signed time offset parentOffsets[i] are printed as "name+d"/"name-d"
 *	(%+d forces an explicit sign), comma-separated in ascending bit order.
 *
 *	Always returns true.
 */
template <class CountT>
Boolean
FNgramSpecs<CountT>::FNgramSpec::printNodeString(FILE *f, unsigned int node)
{
  Boolean do_comma = false;
  for (unsigned i = 0; i < numParents; i++) {
    if (node & (1<<i)) {
      fprintf(f, "%s%s%+d",
	      (do_comma ? "," : ""),	// separator only after the first factor
	      parents[i],
	      parentOffsets[i]);
      do_comma = true;
    }
  }
  return true;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -