📄 fngramspecs.cc
字号:
template <class CountT> unsigned intFNgramSpecs<CountT>::FNgramSpec::parseNodeString(char *str, Boolean &success) { if (str == NULL) return 0; success = false; char *endptr; unsigned bits = 0; endptr = str; bits = (unsigned) strtolplusb(str,&endptr,0); if (endptr != str) { success = true; return bits; } char *p = str; const unsigned buflen = 2047; char buff[buflen+1]; unsigned par = 1; while (*p) { // parse tokens of the form TAGNUMBER,TAGNUMBER,TAGNUMBER // where TAG is one of the parent names, and number // is the negative of the parent position. For example, // given parents of the form // W : 3 M(-1) M(-2) M(0) S(-1) S(-2) S(+2) // a valid string would be // M1,M0,S1,S2,S+2 // and which could correspond to parents // // M(-1), M(0), S(-1), S(-2), S(+2) // // and be bit vector // 0b11101 // which is returned. // get parent char *parent = p; while (*p && !isdigit(*p) && *p != '+' && *p != '-') { p++; } char tmp = *p; *p = '\0'; strncpy(buff,parent,buflen); *p = tmp; // get parent position // NOTE: this is such that; // W1 == W-1, i.e., both are the previous word. // W+1 == the future word // this means that W1 is really an index into the past, rather than the future, // meaning that W1 is not the same as W+1. The reason for this is that // people will much more commonly use language models with histories in the past rather // than histories in the future, so the more common case uses fewer characters. bool plusMinusPresent = (*p == '+' || *p == '-'); int parPos = strtol(p,&endptr,0); if (endptr == p) { fprintf(stderr,"Can't form integer at parent specifier %d in string (%s)\n",par,str); return 0; // doesn't matter what we return here. } p = endptr; // search for parent and position. unsigned i; for (i=0;i<numParents;i++) { if (parentOffsets[i] == (!plusMinusPresent?(-1):(+1))*parPos && strcmp(parents[i],buff) == 0) { // found if (bits & (1<<i)) { // already set, might be an oversite or error by user, give warning fprintf(stderr, "WARNING: parent specifier (%s%d) at position %d given twice in string (%s)\n", buff,(!plusMinusPresent?(-1):(+1))*parPos,par,str); } bits |= (1<<i); break; } } if (i == numParents) { fprintf(stderr,"Can't find a valid parent specifier with (%s%d) at position %d in string (%s)\n", buff,(!plusMinusPresent?(-1):(+1))*parPos,par,str); return 0; // doesn't matter what we return here. } if (*p == ',') p++; par++; } success = true; return bits;}// null for null stringstatic inline const char *nfns(const char *const str) { return (str == NULL ? "NULL" : str);}static inline char bchar(const Boolean b) { return (b?'T':'F');}template <class CountT> voidFNgramSpecs<CountT>::printFInfo(){ for (int i=0;i<(int)fnSpecArray.size();i++) { fprintf(stderr, "----\nchild = [%s], %d parents\n",fnSpecArray[i].child, fnSpecArray[i].numParents); for (int j=0;j<(int)fnSpecArray[i].numParents;j++) { fprintf(stderr, " parent %d = [%s(%d)] = [%d(%d)]\n",j, fnSpecArray[i].parents[j], fnSpecArray[i].parentOffsets[j], fnSpecArray[i].parentPositions[j], fnSpecArray[i].parentOffsets[j]); } fprintf(stderr, " count filename = (%s)\n",fnSpecArray[i].countFileName); for (int subset=0;subset<(1<<fnSpecArray[i].numParents);subset++) { if (fnSpecArray[i].parentSubsets[subset].counts != NULL) { fprintf(stderr, " node 0x%X, constraint 0x%X, count object is %s\n",subset, fnSpecArray[i].parentSubsets[subset].backoffConstraint, (fnSpecArray[i].parentSubsets[subset].counts != NULL ? "allocated" : "unallocated")); fprintf(stderr, " gtmin=%d, gtmax=%d, gt=(%s), cdiscount=%f, ndiscount=%c, wbdiscount=%c,\n" " kndiscount=%c, kn=(%s), interpolate=%c, write=(%s), backoffStrategy=%d\n", fnSpecArray[i].parentSubsets[subset].gtmin, fnSpecArray[i].parentSubsets[subset].gtmax, nfns(fnSpecArray[i].parentSubsets[subset].gtFile), fnSpecArray[i].parentSubsets[subset].cdiscount, bchar(fnSpecArray[i].parentSubsets[subset].ndiscount), bchar(fnSpecArray[i].parentSubsets[subset].wbdiscount), bchar(fnSpecArray[i].parentSubsets[subset].kndiscount), nfns(fnSpecArray[i].parentSubsets[subset].knFile), bchar(fnSpecArray[i].parentSubsets[subset].interpolate), nfns(fnSpecArray[i].parentSubsets[subset].writeFile), fnSpecArray[i].parentSubsets[subset].backoffStrategy); } } } fflush(stderr); // TODO: remove since always flushed}/* * This breaks each word into factors or streams which presumably is formatted as: * * <Tag1>-<factor1>:<Tag2>-<factor2>:...:<TagN>-<factorN> * * Load them in the order that was constructed for the tags in FNgramSpecs() * constructor. That way, the position in the resulting matrix corresponds * to the tag position that several of the objects use such as FNgramStats. * * Duplicate tags for a word are ignored (the first one encountered on * the left is used, possibly issuing a warning). Missing tags for a * word are assumed to be <tag>-"<NULL>" I.e., for a morph class, a * missing morph tag would be assumed to be "M-<NULL>" where "<NULL>" * is the special null word, and "M" is the morph tag. This way, files * need not specify the nulls when they exist (and is the reason why NULL * is a special word). * * Any unknown tags are ignored (and a message is printed if the debug level is set * accordingly). That way, extra tags not used at the moment can stay in the file * without affecting anything. * * A word tag really should be there for the start/end sentence stuff * to work. (if there is no word stream, a warning message will be * issued if debug > 0) * */template <class CountT>unsigned intFNgramSpecs<CountT>::loadWordFactors(const VocabString *words, WordMatrix& wm, unsigned int max){ if (!words) return 0; // parse words into factors int i; for (i=0; i<(int)max && words[i] != 0; i++) { // TODO: call FactoredVocab::loadWordFactor() here instead of // code below. // assume we don't need to reclaim any of the word_factors[i][j], so // just zero out our pointers here. ::memset(wm.word_factors[i],0,(maxNumParentsPerChild+1)*sizeof(VocabString)); VocabString word = words[i]; VocabString word_p = word; if (debug(DEBUG_EVERY_SENTENCE_INFO)) fprintf(stderr,"Processing word (%s)\n",word); // word looks like: // <Tag1>-<factor1>:<Tag2>-<factor2>:...:<TagN>-<factorN> // if a tag is missing (i.e., just <factor_n>), then we // assume it is a FNGRAM_WORD_TAG, which indicates // it is a word. Boolean tag_assigned = false; // make a copy of word, for useful messages char word_copy[2048]; strncpy(word_copy,word,2047); Boolean last_factor = false; while (!last_factor) { char *end_p = (char *)strchr(word_p,FNGRAM_FACTOR_SEPARATOR); if (end_p != NULL) { // this is not last word *end_p = '\0'; } else last_factor = true; if (debug(DEBUG_EVERY_SENTENCE_INFO)) fprintf(stderr,"working on factor (%s)\n",word_p); char *sep_p = (char *)strchr(word_p,FNGRAM_WORD_TAG_SEP); if (sep_p == NULL) { // no tag, assume word tag. Note, either all words must // have a word tag "W-...", or no words can have a word tag. Otherwise, // vocab object will assign two different wids for same word, one // with wordtag and one without. wm.word_factors[i][FNGRAM_WORD_TAG_POS] = word_p; tag_assigned = true; } else { *sep_p = '\0'; unsigned* pos = tagPosition.find(word_p); *sep_p = FNGRAM_WORD_TAG_SEP; if (pos == NULL) { if (debug(DEBUG_TAG_WARNINGS)) { fprintf(stderr,"Warning: unknown tag in factor (%s) of word (%s) when parsing file\n", word_p,word_copy); } goto next_tag; } if (*pos == FNGRAM_WORD_TAG_POS) { // TODO: normalize word so that it either always uses a "W-" tag // or does not use a "W-" tag. } if (wm.word_factors[i][*pos] != NULL) { if (debug(DEBUG_WARN_DUP_TAG)) fprintf(stderr,"Warning: tag given twice in word (%s) when parsing " "file. Using first instance.\n",word_copy); } else wm.word_factors[i][*pos] = word_p; tag_assigned = true; } next_tag: word_p = end_p+1; } if (!tag_assigned) { if (debug(DEBUG_TAG_WARNINGS)) { fprintf(stderr,"Warning: no known tags in word (%s), treating all tags as NULLs", word_copy); } } // store any nulls int j; for (j=0;j<(int)tagPosition.numEntries();j++) { if (wm.word_factors[i][j] == 0) { wm.word_factors[i][j] = fvocab.tagNulls[j]; } } wm.word_factors[i][j] = 0; } if (debug(DEBUG_MISSING_FIRST_LAST_WORD)) { // extra check for a word stream. If a word stream does not // exist, there is no current way that the start/end sentence // stuff will be added. if (wm.word_factors[0][FNGRAM_WORD_TAG_POS] == fvocab.tagNulls[FNGRAM_WORD_TAG_POS]) fprintf(stderr,"Warning: using NULL for first word in sentence\n"); if (wm.word_factors[i-1][FNGRAM_WORD_TAG_POS] == fvocab.tagNulls[FNGRAM_WORD_TAG_POS]) fprintf(stderr,"Warning: using NULL for last word in sentence\n"); } if (debug(DEBUG_EVERY_SENTENCE_INFO)) fprintf(stderr,"%d words in sentence\n",i); if (i < (int)max) { // zero out last one ::memset(wm.word_factors[i],0,(maxNumParentsPerChild+1)*sizeof(VocabString)); } if (debug(DEBUG_EVERY_SENTENCE_INFO)) wm.print(stderr); return i;}template <class CountT>voidFNgramSpecs<CountT>::estimateDiscounts(FactoredVocab& vocab){ for (int i=0;i<(int)fnSpecArray.size();i++) { // estimate the discounts in increasing level order in BG // Change this for loop to: // for (int level=0;level<=fnSpecArray[i].numParents;level++) { // to get meta, meta-meta, meta-meta-meta, (etc...) counts. // And change this for loop to: // for (int level=fnSpecArray[i].numParents;level>=0;level--) { // just to get meta level counts.#define METAMETAMETAETC 0#if METAMETAMETAETC // fprintf(stderr,"Doing meta meta meta etc. counts\n"); for (int level=fnSpecArray[i].numParents;level>=0;level--) { #else // fprintf(stderr,"Doing just meta counts\n"); for (int level=0;level<=(int)fnSpecArray[i].numParents;level++) {#endif typename FNgramSpec::LevelIter iter(fnSpecArray[i].numParents,level); unsigned int subset; while (iter.next(subset)) { if (fnSpecArray[i].parentSubsets[subset].counts != NULL) { FDiscount *discount = 0; Boolean gt = false; if (fnSpecArray[i].parentSubsets[subset].ndiscount) { discount = new FNaturalDiscount(fnSpecArray[i].parentSubsets[subset].gtmin); assert(discount); } else if (fnSpecArray[i].parentSubsets[subset].wbdiscount) { discount = new FWittenBell(fnSpecArray[i].parentSubsets[subset].gtmin); assert(discount); } else if (fnSpecArray[i].parentSubsets[subset].cdiscount != -1.0) { discount = new FConstDiscount(fnSpecArray[i].parentSubsets[subset].cdiscount, fnSpecArray[i].parentSubsets[subset].gtmin); assert(discount); } else if (fnSpecArray[i].parentSubsets[subset].knFile || fnSpecArray[i].parentSubsets[subset].kndiscount) { discount = new FModKneserNey(fnSpecArray[i].parentSubsets[subset].gtmin, fnSpecArray[i].parentSubsets[subset].knCountsModified, fnSpecArray[i].parentSubsets[subset].knCountsModifyAtEnd); assert(discount); } else if (fnSpecArray[i].parentSubsets[subset].knFile || fnSpecArray[i].parentSubsets[subset].ukndiscount) { discount = new FKneserNey(fnSpecArray[i].parentSubsets[subset].gtmin, fnSpecArray[i].parentSubsets[subset].knCountsModified, fnSpecArray[i].parentSubsets[subset].knCountsModifyAtEnd); assert(discount); } else { gt = true; discount = new FGoodTuring(fnSpecArray[i].parentSubsets[subset].gtmin, fnSpecArray[i].parentSubsets[subset].gtmax); assert(discount); } discount->debugme(debuglevel()); discount->interpolate = fnSpecArray[i].parentSubsets[subset].interpolate; Boolean estimated = false; if (fnSpecArray[i].parentSubsets[subset].knFile && fnSpecArray[i].parentSubsets[subset].kndiscount) { File file(fnSpecArray[i].parentSubsets[subset].knFile,"r",0); if (!file.error()) { if (!discount->read(file)) { fprintf(stderr,"error reading kn discount file (%s)\n", fnSpecArray[i].parentSubsets[subset].knFile); exit(-1); } estimated = true; } } if (!estimated && fnSpecArray[i].parentSubsets[subset].gtFile && gt) { File file(fnSpecArray[i].parentSubsets[subset].gtFile,"r",0); if (!file.error()) { if (!discount->read(file)) { fprintf(stderr,"error reading gt discount file (%s)\n", fnSpecArray[i].parentSubsets[subset].gtFile); exit(-1); } estimated = true; } } if (!estimated) { vocab.setCurrentTagVocab(fnSpecArray[i].child); if (!discount->estimate(fnSpecArray[i], subset, vocab)) { // TODO: make better error message here. fprintf(stderr,"error in discount estimator\n"); exit(-1); } estimated = true; if (fnSpecArray[i].parentSubsets[subset].kndiscount && fnSpecArray[i].parentSubsets[subset].knFile) { File file(fnSpecArray[i].parentSubsets[subset].knFile,"w"); discount->write(file); } else if (gt && fnSpecArray[i].parentSubsets[subset].gtFile) { File file(fnSpecArray[i].parentSubsets[subset].gtFile,"w"); discount->write(file); } } fnSpecArray[i].parentSubsets[subset].discount = discount; } } } }}template <class CountT>voidFNgramSpecs<CountT>::computeCardinalityFunctions(FactoredVocab& vocab){ for (int specNum=0;specNum<(int)fnSpecArray.size();specNum++) { // child vocab.setCurrentTagVocab(fnSpecArray[specNum].childPosition); const unsigned numChildWords = vocab.currentTagVocabCardinality(); for (unsigned node=0;node<fnSpecArray[specNum].numSubSets;node++) { fnSpecArray[specNum].parentSubsets[node].prodCardinalities = numChildWords; fnSpecArray[specNum].parentSubsets[node].sumCardinalities = numChildWords; fnSpecArray[specNum].parentSubsets[node].sumLogCardinalities = log10((double)numChildWords); // parents for (int par=0;par<(int)fnSpecArray[specNum].numParents;par++) { if (node & (1<<par)) { vocab.setCurrentTagVocab(fnSpecArray[specNum].parentPositions[par]); const unsigned numParWords = vocab.currentTagVocabCardinality(); fnSpecArray[specNum].parentSubsets[node].prodCardinalities *= numParWords; fnSpecArray[specNum].parentSubsets[node].sumCardinalities += numParWords; fnSpecArray[specNum].parentSubsets[node].sumLogCardinalities += log((double)numParWords); } } } }} // return pointer to a static buff where// we've got the tag of a if any. template <class CountT>VocabStringFNgramSpecs<CountT>::getTag(VocabString a){ // TODO: this routine is a quick hack and should be redone properly. if (!a) return NULL; static char buff[1024]; char* sep_p = (char *)strchr(a,FNGRAM_WORD_TAG_SEP); if (sep_p == NULL) return NULL; *sep_p = '\0'; // make sure we don't overrun buffer and it's terminated strncpy(buff,a,sizeof(buff)-1); buff[sizeof(buff)-1] = '\0'; *sep_p = FNGRAM_WORD_TAG_SEP; return &buff[0];} template <class CountT>VocabStringFNgramSpecs<CountT>::wordTag(){ return FNGRAM_WORD_TAG_STR;}#endif /* EXCLUDE_CONTRIB_END */#endif /* _FNgramSpecs_cc_ */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -