⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 vocab.h

📁 这是一款很好用的工具包
💻 H
字号:
/* * Vocab.h -- *	Interface to the Vocab class. * * * SYNOPSIS * * Vocab represents sets of string tokens as typically used for vocabularies, * word class names, etc.  Additionally, Vocab provides a mapping from * such string tokens (type VocabString) to integers (type VocabIndex). * VocabIndex is typically used to index words in language models to * conserve space and speed up comparisons etc.  Thus, Vocab essentially * implements a symbol table into which strings can be "interned." * * INTERFACE * * VocabIndex(VocabIndex start, VocabIndex end) *	Initializes a Vocab and sets the index range. *	Indices are allocated starting at start and incremented from there. *	No indices are allowed beyond end. *	This provides a way to map several distinct Vocabs to disjoint *	ranges of integers, and then use them jointly without danger of *	confusion. * * VocabIndex addWord(VocabString token) *	Adds a new string to the Vocab, returning the assigned index, *	or looks up the index if the token already exists. * * VocabString getWord(VocabIndex index) *	Returns the string token associated with index, or 0 if it none *	exists. * * VocabIndex getIndex(VocabString token) *	Returns the index for a string token, or Vocab_None if none exists. * * void remove(VocabString token) * void remove(VocabIndex index) *	Deletes an item from the Vocab, either by token or by index. * * unsigned int numWords() *	Returns the number of tokens (and indices) in the Vocab. * * VocabIndex highIndex() *	Returns the highest word index in use, or Vocab_None if  *	vocabulary is empty. * * ITERATION * * VocabIter implements iterations over Vocabs.  * * VocabIter(Vocab &vocab) *	Creates and initializes an iteration over vocab. * * void init() *	Reset an iteration to the "first" element. * * VocabString next() * VocabString next(VocabIndex &index) *	Returns the next Vocab token in an iteration, or 0 if the *	iteration is finished.  index is set to the corresponding *	index. * * unsigned int read(File &file) *	Read a word list from a file into the Vocab, implicitly performing *	an addWord() on each token read.  Returns the number of tokens read. * * void write(File &file) *	Write the current set of word tokes to a file, in random order. * * NOTE: While an iteration over a Vocab is ongoing, no modifications * are allowed to the Vocab, EXCEPT possibly removal of the * "current" token/index. * * An iteration returns the elements of a Vocab in random, but deterministic * order. Furthermore, when copied or used in initialization of other objects, * VocabIter objects retain the current "position" in an iteration.  This * allows nested iterations that enumerate all pairs of distinct elements. * * Copyright (c) 1995-2005 SRI International.  All Rights Reserved. * * @(#)$Header: /home/srilm/devel/lm/src/RCS/Vocab.h,v 1.34 2006/01/05 20:21:27 stolcke Exp $ * */#ifndef _Vocab_h_#define _Vocab_h_#include <iostream>using namespace std;#include "Boolean.h"#include "File.h"#include "LHash.h"#include "SArray.h"#include "Array.h"#include "MemStats.h"#ifdef USE_SHORT_VOCABtypedef unsigned short	VocabIndex;#elsetypedef unsigned int	VocabIndex;#endiftypedef const char	*VocabString;const unsigned int	maxWordLength = 256;const VocabIndex	Vocab_None = (VocabIndex)-1;const VocabString	Vocab_Unknown = "<unk>";const VocabString	Vocab_SentStart = "<s>";const VocabString	Vocab_SentEnd = "</s>";const VocabString	Vocab_Pause = "-pau-";typedef int (*VocabIndexComparator)(VocabIndex, VocabIndex);typedef int (*VocabIndicesComparator)(const VocabIndex *, const VocabIndex *);class Vocab{    friend class VocabIter;public:    Vocab(VocabIndex start = 0, VocabIndex end = (Vocab_None-1));    virtual ~Vocab() {};    virtual VocabIndex addWord(VocabString name);    virtual VocabString getWord(VocabIndex index);    virtual VocabIndex getIndex(VocabString name,				    VocabIndex unkIndex = Vocab_None);    virtual void remove(VocabString name);    virtual void remove(VocabIndex index);    virtual unsigned int numWords() const { return byName.numEntries(); };    virtual VocabIndex highIndex() const;    /*     * Special (pseudo-) vocabulary tokens     */    virtual VocabIndex &unkIndex() { return _unkIndex; };  /* <unk> index */    virtual VocabIndex &ssIndex() { return _ssIndex; };	   /* <s> index */    virtual VocabIndex &seIndex() { return _seIndex; };	   /* </s> index */    virtual VocabIndex &pauseIndex() { return _pauseIndex; }; /* -pau- index */    virtual Boolean &unkIsWord() { return _unkIsWord; };					/* consider <unk> a regular word */    virtual Boolean &toLower() { return _toLower; }	;					/* map word strings to lowercase */    /*     * Some Vocab tokens/indices are "pseudo words", i.e., they don't     * get probabilities since they can only occur in contexts.     */    virtual Boolean isNonEvent(VocabString word)	/* pseudo-word? */	{ return isNonEvent(getIndex(word)); };    virtual Boolean isNonEvent(VocabIndex word) const	/* non-event? */	{ return !_unkIsWord && (word == _unkIndex) ||		 nonEventMap.find(word) != 0; };    virtual VocabIndex addNonEvent(VocabIndex word);    virtual VocabIndex addNonEvent(VocabString name)	{ return addNonEvent(addWord(name)); };    virtual Boolean addNonEvents(Vocab &nonevents);    virtual Boolean removeNonEvent(VocabIndex word);    /*     * Handling of meta-count tags: these are tags that represent a token     * count total, or a type frequency count (count-of-count).     * If metaTag == "__META__", the following tags acquire special meaning:     *     *	__META__		a word count total     *	__META__1		count of singleton word types     *	__META__2		count of word types occurring twice     *	...			...     *	__META__N		count of word types occurring N times     */    virtual VocabString &metaTag() { return _metaTag; }; /* meta-count tag */    Boolean isMetaTag(VocabIndex word)	{ return metaTagMap.find(word) != 0; };    unsigned typeOfMetaTag(VocabIndex word)	{ unsigned *type = metaTagMap.find(word);	  return type != 0 ? *type : (unsigned)-1; };    VocabIndex metaTagOfType(unsigned);    /*     * Utilities for handling Vocab sequences     */    virtual unsigned int getWords(const VocabIndex *wids,			  VocabString *words, unsigned int max);    virtual unsigned int addWords(const VocabString *words,			  VocabIndex *wids, unsigned int max);    virtual unsigned int getIndices(const VocabString *words,			    VocabIndex *wids, unsigned int max,			    VocabIndex unkIndex = Vocab_None);    static unsigned int parseWords(char *line,				   VocabString *words, unsigned int max);    static unsigned int length(const VocabIndex *words);    static unsigned int length(const VocabString *words);    static VocabIndex *copy(VocabIndex *to, const VocabIndex *from);    static VocabString *copy(VocabString *to, const VocabString *from);    static VocabIndex *reverse(VocabIndex *words);    static Boolean contains(const VocabIndex *words, VocabIndex word);    static VocabString *reverse(VocabString *words);    static void write(File &file, const VocabString *words);    /*     *  Comparison of Vocabs and their sequences     */    static int compare(VocabIndex word1, VocabIndex word2);				/* order on word indices induced by Vocab */    static int compare(VocabString word1, VocabString word2)	{ return compareVocab ? compare(compareVocab->getIndex(word1),compareVocab->getIndex(word2)) : strcmp(word1, word2); };    static int compare(const VocabIndex *word1, const VocabIndex *word2);				/* order on word index sequences */    static int compare(const VocabString *word1, const VocabString *word2);    VocabIndexComparator compareIndex() const;    VocabIndicesComparator compareIndices() const;    /*     * Miscellaneous     */    virtual unsigned int read(File &file);    virtual void write(File &file, Boolean sorted = true);    virtual void use() const { outputVocab = (Vocab *)this; }; // discard const    virtual void memStats(MemStats &stats) const;    static Vocab *outputVocab;  /* implicit parameter to operator<< */    static Vocab *compareVocab;	/* implicit parameter to compare() */protected:    LHash<VocabString,VocabIndex> byName;    Array<VocabString> byIndex;    VocabIndex nextIndex;    VocabIndex maxIndex;    LHash<VocabIndex, unsigned> nonEventMap;	/* set of non-event words */    LHash<VocabIndex, unsigned> metaTagMap;	/* maps metatags to their type:						   0	count total						   1	single counts						   ...						   N	count of count N */    // hidden data members (accessed through virtual functions    VocabIndex _unkIndex;		/* <unk> index */    VocabIndex _ssIndex;		/* <s> index */    VocabIndex _seIndex;		/* </s> index */    VocabIndex _pauseIndex;		/* -pau- index */    Boolean _unkIsWord;			/* consider <unk> a regular word */    Boolean _toLower;			/* map word strings to lowercase */    VocabString _metaTag;		/* meta-count tag */};ostream &operator<< (ostream &, const VocabString *words);ostream &operator<< (ostream &, const VocabIndex *words);class VocabIter{public:    VocabIter(const Vocab &vocab, Boolean sorted = false);    void init();    VocabString next() { VocabIndex index; return next(index); };    VocabString next(VocabIndex &index);private:    LHashIter<VocabString,VocabIndex> myIter;};/*  * We sometimes use strings over VocabIndex as keys into maps. * Define the necessary support functions (see Map.h, LHash.cc, SArray.cc). */static inline unsignedLHash_hashKey(const VocabIndex *key, unsigned maxBits){    unsigned i = 0;    /*     * The rationale here is similar to LHash_hashKey(unsigned),     * except that we shift more to preserve more of the typical number of     * bits in a VocabIndex.  The value was optimized to encoding 3 words     * at a time (trigrams).     */    for (; *key != Vocab_None; key ++) {	i += (i << 12) + *key;    }    return LHash_hashKey(i, maxBits);}static inline const VocabIndex *Map_copyKey(const VocabIndex *key){    VocabIndex *copy = new VocabIndex[Vocab::length(key) + 1];    assert(copy != 0);    unsigned i;    for (i = 0; key[i] != Vocab_None; i ++) {	copy[i] = key[i];    }    copy[i] = Vocab_None;    return copy;}static inline voidMap_freeKey(const VocabIndex *key){    delete [] (VocabIndex *)key;}static inline BooleanLHash_equalKey(const VocabIndex *key1, const VocabIndex *key2){    unsigned i;    for (i = 0; key1[i] != Vocab_None && key2[i] != Vocab_None; i ++) {	if (key1[i] != key2[i]) {	    return false;	}    }    if (key1[i] == Vocab_None && key2[i] == Vocab_None) {	return true;    } else {	return false;    }}     static inline intSArray_compareKey(const VocabIndex *key1, const VocabIndex *key2){    unsigned int i = 0;    for (i = 0; ; i++) {	if (key1[i] == Vocab_None) {	    if (key2[i] == Vocab_None) {		return 0;	    } else {		return -1;	/* key1 is shorter */	    }	} else {	    if (key2[i] == Vocab_None) {		return 1;	/* key2 is shorted */	    } else {		int comp = SArray_compareKey(key1[i], key2[i]);		if (comp != 0) {		    return comp;	/* they differ at pos i */		}	    }	}    }    /*NOTREACHED*/}#endif /* _Vocab_h_ */

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -