subvocab.cc

来自「这是一款很好用的工具包」· C++ 代码 · 共 101 行

101 行

/*
 * SubVocab.cc --
 *	Vocabulary subset class
 *
 */

#ifndef lint
static char Copyright[] = "Copyright (c) 1996,1999,2003 SRI International.  All Rights Reserved.";
static char RcsId[] = "@(#)$Header: /home/srilm/devel/lm/src/RCS/SubVocab.cc,v 1.7 2006/01/05 20:21:27 stolcke Exp $";
#endif

#include <iostream>
using namespace std;
#include <string.h>
#include <ctype.h>
#include <assert.h>

#include "SubVocab.h"

#include "LHash.h"
#include "Array.h"

/*
 * Construct a sub-vocabulary on top of an existing vocabulary.
 *
 * baseVocab is used to initialize the _baseVocab member and also becomes
 * the outputVocab of this object.  Afterwards the entries at the four
 * special-token indices are removed from this object.
 */
SubVocab::SubVocab(Vocab &baseVocab)
    :  _baseVocab(baseVocab)
{
    /*
     * These defaults are inherited from the base vocab.
     */
    outputVocab = &baseVocab;

    /*
     * sub-vocabularies don't have any special tokens by default
     *
     * NOTE(review): judging by the member names these are the
     * unknown-word, sentence-start, sentence-end and pause tokens,
     * presumably created by the base-class constructor, which is not
     * visible in this file -- confirm.  The calls are kept as separate
     * statements because remove() may update these members itself.
     */
    remove(_unkIndex);
    remove(_ssIndex);
    remove(_seIndex);
    remove(_pauseIndex);
}

// Add a word, given by name, to the sub-vocabulary
VocabIndex
SubVocab::addWord(VocabString name)
{
    /*
     * The base vocabulary is asked to add the word first; the index
     * it hands back is the one registered in this sub-vocabulary, so
     * both vocabularies agree on the numbering.
     */
    VocabIndex baseIndex = _baseVocab.addWord(name);

    /*
     * The base vocabulary could not supply an index: give up.
     */
    if (baseIndex == Vocab_None) {
	return Vocab_None;
    }

    return addWord(baseIndex);
}

// Register an index of the base vocabulary in this sub-vocabulary
VocabIndex
SubVocab::addWord(VocabIndex wid)
{
    /*
     * Only indices that the base vocabulary can map back to a word
     * are accepted; anything else is rejected with Vocab_None.
     */
    VocabString canonicalName = _baseVocab.getWord(wid);

    if (canonicalName == 0) {
	return Vocab_None;
    }

    /*
     * Look up / create the entry under the base vocabulary's own
     * spelling, in case the base Vocab changed capitalization.
     */
    Boolean alreadyPresent;
    VocabIndex *slot = byName.insert(canonicalName, alreadyPresent);

    if (alreadyPresent) {
	/*
	 * Word was registered before: it must carry the same index.
	 */
	assert(*slot == wid);
	return wid;
    }

    /*
     * New entry: record the index under the name, and the name
     * (as keyed by the byName table) under the index.
     */
    *slot = wid;
    byIndex[wid] = byName.getInternalKey(canonicalName);

    /*
     * Zero the word strings in the gap between the previous
     * nextIndex and the new index, so that lookups there return 0.
     */
    unsigned gap = nextIndex;
    while (gap < wid) {
	byIndex[gap] = 0;
	gap ++;
    }

    /*
     * Keep nextIndex at 1 plus the highest word index used.
     */
    if (wid + 1 > nextIndex) {
	nextIndex = wid + 1;
    }

    return wid;
}

subvocab.cc - 源码说明

本页面展示了「这是一款很好用的工具包」中的 subvocab.cc 源码文件，采用 C++ 编程语言编写，共 101 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。

虫虫下载站收录了大量与工具包相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。

⌨️ 快捷键说明

复制代码Ctrl + C

搜索代码Ctrl + F

全屏模式F11

增大字号Ctrl + =

减小字号Ctrl + -

显示快捷键?