⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 ngram-merge.cc

📁 这是一款很好用的工具包
💻 CC
字号:
/*
 * ngram-merge --
 *	Merge sorted ngram counts
 *
 */

#ifndef lint
static char Copyright[] = "Copyright (c) 1995-2006 SRI International.  All Rights Reserved.";
static char RcsId[] = "@(#)$Id: ngram-merge.cc,v 1.16 2006/01/05 20:21:27 stolcke Exp $";
#endif

#include <new>
#include <iostream>
using namespace std;
#include <stdlib.h>
#include <locale.h>
#include <assert.h>

#include "option.h"
#include "version.h"

#include "File.h"
#include "Vocab.h"
#include "NgramStats.cc"
#include "Array.cc"

typedef double FloatCount;	// fractional count type
typedef unsigned IntCount;	// integral count type
/*
 * A single count held either as an integer or as a fractional value.
 * The union carries no tag of its own: the code in this file consults
 * the global floatCounts flag to decide which member to read or write.
 */
typedef union {
	IntCount i;		// member used when floatCounts is zero
	FloatCount f;		// member used when floatCounts is non-zero
} MergeCount;			// integer or fractional count

/*
 * Command-line settings.  Each variable is the target of one entry in
 * the options table below, which main() hands to Opt_Parse().
 */
static int version = 0;			// target of the "version" option
static char *outName = (char *)"-";	// target of the "write" option;
					// NOTE(review): "-" presumably selects
					// standard output -- confirm in File
static int floatCounts = 0;		// target of the "float-counts" option
static int optRest;			// target of the "-" (OPT_REST) entry

/*
 * Option descriptions: { type, name, address of target, help text }.
 */
static Option options[] = {
    { OPT_TRUE, "version", &version, "print version information" },
    { OPT_TRUE, "float-counts", &floatCounts, "use fractional counts" },
    { OPT_STRING, "write", &outName, "counts file to write" },
    { OPT_REST, "-", &optRest, "indicate end of option list" },
    { OPT_DOC, 0, 0, "following options, specify 2 or more files to merge" },
};

/*
 * Merge state kept for each input file: the entry most recently read
 * from that file and whether it has to be replaced by a new one.
 */
typedef struct {
    VocabString ngram[maxNgramOrder + 1];	// words of the current ngram
    unsigned int howmany;	// result of the last readNgram() call;
				// 0 makes findNext() treat the file as exhausted
    MergeCount count;		// count read together with the current ngram
    Boolean isNext;		// true: getInput() must read a fresh entry
} PerFile;

/*
 * Refill the entry of every file that is flagged 'next'.
 *
 * For each flagged file the flag is cleared and one ngram with its count
 * is read, as a fractional or an integral count depending on floatCounts.
 * Files that are not flagged keep their current entry.
 *
 * Returns true if, after the refill, at least one file holds an entry
 * (howmany > 0), whether or not it was read during this call.
 */
Boolean
getInput(unsigned int nfiles, PerFile *perfile, File *files)
{
    Boolean gotData = false;

    for (unsigned int k = 0; k < nfiles; k++) {
	PerFile &entry = perfile[k];

	if (entry.isNext) {
	    entry.isNext = false;

	    if (floatCounts) {
		entry.howmany =
		    NgramCounts<FloatCount>::readNgram(files[k],
			    entry.ngram, maxNgramOrder + 1,
			    entry.count.f);
	    } else {
		entry.howmany =
		    NgramCounts<IntCount>::readNgram(files[k],
			    entry.ngram, maxNgramOrder + 1,
			    entry.count.i);
	    }
	}

	gotData = gotData || (entry.howmany > 0);
    }

    return gotData;
}

/*
 * Pick the smallest ngram among the current entries of all files.
 *
 * Every file whose current entry equals that smallest ngram is flagged
 * 'next' (so getInput() will replace it), and the counts of those entries
 * are summed -- as fractional or integral values depending on floatCounts.
 *
 * The summed count is stored in 'count'.  The return value points at the
 * ngram array of the first file holding the minimum, or is 0 if no file
 * has an entry left.
 */
VocabString *
findNext(unsigned int nfiles, PerFile *perfile, MergeCount &count)
{
    VocabString *best = 0;
    MergeCount total;

    if (floatCounts) {
	total.f = 0.0;
    } else {
	total.i = 0;
    }

    for (unsigned int k = 0; k < nfiles; k++) {
	PerFile &cur = perfile[k];

	/* Nothing left in this file. */
	if (cur.howmany == 0) {
	    continue;
	}

	/* First file that still has data: initial candidate. */
	if (best == 0) {
	    best = cur.ngram;
	    cur.isNext = true;
	    total = cur.count;
	    continue;
	}

	int order = Vocab::compare(best, cur.ngram);

	if (order > 0) {
	    /*
	     * The candidate compares greater than this entry, so this
	     * entry becomes the new minimum: restart the sum and unflag
	     * every file seen before it.
	     */
	    best = cur.ngram;
	    cur.isNext = true;
	    for (unsigned int p = 0; p < k; p++) {
		perfile[p].isNext = false;
	    }
	    total = cur.count;
	} else if (order == 0) {
	    /* Same ngram as the candidate: add its count. */
	    cur.isNext = true;

	    if (floatCounts) {
		total.f += cur.count.f;
	    } else {
		total.i += cur.count.i;
	    }
	}
	/* order < 0: this entry sorts later and stays untouched. */
    }

    count = total;
    return best;
}

static
void
merge_counts(unsigned int nfiles, File *files, File &out)
{
    makeArray(PerFile, perfile, nfiles);

    /*
     * Get input from all files
     */
    for (int i = 0; i < nfiles; i++) {
	perfile[i].isNext = true;
    }

    while (getInput(nfiles, perfile, files)) {
	MergeCount count;
	VocabString *nextNgram = findNext(nfiles, perfile, count);

	if (floatCounts) {
	    NgramCounts<FloatCount>::writeNgram(out, nextNgram, count.f);
	} else {
	    NgramCounts<IntCount>::writeNgram(out, nextNgram, count.i);
	}
    }
}

/*
 * Entry point: parse the options, open every remaining command-line
 * argument as an input counts file, and write the merged counts to the
 * file named by outName.
 *
 * Exits with status 1 if fewer than two input files are given, and with
 * status 0 after printing the version or after a completed merge.
 */
int
main(int argc, char **argv)
{
    setlocale(LC_CTYPE, "");
    setlocale(LC_COLLATE, "");

    argc = Opt_Parse(argc, argv, options, Opt_Number(options),
							OPT_OPTIONS_FIRST);
    if (version) {
	printVersion(RcsId);
	exit(0);
    }

    /* argv[0] plus at least two file names must remain after parsing. */
    if (argc < 3) {
	cerr << "need at least two input files\n";
	exit(1);
    }

    int nfiles = argc - 1;
    File *files = new File[nfiles];
    assert(files != 0);

    /*
     * The array elements were default-constructed by new[]; construct
     * each one again in place so that it refers to its input file.
     * NOTE(review): the default-constructed objects are overwritten
     * without being destroyed -- confirm that File's default
     * constructor acquires nothing that would need releasing.
     */
    for (int i = 0; i < nfiles; i++) {
	new (&files[i]) File(argv[i + 1], "r");
    }

    /*
     * exit() does not run the destructors of automatic objects, so the
     * output file gets a scope of its own: it is destroyed when the block
     * ends, before exit() is reached.
     * NOTE(review): this assumes File's destructor is what finishes and
     * closes the output -- confirm in File.h.
     */
    {
	File outfile(outName, "w");
	merge_counts(nfiles, files, outfile);
    }

    delete [] files;

    exit(0);
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -