merge-nbest.gawk

来自「这是一款很好用的工具包」· GAWK 代码 · 共 182 行

GAWK

182 行

#!/usr/local/bin/gawk -f
#
# merge-nbest --
#	merge hyps from multiple N-best lists into a single list
#
# $Header: /home/srilm/devel/utils/src/RCS/merge-nbest.gawk,v 1.6 2004/11/02 02:00:35 stolcke Exp $
#
# usage: merge-nbest [var=value ...] N-BEST1 N-BEST2 ...
#
# Arguments are handled left to right.  An argument containing "=" is
# treated as an option assignment (multiwords, max_nbest, nopauses,
# use_orig_hyps, add_scores); any other argument is read as an N-best
# file.  An option therefore only affects the files that follow it.
#

# Default settings and constants.
BEGIN {
	M_LN10 = 2.30258509299404568402;	# from <math.h>

	logINF = -320;				# exp10() returns 0 below this
	# divisor applied to the parenthesized score of NBestList1.0/2.0
	# lines on input, and multiplier applied when writing them back
	bytelogscale = M_LN10 * 10000.5 / 1024.0;

	use_orig_hyps = 1;	# print the first-seen input line for each hyp
	add_scores = 0;		# log-add the scores of duplicate hyps
	last_nbestformat = -1;	# format of the last list read (-1 = none yet)

	nbestmagic1 = "NBestList1.0";
	nbestmagic2 = "NBestList2.0";
	pause = "-pau-";

	max_nbest = 0;		# if > 0, read at most this many hyps per file
	multiwords = 0;		# if set, replace "_" in words by a space
	nopauses = 0;		# if set, remove pause tokens from the words
}

# Base-10 logarithm of x.
function log10(x) {
	return log(x) / M_LN10;
}

# 10 to the power x; values of x below logINF are flushed to 0.
function exp10(x) {
	if (x < logINF) {
		return 0;
	} else {
		return exp(x * M_LN10);
	}
}

# Given x = log10(a) and y = log10(b), return log10(a + b).
# The larger value is factored out so the exponential cannot overflow.
# temp is a local variable, not a parameter.
function addlogs(x, y,    temp) {
	if (x < y) {
		temp = x; x = y; y = temp;
	}
	return x + log10(1 + exp10(y - x));
}

# Quote s so that the shell treats it as a single literal word.
function shell_quote(s) {
	gsub(/'/, "'\\''", s);
	return "'" s "'";
}

# Read one N-best list from file (uncompressed on the fly if its name ends
# in .gz or .Z) and merge its hypotheses into the global arrays
#	scores[words]	score of the hyp (summed over duplicates if add_scores)
#	hyps[words]	first input line seen with this word sequence
#	nwords[words]	word count recorded for that first line
# Also updates last_nbestformat, and clears use_orig_hyps when lists of
# different formats are mixed.
# Everything after "file" in the parameter list is a local variable.
function process_nbest(file,    input, status, nbestformat, num_hyps, words, \
		       num_words, i, score, start_time, end_time, prev_end_time) {
	# the file name is shell-quoted so that blanks and metacharacters
	# in it are not interpreted by the shell running the command
	if (file ~ /\.(gz|Z)$/) {
		input = "exec gunzip -c " shell_quote(file);
	} else {
		input = "exec cat " shell_quote(file);
	}

	nbestformat = 0;	# format 0 unless a magic header line says otherwise
	num_hyps = 0;

	while ((status = (input | getline)) > 0) {
		if ($1 == nbestmagic1) {
			nbestformat = 1;
		} else if ($1 == nbestmagic2) {
			nbestformat = 2;
		} else {
			words = "";
			num_words = 0;

			num_hyps ++;
			if (max_nbest > 0 && num_hyps > max_nbest) {
				break;
			}

			if (nbestformat == 1) {
				# "(score) w1 w2 ..."
				for (i = 2; i <= NF; i++) {
					words = words " " $i;
					if ($i != pause) num_words ++;
				}
				score = substr($1, 2, length($1)-2)/bytelogscale;
				# NOTE(review): this discards the count made in
				# the loop above; kept as in the original since
				# the intent cannot be confirmed from this file.
				num_words = 1;
			} else if (nbestformat == 2) {
				# "(score)" followed by 11 fields per token: the
				# token itself, its start time at offset 3 and
				# its end time at offset 5
				prev_end_time = -1;
				for (i = 2; i <= NF; i += 11) {
					start_time = $(i + 3);
					end_time = $(i + 5);

					# skip tokens that are subsumed by the previous word
					# (this eliminates phone and state symbols)
					# XXX: due to a bug in Decipher some state tags have
					# incorrect timemarks.  We filter them based on their
					# token string.
					if (start_time > prev_end_time && !($i ~ /-[0-9]$/)) {
						words = words " " $i;
						if ($i != pause) num_words ++;
						prev_end_time = end_time;
					}
				}
				score = substr($1, 2, length($1)-2)/bytelogscale;
			} else {
				# "score1 score2 numwords w1 w2 ..."
				for (i = 4; i <= NF; i++) {
					words = words " " $i;
				}
				score = $1 + 8 * $2;
				num_words = $3;
			}

			# resolve multiwords and eliminate pauses if so desired
			if (multiwords) {
				gsub("_", " ", words);
			}
			if (nopauses) {
				gsub(" " pause, " ", words);
			}

			# if word sequence is new, record it
			if (!(words in scores)) {
				scores[words] = score;
				hyps[words] = $0;
				nwords[words] = num_words;
			} else if (add_scores) {
				scores[words] = addlogs(scores[words], score);
			}

			# original input lines can only be echoed if all lists
			# share one format
			if (last_nbestformat < 0) {
				last_nbestformat = nbestformat;
			} else if (nbestformat != last_nbestformat) {
				use_orig_hyps = 0;
				last_nbestformat = nbestformat;
			}
		}
	}
	if (status < 0) {
		print "error opening " file >> "/dev/stderr";
	}
	close(input);
}

# Print the merged list to stdout.  The traversal order of the hyps is
# whatever "for (words in scores)" yields.
function output_nbest(    words) {
	# NOTE(review): the header is chosen without looking at add_scores,
	# although add_scores lines below are printed in the headerless
	# "score 0 numwords words" layout -- confirm this is intended.
	if (!use_orig_hyps || last_nbestformat == 1) {
		print nbestmagic1;
	} else if (last_nbestformat == 2) {
		print nbestmagic2;
	}

	for (words in scores) {
		if (add_scores) {
			print scores[words], 0, nwords[words], words;
		} else if (use_orig_hyps) {
			print hyps[words];
		} else {
			# words already starts with a blank
			print "(" (scores[words] * bytelogscale) ")" words;
		}
	}
}

# Main program: everything is done in BEGIN, so gawk never reads the
# command-line arguments as input files by itself.
BEGIN {
	if (ARGC < 2) {
		print "usage: " ARGV[0] " N-BEST1 N-BEST2 ..." \
			>> "/dev/stderr";
		exit(2);
	}

	for (arg = 1; arg < ARGC; arg ++) {
		if (equals = index(ARGV[arg], "=")) {
			var = substr(ARGV[arg], 1, equals - 1);
			val = substr(ARGV[arg], equals + 1);

			# unknown option names are silently ignored
			if (var == "multiwords") {
				multiwords = val + 0;
			} else if (var == "max_nbest") {
				max_nbest = val + 0;
			} else if (var == "nopauses") {
				nopauses = val + 0;
			} else if (var == "use_orig_hyps") {
				use_orig_hyps = val + 0;
			} else if (var == "add_scores") {
				add_scores = val + 0;
			}
		} else {
			process_nbest(ARGV[arg]);
		}
	}

	output_nbest();
}

merge-nbest.gawk - 源码说明

本页面展示了「这是一款很好用的工具包」中的 merge-nbest.gawk 源码文件，采用 GAWK 编程语言编写，共 182 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。

虫虫下载站收录了大量与工具包相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。

⌨️ 快捷键说明

复制代码Ctrl + C

搜索代码Ctrl + F

全屏模式F11

增大字号Ctrl + =

减小字号Ctrl + -

显示快捷键?