📄 rescore-decipher

📁 这是一款很好用的工具包
💻
字号:
#!/bin/sh
#
# rescore-decipher --
#	generate scores from Decipher(TM) n-best lists
#
# usage: rescore-decipher [options] nbest-file-list score-dir lm-options ...
#
#	nbest-file-list	file naming one N-best list per line (lists may be
#			compressed; they are read through "gunzip -cf").
#			Lines whose first word is <LMstate> are passed through
#			to the LM tool unchanged.
#	score-dir	directory receiving the output (created if missing)
#	lm-options	remaining arguments, handed to the LM tool verbatim
#			(with -norescore they are only scanned for
#			-decipher-lmw and -decipher-wtw)
#
# Two modes of operation:
#   -fast	N-best lists are handed directly to nbest-lattice/ngram.
#   (default)	All hyps are concatenated into one stream, with an escape
#		line marking the start of each list, optionally filtered,
#		optionally rescored, and finally split into one gzipped
#		file per N-best list in score-dir.
#
# $Header: /home/srilm/devel/utils/src/RCS/rescore-decipher,v 1.32 2004/08/01 01:56:31 stolcke Exp $
#

bytelog=0
nodecipherlm=0
multiwords=0
mw_option=		# initialised so a stray environment value cannot leak in
norescore=0
decipher_lmw=8
decipher_wtw=0
lm_only=0
pretty_file=
filter_command=
limit_vocab=0
fast_rescore=
ngram_tool=ngram
ngram_options=
count_oovs=0
rescore_option=-rescore

while [ $# -gt 0 ]
do
    case "$1" in
    -bytelog)
	    bytelog=1
	    ;;
    -nodecipherlm)
	    nodecipherlm=1
	    ;;
    -multiwords)
	    multiwords=1
	    mw_option=-multiwords
	    ;;
    -norescore)
	    norescore=1
	    ;;
    -lm-only)
	    lm_only=1
	    ;;
    -count-oovs)
	    count_oovs=1
	    # run the LM tool in perplexity mode instead of N-best rescoring
	    rescore_option="-debug 1 -ppl"
	    ;;
    -pretty)
	    pretty_file="$2"; shift
	    ;;
    -ngram-tool)
	    ngram_tool="$2"; shift
	    ;;
    -filter)
	    filter_command="$2"; shift
	    ;;
    -limit-vocab)
	    limit_vocab=1
	    ;;
    -fast)
	    fast_rescore=1
	    ;;
    -*)	    echo "$0: unknown option $1" >&2
	    exit 2 ;;
    *)
	    break
	    ;;
    esac

    shift
done

if [ $# -lt 3 ]; then
    {
	echo "usage: $0 [-bytelog] [-nodecipherlm] [-multiwords] [-norescore] [-lm-only] [-count-oovs] [-pretty map] [-ngram-tool pgm] [-filter command] [-limit-vocab] [-fast] nbest-file-list score-dir lm-options ..."
	echo "where"
	echo "	-bytelog	produces bytelog scaled scores"
	echo "	-nodecipherlm	avoids Decipher LM score computation"
	echo "	-multiwords	expand multiwords into constituent words"
	echo "	-norescore	don't rescore LM, just extract scores"
	echo "	-lm-only	output no N-best lists, only LM scores"
	echo "	-count-oovs	output number of OOV and zeroprob words"
	echo "	-pretty map	word mapping file"
	echo "	-ngram-tool pgm use pgm for LM evaluation"
	echo "	-filter command	text filter to apply to N-best hyps"
	echo "	-limit-vocab	limit LM vocabulary to words found in the N-best lists"
	echo "	-fast		fast rescoring mode, no text filtering allowed"
    } >&2
    exit 1
fi

filelist="$1"
scoredir="$2"
shift; shift

if [ ! -d "$scoredir" ]; then
    # nothing useful can be produced without the output directory
    mkdir -p "$scoredir" || exit 1
fi

# when not rescoring need to get decipher lmw and wtw from remaining options
if [ "$norescore" -gt 0 ]; then
    while [ $# -gt 0 ]
    do
	case "$1" in
	-decipher-lmw)
		decipher_lmw=$2
		shift
		;;
	-decipher-wtw)
		decipher_wtw=$2
		shift
		;;
	*)	shift
		;;
	esac
    done
fi

if [ "$norescore" -eq 0 ] && [ "$limit_vocab" -gt 0 ]; then
    #
    # limit LM vocabulary to words found in the nbest lists
    #
    # The name is created with mktemp so it is not predictable.  It stays
    # directly under /tmp (no whitespace) because it is embedded in
    # $ngram_options, which is expanded unquoted below.
    nbestvocab=$(mktemp /tmp/nbest.vocab.XXXXXX) || exit 1
    trap 'rm -f -- "$nbestvocab"; exit' 0 1 2 15

    # generate nbest vocabulary
    nbest-lattice -no-rescore -no-reorder $mw_option -nbest-files "$filelist" \
		-write-vocab "$nbestvocab"

    # tell ngram to use this vocab
    ngram_options="-limit-vocab -vocab $nbestvocab"
fi

if [ -n "$fast_rescore" ]; then

    #
    # Fast rescoring mode:
    #	Hand N-best lists directly to ngram. No text filtering is supported
    #
    if [ -n "$pretty_file" ] || [ -n "$filter_command" ] ||
       [ "$lm_only" -gt 0 ] || [ "$count_oovs" -gt 0 ]
    then
	echo "Text filtering, -lm-only, and -count-oovs not supported with -fast" >&2
	exit 2
    fi

    if [ "$nodecipherlm" -eq 0 ]; then
	echo "Must use -nodecipherlm with -fast" >&2
	exit 2
    fi

    if [ "$norescore" -gt 0 ]; then
	nbest-lattice -no-rescore -no-reorder $mw_option \
		-nbest-files "$filelist" \
		-write-nbest-dir "$scoredir"
    else
	# the LM tool spells the multiword option differently
	if [ "$multiwords" -gt 0 ]; then
	    mw_option=-split-multiwords
	fi

	# $ngram_tool, $mw_option and $ngram_options are deliberately left
	# unquoted: each may hold several (or zero) words
	$ngram_tool \
		-no-reorder $mw_option \
		-nbest-files "$filelist" \
		-write-nbest-dir "$scoredir" \
		-rescore-lmw 1 -rescore-wtw 1 \
		$ngram_options "$@"
    fi

else # fast_rescore

    #
    # General rescoring mode:
    #	Concatenate hyps for all nbest list, record number of hyps for
    #		each file in the output stream
    #	Feed to ngram -rescore (using lm-options)
    #		or using -ppl for counting OOVs
    #	Parse ngram output into lm scores and deposit into target files
    #

    # marker that starts the line announcing each new N-best list
    escape="***FILE:"

    # NOTE: both gawk programs below receive their parameters as trailing
    # var=value operands rather than with -v.  Their BEGIN blocks reset some
    # of the same variables, and operand assignments are applied after BEGIN,
    # so switching to -v would wipe the values.

    cat "$filelist" | (
	while read -r filename rest; do
	    case $filename in
	    # preserve LMstate labels in the file list and pass them to ngram
	    "<LMstate>")
		# $rest is unquoted on purpose (original whitespace handling)
		echo "$filename" $rest
		continue ;;
	    # skip blank lines; otherwise gunzip would be run without a file
	    "")
		continue ;;
	    esac

	    gunzip -cf "$filename" |
	    gawk '
BEGIN {
	filename = "";
	numhyps = 0;
	nbestformat = 0;

	# constants
	# scale factor between bytelog and log10 scores (see bytelog2log10)
	bytelogscale = 2.30258509299404568402 * 10000.5 / 1024.0;
	pause = "-pau-";
}

# convert a bytelog-scaled score to log base 10
function bytelog2log10(x) {
	return x / bytelogscale;
}

# first input line: derive the sentence id, load the word map and
# announce the new list on the output stream
NR == 1 {
	# sentence id = file name without directory and known suffixes
	sentid = filename;
	sub("^.*/", "", sentid);
	sub("\\.gz$", "", sentid);
	sub("\\.Z$", "", sentid);
	sub("\\.score$", "", sentid);
	sub("\\.wv$", "", sentid);
	sub("\\.wav$", "", sentid);
	sub("\\.wav_cep$", "", sentid);

	# read pretty map file
	# (first field is the word, the remaining fields its replacement)
	if (pretty_file) {
	    while ((getline mapline < pretty_file) > 0) {
		npretty = split(mapline, pretty_list);
		word = pretty_list[1];
		pretty_map[word] = "";
		for (i = 2; i <= npretty; i ++) {
		    pretty_map[word] = pretty_map[word] " " pretty_list[i];
		}
	    }
	}

	print escape, sentid;
}

# apply the word map (and multiword splitting) to fields start..NF
function pretty_up(start,    i) {
	for (i = start; i <= NF; i ++) {
	    if ($i in pretty_map) {
		$i = pretty_map[$i];
	    }
	    if (multiwords) gsub("_", " ", $i);
	}
}

# format header lines select how the following hyps are parsed
/^NBestList1\.0/ {
	nbestformat = 1;
	if (nodecipherlm) {
	    printf "%s: -nodecipherlm ineffective for NBestList1.0\n", filename > "/dev/stderr" ;
	}
	next;
}
/^NBestList2\.0/ {
	nbestformat = 2;
	next;
}

# every other line is one hypothesis
{
	numhyps ++;
	if (nbestformat == 0) {
	    # no header seen: three score fields followed by the words
	    pretty_up(4);
	    if (count_oovs) {
		# output only the words, add <s> to handle empty hyps
		$1 = $2 = $3 = "";
		print "<s>", $0;
	    } else {
		print;
	    }
	} else if (nbestformat == 1) {
	    # "(score)" followed by the words
	    pretty_up(2);
	    if (count_oovs) {
		# output only the words, add <s> to handle empty hyps
		$1 = "";
		print "<s>", $0;
	    } else if (norescore) {
		# convert to SRILM format
		score = substr($1,2,length($1)-2);
		$1 = "";
		print bytelog2log10(score), 0, 0, $0;
	    } else {
		# keep Decipher format
		print;
	    }
	} else if (nbestformat == 2) {
	    score = substr($1,2,length($1)-2);

	    # compute total AC and LM scores
	    lm_score = 0;
	    num_words = 0;
	    num_pauses = 0;

	    words = "";
	    prev_end_time = -1;
	    # tokens come in groups of 11 fields starting at field 2:
	    # token string at i, start time at i+3, end time at i+5,
	    # LM score at i+7
	    for (i = 2; i <= NF; i += 11) {
		start_time = $(i + 3);
		end_time = $(i + 5);

		# skip tokens that are subsumed by the previous word
		# (this eliminates phone and state symbols)
		# XXX: due to a bug in Decipher some state tags have incorrect
		# timemarks.  We filter them based on their token string.
		if (start_time > prev_end_time && !($i ~ /-[0-9]$/)) {
		    words = words " " $i;

		    num_words ++;
		    if ($i == pause) num_pauses ++;

		    lm_score += $(i + 7);

		    prev_end_time = end_time;
		}
	    }

	    # rebuild the record as "(score) word ..."
	    $0 = $1 " " words;
	    pretty_up(2);

	    # Compute AC score from total and lm scores. This takes into
	    # account that the recognizer might sum scores of equivalent hyps
	    # (e.g., those differing only in pauses or pronunciations) and
	    # reflect the summing in the total score, but not in the word AC
	    # scores.
	    ac_score = score - lm_score;

	    if (count_oovs) {
		# output only the words, add <s> to handle empty hyps
		$1 = "";
		print "<s>", $0;
	    } else if (norescore) {
		# convert to SRILM nbest format
		# NOTES:
		# - subtract Decipher WTW (including for pauses!)
		# - compute number of words WITHOUT pauses for output
		# (num_words counts pauses too; the original referenced an
		# unset variable here, so the WTW term was always zero)
		$1 = "";
		print bytelog2log10(ac_score), \
			bytelog2log10(lm_score/decipher_lmw) - \
				num_words * decipher_wtw,  \
			split(words, dummy) - num_pauses, $0;
	    } else if (nodecipherlm) {
		# output only acoustic score in Decipher format
		$1 = "(" ac_score ")";
		print;
	    } else {
		# output combined score in Decipher format
		print;
	    }
	}
}
END {
	if (numhyps == 0) {
		print "WARNING: nbest list " filename " is empty" \
			> "/dev/stderr" ;
	}
}
' filename="$filename" escape="$escape" count_oovs="$count_oovs" \
  nodecipherlm="$nodecipherlm" multiwords="$multiwords" \
  pretty_file="$pretty_file" norescore="$norescore" \
  decipher_lmw="$decipher_lmw" decipher_wtw="$decipher_wtw"
	done
    ) |
    if [ "$norescore" -gt 0 ] && [ -z "$filter_command" ]; then
	# no rescoring and no filtering
	cat
    elif [ "$norescore" -gt 0 ]; then
	# no rescoring, but filter hyps
	# (the filter is a user-supplied shell command line, hence eval)
	eval "$filter_command"
    elif [ -z "$filter_command" ]; then
	# standard rescoring without filtering
	# ($rescore_option and $ngram_options hold several words: unquoted)
	$ngram_tool -debug 1 $rescore_option - -rescore-lmw 1 -rescore-wtw 1 \
		-escape "$escape " $ngram_options "$@"
    else
	# rescoring with filtering
	eval "$filter_command" |
	$ngram_tool -debug 1 $rescore_option - -rescore-lmw 1 -rescore-wtw 1 \
		-escape "$escape " $ngram_options "$@"
    fi |
    gawk -v bytelog="$bytelog" '
BEGIN {
	currentfile = "";
	scoredir = "";
	scorefile = "";
	numhyps = 0;
	bytelogscale = 2.30258509299404568402 * 10000.5 / 1024.0;
}

# escape line: finish the previous output file and start the next one
$1 == escape {
	if (currentfile) {
		close(scorefile);
	}

	currentfile = $2;
	if (!lm_only && !count_oovs) {
	    # backward compatibility
	    currentfile = $2 ".score";
	}
	# output goes through a gzip pipe; the path is quoted for the shell
	scorefile = "gzip > \"" scoredir "/" currentfile ".gz\"";

	printf "processing hyps for %s\n", currentfile \
		> "/dev/stderr" ;
	hypno = 0;
	next;
}
# parse ngram -ppl output to get OOV (including zeroprobs) count
count_oovs && $6 == "OOVs" {
	num_oovs = $5;
	next;
}
count_oovs && $2 == "zeroprobs," {
	num_oovs += $1;
	print num_oovs | scorefile;
	next;
}
# process ngram -rescore output
!count_oovs {
	if ($2 ~ /NaN/) {
	    print "WARNING: LM score in nbest list " currentfile " is NaN" \
							    > "/dev/stderr" ;
	    $2 = -100000;
	}

	if (bytelog) {
	    $1 = $1 * bytelogscale;
	    $2 = $2 * bytelogscale;
	}
	if (lm_only) {
	    print $2 | scorefile;
	} else  {
	    print | scorefile;
	}
}
END {
	if (currentfile) {
		close(scorefile);
	}
}
' scoredir="$scoredir" escape="$escape" bytelog="$bytelog" \
  lm_only="$lm_only" count_oovs="$count_oovs"

fi # fast_rescore
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -