⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 compute-sclite

📁 这是一款很好用的工具包
💻
字号:
#!/bin/sh
#
# compute-sclite --
#	compute word error rate from a sentid hyp file and a sentid reference
#	file, using the NIST 'sclite' program
#
# $Header: /home/srilm/devel/utils/src/RCS/compute-sclite,v 1.29 2005/05/08 21:18:50 stolcke Exp $
#
# usage: compute-sclite [-v] -h hyps -r refs [-S id-subset] [-M|-multiwords]
#                       [-noperiods] [-H] [-R] [-s] [-g glm-file]
#                       [sclite-options ...]
#    or  compute-sclite hyps refs
#
# The reference format is selected by the refs file name:
#	*.stm		STM references (filtered through csrfilt.sh if -g given)
#	*.stm.filt	STM references that are already filtered
#	anything else	sentid transcripts, converted with sentid-to-sclite
# Hyps named *.ctm are read as CTM; anything else as sentid transcripts.
#

# enforce proper sorting order (sort and join must agree on collation)
LC_COLLATE=C
export LC_COLLATE

reject="@reject@"

subsets=
remove_periods=

# Print the usage message on stderr and exit with status 2.
usage () {
    echo "usage: $0 [-v] -h hyps -r refs [-S id-subset] [-M|-multiwords] [-noperiods] [-g glm-file] [sclite-options ...]" >&2
    echo "   or  $0 hyps refs" >&2
    exit 2
}

if [ $# -lt 2 ]; then
    usage
elif [ $# -eq 2 ]; then
    # old syntax
    hyps=${1}
    refs=${2}
    # the sclite invocations below need a system name in this mode as well
    name=$(basename "$hyps")
else
    # parse arguments
    while [ $# -gt 0 ]; do
        case "$1" in
        -v) verbose=1 ;;
        -r) refs=$2; shift ;;
        -h) # may be repeated; $hyps is a space-separated list of files
            hyps="$hyps $2"
            name=$(basename "$2")
            shift ;;
        -S) subsets="$subsets $2"; shift ;;
        -M|-multiwords)
            multiwords=1 ;;
        -noperiods)
            remove_periods=1 ;;
        -H) remove_hesitations=1 ;;
        -R) reject="<>" ;;
        -g) glmfile=$2; shift ;;
        -s) case_sensitive=1 ;;
        *)  # anything unrecognized is handed to sclite
            options="$options $1" ;;
        esac
        shift
    done
fi

# Both inputs are read more than once below, so neither can be omitted.
if [ -z "$hyps" ] || [ -z "$refs" ]; then
    usage
fi

# -s applies to both csrfilt.sh and sclite
if [ -n "$case_sensitive" ]; then
    filter_options="-s"
    options="$options -s"
fi

# Intermediate files live in a private directory rather than under
# predictable /tmp names.
tmpdir=$(mktemp -d "${TMPDIR:-/tmp}/ce.XXXXXX") || exit 1
sentids=$tmpdir/sentids
speakers=$tmpdir/speakers
sortedrefs=$tmpdir/refs
sortedhyps=$tmpdir/hyps
ignorehyps=$tmpdir/ign

# Clean up on any exit; signals end the script, which fires the exit trap.
trap 'rm -rf "$tmpdir"' 0
trap 'exit 1' 1 2 13 15

# multijoin file1 [file2 ...]
#	Join all argument files on their first field, left to right, and
#	write the result to stdout.  "-" stands for stdin.
multijoin () {
    if [ $# -eq 1 ]; then
        cat "$1"
    else
        join "$1" "$2" | { shift; shift; multijoin - "$@"; }
    fi
}

#
# extract and sort sentids from hyps
#
# $hyps and $subsets are intentionally unquoted: each may hold several files.
case "$hyps" in
*.ctm)
    cat $hyps |
    gawk '!/^;;/ && $7 != "non-lex" && $7 != "fp" { print $1 "_" $2 }' ;;
*)
    cat $hyps | gawk '{ print $1 }' ;;
esac |
sort |
multijoin - $subsets > "$sentids"

#
# extract and sort refs for these sentids
#
case "$refs" in
*.stm) # NIST scoring
    sed 's,\([-_][ABab]\)[-_].*,\1,' "$sentids" | uniq |
    tr '[A-Z]' '[a-z]' > "$speakers"

    # filter out speakers not occurring in hyp file
    # (sort by the prepended key, then numerically by the fifth field)
    gawk '!/^;;/ { print tolower($1 "_" $2), $0 }' "$refs" |
    sort -k 1,1 -k 5,5n |
    join - "$speakers" |
    gawk '{ $1 = ""; if ($7 ~ /^<.*>$/) $7 = "<>"; print }' |
    if [ -n "$glmfile" ]; then
        gawk '{ gsub("-","_",$1); gsub("-","_",$3); print }' |
        csrfilt.sh $filter_options -i stm -dh "$glmfile"
    else
        cat
    fi > "$sortedrefs"
    ;;
*.stm.filt) # NIST scoring with pre-filtered references
    sed 's,\([-_][ABab]\)[-_].*,\1,' "$sentids" | uniq |
    tr '[A-Z]' '[a-z]' > "$speakers"

    # filter out speakers not occurring in hyp file
    gawk '!/^;;/ { print tolower($1 "_" $2), $0 }' "$refs" |
    sort -k 1,1 -k 5,5n |
    join - "$speakers" |
    gawk '{ $1 = ""; if ($7 ~ /^<.*>$/) $7 = "<>"; print }' |
    if [ -n "$glmfile" ]; then
        gawk '{ gsub("-","_",$1); gsub("-","_",$3); print }'
    else
        cat
    fi > "$sortedrefs"
    ;;
*)
    sort "$refs" | join - "$sentids" |
    gawk '{
            if (multiwords)
                for (i = 2; i <= NF; i++) gsub("_", " ", $i);
            print
          }' multiwords=$multiwords |
    sed -e 's,\[[^]]*\],,g' |
    sentid-to-sclite > "$sortedrefs"

    # find segments to ignore
    gawk 'NF == 2 && tolower($2) == "ignore_time_segment_in_scoring" { print $1 }' < "$refs" |
    sort > "$ignorehyps"
    ;;
esac

#
# sort and condition hyps
#
case "$refs" in
*.stm|*.stm.filt) # NIST scoring
    # sclite will handle ignored segments
    case "$hyps" in
    *.ctm)
        cat $hyps | gawk '!/^;;/ { print tolower($1 "_" $2), $0 }' |
        join - "$speakers" |
        gawk '{ $1 = ""; print }' ;;
    *)
        sort -k 1,1 $hyps | join - "$sentids" | sentid-to-ctm ;;
    esac |
    gawk '{ # handle new-style CTM format (convert it to old format)
            if (NF >= 7) {
                if ($7 != "lex") next;
                else $7 = $8 = "";
            }
            if (remove_periods) gsub("[.]", "", $5);
            print;
          }' remove_periods=$remove_periods |
    if [ -n "$glmfile" ]; then
        gawk '{ gsub("-","_",$1); print }' |
        csrfilt.sh $filter_options -i ctm -dh "$glmfile" |
        if [ -n "$remove_hesitations" ]; then
            grep -v '%HESITATION'
        else
            cat
        fi
    else
        cat
    fi > "$sortedhyps"
    ;;
*)
    # we have to remove ignored segments ourselves
    sort -k 1,1 $hyps | join - "$sentids" | join -v 1 - "$ignorehyps" |
    gawk '{
            if (multiwords)
                for (i = 2; i <= NF; i++) gsub("_", " ", $i);
            if (remove_periods)
                for (i = 2; i <= NF; i++) gsub("[.]", "", $i);
            print
          }' remove_periods=$remove_periods multiwords=$multiwords |
    sed -e 's,\[[^]]*\],,g' \
        -e 's,<[^>]*>,,g' \
        -e "s,$reject,,g" \
        -e 's,-pau-,,g' |
    sentid-to-sclite > "$sortedhyps"
    ;;
esac

# -v: trace the sclite command line
if [ -n "$verbose" ]; then
    set -x
fi

# $options is intentionally unquoted: it carries the pass-through sclite flags.
case "$refs" in
*.stm|*.stm.filt) # NIST scoring
    sclite -f 0 -O . -n "$name" \
        -h "$sortedhyps" ctm "$name" -r "$sortedrefs" stm \
        -D $options
    ;;
*)
    sclite -f 0 -O . -n "$name" \
        -h "$sortedhyps" trn "$name" -r "$sortedrefs" trn \
        -i swb $options
    ;;
esac

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -