compute-oov-rate.gawk

来自「这是一款很好用的工具包」· GAWK 代码 · 共 78 行

GAWK
78
字号
#!/usr/local/bin/gawk -f## compute-oov-rate --#	Compute OOV word rate from a vocabulary and a unigram count file## usage: compute-oov-rate vocab countfile ...## Assumes unigram counts do not have repeated words.## $Header: /home/srilm/devel/utils/src/RCS/compute-oov-rate.gawk,v 1.8 2003/03/08 03:59:39 stolcke Exp $#BEGIN {	# high bit characters also detect multibyte characters	letter = "[[:alpha:]\x80-\xFF]";	if ("x" !~ letter) letter = "[A-Za-z\x80-\xFF]";}# Read vocab#ARGIND == 1 {	vocab[$1] = 1;}function is_fragment(word) {	return word ~ (letter "-$") || word ~ ("^-" letter);}## Read counts#ARGIND > 1 {	if ($1 == "<s>" || $1 == "</s>" || $1 == "-pau-") {		next;	}	total_count += $2;	total_types ++;	if (!vocab[$1]) {		oov_count += $2;		oov_types ++; 		if (!is_fragment($1)) {		    if (write_oov_words) {			    print > write_oov_words;		    }		} else {		    if (write_oov_frags) {			    print > write_oov_frags;		    }		}	}	if (!is_fragment($1)) {		total_nofrag_count += $2;		total_nofrag_types ++;		if (!vocab[$1]) {			oov_nofrag_count += $2;			oov_nofrag_types ++; 		}	}}END {	printf "OOV tokens: %d / %d (%.2f%%) ", \			oov_count, total_count, 100 * oov_count/total_count;	printf "excluding fragments: %d / %d (%.2f%%)\n", \			oov_nofrag_count, total_nofrag_count, \			100 * oov_nofrag_count/total_nofrag_count;	printf "OOV types: %d / %d (%.2f%%) ", \			oov_types, total_types, 100 * oov_types/total_types;	printf "excluding fragments: %d / %d (%.2f%%)\n", \			oov_nofrag_types, total_nofrag_types, \			100 * oov_nofrag_types/total_nofrag_types;}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?