⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 compute-oov-rate.gawk

📁 这是一款很好用的工具包
💻 GAWK
字号:
#!/usr/local/bin/gawk -f
#
# compute-oov-rate --
#	Compute OOV word rate from a vocabulary and a unigram count file
#
# usage: compute-oov-rate vocab countfile ...
#
# The first file argument is the vocabulary (one word per line, first field
# is used); every following file is a unigram count file with the word in
# field 1 and its count in field 2.
#
# Assumes unigram counts do not have repeated words.
#
# Optional variables (presumably set by the caller, e.g. with -v):
#	write_oov_words		file receiving count lines of OOV non-fragments
#	write_oov_frags		file receiving count lines of OOV fragments
#
# $Header: /home/srilm/devel/utils/src/RCS/compute-oov-rate.gawk,v 1.8 2003/03/08 03:59:39 stolcke Exp $
#

BEGIN {
	# high bit characters also detect multibyte characters
	letter = "[[:alpha:]\x80-\xFF]";
	# fall back to an explicit range if the character class does not
	# match a plain ASCII letter in this awk/locale
	if ("x" !~ letter) letter = "[A-Za-z\x80-\xFF]";
}

#
# Return 1 if word is a fragment: it ends in a letter followed by "-",
# or starts with "-" followed by a letter.
#
function is_fragment(word) {
	return word ~ (letter "-$") || word ~ ("^-" letter);
}

#
# Return 100 * part / whole, or 0 when whole is zero/unset, so that empty
# or fragment-only input does not abort with a division-by-zero error.
#
function percent(part, whole) {
	return whole ? 100 * part / whole : 0;
}

#
# Read vocab
#
ARGIND == 1 {
	vocab[$1] = 1;
}

#
# Read counts
#
ARGIND > 1 {
	# sentence boundary and pause tokens are not counted at all
	if ($1 == "<s>" || $1 == "</s>" || $1 == "-pau-") {
		next;
	}

	# use "in" rather than !vocab[$1]: a plain subscript reference would
	# insert an empty element into vocab for every unseen word
	is_oov = !($1 in vocab);
	is_frag = is_fragment($1);

	total_count += $2;
	total_types ++;

	if (is_oov) {
		oov_count += $2;
		oov_types ++;

		# optionally dump the current count line to the matching file
		if (!is_frag) {
			if (write_oov_words) {
				print > write_oov_words;
			}
		} else {
			if (write_oov_frags) {
				print > write_oov_frags;
			}
		}
	}

	if (!is_frag) {
		total_nofrag_count += $2;
		total_nofrag_types ++;

		if (is_oov) {
			oov_nofrag_count += $2;
			oov_nofrag_types ++;
		}
	}
}

END {
	printf "OOV tokens: %d / %d (%.2f%%) ", \
			oov_count, total_count, \
			percent(oov_count, total_count);
	printf "excluding fragments: %d / %d (%.2f%%)\n", \
			oov_nofrag_count, total_nofrag_count, \
			percent(oov_nofrag_count, total_nofrag_count);
	printf "OOV types: %d / %d (%.2f%%) ", \
			oov_types, total_types, \
			percent(oov_types, total_types);
	printf "excluding fragments: %d / %d (%.2f%%)\n", \
			oov_nofrag_types, total_nofrag_types, \
			percent(oov_nofrag_types, total_nofrag_types);
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -