📄 compute-oov-rate.gawk
字号:
#!/usr/local/bin/gawk -f
#
# compute-oov-rate --
#	Compute OOV word rate from a vocabulary and a unigram count file
#
# usage: compute-oov-rate vocab countfile ...
#
# Assumes unigram counts do not have repeated words.
#
# Optional variables (set on the command line):
#	write_oov_words=FILE	write count lines of OOV non-fragment words to FILE
#	write_oov_frags=FILE	write count lines of OOV fragments to FILE
# NOTE: the vocabulary is recognized by ARGIND == 1, i.e. it must be the
# first command-line argument; put any var=value assignments after it
# (or use gawk -v), otherwise the vocab file is treated as a count file.
#
# $Header: /home/srilm/devel/utils/src/RCS/compute-oov-rate.gawk,v 1.8 2003/03/08 03:59:39 stolcke Exp $
#

BEGIN {
	# high bit characters also detect multibyte characters
	letter = "[[:alpha:]\x80-\xFF]";
	# fall back to an explicit range if the character class does not
	# match a plain ASCII letter in this awk/locale
	if ("x" !~ letter) letter = "[A-Za-z\x80-\xFF]";
}

#
# is_fragment(word): true if word is a word fragment, i.e. a letter
# followed by a trailing "-", or a leading "-" followed by a letter.
#
function is_fragment(word) {
	return word ~ (letter "-$") || word ~ ("^-" letter);
}

#
# percent(num, denom): 100 * num / denom, or 0 when denom is zero.
# Guards the final report against a fatal division by zero when the count
# files are empty or contain nothing but fragments / skipped tags.
#
function percent(num, denom) {
	return denom ? 100 * num / denom : 0;
}

#
# Read vocab
#
ARGIND == 1 {
	vocab[$1] = 1;
}

#
# Read counts ($1 = word, $2 = count)
#
ARGIND > 1 {
	# sentence boundary and pause tags are not counted at all
	if ($1 == "<s>" || $1 == "</s>" || $1 == "-pau-") {
		next;
	}

	# "in" tests membership without creating an empty vocab[] entry for
	# every OOV word (which a plain vocab[$1] reference would do)
	is_oov = !($1 in vocab);
	is_frag = is_fragment($1);

	total_count += $2;
	total_types ++;

	if (is_oov) {
		oov_count += $2;
		oov_types ++;

		# optionally log the whole count line of each OOV item
		if (!is_frag) {
			if (write_oov_words) {
				print > write_oov_words;
			}
		} else {
			if (write_oov_frags) {
				print > write_oov_frags;
			}
		}
	}

	# same statistics, restricted to non-fragment words
	if (!is_frag) {
		total_nofrag_count += $2;
		total_nofrag_types ++;

		if (is_oov) {
			oov_nofrag_count += $2;
			oov_nofrag_types ++;
		}
	}
}

END {
	printf "OOV tokens: %d / %d (%.2f%%) ", \
		oov_count, total_count, percent(oov_count, total_count);
	printf "excluding fragments: %d / %d (%.2f%%)\n", \
		oov_nofrag_count, total_nofrag_count, \
		percent(oov_nofrag_count, total_nofrag_count);
	printf "OOV types: %d / %d (%.2f%%) ", \
		oov_types, total_types, percent(oov_types, total_types);
	printf "excluding fragments: %d / %d (%.2f%%)\n", \
		oov_nofrag_types, total_nofrag_types, \
		percent(oov_nofrag_types, total_nofrag_types);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -