📄 make-hiddens-lm.gawk
字号:
#!/usr/local/bin/gawk -f
#
# make-hiddens-lm --
#	Create a hidden-sentence-boundary ngram LM from a standard one
#
# This script edits an ARPA backoff model file as follows:
#
# 1 - ngrams involving <s> and </s> are duplicated using the
#     hidden segment boundary token <#s>.
# 2 - ngrams starting with <s> are eliminated.
# 3 - the backoff weight of <s> is set to 1.
#     this together with the previous change sets all probabilities conditioned
#     on <s> to the respective marginal probabilities without <s>.
# 4 - ngrams ending in </s> get probability 1.
#     this avoids an end-of-sentence penalty in rescoring.
#
# Control variables read by the rules below.  None of them is enabled by
# this script itself, so edits 2-4 only happen when the caller sets them:
#
#   no_s_start         if true, perform edits 2 and 3.
#   no_s_end           if true, perform edit 4.
#   remove_old_ngrams  if true, do not output the original <s>/</s> ngrams,
#                      only their <#s> copies.  BEGIN resets it to 0, so it
#                      has to be given as a var=value operand (processed
#                      after BEGIN) rather than with -v.
#   highorder          see the NOTE(review) on the pass-through rule below.
#
# Probabilities and backoff weights are written as 0 where the comments say
# "1": the values in the file are treated as logarithms.
#
# $Header: /home/srilm/devel/utils/src/RCS/make-hiddens-lm.gawk,v 1.7 2004/11/02 02:00:35 stolcke Exp $
#
BEGIN {
	sent_start = "<s>";
	sent_end = "</s>";
	hiddens = "<#s>";

	remove_old_ngrams = 0;
}

# blank lines are copied unchanged
NF == 0 {
	print;
	next;
}

# "ngram N=count" header lines are copied unchanged
# (the counts are NOT adjusted for the ngrams added or removed below)
/^ngram *[0-9][0-9]*=/ {
	print;
	next;
}

# "\N-grams:" section header: remember the current ngram order.
# Only a single digit is extracted; the line itself is printed by the
# next rule, since it also starts with a backslash.
/^.[0-9]-grams:/ {
	currorder = substr($0, 2, 1);
}

# all other backslash keywords (\data\, \end\, section headers)
/^\\/ {
	print;
	next;
}

#
# Pass through ngrams of orders below highorder, appending an explicit
# backoff weight of 0 when the line has none.
# NOTE(review): highorder is never assigned in this script, and this rule
# appears to be inert while it is unset.  If the caller does set it, ngrams
# of lower orders bypass the <s>/</s> handling below entirely -- confirm
# that this is intended.
#
currorder && currorder < highorder {
	if (NF < currorder + 2) {
		print $0 "\t0";
	} else {
		print;
	}
	next;
}

# Lines mentioning <s> or </s>.  Note that the boundary tokens are used as
# regular expressions here and in the gsub() calls, i.e., they match
# anywhere in the line, not only as whole words.
$0 ~ sent_start || $0 ~ sent_end {
	# keep the unmodified line; the <#s> copy is derived from it
	oldline = $0;

	# modify sentence initial/final ngrams
	if ($2 == sent_end && currorder == 1) {
		# remember the </s> unigram prob for the <#s> unigram
		sos_uniprob = $1;

		if (no_s_end) {
			# set </s> prob to 1
			$1 = 0;
		}

		if (!remove_old_ngrams) {
			print;
		}

		# no <#s> copy here: the <#s> unigram is generated from the
		# <s> unigram line instead, so it is output only once
		next;
	} else if ($2 == sent_start && currorder == 1) {
		if (no_s_start) {
			# set <s> backoff weight to 1
			$3 = 0;
		}

		if (!remove_old_ngrams) {
			print;
		}

		# use unigram prob from </s>
		# (requires the </s> unigram to precede the <s> unigram)
		if (sos_uniprob == "") {
			print "warning: could not find " sent_end " unigram" \
				>> "/dev/stderr";
		} else {
			oldline = sos_uniprob "\t" $2 "\t" $3;
		}
	} else if ($2 == sent_start) {
		# suppress other ngrams starting with <s>
		if (!no_s_start && !remove_old_ngrams) {
			print;
		}
	} else if ($(currorder + 1) == sent_end) {
		if (no_s_end) {
			# set </s> prob to 1
			$1 = 0;
		}

		if (!remove_old_ngrams) {
			print;
		}
	}

	# replace <s> and </s> with <#s> and output result
	gsub(sent_start, hiddens, oldline);
	gsub(sent_end, hiddens, oldline);
	print oldline;
	next;
}

# everything else is copied unchanged
{ print }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -