📄 change-lm-vocab
字号:
#!/bin/sh
#
# change-lm-vocab --
#	Create a language model from an existing one by changing its
#	vocabulary.
#	All n-grams in the new vocab are retained with their original
#	probabilities.  Backoff weights are recomputed and backed-off
#	unigrams for all new words are added.
#	-subset option performs subsetting of the vocabulary without adding
#	new words.
#	-tolower makes the vocabulary filter compare lowercased words; the
#	flag is also forwarded to ngram.
#	Any other argument is forwarded to ngram unchanged.
#
# usage: change-lm-vocab [-subset] -vocab vocabfile -lm oldlm -write-lm newlm
#
# Defaults: oldlm and newlm are "-" (passed as-is to gunzip and ngram),
# vocab is /dev/null.
#
# $Header: /home/srilm/devel/utils/src/RCS/change-lm-vocab,v 1.7 2003/04/12 20:11:49 stolcke Exp $
#

oldlm=-
newlm=-
vocab=/dev/null

# Initialise explicitly so that same-named variables inherited from the
# caller's environment cannot silently change the behaviour.
options=
tolower=
subset=

while [ $# -gt 0 ]; do
    # Options that take a value must not be the last argument; otherwise
    # the extra shift below would run past the end of the argument list.
    case "$1" in
    -vocab|-lm|-write-lm)
	if [ $# -lt 2 ]; then
	    printf '%s: option %s requires an argument\n' "$0" "$1" >&2
	    exit 2
	fi
	;;
    esac

    case "$1" in
    -vocab)	vocab="$2" ; shift ;;
    -lm)	oldlm="$2" ; shift ;;
    -write-lm)	newlm="$2" ; shift ;;
    -tolower)	options="$options $1" ; tolower=1 ;;
    -subset)	subset=yes ;;
    *)		options="$options $1" ;;
    esac
    shift
done

# -subset prevents new words being added to the LM
if [ "$subset" ]; then
    ngram_vocab="/dev/null"
else
    ngram_vocab="$vocab"
fi

# Stage 1: gunzip -cf writes the old LM to stdout.
# Stage 2: gawk drops every n-gram that contains a word outside the vocab.
# Stage 3: ngram reads the filtered LM from stdin (-lm -), renormalises it
#          and writes the new LM.
#
# vocab and to_lower are handed to gawk as assignment operands, so they are
# set before the first input record is read.
#
# $options is intentionally left unquoted: it is a space-separated list of
# pass-through options that relies on word splitting (POSIX sh has no arrays).
# shellcheck disable=SC2086
gunzip -cf "$oldlm" | gawk '
# read the vocab file (once, when the first input line arrives)
NR == 1 && vocab {
	# always include sentence begin/end
	is_word["<s>"] = is_word["</s>"] = 1;

	# each line of the vocab file is taken as one word
	while ((getline word < vocab) > 0) {
		is_word[to_lower ? tolower(word) : word] = 1;
	}

	close(vocab);
}
# process old lm
# blank lines are copied through
NF==0 {
	print; next;
}
# "ngram N=count" header lines are copied through unchanged
/^ngram *[0-9][0-9]*=/ {
	print; next;
}
# "\N-grams:" section header: remember the (single-digit) order N
/^\\[0-9]-grams:/ {
	currorder=substr($0,2,1);
	print; next;
}
# any other backslash-initial line is copied through
/^\\/ {
	print; next;
}
# inside an N-gram section: field 1 is skipped, fields 2..N+1 are checked
# against the vocab; the line is dropped if any of them is not in it
currorder {
	for (i = 2 ; i <= currorder + 1; i ++) {
		if (!((to_lower ? tolower($i) : $i) in is_word)) next;
	}
	print; next;
}
{ print }
' vocab="$vocab" to_lower="$tolower" | \
ngram -lm - -vocab "$ngram_vocab" -renorm -write-lm "$newlm" $options
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -