change-lm-vocab

来自「这是一款很好用的工具包」· 代码 · 共 79 行

TXT
79
字号
#!/bin/sh
#
# change-lm-vocab --
#	create a language model from an existing one by changing its
#	vocabulary.
#	All n-grams in the new vocab are retained with their original
#	probabilities.  Backoff weights are recomputed and backed-off
#	unigrams for all new words are added.
#	-subset option performs subsetting of the vocabulary without adding
#	new words.
#
# usage: change-lm-vocab [-subset] -vocab vocabfile -lm oldlm -write-lm newlm
#
# Options:
#	-vocab file	vocabulary file, read one entry per line
#			(default: /dev/null)
#	-lm file	input LM, piped through "gunzip -cf" (default: -)
#	-write-lm file	output LM, handed to ngram -write-lm (default: -)
#	-tolower	compare words case-insensitively; also forwarded to ngram
#	-subset		do not let ngram add new words to the LM
#	anything else	forwarded verbatim to ngram
#
# $Header: /home/srilm/devel/utils/src/RCS/change-lm-vocab,v 1.7 2003/04/12 20:11:49 stolcke Exp $
#

# Print the usage line on stderr and exit with status 2.
usage() {
	echo "usage: change-lm-vocab [-subset] -vocab vocabfile -lm oldlm -write-lm newlm" >&2
	exit 2
}

oldlm=-
newlm=-
vocab=/dev/null

# Start from known-empty values so that same-named variables inherited from
# the environment cannot alter the behavior of this script.
options=
subset=
tolower=

while [ $# -gt 0 ]; do
	case "$1" in
	-vocab)
		[ $# -ge 2 ] || usage
		vocab="$2"; shift ;;
	-lm)
		[ $# -ge 2 ] || usage
		oldlm="$2"; shift ;;
	-write-lm)
		[ $# -ge 2 ] || usage
		newlm="$2"; shift ;;
	-tolower)
		# needed both by the awk filter below and by ngram
		options="$options $1"; tolower=1 ;;
	-subset)
		subset=yes ;;
	*)
		options="$options $1" ;;
	esac
	shift
done

# -subset prevents new words being added to the LM
if [ -n "$subset" ]; then
	ngram_vocab="/dev/null"
else
	ngram_vocab="$vocab"
fi

# Stage 1: decompress the old LM (-f lets gunzip pass plain text through).
# Stage 2: drop every n-gram containing a word outside the vocabulary.
# Stage 3: let ngram renormalize and write the result.
# $options is intentionally unquoted: it is a space-separated list of
# extra ngram arguments accumulated above.
gunzip -cf "$oldlm" |
gawk -v vocab="$vocab" -v to_lower="$tolower" '
# read the vocab file once, before any LM input is processed
BEGIN {
	if (vocab) {
		# always include sentence begin/end
		is_word["<s>"] = is_word["</s>"] = 1;

		while ((getline word < vocab) > 0) {
			is_word[to_lower ? tolower(word) : word] = 1;
		}
		close(vocab);
	}
}

# process old lm

# blank lines are copied through
NF == 0 {
	print; next;
}

# "ngram N=count" header lines are copied through unchanged
/^ngram *[0-9][0-9]*=/ {
	print; next;
}

# "\N-grams:" section header: remember the order N of the n-grams that follow
/^\\[0-9]+-grams:/ {
	currorder = substr($0, 2, index($0, "-") - 2) + 0;
	print; next;
}

# any other backslash keyword line is copied through
/^\\/ {
	print; next;
}

# inside an N-gram section: fields 2 .. N+1 are checked against the
# vocabulary, and the line is dropped as soon as one of them is missing
currorder {
	for (i = 2; i <= currorder + 1; i++) {
		if (!((to_lower ? tolower($i) : $i) in is_word)) next;
	}
	print; next;
}

# everything seen before the first section header
{ print }
' |
ngram -lm - -vocab "$ngram_vocab" -renorm -write-lm "$newlm" $options

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?