⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 make-big-lm

📁 这是一款很好用的工具包
💻
字号:
#!/bin/sh
#
# make-big-lm --
#	Create a large ngram language model
#
# This script automates various techniques for building large ngram models.
# It is useful for building LMs that would exceed available real memory
# if built in one pass by ngram-count.
# The techniques employed are
#	- Assume counts are already produced
#	  (typically using make-batch-counts/merge-batch-counts)
#	- Compute Good Turing discounts without loading all counts
#	  into memory.
#	- ngram-counts loads only those counts exceeding cutoff values.
#
# Options interpreted here (anything else is passed on to ngram-count):
#	-name PREFIX		prefix of all intermediate files (default: biglm)
#	-order N		ngram order (default: 3)
#	-gtNmin V		N = 1..9; used as min= for the discount scripts
#				and also forwarded to ngram-count
#	-gtNmax V		N = 1..9; used as max= for make-gt-discounts
#				(not forwarded to ngram-count)
#	-kndiscountN		N = 1..9; use KN discounting for order N
#	-kndiscount		use KN discounting for orders 1..9
#	-read FILE		count file (may be repeated; stdin is rejected)
#	-trust-totals		forward -trust-totals instead of -meta-tag
#				(ignored when KN discounting is used)
#	-max-per-file N		max_per_file= value given to make-kn-counts
#
# NOTE: $counts, $gtflags and $options are whitespace-separated word lists
# that are expanded unquoted on purpose, so file names and -name prefixes
# containing whitespace are not supported.
#
# $Header: /home/srilm/devel/utils/src/RCS/make-big-lm,v 1.15 2005/01/05 23:46:10 stolcke Exp $
#

name=biglm
order=3

gt1min=1 gt2min=1 gt3min=2
gt4min=2 gt5min=2 gt6min=2
gt7min=2 gt8min=2 gt9min=2

gt1max=7 gt2max=7 gt3max=7
gt4max=7 gt5max=7 gt6max=7
gt7max=7 gt8max=7 gt9max=7

kndiscount1=0 kndiscount2=0 kndiscount3=0
kndiscount4=0 kndiscount5=0 kndiscount6=0
kndiscount7=0 kndiscount8=0 kndiscount9=0

using_kn=
max_per_file=10000000
trust_totals=0
metatag=__meta__	# lowercase so it works with ngram-count -tolower

# Start from empty lists so same-named environment variables cannot leak
# into the count-file list or the ngram-count command line.
counts=
options=

# Exit with a usage error if an option that takes a value is the last word.
#   $1 - option name
#   $2 - number of command-line words left, including the option itself
require_value () {
    if [ "$2" -lt 2 ]; then
        echo "$0: option $1 requires an argument" >&2
        exit 2
    fi
}

# Write the counts used for estimation to stdout: with KN discounting the
# modified lower-order counts followed by the original highest-order counts,
# otherwise the original counts unchanged.
emit_counts () {
    if [ "$using_kn" ]; then
        # concatenate KN modified counts with highest-order original counts
        gunzip -c "$kncounts" | gawk -v order="$order" 'NF < 1 + order'
        gunzip -cf $counts | gawk -v order="$order" 'NF == 1 + order'
    else
        gunzip -cf $counts
    fi
}

while [ $# -gt 0 ]; do
    case "$1" in
    -name)
        require_value "$1" $#
        name=$2; shift ;;
    -order)
        require_value "$1" $#
        order=$2; shift ;;
    -gt[1-9]min)
        require_value "$1" $#
        # the pattern guarantees a single digit, so the eval is safe
        n=${1#-gt}; n=${n%min}
        eval "gt${n}min=\$2"
        options="$options $1 $2"
        shift ;;
    -gt[1-9]max)
        require_value "$1" $#
        n=${1#-gt}; n=${n%max}
        eval "gt${n}max=\$2"
        shift ;;
    -kndiscount[1-9])
        eval "kndiscount${1#-kndiscount}=1"
        using_kn=1 ;;
    -kndiscount)
        kndiscount1=1; kndiscount2=1; kndiscount3=1
        kndiscount4=1; kndiscount5=1; kndiscount6=1
        kndiscount7=1; kndiscount8=1; kndiscount9=1
        using_kn=1 ;;
    -read)
        require_value "$1" $#
        # the counts are read several times, so a stream cannot be used
        if [ "$2" = - ] || [ "$2" = "/dev/stdin" ]; then
            echo "$0: cannot read from stdin" >&2
            exit 2
        fi
        counts="$counts $2"; shift ;;
    -trust-totals)
        trust_totals=1 ;;
    -max-per-file)
        require_value "$1" $#
        max_per_file=$2; shift ;;
    *)
        options="$options $1" ;;
    esac
    shift
done

if [ "$trust_totals" -eq 0 ]; then
    options="$options -meta-tag $metatag"
elif [ "$using_kn" ]; then
    echo "$0: -trust-totals incompatible with KN discounting; ignoring it" >&2
    options="$options -meta-tag $metatag"
else
    options="$options -trust-totals"
fi

set -e

#
# if KN smoothing is used, compute the modified lower-order counts
#
if [ "$using_kn" ]; then
    kncounts=$name.kncounts.gz

    if [ -f "$kncounts" ]; then
        echo "using existing $kncounts" >&2
    else
        mkdir -p "$name.kndir"

        gunzip -cf $counts | \
        (set -x; make-kn-counts \
            no_max_order=1 max_per_file="$max_per_file" \
            order="$order" \
            kndiscount1="$kndiscount1" kndiscount2="$kndiscount2" \
            kndiscount3="$kndiscount3" kndiscount4="$kndiscount4" \
            kndiscount5="$kndiscount5" kndiscount6="$kndiscount6" \
            kndiscount7="$kndiscount7" kndiscount8="$kndiscount8" \
            kndiscount9="$kndiscount9" \
            output="$name.kndir/kncounts")

        (set -x; merge-batch-counts "$name.kndir")

        # this will fail if more than one count file is left in kndir,
        # i.e., if merging didn't finish successfully
        # (the command substitution is deliberately left unquoted)
        mv $(find "$name.kndir" -name '*.ngrams.gz' -print) "$kncounts"
    fi

    options="$options -kn-counts-modified"
fi

#
# compute counts-of-counts
#
if [ -f "$name.gt${order}counts" ]; then
    echo "using existing gtcounts" >&2
else
    emit_counts | \
    (set -x; get-gt-counts out="$name" max=20 maxorder="$order")
fi

#
# compute discount factors
#
gtflags=
for n in 1 2 3 4 5 6 7 8 9
do
    if [ "$n" -le "$order" ] && [ -f "$name.gt${n}counts" ]; then
        # fetch the per-order settings; n comes from the fixed list above
        eval "use_kn=\$kndiscount${n} min=\$gt${n}min max=\$gt${n}max"

        if [ "$use_kn" -eq 1 ]; then
            gtflags="$gtflags -kn${n} $name.kn${n}"
            make-kn-discounts \
                min="$min" "$name.gt${n}counts" > "$name.kn${n}"
        else
            gtflags="$gtflags -gt${n} $name.gt${n}"
            make-gt-discounts \
                min="$min" max="$max" \
                "$name.gt${n}counts" > "$name.gt${n}"
        fi
    fi
done

#
# filter counts and build lm
#
emit_counts | \
(set -x; \
ngram-count -read - -read-with-mincounts -order "$order" \
    $gtflags \
    $options)

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -