📄 make-big-lm
字号:
#!/bin/sh
#
# make-big-lm --
#	Create a large ngram language model
#
# This script automates various techniques for building large ngram models.
# It is useful for building LMs that would exceed available real memory
# if built in one pass by ngram-count.
# The techniques employed are
#	- Assume counts are already produced
#	  (typically using make-batch-counts/merge-batch-counts)
#	- Compute Good Turing discounts without loading all counts
#	  into memory.
#	- ngram-counts loads only those counts exceeding cutoff values.
#
# Options handled here (anything else is passed through to ngram-count):
#	-name PREFIX	prefix of all intermediate files (default: biglm)
#	-order N	ngram order (default: 3)
#	-gtNmin C	minimum count for order N = 1..9; also forwarded
#			to ngram-count
#	-gtNmax C	passed as max= to make-gt-discounts for order N
#	-kndiscountN	use KN discounting for order N = 1..9
#	-kndiscount	use KN discounting for all orders
#	-read FILE	count file; may be repeated.  stdin is rejected
#			(the count files are read in several passes)
#	-trust-totals	forward -trust-totals instead of -meta-tag
#			(ignored when KN discounting is in use)
#	-max-per-file N	passed to make-kn-counts as max_per_file=
#
# The count files, the pass-through options and the discount flags are
# kept as plain space-separated lists, so file names containing
# whitespace are not supported.
#
# $Header: /home/srilm/devel/utils/src/RCS/make-big-lm,v 1.15 2005/01/05 23:46:10 stolcke Exp $
#

# Abort with a usage error unless the option in $1 is followed by a value.
# Call as: require_value "$@"
require_value () {
    if [ $# -lt 2 ]; then
        echo "$0: option $1 requires an argument" >&2
        exit 2
    fi
}

# Write to stdout the count stream consumed by get-gt-counts and
# ngram-count.  Reads the globals using_kn, kncounts, counts and order
# at call time.
count_stream () {
    if [ "$using_kn" ]; then
        # concatenate KN modified counts with highest-order original counts
        gunzip -c "$kncounts" | gawk -v order="$order" 'NF < 1 + order'
        # $counts is deliberately unquoted: it is a list of files
        gunzip -cf $counts | gawk -v order="$order" 'NF == 1 + order'
    else
        gunzip -cf $counts
    fi
}

#
# defaults
#
name=biglm
order=3

gt1min=1 gt2min=1 gt3min=2 gt4min=2 gt5min=2
gt6min=2 gt7min=2 gt8min=2 gt9min=2

gt1max=7 gt2max=7 gt3max=7 gt4max=7 gt5max=7
gt6max=7 gt7max=7 gt8max=7 gt9max=7

kndiscount1=0 kndiscount2=0 kndiscount3=0 kndiscount4=0 kndiscount5=0
kndiscount6=0 kndiscount7=0 kndiscount8=0 kndiscount9=0

using_kn=
max_per_file=10000000
trust_totals=0
metatag=__meta__	# lowercase so it works with ngram-count -tolower

# Start from empty lists so nothing is inherited from the environment.
options=
counts=

#
# parse the command line
#
while [ $# -gt 0 ]; do
    case "$1" in
    -name)
        require_value "$@"
        name=$2
        shift ;;
    -order)
        require_value "$@"
        order=$2
        shift ;;
    -gt[1-9]min)
        require_value "$@"
        # The case pattern pins $1 to -gt<digit>min, so the variable
        # name handed to eval is one of gt1min..gt9min.
        eval "${1#-}=\$2"
        options="$options $1 $2"
        shift ;;
    -gt[1-9]max)
        require_value "$@"
        # as above: assigns one of gt1max..gt9max
        eval "${1#-}=\$2"
        shift ;;
    -kndiscount[1-9])
        # as above: sets one of kndiscount1..kndiscount9
        eval "${1#-}=1"
        using_kn=1 ;;
    -kndiscount)
        kndiscount1=1 kndiscount2=1 kndiscount3=1
        kndiscount4=1 kndiscount5=1 kndiscount6=1
        kndiscount7=1 kndiscount8=1 kndiscount9=1
        using_kn=1 ;;
    -read)
        require_value "$@"
        if [ "$2" = - ] || [ "$2" = "/dev/stdin" ]; then
            echo "$0: cannot read from stdin" >&2
            exit 2
        fi
        counts="$counts $2"
        shift ;;
    -trust-totals)
        trust_totals=1 ;;
    -max-per-file)
        require_value "$@"
        max_per_file=$2
        shift ;;
    *)
        # unknown words are handed to ngram-count unchanged
        options="$options $1" ;;
    esac
    shift
done

# Without count files gunzip would silently fall back to reading stdin.
if [ -z "$counts" ]; then
    echo "$0: no count files specified (use -read)" >&2
    exit 2
fi

if [ "$trust_totals" -ne 0 ] && [ -z "$using_kn" ]; then
    options="$options -trust-totals"
else
    if [ "$trust_totals" -ne 0 ]; then
        echo "$0: -trust-totals incompatible with KN discounting; ignoring it" >&2
    fi
    options="$options -meta-tag $metatag"
fi

# From here on any failing step aborts the build.
set -e

#
# if KN smoothing is used, compute the modified lower-order counts
#
if [ "$using_kn" ]; then
    kncounts=$name.kncounts.gz
    if [ -f "$kncounts" ]; then
        echo "using existing $kncounts" >&2
    else
        mkdir -p "$name.kndir"
        gunzip -cf $counts | \
        (set -x; make-kn-counts \
            no_max_order=1 max_per_file="$max_per_file" \
            order="$order" \
            kndiscount1=$kndiscount1 kndiscount2=$kndiscount2 \
            kndiscount3=$kndiscount3 kndiscount4=$kndiscount4 \
            kndiscount5=$kndiscount5 kndiscount6=$kndiscount6 \
            kndiscount7=$kndiscount7 kndiscount8=$kndiscount8 \
            kndiscount9=$kndiscount9 \
            output="$name.kndir/kncounts")
        (set -x; merge-batch-counts "$name.kndir")

        # Exactly one count file must be left in kndir; anything else
        # means that merging didn't finish successfully.
        merged=$(find "$name.kndir" -name '*.ngrams.gz' -print)
        case "$merged" in
        "")
            echo "$0: no merged count file found in $name.kndir" >&2
            exit 1 ;;
        *"
"*)
            echo "$0: more than one count file left in $name.kndir" >&2
            exit 1 ;;
        esac
        mv "$merged" "$kncounts"
    fi
    options="$options -kn-counts-modified"
fi

#
# compute counts-of-counts
#
if [ -f "$name.gt${order}counts" ]; then
    echo "using existing gtcounts" >&2
else
    count_stream | \
    (set -x; get-gt-counts out="$name" max=20 maxorder="$order")
fi

#
# compute discount factors
#
gtflags=
for n in 1 2 3 4 5 6 7 8 9
do
    if [ "$n" -le "$order" ] && [ -f "$name.gt${n}counts" ]; then
        # fetch the per-order settings; n comes from the fixed list above
        eval "use_kn=\$kndiscount${n} min_count=\$gt${n}min max_count=\$gt${n}max"

        if [ "$use_kn" -eq 1 ]; then
            gtflags="$gtflags -kn${n} $name.kn${n}"
            make-kn-discounts \
                min="$min_count" "$name.gt${n}counts" > "$name.kn${n}"
        else
            gtflags="$gtflags -gt${n} $name.gt${n}"
            make-gt-discounts \
                min="$min_count" max="$max_count" \
                "$name.gt${n}counts" > "$name.gt${n}"
        fi
    fi
done

#
# filter counts and build lm
#
# $gtflags and $options are deliberately unquoted: they are word lists.
count_stream | \
(set -x; \
ngram-count -read - -read-with-mincounts -order "$order" \
    $gtflags \
    $options)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -