📄 make-ngram-pfsg.gawk
字号:
#!/usr/local/bin/gawk -f## make-ngram-pfsg --# Create a Decipher PFSG from an N-gram language model## usage: make-ngram-pfsg [debug=1] [check_bows=1] [maxorder=N] backoff-lm > pfsg## $Header: /home/srilm/devel/utils/src/RCS/make-ngram-pfsg.gawk,v 1.28 2004/11/01 22:25:42 stolcke Exp $############################################ Output format specific code#BEGIN { logscale = 2.30258509299404568402 * 10000.5; round = 0.5; start_tag = "<s>"; end_tag = "</s>"; null = "NULL"; version = 0; top_level_name = ""; if ("pid" in PROCINFO) { pid = PROCINFO["pid"]; } else { getline pid < "/dev/pid"; } tmpfile = "/tmp/pfsg." pid; # hack to remove tmpfile when killed print "" | "trap '/bin/rm -f " tmpfile "' 0 1 2 15 30; cat >/dev/null"; debug = 0; write_contexts = ""; read_contexts = "";}function rint(x) { if (x < 0) { return int(x - round); } else { return int(x + round); }}function scale_log(x) { return rint(x * logscale);}function output_for_node(name) { num_words = split(name, words); if (num_words == 0) { print "output_for_node: got empty name" >> "/dev/stderr"; exit(1); } else if (words[1] == bo_name) { return null; } else if (words[num_words] == end_tag || \ words[num_words] == start_tag) { return null; } else { return words[num_words]; }}function node_exists(name) { return (name in node_num);}function node_index(name) { i = node_num[name]; if (i == "") { i = num_nodes ++; node_num[name] = i; node_string[i] = output_for_node(name); if (debug) { print "node " i " = " name ", output = " node_string[i] \ >> "/dev/stderr"; } } return i;}function start_grammar(name) { num_trans = 0; num_nodes = 0; return;}function end_grammar(name) { if (!node_exists(start_tag)) { print start_tag " tag undefined in LM" >> "/dev/stderr"; exit(1); } else if (!node_exists(end_tag)) { print end_tag " tag undefined in LM" >> "/dev/stderr"; exit(1); } printf "%d pfsg nodes\n", num_nodes >> "/dev/stderr"; printf "%d pfsg transitions\n", num_trans >> "/dev/stderr"; # output version id if supplied if (version) { print "version " version "\n"; } # use optional top-level grammar name if given print "name " (top_level_name ? top_level_name : name); printf "nodes %s", num_nodes; for (i = 0; i < num_nodes; i ++) { printf " %s", node_string[i]; } printf "\n"; print "initial " node_index(start_tag); print "final " node_index(end_tag); print "transitions " num_trans; fflush(); if (close(tmpfile) < 0) { print "error closing tmp file" >> "/dev/stderr"; exit(1); } system("/bin/cat " tmpfile);}function add_trans(from, to, prob) {#print "add_trans " from " -> " to " " prob >> "/dev/stderr"; num_trans ++; print node_index(from), node_index(to), scale_log(prob) > tmpfile;}########################################### Generic code for parsing backoff file#BEGIN { maxorder = 0; grammar_name = "PFSG"; bo_name = "BO"; check_bows = 0; epsilon = 1e-5; # tolerance for lowprob detection}NR == 1 { start_grammar(grammar_name); if (read_contexts) { while ((getline context < read_contexts) > 0) { is_context[context] = 1; } close(read_contexts); }}NF == 0 { next;}/^ngram *[0-9][0-9]*=/ { num_grams = substr($2,index($2,"=")+1); if (num_grams > 0) { order = substr($2,1,index($2,"=")-1); # limit maximal N-gram order if desired if (maxorder > 0 && order > maxorder) { order = maxorder; } if (order == 1) { grammar_name = "UNIGRAM_PFSG"; } else if (order == 2) { grammar_name = "BIGRAM_PFSG"; } else if (order == 3) { grammar_name = "TRIGRAM_PFSG"; } else { grammar_name = "NGRAM_PFSG"; } } next;}/^\\[0-9]-grams:/ { currorder = substr($0,2,1); next;}/^\\/ { next;}## unigram parsing#currorder == 1 { first_word = last_word = ngram = $2; ngram_prefix = ngram_suffix = ""; # we need all unigram backoffs (except for </s>), # so fill in missing bow where needed if (NF == 2 && last_word != end_tag) { $3 = 0; }}## bigram parsing#currorder == 2 { ngram_prefix = first_word = $2; ngram_suffix = last_word = $3; ngram = $2 " " $3;}## trigram parsing#currorder == 3 { first_word = $2; last_word = $4; ngram_prefix = $2 " " $3; ngram_suffix = $3 " " $4; ngram = ngram_prefix " " last_word;}## higher-order N-gram parsing#currorder >= 4 && currorder <= order { first_word = $2; last_word = $(currorder + 1); ngram_infix = $3; for (i = 4; i <= currorder; i ++ ) { ngram_infix = ngram_infix " " $i; } ngram_prefix = first_word " " ngram_infix; ngram_suffix = ngram_infix " " last_word; ngram = ngram_prefix " " last_word;}# # shared code for N-grams of all orders#currorder <= order { prob = $1; bow = $(currorder + 2); # skip backoffs that exceed maximal order, # but always include unigram backoffs if (bow != "" && (currorder == 1 || currorder < order)) { # remember all LM contexts for creation of N-gram transitions bows[ngram] = bow; # insert backoff transitions if (read_contexts ? (ngram in is_context) : \ (currorder < order - 1)) \ { add_trans(bo_name " " ngram, bo_name " " ngram_suffix, bow); add_trans(ngram, bo_name " " ngram, 0); } else { add_trans(ngram, bo_name " " ngram_suffix, bow); } if (write_contexts) { print ngram_suffix > write_contexts; } } if (last_word == start_tag) { if (currorder > 1) { printf "warning: ignoring ngram into start tag %s -> %s\n", \ ngram_prefix, last_word >> "/dev/stderr"; } } else { # insert N-gram transition to maximal suffix of target context if (last_word == end_tag) { target = end_tag; } else if (ngram in bows || currorder == 1) { # the minimal context is unigram target = ngram; } else if (ngram_suffix in bows) { target = ngram_suffix; } else { target = ngram_suffix; for (i = 3; i <= currorder; i ++) { target = substr(target, length($i) + 2); if (target in bows) break; } } if (currorder == 1 || \ (read_contexts ? (ngram_prefix in is_context) : \ (currorder < order))) \ { add_trans(bo_name " " ngram_prefix, target, prob); } else { add_trans(ngram_prefix, target, prob); } if (check_bows) { if (currorder < order) { probs[ngram] = prob; } if (ngram_suffix in probs && \ probs[ngram_suffix] + bows[ngram_prefix] - prob > epsilon) { printf "warning: ngram loses to backoff %s -> %s\n", \ ngram_prefix, last_word >> "/dev/stderr"; } } }}END { end_grammar(grammar_name);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -