⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 replace-words-with-classes.gawk

📁 这是一款很好用的工具包
💻 GAWK
字号:
#!/usr/local/bin/gawk -f## replace-with-words-classes --#	replace class expansions with class names## usage: replace-with-words-classes classes=<classfile> text > text-with-classes#        replace-with-words-classes classes=<classfile> have_counts=1 counts \#							> counts-with-classes## optional arguments:#	outfile=<file>	output file for class expansion counts (default: none)#	normalize=<0|1>	normalize counts to probabilities (default = 1)#	addone=<count>	value to add to counts for probability smoothing (1)## $Header: /home/srilm/devel/utils/src/RCS/replace-words-with-classes.gawk,v 1.7 2004/11/02 02:00:35 stolcke Exp $#function read_classes(file) {	    num_class_defs = 0;    delete num_class_expansions;    delete class_expansions;    delete class_expansion_probs;    while ((getline line < file) > 0) {	n = split(line, a);	if (n == 0) continue;	class = a[1];	num_exp = ++ num_class_expansions[class];	if (a[2] ~ /^[-+0-9.][-+0-9e.]*$/) {		prob = a[2];		i = 3;	} else {		prob = "";		i = 2;	}		expansion = a[i];	for (i++; i <= n; i++) {	    expansion = expansion " " a[i];	}	class_expansions[class " " num_exp] = expansion;	if (prob != "") {	    class_expansion_probs[class " " num_exp] = prob;	}	num_class_defs ++;    }    print "read " num_class_defs " class expansions" >> "/dev/stderr";    # assign default expansion probs    for (class in num_class_expansions) {	num_exp =  num_class_expansions[class];	for (i = 1; i <= num_exp; i ++) {	    if (class_expansion_probs[class " " i] == "") {		class_expansion_probs[class " " i] = 1/num_exp;	    }	}	    }}##############################################################################function add_to_prefix_tree(class, expansion, prob) {    nwords = split(expansion, w);    node = 0;    for (k = 1; k <= nwords; k ++) {	next_node = tree[node " " w[k]];	if (!next_node) {	    next_node = ++num_nodes;	    tree[node " " w[k]] = next_node;	}	node = next_node;    }    if (!(node in node_class)) {	node_class[node] = class;	node_prob[node] = prob;    }    return node;}BEGIN {    normalize = 1;    addone = 1;    partial = 0;}NR == 1 {    if (classes) {	read_classes(classes);	close(classes);    } else {	print "no classes file specified" >> "/dev/stderr";    }    for (class in num_class_expansions) {	for (i = 1; i <= num_class_expansions[class]; i ++) {	    class_expansion_node[class " " i] = \		add_to_prefix_tree(class, class_expansions[class " " i], \				    class_expansion_probs[class " " i]);	}    }}	{    output = "";    next_pos = 1;    # partial option: multiple spaces block multiword replacement    if (partial) {	gsub("[ 	][ 	]*[ 	]", " | ");    }    #    # handle ngram counts by simply leaving the count value alone    # and doing substitution on the ngram itself.    #    if (have_counts) {	max_pos = NF - 1;    } else {	max_pos = NF;    }    while (next_pos <= max_pos) {	class = "";	prob = 0;	num_exp_words = 0;	# search for largest class expansion starting at current position	node = 0;	k = 0;	while (1) {	    node = tree[node " " $(next_pos + k)];	    if (node) {		if (node in node_class) {		    # we have found a complete expansion, record its class		    class = node_class[node];		    class_node = node;		    prob = node_prob[prob];		    num_exp_words = k + 1;		}	    } else {		break;	    }	    k ++;	}	if (next_pos == 1) {	    space = "";	} else {	    space = " ";	}	if (!class) {	    output = output space $next_pos;	    next_pos ++;	} else {	    output = output space class;	    next_pos += num_exp_words;	    node_count[class_node] ++;	    class_count[class] ++;	}    }    # partial option: multiple spaces block multiword replacement    if (partial) {	gsub(" [|] ", " ", output);	sub("^[|]", " ", output);	sub("[|]$", " ", output);    }    if (have_counts) {	print output, $NF;    } else {	print output;    }}function estimate(count, total, N) {    denom = total + N *addone;    if (denom == 0) {	return 0;    } else {	return (count + addone)/denom;    }}END {    if (outfile) {	for (class in num_class_expansions) {	    for (i = 1; i <= num_class_expansions[class]; i ++) {		nc = node_count[class_expansion_node[class " " i]] + 0;		print class, \		      normalize ? \				 estimate(nc, class_count[class], \					num_class_expansions[class]) :				 nc, \		      class_expansions[class " " i] > outfile;	    }	}	close(outfile);    }}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -