⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 akerblad.java

📁 A Java port of LDC's Champollion sentence aligner (http://champollion.sourceforge.net). Intended aud
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
package net.sourceforge.akerblad;import info.jonclark.io.FileLineArray;import info.jonclark.log.LogUtils;import info.jonclark.util.ArrayUtils;import java.io.File;import java.io.FileNotFoundException;import java.io.IOException;import java.io.PrintWriter;import java.util.ArrayList;import java.util.HashMap;import java.util.HashSet;import java.util.logging.Logger;/* * Akerblad Sentence Aligner<br> * A Java port of Champollion<br> * Ported by Jonathan Clark * <p> * Usage: champollion [-hfnscda] <L1 axis> <L2 axis> <alignment output file> */public class Akerblad {    public static final int WIN_PER_100 = 8;    public static final int MIN_WIN_SIZE = 10;    public static final int MAX_WIN_SIZE = 100;    private final Config config;    private static final Logger log = LogUtils.getLogger();    class IntPair {	public final int x;	public final int y;	public IntPair(int x, int y) {	    this.x = x;	    this.y = y;	}    }    protected static class Config {	public HashSet<String> xStop;	public HashMap<String, ArrayList<String>> dict;	public boolean fast;	public boolean emulate;	public boolean disallow3;	public boolean alignall;	public double xtoyc;    }    protected static class DocPair {	// x axis	public ArrayList<String> xst = new ArrayList<String>();	public ArrayList<Integer> lenx = new ArrayList<Integer>();	public HashMap<String, Integer> xTokenStat = new HashMap<String, Integer>();	public HashMap<String, HashSet<Integer>> xTkn2snt = new HashMap<String, HashSet<Integer>>();	// y axis	public ArrayList<String> yst = new ArrayList<String>();	public ArrayList<Integer> leny = new ArrayList<Integer>();	public HashMap<String, Integer> yTokenStat = new HashMap<String, Integer>();	public HashMap<String, HashSet<Integer>> yTkn2snt = new HashMap<String, HashSet<Integer>>();	int nx;	int ny;	double xyRatio;	int windowSize;	// final alignments	ArrayList<String> rAlign = new ArrayList<String>();    }    private Akerblad(Config config) {	this.config = config;    }    private static String 
opts(String[] args, String name) {	int idx = ArrayUtils.findInUnsortedArray(args, name);	if (idx == -1 || idx == args.length)	    usage();	return args[idx + 1];    }    public static void main(String[] args) throws Exception {	/*	 * parse command line	 */	// getopts('hnac:s:d:', \%opts) || usage();	if (ArrayUtils.unsortedArrayContains(args, "-h"))	    usage();	Config config = new Config();	config.fast = ArrayUtils.unsortedArrayContains(args, "-f");	config.emulate = ArrayUtils.unsortedArrayContains(args, "-e");	config.disallow3 = ArrayUtils.unsortedArrayContains(args, "-n");	config.alignall = ArrayUtils.unsortedArrayContains(args, "-a");	config.xtoyc = Double.parseDouble(opts(args, "-c"));	String dictFilename = opts(args, "-d");	String xStopList = opts(args, "-s");	// TODO: Parse these last 3 filenames a bit better	// TODO: Write a perl-style getOpts command for Java	if (args.length < 3)	    usage();	String xfn = args[args.length - 3];	String yfn = args[args.length - 2];	String alignedFilename = args[args.length - 1];	/*	 * load components	 */	config.xStop = XStopLoader.loadXstop(xStopList);	config.dict = DictionaryLoader.loadDict(dictFilename, config.xStop);	Akerblad champollion = new Akerblad(config);	if(ArrayUtils.unsortedArrayContains(args, "-l")) {	    FileLineArray xFiles = new FileLineArray(new File(xfn), FileLineArray.Mode.READ);	    FileLineArray yFiles = new FileLineArray(new File(yfn), FileLineArray.Mode.READ);	    FileLineArray aFiles = new FileLineArray(new File(alignedFilename), FileLineArray.Mode.READ);	    	    if(xFiles.getLineCount() != yFiles.getLineCount() || xFiles.getLineCount() != aFiles.getLineCount()) {		System.err.println("Input file lists do not contain the same number of lines.");		System.exit(1);	    }	    	    for(int i=0; i<xFiles.getLineCount(); i++) {		champollion.doAlignment(xFiles.getLine(i), yFiles.getLine(i), aFiles.getLine(i));	    }	} else {	    champollion.doAlignment(xfn, yfn, alignedFilename);	}	System.exit(0);    }    
public void doAlignment(String xfn, String yfn, String alignedFilename) throws IOException {	DocPair dp = new DocPair();	AxisLoader.loadAxis(config, xfn, dp.xst, dp.lenx, dp.xTokenStat, dp.xTkn2snt, config.xStop);	AxisLoader.loadAxis(config, yfn, dp.yst, dp.leny, dp.yTokenStat, dp.yTkn2snt, config.xStop);	dp.nx = dp.xst.size();	dp.ny = dp.yst.size();	dp.xyRatio = (double) dp.nx / (double) dp.ny;	int w1_size = (int) (dp.xyRatio * dp.nx * WIN_PER_100 / 100);	int w2_size = (int) (Math.abs(dp.nx - dp.ny) * 3 / 4);	dp.windowSize = Math.min(Math.max(MIN_WIN_SIZE, Math.max(w1_size, w2_size)), MAX_WIN_SIZE);	System.err.println("Window size: " + dp.windowSize);	// # If necessary, tie memory-intensive variables to /*	// # to reduce memory usage	// #if (nx > 4000) { usedbfile = 1 };	// #if (usedbfile) {	// # pathxf = "/tmp/pathxf.";	// # pathyf = "/tmp/pathyf.";	// # scoref = "/tmp/score.";	// # unlink pathxf if -e pathxf;	// # unlink pathyf if -e pathyf;	// # unlink scoref if -e scoref;	// # tie %path_x, "DB_File", pathxf || die "0: Cannot open dbmfile	// pathxf!\n";	// # tie %path_y, "DB_File", pathyf || die "0: Cannot open dbmfile	// pathyf!\n";	// # tie %score, "DB_File", scoref || die "0: Cannot open dbmfile	// scoref!\n";	// #}	System.err.print("Aligning Sentences ... 
");	// #if (notokenization) {	// # findTransPairsCn();	// #} else {	// # findTransPairs();	// #}	AkerbladKernel kernel = new AkerbladKernel(config, dp);	kernel.align(dp.lenx, dp.leny, dp.nx, dp.ny);	System.err.println("done.");	// If all sentences are translated	if (config.alignall) {	    throw new Error("Unsupported.");	    // mergeOmission();	}	printAlignment(dp, alignedFilename);	/*	 * Clean up	 */	// if (usedbfile) {	// untie %path_x;	// untie %path_y;	// untie %score;	// unlink pathxf;	// unlink pathyf;	// unlink scoref;	// }    }    // private void mergeOmission() {    // my xalign_tkn, yalign_tkn, xyRatio;    // my %x2ymap, %y2xmap;    // my @align_org, @align;    //    // @align_org = reverse @rAlign;    //    // i = 0;    // x2ymap{0} = [0];    // y2xmap{0} = [0];    // xfnp1 = xfn+1;    // yfnp1 = yfn+1;    // x2ymap{xfnp1} = [yfnp1];    // y2xmap{yfnp1} = [xfnp1];    // foreach (@align_org) {    // index{_} = i; i++;    // next if /omitted/;    // /(.+) <=> (.+)/;    // xsent = 1; ysent = 2;    // @xsent = split /,/, xsent;    // @ysent = split /,/, ysent;    // foreach (@xsent) {    // xalign_tkn += lenx[_-1];    // x2ymap{_} = [@ysent];    // }    // foreach (@ysent) {    // yalign_tkn += leny[_-1];    // y2xmap{_} = [@xsent];    // }    // }    //            // xyRatio = xalign_tkn/yalign_tkn;    //    // for (i = 0; i<@align_org; i++) {    // next unless align_org[i] =~ /omitted/;    //    // if (align_org[i] =~ /omitted <=> (\d+)/) {    // ysid = 1;    // lb = lowerbound(ysid, \%y2xmap);    // ub = upperbound(ysid, \%y2xmap);    // // System.err.println("UB: ub LB: lb\n");    // next unless defined ub && defined lb;    // if (ub-lb == 2) {    // xsid = lb+1;    // align_org[i] = "xsid <=> ysid";    // align_org[index{"xsid <=> omitted"}] = "";    // } elsif (ub-lb == 1) {    // my pxtkn, pytkn, nxtkn, nytkn;    //    		    // # counting tokens of previous alignment    // align_org[i-1] =~ /(.+) <=> (.+)/;    // xsent = 1; ysent = 2;    // @xsent = 
split /,/, xsent;    // @ysent = split /,/, ysent;    // foreach (@xsent) {    // pxtkn += lenx[_-1];    // }    // foreach (@ysent) {    // pytkn += leny[_-1];    // }    //    // # counting tokens of next alignment    // align_org[i+1] =~ /(.+) <=> (.+)/;    // xsent = 1; ysent = 2;    // @xsent = split /,/, xsent;    // @ysent = split /,/, ysent;    // foreach (@xsent) {    // nxtkn += lenx[_-1];    // }    // foreach (@ysent) {    // nytkn += leny[_-1];    // }

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -