📄 akerblad.java
字号:
package net.sourceforge.akerblad;import info.jonclark.io.FileLineArray;import info.jonclark.log.LogUtils;import info.jonclark.util.ArrayUtils;import java.io.File;import java.io.FileNotFoundException;import java.io.IOException;import java.io.PrintWriter;import java.util.ArrayList;import java.util.HashMap;import java.util.HashSet;import java.util.logging.Logger;/* * Akerblad Sentence Aligner<br> * A Java port of Champollion<br> * Ported by Jonathan Clark * <p> * Usage: champollion [-hfnscda] <L1 axis> <L2 axis> <alignment output file> */public class Akerblad { public static final int WIN_PER_100 = 8; public static final int MIN_WIN_SIZE = 10; public static final int MAX_WIN_SIZE = 100; private final Config config; private static final Logger log = LogUtils.getLogger(); class IntPair { public final int x; public final int y; public IntPair(int x, int y) { this.x = x; this.y = y; } } protected static class Config { public HashSet<String> xStop; public HashMap<String, ArrayList<String>> dict; public boolean fast; public boolean emulate; public boolean disallow3; public boolean alignall; public double xtoyc; } protected static class DocPair { // x axis public ArrayList<String> xst = new ArrayList<String>(); public ArrayList<Integer> lenx = new ArrayList<Integer>(); public HashMap<String, Integer> xTokenStat = new HashMap<String, Integer>(); public HashMap<String, HashSet<Integer>> xTkn2snt = new HashMap<String, HashSet<Integer>>(); // y axis public ArrayList<String> yst = new ArrayList<String>(); public ArrayList<Integer> leny = new ArrayList<Integer>(); public HashMap<String, Integer> yTokenStat = new HashMap<String, Integer>(); public HashMap<String, HashSet<Integer>> yTkn2snt = new HashMap<String, HashSet<Integer>>(); int nx; int ny; double xyRatio; int windowSize; // final alignments ArrayList<String> rAlign = new ArrayList<String>(); } private Akerblad(Config config) { this.config = config; } private static String opts(String[] args, String name) { int idx = ArrayUtils.findInUnsortedArray(args, name); if (idx == -1 || idx == args.length) usage(); return args[idx + 1]; } public static void main(String[] args) throws Exception { /* * parse command line */ // getopts('hnac:s:d:', \%opts) || usage(); if (ArrayUtils.unsortedArrayContains(args, "-h")) usage(); Config config = new Config(); config.fast = ArrayUtils.unsortedArrayContains(args, "-f"); config.emulate = ArrayUtils.unsortedArrayContains(args, "-e"); config.disallow3 = ArrayUtils.unsortedArrayContains(args, "-n"); config.alignall = ArrayUtils.unsortedArrayContains(args, "-a"); config.xtoyc = Double.parseDouble(opts(args, "-c")); String dictFilename = opts(args, "-d"); String xStopList = opts(args, "-s"); // TODO: Parse these last 3 filenames a bit better // TODO: Write a perl-style getOpts command for Java if (args.length < 3) usage(); String xfn = args[args.length - 3]; String yfn = args[args.length - 2]; String alignedFilename = args[args.length - 1]; /* * load components */ config.xStop = XStopLoader.loadXstop(xStopList); config.dict = DictionaryLoader.loadDict(dictFilename, config.xStop); Akerblad champollion = new Akerblad(config); if(ArrayUtils.unsortedArrayContains(args, "-l")) { FileLineArray xFiles = new FileLineArray(new File(xfn), FileLineArray.Mode.READ); FileLineArray yFiles = new FileLineArray(new File(yfn), FileLineArray.Mode.READ); FileLineArray aFiles = new FileLineArray(new File(alignedFilename), FileLineArray.Mode.READ); if(xFiles.getLineCount() != yFiles.getLineCount() || xFiles.getLineCount() != aFiles.getLineCount()) { System.err.println("Input file lists do not contain the same number of lines."); System.exit(1); } for(int i=0; i<xFiles.getLineCount(); i++) { champollion.doAlignment(xFiles.getLine(i), yFiles.getLine(i), aFiles.getLine(i)); } } else { champollion.doAlignment(xfn, yfn, alignedFilename); } System.exit(0); } public void doAlignment(String xfn, String yfn, String alignedFilename) throws IOException { DocPair dp = new DocPair(); AxisLoader.loadAxis(config, xfn, dp.xst, dp.lenx, dp.xTokenStat, dp.xTkn2snt, config.xStop); AxisLoader.loadAxis(config, yfn, dp.yst, dp.leny, dp.yTokenStat, dp.yTkn2snt, config.xStop); dp.nx = dp.xst.size(); dp.ny = dp.yst.size(); dp.xyRatio = (double) dp.nx / (double) dp.ny; int w1_size = (int) (dp.xyRatio * dp.nx * WIN_PER_100 / 100); int w2_size = (int) (Math.abs(dp.nx - dp.ny) * 3 / 4); dp.windowSize = Math.min(Math.max(MIN_WIN_SIZE, Math.max(w1_size, w2_size)), MAX_WIN_SIZE); System.err.println("Window size: " + dp.windowSize); // # If necessary, tie memory-intensive variables to /* // # to reduce memory usage // #if (nx > 4000) { usedbfile = 1 }; // #if (usedbfile) { // # pathxf = "/tmp/pathxf."; // # pathyf = "/tmp/pathyf."; // # scoref = "/tmp/score."; // # unlink pathxf if -e pathxf; // # unlink pathyf if -e pathyf; // # unlink scoref if -e scoref; // # tie %path_x, "DB_File", pathxf || die "0: Cannot open dbmfile // pathxf!\n"; // # tie %path_y, "DB_File", pathyf || die "0: Cannot open dbmfile // pathyf!\n"; // # tie %score, "DB_File", scoref || die "0: Cannot open dbmfile // scoref!\n"; // #} System.err.print("Aligning Sentences ... "); // #if (notokenization) { // # findTransPairsCn(); // #} else { // # findTransPairs(); // #} AkerbladKernel kernel = new AkerbladKernel(config, dp); kernel.align(dp.lenx, dp.leny, dp.nx, dp.ny); System.err.println("done."); // If all sentences are translated if (config.alignall) { throw new Error("Unsupported."); // mergeOmission(); } printAlignment(dp, alignedFilename); /* * Clean up */ // if (usedbfile) { // untie %path_x; // untie %path_y; // untie %score; // unlink pathxf; // unlink pathyf; // unlink scoref; // } } // private void mergeOmission() { // my xalign_tkn, yalign_tkn, xyRatio; // my %x2ymap, %y2xmap; // my @align_org, @align; // // @align_org = reverse @rAlign; // // i = 0; // x2ymap{0} = [0]; // y2xmap{0} = [0]; // xfnp1 = xfn+1; // yfnp1 = yfn+1; // x2ymap{xfnp1} = [yfnp1]; // y2xmap{yfnp1} = [xfnp1]; // foreach (@align_org) { // index{_} = i; i++; // next if /omitted/; // /(.+) <=> (.+)/; // xsent = 1; ysent = 2; // @xsent = split /,/, xsent; // @ysent = split /,/, ysent; // foreach (@xsent) { // xalign_tkn += lenx[_-1]; // x2ymap{_} = [@ysent]; // } // foreach (@ysent) { // yalign_tkn += leny[_-1]; // y2xmap{_} = [@xsent]; // } // } // // xyRatio = xalign_tkn/yalign_tkn; // // for (i = 0; i<@align_org; i++) { // next unless align_org[i] =~ /omitted/; // // if (align_org[i] =~ /omitted <=> (\d+)/) { // ysid = 1; // lb = lowerbound(ysid, \%y2xmap); // ub = upperbound(ysid, \%y2xmap); // // System.err.println("UB: ub LB: lb\n"); // next unless defined ub && defined lb; // if (ub-lb == 2) { // xsid = lb+1; // align_org[i] = "xsid <=> ysid"; // align_org[index{"xsid <=> omitted"}] = ""; // } elsif (ub-lb == 1) { // my pxtkn, pytkn, nxtkn, nytkn; // // # counting tokens of previous alignment // align_org[i-1] =~ /(.+) <=> (.+)/; // xsent = 1; ysent = 2; // @xsent = split /,/, xsent; // @ysent = split /,/, ysent; // foreach (@xsent) { // pxtkn += lenx[_-1]; // } // foreach (@ysent) { // pytkn += leny[_-1]; // } // // # counting tokens of next alignment // align_org[i+1] =~ /(.+) <=> (.+)/; // xsent = 1; ysent = 2; // @xsent = split /,/, xsent; // @ysent = split /,/, ysent; // foreach (@xsent) { // nxtkn += lenx[_-1]; // } // foreach (@ysent) { // nytkn += leny[_-1]; // }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -