📄 axisloader.java
字号:
package net.sourceforge.akerblad;import info.jonclark.util.HashUtils;import info.jonclark.util.StringUtils;import java.io.BufferedReader;import java.io.FileReader;import java.io.IOException;import java.util.ArrayList;import java.util.HashMap;import java.util.HashSet;import net.sourceforge.akerblad.Akerblad.Config;public class AxisLoader { public static void loadAxis(Config config, String axisFilename, ArrayList<String> stAref, ArrayList<Integer> lenAref, HashMap<String, Integer> tokenStatHref, HashMap<String, HashSet<Integer>> tkn2sntHref, HashSet<String> xStop) throws IOException { assert axisFilename != null; assert stAref != null; assert lenAref != null; assert tokenStatHref != null; assert tkn2sntHref != null; assert xStop != null; int stno = 0; // open A, "<$axis_fn" || die "$0: Can not open $axis_fn\n"; BufferedReader in = new BufferedReader(new FileReader(axisFilename)); System.err.print("Loading axis..."); String snt; while ((snt = in.readLine()) != null) { if(config.emulate) { // emulate champollion: don't fix all duplicate whitespace snt = StringUtils.replaceFast(snt, " ", " "); } else { // otherwise, actually normalize ALL the whitespace snt = StringUtils.whitespaceToSpace(snt, false); } stAref.add(snt); String[] tokens = StringUtils.tokenize(snt); // undef %new_st; // TODO: Filter stopwords at this stage? // determine sentence length in characters (without spaces) int nSpaces = StringUtils.countOccurances(snt, ' '); if(config.emulate) { // emulate champollion's counting of bytes instead of characters lenAref.add(snt.getBytes().length - nSpaces);// System.err.println("len: " + (snt.getBytes().length - nSpaces)); } else { lenAref.add(snt.length() - nSpaces); } for (final String token : tokens) { // s/\W/\\$&/g; // System.err.println(token); if (xStop.contains(token)) continue; HashUtils.increment(tokenStatHref, token); HashUtils.increment(tokenStatHref, "TTAALL"); HashSet<Integer> hs = tkn2sntHref.get(token); if(hs == null) hs = new HashSet<Integer>(); hs.add(stno); // $new_st{$_}++; } stno++; } // System.err.println(tokenStatHref.get("TTAALL")); in.close(); System.err.println("done."); System.err.println("Number of sentences: " + stno); } // public static void load_axis_cn() { // my ($axis_fn, $st_aref, $len_aref, $token_stat_href, $tkn2snt_href) = @_; // my $token, %new_st, %tkn2snt; // my $stno = 0; // // open A, "<$axis_fn" || die "$0: Can not open $axis_fn\n"; // // System.err.println("Loading axis..."); // while (<A>){ // chomp; // s/\s+/ /g; // $$len_aref[$stno] = length $_; // s/\s//g; // @_ = split //,$_; // #print STDERR join '', @_, "\n"; // foreach (@_) { // $$tkn2snt_href{$_}{$stno} = 1; // } // push @$st_aref, $_; // $stno++; // } // close A; // System.err.println("done."); // System.err.println("Number of sentences: $stno"); // }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -