📄 taggingdata.java
字号:
/* Copyright (C) 2006, Xuan-Hieu Phan Email: hieuxuan@ecei.tohoku.ac.jp pxhieu@gmail.com URL: http://www.hori.ecei.tohoku.ac.jp/~hieuxuan Graduate School of Information Sciences, Tohoku University*/package crf.tagger;import java.io.*;import java.util.*;public class TaggingData { List data = null; // each sentence on one line public void readData(String dataFile) { if (data != null) { data.clear(); } else { data = new ArrayList(); } // open data file BufferedReader fin = null; try { fin = new BufferedReader(new FileReader(dataFile)); System.out.println("Reading input data ..."); String line = null; // start to read sentences => sequences while ((line = fin.readLine()) != null) { line = PennTokenizer.tokenize(line); StringTokenizer strTok = new StringTokenizer(line, " \t\r\n"); if (strTok.countTokens() == 0) { // skip this blank line continue; } // create new data sequence List seq = new ArrayList(); while (strTok.hasMoreTokens()) { Observation obsr = new Observation(); obsr.originalData = strTok.nextToken(); seq.add(obsr); } data.add(seq); } System.out.println("Reading input data (" + Integer.toString(data.size()) + " sequences) completed!"); } catch (IOException e) { System.out.println("Couldn't open data file" + dataFile); return; } } // write output, each sentence on a line // <word1>/<postag1> <word2>/<postag2> ... public void writeData(Map lbInt2Str, String outputFile) { if (data == null) { return; } PrintWriter fout = null; try { fout = new PrintWriter(new FileWriter(outputFile)); // main loop for writing for (int i = 0; i < data.size(); i++) { List seq = (List)data.get(i); for (int j = 0; j < seq.size(); j++) { Observation obsr = (Observation)seq.get(j); fout.print(obsr.toString(lbInt2Str) + " "); } fout.println(); } fout.close(); } catch(IOException e) { System.out.println("Couldn't create file: " + outputFile); return; } } // context predicate generation for each position public void cpGen(Map cpStr2Int, List seq, int i) { List tempCps = new ArrayList(); int j; int seqLen = seq.size(); if (i < 0 || i > seqLen - 1) { return; } // single word for (j = -2; j <= 2; j++) { if (i + j >= 0 && i + j < seqLen) { // 1 = w: String cp = "1:"; cp += Integer.toString(j) + ":" + ((Observation)seq.get(i + j)).originalData; tempCps.add(cp.toLowerCase()); } } // prefixes for (j = 0; j <= 0; j++) { if (i + j >= 0 && i + j < seqLen) { String currentToken = ((Observation)seq.get(i + j)).originalData; int prefixLen = currentToken.length() - 2; if (prefixLen > 4) { prefixLen = 4; } for (int count = 1; count <= prefixLen; count++) { String prefix = currentToken.substring(0, count); // 2 = p: String cp = "2:"; cp += Integer.toString(j) + ":" + prefix; tempCps.add(cp); } } } // suffixes for (j = 0; j <= 0; j++) { if (i + j >= 0 && i + j < seqLen) { String currentToken = ((Observation)seq.get(i + j)).originalData; int suffixLen = currentToken.length() - 2; if (suffixLen > 4) { suffixLen = 4; } for (int count = 1; count <= suffixLen; count++) { String suffix = currentToken.substring(currentToken.length() - count, currentToken.length()); // 3 = s: String cp = "3:"; cp += Integer.toString(j) + ":" + suffix; tempCps.add(cp); } } } // two consecutive words for (j = -1; j <= 0; j++) { if (i + j >= 0 && i + j + 1 < seqLen) { // 4 = ww: String cp = "4:"; cp += Integer.toString(j) + ":" + Integer.toString(j + 1) + ":" + ((Observation)seq.get(i + j)).originalData + ":" + ((Observation)seq.get(i + j + 1)).originalData; tempCps.add(cp.toLowerCase()); } } for (j = 0; j <= 0; j++) { if (i + j >= 0 && i + j < seqLen) { int k; String currentToken = ((Observation)seq.get(i + j)).originalData; int tokenLen = currentToken.length(); boolean isAllCap = true; k = 0; while (k < tokenLen) { if (!(Character.isUpperCase(currentToken.charAt(k)))) { isAllCap = false; break; } k++; } if (isAllCap) { // 5 = i:allc String cp = "5:" + Integer.toString(j); tempCps.add(cp); if (currentToken.endsWith("S")) { // 6 = i:allcs cp = "6:" + Integer.toString(j); tempCps.add(cp); } } if (!isAllCap && Character.isUpperCase(currentToken.charAt(0))) { // 7 = i:fstc String cp = "7:" + Integer.toString(j); tempCps.add(cp); String preToken = null; if (i + j > 0) { preToken = ((Observation)seq.get(i + j - 1)).originalData; } if (i + j == 0 || (i + j > 0 && preToken.compareTo("``") == 0)) { // 8 = i:fstsfstc cp = "8:" + Integer.toString(j); tempCps.add(cp); } else { // 9 = i:nfstsfstc cp = "9:" + Integer.toString(j); tempCps.add(cp); } if (currentToken.endsWith("s")) { // 10 = i:fstcs cp = "10:" + Integer.toString(j); tempCps.add(cp); if (i + j == 0 || (i + j > 0 && preToken.compareTo("``") == 0)) { // 11 = i:fstsfstcs cp = "11:" + Integer.toString(j); tempCps.add(cp); } else { // 12 = i:nfstsfstcs cp = "12:" + Integer.toString(j); tempCps.add(cp); } } } boolean hasNumber = false; k = 0; while (k < tokenLen) { if (Character.isDigit(currentToken.charAt(k))) { hasNumber = true; break; } k++; } boolean isAllNumber = true; k = 0; while (k < tokenLen) { if (!(Character.isDigit(currentToken.charAt(k))) && currentToken.charAt(k) != '.' && currentToken.charAt(k) != ',') { isAllNumber = false; break; } k++; } if (!hasNumber) { isAllNumber = false; } if (isAllNumber) { // 13 = n:alln String cp = "13:" + Integer.toString(j); tempCps.add(cp); } if (!isAllNumber && hasNumber) { // 14 = n:hasn String cp = "14:" + Integer.toString(j); tempCps.add(cp); } boolean hasHyphen = false; k = 0; while (k < tokenLen) { if (currentToken.charAt(k) == '-') { hasHyphen = true; break; } k++; } if (hasHyphen) { //15 = h:hyph String cp = "15:" + Integer.toString(j); tempCps.add(cp); } } } List tempCpsInt = new ArrayList(); for (int k = 0; k < tempCps.size(); k++) { Integer cpInt = (Integer)cpStr2Int.get((String)tempCps.get(k)); if (cpInt == null) { continue; } tempCpsInt.add(cpInt); } Observation obsr = (Observation)seq.get(i); obsr.cps = new int[tempCpsInt.size()]; for (int k = 0; k < tempCpsInt.size(); k++) { obsr.cps[k] = ((Integer)tempCpsInt.get(k)).intValue(); } } // context predicate generation for each sequence public void cpGen(Map cpStr2Int, List seq) { for (int i = 0; i < seq.size(); i++) { cpGen(cpStr2Int, seq, i); } } // context predicate generation public void cpGen(Map cpStr2Int) { System.out.println("Generating context predicates for input data ..."); for (int i = 0; i < data.size(); i++) { cpGen(cpStr2Int, (List)data.get(i)); } System.out.println("Generating context predicates for input data completed!"); } } // end of class TaggingData
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -