📄 segmenter.java
字号:
}
} else {
if (zhwords.containsKey(new String(currentword.toString() + currentchar).intern()) == true &&
((String)(zhwords.get(new String(currentword.toString() + currentchar).intern()))).equals("1") == true) {
// word is in lexicon
currentword.append(currentchar);
if (debug) {
try {logger.info(new String(currentword.toString().getBytes(debugencoding)));}
catch (Exception a) { };
}
} else if (isAllForeign(currentword.toString()) &&
cforeign.contains(new String(new char[] {currentchar}).intern()) &&
i + 2 < clength &&
(zhwords.containsKey(cline.substring(i, i+2).intern()) == false)) {
// Possible a transliteration of a foreign name
currentword.append(currentchar);
if (debug) {
try {logger.info(new String(currentword.toString().getBytes(debugencoding)));}
catch (Exception a) { };
}
} else if (isNumber(currentword.toString()) &&
cnumbers.contains(new String(new char[] {currentchar}).intern())
/* && (i + 2 < clength) &&
(zhwords.containsKey(cline.substring(i, i+2).intern()) == false) */ ) {
// Put all consecutive number characters together
currentword.append(currentchar);
if (debug) {
try {logger.info(new String(currentword.toString().getBytes(debugencoding)));}
catch (Exception a) { };
}
} else if ((zhwords.containsKey(new String(currentword.toString() + currentchar).intern())) &&
(((String)(zhwords.get(new String(currentword.toString() +
currentchar).intern()))).equals("2") == true) &&
i + 1 < clength &&
(zhwords.containsKey(new String(currentword.toString() + currentchar +
cline.charAt(i+1)).intern()) == true))
{
if (debug) {
try {logger.info(new String(currentword.toString().getBytes(debugencoding)));}
catch (Exception a) { };
}
// Starts a word in the lexicon
currentword.append(currentchar);
} else { // Start anew
if (debug) {
try {logger.info(new String(currentword.toString().getBytes(debugencoding)));}
catch (Exception a) { };
}
outline.append(currentword.toString());
if (Character.isWhitespace(currentchar) == false) {
outline.append(separator);
}
currentword.setLength(0);
currentword.append(currentchar);
}
}
} else { // Not chinese character
//logger.warning("not cjk");
if (currentword.length() > 0) {
outline.append(currentword.toString());
if (Character.isWhitespace(currentchar) == false) {
outline.append(separator);
}
currentword.setLength(0);
}
outline.append(currentchar);
}
}
outline.append(currentword.toString());
return outline.toString();
//return offsets;
}
/**
 * Segments one line of text and reports where the segment boundaries fall.
 *
 * The line is scanned left to right. A character that lies in the CJK Unified
 * Ideographs block, or whose one-character string is accepted by isNumber, is
 * either appended to the word being built or starts a new word, depending on
 * the lexicon lookups below. Any other character starts a run that is consumed
 * up to the next CJK Unified Ideographs character and treated as one unit.
 *
 * In zhwords the value "1" marks a complete word and "2" marks a prefix of a
 * longer word (see addword).
 *
 * The separator is fixed to the empty string in this overload, so the
 * separator-related offset entries are never added, and every offset is the
 * cumulative number of characters emitted so far. The text accumulated in
 * outline is built but not returned.
 *
 * @param cline the line to segment
 * @return a LinkedList of Integer offsets that starts with 0 and receives one
 *         further entry each time a word or a non-CJK run is closed
 */
public LinkedList segmentLine(String cline) {
StringBuffer currentword = new StringBuffer();
StringBuffer outline = new StringBuffer();
int i, clength;
char currentchar;
// Empty separator: all "separator.length() > 0" guards below evaluate to false.
String separator = "";
LinkedList offsets = new LinkedList();
int currentoffset = 0;
// The list always begins with the offset of the start of the line.
offsets.add(new Integer(0));
clength = cline.length();
for (i = 0; i < clength; i++) {
currentchar = cline.charAt(i);
if (Character.UnicodeBlock.of(currentchar) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS ||
isNumber(cline.substring(i, i+1)) == true) {
// Character in the CJK block, or a single character accepted by isNumber
if (currentword.length() == 0) { // start looking for next word
//logger.warning("current word length 0");
// A separator is emitted only when the previous character was not whitespace.
if (i > 0 && (Character.isWhitespace(cline.charAt(i-1)) == false)) {
outline.append(separator);
currentoffset += separator.length();
if (separator.length() > 0) {offsets.add(new Integer(currentoffset));}
}
currentword.append(currentchar);
} else {
// The branches below are tried in order; the first one that matches extends
// the current word, and the final else closes it.
if (zhwords.containsKey(new String(currentword.toString() + currentchar).intern()) == true &&
((String)(zhwords.get(new String(currentword.toString() + currentchar).intern()))).equals("1") == true) {
// Current word plus this character is a complete lexicon word
currentword.append(currentchar);
} else if (isNumber(currentword.toString()) &&
cnumbers.contains(new String(new char[] {currentchar}).intern())
/* && (i + 2 < clength) &&
(zhwords.containsKey(cline.substring(i, i+2).intern()) == false) */ ) {
// Put all consecutive number characters together
currentword.append(currentchar);
// Debug trace of the word so far; any exception from the re-encoding is ignored.
if (debug) {
try {logger.info(new String(currentword.toString().getBytes(debugencoding)));}
catch (Exception a) { };
}
} else if (isAllForeign(currentword.toString()) &&
cforeign.contains(new String(new char[] {currentchar}).intern()) &&
i + 2 < clength &&
(zhwords.containsKey(cline.substring(i, i+2).intern()) == false)) {
// Possibly a transliteration of a foreign name: the character is in cforeign
// and the two-character string starting here is not a lexicon key.
currentword.append(currentchar);
} else if ((zhwords.containsKey(new String(currentword.toString() + currentchar).intern())) &&
(((String)(zhwords.get(new String(currentword.toString() + currentchar).intern()))).equals("2") == true) &&
i + 1 < clength &&
(zhwords.containsKey(new String(currentword.toString() + currentchar + cline.charAt(i+1)).intern()) == true))
{
// Current word plus this character is only a prefix ("2"), but adding the
// next character as well yields a lexicon key, so keep extending.
currentword.append(currentchar);
} else { // Start anew
// Close the current word, record its end offset, and begin a new word
// with the current character.
outline.append(currentword.toString());
currentoffset += currentword.length(); offsets.add(new Integer(currentoffset));
if (Character.isWhitespace(currentchar) == false) {
outline.append(separator);
currentoffset += separator.length();
if (separator.length() > 0) {offsets.add(new Integer(currentoffset));}
}
currentword.setLength(0);
currentword.append(currentchar);
}
}
} else { // Not a CJK character and not accepted by isNumber
//logger.warning("not cjk");
// Close any word still being built before handling the non-CJK run.
if (currentword.length() > 0) {
outline.append(currentword.toString());
currentoffset += currentword.length(); offsets.add(new Integer(currentoffset));
if (Character.isWhitespace(currentchar) == false) {
outline.append(separator);
currentoffset += separator.length();
if (separator.length() > 0) {offsets.add(new Integer(currentoffset));}
}
currentword.setLength(0);
}
// Consume the whole run up to the next CJK Unified Ideographs character.
// This test checks only the Unicode block, not isNumber, so the run ends
// only at a character in that block.
while ((i < clength) &&
(Character.UnicodeBlock.of(cline.charAt(i)) != Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS)) {
currentword.append(cline.charAt(i));
i++;
}
// Step back one position because the enclosing for loop increments i again.
i--;
// The run is emitted as a single unit with one offset entry.
outline.append(currentword.toString());
currentoffset += currentword.length(); offsets.add(new Integer(currentoffset));
currentword.setLength(0);
}
}
// Flush the word still being built when the end of the line is reached.
outline.append(currentword.toString());
if (currentword.length() > 0) {
currentoffset += currentword.length(); offsets.add(new Integer(currentoffset));
}
return offsets;
}
/**
 * Adds a word to the lexicon map zhwords.
 *
 * The word itself is stored with the value "1", replacing any value already
 * present for that key. For words of 3 to 6 characters, every leading
 * substring of length 2 up to (length - 1) that is not yet a key is stored
 * with the value "2". Words shorter than 3 or longer than 6 characters get
 * no such prefix entries.
 *
 * @param newword the word to register
 */
public void addword(String newword) {
    zhwords.put(newword.intern(), "1");

    final int wordLength = newword.length();
    if (wordLength < 3 || wordLength > 6) {
        // Outside the 3..6 window no prefix entries are created.
        return;
    }

    // Shortest prefix first; existing keys (whatever their value) are left untouched.
    for (int prefixEnd = 2; prefixEnd < wordLength; prefixEnd++) {
        String prefix = newword.substring(0, prefixEnd).intern();
        if (!zhwords.containsKey(prefix)) {
            zhwords.put(prefix, "2");
        }
    }
}
/**
 * Segments a text file line by line and writes the result to a sibling file.
 *
 * Each line of inputfile is decoded with the given encoding, passed to
 * segmentLine(line, " "), and written, followed by a line separator, to
 * inputfile + ".seg" using the same encoding.
 *
 * Any exception is reported through logger.warning and not rethrown. Both
 * files are closed in a finally block, so file handles are released and
 * buffered output is flushed even when reading, segmenting or writing fails
 * part-way through.
 *
 * @param inputfile path of the file to segment
 * @param encoding  charset name used for both reading and writing
 */
public void segmentFile(String inputfile, String encoding) {
    String outfile = inputfile + ".seg";
    String segstring;
    // Local flag that is always false, so the trace below never runs.
    // NOTE(review): this local presumably hides a class-level debug flag - confirm before removing.
    boolean debug = false;

    // Declared outside the try so the finally block can release whatever was opened.
    InputStream srcdata = null;
    FileOutputStream dstdata = null;
    BufferedReader in = null;
    BufferedWriter outbuffer = null;
    try {
        String dataline;
        srcdata = new FileInputStream(inputfile);
        in = new BufferedReader(new InputStreamReader(srcdata, encoding));
        dstdata = new FileOutputStream(outfile);
        outbuffer = new BufferedWriter(new OutputStreamWriter(dstdata, encoding));
        while ((dataline = in.readLine()) != null) {
            segstring = segmentLine(dataline, " ");
            if (debug) {
                byte[] gbbytes = segstring.getBytes(encoding);
                logger.warning("Output: " + new String(gbbytes));
            }
            outbuffer.write(segstring);
            outbuffer.newLine();
        }
    }
    catch (Exception e) {
        logger.warning("Exception " + e.toString());
    }
    finally {
        // Close the reader if it was created; otherwise close the raw stream,
        // which may have been opened before the reader constructor failed.
        try {
            if (in != null) {
                in.close();
            } else if (srcdata != null) {
                srcdata.close();
            }
        }
        catch (Exception e) {
            logger.warning("Exception " + e.toString());
        }
        // Same for the output side; closing the writer also flushes it.
        try {
            if (outbuffer != null) {
                outbuffer.close();
            } else if (dstdata != null) {
                dstdata.close();
            }
        }
        catch (Exception e) {
            logger.warning("Exception " + e.toString());
        }
    }
}
/**
 * Prints the command-line usage text to standard output and then terminates
 * the JVM with exit status 0.
 *
 * Note: the text lists only -b, -g and -8; main additionally accepts -s, -t
 * and -h.
 */
public static void printHelp() {
    final String[] usageLines = {
        "Usage:\njava -jar segmenter.jar [-b|-g|-8] inputfile.txt",
        "\t-b Big5, -g GB2312, -8 UTF-8",
        " Segmented text will be saved to inputfile.txt.seg"
    };
    for (int lineIndex = 0; lineIndex < usageLines.length; lineIndex++) {
        System.out.println(usageLines[lineIndex]);
    }
    System.exit(0);
}
/**
 * Command-line entry point: parses options, loads the segmenter and segments
 * every input file.
 *
 * Options, applied in the order given (later ones override earlier ones):
 *   -b  encoding "BIG5", character form TRAD (also the defaults)
 *   -g  encoding "GBK",  character form SIMP
 *   -8  encoding "UTF8", character form BOTH
 *   -s  character form SIMP, encoding unchanged
 *   -t  character form TRAD, encoding unchanged
 *   -h  print usage via printHelp()
 * Every other argument is taken as an input path. When no path is given, an
 * error is logged and printHelp() is called.
 *
 * A path that names a directory is not segmented itself; its entries are
 * appended to the work list and handled later in the same loop.
 *
 * @param argv command-line arguments
 */
public static void main(String[] argv) {
    Vector inputfiles = new Vector();
    String encoding = "BIG5";
    int charform = segmenter.TRAD;
    // Always false here, so the option traces below are never logged.
    boolean debug = false;

    for (int argIndex = 0; argIndex < argv.length; argIndex++) {
        String arg = argv[argIndex];
        if (arg.equals("-b")) {
            if (debug) {
                logger.info("Setting to Big5, TRAD");
            }
            encoding = "BIG5";
            charform = segmenter.TRAD;
        } else if (arg.equals("-g")) {
            if (debug) {
                logger.info("Setting to GB, SIMP");
            }
            encoding = "GBK";
            charform = segmenter.SIMP;
        } else if (arg.equals("-8")) {
            encoding = "UTF8";
            charform = segmenter.BOTH;
        } else if (arg.equals("-s")) {
            if (debug) {
                logger.info("Setting to SIMP");
            }
            charform = segmenter.SIMP;
        } else if (arg.equals("-t")) {
            if (debug) {
                logger.info("Setting to TRAD");
            }
            charform = segmenter.TRAD;
        } else if (arg.equals("-h")) {
            printHelp();
        } else {
            inputfiles.add(arg);
        }
    }

    if (inputfiles.isEmpty()) {
        logger.info("ERROR: Please specify name of Chinese text file to segment.\n");
        printHelp();
    }

    logger.warning("Loading segmenter word list. One moment please.");
    segmenter mainsegmenter = new segmenter(charform, true);
    logger.warning("Total keys " + mainsegmenter.zhwords.size());

    // Index-based loop on purpose: directory entries are appended to the
    // vector while it is being walked, and size() is re-read every pass.
    for (int fileIndex = 0; fileIndex < inputfiles.size(); fileIndex++) {
        String path = (String) inputfiles.get(fileIndex);
        File candidate = new File(path);

        if (!candidate.exists()) {
            logger.info("ERROR: Source file " + path + " does not exist.\n");
            continue;
        }

        if (candidate.isDirectory()) {
            String[] children = candidate.list();
            if (children != null) {
                for (int childIndex = 0; childIndex < children.length; childIndex++) {
                    inputfiles.add(path + File.separator + children[childIndex]);
                }
            }
            continue;
        }

        logger.warning("Segmenting " + path + " with encoding " + encoding);
        mainsegmenter.segmentFile(path, encoding);
    }
}
// accessor
/**
 * Returns the csurname set held by this segmenter. The field itself is
 * returned, not a copy.
 * NOTE(review): presumably the set of surname characters - confirm where it is populated.
 *
 * @return the csurname TreeSet
 */
public TreeSet getCsurname() {
    return this.csurname;
}
/**
 * Returns the cforeign set, which segmentLine consults to decide whether a
 * character may continue a foreign-name transliteration. The field itself is
 * returned, not a copy.
 *
 * @return the cforeign TreeSet
 */
public TreeSet getCforeign() {
    return this.cforeign;
}
/**
 * Returns the cnumbers set, which segmentLine consults to keep consecutive
 * number characters together in one word. The field itself is returned, not
 * a copy.
 *
 * @return the cnumbers TreeSet
 */
public TreeSet getCnumbers() {
    return this.cnumbers;
}
/**
 * Returns the cnotname set held by this segmenter. The field itself is
 * returned, not a copy.
 * NOTE(review): presumably characters that cannot be part of a name - confirm where it is populated.
 *
 * @return the cnotname TreeSet
 */
public TreeSet getCnotname() {
    return this.cnotname;
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -