// segmenter.java
package org.apache.lucene.analysis.cw;
import java.lang.*;
import java.io.*;
import java.util.*;
import java.util.logging.*;
/* Written by Erik Peterson
erik AT mandarintools.com
Last modified Jan. 13, 2004
Modified by Francis, Chong @ Mar 11, 2005
- add package
- implements serializable interface
- add getter for the treeset csurname, cforeign, cnumbers, cnotname;
- replace all output that went to stderr/stdout with java logging
*/
public class segmenter implements Serializable {
// Logger (java.util.logging) used for every diagnostic message of this class.
private static Logger logger = Logger.getLogger(segmenter.class.getName());
//private Hashtable zhwords;
// Lexicon filled by the constructor: a word read from the word file maps to "1";
// a 2- or 3-character leading fragment of a longer word that is not itself
// present yet is added with the value "2".
private TreeMap zhwords;
// Sets of strings read by loadset() from the data/*surname*, *foreign*,
// *numbers* and *notname* resource files respectively.
private TreeSet csurname, cforeign, cnumbers, cnotname;
// Charset name used to encode strings before they are written to the log.
private String debugencoding;
// When true, methods such as isNumber() log additional per-word output.
private boolean debug;
// Char form
// TRAD selects the t*-prefixed data files and tradlexu8.txt.
public final static int TRAD = 0;
// SIMP selects the s*-prefixed data files and simplexu8.txt.
public final static int SIMP = 1;
// BOTH loads the s* and the t* data files and bothlexu8.txt.
public final static int BOTH = 2;
/**
 * Creates a segmenter and loads its character sets and, optionally, its lexicon.
 *
 * <p>The number, foreign, surname and not-a-name sets are always loaded through
 * {@code loadset}. {@link #SIMP} loads only the {@code s*} data files,
 * {@link #TRAD} only the {@code t*} files, and any other value loads both
 * (simplified first).
 *
 * <p>When {@code loadwordfile} is true the matching lexicon resource is read as
 * UTF-8. Every line that contains no {@code '#'}, is not empty and is shorter
 * than five characters is stored in {@code zhwords} with the value {@code "1"};
 * each of its leading fragments of length 2 and 3 that is not yet a key is
 * stored with the value {@code "2"}.
 *
 * <p>A missing lexicon resource or an unrecognised {@code charform} is logged
 * as a warning and leaves the lexicon empty instead of aborting construction.
 *
 * @param charform     one of {@link #TRAD}, {@link #SIMP} or {@link #BOTH}
 * @param loadwordfile whether the word list should be loaded
 */
public segmenter(int charform, boolean loadwordfile) {
    debug = false;
    debugencoding = "UTF-8";

    csurname = new TreeSet();
    cforeign = new TreeSet();
    cnumbers = new TreeSet();
    cnotname = new TreeSet();

    // Anything that is not TRAD needs the simplified sets ...
    if (charform != TRAD) {
        loadset(cnumbers, "data/snumbers_u8.txt");
        loadset(cforeign, "data/sforeign_u8.txt");
        loadset(csurname, "data/ssurname_u8.txt");
        loadset(cnotname, "data/snotname_u8.txt");
    }
    // ... and anything that is not SIMP needs the traditional ones.
    if (charform != SIMP) {
        loadset(cnumbers, "data/tnumbers_u8.txt");
        loadset(cforeign, "data/tforeign_u8.txt");
        loadset(csurname, "data/tsurname_u8.txt");
        loadset(cnotname, "data/tnotname_u8.txt");
    }

    //zhwords = new Hashtable(120000);
    zhwords = new TreeMap();
    if (!loadwordfile) {
        return;
    }

    String lexicon = null;
    if (charform == SIMP) {
        lexicon = "simplexu8.txt";
    } else if (charform == TRAD) {
        lexicon = "tradlexu8.txt";
    } else if (charform == BOTH) {
        lexicon = "bothlexu8.txt";
    }
    if (lexicon == null) {
        logger.warning("Unknown charform " + charform + "; no word list loaded");
        return;
    }

    // getResourceAsStream returns null (it does not throw) for a missing resource.
    InputStream worddata = getClass().getResourceAsStream(lexicon);
    if (worddata == null) {
        logger.warning("Word list resource not found: " + lexicon);
        return;
    }

    BufferedReader in = null;
    try {
        in = new BufferedReader(new InputStreamReader(worddata, "UTF8"));
        int count = 0;
        String newword;
        while ((newword = in.readLine()) != null) {
            // Skip comment lines, blank lines and words of five or more characters.
            // TODO: add a section for words of 5 characters.
            if (newword.indexOf("#") != -1 || newword.length() == 0 || newword.length() >= 5) {
                continue;
            }
            zhwords.put(newword.intern(), "1");
            // Register the proper prefixes (length 2 .. length-1) so a lookup can
            // tell that a longer word may follow; never overwrite a real word.
            for (int end = 2; end < newword.length(); end++) {
                String fragment = newword.substring(0, end).intern();
                if (!zhwords.containsKey(fragment)) {
                    zhwords.put(fragment, "2");
                }
            }
            if (count++ % 20000 == 0) {
                logger.fine("" + count);
            }
        }
    } catch (IOException e) {
        logger.warning("IOException: " + e);
    } finally {
        // Always release the stream, even when reading failed part-way.
        try {
            if (in != null) {
                in.close();
            } else {
                worddata.close();
            }
        } catch (IOException e) {
            logger.warning("IOException: " + e);
        }
    }
}
/**
 * Loads one character-data resource into {@code targetset}.
 *
 * <p>The resource is read as UTF-8, one entry per line. Empty lines and lines
 * containing {@code '#'} are skipped; every other line is interned and added.
 * Failures are logged as warnings and never propagated, so a missing or
 * unreadable file simply leaves the set with whatever was read so far.
 *
 * @param targetset  set that receives the entries
 * @param sourcefile resource name, resolved relative to this class
 */
private void loadset(TreeSet targetset, String sourcefile) {
    // getResourceAsStream signals a missing resource with null, not an exception.
    InputStream setdata = getClass().getResourceAsStream(sourcefile);
    if (setdata == null) {
        logger.warning("Data file not found: " + sourcefile);
        return;
    }

    BufferedReader in = null;
    try {
        in = new BufferedReader(new InputStreamReader(setdata, "UTF-8"));
        String dataline;
        while ((dataline = in.readLine()) != null) {
            if (dataline.length() == 0 || dataline.indexOf("#") > -1) {
                continue;
            }
            targetset.add(dataline.intern());
        }
    } catch (Exception e) {
        // Best effort: keep the original behaviour of logging and carrying on.
        logger.warning("Exception loading data file" + sourcefile + " " + e);
    } finally {
        // Close the stream on every path so a read failure does not leak it.
        try {
            if (in != null) {
                in.close();
            } else {
                setdata.close();
            }
        } catch (IOException e) {
            logger.warning("Exception loading data file" + sourcefile + " " + e);
        }
    }
}
/**
 * Tells whether every single character of {@code testword} is contained in
 * the {@code cnumbers} set. An empty string yields {@code true}.
 * When {@code debug} is set, the word and the verdict are logged.
 *
 * @param testword text to examine
 * @return {@code true} if no character falls outside {@code cnumbers}
 */
public boolean isNumber(String testword) {
    final int len = testword.length();
    int pos = 0;
    // Advance while the current one-character string is a known number character.
    while (pos < len && cnumbers.contains(testword.substring(pos, pos + 1).intern())) {
        pos++;
    }
    // Reaching the end means no character was rejected.
    final boolean allNumeric = (pos == len);

    if (debug) {
        try {
            logger.info(new String(testword.getBytes("UTF-8")) + " " + allNumeric);
        } catch (Exception a) {
            // debug output only; failures are deliberately ignored
        }
    }
    return allNumeric;
}
/**
 * Tells whether every single character of {@code testword} is contained in
 * the {@code cforeign} set. An empty string yields {@code true}.
 *
 * @param testword text to examine
 * @return {@code false} as soon as one character is not in {@code cforeign}
 */
public boolean isAllForeign(String testword) {
    final int len = testword.length();
    for (int pos = 0; pos < len; pos++) {
        String single = testword.substring(pos, pos + 1).intern();
        if (!cforeign.contains(single)) {
            return false;
        }
    }
    return true;
}
/**
 * Tells whether {@code testword} is free of characters from the
 * {@code CJK_UNIFIED_IDEOGRAPHS} Unicode block. Only that one block is
 * tested. An empty string yields {@code true}.
 *
 * @param testword text to examine
 * @return {@code false} if at least one character lies in that block
 */
public boolean isNotCJK(String testword) {
    final char[] chars = testword.toCharArray();
    for (int idx = 0; idx < chars.length; idx++) {
        if (Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS == Character.UnicodeBlock.of(chars[idx])) {
            return false;
        }
    }
    return true;
}
/*
String add_ChineseNames(String tmpline) {
int tlen = tmpline.length();
StringBuffer newline = new StringBuffer();
for (int m = 0; m < tlen; m++) {
$tchar = substr($tmpline, $m, 1);
$currtoken = "";
if ($tchar =~ /^\s$/) {
$newline .= $tchar;
} else {
$currtoken = "";
while ($tchar !~ /^\s$/ and $m < $tlen) {
$currtoken .= $tchar;
$m++;
$tchar = substr($tmpline, $m, 1);
}
if (defined($csurname{$currtoken}) or
defined($uncommoncsurname{$currtoken})) { # found a surname, see what follows
# go past following spaces
$tchar = substr($tmpline, $m, 1);
$spaces = "";
while ($tchar =~ /\s/ and $m < $tlen) {
$spaces .= $tchar;
$m++;
$tchar = substr($tmpline, $m, 1);
}
# Get next token
$tchar = substr($tmpline, $m, 1);
$currtoken2 = "";
while ($tchar !~ /\s/ and $m < $tlen) {
$currtoken2 .= $tchar;
$m++;
$tchar = substr($tmpline, $m, 1);
}
# go past following spaces
$tchar = substr($tmpline, $m, 1);
$spaces2 = "";
while ($tchar =~ /\s/ and $m < $tlen) {
$spaces2 .= $tchar;
$m++;
$tchar = substr($tmpline, $m, 1);
}
# Get next token
$tchar = substr($tmpline, $m, 1);
$currtoken3 = "";
while ($tchar !~ /\s/ and $m < $tlen) {
$currtoken3 .= $tchar;
$m++;
$tchar = substr($tmpline, $m, 1);
}
if (isChinese($currtoken2) and (length($currtoken2) == 2) and
(!defined($cnotname{$currtoken2})) and
isChinese($currtoken3) and length($currtoken3) == 2 and
!defined($cnotname{$currtoken3}))
{
$newline .= $cname[0] . $currtoken . $currtoken2 . $currtoken3 . $cname[1];
$cwords{$currtoken . $currtoken2 . $currtoken3} = 1;
$cwords{$currtoken . $currtoken2} = 2; # short version for checking
} elsif (isChinese($currtoken2) and (length($currtoken2) == 2)
and (!defined($cnotname{$currtoken2})))
{
$newline .= $currtoken . $currtoken2 . $spaces2 . $currtoken3;
$cwords{$currtoken . $currtoken2} = 1;
} elsif (defined($csurname{$currtoken}) and
isChinese($currtoken2) and (length($currtoken2) == 4) and
($cwords{$currtoken2} != 1) and
(!defined($cnotname{$currtoken2})))
{
$newline .= $cname[0] . $currtoken . $currtoken2 . $cname[1] . $spaces2 . $currtoken3;
$cwords{$currtoken . $currtoken2} = 1;
$cwords{$currtoken . substr($currtoken2, 0, 2)} = 2; # short version to check
} elsif (defined($uncommoncsurname{$currtoken}) and
isChinese($currtoken2) and (length($currtoken2) == 4)
and (!defined($cnotname{$currtoken2}))
and ($cwords{$currtoken2} != 1))
{
$newline .= $cname[0] . $currtoken . $currtoken2 . $cname[1] . $spaces2 . $currtoken3;
$cwords{$currtoken . $currtoken2} = 1;
$cwords{$currtoken . substr($currtoken2, 0, 2)} = 2; # short version to check
} else {
$newline .= $currtoken . $spaces . $currtoken2 . $spaces2 . $currtoken3;
}
} else {
$newline .= $currtoken;
}
$m--; # reset so won't skip space
}
}
$newline;
}
*/
/**
 * Removes at most one affix character from {@code word}.
 *
 * <p>Rules are tried in this order and the first match wins:
 * <ol>
 *   <li>prefix: the first character is one of U+7B2C, U+526F, U+4E0D and the
 *       remainder is a lexicon key or the word is two characters long;</li>
 *   <li>suffix: the last character is in the suffix list and the remainder is
 *       a lexicon key or the word is two characters long;</li>
 *   <li>infix: the word is three characters long, its middle character is
 *       U+5F97 or U+4E0D, and first + last character form a lexicon key.</li>
 * </ol>
 * Words shorter than two characters are returned unchanged, because removing
 * an affix would leave nothing.
 *
 * @param word word to stem; must not be {@code null}
 * @return the stemmed word, or {@code word} itself when no rule applies
 * @throws NullPointerException if {@code word} is {@code null}
 */
public String stemWord(String word) {
    final int len = word.length();
    // Guard: substring(0, 1) would throw on "", and a single character could
    // otherwise be stemmed to the empty string.
    if (len < 2) {
        return word;
    }

    String[] prefix = new String[] {"\u7b2c", "\u526f", "\u4e0d"};
    String[] suffix = new String[] {"\u4e86", "\u7684", "\u5730", "\u4e0b", "\u4e0a", "\u4e2d", "\u91cc",
        "\u5230", "\u5185", "\u5916", "\u4eec"};
    String[] infix = new String[] {"\u5f97", "\u4e0d"};
    int i;

    final String first = word.substring(0, 1);
    final String withoutFirst = word.substring(1);
    for (i = 0; i < prefix.length; i++) {
        if (first.equals(prefix[i])
                && (len == 2 || zhwords.get(withoutFirst.intern()) != null)) {
            logger.info("Stemmed prefix");
            try {
                logger.info(new String(word.getBytes(debugencoding)));
            } catch (Exception ignored) {
                // logging the word is best effort; an encoding problem must not stop stemming
            }
            return withoutFirst;
        }
    }

    final String last = word.substring(len - 1);
    final String withoutLast = word.substring(0, len - 1);
    for (i = 0; i < suffix.length; i++) {
        if (last.equals(suffix[i])
                && (len == 2 || zhwords.get(withoutLast.intern()) != null)) {
            logger.info("Stemmed suffix");
            try {
                logger.info(new String(word.getBytes(debugencoding)));
            } catch (Exception ignored) {
                // logging the word is best effort; an encoding problem must not stop stemming
            }
            return withoutLast;
        }
    }

    if (len == 3) {
        final String middle = word.substring(1, 2);
        final String outer = first + last;
        for (i = 0; i < infix.length; i++) {
            if (middle.equals(infix[i]) && zhwords.get(outer.intern()) != null) {
                logger.info("Stemmed infix");
                return outer;
            }
        }
    }
    return word;
}
public String segmentLine(String cline, String separator) {
StringBuffer currentword = new StringBuffer();
StringBuffer outline = new StringBuffer();
int i, clength;
char currentchar;
//separator = " ";
clength = cline.length();
int[][] offsets = new int[clength][2];
for (i = 0; i < clength; i++) {
currentchar = cline.charAt(i);
if (Character.UnicodeBlock.of(currentchar) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS ||
isNumber(cline.substring(i, i+1)) == true) {
// Character in CJK block
if (currentword.length() == 0) { // start looking for next word
//logger.warning("current word length 0");
if (i > 0 && (Character.isWhitespace(cline.charAt(i-1)) == false)) {
outline.append(separator);
}
currentword.append(currentchar);
if (debug) {
try {logger.info(new String(currentword.toString().getBytes(debugencoding)));} catch (Exception a) { };
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -