📄 chinesesegmenter.java
字号:
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.*;
import java.util.Locale;
import java.util.TreeMap;
import java.util.TreeSet;
public class ChineseSegmenter {
/** Cached instance handed out by the static factory methods; null until one is built or after reset(). */
private static ChineseSegmenter segmenter = null;
/**
 * Lexicon map. The constructor stores the value "1" for every word read from
 * the lexicon resource and "2" for the shorter prefixes it registers on
 * behalf of three- and four-character words.
 */
private TreeMap zhwords;
/**
 * Lookup sets filled from data files by loadset(). The isAllForeign,
 * isNumber, isSign and isEnglish methods query them with one-character
 * substrings of the word being tested.
 */
private TreeSet cforeign, cnumbers, csign, cenglish;
/** Character-form selector: load the traditional-character data files. */
public final static int TRAD = 0;
/** Character-form selector: load the simplified-character data files. */
public final static int SIMP = 1;
/** Character-form selector: load both the simplified and the traditional data files. */
public final static int BOTH = 2;
/**
 * Builds a segmenter for the requested character form.
 *
 * <p>The number, foreign, sign and English character sets are always loaded.
 * The word lexicon is loaded only when {@code loadwordfile} is true. Every
 * lexicon line that contains no '#' and is shorter than five characters is
 * stored with the value "1"; for each such word longer than two characters,
 * every prefix of length two or more that is not yet present is stored with
 * the value "2".
 *
 * <p>Loading is best effort: a missing or unreadable resource is reported on
 * standard error and leaves the corresponding collection empty or partially
 * filled instead of aborting construction.
 *
 * @param charform     one of {@link #SIMP}, {@link #TRAD} or {@link #BOTH};
 *                     any other value is treated as {@link #BOTH}
 * @param loadwordfile whether to read the word lexicon into memory
 */
private ChineseSegmenter(int charform, boolean loadwordfile) {
    cforeign = new TreeSet();
    cnumbers = new TreeSet();
    csign = new TreeSet();
    // Must exist before loadset() fills it and before isEnglish() queries it.
    cenglish = new TreeSet();

    if (charform != TRAD) { // SIMP or BOTH
        loadset(cnumbers, "data/snumbers_u8.txt");
        loadset(cforeign, "data/sforeign_u8.txt");
    }
    if (charform != SIMP) { // TRAD or BOTH
        loadset(cnumbers, "data/tnumbers_u8.txt");
        loadset(cforeign, "data/tforeign_u8.txt");
    }
    loadset(csign, "data/snotname_u8.txt");
    loadset(cenglish, "data/english_u8.txt");

    zhwords = new TreeMap();
    if (!loadwordfile) {
        return;
    }

    // Same fallback as the character sets above: anything that is neither
    // SIMP nor TRAD uses the combined lexicon.
    String lexicon;
    if (charform == SIMP) {
        lexicon = "simplexu8.txt";
    } else if (charform == TRAD) {
        lexicon = "tradlexu8.txt";
    } else {
        lexicon = "bothlexu8.txt";
    }

    InputStream worddata = getClass().getResourceAsStream(lexicon);
    if (worddata == null) {
        // getResourceAsStream signals a missing resource with null, which
        // would otherwise surface as a NullPointerException in the reader.
        System.err.println("Lexicon resource not found: " + lexicon);
        return;
    }

    try {
        BufferedReader in = new BufferedReader(new InputStreamReader(
                worddata, "UTF8"));
        try {
            String newword;
            while ((newword = in.readLine()) != null) {
                if (newword.indexOf("#") != -1 || newword.length() >= 5) {
                    continue;
                }
                zhwords.put(newword.intern(), "1");
                // Register prefixes of length 2..length-1 so that a longer
                // word can be recognised while it is still being matched.
                for (int end = 2; end < newword.length(); end++) {
                    String prefix = newword.substring(0, end).intern();
                    if (!zhwords.containsKey(prefix)) {
                        zhwords.put(prefix, "2");
                    }
                }
            }
        } finally {
            // Release the stream even when reading fails part-way through.
            in.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/** Character form of the cached segmenter, or -1 when nothing is cached. */
private static int segmenterCharform = -1;

/**
 * Discards the cached segmenter so that the next factory call builds a
 * fresh one.
 */
public synchronized static void reset() {
    ChineseSegmenter.segmenter = null;
    ChineseSegmenter.segmenterCharform = -1;
}

/**
 * Returns a segmenter loaded with the simplified-character data.
 *
 * <p>Side effect: sets the JVM default locale to
 * {@link Locale#SIMPLIFIED_CHINESE} on every call.
 *
 * @return the cached simplified segmenter, built on demand
 */
public synchronized static ChineseSegmenter getGBSegmenter() {
    Locale.setDefault(Locale.SIMPLIFIED_CHINESE);
    return cachedSegmenter(ChineseSegmenter.SIMP);
}

/**
 * Returns a segmenter loaded with the traditional-character data.
 *
 * <p>Side effect: sets the JVM default locale to
 * {@link Locale#TRADITIONAL_CHINESE} on every call.
 *
 * @return the cached traditional segmenter, built on demand
 */
public synchronized static ChineseSegmenter getBig5Segmenter() {
    Locale.setDefault(Locale.TRADITIONAL_CHINESE);
    return cachedSegmenter(ChineseSegmenter.TRAD);
}

/**
 * Returns the cached segmenter when it was built for {@code charform},
 * otherwise builds and caches a new one. Without the form check, whichever
 * factory ran first decided what both factories returned. Only called from
 * the synchronized factory methods.
 */
private static ChineseSegmenter cachedSegmenter(int charform) {
    if (segmenter == null || segmenterCharform != charform) {
        segmenter = new ChineseSegmenter(charform, true);
        segmenterCharform = charform;
    }
    return segmenter;
}
/**
 * Adds every data line of a UTF-8 classpath resource to {@code targetset}.
 * Empty lines and lines containing '#' are skipped.
 *
 * <p>Loading is best effort: any failure is reported on standard error and
 * the set keeps whatever was added before the failure. Nothing is thrown to
 * the caller.
 *
 * @param targetset  set that receives the lines
 * @param sourcefile resource name, resolved relative to this class
 */
private void loadset(TreeSet targetset, String sourcefile) {
    try {
        if (targetset == null) {
            System.err.println("Exception loading data file " + sourcefile
                    + " target set is null");
            return;
        }
        InputStream setdata = getClass().getResourceAsStream(sourcefile);
        if (setdata == null) {
            // getResourceAsStream signals a missing resource with null.
            System.err.println("Exception loading data file " + sourcefile
                    + " resource not found");
            return;
        }
        BufferedReader in = new BufferedReader(new InputStreamReader(
                setdata, "UTF-8"));
        try {
            String dataline;
            while ((dataline = in.readLine()) != null) {
                if (dataline.length() == 0 || dataline.indexOf("#") > -1) {
                    continue;
                }
                targetset.add(dataline.intern());
            }
        } finally {
            // Release the stream even when reading fails part-way through.
            in.close();
        }
    } catch (Exception e) {
        // Deliberately broad: a bad data file must not abort construction.
        System.err.println("Exception loading data file " + sourcefile + " "
                + e);
        e.printStackTrace();
    }
}
/**
 * Tells whether every one-character substring of {@code testword} is in the
 * number set.
 *
 * @param testword text to examine
 * @return false as soon as one character is not in the set; true otherwise,
 *         including for an empty string
 */
public boolean isNumber(String testword) {
    final int length = testword.length();
    for (int pos = 0; pos < length; pos++) {
        String single = String.valueOf(testword.charAt(pos));
        if (!cnumbers.contains(single)) {
            return false;
        }
    }
    return true;
}
/**
 * Tells whether every one-character substring of {@code testword} is in the
 * sign set.
 *
 * @param testword text to examine
 * @return false as soon as one character is not in the set; true otherwise,
 *         including for an empty string
 */
public boolean isSign(String testword) {
    int index = 0;
    while (index < testword.length()) {
        String symbol = testword.substring(index, index + 1);
        if (!csign.contains(symbol)) {
            return false;
        }
        index++;
    }
    return true;
}
/**
 * Tells whether every one-character substring of {@code testword} is in the
 * English character set.
 *
 * @param testword text to examine
 * @return false as soon as one character is not in the set; true otherwise,
 *         including for an empty string
 */
public boolean isEnglish(String testword) {
    char[] letters = testword.toCharArray();
    for (int k = 0; k < letters.length; k++) {
        if (!cenglish.contains(String.valueOf(letters[k]))) {
            return false;
        }
    }
    return true;
}
/**
 * Tells whether every one-character substring of {@code testword} is in the
 * foreign character set.
 *
 * @param testword text to examine
 * @return false as soon as one character is not in the set; true otherwise,
 *         including for an empty string
 */
public boolean isAllForeign(String testword) {
    boolean allForeign = true;
    // The flag in the loop condition stops the scan at the first miss.
    for (int offset = 0; allForeign && offset < testword.length(); offset++) {
        allForeign = cforeign.contains(testword.substring(offset, offset + 1));
    }
    return allForeign;
}
public boolean isNotCJK(String testword) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -