📄 chinesesegmenter.java
字号:
/*
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * This program is based on the work of Erik Peterson (erik AT
 * mandarintools.com)
 *
 * Last modified by Jun Lu (zbno@hotmail.com) on Aug. 20, 2005
 * and is sponsored by http://www.zbno.com
 *
 * For more Chinese computing information, please go to
 * http://www.chinesecomputing.com.
 */
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Locale;
import java.util.TreeMap;
import java.util.TreeSet;

/**
 * Lexicon-driven segmenter that inserts a separator between the words of a
 * line of Chinese text.
 *
 * <p>Instances are obtained through {@link #getGBSegmenter()} or
 * {@link #getBig5Segmenter()}. Each character form has its own cached
 * instance; {@link #reset()} discards both caches.
 *
 * <p>All data files are looked up with {@code Class.getResourceAsStream}
 * relative to this class. A resource that is missing or unreadable is
 * reported on {@code System.err} and skipped; segmentation then proceeds with
 * whatever data was loaded (with no lexicon at all, every CJK character ends
 * up as its own word).
 *
 * <p>The lookup tables are only written while the constructor runs;
 * the public instance methods only read them.
 */
public class ChineseSegmenter {

    /** Character form: traditional characters. */
    public final static int TRAD = 0;

    /** Character form: simplified characters. */
    public final static int SIMP = 1;

    /** Character form: traditional and simplified characters together. */
    public final static int BOTH = 2;

    /** Encoding of every bundled data file. */
    private static final String ENCODING = "UTF-8";

    /** Lexicon value marking an entry that is a complete word. */
    private static final String WORD = "1";

    /** Lexicon value marking an entry that is only the beginning of a longer word. */
    private static final String PREFIX = "2";

    /** Lexicon lines longer than this many chars are ignored. */
    private static final int MAX_WORD_LENGTH = 4;

    /** Cached simplified-Chinese instance, created on first request. */
    private static ChineseSegmenter gbSegmenter = null;

    /** Cached traditional-Chinese instance, created on first request. */
    private static ChineseSegmenter big5Segmenter = null;

    /** Lexicon: entry mapped to {@link #WORD} or {@link #PREFIX}. */
    private final TreeMap<String, String> zhwords = new TreeMap<String, String>();

    /** Single characters used to transliterate foreign names. */
    private final TreeSet<String> cforeign = new TreeSet<String>();

    /** Single characters that count as number characters. */
    private final TreeSet<String> cnumbers = new TreeSet<String>();

    /**
     * Loads the character sets and, optionally, the lexicon for a character form.
     *
     * @param charform     {@link #TRAD}, {@link #SIMP} or {@link #BOTH}; any
     *                     other value is treated like {@link #BOTH}
     * @param loadwordfile whether to load the word lexicon as well
     */
    private ChineseSegmenter(int charform, boolean loadwordfile) {
        if (charform != TRAD) { // SIMP or BOTH
            loadset(cnumbers, "data/snumbers_u8.txt");
            loadset(cforeign, "data/sforeign_u8.txt");
        }
        if (charform != SIMP) { // TRAD or BOTH
            loadset(cnumbers, "data/tnumbers_u8.txt");
            loadset(cforeign, "data/tforeign_u8.txt");
        }
        if (loadwordfile) {
            loadLexicon(lexiconResource(charform));
        }
    }

    /** Returns the lexicon resource name for the given character form. */
    private static String lexiconResource(int charform) {
        if (charform == SIMP) {
            return "simplexu8.txt";
        }
        if (charform == TRAD) {
            return "tradlexu8.txt";
        }
        return "bothlexu8.txt";
    }

    /** Discards the cached segmenters so the next request builds fresh ones. */
    public synchronized static void reset() {
        gbSegmenter = null;
        big5Segmenter = null;
    }

    /**
     * Returns the shared segmenter for simplified Chinese, creating it on
     * first use.
     *
     * <p>Side effect: sets the JVM default locale to
     * {@link Locale#SIMPLIFIED_CHINESE} on every call.
     *
     * @return the cached simplified-Chinese segmenter
     */
    public synchronized static ChineseSegmenter getGBSegmenter() {
        Locale.setDefault(Locale.SIMPLIFIED_CHINESE);
        if (gbSegmenter == null) {
            gbSegmenter = new ChineseSegmenter(SIMP, true);
        }
        return gbSegmenter;
    }

    /**
     * Returns the shared segmenter for traditional Chinese, creating it on
     * first use.
     *
     * <p>Side effect: sets the JVM default locale to
     * {@link Locale#TRADITIONAL_CHINESE} on every call.
     *
     * @return the cached traditional-Chinese segmenter
     */
    public synchronized static ChineseSegmenter getBig5Segmenter() {
        Locale.setDefault(Locale.TRADITIONAL_CHINESE);
        if (big5Segmenter == null) {
            big5Segmenter = new ChineseSegmenter(TRAD, true);
        }
        return big5Segmenter;
    }

    /**
     * Adds every usable line of a resource to a set. Empty lines and lines
     * containing {@code #} are skipped.
     *
     * @param targetset  set receiving the lines
     * @param sourcefile resource name, resolved relative to this class
     */
    private void loadset(TreeSet<String> targetset, String sourcefile) {
        InputStream setdata = getClass().getResourceAsStream(sourcefile);
        if (setdata == null) {
            System.err.println("Data file not found: " + sourcefile);
            return;
        }
        try {
            BufferedReader in = new BufferedReader(new InputStreamReader(setdata, ENCODING));
            String dataline;
            while ((dataline = in.readLine()) != null) {
                if (dataline.length() > 0 && dataline.indexOf('#') == -1) {
                    targetset.add(dataline);
                }
            }
        } catch (IOException e) {
            System.err.println("Exception loading data file " + sourcefile + " " + e);
        } finally {
            closeQuietly(setdata);
        }
    }

    /**
     * Reads the word lexicon into {@link #zhwords}.
     *
     * @param resource lexicon resource name, resolved relative to this class
     */
    private void loadLexicon(String resource) {
        InputStream worddata = getClass().getResourceAsStream(resource);
        if (worddata == null) {
            System.err.println("Lexicon file not found: " + resource);
            return;
        }
        try {
            BufferedReader in = new BufferedReader(new InputStreamReader(worddata, ENCODING));
            String newword;
            while ((newword = in.readLine()) != null) {
                addLexiconEntry(newword);
            }
        } catch (IOException e) {
            System.err.println("Exception loading lexicon file " + resource + " " + e);
        } finally {
            closeQuietly(worddata);
        }
    }

    /**
     * Registers one lexicon line as a word, plus each of its proper prefixes
     * of two or more chars as a word start. Empty lines, lines containing
     * {@code #} and lines longer than {@link #MAX_WORD_LENGTH} are ignored.
     */
    private void addLexiconEntry(String word) {
        int length = word.length();
        if (length == 0 || length > MAX_WORD_LENGTH || word.indexOf('#') != -1) {
            return;
        }
        // A full word always wins, even over an earlier prefix-only marking.
        zhwords.put(word, WORD);
        for (int end = 2; end < length; end++) {
            String prefix = word.substring(0, end);
            // Never downgrade an entry that is already known (word or prefix).
            if (!zhwords.containsKey(prefix)) {
                zhwords.put(prefix, PREFIX);
            }
        }
    }

    /** Closes a stream, ignoring a failure to close since nothing useful can be done about it. */
    private static void closeQuietly(InputStream stream) {
        try {
            stream.close();
        } catch (IOException ignored) {
            // The data has already been read (or the read error already reported).
        }
    }

    /** Returns true when every char of {@code text} is a member of {@code charset}. */
    private static boolean allCharsIn(TreeSet<String> charset, String text) {
        for (int i = 0; i < text.length(); i++) {
            if (!charset.contains(text.substring(i, i + 1))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tests whether a string consists solely of number characters.
     *
     * @param testword string to test
     * @return true if every char is in the number set; true for an empty string
     */
    public boolean isNumber(String testword) {
        return allCharsIn(cnumbers, testword);
    }

    /**
     * Tests whether a string consists solely of foreign-name characters.
     *
     * @param testword string to test
     * @return true if every char is in the foreign set; true for an empty string
     */
    public boolean isAllForeign(String testword) {
        return allCharsIn(cforeign, testword);
    }

    /**
     * Tests whether a string contains no char from the CJK Unified Ideographs
     * Unicode block.
     *
     * @param testword string to test
     * @return false as soon as one such char is found, true otherwise
     */
    public boolean isNotCJK(String testword) {
        for (int i = 0; i < testword.length(); i++) {
            if (isCjkIdeograph(testword.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /** Returns true when the char lies in the CJK Unified Ideographs block. */
    private static boolean isCjkIdeograph(char c) {
        return Character.UnicodeBlock.of(c) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS;
    }

    /**
     * Segments one line of text, writing {@code separator} at word boundaries.
     *
     * <p>Chars that are neither CJK ideographs nor number characters are
     * copied through unchanged. A separator is not written where the
     * boundary already coincides with whitespace.
     *
     * @param cline     line to segment
     * @param separator text inserted between words
     * @return the segmented line
     */
    public String segmentLine(String cline, String separator) {
        StringBuilder currentword = new StringBuilder();
        StringBuilder outline = new StringBuilder();
        int clength = cline.length();

        for (int i = 0; i < clength; i++) {
            char currentchar = cline.charAt(i);
            boolean segmentable = isCjkIdeograph(currentchar)
                    || cnumbers.contains(String.valueOf(currentchar));

            if (!segmentable) {
                // Ordinary char: close any pending word, then copy the char through.
                if (currentword.length() > 0) {
                    outline.append(currentword);
                    if (!Character.isWhitespace(currentchar)) {
                        outline.append(separator);
                    }
                    currentword.setLength(0);
                }
                outline.append(currentchar);
            } else if (currentword.length() == 0) {
                // First char of a new word; detach it from preceding non-blank text.
                if (i > 0 && !Character.isWhitespace(cline.charAt(i - 1))) {
                    outline.append(separator);
                }
                currentword.append(currentchar);
            } else if (continuesWord(cline, i, currentword.toString())) {
                currentword.append(currentchar);
            } else {
                // The pending word is complete; this char starts the next one.
                outline.append(currentword);
                if (!Character.isWhitespace(currentchar)) {
                    outline.append(separator);
                }
                currentword.setLength(0);
                currentword.append(currentchar);
            }
        }
        outline.append(currentword);
        return outline.toString();
    }

    /**
     * Decides whether the char at index {@code i} should be appended to the
     * word collected so far. The rules are tried in this order:
     * lexicon word, foreign-name transliteration, run of number characters,
     * lexicon prefix confirmed by the following char.
     *
     * @param cline   line being segmented
     * @param i       index of the char under consideration
     * @param current non-empty word collected so far
     */
    private boolean continuesWord(String cline, int i, String current) {
        int clength = cline.length();
        char currentchar = cline.charAt(i);
        String single = String.valueOf(currentchar);
        String candidate = current + currentchar;
        String status = zhwords.get(candidate); // null when not in the lexicon

        if (WORD.equals(status)) {
            // word is in lexicon
            return true;
        }
        if (isAllForeign(current)
                && cforeign.contains(single)
                && i + 2 < clength
                && !zhwords.containsKey(cline.substring(i, i + 2))) {
            // Possibly a transliteration of a foreign name
            return true;
        }
        if (isNumber(current) && cnumbers.contains(single)) {
            // Put all consecutive number characters together
            return true;
        }
        // Starts a word in the lexicon, provided the next char keeps it alive
        return PREFIX.equals(status)
                && i + 1 < clength
                && zhwords.containsKey(candidate + cline.charAt(i + 1));
    }

    /** Small demonstration: segments a fixed sample line and prints the result. */
    public static void main(String[] args) throws Exception {
        ChineseSegmenter seg = ChineseSegmenter.getGBSegmenter();
        System.out.println(seg.segmentLine("Some string in chinese.", " "));
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -