📄 segmenter.jav.old
字号:
package org.apache.lucene.analysis.cw;
import java.lang.*;
import java.io.*;
import java.util.*;
import java.util.logging.*;
/*
Originally Written by Erik Peterson
erik AT mandarintools.com
Modified by Siu Ying
siu DOT ying AT gmail DOT com
*/
public class Segmenter
{
// Shared java.util.logging logger, named after this class.
static Logger logger = Logger.getLogger(Segmenter.class.getName());
// Singleton instance; created on the first call to getSegmenter().
static Segmenter mysegmenter;
// Word dictionary filled by the constructor (from the swap file or the lexicon
// resource). When built from the lexicon, listed words map to "1" and the
// 2-/3-character prefixes of 3-/4-character words that are not themselves
// listed map to "2".
private TreeMap zhwords;
// Character sets filled by the constructor through loadset() from the
// data/*surname*, *foreign*, *numbers* and *notname* resource files.
// cnumbers backs isNumber(); cforeign backs isAllForeign().
private TreeSet csurname, cforeign, cnumbers, cnotname;
// Charset name used when re-encoding text for debug log output; the
// constructor sets it to "UTF-8".
private String debugencoding;
// When true, isNumber() logs each tested word with its result; the
// constructor sets it to false.
private boolean debug;
// Character-form selectors accepted by the constructor's charform argument.
// TRAD selects the data/t*_u8.txt sets and tradlexu8.txt.
public final static int TRAD = 0;
// SIMP selects the data/s*_u8.txt sets and simplexu8.txt.
public final static int SIMP = 1;
// BOTH loads both families of sets and bothlexu8.txt.
public final static int BOTH = 2;
/**
 * Returns the process-wide Segmenter, building it on the first call.
 *
 * The instance is created with charform BOTH and with word loading enabled.
 * The method is synchronized, so concurrent first calls build only one
 * instance.
 *
 * @param swapfilename path of the serialized dictionary file handed to the
 *                     constructor; only used by the call that actually
 *                     creates the instance
 * @return the shared Segmenter
 */
public static synchronized Segmenter getSegmenter(String swapfilename)
{
    if (mysegmenter != null)
    {
        return mysegmenter;
    }
    mysegmenter = new Segmenter(BOTH, true, swapfilename);
    return mysegmenter;
}
/**
 * Builds a segmenter: loads the character sets for the requested character
 * form and, if asked, the word dictionary.
 *
 * The dictionary is read from the serialized swap file when that file exists
 * and holds a TreeMap; otherwise it is rebuilt from the lexicon resource and,
 * if the lexicon was read completely, written back to the swap file for the
 * next run.
 *
 * @param charform      SIMP, TRAD or BOTH; any other value is treated as BOTH
 * @param loadwordfile  when false only the character sets are loaded and the
 *                      dictionary stays empty
 * @param swapfilename  path of the serialized dictionary file
 * @throws NullPointerException if the dictionary has to be rebuilt and the
 *                              lexicon resource cannot be found, or if
 *                              swapfilename is null while loadwordfile is true
 */
private Segmenter(int charform, boolean loadwordfile, String swapfilename)
{
    debug = false;
    debugencoding = "UTF-8";
    csurname = new TreeSet();
    cforeign = new TreeSet();
    cnumbers = new TreeSet();
    cnotname = new TreeSet();
    // SIMP loads only the s* files, TRAD only the t* files; every other
    // value (BOTH) loads the s* files followed by the t* files.
    if (charform != TRAD)
    {
        loadset(cnumbers, "data/snumbers_u8.txt");
        loadset(cforeign, "data/sforeign_u8.txt");
        loadset(csurname, "data/ssurname_u8.txt");
        loadset(cnotname, "data/snotname_u8.txt");
    }
    if (charform != SIMP)
    {
        loadset(cnumbers, "data/tnumbers_u8.txt");
        loadset(cforeign, "data/tforeign_u8.txt");
        loadset(csurname, "data/tsurname_u8.txt");
        loadset(cnotname, "data/tnotname_u8.txt");
    }
    zhwords = new TreeMap();
    if (!loadwordfile)
    {
        return;
    }
    long start = System.currentTimeMillis();
    /* load words from object file, not txt file */
    File swapfile = new File(swapfilename);
    TreeMap swapped = readSwappedWords(swapfile);
    boolean lexiconComplete = false;
    if (swapped != null)
    {
        zhwords = swapped;
    }
    else
    {
        lexiconComplete = readLexiconWords(charform);
    }
    logger.info("Loaded in " + (System.currentTimeMillis() - start) + " ms");
    /* write words to object file */
    // Only a completely read lexicon is cached; caching a partial dictionary
    // would make every later run start from truncated data.
    if (lexiconComplete)
    {
        writeSwappedWords(swapfile);
    }
}

/**
 * Reads the serialized dictionary from the swap file.
 *
 * NOTE(review): this uses Java native deserialization; the swap file path
 * comes from the caller and must only ever point at a trusted file.
 *
 * @return the stored TreeMap, or null when the file is missing, unreadable,
 *         corrupt, or does not contain a TreeMap
 */
private TreeMap readSwappedWords(File swapfile)
{
    if (!swapfile.exists() || !swapfile.canRead())
    {
        return null;
    }
    InputStream swapdata = null;
    ObjectInputStream obj = null;
    try
    {
        logger.info("Loading data from object file");
        swapdata = new FileInputStream(swapfile);
        obj = new ObjectInputStream(swapdata);
        Object stored = obj.readObject();
        if (stored instanceof TreeMap)
        {
            return (TreeMap) stored;
        }
        // Covers both a null payload and an object of some other class.
        logger.info("Error reading object file: Format not expected: "
                + (stored == null ? "null" : stored.getClass().getName()));
    }
    catch (IOException ioe)
    {
        logger.info("Error reading object file: " + ioe);
    }
    catch (ClassNotFoundException cne)
    {
        logger.info("Error reading object file: Format not expected: " + cne);
    }
    finally
    {
        // The ObjectInputStream constructor itself can throw, in which case
        // only the raw file stream exists and must be closed directly.
        if (obj != null)
        {
            closeStream(obj);
        }
        else
        {
            closeStream(swapdata);
        }
    }
    return null;
}

/**
 * Fills zhwords from the lexicon resource matching charform.
 *
 * Lines containing '#' or with five or more characters are skipped. Every
 * other line (blank lines included, as before) is stored with value "1";
 * for 3- and 4-character words each proper prefix of length two or more is
 * stored with value "2" unless it is already present.
 *
 * @return true when the whole lexicon was read, false after an I/O error
 * @throws NullPointerException if the lexicon resource does not exist
 */
private boolean readLexiconWords(int charform)
{
    String lexicon;
    if (charform == SIMP)
    {
        lexicon = "simplexu8.txt";
    }
    else if (charform == TRAD)
    {
        lexicon = "tradlexu8.txt";
    }
    else
    {
        // Same fallback as the character sets: anything else means BOTH.
        lexicon = "bothlexu8.txt";
    }
    InputStream worddata = getClass().getResourceAsStream(lexicon);
    if (worddata == null)
    {
        // Same exception type the missing resource always produced, but now
        // it names the resource that could not be found.
        throw new NullPointerException("Lexicon resource not found: " + lexicon);
    }
    BufferedReader in = null;
    int count = 0;
    try
    {
        in = new BufferedReader(new InputStreamReader(worddata, "UTF-8"));
        String newword;
        while ((newword = in.readLine()) != null)
        {
            if (newword.indexOf("#") != -1 || newword.length() >= 5)
            {
                continue;
            }
            zhwords.put(newword.intern(), "1");
            for (int len = 2; len < newword.length(); len++)
            {
                String prefix = newword.substring(0, len).intern();
                if (!zhwords.containsKey(prefix))
                {
                    zhwords.put(prefix, "2");
                }
            }
            // Progress output; the value logged is the count after the
            // increment, exactly as before.
            if (count++ % 20000 == 0)
            {
                logger.info("" + count);
            }
        }
        return true;
    }
    catch (IOException e)
    {
        logger.info("IOException: " + e);
        return false;
    }
    finally
    {
        if (in != null)
        {
            closeStream(in);
        }
        else
        {
            closeStream(worddata);
        }
    }
}

/** Serializes zhwords to the swap file; failures are logged, not thrown. */
private void writeSwappedWords(File swapfile)
{
    OutputStream f = null;
    ObjectOutputStream s = null;
    try
    {
        swapfile.createNewFile();
        f = new FileOutputStream(swapfile);
        s = new ObjectOutputStream(f);
        s.writeObject(zhwords);
        s.flush();
    }
    catch (FileNotFoundException fe)
    {
        logger.info("Error finding object file: " + fe);
    }
    catch (IOException ioe)
    {
        logger.info("Error write object file: " + ioe);
    }
    finally
    {
        if (s != null)
        {
            closeStream(s);
        }
        else
        {
            closeStream(f);
        }
    }
}

/** Closes a stream if it is non-null, logging (not throwing) any failure. */
private static void closeStream(Closeable stream)
{
    if (stream == null)
    {
        return;
    }
    try
    {
        stream.close();
    }
    catch (IOException closeError)
    {
        logger.info("Error closing stream: " + closeError);
    }
}
/**
 * Load a set of character data.
 *
 * Reads the named class resource as UTF-8, one entry per line, and adds each
 * entry to the target set. Empty lines and lines containing '#' are skipped.
 * Loading is best-effort: a missing resource or any error while reading is
 * logged and the set keeps whatever was added up to that point.
 *
 * @param targetset  set that receives the entries
 * @param sourcefile resource name, resolved relative to this class
 */
private void loadset(TreeSet targetset, String sourcefile)
{
    InputStream setdata = null;
    BufferedReader in = null;
    try
    {
        setdata = getClass().getResourceAsStream(sourcefile);
        if (setdata == null)
        {
            // getResourceAsStream signals "not found" with null rather than
            // with an exception; report it by name.
            logger.info("Data file not found: " + sourcefile);
            return;
        }
        in = new BufferedReader(new InputStreamReader(setdata, "UTF-8"));
        String dataline;
        while ((dataline = in.readLine()) != null)
        {
            if ((dataline.indexOf("#") > -1) || (dataline.length() == 0))
            {
                continue;
            }
            targetset.add(dataline.intern());
        }
    }
    catch (Exception e)
    {
        // Deliberately broad: a bad data file must not abort construction.
        logger.info("Exception loading data file " + sourcefile + ": " + e);
    }
    finally
    {
        // Release the resource on every path, including a failed read.
        try
        {
            if (in != null)
            {
                in.close();
            }
            else if (setdata != null)
            {
                setdata.close();
            }
        }
        catch (IOException closeError)
        {
            logger.info("Error closing data file " + sourcefile + ": " + closeError);
        }
    }
}
/**
 * Tells whether every character of the word is in the numbers set.
 *
 * An empty word yields true because no character fails the check. When the
 * debug flag is on, the word and the result are logged.
 *
 * @param testword word to examine
 * @return true if each single character is contained in cnumbers
 */
public boolean isNumber(String testword)
{
    boolean allNumeric = true;
    int pos = 0;
    while (allNumeric && pos < testword.length())
    {
        String single = String.valueOf(testword.charAt(pos));
        allNumeric = cnumbers.contains(single);
        pos++;
    }
    if (debug)
    {
        try
        {
            byte[] encoded = testword.getBytes("UTF-8");
            logger.info(new String(encoded) + " " + allNumeric);
        }
        catch (Exception ignored)
        {
            // Debug output only; a failure here is intentionally dropped.
        }
    }
    return allNumeric;
}
/**
 * Tells whether every character of the word is in the foreign-character set.
 *
 * An empty word yields true because no character fails the check.
 *
 * @param testword word to examine
 * @return true if each single character is contained in cforeign
 */
public boolean isAllForeign(String testword)
{
    final int length = testword.length();
    for (int pos = 0; pos < length; pos++)
    {
        String single = String.valueOf(testword.charAt(pos));
        if (!cforeign.contains(single))
        {
            return false;
        }
    }
    return true;
}
/**
 * Tells whether the word contains no character from the Unicode block
 * CJK_UNIFIED_IDEOGRAPHS.
 *
 * Only that one block is checked; characters from other blocks never make
 * the result false. An empty word yields true.
 *
 * @param testword word to examine
 * @return false as soon as one char lies in CJK_UNIFIED_IDEOGRAPHS,
 *         true otherwise
 */
public boolean isNotCJK(String testword)
{
    final int length = testword.length();
    for (int pos = 0; pos < length; pos++)
    {
        Character.UnicodeBlock block = Character.UnicodeBlock.of(testword.charAt(pos));
        if (block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS)
        {
            return false;
        }
    }
    return true;
}
public String stemWord(String word)
{
String[] prefix = new String[] {"\u7b2c", "\u526f", "\u4e0d"};
String[] suffix = new String[] {"\u4e86", "\u7684", "\u5730", "\u4e0b", "\u4e0a", "\u4e2d", "\u91cc",
"\u5230", "\u5185", "\u5916", "\u4eec"};
String[] infix = new String[] {"\u5f97", "\u4e0d"};
int i;
StringBuffer unstemmed = new StringBuffer(word);
for (i = 0; i < prefix.length; i++)
{
if (unstemmed.substring(0, 1).equals(prefix[i]) == true &&
(zhwords.get(unstemmed.substring(1, unstemmed.length()).intern()) != null ||
unstemmed.length() == 2))
{
logger.info("Stemmed prefix");
try
{
logger.info(new String(unstemmed.toString().getBytes(debugencoding)));
}
catch (Exception a)
{ }
;
unstemmed.deleteCharAt(0);
return unstemmed.toString();
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -