📄 segmenter.jav.old
字号:
{ // Not chinese character
//logger.info("not cjk");
if (currentword.length() > 0)
{
outline.append(currentword.toString());
currentoffset += currentword.length();
offsets.add(new Integer(currentoffset));
if (Character.isWhitespace(currentchar) == false)
{
outline.append(separator);
currentoffset += separator.length();
if (separator.length() > 0)
{
offsets.add(new Integer(currentoffset));
}
}
currentword.setLength(0);
}
while ((i < clength) &&
(Character.UnicodeBlock.of(cline.charAt(i)) != Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS))
{
currentword.append(cline.charAt(i));
i++;
}
i--;
outline.append(currentword.toString());
currentoffset += currentword.length();
offsets.add(new Integer(currentoffset));
currentword.setLength(0);
}
}
outline.append(currentword.toString());
if (currentword.length() > 0)
{
currentoffset += currentword.length();
offsets.add(new Integer(currentoffset));
}
return offsets;
}
/**
 * Adds a word to the zhwords table.
 *
 * The word itself is always stored with the value "1". When the word is three to six
 * characters long, every leading prefix of length 2 up to (length - 1) that is not yet
 * a key in zhwords is stored with the value "2". Prefixes that are already present keep
 * their current value. Words shorter than three or longer than six characters get no
 * prefix entries.
 *
 * NOTE(review): "1" presumably marks a complete word and "2" a prefix-only entry;
 * confirm against the code that reads zhwords.
 *
 * @param newword the word to register; a null value fails with a NullPointerException
 */
public void addword(String newword)
{
    zhwords.put(newword.intern(), "1");

    int wordLength = newword.length();
    if (wordLength < 3 || wordLength > 6)
    {
        // Only words of length 3..6 contribute prefix entries.
        return;
    }

    for (int end = 2; end < wordLength; end++)
    {
        String prefix = newword.substring(0, end).intern();
        if (!zhwords.containsKey(prefix))
        {
            zhwords.put(prefix, "2");
        }
    }
}
/**
 * Segments the contents of a text file and writes the result next to it.
 *
 * The input file is read line by line using the given encoding; each line is appended
 * to one buffer followed by a single space. The joined text is passed to
 * segmentLine(String) (whose returned offset list is only logged) and to
 * segmentLine(String, " ") (whose returned string is written, followed by a line
 * separator, to inputfile + ".seg" in the same encoding).
 *
 * The output file is opened before the input is read. If the input turns out to be
 * empty the method returns without writing anything to it. A read error is logged and
 * whatever was read up to that point is still segmented.
 *
 * Both files are always closed, each independently of the other, and a failure while
 * closing (which for the writer includes flushing buffered output) is logged instead
 * of being silently dropped.
 *
 * @param inputfile path of the file to segment
 * @param encoding  charset name used for both reading the input and writing the output
 */
public void segmentFile(String inputfile, String encoding)
{
    String outfile = inputfile + ".seg";
    InputStream srcdata = null;
    BufferedReader in = null;
    BufferedWriter outbuffer = null;
    try
    {
        try
        {
            srcdata = new FileInputStream(inputfile);
            in = new BufferedReader(new InputStreamReader(srcdata, encoding));
            outbuffer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(outfile), encoding));
        }
        catch (UnsupportedEncodingException uee)
        {
            logger.warning("Encoding not supported, abort: " + uee);
            return;
        }
        catch (FileNotFoundException fnfe)
        {
            logger.warning("File not found, abort: " + fnfe);
            return;
        }

        StringBuffer buffer = new StringBuffer();
        try
        {
            String dataline;
            while ((dataline = in.readLine()) != null)
            {
                buffer.append(dataline).append(" ");
            }
        }
        catch (IOException ioe)
        {
            // Best effort: keep what was read before the failure and segment that.
            logger.warning("Error reading from stream: " + ioe);
        }

        try
        {
            String output = buffer.toString();
            if (output.length() == 0)
            {
                return;
            }
            // The offset list is produced for the log only; the file receives seg_result.
            LinkedList list = segmentLine(output);
            String seg_result = segmentLine(output, " ");
            logger.info("Offset: " + list);
            logger.info("Output: " + seg_result);

            outbuffer.write(seg_result);
            outbuffer.newLine();
        }
        catch (IOException ioe)
        {
            logger.warning("Error writing to file: " + ioe);
        }
    }
    finally
    {
        System.out.flush();

        // Close the reader (which closes the underlying stream); if the reader was never
        // created, close the raw stream directly so it cannot leak.
        if (in != null)
        {
            try
            {
                in.close();
            }
            catch (IOException ioe)
            {
                logger.warning("Error closing input file: " + ioe);
            }
        }
        else if (srcdata != null)
        {
            try
            {
                srcdata.close();
            }
            catch (IOException ioe)
            {
                logger.warning("Error closing input file: " + ioe);
            }
        }

        // Closed in its own try so a reader close failure cannot skip it; close() also
        // flushes, so a failure here means output may be incomplete.
        if (outbuffer != null)
        {
            try
            {
                outbuffer.close();
            }
            catch (IOException ioe)
            {
                logger.warning("Error closing output file: " + ioe);
            }
        }
    }
}
/**
 * Prints the command-line usage text to standard error, one println per entry,
 * and then terminates the JVM with exit status 0.
 */
public static void printHelp()
{
    String[] usageLines = {
        "Usage:\njava -jar Segmenter.jar [-b|-g|-8] inputfile.txt",
        "\t-b Big5, -g GB2312, -8 UTF-8",
        " Segmented text will be saved to inputfile.txt.seg"
    };
    for (int k = 0; k < usageLines.length; k++)
    {
        System.err.println(usageLines[k]);
    }
    System.exit(0);
}
/**
 * Command-line entry point.
 *
 * All arguments are parsed first, then the word list is loaded via
 * Segmenter.getSegmenter("zword.obj") and every collected path is processed:
 * <ul>
 *   <li>-b selects encoding BIG5 (the default), -g selects GBK, -8 selects UTF8;</li>
 *   <li>-s and -t only change the local charform value;</li>
 *   <li>-h prints the usage text via printHelp(), which exits the JVM;</li>
 *   <li>anything else is treated as an input path.</li>
 * </ul>
 * Because parsing finishes before any file is processed, the last encoding flag given
 * applies to every file. Paths that do not exist are reported and skipped. A directory
 * is not segmented itself; its entries are appended to the work list and handled by
 * later iterations of the same loop. With no input paths an error is logged and
 * printHelp() is called.
 *
 * @param argv command-line arguments
 */
public static void main(String[] argv)
{
    Vector pending = new Vector();
    String encoding = "BIG5";
    // charform is assigned by the flags below but never read inside this method.
    int charform = Segmenter.TRAD;
    // debug is never set to true here, so the guarded log calls below do not run.
    boolean debug = false;

    for (int a = 0; a < argv.length; a++)
    {
        String arg = argv[a];
        if (arg.equals("-b"))
        {
            if (debug)
            {
                logger.info("Setting to Big5, TRAD");
            }
            encoding = "BIG5";
            charform = Segmenter.TRAD;
        }
        else if (arg.equals("-g"))
        {
            if (debug)
            {
                logger.info("Setting to GB, SIMP");
            }
            encoding = "GBK";
            charform = Segmenter.SIMP;
        }
        else if (arg.equals("-8"))
        {
            encoding = "UTF8";
            charform = Segmenter.BOTH;
        }
        else if (arg.equals("-s"))
        {
            if (debug)
            {
                logger.info("Setting to SIMP");
            }
            charform = Segmenter.SIMP;
        }
        else if (arg.equals("-t"))
        {
            if (debug)
            {
                logger.info("Setting to TRAD");
            }
            charform = Segmenter.TRAD;
        }
        else if (arg.equals("-h"))
        {
            printHelp();
        }
        else
        {
            pending.add(arg);
        }
    }

    if (pending.size() == 0)
    {
        logger.info("ERROR: Please specify name of Chinese text file to segment.\n");
        printHelp();
    }

    logger.info("Loading segmenter word list. One moment please.");
    Segmenter mainsegmenter = Segmenter.getSegmenter("zword.obj");
    logger.info("Total keys " + mainsegmenter.zhwords.size());

    // pending grows while it is walked: directory entries are appended and picked up
    // by later iterations, so size() must be re-read on every pass.
    for (int n = 0; n < pending.size(); n++)
    {
        String path = (String) pending.get(n);
        File candidate = new File(path);

        if (!candidate.exists())
        {
            logger.info("ERROR: Source file " + path + " does not exist.\n");
            continue;
        }

        if (candidate.isDirectory())
        {
            String[] entries = candidate.list();
            if (entries != null)
            {
                for (int e = 0; e < entries.length; e++)
                {
                    pending.add(path + File.separator + entries[e]);
                }
            }
            continue;
        }

        logger.info("Segmenting " + path + " with encoding " + encoding);
        mainsegmenter.segmentFile(path, encoding);
    }
}
// Accessors
/**
 * Returns the csurname set. The field itself is returned, not a copy, so changes made
 * by the caller are visible to this object.
 *
 * NOTE(review): presumably holds Chinese surname characters; confirm where it is filled.
 *
 * @return the csurname TreeSet
 */
public TreeSet getCsurname()
{
    return csurname;
}
/**
 * Returns the cforeign set. The field itself is returned, not a copy, so changes made
 * by the caller are visible to this object.
 *
 * NOTE(review): presumably holds characters used in foreign-name transliteration;
 * confirm where it is filled.
 *
 * @return the cforeign TreeSet
 */
public TreeSet getCforeign()
{
    return cforeign;
}
/**
 * Returns the cnumbers set. The field itself is returned, not a copy, so changes made
 * by the caller are visible to this object.
 *
 * NOTE(review): presumably holds Chinese numeral characters; confirm where it is filled.
 *
 * @return the cnumbers TreeSet
 */
public TreeSet getCnumbers()
{
    return cnumbers;
}
/**
 * Returns the cnotname set. The field itself is returned, not a copy, so changes made
 * by the caller are visible to this object.
 *
 * NOTE(review): presumably holds characters that should not be treated as part of a
 * name; confirm where it is filled.
 *
 * @return the cnotname TreeSet
 */
public TreeSet getCnotname()
{
    return cnotname;
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -