⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 segmenter.java

📁 一个jsp写的bbs
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
package org.apache.lucene.analysis.cw;

import java.lang.*;
import java.io.*;
import java.util.*;
import java.util.logging.*;

/* Written by Erik Peterson
   erik AT mandarintools.com
   Last modified Jan. 13, 2004
   
   Modified by Francis, Chong @ Mar 11, 2005
    - add package
    - implements serializable interface
    - add getter for the treeset csurname, cforeign, cnumbers, cnotname;
    - replace all output that goes stderr/stdout to java logging
*/

public class segmenter implements Serializable {
    private static Logger logger = Logger.getLogger(segmenter.class.getName());
    //private Hashtable zhwords;
    private TreeMap zhwords;
    private TreeSet csurname, cforeign, cnumbers, cnotname;
    private String debugencoding;

    private boolean debug;

    // Char form
    public final static int TRAD = 0;
    public final static int SIMP = 1;
    public final static int BOTH = 2;

    /**
     * Creates a segmenter for the requested character form.
     *
     * The four character sets (numbers, foreign-name characters, surnames and
     * not-a-name characters) are always loaded through {@code loadset}.  The
     * word list is only read when {@code loadwordfile} is true; otherwise the
     * word map is left empty.
     *
     * Each accepted word is stored with the value "1".  For words of three or
     * four characters, every leading fragment of two or more characters that
     * is not already present is stored with the value "2".  Lines containing
     * '#', blank lines and lines of five or more characters are skipped.
     *
     * @param charform     TRAD, SIMP or BOTH; any other value is handled as BOTH
     * @param loadwordfile whether the word list resource should be read
     */
    public segmenter(int charform, boolean loadwordfile) {
	debug = false;
	debugencoding = "UTF-8";

	csurname = new TreeSet();
	cforeign = new TreeSet();
	cnumbers = new TreeSet();
	cnotname = new TreeSet();

	// Anything that is not explicitly TRAD gets the simplified sets, and
	// anything that is not explicitly SIMP gets the traditional ones, so
	// BOTH (and any unknown value) loads both families.
	if (charform != TRAD) {
	    loadset(cnumbers, "data/snumbers_u8.txt");
	    loadset(cforeign, "data/sforeign_u8.txt");
	    loadset(csurname, "data/ssurname_u8.txt");
	    loadset(cnotname, "data/snotname_u8.txt");
	}
	if (charform != SIMP) {
	    loadset(cnumbers, "data/tnumbers_u8.txt");
	    loadset(cforeign, "data/tforeign_u8.txt");
	    loadset(csurname, "data/tsurname_u8.txt");
	    loadset(cnotname, "data/tnotname_u8.txt");
	}

	zhwords = new TreeMap();

	if (!loadwordfile) {
	    return;
	}

	// Pick the word list the same way the character sets are picked:
	// unknown values fall back to BOTH instead of leaving the stream null.
	String lexicon;
	if (charform == SIMP) {
	    lexicon = "simplexu8.txt";
	} else if (charform == TRAD) {
	    lexicon = "tradlexu8.txt";
	} else {
	    lexicon = "bothlexu8.txt";
	}

	InputStream worddata = getClass().getResourceAsStream(lexicon);
	if (worddata == null) {
	    // getResourceAsStream returns null for a missing resource; report
	    // it instead of failing with a NullPointerException below.
	    logger.warning("Word file not found: " + lexicon);
	    return;
	}

	int count = 0;
	BufferedReader in = null;
	try {
	    in = new BufferedReader(new InputStreamReader(worddata, "UTF8"));
	    String newword;
	    while ((newword = in.readLine()) != null) {
		int wordlen = newword.length();
		// Skip blank lines (as loadset does), comment lines and
		// words of five or more characters, which are not loaded.
		if (wordlen == 0 || wordlen >= 5 || newword.indexOf("#") != -1) {
		    continue;
		}

		zhwords.put(newword.intern(), "1");

		// Register the leading fragments of 3- and 4-character words
		// with "2" unless the fragment is already known.
		for (int fraglen = 2; fraglen < wordlen; fraglen++) {
		    String fragment = newword.substring(0, fraglen).intern();
		    if (!zhwords.containsKey(fragment)) {
			zhwords.put(fragment, "2");
		    }
		}

		if (count++ % 20000 == 0) { logger.fine("" + count); }
	    }
	}
	catch (IOException e) {
	    logger.warning("IOException: "+e);
	}
	finally {
	    // Always release the resource, even when reading failed part way.
	    try {
		if (in != null) {
		    in.close();
		} else {
		    worddata.close();
		}
	    }
	    catch (IOException e) {
		logger.warning("IOException: "+e);
	    }
	}

    }

    /**
     * Loads one character-data resource into the given set.
     *
     * The resource is read as UTF-8, one entry per line.  Empty lines and
     * lines containing '#' are ignored; every other line is added to the set.
     * Problems are logged as warnings and never propagated, so a missing or
     * unreadable file simply leaves the set with whatever was read so far.
     *
     * @param targetset  set that receives the entries
     * @param sourcefile resource name, resolved relative to this class
     */
    private void loadset(TreeSet targetset, String sourcefile) {
	InputStream setdata = getClass().getResourceAsStream(sourcefile);
	if (setdata == null) {
	    // A missing resource yields null rather than an exception.
	    logger.warning("Exception loading data file " + sourcefile + " resource not found");
	    return;
	}

	BufferedReader in = null;
	try {
	    in = new BufferedReader(new InputStreamReader(setdata, "UTF-8"));
	    String dataline;
	    while ((dataline = in.readLine()) != null) {
		if (dataline.length() == 0 || dataline.indexOf("#") > -1) {
		    continue;
		}
		targetset.add(dataline.intern());
	    }
	}
	catch (IOException e) {
	    logger.warning("Exception loading data file " + sourcefile + " " + e);
	}
	finally {
	    // Close the stream on every path so a failed read does not leak it.
	    try {
		if (in != null) {
		    in.close();
		} else {
		    setdata.close();
		}
	    }
	    catch (IOException e) {
		logger.warning("Exception loading data file " + sourcefile + " " + e);
	    }
	}

    }

    /**
     * Tells whether every character of the word is in the number set.
     * An empty word yields true because no character fails the check.
     *
     * @param testword word to examine
     * @return true when all characters are contained in cnumbers
     */
    public boolean isNumber(String testword) {
	boolean allNumeric = true;
	int pos = 0;
	// Stop at the first character that is not a known number character.
	while (allNumeric && pos < testword.length()) {
	    String single = testword.substring(pos, pos + 1).intern();
	    allNumeric = cnumbers.contains(single);
	    pos++;
	}

	if (debug) {
	    try {
		logger.info(new String(testword.getBytes("UTF-8")) + " " + allNumeric);
	    }
	    catch (Exception ignored) {
		// Debug output is best-effort; a failure here must not affect the result.
	    }
	}

	return allNumeric;
    }

    /**
     * Tells whether every character of the word is in the foreign-name set.
     * An empty word yields true because no character fails the check.
     *
     * @param testword word to examine
     * @return true when all characters are contained in cforeign
     */
    public boolean isAllForeign(String testword) {
	int pos = 0;
	while (pos < testword.length()) {
	    String single = testword.substring(pos, pos + 1).intern();
	    if (!cforeign.contains(single)) {
		// One character outside the set is enough to decide.
		return false;
	    }
	    pos++;
	}
	return true;
    }

    /**
     * Tells whether the word contains no character from the
     * CJK Unified Ideographs block.  Only that single Unicode block is
     * tested; an empty word yields true.
     *
     * @param testword word to examine
     * @return false as soon as one CJK Unified Ideographs character is found
     */
    public boolean isNotCJK(String testword) {
	final Character.UnicodeBlock ideographs = Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS;
	final int total = testword.length();
	for (int pos = 0; pos < total; pos++) {
	    char candidate = testword.charAt(pos);
	    if (Character.UnicodeBlock.of(candidate) == ideographs) {
		return false;
	    }
	}
	return true;
    }

  /*
    String add_ChineseNames(String tmpline) {
       int tlen = tmpline.length();
       StringBuffer newline = new StringBuffer();
       for (int m = 0; m < tlen; m++) {
       $tchar = substr($tmpline, $m, 1);
       $currtoken = "";
	if ($tchar =~ /^\s$/) { 
	    $newline .= $tchar;
	} else {
	    $currtoken = "";
	    while ($tchar !~ /^\s$/ and $m < $tlen) {
		$currtoken .= $tchar;
		$m++;
		$tchar = substr($tmpline, $m, 1);
	    }

	    if (defined($csurname{$currtoken}) or
		defined($uncommoncsurname{$currtoken})) { # found a surname, see what follows
		# go past following spaces
		$tchar = substr($tmpline, $m, 1);
		$spaces = "";
		while ($tchar =~ /\s/ and $m < $tlen) {
		    $spaces .= $tchar;
		    $m++;
		    $tchar = substr($tmpline, $m, 1);
		}
		# Get next token
		$tchar = substr($tmpline, $m, 1);
		$currtoken2 = "";
		while ($tchar !~ /\s/ and $m < $tlen) {
		    $currtoken2 .= $tchar;
		    $m++;
		    $tchar = substr($tmpline, $m, 1);
		}
		# go past following spaces
		$tchar = substr($tmpline, $m, 1);
		$spaces2 = "";
		while ($tchar =~ /\s/ and $m < $tlen) {
		    $spaces2 .= $tchar;
		    $m++;
		    $tchar = substr($tmpline, $m, 1);
		}
		# Get next token
		$tchar = substr($tmpline, $m, 1);
		$currtoken3 = "";
		while ($tchar !~ /\s/ and $m < $tlen) {
		    $currtoken3 .= $tchar;
		    $m++;
		    $tchar = substr($tmpline, $m, 1);
		}
		if (isChinese($currtoken2) and (length($currtoken2) == 2) and 
		    (!defined($cnotname{$currtoken2})) and 
		    isChinese($currtoken3) and length($currtoken3) == 2 and
		    !defined($cnotname{$currtoken3})) 
		{
		    $newline .= $cname[0] . $currtoken . $currtoken2 . $currtoken3 . $cname[1];
		    $cwords{$currtoken . $currtoken2 . $currtoken3} = 1;
		    $cwords{$currtoken . $currtoken2} = 2;  # short version for checking
		} elsif (isChinese($currtoken2) and (length($currtoken2) == 2) 
			 and (!defined($cnotname{$currtoken2})))
		{
		    $newline .= $currtoken . $currtoken2 . $spaces2 . $currtoken3;
		    $cwords{$currtoken . $currtoken2} = 1;
		} elsif (defined($csurname{$currtoken}) and 
			 isChinese($currtoken2) and (length($currtoken2) == 4) and
			 ($cwords{$currtoken2} != 1) and
			 (!defined($cnotname{$currtoken2})))
		{
		    $newline .= $cname[0] . $currtoken . $currtoken2 . $cname[1] . $spaces2 . $currtoken3;
		    $cwords{$currtoken . $currtoken2} = 1;
		    $cwords{$currtoken . substr($currtoken2, 0, 2)} = 2; # short version to check
		} elsif (defined($uncommoncsurname{$currtoken}) and 
			 isChinese($currtoken2) and (length($currtoken2) == 4) 
			 and (!defined($cnotname{$currtoken2})) 
			 and ($cwords{$currtoken2} != 1))
		{
		    $newline .= $cname[0] . $currtoken . $currtoken2 . $cname[1] . $spaces2 . $currtoken3;
		    $cwords{$currtoken . $currtoken2} = 1;
		    $cwords{$currtoken . substr($currtoken2, 0, 2)} = 2; # short version to check
		} else {
		    $newline .= $currtoken . $spaces . $currtoken2 . $spaces2 . $currtoken3;
		}
				 
	    } else {
		$newline .= $currtoken;
	    }
	    $m--; # reset so won't skip space
	}
    }
    
    $newline;
}

     */

    /**
     * Strips one affix character from a word when what remains is a known
     * word (or when the word has exactly two characters).
     *
     * The checks run in this order and the first match wins: a leading
     * prefix character, a trailing suffix character, and finally a middle
     * infix character in a three-character word whose outer two characters
     * form a dictionary entry.  Words shorter than two characters are
     * returned unchanged, since removing an affix would leave nothing.
     *
     * @param word word to stem; must not be null
     * @return the stemmed word, or the word itself when no rule applies
     */
    public String stemWord(String word) {
	String[] prefix = new String[] {"\u7b2c", "\u526f", "\u4e0d"};
	String[] suffix = new String[] {"\u4e86", "\u7684", "\u5730", "\u4e0b", "\u4e0a", "\u4e2d", "\u91cc", 
					"\u5230", "\u5185", "\u5916", "\u4eec"};
	String[] infix  = new String[] {"\u5f97", "\u4e0d"};
	int i;

	int wordlen = word.length();
	if (wordlen < 2) {
	    // Guards the substring calls below against an empty word and
	    // keeps a lone affix character from being stemmed to "".
	    return word;
	}

	String head = word.substring(0, 1);
	String afterHead = word.substring(1);
	for (i = 0; i < prefix.length; i++) {
	    if (head.equals(prefix[i]) && (wordlen == 2 || zhwords.get(afterHead) != null)) {
		logStemmedWord("Stemmed prefix", word);
		return afterHead;
	    }
	}

	String tail = word.substring(wordlen - 1);
	String beforeTail = word.substring(0, wordlen - 1);
	for (i = 0; i < suffix.length; i++) {
	    if (tail.equals(suffix[i]) && (wordlen == 2 || zhwords.get(beforeTail) != null)) {
		logStemmedWord("Stemmed suffix", word);
		return beforeTail;
	    }
	}

	if (wordlen == 3) {
	    String middle = word.substring(1, 2);
	    String outer = head + tail;
	    for (i = 0; i < infix.length; i++) {
		if (middle.equals(infix[i]) && zhwords.get(outer) != null) {
		    logger.info("Stemmed infix");
		    return outer;
		}
	    }
	}

	return word;
    }

    /** Logs which stemming rule fired, followed by the word before stemming. */
    private void logStemmedWord(String rule, String original) {
	logger.info(rule);
	try {
	    logger.info(new String(original.getBytes(debugencoding)));
	}
	catch (Exception ignored) {
	    // Logging the word is best-effort; an encoding problem must not stop stemming.
	}
    }
    

    public String segmentLine(String cline, String separator) {
	StringBuffer currentword = new StringBuffer();
	StringBuffer outline = new StringBuffer();
	int i, clength;
	char currentchar;
	//separator = " ";

	clength = cline.length();
	int[][] offsets = new int[clength][2];

	for (i = 0; i < clength; i++) {
	    currentchar = cline.charAt(i);
	    if (Character.UnicodeBlock.of(currentchar) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS ||
		isNumber(cline.substring(i, i+1)) == true) {
		// Character in CJK block
		if (currentword.length() == 0) {  // start looking for next word
		    //logger.warning("current word length 0");
		    if (i > 0 && (Character.isWhitespace(cline.charAt(i-1)) == false)) {
			outline.append(separator); 
		    }
		    currentword.append(currentchar);
		    if (debug) {
			try {logger.info(new String(currentword.toString().getBytes(debugencoding)));} catch (Exception a) { };

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -