⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 segmenter.java

📁 segment,一个简单的中文分词程序
💻 JAVA
字号:
import java.lang.*;
import java.io.*;
import java.util.*;

/* Written by Erik Peterson
   erik AT mandarintools.com
   Last modified Jan. 13, 2004
*/

public class segmenter {
    private TreeMap zhwords;
    private TreeSet csurname, cforeign, cnumbers, cnotname;
    private String debugencoding;

    private boolean debug;

    // Char form
    public final static int TRAD = 0;
    public final static int SIMP = 1;
    public final static int BOTH = 2;

    // Charform is TRAD, SIMP or BOTH
    public segmenter(int charform, boolean loadwordfile) {
	debug = false;
	debugencoding = "UTF-8";

	int count = 0;

	//int treelevel;

	csurname = new TreeSet();
	cforeign = new TreeSet();
	cnumbers = new TreeSet();
	cnotname = new TreeSet();

	if (charform == SIMP) {
	    loadset(cnumbers, "data/snumbers_u8.txt");
	    loadset(cforeign, "data/sforeign_u8.txt");
	    loadset(csurname, "data/ssurname_u8.txt");
	    loadset(cnotname, "data/snotname_u8.txt");
	} else if (charform == TRAD) {
	    loadset(cnumbers, "data/tnumbers_u8.txt");
	    loadset(cforeign, "data/tforeign_u8.txt");
	    loadset(csurname, "data/tsurname_u8.txt");
	    loadset(cnotname, "data/tnotname_u8.txt");
	} else {  // BOTH
	    loadset(cnumbers, "data/snumbers_u8.txt");
	    loadset(cforeign, "data/sforeign_u8.txt");
	    loadset(csurname, "data/ssurname_u8.txt");
	    loadset(cnotname, "data/snotname_u8.txt");
	    loadset(cnumbers, "data/tnumbers_u8.txt");
	    loadset(cforeign, "data/tforeign_u8.txt");
	    loadset(csurname, "data/tsurname_u8.txt");
	    loadset(cnotname, "data/tnotname_u8.txt");
	}

	//zhwords = new Hashtable(120000);
	zhwords = new TreeMap();
	
	if (!loadwordfile) {
	    return;
	}

	String newword = null;
	try {
	    InputStream worddata = null;
	    if (charform == SIMP) {
		worddata = getClass().getResourceAsStream("simplexu8.txt");
	    } else if (charform == TRAD) {
		worddata = getClass().getResourceAsStream("tradlexu8.txt");
	    } else if (charform == BOTH) {
		worddata = getClass().getResourceAsStream("bothlexu8.txt");
	    }
	    BufferedReader in = new BufferedReader(new InputStreamReader(worddata, "UTF8"));
	    while ((newword = in.readLine()) != null) {
		if (newword.indexOf("#") == -1) {
		    addword(newword);
		    if (debug && count++ % 20000 == 0) { System.err.println(count); }
		}
	    } 
	    in.close();

	}
	catch (IOException e) {
	    System.err.println("IOException: "+e);
	}

    }


    /** Load a set of character data */
    private void loadset(TreeSet targetset, String sourcefile) {
	String dataline;
	try {
	    InputStream setdata = getClass().getResourceAsStream(sourcefile);
	    BufferedReader in = new BufferedReader(new InputStreamReader(setdata, "UTF-8"));
	    while ((dataline = in.readLine()) != null) {
		if ((dataline.indexOf("#") > -1) || (dataline.length() == 0)) {
		    continue;
		}
		targetset.add(dataline);
	    }
	    in.close();
	}
	catch (Exception e) {
	    System.err.println("Exception loading data file" + sourcefile + " " + e);
	}

    }

    public boolean isNumber(String testword) {
	boolean result = true;
	for (int i = 0; i < testword.length(); i++) {
	    if (cnumbers.contains(testword.substring(i, i+1)) == false) {
		result = false;
		break;
	    }
	}

	if (debug) {
	    printDebug(testword + " " + result); 
	}

	return result;
    }

    public boolean isAllForeign(String testword) {
	boolean result = true;
	for (int i = 0; i < testword.length(); i++) {
	    if (cforeign.contains(testword.substring(i, i+1)) == false) {
		result = false;
		break;
	    }
	}

	return result;
    }

    public boolean isNotCJK(String testword) {
	boolean result = true;
	for (int i = 0; i < testword.length(); i++) {
	    if (Character.UnicodeBlock.of(testword.charAt(i)) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
		result = false;
		break;
	    }
	}

	return result;
    }



    public String stemWord(String word) {
	String[] prefix = new String[] {"\u7b2c", "\u526f", "\u4e0d"};
	String[] suffix = new String[] {"\u4e86", "\u7684", "\u5730", "\u4e0b", "\u4e0a", "\u4e2d", "\u91cc", 
					"\u5230", "\u5185", "\u5916", "\u4eec"};
	String[] infix  = new String[] {"\u5f97", "\u4e0d"};
	int i;
	
	StringBuffer unstemmed = new StringBuffer(word);

	for (i = 0; i < prefix.length; i++) {
	    if (unstemmed.substring(0, 1).equals(prefix[i]) == true && 
		(zhwords.get(unstemmed.substring(1, unstemmed.length())) != null ||
		 unstemmed.length() == 2)) {
		//System.out.println("Stemmed prefix");
		//try {System.out.println(new String(unstemmed.toString().getBytes(debugencoding)));} catch (Exception a) { };
		unstemmed.deleteCharAt(0);
		return unstemmed.toString();
	    }
	}


	for (i = 0; i < suffix.length; i++) {
	    if (unstemmed.substring(unstemmed.length()-1, unstemmed.length()).equals(suffix[i]) == true && 
		(zhwords.get(unstemmed.substring(0, unstemmed.length()-1)) != null ||
		 unstemmed.length() == 2)) {
		System.out.println("Stemmed suffix");
		try {System.out.println(new String(unstemmed.toString().getBytes(debugencoding)));} catch (Exception a) { };
		unstemmed.deleteCharAt(unstemmed.length()-1);
		return unstemmed.toString();
	    }
	}
    
	for (i = 0; i < infix.length; i++) {
	    if (unstemmed.length() == 3 && unstemmed.substring(1, 2).equals(infix[i]) == true &&
		zhwords.get(new String(unstemmed.substring(0, 1) + unstemmed.substring(2, 3))) != null) {
		System.out.println("Stemmed infix");
		unstemmed.deleteCharAt(1);
		return unstemmed.toString();
	    }
	}

	return unstemmed.toString();
    }



    // Takes a Chinese string, returns string with separator inserted between each Chinese word
    public String segmentLine(String cline, String separator) {
	int[] boundaries = segmentLineOffsets(cline);
	StringBuffer clinebuffer = new StringBuffer(cline);
	int i, seplen = separator.length();

	if (boundaries.length == 0) { return cline; }

	for (i = boundaries.length-2; i >= 0; i--) {
	    if (boundaries[i] > 0 &&
		i+boundaries[i] != cline.length() &&
		cline.substring(i, i+seplen).equals(separator) == false &&
		cline.substring(i+boundaries[i], i+boundaries[i]+seplen).equals(separator) == false) {
		clinebuffer.insert(i+boundaries[i], separator);
	    }
	}
	return clinebuffer.toString();
    }


    public LinkedList segmentLine(String cline) {
	int[] boundaries = segmentLineOffsets(cline);
	LinkedList offsets = new LinkedList();
	int i;
	
	for (i = 0; i < boundaries.length; i++) {
	    if (boundaries[i] > 0) {
		offsets.add(new Integer(i));
	    }
	}

	return offsets;
    }




    public int[] segmentLineOffsets(String cline) {
	int i, j, tmpoffset;
	int clength = cline.length();
	int[] offsets = new int[clength];

	if (debug) System.out.println("Line length " + clength);

	// Handle Chinese & non-Chinese text; Group spaces, letters, punctuation, numbers
	if (debug) System.out.println("Grouping Chinese, letters, digits and spaces");
	i = 0;
	while (i < clength) {
	    if (debug) System.out.println("i " + i);
	    
	    if (Character.UnicodeBlock.of(cline.charAt(i)) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
		j = 8;
		if (i+j > clength) { j = clength - i; }
		for (; i+j <= clength && j > 1; j--) {
		    if (zhwords.containsKey(cline.substring(i, i+j))) {
			break;
		    }
		}
		offsets[i] = j;
		i += j;
		
	    } else if (Character.isWhitespace(cline.charAt(i))) {
		j=1;
		while (i+j < clength && Character.isWhitespace(cline.charAt(i+j))) {
		    j++;
		}
		offsets[i] = j;
		i += j;
	    } else if (Character.isLetter(cline.charAt(i))) {
		j=1;
		while (i+j < clength && Character.isLetter(cline.charAt(i+j))) {
		    j++;
		}
		offsets[i] = j;
		i += j;
	    } else if (Character.isDigit(cline.charAt(i))) {
		j=1;
		while (i+j < clength && Character.isDigit(cline.charAt(i+j))) {
		    j++;
		}
		offsets[i] = j;
		i += j;

	    } else {
		offsets[i] = 1;
		i++;
		
	    }
	    
	}
	
	if (debug) System.out.println("Grouping foreign transliterations");
	// Add in foreign transliterations
	i = 0;
	while (i < clength) {
	    if (offsets[i] > 0 ) {
		// Possibly a transliteration of a foreign name
		while (i+offsets[i] < clength &&
		       i+offsets[i]+offsets[i+offsets[i]] < clength &&
		       isAllForeign(cline.substring(i, i+offsets[i]+offsets[i+offsets[i]])) 
		       ) {
		    tmpoffset = offsets[i+offsets[i]];
		    offsets[i+offsets[i]] = 0;
		    offsets[i] = offsets[i] + tmpoffset;
		}
	    }
	    i++;
	}
	
	if (debug) System.out.println("Grouping numbers");
	// Concatenate numbers
	i = 0;
	while (i < clength) {
	    if (offsets[i] > 0)   {
		// Add in numbers
		while (i+offsets[i] < clength && 
		       i+offsets[i]+offsets[i+offsets[i]] < clength &&
		       isNumber(cline.substring(i, i+offsets[i]+offsets[i+offsets[i]]))) {
		    tmpoffset = offsets[i+offsets[i]];
		    offsets[i+offsets[i]] = 0;
		    offsets[i] = offsets[i] + tmpoffset;
		}
	    }
	    i++;
	}

	/*
	if (debug) System.out.println("Grouping Chinese names");
	// Group possible Chinese names together
	i = 0;
	while (i < clength) {
	    if (offsets[i] == 1 &&
		dictdata.isChineseSurname(cline.substring(i, i+offsets[i]))) {
		// Check for two syllable given name
		if (i+offsets[i] < clength && offsets[i+offsets[i]] == 1 &&
		    dictdata.isChinese(cline.substring(i, i+offsets[i])) &&
		    i+offsets[i]+offsets[i+offsets[i]] < clength && 
		    offsets[i+offsets[i] + offsets[i+offsets[i]]] == 1 &&
		    dictdata.isChinese(cline.substring(i+offsets[i], i+offsets[i]+offsets[i+offsets[i]]))) {

		    offsets[i+offsets[i]] = 0;
		    offsets[i+offsets[i] + offsets[i+offsets[i]]] = 0;
		    offsets[i] = 3;
		    
		}
		// Check for one syllable given name
		else if (i+offsets[i] < clength && offsets[i+offsets[i]] == 1 &&
			 dictdata.isChinese(cline.substring(i, i+offsets[i]))) {

		    offsets[i+offsets[i]] = 0;
		    offsets[i] = 2;
		}

	    }
	    i++;
	}
	*/

	

	return offsets;
    }


    

    public void addword(String newword) {
	int i;
	zhwords.put(newword, "1");
    }


    public void segmentFile(String inputfile, String encoding) {
	byte[] gbbytes;
	String outfile = inputfile + ".seg";
	String segstring;
	boolean debug = false;

	try {
	    String dataline;
	    InputStream srcdata = new FileInputStream(inputfile);
	    BufferedReader in = new BufferedReader(new InputStreamReader(srcdata, encoding));
	    BufferedWriter outbuffer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outfile), encoding));

	    
	    while ((dataline = in.readLine()) != null) {
		segstring = segmentLine(dataline, " ");
		if (debug) {
		    gbbytes = segstring.getBytes(encoding);
		    System.err.println("Output: " + new String(gbbytes));
		} 
		outbuffer.write(segstring);
		outbuffer.newLine();
	    }

	    in.close();
	    outbuffer.close();
	}
	catch (Exception e) {
	    System.err.println("Exception " + e.toString());
	}

    }

    public void printDebug(String debuginfo) {
	try {
	    System.out.println(new String(debuginfo.getBytes(debugencoding)));
	} 
	catch (Exception a) { a.printStackTrace(); }
    }


    public static void printHelp() {
	System.out.println("Usage:\njava -jar segmenter.jar [-b|-g|-8|-s|-t] inputfile.txt");
	System.out.println("\t-b Big5, -g GB2312, -8 UTF-8, -s simp. chars, -t trad. chars");
	System.out.println("  Segmented text will be saved to inputfile.txt.seg");
	System.exit(0);
    }

    public static void main(String[] argv) {
	Vector inputfiles = new Vector() ;
	String encoding = "BIG5";
	int charform = segmenter.TRAD;
	boolean debug = false;
	int i, j;

	for (i = 0; i < argv.length; i++) {
	    if (argv[i].equals("-b")) {
		if (debug) System.out.println("Setting to Big5, TRAD");
		encoding = "BIG5";
		charform = segmenter.TRAD;
	    } else if (argv[i].equals("-g")) {
		if (debug) System.out.println("Setting to GB, SIMP");
		encoding = "GBK";
		charform = segmenter.SIMP;
	    } else if (argv[i].equals("-8")) {
		encoding = "UTF8";
		charform = segmenter.BOTH;
	    } else if (argv[i].equals("-s")) {
		if (debug) System.out.println("Setting to UTF-8 SIMP");
		encoding = "UTF8";
		charform = segmenter.SIMP;
	    } else if (argv[i].equals("-t")) {
		if (debug) System.out.println("Setting to UTF-8 TRAD");
		encoding = "UTF8";
		charform = segmenter.TRAD;
	    } else if (argv[i].equals("-h")) {
		printHelp();
	    } else if (argv[i].equals("-d")) {
		debug = true;
	    } else {
		inputfiles.add(argv[i]);
	    }
	}

	if (inputfiles.size() == 0) {
	    System.out.println("ERROR: Please specify name of Chinese text file to segment.\n");
	    printHelp();
	} 


	System.err.println("Loading segmenter word list.  One moment please.");
	segmenter mainsegmenter = new segmenter(charform, true);

	System.err.println("Total keys " + mainsegmenter.zhwords.size());

	File tmpfile;
	String dirfiles[];
	for (i = 0; i < inputfiles.size(); i++) {
	    tmpfile = new File((String)inputfiles.get(i));
	    if (tmpfile.exists() == false) {
		System.out.println("ERROR: Source file " + (String)inputfiles.get(i) + 
				   " does not exist.\n");
		continue;
	    }
	    if (tmpfile.isDirectory() == true) {
		dirfiles = tmpfile.list();
		if (dirfiles != null) {
		    for (j = 0; j < dirfiles.length; j++) {
			inputfiles.add((String)inputfiles.get(i) + File.separator +
				       dirfiles[j]);
		    }
		}
		continue;
	    }
	    System.err.println("Segmenting " + inputfiles.get(i) + 
			       " with encoding " + encoding);
	    mainsegmenter.segmentFile((String)inputfiles.get(i), encoding);
	}
    }

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -