📄 segmenter.java

📁 1、锁定某个主题抓取； 2、能够产生日志文本文件
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
import java.lang.*;
import java.io.*;
import java.util.*;
import java.lang.Math.*;

/* Integrated by @Author Kelven.JU
   erik AT mandarintools.com
   Last modified Jan. 13, 2004
*/

public class segmenter {
    //private Hashtable zhwords;
    // Lexicon map filled by the constructor: a key maps to "1" when it is a
    // word read from the lexicon file and to "2" when it was only registered
    // as a leading prefix of a longer lexicon word.
    public TreeMap zhwords;
    // Character sets loaded by the constructor from the data/*surname,
    // *foreign, *numbers and *notname resource files.
    public TreeSet csurname, cforeign, cnumbers, cnotname;
    // Charset name used when re-encoding strings for debug output; the
    // constructor sets it to "UTF-8".
    public String debugencoding;

    // Term and term-frequency statistics.
    // wordSum and wordCount are read in parallel by index: wordCount.get(k)
    // is cast to Integer and treated as the count for wordSum.get(k).
    public ArrayList wordSum ;   
    public ArrayList wordCount;
    // NOTE(review): presumably the corpus-wide counterparts of wordSum and
    // wordCount; only their setters are visible here -- confirm.
    public ArrayList wordAll;
    public ArrayList wordCountAll;
    // Records the maximum term frequency (used when computing the TF value).
    public int wordCountMax=-1;
    // Value stored by setWordMaxCountOfOneDocument.
    public Integer wordCountMaxInteger;

    // When true, isNumber echoes its argument and result to System.out.
    public boolean debug;

    // Character form selectors accepted by the constructor.
    public final static int TRAD = 0;
    public final static int SIMP = 1;
    public final static int BOTH = 2;

    /**
     * Creates a segmenter for the given character form.
     *
     * <p>The number, foreign-character, surname and not-a-name sets are always
     * loaded through {@link #loadset(TreeSet, String)}. When
     * {@code loadwordfile} is true the lexicon resource for the character form
     * is read as well: every line that contains no {@code '#'} and is shorter
     * than five characters is stored in {@code zhwords} with the value
     * {@code "1"}, and each of its leading prefixes of two or more characters
     * that is not already a key is stored with the value {@code "2"}.
     *
     * <p>A missing lexicon resource or a read failure is reported on
     * {@code System.err} and leaves {@code zhwords} with whatever was loaded
     * up to that point; the constructor does not throw for these conditions.
     *
     * @param charform     {@link #TRAD}, {@link #SIMP} or {@link #BOTH}; any
     *                     other value is treated like {@code BOTH}
     * @param loadwordfile whether to read the lexicon file into {@code zhwords}
     */
    public segmenter(int charform, boolean loadwordfile) {
        debug = false;
        debugencoding = "UTF-8";

        csurname = new TreeSet();
        cforeign = new TreeSet();
        cnumbers = new TreeSet();
        cnotname = new TreeSet();

        // Simplified data is loaded for SIMP and BOTH, traditional data for
        // TRAD and BOTH; simplified files come first, as before.
        if (charform != TRAD) {
            loadset(cnumbers, "data/snumbers_u8.txt");
            loadset(cforeign, "data/sforeign_u8.txt");
            loadset(csurname, "data/ssurname_u8.txt");
            loadset(cnotname, "data/snotname_u8.txt");
        }
        if (charform != SIMP) {
            loadset(cnumbers, "data/tnumbers_u8.txt");
            loadset(cforeign, "data/tforeign_u8.txt");
            loadset(csurname, "data/tsurname_u8.txt");
            loadset(cnotname, "data/tnotname_u8.txt");
        }

        zhwords = new TreeMap();

        if (!loadwordfile) {
            return;
        }

        // Unknown charform values fall back to the combined lexicon, matching
        // the character-set selection above.
        String lexicon;
        if (charform == SIMP) {
            lexicon = "simplexu8.txt";
        } else if (charform == TRAD) {
            lexicon = "tradlexu8.txt";
        } else {
            lexicon = "bothlexu8.txt";
        }

        // getResourceAsStream returns null for a missing resource; report it
        // instead of failing with a NullPointerException in the reader.
        InputStream worddata = getClass().getResourceAsStream(lexicon);
        if (worddata == null) {
            System.err.println("Lexicon resource not found: " + lexicon);
            return;
        }

        BufferedReader in = null;
        try {
            in = new BufferedReader(new InputStreamReader(worddata, "UTF8"));
            int count = 0;
            String newword;
            while ((newword = in.readLine()) != null) {
                // Skip comment lines and words of five or more characters.
                if (newword.indexOf("#") != -1 || newword.length() >= 5) {
                    continue;
                }

                zhwords.put(newword.intern(), "1");

                // Register proper prefixes (length 2 .. length-1) so a lookup
                // can tell that a longer word may follow. Existing entries are
                // kept so a real word is never downgraded to a prefix marker.
                for (int end = 2; end < newword.length(); end++) {
                    String prefix = newword.substring(0, end).intern();
                    if (!zhwords.containsKey(prefix)) {
                        zhwords.put(prefix, "2");
                    }
                }

                // Progress indicator on stderr every 20000 accepted words.
                if (count++ % 20000 == 0) {
                    System.err.println(count);
                }
            }
        } catch (IOException e) {
            System.err.println("IOException: " + e);
        } finally {
            // Always release the underlying stream, even after a read error.
            try {
                if (in != null) {
                    in.close();
                } else {
                    worddata.close();
                }
            } catch (IOException e) {
                System.err.println("IOException: " + e);
            }
        }
    }
	
   /**
    * Prints the word statistics of the current document to {@code System.out}.
    *
    * <p>One line is written per entry of {@code wordSum}: the entry itself,
    * the separator {@code " : "}, and the integer value of the entry at the
    * same index in {@code wordCount}.
    */
   public void outputWorkCount() {
       int position = 0;
       while (position < wordSum.size()) {
           System.out.print(wordSum.get(position));
           Integer boxedCount = (Integer) wordCount.get(position);
           int occurrences = boxedCount.intValue();
           System.out.println(" : " + occurrences);
           position++;
       }
   }


   /**
    * Computes the TF value of every word of the current document and writes
    * the values to a text file.
    *
    * <p>The TF value of entry {@code k} is {@code wordCount.get(k)} divided by
    * {@code wordCountMax}, both taken as {@code double}. For each entry a line
    * of the form {@code "k : tf"} is appended to the file {@code args + ".tf"}.
    * A directory named {@code tf} is also created in the working directory;
    * the output file path itself is derived from {@code args} only.
    *
    * <p>I/O failures are reported on {@code System.out} and do not interrupt
    * the computation: the returned list always holds one value per entry of
    * {@code wordSum}. The writer is closed in all cases, including when
    * {@code wordSum} is empty.
    *
    * @param args path prefix of the output file; {@code ".tf"} is appended
    * @return list of {@code Double} TF values, in {@code wordSum} order
    */
   public ArrayList getTfValue(String args) {
       ArrayList tfValues = new ArrayList(3);
       new File("tf").mkdir();

       BufferedWriter fileOut = null;
       try {
           fileOut = new BufferedWriter(new FileWriter(args + ".tf"));

           for (int k = 0; k < wordSum.size(); k++) {
               double tf = ((double) ((Integer) wordCount.get(k)).intValue())
                       / (double) wordCountMax;
               tfValues.add(Double.valueOf(tf));

               // A failed write is reported per line so the remaining TF
               // values are still computed and returned.
               try {
                   fileOut.newLine();
                   fileOut.write(k + " : " + tf);
                   fileOut.flush();
               } catch (IOException e) {
                   System.out.println("Function getTfValue [segmenter.java 166] IO error!");
               }
           }
       } catch (IOException e) {
           System.out.println("Function getTfValue [segmenter.java 159] IO error!");
       } finally {
           // Close unconditionally; previously the writer was only closed
           // after the last entry had been written successfully.
           if (fileOut != null) {
               try {
                   fileOut.close();
               } catch (IOException e) {
                   System.out.println("Function getTfValue [segmenter.java 166] IO error!");
               }
           }
       }
       return tfValues;
   }

   /**
    * Tells whether the word list of the current document contains a word.
    *
    * <p>Each entry of {@code wordSum} is compared with {@code args} through
    * the entry's own {@code equals} method.
    *
    * @param args the word to look for
    * @return {@code true} if some entry of {@code wordSum} equals {@code args}
    */
   public boolean containsWord(String args) {
       Iterator entries = wordSum.iterator();
       while (entries.hasNext()) {
           Object candidate = entries.next();
           if (candidate.equals(args)) {
               return true;
           }
       }
       return false;
   }

   /**
    * Sets the word list of the current document.
    *
    * @param args list stored in {@code wordSum}; the reference is kept as
    *             given, no copy is made
    */
   public void setWordOfOneDocument(ArrayList args) {
       this.wordSum = args;
   }

   /**
    * Sets the word-count list of the current document.
    *
    * @param args list stored in {@code wordCount}; the reference is kept as
    *             given, no copy is made
    */
   public void setWordCountOfOneDocument(ArrayList args) {
       this.wordCount = args;
   }

   /**
    * Sets the boxed maximum word count of the current document.
    *
    * <p>NOTE(review): only {@code wordCountMaxInteger} is assigned here; the
    * primitive {@code wordCountMax} field is left untouched by this method --
    * confirm that it is updated elsewhere.
    *
    * @param args value stored in {@code wordCountMaxInteger}
    */
   public void setWordMaxCountOfOneDocument(Integer args) {
       this.wordCountMaxInteger = args;
   }

   /**
    * Sets the {@code wordAll} list.
    *
    * @param args list stored in {@code wordAll}; the reference is kept as
    *             given, no copy is made
    */
   public void setWordOfAllDocument(ArrayList args) {
       this.wordAll = args;
   }

   /**
    * Sets the {@code wordCountAll} list.
    *
    * @param args list stored in {@code wordCountAll}; the reference is kept
    *             as given, no copy is made
    */
   public void setWordCountOfAllDocument(ArrayList args) {
       this.wordCountAll = args;
   }

   /**
    * Returns the current value of the {@code wordCountMax} field.
    *
    * @return the value of {@code wordCountMax}, which is initialised to -1
    */
   public int getWordCountMaxOfOneDocument() {
       final int currentMax = this.wordCountMax;
       return currentMax;
   }

    /**
     * Loads a set of character data from a class-relative resource.
     *
     * <p>The resource is read as UTF-8, one entry per line. Empty lines and
     * lines containing {@code '#'} are skipped; every other line is interned
     * and added to {@code targetset}.
     *
     * <p>Loading is best-effort: a missing resource or any failure while
     * reading is reported on {@code System.err} and the method returns
     * normally, leaving in {@code targetset} whatever was added so far. The
     * reader is always closed.
     *
     * @param targetset  set that receives the entries
     * @param sourcefile resource name, resolved through
     *                   {@code getClass().getResourceAsStream}
     */
    public void loadset(TreeSet targetset, String sourcefile) {
        InputStream setdata = null;
        BufferedReader in = null;
        try {
            setdata = getClass().getResourceAsStream(sourcefile);
            if (setdata == null) {
                // A missing resource yields null, not an exception.
                System.err.println("Exception loading data file " + sourcefile
                        + " (resource not found)");
                return;
            }
            in = new BufferedReader(new InputStreamReader(setdata, "UTF-8"));

            String dataline;
            while ((dataline = in.readLine()) != null) {
                boolean isComment = dataline.indexOf("#") > -1;
                if (isComment || dataline.length() == 0) {
                    continue;
                }
                targetset.add(dataline.intern());
            }
        } catch (Exception e) {
            // Kept broad on purpose: callers rely on this method never throwing.
            System.err.println("Exception loading data file " + sourcefile + " " + e);
        } finally {
            try {
                if (in != null) {
                    in.close();
                } else if (setdata != null) {
                    setdata.close();
                }
            } catch (IOException ignored) {
                // Nothing useful can be done if closing a read-only stream fails.
            }
        }
    }

    /**
     * Tells whether every character of a word belongs to the number set.
     *
     * <p>Each single-character substring of {@code testword} is looked up in
     * {@code cnumbers}; scanning stops at the first miss. An empty word yields
     * {@code true}. When {@code debug} is set, the word and the result are
     * echoed to {@code System.out}.
     *
     * @param testword the word to test
     * @return {@code true} if all characters are contained in {@code cnumbers}
     */
    public boolean isNumber(String testword) {
        boolean allNumeric = true;
        int pos = 0;
        while (allNumeric && pos < testword.length()) {
            String single = testword.substring(pos, pos + 1).intern();
            allNumeric = cnumbers.contains(single);
            pos++;
        }

        if (debug) {
            try {
                String echoed = new String(testword.getBytes("UTF-8"));
                System.out.println(echoed + " " + allNumeric);
            } catch (Exception a) {
                // Debug output is best-effort; failures are ignored.
            }
        }

        return allNumeric;
    }

    /**
     * Tells whether every character of a word belongs to the foreign set.
     *
     * <p>Each single-character substring of {@code testword} is looked up in
     * {@code cforeign}. An empty word yields {@code true}.
     *
     * @param testword the word to test
     * @return {@code true} if all characters are contained in {@code cforeign}
     */
    public boolean isAllForeign(String testword) {
        for (int pos = 0; pos < testword.length(); pos++) {
            String single = testword.substring(pos, pos + 1).intern();
            if (!cforeign.contains(single)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tells whether a word is free of CJK unified ideographs.
     *
     * <p>Only the {@code CJK_UNIFIED_IDEOGRAPHS} Unicode block is checked. An
     * empty word yields {@code true}.
     *
     * @param testword the word to test
     * @return {@code false} as soon as one character lies in the
     *         {@code CJK_UNIFIED_IDEOGRAPHS} block, {@code true} otherwise
     */
    public boolean isNotCJK(String testword) {
        int pos = 0;
        while (pos < testword.length()) {
            Character.UnicodeBlock block = Character.UnicodeBlock.of(testword.charAt(pos));
            if (block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
                return false;
            }
            pos++;
        }
        return true;
    }

  /*
    String add_ChineseNames(String tmpline) {
       int tlen = tmpline.length();
       StringBuffer newline = new StringBuffer();
       for (int m = 0; m < tlen; m++) {
       $tchar = substr($tmpline, $m, 1);
       $currtoken = "";
	if ($tchar =~ /^\s$/) { 
	    $newline .= $tchar;
	} else {
	    $currtoken = "";
	    while ($tchar !~ /^\s$/ and $m < $tlen) {
		$currtoken .= $tchar;
		$m++;
		$tchar = substr($tmpline, $m, 1);
	    }

	    if (defined($csurname{$currtoken}) or
		defined($uncommoncsurname{$currtoken})) { # found a surname, see what follows
		# go past following spaces
		$tchar = substr($tmpline, $m, 1);
		$spaces = "";
		while ($tchar =~ /\s/ and $m < $tlen) {
		    $spaces .= $tchar;
		    $m++;
		    $tchar = substr($tmpline, $m, 1);
		}
		# Get next token
		$tchar = substr($tmpline, $m, 1);
		$currtoken2 = "";
		while ($tchar !~ /\s/ and $m < $tlen) {
		    $currtoken2 .= $tchar;
		    $m++;
		    $tchar = substr($tmpline, $m, 1);
		}
		# go past following spaces
		$tchar = substr($tmpline, $m, 1);
		$spaces2 = "";
		while ($tchar =~ /\s/ and $m < $tlen) {
		    $spaces2 .= $tchar;
		    $m++;
		    $tchar = substr($tmpline, $m, 1);
		}
		# Get next token
		$tchar = substr($tmpline, $m, 1);
		$currtoken3 = "";
		while ($tchar !~ /\s/ and $m < $tlen) {
		    $currtoken3 .= $tchar;
		    $m++;
		    $tchar = substr($tmpline, $m, 1);
		}
		if (isChinese($currtoken2) and (length($currtoken2) == 2) and 
		    (!defined($cnotname{$currtoken2})) and 
		    isChinese($currtoken3) and length($currtoken3) == 2 and
		    !defined($cnotname{$currtoken3})) 
		{
		    $newline .= $cname[0] . $currtoken . $currtoken2 . $currtoken3 . $cname[1];
		    $cwords{$currtoken . $currtoken2 . $currtoken3} = 1;
		    $cwords{$currtoken . $currtoken2} = 2;  # short version for checking
		} elsif (isChinese($currtoken2) and (length($currtoken2) == 2) 
			 and (!defined($cnotname{$currtoken2})))
		{
		    $newline .= $currtoken . $currtoken2 . $spaces2 . $currtoken3;
		    $cwords{$currtoken . $currtoken2} = 1;
		} elsif (defined($csurname{$currtoken}) and 
			 isChinese($currtoken2) and (length($currtoken2) == 4) and
			 ($cwords{$currtoken2} != 1) and
			 (!defined($cnotname{$currtoken2})))
		{
		    $newline .= $cname[0] . $currtoken . $currtoken2 . $cname[1] . $spaces2 . $currtoken3;
		    $cwords{$currtoken . $currtoken2} = 1;
		    $cwords{$currtoken . substr($currtoken2, 0, 2)} = 2; # short version to check
		} elsif (defined($uncommoncsurname{$currtoken}) and 
			 isChinese($currtoken2) and (length($currtoken2) == 4) 
			 and (!defined($cnotname{$currtoken2})) 
			 and ($cwords{$currtoken2} != 1))
		{
		    $newline .= $cname[0] . $currtoken . $currtoken2 . $cname[1] . $spaces2 . $currtoken3;
		    $cwords{$currtoken . $currtoken2} = 1;
		    $cwords{$currtoken . substr($currtoken2, 0, 2)} = 2; # short version to check
		} else {
		    $newline .= $currtoken . $spaces . $currtoken2 . $spaces2 . $currtoken3;
		}
				 
	    } else {
		$newline .= $currtoken;
	    }
	    $m--; # reset so won't skip space
	}
    }
    
    $newline;
}

     */

    public String stemWord(String word) {
	String[] prefix = new String[] {"\u7b2c", "\u526f", "\u4e0d"};
	String[] suffix = new String[] {"\u4e86", "\u7684", "\u5730", "\u4e0b", "\u4e0a", "\u4e2d", "\u91cc", 
					"\u5230", "\u5185", "\u5916", "\u4eec"};
	String[] infix  = new String[] {"\u5f97", "\u4e0d"};
	int i;
	
	StringBuffer unstemmed = new StringBuffer(word);

	for (i = 0; i < prefix.length; i++) {
	    if (unstemmed.substring(0, 1).equals(prefix[i]) == true && 
		(zhwords.get(unstemmed.substring(1, unstemmed.length()).intern()) != null ||
		 unstemmed.length() == 2)) {
		System.out.println("Stemmed prefix");
		try {System.out.println(new String(unstemmed.toString().getBytes(debugencoding)));} catch (Exception a) { };
		unstemmed.deleteCharAt(0);
		return unstemmed.toString();
	    }
	}


	for (i = 0; i < suffix.length; i++) {
	    if (unstemmed.substring(unstemmed.length()-1, unstemmed.length()).equals(suffix[i]) == true && 
		(zhwords.get(unstemmed.substring(0, unstemmed.length()-1).intern()) != null ||
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -