⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 estimate.java.svn-base

📁 又一个中文分词组件
💻 SVN-BASE
字号:
package net.paoding.analysis.analyzer.estimate;

import java.io.IOException;
import java.io.PrintStream;
import java.io.Reader;
import java.io.StringReader;
import java.util.Iterator;
import java.util.LinkedList;

import net.paoding.analysis.analyzer.PaodingTokenizer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

/**
 * Small diagnostic harness that runs an {@link Analyzer} over some input,
 * prints a selectable part of the produced tokens (ten per line) and then a
 * short summary: analyzer class name, input length, token count and elapsed
 * time in milliseconds.
 *
 * <p>Which output lines are printed is controlled by the "print" spec, see
 * {@link #setPrint(String)}.
 */
public class Estimate {
	/**
	 * Number of tokens rendered on one output line. The same value converts
	 * the line numbers of a print spec into token indexes, so both uses must
	 * stay in sync.
	 */
	private static final int TOKENS_PER_LINE = 10;

	private Analyzer analyzer;
	private String print;
	private PrintGate printGate;

	/**
	 * Creates an instance without an analyzer; one must be supplied through
	 * {@link #setAnalyzer(Analyzer)} before calling any {@code test} method.
	 */
	public Estimate() {
		this.setPrint("50");// by default print only the first 50 lines of tokens
	}

	/**
	 * Creates an instance that uses the given analyzer.
	 *
	 * @param analyzer the analyzer to evaluate
	 */
	public Estimate(Analyzer analyzer) {
		setAnalyzer(analyzer);
		this.setPrint("50");// by default print only the first 50 lines of tokens
	}

	public void setAnalyzer(Analyzer analyzer) {
		this.analyzer = analyzer;
	}

	public Analyzer getAnalyzer() {
		return analyzer;
	}

	/**
	 * Selects which output lines (of {@value #TOKENS_PER_LINE} tokens each)
	 * are printed by the {@code test} methods.
	 *
	 * <p>{@code null}, the empty string, {@code "null"} and {@code "no"}
	 * (case-insensitive) disable token printing. Any other value is a
	 * comma-separated list whose items are either a single number {@code n}
	 * (lines 1 to n) or a range {@code a-b} (lines a to b, inclusive).
	 *
	 * @param print the print spec
	 * @throws NumberFormatException if an item is not numeric; in that case
	 *         the previously configured spec stays in effect
	 */
	public void setPrint(String print) {
		if (print == null || print.length() == 0 || print.equalsIgnoreCase("null") || print.equalsIgnoreCase("no")) {
			printGate = null;
			this.print = null;
			return;
		}
		// Parse into a local gate first: if parsing throws, the fields keep
		// their previous, consistent values.
		PrintGate gate = new LinePrintGate();
		gate.setPrint(print, TOKENS_PER_LINE);
		printGate = gate;
		this.print = print;
	}

	public String getPrint() {
		return print;
	}

	/**
	 * Analyzes {@code input} and writes the report to {@code System.out}.
	 *
	 * @param input the text to analyze
	 */
	public void test(String input) {
		this.test(System.out, input);
	}

	/**
	 * Analyzes {@code input} and writes the report to {@code out}.
	 *
	 * @param out   destination of the report
	 * @param input the text to analyze
	 */
	public void test(PrintStream out, String input) {
		Reader reader = new StringReaderEx(input);
		this.test(out, reader);
	}

	/**
	 * Analyzes everything readable from {@code reader} and writes the selected
	 * tokens followed by the summary to {@code out}. The measured time spans
	 * the creation of the token stream and the consumption of all tokens.
	 *
	 * <p>An {@link IOException} raised while tokenizing is caught and its
	 * stack trace printed. The reader is closed in every case.
	 *
	 * @param out    destination of the report; {@code System.out} is used
	 *               when this is {@code null}
	 * @param reader source of the text to analyze
	 */
	public void test(PrintStream out, Reader reader) {
		// Every line of the report goes to one stream, so that a caller
		// supplied stream receives the summary as well as the tokens.
		PrintStream target = (out != null) ? out : System.out;
		try {
			long begin = System.currentTimeMillis();
			TokenStream ts = analyzer.tokenStream("", reader);
			LinkedList selected = new LinkedList();
			int wordsCount = 0;
			Token token;
			while ((token = ts.next()) != null) {
				// Only tokens that pass the gate are retained for printing;
				// all tokens are counted.
				if (printGate != null && printGate.filter(wordsCount)) {
					selected.add(new CToken(token, wordsCount));
				}
				wordsCount++;
			}
			long end = System.currentTimeMillis();

			int lastPrinted = printTokens(target, selected);

			if (wordsCount == 0) {
				target.println("\tAll are noise characters or words");
			} else {
				// NOTE(review): condition kept exactly as in the original; it
				// skips this line break only when the index of the last
				// printed token is 1 modulo the line width - confirm intent.
				if (lastPrinted % TOKENS_PER_LINE != 1) {
					target.println();
				}
				String inputLength = "<未知>";
				if (reader instanceof StringReaderEx) {
					inputLength = "" + ((StringReaderEx) reader).inputLength;
				}
				else if (ts instanceof PaodingTokenizer) {
					inputLength = "" + ((PaodingTokenizer) ts).getInputLength();
				}
				target.println();
				target.println("\t分词器" + analyzer.getClass().getName());
				target.println("\t内容长度 " + inputLength + "字符, 分 " + wordsCount
						+ "个词");
				target.println("\t分词耗时 " + (end - begin) + "ms ");
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
		finally {
			if (reader != null) {
				try {
					reader.close();
				} catch (IOException ignored) {
					// Best effort: the report is already written, a failure
					// to close the input must not turn into an error here.
				}
			}
		}
	}

	/**
	 * Writes the retained tokens, each followed by "/", starting a new line
	 * labelled with its 1-based line number whenever a token index is a
	 * multiple of {@link #TOKENS_PER_LINE}.
	 *
	 * @return the index of the last token written, or 0 if there was none
	 */
	private static int printTokens(PrintStream out, LinkedList tokens) {
		int index = 0;
		for (Iterator iter = tokens.iterator(); iter.hasNext();) {
			CToken ctoken = (CToken) iter.next();
			index = ctoken.i;
			if (index % TOKENS_PER_LINE == 0) {
				if (index != 0) {
					out.println();
				}
				out.print((index / TOKENS_PER_LINE + 1) + ":\t");
			}
			out.print(ctoken.t.termText() + "/");
		}
		return index;
	}

	//-------------------------------------------

	/** A token together with its 0-based position in the token stream. */
	static class CToken {
		Token t;
		int i;

		CToken(Token t, int i) {
			this.t = t;
			this.i = i;
		}
	}

	/** Decides, by token index, whether a token is to be printed. */
	static interface PrintGate {
		/**
		 * Configures the gate.
		 *
		 * @param print    the spec to parse
		 * @param unitSize number of tokens that make up one unit (line) of the spec
		 */
		public void setPrint(String print, int unitSize);

		/**
		 * @param count 0-based token index
		 * @return {@code true} if the token at that index passes the gate
		 */
		boolean filter(int count);
	}

	/**
	 * Gate for one spec item: passes the token indexes in the half-open
	 * interval {@code [begin, end)}.
	 */
	static class PrintGateToken implements PrintGate {
		private int begin;
		private int end;
		public void setBegin(int begin) {
			this.begin = begin;
		}
		public void setEnd(int end) {
			this.end = end;
		}

		/**
		 * Parses either {@code "n"} (units 1 to n) or {@code "a-b"} (units a
		 * to b). Surrounding whitespace is ignored and the sign of each number
		 * is dropped. A '-' in first position is read as a sign, not as a
		 * range separator.
		 *
		 * @throws NumberFormatException if a number cannot be parsed; the
		 *         gate is left unchanged in that case
		 */
		public void setPrint(String print, int unitSize) {
			String spec = print.trim();
			int dash = spec.indexOf('-');
			int newBegin;
			int newEnd;
			if (dash > 0) {
				int bv = Integer.parseInt(spec.substring(0, dash).trim());
				int ev = Integer.parseInt(spec.substring(dash + 1).trim());
				newBegin = unitSize * (Math.abs(bv) - 1);// e.g. with unit size 10, line 5 starts at index 40
				newEnd = unitSize * Math.abs(ev);// ... and line 10 ends at index 100 (exclusive)
			}
			else {
				int v = Integer.parseInt(spec);
				newBegin = 0;
				newEnd = unitSize * Math.abs(v);
			}
			setBegin(newBegin);
			setEnd(newEnd);
		}
		public boolean filter(int count) {
			return count >= begin && count < end;
		}
	}

	/**
	 * Gate for a comma-separated list of spec items: passes an index if any
	 * of the item gates passes it. Passes nothing until configured.
	 */
	static class LinePrintGate implements PrintGate {

		private PrintGate[] list = new PrintGate[0];

		/**
		 * Splits {@code print} at commas and builds one
		 * {@link PrintGateToken} per item.
		 *
		 * @throws NumberFormatException if an item cannot be parsed; the
		 *         previous configuration is kept in that case
		 */
		public void setPrint(String print, int unitSize) {
			String[] prints = print.split(",");
			// Fill a local array and publish it only once complete, so that a
			// parse failure cannot leave null entries behind.
			PrintGate[] gates = new PrintGate[prints.length];
			for (int i = 0; i < prints.length; i++) {
				PrintGateToken pg = new PrintGateToken();
				pg.setPrint(prints[i], unitSize);
				gates[i] = pg;
			}
			list = gates;
		}

		public boolean filter(int count) {
			for (int i = 0; i < list.length; i++) {
				if (list[i].filter(count)) {
					return true;
				}
			}
			return false;
		}

	}

	/** A {@link StringReader} that remembers the length of its source string. */
	static class StringReaderEx extends StringReader {
		private int inputLength;
		public StringReaderEx(String s) {
			super(s);
			inputLength = s.length();
		}
	}

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -