simplewordtokenizer.java

来自「spam source codejasen-0.9jASEN - java An」· Java 代码 · 共 182 行

JAVA
182
字号
/*
 * @(#)SimpleWordTokenizer.java 
 * 
 * Copyright (c) 2004, 2005  jASEN.org
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   1. Redistributions of source code must retain the above copyright notice,
 *      this list of conditions and the following disclaimer.
 *
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in
 *      the documentation and/or other materials provided with the distribution.
 *
 *   3. The names of the authors may not be used to endorse or promote products
 *      derived from this software without specific prior written permission.
 *
 *   4. Any modification or additions to the software must be contributed back
 *      to the project.
 *
 *   5. Any investigation or reverse engineering of source code or binary to
 *      enable emails to bypass the filters, and hence inflict spam and or viruses
 *      onto users who use or do not use jASEN could subject the perpetrator to
 *      criminal and or civil liability.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JASEN.ORG,
 * OR ANY CONTRIBUTORS TO THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
package org.jasen.core.token;

import java.io.BufferedReader;
import java.io.CharArrayWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Locale;


/**
 * Splits text which has already been semi formatted into lower-cased,
 * whitespace-delimited tokens.
 * <P>
 * This class is used to prepare the linguistic analysis engine.
 * </P>
 * <P>
 * Usage: construct the tokenizer with a source, call {@link #tokenize()}
 * once, then fetch the result with {@link #getTokens()}. The source is
 * decoded with the platform default character set, because no explicit
 * charset is passed to the underlying readers.
 * </P>
 * @author Jason Polites
 */
public class SimpleWordTokenizer {

	/** Regular expression separating tokens: one or more whitespace characters. */
	private static final String TOKEN_DELIMITER = "\\s+";

	/** The complete lower-cased text read from the source; null until tokenize() succeeds. */
	private String text;

	/** The tokens produced by the last successful tokenize() call; null before that. */
	private String[] tokens;

	/** The character source; consumed and closed by tokenize(). */
	private Reader reader;

	/**
	 * Creates a tokenizer which reads its text from the given file.
	 * @param file The file containing the text to tokenize
	 * @throws FileNotFoundException If the file cannot be opened for reading
	 */
	public SimpleWordTokenizer(File file) throws FileNotFoundException {
		this.reader = new BufferedReader(new FileReader(file));
	}

	/**
	 * Creates a tokenizer which reads its text from the given stream.
	 * @param in The stream containing the text to tokenize
	 */
	public SimpleWordTokenizer(InputStream in) {
		this.reader = new BufferedReader(new InputStreamReader(in));
	}

	/**
	 * Tokenizes (splits) the text.
	 * <P>
	 * Reads the whole source, closes it, converts the text to lower case and
	 * splits it on runs of whitespace. Empty tokens are never produced: input
	 * which is empty or consists only of whitespace yields a zero-length array.
	 * </P>
	 * <P>
	 * The source is closed whether or not reading succeeds, so this method is
	 * intended to be called once per instance.
	 * </P>
	 * @throws IOException If the source cannot be read or closed
	 */
	public void tokenize() throws IOException {

		CharArrayWriter out = new CharArrayWriter();

		try {
			char[] buffer = new char[1024];
			int count;

			while ((count = reader.read(buffer, 0, buffer.length)) != -1) {
				out.write(buffer, 0, count);
			}
		}
		finally {
			// Always release the source, even when reading failed part way.
			reader.close();
		}

		// An explicit locale keeps the result independent of the JVM default
		// locale (e.g. under a Turkish default "I" would lower-case to a dotless i).
		text = out.toString().toLowerCase(Locale.ENGLISH);

		tokens = dropLeadingEmptyToken(text.split(TOKEN_DELIMITER));
	}

	/**
	 * Removes the empty first element which String.split yields when the text
	 * is empty or starts with a delimiter.
	 * @param parts The raw result of the split
	 * @return The same array if its first element is not empty, otherwise a copy without it
	 */
	private static String[] dropLeadingEmptyToken(String[] parts) {
		if (parts.length == 0 || parts[0].length() > 0) {
			return parts;
		}

		String[] trimmed = new String[parts.length - 1];
		System.arraycopy(parts, 1, trimmed, 0, trimmed.length);
		return trimmed;
	}

	/**
	 * Gets the tokens returned from the tokenization process.
	 * @return The tokens in source order, or null if {@link #tokenize()} has
	 * not completed successfully. The internal array is returned, not a copy.
	 */
	public String[] getTokens() {
		return tokens;
	}

}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?