simplewordtokenizer.java
来自「spam source codejasen-0.9jASEN - java An」· Java 代码 · 共 182 行
JAVA
182 行
/*
* @(#)SimpleWordTokenizer.java
*
* Copyright (c) 2004, 2005 jASEN.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the distribution.
*
* 3. The names of the authors may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* 4. Any modification or additions to the software must be contributed back
* to the project.
*
* 5. Any investigation or reverse engineering of source code or binary to
* enable emails to bypass the filters, and hence inflict spam and or viruses
* onto users who use or do not use jASEN could subject the perpetrator to
* criminal and or civil liability.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JASEN.ORG,
* OR ANY CONTRIBUTORS TO THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
package org.jasen.core.token;
import java.io.BufferedReader;
import java.io.CharArrayWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Locale;
/**
 * Splits text which has already been semi formatted into lower-case,
 * whitespace-delimited word tokens.
 * <P>
 * This class is used to prepare the linguistic analysis engine.
 * </P>
 * <P>
 * The text is read from the source supplied at construction time when
 * {@link #tokenize()} is called. That call also closes the underlying
 * reader, so a second call on the same instance will fail with an
 * {@link IOException}.
 * </P>
 * @author Jason Polites
 */
public class SimpleWordTokenizer {

    /** Regular expression for a run of whitespace; used as the token delimiter. */
    private static final String TOKEN_DELIMITER = "\\s+";

    /** Result of the last successful {@link #tokenize()} call; null until then. */
    private String[] tokens;

    /** Source of the text to tokenize; consumed and closed by {@link #tokenize()}. */
    private final Reader reader;

    /**
     * Creates a tokenizer which reads its text from the given file.
     * The file is decoded with the platform default character encoding.
     * @param file The file containing the text to tokenize
     * @throws FileNotFoundException If the file cannot be opened for reading
     */
    public SimpleWordTokenizer(File file) throws FileNotFoundException {
        this.reader = new BufferedReader(new FileReader(file));
    }

    /**
     * Creates a tokenizer which reads its text from the given stream.
     * The stream is decoded with the platform default character encoding.
     * @param in The stream containing the text to tokenize
     */
    public SimpleWordTokenizer(InputStream in) {
        this.reader = new BufferedReader(new InputStreamReader(in));
    }

    /**
     * Tokenizes (splits) the text.
     * <P>
     * Reads the entire source, converts it to lower case and splits it on
     * runs of whitespace. Leading and trailing whitespace never produce
     * empty tokens; empty or whitespace-only input yields a zero length array.
     * The underlying reader is always closed, even if reading fails.
     * </P>
     * @throws IOException If the source cannot be read or closed
     */
    public void tokenize() throws IOException {
        String text;

        try {
            CharArrayWriter out = new CharArrayWriter();
            char[] buffer = new char[1024];
            int count;

            while ((count = reader.read(buffer, 0, buffer.length)) != -1) {
                out.write(buffer, 0, count);
            }

            text = out.toString();
        }
        finally {
            reader.close();
        }

        // Use a fixed locale so the tokens do not depend on the JVM default
        // locale (e.g. under a Turkish default "I" would become a dotless "i").
        tokens = split(text.toLowerCase(Locale.ENGLISH));
    }

    /**
     * Splits the text on whitespace, discarding the empty leading token
     * which String.split produces when the text is empty or starts with
     * whitespace.
     */
    private static String[] split(String text) {
        String[] parts = text.split(TOKEN_DELIMITER);

        if (parts.length > 0 && parts[0].length() == 0) {
            String[] withoutLeadingEmpty = new String[parts.length - 1];
            System.arraycopy(parts, 1, withoutLeadingEmpty, 0, withoutLeadingEmpty.length);
            return withoutLeadingEmpty;
        }

        return parts;
    }

    /**
     * Gets the tokens returned from the tokenization process.
     * @return The tokens produced by the last call to {@link #tokenize()}, or
     * null if it has not yet completed successfully. The internal array is
     * returned without copying.
     */
    public String[] getTokens() {
        return tokens;
    }
}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?