📄 chinesetokenizer.java
字号:
package text_category;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
import com.xjt.nlp.word.ICTCLAS;
import edu.udo.cs.wvtool.generic.tokenizer.WVTTokenizer;
import edu.udo.cs.wvtool.main.WVTDocumentInfo;
import edu.udo.cs.wvtool.util.TokenEnumeration;
/**
 * Tokenizer that reads a whole document, passes its text through the ICTCLAS
 * word segmenter and then enumerates the maximal runs of letter characters
 * found in the segmented text.
 *
 * <p>The object returned by {@link #tokenize(Reader, WVTDocumentInfo)} is this
 * instance itself, so every call to {@code tokenize} replaces the state of the
 * enumeration handed out by the previous call.
 */
public class ChineseTokenizer implements WVTTokenizer, TokenEnumeration {

    /** The character stream holding the segmented text of the current document. */
    private Reader input;

    /**
     * The token which is currently provided. This look-ahead buffer is necessary
     * to implement the semantics of TokenEnumeration.
     */
    private String currentToken;

    /** Creates a tokenizer with no document attached. */
    public ChineseTokenizer() {
        input = null;
        currentToken = null;
    }

    /**
     * Reads {@code source} completely, segments the text with
     * {@link #ICTCLASCutWord(String)} and positions the enumeration on the first
     * token.
     *
     * <p>Lines are concatenated without their line terminators before
     * segmentation.
     *
     * @param source the document text; may be {@code null}
     * @param d document information; not used by this implementation
     * @return this object as the token enumeration, or {@code null} if
     *         {@code source} is {@code null}, reading fails, or segmentation fails
     * @see edu.udo.cs.wvtool.generic.tokenizer.WVTTokenizer#tokenize(Reader, WVTDocumentInfo)
     */
    public TokenEnumeration tokenize(Reader source, WVTDocumentInfo d) {
        if (source == null) {
            return null;
        }

        String segmented;
        try {
            BufferedReader br = new BufferedReader(source);
            StringBuilder text = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                text.append(line);
            }
            segmented = ICTCLASCutWord(text.toString());
        } catch (Exception e) {
            // Best effort: a document that cannot be read yields no enumeration.
            return null;
        }

        // ICTCLASCutWord signals failure with null; StringReader would throw a
        // NullPointerException on it, so report the failure the same way as above.
        if (segmented == null) {
            return null;
        }

        input = new StringReader(segmented);
        readNextToken();
        return this;
    }

    /**
     * Reads a token from the character stream and stores it into currentToken.
     * A token is a maximal run of characters for which
     * {@link Character#isLetter(char)} is true. If there are no more tokens left,
     * a null value is stored.
     */
    public void readNextToken() {
        if (input == null) {
            // No document has been tokenized yet.
            currentToken = null;
            return;
        }

        StringBuilder buf = new StringBuilder();
        try {
            // Read from the stream until a letter occurs.
            int in = input.read();
            while (in != -1 && !Character.isLetter((char) in)) {
                in = input.read();
            }
            // Collect characters until a non-letter or the end of the stream occurs.
            while (in != -1 && Character.isLetter((char) in)) {
                buf.append((char) in);
                in = input.read();
            }
        } catch (java.io.IOException e) {
            // Treated like the end of the stream: whatever was collected so far
            // becomes the last token.
        }

        // An empty buffer means the stream ended before another letter was found.
        currentToken = (buf.length() > 0) ? buf.toString() : null;
    }

    /**
     * @return {@code true} if a document has been tokenized and a token is
     *         buffered, {@code false} otherwise
     * @see edu.udo.cs.wvtool.util.TokenEnumeration#hasMoreTokens()
     */
    public boolean hasMoreTokens() {
        // If the current token does not equal the null value, then there is at
        // least this token left.
        return input != null && currentToken != null;
    }

    /**
     * Returns the buffered token and reads the following one from the stream.
     *
     * @return the next token, or {@code null} if no token is left
     * @see edu.udo.cs.wvtool.util.TokenEnumeration#nextToken()
     */
    public String nextToken() {
        if (currentToken == null) {
            return null;
        }
        String result = currentToken;
        readNextToken();
        return result;
    }

    /**
     * Segments {@code inputstring} with ICTCLAS and returns the resulting pieces
     * separated (and followed) by single spaces.
     *
     * <p>Before segmentation, every occurrence of {@code "}, {@code '},
     * {@code ((}, {@code /}, the space character, {@code >} and {@code <} is
     * removed from the text. After segmentation, each space-separated piece of
     * the ICTCLAS output is cut at its last {@code /}; pieces without a
     * {@code /} are kept unchanged.
     *
     * @param inputstring the raw text to segment
     * @return the segmented text, or {@code null} if {@code inputstring} is
     *         {@code null} or segmentation throws an exception
     */
    public static String ICTCLASCutWord(String inputstring) {
        if (inputstring == null) {
            return null;
        }
        try {
            ICTCLAS splitword = ICTCLAS.getInstance();

            // NOTE(review): '/' is presumably stripped so that the only slashes left
            // after segmentation are the ones ICTCLAS appends - confirm against the
            // ICTCLAS output format.
            String cleaned = inputstring;
            cleaned = cleaned.replace("\"", "");
            cleaned = cleaned.replace("'", "");
            cleaned = cleaned.replace("((", "");
            cleaned = cleaned.replace("/", "");
            cleaned = cleaned.replace(" ", "");
            cleaned = cleaned.replace(">", "");
            cleaned = cleaned.replace("<", "");

            String processed = splitword.paragraphProcess(cleaned);

            StringBuilder sb = new StringBuilder();
            for (String piece : processed.split(" ")) {
                if (piece.length() == 0) {
                    continue;
                }
                // Drop the suffix starting at the last '/' (presumably the tag added
                // by ICTCLAS - TODO confirm); keep the piece as is when there is none.
                int end = piece.lastIndexOf("/");
                String word = (end < 0) ? piece : piece.substring(0, end);
                sb.append(word).append(" ");
            }
            return sb.toString();
        } catch (Exception e) {
            // Best effort: any failure inside the segmenter is reported as null.
            return null;
        }
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -