// ContentTokenizer.java
/*
 * Created on 2005-2-17
 *
 * TODO To change the template for this generated file go to
 * Window - Preferences - Java - Code Style - Code Templates
 */
package net.nutch.analysis;
import java.io.*;
import java.net.URL;
import java.util.ArrayList;
import java.util.StringTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import kit.nlp.util.*;
/**
 * @author Administrator
 *
 * TODO To change the template for this generated type comment go to
 * Window - Preferences - Java - Code Style - Code Templates
 */
/**
 * Lucene Tokenizer that reads its whole input up front, tokenizes it in one
 * pass inside the constructor, and then serves the pre-built tokens from an
 * in-memory list via next().
 *
 * Two input shapes are handled:
 * <ul>
 *   <li>input starting with "http:" is treated as a URL: the host and every
 *       dot-suffix of the host are emitted, followed by the terms produced by
 *       {@code WordsSegment.segmentToken};</li>
 *   <li>any other input is split on single spaces, one token per word, and
 *       multi-word phrases found by {@code WordsSegment.tokenPhrase} are
 *       inserted with position increment 0 next to their first word.</li>
 * </ul>
 *
 * NOTE(review): this uses the pre-2.9 Lucene Token API ({@code termText()},
 * the {@code (String,int,int[,String])} constructors) — presumably Lucene
 * 1.x/2.x; verify against the project's Lucene version.
 */
public class ContentTokenizer extends Tokenizer{
    // Complete token stream, fully built by token() during construction.
    private ArrayList<Token> tokens = new ArrayList<Token>();
    // Read cursor into 'tokens' for next().
    private int cur = 0;

    /**
     * Fully consumes {@code reader} and pre-builds the token list.
     *
     * @param reader source text; read to EOF and closed here
     * @throws Exception propagated from token() (declared broadly by the
     *         original author)
     */
    public ContentTokenizer(Reader reader) throws Exception{
        super(reader);
        token(reader);
    }

    /**
     * Reads everything from {@code reader} and fills {@link #tokens}.
     * Line separators are dropped (not replaced by spaces), so words at line
     * boundaries are concatenated.
     *
     * @param reader source text; closed before tokenizing
     * @throws Exception from I/O or the segmenter (non-URL path only; the
     *         URL path swallows its exceptions, see below)
     */
    public void token(Reader reader) throws Exception{
        BufferedReader in = new BufferedReader(reader);
        String temp = null;
        StringBuffer sb = new StringBuffer();
        // Concatenate all lines; the separators themselves are lost.
        while( (temp=in.readLine()) != null ){
            sb.append(temp);
            temp = null; // dead store — readLine() overwrites it next iteration
        }
        in.close();
        temp = sb.toString();
        if (temp.startsWith("http:")){
            // URL input: emit the host, its dot-suffixes, then segmented terms.
            try{
                URL url = new URL(temp);
                String host = url.getHost();
                // 7 == "http://".length(). End offset measured in *bytes* of
                // the host (platform charset). NOTE(review): byte and char
                // offsets are mixed across this class — confirm which unit
                // the highlighter/consumer expects.
                int tokenend = 7 + host.getBytes().length;
                Token t = new Token(host,7,tokenend);
                tokens.add(t);
                int dot = 0;
                // Emit every dot-suffix of the host, e.g. "www.foo.com" also
                // yields "foo.com" and "com".
                // NOTE(review): 'dot' is an index into the already-shortened
                // host, so start offsets after the first iteration are not
                // cumulative and look wrong — confirm intended.
                while((dot=host.indexOf("."))>=0){
                    host = host.substring(dot+1);
                    t = new Token(host,7 + dot + 1, tokenend);
                    tokens.add(t);
                }
                kit.nlp.util.Token[] urlTokens = WordsSegment.segmentToken(temp);
                if( urlTokens == null)
                    return;
                // One lowercased Lucene token per segmenter term, stopwords
                // dropped; end offset again counted in bytes.
                for( kit.nlp.util.Token term : urlTokens ){
                    String word = term.getTerm().toLowerCase();
                    if (Stopwords.isStopword(word))
                        continue;
                    int offset = term.getOffset();
                    t = new Token(word, offset, offset + word.getBytes().length);
                    tokens.add(t);
                }
            }catch(Exception e){
                // NOTE(review): best-effort — a malformed URL or segmenter
                // failure is silently swallowed, keeping whatever tokens were
                // already added. Consider at least logging the exception.
            }
        }else{
            // Plain-text input: split on single spaces, then overlay phrases.
            ArrayList<kit.nlp.util.Token> termList =
                new ArrayList<kit.nlp.util.Token>();
            StringTokenizer token = new StringTokenizer(temp, " ");
            int offset = 0;
            while(token.hasMoreTokens()){
                kit.nlp.util.Token term = new kit.nlp.util.Token();
                term.setOffset(offset);
                String text = token.nextToken().toLowerCase();
                if (Stopwords.isSymbol(text))
                    // NOTE(review): 37 presumably a "symbol" type code in
                    // kit.nlp.util.Token — confirm against that class.
                    term.setType(37);
                term.setTerm(text);
                // NOTE(review): offsets ignore the space delimiters, so they
                // drift from real character positions by one per preceding
                // word — confirm intended.
                offset += text.length();
                termList.add(term);
            }
            kit.nlp.util.Token[] terms = new kit.nlp.util.Token[termList.size()];
            terms = termList.toArray(terms);
            // One Lucene token per word (char-length offsets on this path,
            // unlike the byte-length offsets on the URL path).
            for( kit.nlp.util.Token term : terms ){
                String word = term.getTerm();
                offset = term.getOffset();
                Token t = new Token(word, offset, offset + word.length());
                tokens.add(t);
            }
            kit.nlp.util.Token[] phrases = WordsSegment.tokenPhrase(terms);
            if (phrases == null || phrases.length == 0)
                return;
            // Insert each phrase token right after the word it starts at,
            // walking both arrays backwards so earlier insertion indices are
            // not shifted by later insertions.
            int i = terms.length-1;
            for ( int j=phrases.length-1; j>=0; j--){
                kit.nlp.util.Token phrase = phrases[j];
                String word = phrase.getTerm().toLowerCase();
                offset = phrase.getOffset();
                // Find the word token whose offset matches the phrase start.
                // 'i' deliberately carries over between phrases — presumably
                // both arrays are in offset order; confirm.
                for (; i>=0; i-- ){
                    kit.nlp.util.Token term = terms[i];
                    if (offset == term.getOffset())
                        break;
                }
                if (i < 0){
                    // No matching word: reset the cursor and skip this phrase.
                    i = terms.length-1;
                    continue;
                }
                // Position increment 0 stacks the phrase on the same position
                // as its first word, marked with type "phrase".
                Token t = new Token(word, offset, offset + word.length(), "phrase");
                t.setPositionIncrement(0);
                tokens.add(i+1,t);
            }
        }
        return;
    }

    /** Returns the next non-stopword token, or null when exhausted. */
    public final Token next() throws IOException {
        return next(true);
    }

    /**
     * Returns the next token from the pre-built list.
     *
     * @param skipStopWord when true, tokens whose text is a stopword (per
     *        {@code Stopwords.isStopword}) are skipped
     * @return the next token, or null when the list is exhausted
     * @throws IOException never thrown here; kept for the Tokenizer contract
     */
    public final Token next(boolean skipStopWord) throws IOException {
        if (cur >= tokens.size())
            return null;
        Token token = tokens.get(cur++);
        if (skipStopWord){
            while(Stopwords.isStopword(token.termText())){
                if (cur >= tokens.size())
                    return null;
                token = tokens.get(cur++);
            }
        }
        return token;
    }

    /** Unused entry point left by the original author. */
    public static void main(String[] args) {
    }
}