📄 treedictsplitter.java
字号:
/*
* Copyright 2002-2005 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* Created on 2005-12-28
* author 谢骋超
*
*/
package cn.edu.zju.dartsplitter.impl;
import java.util.HashSet;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.LinkedBlockingQueue;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Token;
import cn.edu.zju.dartsplitter.DictTree;
import cn.edu.zju.dartsplitter.data.DictNode;
/**
 * Splits a single {@link Token} into dictionary words by walking a
 * {@link DictTree} one character at a time and collecting every matched word
 * into a queue.
 *
 * <p>Typical use: construct with the token, supply the dictionary through
 * {@link #setDictTree(DictTree)}, then call {@link #split()}.
 *
 * @author xiecc
 * @email xieccy@gmail.com xieccy@yahoo.com
 * homepage: http://blog.itpub.net/xiecc
 * projectpage: http://ccnt.zju.edu.cn/projects
 */
public class TreeDictSplitter {
    /**
     * Logger for this class
     */
    private static final Logger logger = Logger
            .getLogger(TreeDictSplitter.class);

    /** The token whose text is being split. */
    private Token token;

    /** Receives every token produced by {@link #split()}. */
    private final Queue<Token> tokenQueue = new LinkedBlockingQueue<Token>();

    /**
     * Dynamic-programming memo: start positions that have already been
     * tokenized are stored here so they are not computed again.
     */
    private final Set<Integer> hasTokenedSet = new HashSet<Integer>();

    /** Dictionary used to look up words; must be set before splitting. */
    private DictTree dictTree;

    /**
     * Creates a splitter for the given token.
     *
     * @param token
     *            the token to split; may be {@code null}, in which case
     *            {@link #split()} produces nothing
     */
    public TreeDictSplitter(Token token) {
        this.token = token;
    }

    /**
     * @return Returns the token.
     */
    public Token getToken() {
        return token;
    }

    /**
     * Replaces the token to split. The memo of already-tokenized start
     * positions is cleared because those positions refer to the previous
     * token's text; keeping them would make {@link #split()} skip positions
     * of the new token. The token queue is left untouched.
     *
     * @param token
     *            The token to set.
     */
    public void setToken(Token token) {
        this.token = token;
        hasTokenedSet.clear();
    }

    /**
     * @return Returns the tokenQueue.
     */
    public Queue<Token> getTokenQueue() {
        return tokenQueue;
    }

    /**
     * @return Returns the dictTree.
     */
    public DictTree getDictTree() {
        return dictTree;
    }

    /**
     * @param dictTree
     *            The dictTree to set.
     */
    public void setDictTree(DictTree dictTree) {
        this.dictTree = dictTree;
    }

    /**
     * Splits the current token and returns the queue holding the result.
     *
     * <ul>
     * <li>A {@code null} token leaves the queue unchanged.</li>
     * <li>A token whose offset span is at most one is enqueued as is.</li>
     * <li>Otherwise the token text is matched against the dictionary starting
     * at position 0; this path dereferences the dictionary, so
     * {@link #setDictTree(DictTree)} must have been called first.</li>
     * </ul>
     *
     * @return the same queue instance returned by {@link #getTokenQueue()}
     */
    public Queue<Token> split() {
        if (null == token) {
            return tokenQueue;
        }
        if ((token.endOffset() - token.startOffset()) <= 1) {
            tokenQueue.offer(token);
            return tokenQueue;
        }
        enqueueToken(0, DictNode.EMPTY_NODE);
        return tokenQueue;
    }

    /** Tells whether a word search has already been started at {@code pos}. */
    private boolean isTokened(int pos) {
        return this.hasTokenedSet.contains(pos);
    }

    /** Records that a word search has been started at {@code pos}. */
    private void setTokened(int pos) {
        hasTokenedSet.add(pos);
    }

    /**
     * Recursively matches the token text against the dictionary tree.
     *
     * @param pos
     *            index into the token text of the character to look up
     * @param parentNode
     *            node reached by the characters matched so far; a node whose
     *            level is 0 means a new word starts at {@code pos}
     */
    private void enqueueToken(int pos, DictNode parentNode) {
        String text = token.termText();
        if (pos >= text.length()) {// Past the end of the text: splitting is finished
            return;
        }
        // Dynamic programming: skip a start position that was already explored
        if (isTokened(pos) && parentNode.getLevel() == 0) {
            return;
        }
        if (parentNode.getLevel() == 0) {
            setTokened(pos);
        }
        String strPrefix = text.substring(pos, pos + 1);
        DictNode dictNode = getDictTree().buildOrGetSubNodes(strPrefix,
                parentNode);
        if (null == dictNode) {
            // The current character cannot form any word here: move one
            // position forward and keep going.
            // NOTE(review): when parentNode's level is above 0 this resumes at
            // pos + 1, so pos itself is not retried as a word start - confirm
            // this is intended.
            enqueueToken(pos + 1, DictNode.EMPTY_NODE);
            return;
        }
        if (dictNode.hasEnd()) {
            dictNode.incUseCount();
            // Start offset is derived from the node level; presumably the level
            // equals the length of the matched word - TODO confirm.
            Token curToken = new Token(dictNode.getTokenValue(), token
                    .startOffset()
                    + pos + 1 - dictNode.getLevel(), token.startOffset() + pos
                    + 1);
            tokenQueue.offer(curToken);
            // A word has been cut off: start cutting the next word
            enqueueToken(pos + 1, DictNode.EMPTY_NODE);
        }
        // Keep descending to see whether a longer word can be formed
        enqueueToken(pos + 1, dictNode);
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -