📄 abstractalgorithm.java
字号:
/**
*
*/
package org.solol.mmseg.internal;
import java.util.ArrayList;
import java.util.List;
import org.solol.mmseg.core.AlgorithmException;
import org.solol.mmseg.core.Config;
import org.solol.mmseg.core.ConfigurationException;
import org.solol.mmseg.core.DictionaryFactory;
import org.solol.mmseg.core.IAlgorithm;
import org.solol.mmseg.core.IChunk;
import org.solol.mmseg.core.IDictionary;
import org.solol.mmseg.core.IRule;
import org.solol.mmseg.core.IWord;
/**
 * Base class for word-segmentation algorithms that hand out one word at a
 * time from a character array.
 *
 * <p>Basic Latin text is split by {@link #getBasicLatinWord(char[], int)};
 * every other character is segmented by building candidate chunks
 * ({@link #createChunks(char[], int)}, supplied by subclasses) and filtering
 * them through four rules in a fixed order.
 *
 * <p>Instances are stateful: {@link #index} is the read position inside the
 * array passed to {@link #next(char[])}. It is reset to {@code 0} only when
 * the end of the array is reached, so a caller must either drain an array
 * completely or use a fresh instance before switching to another array. The
 * class performs no synchronization.
 *
 * @author solo L
 */
public abstract class AbstractAlgorithm implements IAlgorithm {

    /** Source of the dictionary used to look up candidate words. */
    protected DictionaryFactory dictionaryFactory;

    // NOTE(review): the four rule names look like the MMSEG disambiguation
    // rules (maximum matching, largest average word length, smallest variance
    // of word lengths, largest sum of degree of morphemic freedom of
    // one-character words) - confirm against the rule classes.

    /** First rule applied to the candidate chunks; always invoked. */
    protected IRule mmRule = new MMRule();

    /** Second rule; invoked only while two or more candidates remain. */
    protected IRule lawlRule = new LAWLRule();

    /** Third rule; invoked only while two or more candidates remain. */
    protected IRule svwlRule = new SVWLRule();

    /** Fourth and last rule; invoked only while two or more candidates remain. */
    protected IRule lsdmfocwRule = new LSDMFOCWRule();

    /** Position in the input array of the next character to be consumed. */
    protected int index = 0;

    /**
     * Creates an algorithm backed by the factory returned from
     * {@link DictionaryFactory#getFactory()}.
     *
     * @throws AlgorithmException wrapping the {@link ConfigurationException}
     *         raised while obtaining the factory
     */
    public AbstractAlgorithm() throws AlgorithmException {
        try {
            dictionaryFactory = DictionaryFactory.getFactory();
        } catch (ConfigurationException e) {
            throw new AlgorithmException(e);
        }
    }

    /**
     * Creates an algorithm backed by the given dictionary factory.
     *
     * @param dictionaryFactory factory used to create the lookup dictionary
     */
    public AbstractAlgorithm(DictionaryFactory dictionaryFactory) {
        this.dictionaryFactory = dictionaryFactory;
    }

    /**
     * Returns the word starting at the current position and advances the
     * position past it.
     *
     * @param chars the text being segmented; must not be {@code null}
     * @return the next word, or {@code null} once the whole array has been
     *         consumed (the position is then reset to {@code 0})
     * @throws AlgorithmException if the candidates cannot be reduced to a
     *         single chunk, or if segmentation yields no usable word
     */
    public IWord next(char[] chars) throws AlgorithmException {
        if (index >= chars.length) {
            index = 0;
            return null;
        }

        IWord word;
        if (isBasicLatin(chars[index])) {
            word = getBasicLatinWord(chars, index);
        } else {
            word = getCJKWord(createChunks(chars, index));
        }

        // A missing or empty word would leave the position unchanged, so a
        // caller looping until null would never terminate; fail loudly.
        if (word == null || word.getLength() <= 0) {
            throw new AlgorithmException(
                    "Segmentation produced no word at index " + index);
        }

        index += word.getLength();
        return word;
    }

    /**
     * Collects the run of Basic Latin characters that starts at
     * {@code index}.
     *
     * <p>Whitespace ends a run. A whitespace character found at the very
     * start is returned on its own as a one-character word; otherwise it is
     * left for the following call. The result is empty when {@code index} is
     * past the end of the array or does not point at a Basic Latin character.
     *
     * @param chars the text being segmented
     * @param index position of the first character to examine
     * @return a {@code Word} of type {@code Word.BASICLATIN_WORD}
     */
    protected IWord getBasicLatinWord(char[] chars, int index) {
        StringBuffer basicLatinWord = new StringBuffer();
        int pos = index;
        while (pos < chars.length && isBasicLatin(chars[pos])) {
            char c = chars[pos];
            if (Character.isWhitespace(c)) {
                // Leading whitespace forms its own word; trailing whitespace
                // simply terminates the word collected so far.
                if (basicLatinWord.length() == 0) {
                    basicLatinWord.append(c);
                }
                break;
            }
            basicLatinWord.append(c);
            pos++;
        }
        return new Word(basicLatinWord.toString(), Word.BASICLATIN_WORD);
    }

    /**
     * Builds the candidate chunks for the text starting at the given
     * position.
     *
     * @param chars the text being segmented
     * @param index2 position at which the chunks start
     * @return the candidate chunks handed to {@link #getCJKWord(IChunk[])}
     */
    protected abstract IChunk[] createChunks(char[] chars, int index2);

    /**
     * Finds every dictionary word that starts at {@code index}.
     *
     * <p>Prefixes of length 1 up to {@code Config.WORD_MAX_LENGTH} (bounded
     * by the end of the array) are looked up in order of increasing length.
     * When the first character is Basic Latin only the one-character prefix
     * is tried. If nothing matches, the first character is returned as a
     * single {@code Word.UNRECOGNIZED} word, so the result is never empty.
     *
     * @param chars the text being segmented
     * @param index position of the first character of every candidate
     * @return the matching words, shortest first
     */
    protected IWord[] findMatchWords(char[] chars, int index) {
        IDictionary dictionary = dictionaryFactory.createDictionary();
        char first = chars[index];
        List matchWords = new ArrayList();

        StringBuffer candidate = new StringBuffer();
        candidate.append(first);
        String wordValue = candidate.toString();
        if (dictionary.isMatched(wordValue)) {
            matchWords.add(dictionary.getWord(wordValue));
        }

        // The Basic Latin test only concerns the first character, so it is
        // evaluated once here instead of on every loop iteration.
        if (!isBasicLatin(first)) {
            for (int i = 1; i < Config.WORD_MAX_LENGTH
                    && (index + i) < chars.length; i++) {
                candidate.append(chars[index + i]);
                String longer = candidate.toString();
                if (dictionary.isMatched(longer)) {
                    matchWords.add(dictionary.getWord(longer));
                }
            }
        }

        if (matchWords.isEmpty()) {
            matchWords.add(new Word(wordValue, Word.UNRECOGNIZED));
        }

        IWord[] words = new IWord[matchWords.size()];
        matchWords.toArray(words);
        return words;
    }

    /**
     * Narrows the candidate chunks to one and returns its first word.
     *
     * <p>{@link #mmRule} is always applied; {@link #lawlRule},
     * {@link #svwlRule} and {@link #lsdmfocwRule} follow in that order, each
     * only while at least two candidates remain.
     *
     * @param chunks the candidate chunks
     * @return the first word of the surviving chunk
     * @throws AlgorithmException if two or more candidates survive all four
     *         rules, if no candidate survives, or if the surviving chunk
     *         holds no words
     */
    protected IWord getCJKWord(IChunk[] chunks) throws AlgorithmException {
        IChunk[] candidates = mmRule.invoke(chunks);
        if (candidates.length >= 2) {
            candidates = lawlRule.invoke(candidates);
        }
        if (candidates.length >= 2) {
            candidates = svwlRule.invoke(candidates);
        }
        if (candidates.length >= 2) {
            candidates = lsdmfocwRule.invoke(candidates);
        }
        if (candidates.length >= 2) {
            throw new AlgorithmException("There is an ambiguity!");
        }

        // Report an empty result through the declared exception type rather
        // than letting an ArrayIndexOutOfBoundsException escape.
        if (candidates.length == 0) {
            throw new AlgorithmException("No chunk left after applying the rules!");
        }
        IWord[] words = candidates[0].getWords();
        if (words == null || words.length == 0) {
            throw new AlgorithmException("The selected chunk contains no words!");
        }
        return words[0];
    }

    /**
     * Tells whether the character belongs to the Unicode Basic Latin block.
     *
     * @param c the character to test
     * @return {@code true} if {@code c} is in the Basic Latin block
     */
    private boolean isBasicLatin(char c) {
        return Character.UnicodeBlock.of(c) == Character.UnicodeBlock.BASIC_LATIN;
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -