⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 abstractalgorithm.java

📁 基于词典和最大匹配算法的中文分词组件
💻 JAVA
字号:
/**
 * 
 */
package org.solol.mmseg.internal;

import java.util.ArrayList;
import java.util.List;

import org.solol.mmseg.core.AlgorithmException;
import org.solol.mmseg.core.Config;
import org.solol.mmseg.core.ConfigurationException;
import org.solol.mmseg.core.DictionaryFactory;
import org.solol.mmseg.core.IAlgorithm;
import org.solol.mmseg.core.IChunk;
import org.solol.mmseg.core.IDictionary;
import org.solol.mmseg.core.IRule;
import org.solol.mmseg.core.IWord;

/**
 * Base class for segmentation algorithms that hand out one word per call to
 * {@link #next(char[])}.
 * <p>
 * Runs of Basic Latin characters are tokenized directly by
 * {@link #getBasicLatinWord(char[], int)}. Every other character is handed to
 * the subclass through {@link #createChunks(char[], int)}, and the resulting
 * candidate chunks are narrowed down by a cascade of four rules in
 * {@link #getCJKWord(IChunk[])}.
 * <p>
 * The read position is kept in the mutable field {@link #index} and this class
 * performs no synchronization, so one instance must not be shared between
 * concurrent scans.
 * 
 * @author solo L
 * 
 */
public abstract class AbstractAlgorithm implements IAlgorithm {

	/** Source of the dictionary consulted by {@link #findMatchWords(char[], int)}. */
	protected DictionaryFactory dictionaryFactory;

	/**
	 * First rule of the cascade; it is always applied. (Presumably the MMSEG
	 * "maximum matching" rule, judging by the class name -- confirm.)
	 */
	protected IRule mmRule = new MMRule();

	/**
	 * Second rule, applied only while two or more chunks remain. (Presumably
	 * "largest average word length" -- confirm.)
	 */
	protected IRule lawlRule = new LAWLRule();

	/**
	 * Third rule, applied only while two or more chunks remain. (Presumably
	 * "smallest variance of word lengths" -- confirm.)
	 */
	protected IRule svwlRule = new SVWLRule();

	/**
	 * Fourth and last rule, applied only while two or more chunks remain.
	 * (Presumably "largest sum of degree of morphemic freedom of
	 * one-character words" -- confirm.)
	 */
	protected IRule lsdmfocwRule = new LSDMFOCWRule();

	/**
	 * Position in the character array of the next word to be returned by
	 * {@link #next(char[])}. Reset to 0 once the end of the array is reached.
	 */
	protected int index = 0;

	/**
	 * Creates an algorithm backed by the factory returned from
	 * {@link DictionaryFactory#getFactory()}.
	 * 
	 * @throws AlgorithmException
	 *             wrapping the {@link ConfigurationException} raised while
	 *             obtaining the factory
	 */
	public AbstractAlgorithm() throws AlgorithmException {
		try {
			dictionaryFactory = DictionaryFactory.getFactory();
		} catch (ConfigurationException e) {
			throw new AlgorithmException(e);
		}
	}

	/**
	 * Creates an algorithm backed by the given factory.
	 * 
	 * @param dictionaryFactory
	 *            factory used to obtain the dictionary; stored as-is
	 */
	public AbstractAlgorithm(DictionaryFactory dictionaryFactory) {
		this.dictionaryFactory = dictionaryFactory;
	}

	/**
	 * Returns the word starting at the current position and advances the
	 * position by that word's length.
	 * <p>
	 * The position lives in this instance, not in the array: callers are
	 * expected to pass the same array repeatedly until {@code null} is
	 * returned. If a call fails with an exception the position is left
	 * where it was.
	 * 
	 * @param chars
	 *            the text being segmented
	 * @return the next word, or {@code null} when the position has reached the
	 *         end of {@code chars} (the position is then reset to 0)
	 * @throws AlgorithmException
	 *             if the rules cannot settle on a single chunk, or if the
	 *             word found has a non-positive length so that the scan could
	 *             never advance
	 */
	public IWord next(char[] chars) throws AlgorithmException {
		if (index >= chars.length) {
			index = 0;
			return null;
		}

		IWord word;
		if (isBasicLatin(chars[index])) {
			word = getBasicLatinWord(chars, index);
		} else {
			word = getCJKWord(createChunks(chars, index));
		}

		// Without this check a zero-length word would leave the position
		// unchanged and every later call would return the same word forever.
		if (word.getLength() <= 0) {
			throw new AlgorithmException("No progress at index " + index
					+ ": word length is " + word.getLength());
		}

		index += word.getLength();
		return word;
	}

	/**
	 * Collects the run of Basic Latin characters starting at {@code index}.
	 * <p>
	 * Whitespace delimits runs: a whitespace character at the very start is
	 * returned as a one-character word of its own, while whitespace met after
	 * other characters ends the run and is not included.
	 * 
	 * @param chars
	 *            the text being segmented
	 * @param index
	 *            start position; this is a local copy, the field
	 *            {@link #index} is not modified here
	 * @return a word of type {@code Word.BASICLATIN_WORD}; its text is empty if
	 *         {@code index} is past the end or the character there is not
	 *         Basic Latin
	 */
	protected IWord getBasicLatinWord(char[] chars, int index) {
		StringBuffer basicLatinWord = new StringBuffer();
		while ((index < chars.length) && isBasicLatin(chars[index])) {
			char c = chars[index];
			if (Character.isWhitespace(c)) {
				if (basicLatinWord.length() == 0) {
					basicLatinWord.append(c);
				}
				break;
			}
			basicLatinWord.append(c);
			index++;
		}

		return new Word(basicLatinWord.toString(), Word.BASICLATIN_WORD);
	}

	/**
	 * Builds the candidate chunks for the text starting at the given position.
	 * 
	 * @param chars
	 *            the text being segmented
	 * @param index2
	 *            start position of the chunks
	 * @return the candidate chunks passed on to {@link #getCJKWord(IChunk[])}
	 */
	protected abstract IChunk[] createChunks(char[] chars, int index2);

	/**
	 * Finds every dictionary word that starts at {@code index}.
	 * <p>
	 * Prefixes of growing length, up to {@code Config.WORD_MAX_LENGTH}
	 * characters and never past the end of the array, are looked up in the
	 * dictionary. When the first character is Basic Latin only the
	 * one-character prefix is tried.
	 * 
	 * @param chars
	 *            the text being segmented
	 * @param index
	 *            start position of the candidate words
	 * @return the matched words in order of increasing length; if nothing
	 *         matched, a single one-character word of type
	 *         {@code Word.UNRECOGNIZED}
	 */
	protected IWord[] findMatchWords(char[] chars, int index) {
		IDictionary dictionary = dictionaryFactory.createDictionary();

		char first = chars[index];

		List matchWords = new ArrayList();

		StringBuffer candidate = new StringBuffer();
		candidate.append(first);

		String wordValue = candidate.toString();
		if (dictionary.isMatched(wordValue)) {
			matchWords.add(dictionary.getWord(wordValue));
		}

		// The original tested the FIRST character on every iteration, which is
		// loop-invariant, so the test is done once here.
		// NOTE(review): the intent may have been to test chars[i + index] and
		// stop extending at the first Latin character -- confirm before changing.
		if (!isBasicLatin(first)) {
			for (int i = 1; i < Config.WORD_MAX_LENGTH
					&& ((i + index) < chars.length); i++) {
				candidate.append(chars[i + index]);
				String prefix = candidate.toString();
				if (dictionary.isMatched(prefix)) {
					matchWords.add(dictionary.getWord(prefix));
				}
			}
		}

		if (matchWords.isEmpty()) {
			matchWords.add(new Word(wordValue, Word.UNRECOGNIZED));
		}

		return (IWord[]) matchWords.toArray(new IWord[matchWords.size()]);
	}

	/**
	 * Narrows the candidate chunks down to one and returns its first word.
	 * <p>
	 * The rules are applied in the fixed order {@link #mmRule},
	 * {@link #lawlRule}, {@link #svwlRule}, {@link #lsdmfocwRule}; the cascade
	 * stops as soon as fewer than two chunks remain.
	 * 
	 * @param chunks
	 *            candidate chunks, normally from
	 *            {@link #createChunks(char[], int)}
	 * @return the first word of the surviving chunk
	 * @throws AlgorithmException
	 *             if two or more chunks survive all four rules, if no chunk
	 *             survives, or if the surviving chunk contains no words
	 */
	protected IWord getCJKWord(IChunk[] chunks) throws AlgorithmException {
		// Read the fields at call time so that rules replaced by a subclass
		// are honoured.
		IRule[] cascade = { mmRule, lawlRule, svwlRule, lsdmfocwRule };

		IChunk[] chunkList = chunks;
		for (int i = 0; i < cascade.length; i++) {
			chunkList = cascade[i].invoke(chunkList);
			if (chunkList.length < 2) {
				break;
			}
		}

		if (chunkList.length >= 2) {
			throw new AlgorithmException("There is an ambiguity!");
		}

		// Report an empty result through the declared exception type rather
		// than letting an ArrayIndexOutOfBoundsException escape.
		if (chunkList.length == 0) {
			throw new AlgorithmException("No chunk left after applying the rules");
		}

		IWord[] words = chunkList[0].getWords();
		if (words == null || words.length == 0) {
			throw new AlgorithmException("The selected chunk contains no words");
		}

		return words[0];
	}

	/**
	 * Tells whether the character belongs to the Basic Latin Unicode block.
	 * 
	 * @param c
	 *            the character to test
	 * @return {@code true} if {@code c} is in
	 *         {@link Character.UnicodeBlock#BASIC_LATIN}
	 */
	private boolean isBasicLatin(char c) {
		return Character.UnicodeBlock.of(c) == Character.UnicodeBlock.BASIC_LATIN;
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -