⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 hashbinarydictionary.java

📁 paoding的中文分词程序,效果还可以.这个可以作为一些基本应用的分词.
💻 JAVA
字号:
/**
 * Copyright 2007 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package net.paoding.analysis.dictionary;

import java.util.HashMap;
import java.util.Map;

/**
 * Dictionary的散列+二叉查找实现。
 * <p>
 * 用于对大数量的,且头字符相同的字符串较多的情况,e.g汉字词语字典。在这种情况下,检索速度将比二叉字典更快。
 * <p>
 * 
 * HashBinaryDictionary以一组已经排序的词语为输入,所有<b>头字符</b>相同的词语划为一个集合作为分字典(使用BinaryDictionary实现)。
 * 查找词语时,先根据第一个字符找得分词典(BinaryDictionary实现),再从该分词典中定位该词语。
 * <p>
 * 
 * @author Zhiliang Wang [qieqie.wang@gmail.com]
 * 
 * @see BinaryDictionary
 * 
 * @since 1.0
 * 
 */
public class HashBinaryDictionary implements Dictionary {

	// -------------------------------------------------

	/**
	 * 字典中所有词语,用于方便{@link #get(int)}方法
	 */
	private Word[] ascWords;

	/**
	 * 首字符到分词典的映射
	 */
	private Map/* <Object, SubDictionaryWrap> */subs;

	/**
	 * 
	 */
	private final int hashIndex;

	private final int start;
	private final int end;
	private final int count;

	// -------------------------------------------------

	/**
	 * 
	 * @param ascWords
	 *            升序排列词语
	 * @param initialCapacity
	 * @param loadFactor
	 */
	public HashBinaryDictionary(Word[] ascWords, int initialCapacity,
			float loadFactor) {
		this(ascWords, 0, 0, ascWords.length, initialCapacity, loadFactor);
	}

	public HashBinaryDictionary(Word[] ascWords, int hashIndex, int start,
			int end, int initialCapacity, float loadFactor) {
		this.ascWords = ascWords;
		this.start = start;
		this.end = end;
		this.count = end - start;
		this.hashIndex = hashIndex;
		subs = new HashMap/* <Object, SubDictionaryWrap> */(initialCapacity,
				loadFactor);
		createSubDictionaries();
	}

	// -------------------------------------------------

	/**
	 * 创建分词典映射,为构造函数调用
	 */
	protected void createSubDictionaries() {
		if (this.start >= ascWords.length) {
			return;
		}
		
		// 定位相同头字符词语的开头和结束位置以确认分字典
		int beginIndex = this.start;
		int endIndex = this.start + 1;
		
		char beginHashChar = getChar(ascWords[start], hashIndex);
		char endHashChar;
		for (; endIndex < this.end; endIndex++) {
			endHashChar = getChar(ascWords[endIndex], hashIndex);
			if (endHashChar != beginHashChar) {
				addSubDictionary(beginHashChar, beginIndex, endIndex);
				beginIndex = endIndex;
				beginHashChar = endHashChar;
			}
		}
		addSubDictionary(beginHashChar, beginIndex, this.end);
	}

	protected char getChar(CharSequence s, int index) {
		if (index >= s.length()) {
			return (char) 0;
		}
		return s.charAt(index);
	}

	/**
	 * 将位置在beginIndex和endIndex之间(不包括endIndex)的词语作为一个分词典
	 * 
	 * @param hashChar
	 * @param beginIndex
	 * @param endIndex
	 */
	protected void addSubDictionary(char hashChar, int beginIndex, int endIndex) {
		Dictionary subDic = createSubDictionary(ascWords, beginIndex, endIndex);
		SubDictionaryWrap subDicWrap = new SubDictionaryWrap(hashChar,
				subDic, beginIndex);
		subs.put(keyOf(hashChar), subDicWrap);
	}

	protected Dictionary createSubDictionary(Word[] ascWords, int beginIndex,
			int endIndex) {
		int count = endIndex - beginIndex;
		if (count < 16) {
			return new BinaryDictionary(ascWords, beginIndex, endIndex);
		} else {
			return new HashBinaryDictionary(ascWords, hashIndex + 1,
					beginIndex, endIndex, getCapacity(count), 0.75f);
		}
	}

	protected static final int[] capacityCandiate = { 16, 32, 64, 128, 256,
			512, 1024, 2048, 4096, 10192 };

	protected int getCapacity(int count) {
		int capacity = -1;
		count <<= 2;
		count /= 3;
		count += 1;
		for (int i = 0; i < capacityCandiate.length; i++) {
			if (count <= capacityCandiate[i]) {
				capacity = capacityCandiate[i];
				break;
			}
		}
		if (capacity < 0) {
			capacity = capacityCandiate[capacityCandiate.length - 1];
		}
		return capacity;
	}

	// -------------------------------------------------

	public Word get(int index) {
		return ascWords[start + index];
	}

	public Hit search(CharSequence input, int begin, int count) {
		SubDictionaryWrap subDic = (SubDictionaryWrap) subs.get(keyOf(input
				.charAt(hashIndex + begin)));
		if (subDic == null) {
			return Hit.UNDEFINED;
		}
		Dictionary dic = subDic.dic;
		// 对count==hashIndex + 1的处理
		if (count == hashIndex + 1) {
			Word header = dic.get(0);
			if (header.length() == hashIndex + 1) {
				if (subDic.wordIndexOffset + 1 < this.ascWords.length) {
					return new Hit(subDic.wordIndexOffset, header,
							this.ascWords[subDic.wordIndexOffset + 1]);
				} else {
					return new Hit(subDic.wordIndexOffset, header, null);
				}
			} else {
				return new Hit(Hit.UNCLOSED_INDEX, null, header);
			}
		}
		// count > hashIndex + 1
		Hit word = dic.search(input, begin, count);
		if (word.isHit()) {
			int index = subDic.wordIndexOffset + word.getIndex();
			word.setIndex(index);
			if (word.getNext() == null && index < size()) {
				word.setNext(get(index + 1));
			}
		}
		return word;
	}

	public int size() {
		return count;
	}

	// -------------------------------------------------

	/**
	 * 字符的在{@link #subs}的key值。
	 * 
	 * @param theChar
	 * @return
	 * 
	 * @see #subs
	 */
	protected Object keyOf(char theChar) {
		// return theChar - 0x4E00;// '一'==0x4E00
		return new Integer(theChar);
	}

	/**
	 * 分词典封箱
	 */
	static class SubDictionaryWrap {
		/**
		 * 分词典词组的头字符
		 */
		char hashChar;

		/**
		 * 分词典
		 */
		Dictionary dic;

		/**
		 * 分词典第一个词语在所有词语中的偏移位置
		 */
		int wordIndexOffset;

		public SubDictionaryWrap(char hashChar, Dictionary dic,
				int wordIndexOffset) {
			this.hashChar = hashChar;
			this.dic = dic;
			this.wordIndexOffset = wordIndexOffset;
		}
	}

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -