⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 hashbinarydictionary.java

📁 一个分词的源码
💻 JAVA
字号:
/*
 * 本代码所有权归作者所有 但在保持源代码不被破坏以及所有人署名的基础上 任何人可自由无限使用
 */
package com.sohospace.dictionary;

import java.util.HashMap;
import java.util.Map;

/**
 * Dictionary的散列+二叉查找实现。
 * <p>
 * 用于对大数量的,且头字符相同的字符串较多的情况,e.g汉字词语字典。在这种情况下,检索速度将比二叉字典更快。
 * <p>
 * 
 * HashBinaryDictionary以一组已经排序的词语为输入,所有<b>头字符</b>相同的词语划为一个集合作为分字典(使用BinaryDictionary实现)。
 * 查找词语时,先根据第一个字符找得分词典(BinaryDictionary实现),再从该分词典中定位该词语。
 * <p>
 * 
 * @author zhiliang.wang@yahoo.com.cn
 * 
 * @see BinaryDictionary
 * 
 * @since 1.0
 * 
 */
public class HashBinaryDictionary implements Dictionary {

	// -------------------------------------------------

	/**
	 * 字典中所有词语,用于方便{@link #get(int)}方法
	 */
	private String[] ascWords;

	/**
	 * 首字符到分词典的映射
	 */
	private Map<Integer, SubDictionaryWrap> subs;
	
	/**
	 * 
	 */
	private final int hashIndex;
	
	private final int start;
	private final int end;
	private final int count;

	// -------------------------------------------------

	/**
	 * 
	 * @param ascWords
	 *            升序排列词语
	 * @param initialCapacity
	 * @param loadFactor
	 */
	public HashBinaryDictionary(String[] ascWords, int initialCapacity,
			float loadFactor) {
		this(ascWords, 0, 0, ascWords.length, initialCapacity, loadFactor);
	}
	
	public HashBinaryDictionary(String[] ascWords, int hashIndex, int start, int end, int initialCapacity,
			float loadFactor) {
		this.ascWords = ascWords;
		this.start = start;
		this.end = end;
		this.count = end - start;
		this.hashIndex = hashIndex;
		subs = new HashMap<Integer, SubDictionaryWrap>(initialCapacity, loadFactor);
		createSubDictionaries();
	}

	// -------------------------------------------------

	/**
	 * 创建分词典映射,为构造函数调用
	 */
	protected void createSubDictionaries() {
		// 定位相同头字符词语的开头和结束位置以确认分字典
		int beginIndex = this.start;
		int endIndex = this.start + 1;

		char beginHashChar = getChar(ascWords[start], hashIndex);
		char endHashChar;
		for (; endIndex < this.end; endIndex++) {
			endHashChar = getChar(ascWords[endIndex], hashIndex);
			if (endHashChar != beginHashChar) {
				addSubDictionary(beginHashChar, beginIndex, endIndex);
				beginIndex = endIndex;
				beginHashChar = endHashChar;
			}
		}
		addSubDictionary(beginHashChar, beginIndex, this.end);
	}
	
	protected char getChar(String s, int index) {
		if (index >= s.length()) {
			return (char)0;
		}
		return s.charAt(index);
	}

	/**
	 * 将位置在beginIndex和endIndex之间(不包括endIndex)的词语作为一个分词典
	 * 
	 * @param hashChar
	 * @param beginIndex
	 * @param endIndex
	 */
	protected void addSubDictionary(char hashChar, int beginIndex, int endIndex) {
		SubDictionaryWrap subDic = new SubDictionaryWrap(hashChar,
				createSubDictionary(ascWords, beginIndex, endIndex), beginIndex);
		Integer key = keyOf(hashChar);
		if (subs.containsKey(key)) {
			System.out.println("出现这个文字,表示输入的词语排序错误,请确保词典排序正确>>>>>>>>>"
					+ hashChar);
		}
		subs.put(key, subDic);
	}

	
	protected Dictionary createSubDictionary(String[] ascWords, int beginIndex, int endIndex) {
		return new BinaryDictionary(ascWords, beginIndex, endIndex);
	}

	// -------------------------------------------------

	public String get(int index) {
		return ascWords[start + index];
	}

	public Hit search(CharSequence input, int begin, int count) {
		SubDictionaryWrap subDic = subs.get(keyOf(input.charAt(begin)));
		if (subDic == null) {
			return Hit.UNDEFINED;
		}
		Dictionary dic = subDic.dic;
		//对count==1的快速处理
//		if (count == 1) {
//			String header;
//			try {
//				header = dic.get(0);
//			} catch (RuntimeException e) {
//				System.err.println(input.subSequence(begin, begin + count));
//				System.err.println(dic.size());
//				throw e;
//				
//			}
//			if (header.length() == 1) {
//				return new Hit(subDic.wordIndexOffset, header, get(subDic.wordIndexOffset + 1));
//			}
//			else {
//				return new Hit(Hit.UNCLOSED_INDEX, null, header);
//			}
//		}
		//count >2
		Hit word = dic.search(input, begin, count);
		if (word.isHit()) {
			int index = subDic.wordIndexOffset + word.getIndex();
			word.setIndex(index);
			if (word.getNext() == null && index < size()) {
				word.setNext(get(index + 1));
			}
		}
		return word;
	}

	public int size() {
		return count;
	}

	// -------------------------------------------------

	/**
	 * 字符的在{@link #subs}的key值。
	 * 
	 * @param theChar
	 * @return
	 * 
	 * @see #subs
	 */
	protected int keyOf(char theChar) {
		// return theChar - 0x4E00;// '一'==0x4E00
		return theChar;
	}

	/**
	 * 分词典封箱
	 */
	static class SubDictionaryWrap {
		/**
		 * 分词典词组的头字符
		 */
		char firstChar;

		/**
		 * 分词典
		 */
		Dictionary dic;

		/**
		 * 分词典第一个词语在所有词语中的偏移位置
		 */
		int wordIndexOffset;

		public SubDictionaryWrap(char firstChar, Dictionary dic, int wordIndexOffset) {
			this.firstChar = firstChar;
			this.dic = dic;
			this.wordIndexOffset = wordIndexOffset;
		}
	}

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -