⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 dictionary.java

📁 基于词典的分词工具,用于对文本文件的分词
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
				}

				return true;
			}
		}

		return false;
	}

	// The data for modify
	/**
	 * Discards the modification table by setting {@code mts} to {@code null}.
	 *
	 * @return always {@code true}
	 */
	protected boolean delModified() {
		// Dropping the reference is the whole operation; nothing is persisted here.
		mts = null;
		return true;
	}

	/**
	 * Tells whether a word is present in the original table or in the
	 * modification table.
	 *
	 * @param word
	 *            the word to look up; {@code null} yields {@code false}
	 * @param handle
	 *            handle value the entry must carry; it is passed unchanged to
	 *            both table lookups, which treat -1 as "any handle"
	 * @return {@code true} if either lookup reports a position &gt;= 0
	 */
	public boolean isExist(String word, int handle) {
		if (word == null)
			return false;

		Preword pw = preProcessing(word);
		// preProcessing assigns index -1 to words that are neither Chinese nor
		// delimiters. Such a value must never be used as a table subscript:
		// it would make the modify-table lookup call mts.get(-1).
		if (pw == null || pw.getIndex() < 0)
			return false;

		return findInOriginalTable(pw.getIndex(), pw.getRes(), handle) >= 0
				|| findInModifyTable(pw.getIndex(), pw.getRes(), handle) >= 0;
	}

	/**
	 * Collects the handle/frequency pairs recorded for a word.
	 * <p>
	 * The original table is consulted first; if the word is found there, only
	 * its entries are returned. Otherwise the modification table is scanned.
	 * Every returned item is a fresh {@code WordItem} carrying the handle and
	 * frequency of the stored entry and the caller's {@code word}.
	 *
	 * @param word
	 *            the word to look up
	 * @return {@code null} when {@code word} is {@code null}; otherwise a list
	 *         (possibly empty) with one item per matching entry
	 */
	public ArrayList<WordItem> getHandle(String word) {
		if (word == null)
			return null;

		ArrayList<WordItem> result = new ArrayList<WordItem>();
		Preword pw = preProcessing(word);
		// Index -1 marks a word that belongs to no table block; using it as a
		// subscript would throw, so report "no entries" instead.
		if (pw == null || pw.getWord() == null || pw.getIndex() < 0)
			return result;

		int index = pw.getIndex();
		String res = pw.getRes();

		int found = findInOriginalTable(index, res, -1);
		if (found >= 0) {
			WordTable wt = wts.get(index);
			ArrayList<WordItem> entries = wt.getWords();
			int pos = found;
			// The entry at 'found' is taken as-is; following entries are taken
			// while they still spell the same remainder.
			do {
				WordItem src = entries.get(pos);
				WordItem copy = new WordItem();
				copy.setHandle(src.getHandle());
				copy.setFreq(src.getFreq());
				// Previously the first item was returned without its word set,
				// unlike every other item in the list.
				copy.setWord(word);
				result.add(copy);
				pos++;
			} while (pos < wt.getCount() && strEqual(entries.get(pos).getWord(), res));

			return result;
		}

		int found2 = findInModifyTable(index, res, -1);
		if (found2 >= 0) {
			ArrayList<WordItem> wis = mts.get(index).getWords();
			for (int i = found2; i < wis.size(); i++) {
				WordItem src = wis.get(i);
				if (strEqual(src.getWord(), res)) {
					WordItem copy = new WordItem();
					copy.setHandle(src.getHandle());
					copy.setFreq(src.getFreq());
					copy.setWord(word);
					result.add(copy);
				}
			}
		}
		return result;
	}

	/**
	 * Binary-searches the original dictionary table for an entry.
	 *
	 * @param index
	 *            subscript of the data block to search (all entries that start
	 *            with the same character form one block)
	 * @param res
	 *            the word with its first character removed
	 * @param handle
	 *            handle the entry must carry, or -1 to accept any handle
	 * @return position of the entry inside the block, or -1 if there is none.
	 *         With {@code handle == -1} the search walks back from the hit
	 *         while {@code res.compareTo(...)} still reports equality and
	 *         returns the position just after where that walk stopped.
	 */
	public int findInOriginalTable(int index, String res, int handle) {
		if (res == null || wts == null)
			return -1;

		WordTable table = wts.get(index);
		if (table == null || table.getCount() <= 0)
			return -1;

		ArrayList<WordItem> entries = table.getWords();
		int low = 0;
		int high = table.getCount() - 1;

		while (low <= high) {
			int probe = (low + high) / 2;
			WordItem candidate = entries.get(probe);
			int order = GFString.compareTo(candidate.getWord(), res);

			if (order < 0) {
				low = probe + 1;
			} else if (order > 0) {
				high = probe - 1;
			} else if (handle == -1) {
				// Any handle will do: rewind over the preceding equal words.
				int first = probe;
				while (first >= 0 && res.compareTo(entries.get(first).getWord()) == 0) {
					first--;
				}
				return first + 1;
			} else if (candidate.getHandle() == handle) {
				return probe;
			} else if (candidate.getHandle() < handle) {
				// Same word, smaller handle: the wanted entry lies further right.
				low = probe + 1;
			} else {
				high = probe - 1;
			}
		}
		return -1;
	}

	/**
	 * Looks a word up in the modification table and returns its position.
	 * <p>
	 * The block is scanned from the front and the first entry whose word
	 * equals {@code res} (and whose handle matches, unless {@code handle} is
	 * negative) wins. No ordering of the block is assumed.
	 *
	 * @param index
	 *            subscript of the block inside {@code mts}
	 * @param res
	 *            the word with its first character removed
	 * @param handle
	 *            handle the entry must carry; any negative value accepts every
	 *            handle
	 * @return position of the first matching entry, or -1 when there is no
	 *         modification table, {@code index} is out of range, or nothing
	 *         matches
	 */
	protected int findInModifyTable(int index, String res, int handle) {
		// index can be -1 for words that map to no block; reject it here
		// instead of letting mts.get(index) throw.
		if (res == null || mts == null || index < 0 || index >= mts.size())
			return -1;
		if (mts.get(index) == null)
			return -1;

		ArrayList<WordItem> wis = mts.get(index).getWords();
		if (wis == null)
			return -1;

		// The previous loop only ever executed 'continue', so it always ran to
		// the end of the list and the method could never report a hit.
		for (int i = 0; i < wis.size(); i++) {
			WordItem wi = wis.get(i);
			if (wi != null && strEqual(wi.getWord(), res) && (handle < 0 || wi.getHandle() == handle))
				return i;
		}
		return -1;
	}

	// TODO
	/**
	 * Null-tolerant string equality.
	 *
	 * @param b1
	 *            first string, may be {@code null}
	 * @param b2
	 *            second string, may be {@code null}
	 * @return {@code true} when both are {@code null} or both are non-null
	 *         and {@code b1.equals(b2)}
	 */
	public boolean strEqual(String b1, String b2) {
		if (b1 == null)
			return b2 == null;

		return b2 != null && b1.equals(b2);
	}

	/**
	 * Classifies a word as Chinese, delimiter, or other.
	 *
	 * @param word
	 *            the word to classify, may be {@code null}
	 * @return {@code Utility.WT_CHINESE} when the word is non-empty, its
	 *         character type is {@code Utility.CT_CHINESE} and
	 *         {@code GFString.isAllChinese} accepts it;
	 *         {@code Utility.WT_DELIMITER} when it is non-empty and its
	 *         character type is {@code Utility.CT_DELIMITER};
	 *         {@code Utility.WT_OTHER} in every other case
	 */
	public int getWordType(String word) {
		if (word == null)
			return Utility.WT_OTHER;

		int charClass = Utility.charType(word);
		if (word.length() == 0)
			return Utility.WT_OTHER;

		if (charClass == Utility.CT_CHINESE && GFString.isAllChinese(word))
			return Utility.WT_CHINESE;

		if (charClass == Utility.CT_DELIMITER)
			return Utility.WT_DELIMITER;

		return Utility.WT_OTHER;
	}

	/**
	 * Normalises a word before lookup: spaces are removed via
	 * {@code GFString.removeSpace}, then the word is split into the block
	 * index and the remainder that is searched for inside that block.
	 * <ul>
	 * <li>Chinese word: index is {@code Utility.CC_ID(word)}, remainder is the
	 * word without its first character (empty string for a one-character
	 * word).</li>
	 * <li>Delimiter: index is 3755, remainder is the whole word.</li>
	 * <li>Anything else: index is -1 and no remainder is set; callers must not
	 * use that index as a table subscript.</li>
	 * </ul>
	 *
	 * @param word
	 *            the raw word, may be {@code null}
	 * @return the prepared word, or {@code null} when {@code word} is
	 *         {@code null}, empty, or empty once spaces are removed
	 */
	public Preword preProcessing(String word) {
		if (word == null || word.length() == 0)
			return null;

		String cleaned = GFString.removeSpace(word);
		if (cleaned == null || cleaned.length() == 0)
			return null;

		Preword result = new Preword();
		result.setWord(cleaned);

		// Classify the cleaned text. Classifying the raw input let a leading
		// space decide the type of an otherwise valid word.
		// NOTE(review): assumes removeSpace strips at least leading spaces - confirm.
		int type = Utility.charType(cleaned);

		if (type == Utility.CT_CHINESE) {
			result.setIndex(Utility.CC_ID(cleaned));
			result.setRes(cleaned.length() > 1 ? cleaned.substring(1) : "");
		} else if (type == Utility.CT_DELIMITER) {
			// 3755 is the block subscript this class uses for delimiters.
			result.setIndex(3755);
			result.setRes(cleaned);
		} else {
			// Not a dictionary word: mark it with an invalid index.
			result.setIndex(-1);
		}
		return result;
	}

	/**
	 * Unfinished placeholder.
	 * <p>
	 * NOTE(review): as written this replaces {@code mts} with a new empty
	 * list (dropping any existing modification table), runs an empty loop
	 * over {@code Utility.CC_NUM} and reports failure. {@code handle} is not
	 * used.
	 *
	 * @param handle
	 *            currently ignored
	 * @return always {@code false}
	 */
	public boolean mergePOS(int handle) {
		mts = new ArrayList<ModifyTable>();

		for (int i = 0; i < Utility.CC_NUM; i++) {
			// TODO: per-block work is not implemented yet.

		}

		return false;
	}

	/**
	 * Finds the dictionary entry that best matches a word.
	 * <p>
	 * The original table block for the word's first character is tried first:
	 * an exact hit on the remainder (any handle) wins; failing that, the first
	 * entry whose word compares greater than the remainder is taken. If the
	 * original table yields nothing, the modification table is scanned for an
	 * entry whose word equals the remainder.
	 *
	 * @param word
	 *            the word to match, may be {@code null}
	 * @return a new {@code WordItem} whose word is the first character of the
	 *         cleaned input followed by the matched entry's word, with that
	 *         entry's length, handle and frequency; {@code null} when nothing
	 *         matches or the word cannot be pre-processed
	 */
	public WordItem getMaxMatch(String word) {
		if (word == null)
			return null;

		Preword pw = preProcessing(word);
		// Short-circuit evaluation is required here: preProcessing returns
		// null for empty input, and '&' would still dereference pw.
		if (pw == null || pw.getWord() == null || pw.getIndex() < 0)
			return null;

		int index = pw.getIndex();
		String res = pw.getRes();
		String firstChar = pw.getWord().substring(0, 1);

		// Resolve the original block up front so that a missing table is
		// detected before it is dereferenced.
		ArrayList<WordItem> originals = null;
		if (wts != null && wts.get(index) != null)
			originals = wts.get(index).getWords();

		if (originals != null) {
			int found = findInOriginalTable(index, res, -1);
			if (found < 0) {
				// No exact hit: fall back to the first entry ordered after res.
				// Any positive result means "greater", not only the value 1.
				for (int j = 0; j < originals.size(); j++) {
					if (GFString.compareTo(originals.get(j).getWord(), res) > 0) {
						found = j;
						break;
					}
				}
			}
			if (found >= 0) {
				WordItem wi = originals.get(found);
				return new WordItem(firstChar + wi.getWord(), wi.getLen(), wi.getHandle(), wi.getFreq());
			}
		}

		// The modification table may be shorter than the block index.
		if (res != null && mts != null && index < mts.size() && mts.get(index) != null) {
			ArrayList<WordItem> modified = mts.get(index).getWords();
			if (modified != null) {
				for (WordItem wi : modified) {
					if (res.equals(wi.getWord()))
						return new WordItem(firstChar + wi.getWord(), wi.getLen(), wi.getHandle(), wi.getFreq());
				}
			}
		}
		return null;
	}

	/**
	 * Returns the frequency stored for a word/handle pair.
	 * <p>
	 * The original table is consulted first, then the modification table.
	 *
	 * @param word
	 *            the word to look up
	 * @param handle
	 *            handle the entry must carry; it is passed unchanged to both
	 *            table lookups, which treat -1 as "any handle"
	 * @return the entry's frequency, or 0 when the word is {@code null},
	 *         empty, maps to no table block, or is not found
	 */
	public int getFreq(String word, int handle) {
		if (word == null || word.length() == 0)
			return 0;

		Preword pw = preProcessing(word);
		// Index -1 marks a word that belongs to no block and must not be used
		// as a subscript.
		if (pw == null || pw.getIndex() < 0)
			return 0;

		int index = pw.getIndex();
		String res = pw.getRes();

		int found = findInOriginalTable(index, res, handle);
		if (found >= 0 && wts != null) {
			return wts.get(index).getWords().get(found).getFreq();
		}

		int found2 = findInModifyTable(index, res, handle);
		if (found2 >= 0 && mts != null) {
			// Read the modify-table position. 'found' is negative on this
			// path and used to be passed to get() by mistake.
			return mts.get(index).getWords().get(found2).getFreq();
		}
		return 0;
	}

	// ---------------------------------------------------------//
	// 暂时不会用到的方法
	/**
	 * Placeholder with no implementation; the section comment preceding this
	 * method marks it as not used for now.
	 *
	 * @return always {@code false}
	 */
	public boolean optimum() {
		return false;
	}

	/**
	 * Placeholder with no implementation; both arguments are ignored.
	 *
	 * @param dict2
	 *            unused
	 * @param nRatio
	 *            unused
	 * @return always {@code false}
	 */
	public boolean merge(Dictionary dict2, int nRatio) {
		return false;
	}

	/**
	 * Placeholder with no implementation; nothing is written.
	 *
	 * @param sFilename
	 *            unused
	 * @return always {@code false}
	 */
	public boolean outputChars(String sFilename) {
		return false;
	}

	/**
	 * Placeholder with no implementation; nothing is written.
	 *
	 * @param sFilename
	 *            unused
	 * @return always {@code false}
	 */
	public boolean output(String sFilename) {
		return false;
	}

	/**
	 * Placeholder with no implementation; both arguments are ignored.
	 *
	 * @param nPOS
	 *            unused
	 * @param sPOSRet
	 *            unused
	 * @return always {@code false}
	 */
	public boolean getPOSString(int nPOS, String sPOSRet) {
		return false;
	}

	/**
	 * Placeholder with no implementation; the argument is ignored.
	 *
	 * @param sPOS
	 *            unused
	 * @return always 0
	 */
	public int getPOSValue(byte[] sPOS) {
		return 0;
	}

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -