⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 cjkknife.java

📁 一个用于搜索分词的项目
💻 JAVA
字号:
/*
 * 本代码所有权归作者所有 但在保持源代码不被破坏以及所有人署名的基础上 任何人可自由无限使用
 */
package com.sohospace.paoding.cjk;

import com.sohospace.dictionary.Dictionary;
import com.sohospace.dictionary.Hit;
import com.sohospace.paoding.CharSet;
import com.sohospace.paoding.Collector;
import com.sohospace.paoding.Knife;

/**
 * 
 * @author zhiliang.wang@yahoo.com.cn
 * 
 * @since 1.0
 * 
 */
public class CJKKnife implements Knife {

	// -------------------------------------------------


	private CJKDictionaryFactory factory;

	// -------------------------------------------------


	public CJKKnife() {
	}

	public CJKKnife(CJKDictionaryFactory factory) {
		this.factory = factory;
	}

	// -------------------------------------------------


	public CJKDictionaryFactory getFactory() {
		return factory;
	}

	public void setFactory(CJKDictionaryFactory factory) {
		this.factory = factory;
	}


	// -------------------------------------------------

	public boolean assignable(CharSequence beaf, int index) {
		return CharSet.isCjkUnifiedIdeographs(beaf.charAt(index));
	}
	
	public int dissect(Collector collector, CharSequence beaf, int offset) {
		if (CharSet.isCjkUnifiedIdeographs(beaf.charAt(beaf.length() - 1))
				 && offset > 0 && beaf.length() - offset < 50){
			return -offset;
		}
		Dictionary vocabulary = factory.getVocabulary();
		/* 例句:王崇浩住在北京积水潭桥附近 */
		// setup和end用于规定其之间的文字是否为词典词语
		int setup, end;
		// 为unidentifiedIndex服务,为已找出的词语结束位置的最大者,e.g '在','京','桥','近'
		int identifiedEnd = offset;
		// 用于定位未能分词的块的开始位置,e.g '王'
		int unidentifiedIndex = -1;
		//用于辅助判断是否调用shouldAWord()方法
		int maxWordLength = 0;
		Hit word = null;
		for (setup = offset, end = offset; setup < beaf.length()
				&& CharSet.isCjkUnifiedIdeographs(beaf.charAt(setup)); end = ++setup) {
			for (int count = 1; end < beaf.length()
					&& CharSet.isCjkUnifiedIdeographs(beaf.charAt(end++)); count++) {
				//第一次for循环时,end=setup+1
				word = vocabulary.search(beaf, setup, count);
				if (word.isUndefined()) {
					if (unidentifiedIndex < 0 && setup >= identifiedEnd) {
						unidentifiedIndex = setup;
					}
					break;
				} else if (word.isHit()) {
					if (identifiedEnd < end) {
						identifiedEnd = end;
					}
					if (unidentifiedIndex >= 0) {
						dissectUnidentified(collector, beaf, unidentifiedIndex,
								setup - unidentifiedIndex);
						unidentifiedIndex = -1;
					}
					collector.collect(word.getWord(), setup, end);
					if (setup == offset && maxWordLength < count) {
						maxWordLength = count;
					}
					if (!(word.isUnclosed() && end < beaf.length()// 这个判断是为下面判断服务
					&& beaf.charAt(end) >= word.getNext().charAt(count))) {
						break;
					}
				}
			}
		}
		if (identifiedEnd != end) {
			dissectUnidentified(collector, beaf, identifiedEnd,
					end - identifiedEnd);
		}
		int len = end - offset;
		if (len > 2 && len != maxWordLength && shouldAWord(beaf, offset, end)) {
			collect(collector, beaf, offset, end);
		}
		return setup;//此时end=start
	}

	// -------------------------------------------------


	/**
	 * 对非词汇表中的字词分词
	 * 
	 * @param cellector
	 * @param beaf
	 * @param offset
	 * @param count
	 */
	protected void dissectUnidentified(Collector collector, CharSequence beaf,
			int offset, int count) {
		int end = offset + count;
		Hit word = null;
		int nearEnd = end - 1;
		for (int i = offset, j; i < end;) {
			j = skipXword(beaf, i, end);
			if (j >= 0 && i != j) {
				i = j;
				continue;
			}
			j = collectNumber(collector, beaf, i, end);
			if (j >= 0 && i != j) {
				i = j;
				continue;
			}
			word = factory.getXchars().search(beaf, i, 1);
			if (word.isHit()) {
				i++;
				continue;
			}
			// 头字
			if (i == offset) {
				// 百度门事件=百度+门+...!=百度+门事+...
				collect(collector, beaf, offset, offset + 1);
			}
			// 尾字
			if (i == nearEnd) {
				if (nearEnd != offset) {
					collect(collector, beaf, nearEnd, end);
				}
			}
			// 穷尽二元分词
			else {
				collect(collector, beaf, i, i + 2);
			}
			i++;
		}
	}

	protected boolean shouldAWord(CharSequence beaf, int offset, int end) {
		if (offset > 0 && end < beaf.length()) {//确保前有字符,后也有字符
			int prev = offset - 1;
			if (beaf.charAt(prev) == '“' && beaf.charAt(end) == '”') {
				return true;
			} else if (beaf.charAt(prev) == '‘' && beaf.charAt(end) == '’') {
				return true;
			} else if (beaf.charAt(prev) == '\'' && beaf.charAt(end) == '\'') {
				return true;
			} else if (beaf.charAt(prev) == '\"' && beaf.charAt(end) == '\"') {
				return true;
			}
		}
		return false;
	}

	private final void collect(Collector collector, CharSequence beaf,
			int offset, int end) {
		collector.collect(beaf.subSequence(offset, end).toString(), offset, end);
	}

	private final int skipXword(CharSequence beaf, int offset, int end) {
		Hit word;
		for (int k = offset + 2; k <= end; k++) {
			word = factory.getXwords().search(beaf, offset, k - offset);
			if (word.isHit()) {
				offset = k;
			}
			if (word.isUndefined() || !word.isUnclosed()) {
				break;
			}
		}
		return offset;
	}

	private final int collectNumber(Collector collector, CharSequence beaf,
			int offset, int end) {
		int number1 = -1;
		int number2 = -1;
		int cur = offset;
		int bitValue = 0;
		int maxUnit = 0;
		boolean hasDigit = false;// 作用:去除没有数字只有单位的汉字,如“万”,“千”
		for (; cur <= end && (bitValue = toNumber(beaf.charAt(cur))) >= 0; cur++) {
			if (bitValue == 2
					&& (beaf.charAt(cur) == '两' || beaf.charAt(cur) == '俩' || beaf
							.charAt(cur) == '倆')) {
				if (cur != offset)
					break;
			}
			if (bitValue >= 0 && bitValue < 10) {
				hasDigit = true;
				if (number2 < 0)
					number2 = bitValue;
				else {
					number2 *= 10;
					number2 += bitValue;
				}
			} else {
				if (number2 < 0) {
					if (number1 < 0) {
						number1 = 1;
					}
					number1 *= bitValue;
				} else {
					if (number1 < 0) {
						number1 = 0;
					}
					if (bitValue >= maxUnit) {
						number1 += number2;
						number1 *= bitValue;
						maxUnit = bitValue;
					} else {
						number1 += number2 * bitValue;
					}
				}
				number2 = -1;
			}
		}
		if (!hasDigit && cur < beaf.length()
				&& !factory.getUnits().search(beaf, cur, 1).isHit()) {
			return offset;
		}
		if (number2 > 0) {
			if (number1 < 0) {
				number1 = number2;
			} else {
				number1 += number2;
			}
		}
		if (number1 >= 0) {
			collector.collect(String.valueOf(number1), offset, cur);

			// 后面可能跟了计量单位
			Hit wd;
			int i = cur + 1;
			while (i <= beaf.length()
					&& (wd = factory.getUnits().search(beaf, cur, i - cur))
							.isHit()) {
				collector.collect(String.valueOf(number1)
						+ beaf.subSequence(cur, i), offset, i);
				cur++;
				if (!wd.isUnclosed()) {
					break;
				}
				i++;
			}
		}
		return cur;
	}

	private final int toNumber(char c) {
		switch (c) {
		case '零':
		case '〇':
			return 0;
		case '一':
		case '壹':
			return 1;
		case '二':
		case '两':
		case '俩':
		case '貳':
			return 2;
		case '三':
		case '叁':
			return 3;
		case '四':
		case '肆':
			return 4;
		case '五':
		case '伍':
			return 5;
		case '六':
		case '陸':
			return 6;
		case '柒':
		case '七':
			return 7;
		case '捌':
		case '八':
			return 8;
		case '九':
		case '玖':
			return 9;
		case '十':
		case '什':
			return 10;
		case '百':
		case '佰':
			return 100;
		case '千':
		case '仟':
			return 1000;
		case '万':
		case '萬':
			return 10000;
		case '亿':
		case '億':
			return 100000000;
		default:
			return -1;
		}
	}


}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -