⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 segment.java

📁 基于中科院的ICTCLAS实现中文分词系统 开发工具是JAVA.经测试,效果很好
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
				wordResult[index].setValue(fValue);
				// Utility.insertGraph(optGraph,sg,false);
			}
		}

		ArrayList<WordResult> result = new ArrayList<WordResult>();
		if (wordResult != null)
			for (WordResult wr : wordResult)
				if (wr != null)
					result.add(wr);
		return result;
	}

	/**
	 * Collects the graph elements that lie along a path of atom indices and
	 * stores on each one the text it spans.
	 * <p>
	 * Each entry of {@code unipath} is paired with its successor; the last entry
	 * is paired with {@code -1}. For every pair the element is looked up with
	 * {@code Utility.getElement(sgs, cur, next)}; pairs for which the lookup
	 * returns {@code null} are skipped. A found element gets its word set to
	 * {@code getWords(atoms, cur, next)} (the element itself is modified) and is
	 * appended to the returned list.
	 *
	 * @param atoms   atoms used to build the text of each element
	 * @param sgs     graph elements to search
	 * @param unipath indices forming the path
	 * @return the elements found along the path, in path order, or {@code null}
	 *         if any argument is {@code null}
	 */
	private ArrayList<SegGraph> getSegPath(ArrayList<Atom> atoms, ArrayList<SegGraph> sgs, int[] unipath) {
		if (atoms == null || sgs == null || unipath == null)
			return null;

		ArrayList<SegGraph> segments = new ArrayList<SegGraph>();
		int lastPos = unipath.length - 1;
		for (int pos = 0; pos <= lastPos; pos++) {
			int from = unipath[pos];
			// The final path entry has no successor; -1 marks "no next index".
			int to = pos < lastPos ? unipath[pos + 1] : -1;

			SegGraph node = Utility.getElement(sgs, from, to);
			if (node == null)
				continue;

			node.setWord(getWords(atoms, from, to));
			segments.add(node);
		}
		return segments;
	}

	/**
	 * Concatenates the words of the atoms in the half-open index range
	 * {@code [curIndex, nextIndex)}.
	 *
	 * @param atoms     atoms to read from
	 * @param curIndex  index of the first atom to include
	 * @param nextIndex index one past the last atom to include; {@code -1} means
	 *                  "up to the end of {@code atoms}"
	 * @return the concatenated text (empty when the range is empty), or
	 *         {@code null} if {@code atoms} is {@code null} or {@code curIndex}
	 *         is outside {@code [0, atoms.size())}
	 */
	private String getWords(ArrayList<Atom> atoms, int curIndex, int nextIndex) {
		if (atoms == null || curIndex < 0 || curIndex >= atoms.size())
			return null;

		int end = nextIndex == -1 ? atoms.size() : nextIndex;
		// StringBuilder avoids re-copying the accumulated text on every atom,
		// which repeated String concatenation in a loop would do.
		StringBuilder words = new StringBuilder();
		for (int i = curIndex; i < end; i++) {
			words.append(atoms.get(i).getWord());
		}
		return words.toString();
	}

	/**
	 * Post-processes a list of segmented words by applying a fixed chain of
	 * merge/split rules to each word and its one or two successors, and returns
	 * the outcome as a new list of newly created {@code WordResult} objects.
	 * <p>
	 * Rules are tried in order and at most one applies per position; a word that
	 * no rule consumes is copied unchanged. Look-ahead past the end of the list
	 * is treated as "rule does not match". One rule (sub-tags of {@code 'u'})
	 * rewrites the handle of the input element itself before it is copied.
	 * <p>
	 * Handles are compared as {@code highChar * 256 + lowChar}; e.g.
	 * {@code 28274 == 'n' * 256 + 'r'}. NOTE(review): these look like ICTCLAS
	 * part-of-speech tags (nr, ns, nt, nz, nx, m, a, v, n, u) - confirm against
	 * the tag table used by the rest of the project.
	 * <p>
	 * NOTE(review): the "length == 2" / "length == 4" tests use
	 * {@code String.getBytes()} with the platform default charset, so they only
	 * mean "one / two Chinese characters" under a two-byte encoding such as GBK.
	 *
	 * @param wrs the segmented words to adjust
	 * @return the adjusted words, or {@code null} if {@code wrs} is {@code null}
	 *         or empty
	 */
	public ArrayList<WordResult> adjust(ArrayList<WordResult> wrs) {
		if (wrs == null || wrs.size() == 0)
			return null;

		ArrayList<WordResult> result = new ArrayList<WordResult>();
		final int size = wrs.size();
		for (int i = 0; i < size; i++) {
			WordResult wr = wrs.get(i);
			// Look-ahead entries; null when the list ends first, so that rules
			// needing them simply do not match instead of throwing.
			WordResult next = i + 1 < size ? wrs.get(i + 1) : null;
			WordResult next2 = i + 2 < size ? wrs.get(i + 2) : null;
			// Must start out false for every word: a stale 'true' from an
			// earlier word would silently drop this one from the result.
			boolean isBeProcess = false;
			PersonName pname = null;

			// Rule1: split a word tagged 28274 ('n'*256+'r') into its name parts
			if (wr.getHandle() == 28274 && (pname = chineseNameSplit(wr.getWord(), unPerson.unDict)) != null
					&& !"叶利钦".equals(wr.getWord())) {
				if (pname.getFirstName() != null)
					result.add(adjustedWord(pname.getFirstName(), 28274));
				if (pname.getMidName() != null)
					result.add(adjustedWord(pname.getMidName(), 28274));
				if (pname.getLastName() != null)
					result.add(adjustedWord(pname.getLastName(), 28274));
				isBeProcess = true;
			}
			// Rule2 for overlap words ABB 一段段、一片片
			else if (wr.getHandle() == 27904 && next != null && next2 != null
					&& nativeByteLength(next.getWord()) == 2 && next.getWord().equals(next2.getWord())) {
				result.add(adjustedWord(wr.getWord() + next.getWord() + next2.getWord(), 27904));
				i += 2;
				isBeProcess = true;
			}
			// Rule3 for overlap words AA
			else if (nativeByteLength(wr.getWord()) == 2 && next != null && wr.getWord().equals(next.getWord())) {
				int handle = 24832; // 'a' * 256
				if (wr.getHandle() / 256 == 'v' || next.getHandle() / 256 == 'v')
					handle = 30208; // 'v' * 256
				if (wr.getHandle() / 256 == 'n' || next.getHandle() / 256 == 'n')
					handle = 'n' * 256;

				String word = wr.getWord() + next.getWord();
				i += 1;
				// AAB:洗/洗/脸、蒙蒙亮 - also absorb the word after the pair
				if (next2 != null && nativeByteLength(next2.getWord()) == 2) {
					if ((handle == 30208 && next2.getHandle() / 256 == 'n')
							|| (handle == 24832 && next2.getHandle() / 256 == 'a')) {
						word = word + next2.getWord();
						i += 1;
					}
				}
				result.add(adjustedWord(word, handle));
				isBeProcess = true;
			}
			// Rule 4: AAB 洗/洗澡
			else if (nativeByteLength(wr.getWord()) == 2 && (wr.getHandle() / 256 == 'v' || wr.getHandle() == 24832)
					&& next != null && nativeByteLength(next.getWord()) == 4
					&& next.getWord().indexOf(wr.getWord()) == 0) {
				int handle = 24832; // 'a' * 256
				if (wr.getHandle() / 256 == 'v' || next.getHandle() / 256 == 'v')
					handle = 30208; // 'v' * 256

				result.add(adjustedWord(wr.getWord() + next.getWord(), handle));
				i += 1;
				isBeProcess = true;
			}
			// uj,ud,uv,uz,ul,ug->u : collapse sub-tags of 'u' to plain 'u'.
			// The word is not consumed; it is copied below with the new handle.
			else if (wr.getHandle() / 256 == 'u' && wr.getHandle() % 256 != 0) {
				wr.setHandle('u' * 256);
			}
			// A + AB.. + C where the middle word starts with both neighbours
			else if (nativeByteLength(wr.getWord()) == 2 && next != null && next2 != null
					&& nativeByteLength(next.getWord()) == 4
					&& next.getWord().indexOf(wr.getWord()) == 0
					&& next.getWord().indexOf(next2.getWord()) == 0) {
				result.add(adjustedWord(wr.getWord() + next.getWord() + next2.getWord(), next.getHandle()));
				i += 2;
				isBeProcess = true;
			}
			// PostFix: a word tagged 28275 followed by a recognised suffix
			else if (wr.getHandle() == 28275) {
				if (next != null) {
					int handle = -1; // -1: no suffix rule matched
					if (unPlace.unDict.isExist(next.getWord(), 4))
						handle = 28275;
					else if (nativeByteLength(next.getWord()) == 2 && "队".equals(next.getWord()))
						handle = 28276;
					else if (nativeByteLength(next.getWord()) == 2 && "语文字杯".indexOf(next.getWord()) != -1)
						handle = 28282;
					else if ("裔".equals(next.getWord()))
						handle = 28160;

					if (handle != -1) {
						result.add(adjustedWord(wr.getWord() + next.getWord(), handle));
						i += 1;
						isBeProcess = true;
					}
				}
			}
			// v or n followed by "员"
			else if (wr.getHandle() == 30208 || wr.getHandle() == 28160) {
				if (next != null && "员".equals(next.getWord())) {
					result.add(adjustedWord(wr.getWord() + next.getWord(), 28160));
					i += 1;
					isBeProcess = true;
				}
			}
			// www/nx ./w sina/nx; EIM/nx -601/m : glue a run of 28280 words,
			// dots and all-numeric 27904 words onto a leading 28280 word
			else if (wr.getHandle() == 28280) {
				String word = wr.getWord();
				while (i + 1 < size) {
					WordResult follow = wrs.get(i + 1);
					boolean joins = follow.getHandle() == 28280 || "..".indexOf(follow.getWord()) != -1
							|| (follow.getHandle() == 27904 && Utility.isAllNum(follow.getWord()));
					if (!joins)
						break;
					word = word + follow.getWord();
					i += 1;
				}
				result.add(adjustedWord(word, 28280));
				isBeProcess = true;
			}

			// Not consumed by any rule: copy the word to the final result.
			if (!isBeProcess)
				result.add(adjustedWord(wr.getWord(), wr.getHandle()));
		}

		return result;
	}

	/** Creates a new WordResult carrying the given word and handle. */
	private WordResult adjustedWord(String word, int handle) {
		WordResult wr = new WordResult();
		wr.setWord(word);
		wr.setHandle(handle);
		return wr;
	}

	/** Returns the number of bytes of word in the platform default charset. */
	private int nativeByteLength(String word) {
		return word.getBytes().length;
	}

	/**
	 * Checks whether {@code word} can be split into a surname and a given name
	 * using {@code personDict}.
	 * <p>
	 * The word must be 2 to 4 characters long and every piece returned by
	 * {@code GFString.atomSplit} must have char type {@code CT_CHINESE} or
	 * {@code CT_OTHER}. A two-character surname is tried first, then a
	 * one-character one, then none. For four-character words, the character
	 * after the surname is treated as a second surname if the dictionary knows
	 * it. Frequency thresholds then reject unlikely names.
	 * <p>
	 * NOTE(review): {@code result} is never assigned anywhere in this method, so
	 * every path currently returns {@code null}, including the "accepted" one.
	 * Populating a {@code PersonName} from {@code surName} / {@code giveName}
	 * appears to be missing - confirm against the intended behaviour.
	 *
	 * @param word       candidate person name
	 * @param personDict dictionary queried with {@code isExist} and
	 *                   {@code getFrequency}
	 * @return currently always {@code null} (see note above)
	 */
	public PersonName chineseNameSplit(String word, Dictionary personDict) {
		PersonName result = null;

		if (word != null && personDict != null) {
			int len = word.length();
			if (len < 2 || len > 4)
				return null;
			String[] atoms = GFString.atomSplit(word);
			for (String s : atoms) {
				if (Utility.charType(s) != Utility.CT_CHINESE && Utility.charType(s) != Utility.CT_OTHER)
					return null;
			}

			// len is 2..4 from here on, so both prefixes below are in range.
			int surNameLen = 2;
			String surName = word.substring(0, surNameLen);
			if (!personDict.isExist(surName, 1)) {
				surNameLen = 1;
				surName = word.substring(0, surNameLen);
				if (!personDict.isExist(surName, 1)) {
					surName = null;
					surNameLen = 0;
				}
			}

			String giveName = word.substring(surNameLen);
			if (len > 3) {
				// Possible second surname directly after the first one.
				String second = word.substring(surNameLen, surNameLen + 1);
				if (personDict.isExist(second, 1)) {
					giveName = word.substring(surNameLen + 1);
				}
			}

			// NOTE(review): surName is null here when no surname matched;
			// whether getFrequency accepts null is not visible from this method.
			double freq = personDict.getFrequency(surName, 1);
			// giveName is empty when a two-character word is itself a
			// two-character surname; taking substring(0, 1) of it would throw.
			// freq2 is unused in that case because surNameLen == 2 below.
			double freq2 = 0;
			if (giveName.length() > 0)
				freq2 = personDict.getFrequency(giveName.substring(0, 1), 2);

			if (surNameLen != 2
					&& ((surNameLen == 0 && len > 2)
							|| giveName.length() > 2
							|| (getForeignCharCount(word) >= 3
									&& freq < personDict.getFrequency("张", 1) / 40
									&& freq2 < personDict.getFrequency("华", 2) / 20)
							|| (freq < 10 && getForeignCharCount(giveName) == (len - surNameLen) / 2)))
				return null;
			if (len == 2 && unPerson.isGivenName(word))
				return null;
		}
		return result;

	}

	/**
	 * Stub: always returns 0 and does not inspect {@code name}.
	 * <p>
	 * NOTE(review): judging by the name, this is meant to count foreign
	 * (transliteration) characters in {@code name}; no such counting is
	 * implemented here - confirm whether an implementation is missing.
	 *
	 * @param name the text to examine (currently ignored)
	 * @return always 0
	 */
	private int getForeignCharCount(String name) {

		// Placeholder result; the argument is not read.
		return 0;
	}

	/**
	 * Returns the current value of the {@code splitedWord} field.
	 *
	 * @return the value of {@code splitedWord}
	 */
	public String getResult() {
		return this.splitedWord;
	}

	/**
	 * Returns the current value of the {@code spendTime} field.
	 *
	 * @return the value of {@code spendTime}
	 */
	public long getSpendTime() {
		return this.spendTime;
	}

	/**
	 * Stores the given value in the {@code spendTime} field.
	 *
	 * @param spendTime the value to store
	 */
	public void setSpendTime(final long spendTime) {
		this.spendTime = spendTime;
	}

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -