postagger.java

来自「基于java语言的分词系统」· Java 代码 · 共 746 行 · 第 1/2 页
JAVA
746 行
						sn.addPos(new POS(2, freq));
						freq = (double) 1 / (double) (context.getFreq(0, 3) * 2 + 1);
						sn.addPos(new POS(3, freq));
						freq = (double) 1 / (double) (context.getFreq(0, 12) * 2 + 1);
						sn.addPos(new POS(12, freq));
						freq = (double) 1 / (double) (context.getFreq(0, 13) * 2 + 1);
						sn.addPos(new POS(13, freq));
					}
					freq = (double) 1 / (double) (context.getFreq(0, 41) * 8);
					sn.addPos(new POS(41, freq));
					freq = (double) 1 / (double) (context.getFreq(0, 42) * 8);
					sn.addPos(new POS(42, freq));
					freq = (double) 1 / (double) (context.getFreq(0, 43) * 8);
					sn.addPos(new POS(43, freq));
				} else if (sn.getLen() >= 4) {
					freq = (double) 1 / (double) (context.getFreq(0, 41) * 8);
					sn.addPos(new POS(41, freq));
					freq = (double) 1 / (double) (context.getFreq(0, 42) * 8);
					sn.addPos(new POS(42, freq));
					freq = (double) 1 / (double) (context.getFreq(0, 43) * 8);
					sn.addPos(new POS(43, freq));
				} else if (sn.getLen() == 2) {
					charType = Utility.charType(word);
					if (charType == Utility.CT_OTHER || charType == Utility.CT_CHINESE) {
						freq = (double) 1 / (double) (context.getFreq(0, 1) * 2 + 1);
						sn.addPos(new POS(1, freq));
						freq = (double) 1 / (double) (context.getFreq(0, 2) * 2 + 1);
						sn.addPos(new POS(2, freq));
						freq = (double) 1 / (double) (context.getFreq(0, 3) * 2 + 1);
						sn.addPos(new POS(3, freq));
						freq = (double) 1 / (double) (context.getFreq(0, 30) * 8 + 1);
						sn.addPos(new POS(30, freq));
						freq = (double) 1 / (double) (context.getFreq(0, 11) * 4 + 1);
						sn.addPos(new POS(11, freq));
						freq = (double) 1 / (double) (context.getFreq(0, 12) * 4 + 1);
						sn.addPos(new POS(12, freq));
						freq = (double) 1 / (double) (context.getFreq(0, 13) * 4 + 1);
						sn.addPos(new POS(13, freq));
						freq = (double) 1 / (double) (context.getFreq(0, 21) * 2 + 1);
						sn.addPos(new POS(21, freq));
						freq = (double) 1 / (double) (context.getFreq(0, 22) * 2 + 1);
						sn.addPos(new POS(22, freq));
						freq = (double) 1 / (double) (context.getFreq(0, 23) * 2 + 1);
						sn.addPos(new POS(23, freq));
					}
					freq = (double) 1 / (double) (context.getFreq(0, 41) * 8);
					sn.addPos(new POS(41, freq));
					freq = (double) 1 / (double) (context.getFreq(0, 42) * 8);
					sn.addPos(new POS(42, freq));
					freq = (double) 1 / (double) (context.getFreq(0, 43) * 8);
					sn.addPos(new POS(43, freq));
				}
				break;
			default:
				break;
			}
			if (sn.getAllPos() != null)
				result = sn.getAllPos().size();
		}
		return result;
	}

	/**
	 * Person-name pattern matching.
	 * 
	 * <p>
	 * The tagged node list is converted by {@code word2pattern} into a string with
	 * one letter per node ({@code 'A' + tag}); the string is then scanned from
	 * index 1 for the role patterns below. Each match is inserted into
	 * {@code segGraph} as a new unknown-word node.
	 * 
	 * <pre>
	 *          Pattern statistics (pattern, count, ratio):
	 *          BBCD 343 0.003606 
	 *          BBC 2 0.000021 
	 *          BBE 125 0.001314 
	 *          BBZ 30 0.000315 
	 *          BCD 62460 0.656624 
	 *          BEE 0 0.000000 
	 *          BE 13899 0.146116 
	 *          BG 869 0.009136 
	 *          BXD 4 0.000042 
	 *          BZ 3707 0.038971 
	 *          CD 8596 0.090367 
	 *          EE 26 0.000273 
	 *          FB 871 0.009157 
	 *          Y 3265 0.034324
	 *          XD 926 0.009735
	 *          
	 *          The person recognition patterns set
	 *          BBCD: surname + surname + given-name char 1 + given-name char 2;
	 *          BBE:  surname + surname + single-char given name;
	 *          BBZ:  surname + surname + two-char given name that forms a word;
	 *          BCD:  surname + given-name char 1 + given-name char 2;
	 *          BE:   surname + single-char given name;
	 *          BEE:  surname + single-char name + single-char name (e.g. 韩磊磊);
	 *          BG:   surname + suffix;
	 *          BXD:  surname + (surname and first given-name char forming a word) + last given-name char;
	 *          BZ:   surname + two-char given name that forms a word;
	 *          B:    surname;
	 *          CD:   given-name char 1 + given-name char 2;
	 *          EE:   single-char name + single-char name;
	 *          FB:   prefix + surname;
	 *          XD:   (surname and first given-name char forming a word) + last given-name char;
	 *          Y:    surname and single-char given name forming a word
	 * </pre>
	 * 
	 * @param segGraph graph that receives the recognized name nodes; nothing is
	 *                 done when it is {@code null}
	 * @param sns      tagged segmentation nodes; nothing is done when it is
	 *                 {@code null}
	 */
	private void personRecognize(SegGraph segGraph, ArrayList<SegNode> sns) {
		// Person-name patterns, tried in this order; the empty string terminates the list.
		final String[] patterns = { "BBCD", "BBC", "BBE", "BBZ", "BCD", "BEE", "BE", "BG", "BXD", "BZ", "CDCD", "CD",
				"EE", "FB", "Y", "XD", "" };
		// Weight of each pattern, index-aligned with patterns[].
		final double[] factor = { 0.003606, 0.000021, 0.001314, 0.000315, 0.656624, 0.000021, 0.146116, 0.009136,
				0.000042, 0.038971, 0, 0.090367, 0.000273, 0.009157, 0.034324, 0.009735, 0 };
		final String dot = "·";

		if (segGraph == null || sns == null)
			return;

		final String sPos = word2pattern(sns);
		if (sPos == null)
			return;

		int j = 1;
		while (j < sPos.length()) {
			boolean bMatched = false;
			for (int k = 0; !bMatched && patterns[k].length() > 0; k++) {
				final String pattern = patterns[k];
				final int len = pattern.length();

				// The pattern must start exactly at position j.
				if (!sPos.startsWith(pattern, j))
					continue;
				// A candidate is rejected when the word right before it is a middle dot ...
				if (dot.equals(sns.get(j - 1).getWord()))
					continue;
				// ... or when the word right after it is one. The word (not the node itself)
				// has to be compared, and a match that reaches the end of the sentence has no
				// following word at all.
				final int after = j + len;
				if (after < sns.size() && dot.equals(sns.get(after).getWord()))
					continue;

				// Rule 1 for exclusion: prefix + surname + given name (char 1 / char 2) makes
				// the (prefix + surname) rule void.
				if ("FB".equals(pattern) && j + 2 < sPos.length()) {
					final char next = sPos.charAt(j + 2);
					if (next == 'E' || next == 'C' || next == 'G')
						continue;
				}

				// Get the possible person name
				final StringBuilder nameBuf = new StringBuilder();
				for (int nPos = j; nPos < j + len; nPos++) {
					SegNode sn = sns.get(nPos);
					if (sn.getPos() < 4
							&& unknownDict.getFreq(sn.getWord(), sn.getPos()) < Utility.LITTLE_FREQUENCY)
						nameBuf.append(sn.getWord());
				}
				final String personName = nameBuf.toString();

				// "CDCD" never yields a node: it only skips ahead when the candidate contains
				// foreign-name characters, then the remaining patterns are tried.
				if ("CDCD".equals(pattern)) {
					if (GetForeignCharCount(personName) > 0)
						j += len - 1;
					continue;
				}

				SegNode usn = new SegNode();
				usn.setRow(sns.get(j).getRow());
				usn.setCol(sns.get(j + len - 1).getCol());
				usn.setWord(unknownFlags);
				usn.setSrcWord(personName);
				double value = -Math.log(factor[k]) + computePossibility(j, len, sns);
				usn.setPos(pos);
				usn.setValue(value);
				segGraph.insert(usn, true);

				j += len;
				bMatched = true;
			}
			if (!bMatched)// Not matched, add j by 1
				j += 1;
		}
	}

	// TODO: not implemented yet.
	// Placeholder that always returns 0, so the "CDCD" skip in personRecognize
	// (taken only when this returns a value > 0) is currently never taken.
	// NOTE(review): presumably meant to count foreign-name (transliteration)
	// characters in personName -- confirm before implementing.
	private int GetForeignCharCount(String personName) {
		return 0;
	}

	/**
	 * Place-name pattern matching.
	 * 
	 * <p>
	 * Every node from index 1 on whose best tag is 1 or 2 triggers a candidate. The
	 * candidate is extended over the following nodes while their best tag is 2 or 3
	 * (and also 1 when the trigger tag was 1); nodes without a best tag are skipped
	 * without contributing text, any other tag ends the run. The concatenated source
	 * words are inserted into {@code segGraph} as one unknown-word node.
	 * 
	 * <p>
	 * The penalty counter of the earlier version was accumulated but never read, so
	 * it has been dropped.
	 * 
	 * @param segGraph graph that receives the recognized nodes
	 * @param sns      tagged segmentation nodes
	 * @param coreDict only checked for {@code null}; not consulted otherwise
	 */
	private void placeRecognize(SegGraph segGraph, ArrayList<SegNode> sns, Dictionary coreDict) {
		if (segGraph == null || sns == null || coreDict == null)
			return;

		for (int i = 1; i < sns.size(); i++) {
			final int triggerTag = getBestTag(sns, i);
			if (triggerTag != 1 && triggerTag != 2)
				continue;

			final int start = i;
			String srcWord = sns.get(i).getSrcWord();
			int end;
			for (end = i + 1; end < sns.size(); end++) {
				int bestTag = getBestTag(sns, end);
				if (bestTag == -1)
					continue;
				// Tag 1 may only continue a run that was itself started by tag 1.
				boolean extendsRun = bestTag == 2 || bestTag == 3 || (bestTag == 1 && triggerTag == 1);
				if (!extendsRun)
					break;
				srcWord += sns.get(end).getSrcWord();
			}

			SegNode newsn = new SegNode();
			newsn.setRow(sns.get(start).getRow());
			newsn.setCol(sns.get(end - 1).getCol());
			newsn.setPos(pos);
			newsn.setWord(unknownFlags);
			newsn.setSrcWord(srcWord);
			// The scored window includes the node following the run; when the run reaches
			// the end of the list there is no such node, so the window is clamped instead
			// of indexing past the last element.
			int length = Math.min(end - start + 1, sns.size() - start);
			double value = computePossibility(start, length, sns);
			newsn.setValue(value);
			segGraph.insert(newsn, true);
		}
	}

	/**
	 * Returns the best tag of the node at {@code index}.
	 * 
	 * @param sns   node list, may be {@code null}
	 * @param index position of the node to inspect
	 * @return the best tag, or -1 when the list is {@code null}, the index is out
	 *         of range, or the node has no best tag
	 */
	private int getBestTag(ArrayList<SegNode> sns, int index) {
		boolean unavailable = sns == null || index < 0 || index >= sns.size();
		return unavailable ? -1 : getBestTag(sns.get(index));
	}

	/**
	 * Returns the tag of the first POS entry of {@code sn} that is flagged as best.
	 * 
	 * @param sn node to inspect, may be {@code null}
	 * @return the tag of the best POS entry, or -1 when the node is {@code null},
	 *         has no POS list, or none of its entries is flagged as best
	 */
	private int getBestTag(SegNode sn) {
		if (sn == null)
			return -1;

		ArrayList<POS> candidates = sn.getAllPos();
		if (candidates == null)
			return -1;

		for (POS candidate : candidates) {
			if (candidate.isBest())
				return candidate.getTag();
		}
		return -1;
	}

	/**
	 * Judges whether a two-character name is a two-character given name rather than
	 * a surname followed by a single-character given name.
	 * 
	 * <p>
	 * The length test counts characters instead of bytes of the platform default
	 * encoding: a byte count of 4 only identifies two Chinese characters under a
	 * double-byte charset, and would reject every such name on a UTF-8 platform.
	 * 
	 * @param sName candidate name
	 * @return {@code true} when the given-name score (tags 2 then 3) is strictly
	 *         greater than the surname + single-name score (tags 1 then 4);
	 *         {@code false} for {@code null} or for input that is not exactly two
	 *         non-ASCII characters
	 */
	public boolean isGivenName(String sName) {
		if (sName == null)
			return false;
		// Exactly two characters, both outside ASCII (the charset-independent
		// equivalent of "4 bytes in a double-byte encoding").
		if (sName.length() != 2 || sName.charAt(0) <= 0x7F || sName.charAt(1) <= 0x7F)
			return false;

		String firstChar = sName.substring(0, 1);
		String secondChar = sName.substring(1);

		// given Name Possibility
		double gnp = 0;
		// The possibility of P(Wi|Ti)
		gnp += Math.log((double) unknownDict.getFreq(firstChar, 2) + 1.0);
		gnp -= Math.log(context.getFreq(0, 2) + 1.0);
		gnp += Math.log((double) unknownDict.getFreq(secondChar, 3) + 1.0);
		gnp -= Math.log(context.getFreq(0, 3) + 1.0);
		// The possibility of conversion from 2 to 3
		gnp += Math.log(context.getPossibility(0, 2, 3) + 1.0);
		gnp -= Math.log(context.getFreq(0, 2) + 1.0);

		// singleNamePossibility
		double snp = 0;
		// The possibility of P(Wi|Ti)
		snp += Math.log((double) unknownDict.getFreq(firstChar, 1) + 1.0);
		snp -= Math.log(context.getFreq(0, 1) + 1.0);
		snp += Math.log((double) unknownDict.getFreq(secondChar, 4) + 1.0);
		snp -= Math.log(context.getFreq(0, 4) + 1.0);
		// The possibility of conversion from 1 to 4
		snp += Math.log(context.getPossibility(0, 1, 4) + 1.0);
		snp -= Math.log(context.getFreq(0, 1) + 1.0);

		// The possibility being a single given name is more than being a
		// 2-char given name
		if (snp >= gnp)
			return false;
		return true;
	}

	/**
	 * Converts the node list produced by the first segmentation pass into the
	 * pattern string used for person-name matching: one character per node,
	 * computed as {@code 'A' + bestTag} (a node without a best tag yields
	 * {@code '@'}, i.e. {@code 'A' - 1}).
	 * 
	 * @param sns node list, may be {@code null}
	 * @return the pattern string, or {@code null} when {@code sns} is {@code null}
	 */
	private String word2pattern(ArrayList<SegNode> sns) {
		if (sns == null)
			return null;

		StringBuilder pattern = new StringBuilder(sns.size());
		for (SegNode node : sns) {
			pattern.append((char) (getBestTag(node) + 'A'));
		}
		return pattern.toString();
	}
 
	/**
	 * Marks the best part of speech on every node.
	 * 
	 * <p>
	 * The list is walked from the last node to the first. On each node the POS
	 * entry at the current back-pointer index is flagged as best and its
	 * {@code getPrev()} value becomes the index used for the preceding node. A node
	 * that has no entry at that index receives a new best entry carrying the best
	 * tag of its successor, unless it is one of the last two nodes.
	 * 
	 * @param sns node list to annotate in place; ignored when {@code null}
	 */
	private void tagBest(ArrayList<SegNode> sns) {
		if (sns == null)
			return;

		final int count = sns.size();
		int bestIndex = 0;

		for (int idx = count - 1; idx >= 0; idx--) {
			SegNode node = sns.get(idx);
			ArrayList<POS> candidates = node.getAllPos();
			boolean hasEntry = candidates != null && candidates.size() > bestIndex;

			if (hasEntry) {
				POS chosen = candidates.get(bestIndex);
				chosen.setBest(true);
				bestIndex = chosen.getPrev();
			} else if (idx + 2 < count) {
				// No usable entry: inherit the best tag of the following node.
				POS inherited = new POS(getBestTag(sns.get(idx + 1)), 0);
				inherited.setBest(true);
				node.addPos(inherited);
			}
		}

		// Drop the trailing node when it has no word: it is only needed to obtain the
		// best tag of the final end-of-sentence marker word.
		if (count > 1 && sns.get(count - 1).getWord() == null) {
			sns.remove(count - 1);
		}
	}

	/**
	 * Sums, over a window of nodes, {@code log(contextFreq + 1) - log(wordFreq + 1)}
	 * for every node that has a best tag.
	 * 
	 * <p>
	 * The window is clamped to the end of the list, so a caller asking for one node
	 * past the last element no longer triggers an {@code IndexOutOfBoundsException}.
	 * 
	 * @param startPos index of the first node of the window
	 * @param length   number of nodes in the window
	 * @param sns      node list
	 * @return the accumulated value, or 0 when {@code sns}, {@code unknownDict} or
	 *         {@code context} is {@code null}
	 */
	private double computePossibility(int startPos, int length, ArrayList<SegNode> sns) {
		double retValue = 0;

		if (sns == null || unknownDict == null || context == null)
			return retValue;

		final int limit = Math.min(startPos + length, sns.size());
		for (int i = startPos; i < limit; i++) {
			SegNode sn = sns.get(i);
			int bestTag = getBestTag(sn);
			if (bestTag == -1)
				continue;

			int freq = unknownDict.getFreq(sn.getSrcWord(), bestTag);
			// NOTE(review): the context frequency is looked up with sn.getPos() while the
			// word frequency uses bestTag -- confirm the two are meant to differ.
			double posPoss = Math.log((double) (context.getFreq(0, sn.getPos()) + 1));
			posPoss += -Math.log((double) (freq + 1));
			retValue += posPoss;
		}
		return retValue;
	}

	/**
	 * Returns the unknown-word dictionary held by this tagger.
	 * 
	 * @return the {@code unknownDict} field as-is (no copy is made)
	 */
	public Dictionary getUnknownDict() {
		return this.unknownDict;
	}

}
postagger.java - 源码说明

本页面展示了「基于java语言的分词系统」中的 postagger.java 源码文件，采用 Java 编程语言编写，共 746 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与java相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?