📄 span.java

📁 基于java语言的分词系统
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
	 *          BBE 125 0.001314 
	 *          BBZ 30 0.000315 
	 *          BCD 62460 0.656624 
	 *          BEE 0 0.000000 
	 *          BE 13899 0.146116 
	 *          BG 869 0.009136 
	 *          BXD 4 0.000042 
	 *          BZ 3707 0.038971 
	 *          CD 8596 0.090367 
	 *          EE 26 0.000273 
	 *          FB 871 0.009157 
	 *          Y 3265 0.034324
	 *          XD 926 0.009735
	 *          
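	 *          The set of person-name recognition patterns:
	 *          BBCD: surname + surname + given-name char 1 + given-name char 2 (姓+姓+名1+名2);
	 *          BBE: surname + surname + single-char given name (姓+姓+单名);
	 *          BBZ: surname + surname + two-char given name that is itself a word (姓+姓+双名成词);
	 *          BCD: surname + given-name char 1 + given-name char 2 (姓+名1+名2);
	 *          BE: surname + single-char given name (姓+单名);
	 *          BEE: surname + single-char given name + single-char given name (姓+单名+单名), e.g. 韩磊磊;
	 *          BG: surname + suffix (姓+后缀);
	 *          BXD: surname + [surname and first given-name char forming a word] + last given-name char (姓+姓双名首字成词+双名末字);
	 *          BZ: surname + two-char given name that is itself a word (姓+双名成词);
	 *          B: surname (姓);
	 *          CD: given-name char 1 + given-name char 2 (名1+名2);
	 *          EE: single-char given name + single-char given name (单名+单名);
	 *          FB: prefix + surname (前缀+姓);
	 *          XD: [surname and first given-name char forming a word] + last given-name char (姓双名首字成词+双名末字);
	 *          Y: surname and single-char given name forming a word (姓单名成词)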
	 * </pre>
	 */
	public boolean PersonRecognize(Dictionary personDict) {
		// Scans the best tag sequence (m_nBestTag) for the person-name patterns
		// described in the Javadoc above. Every accepted match is recorded as an
		// unknown-word candidate: its start/end positions go to m_nUnknownWords
		// and its cost, -log(pattern factor) + ComputePossibility(...), goes to
		// m_dWordsPossibility; m_nUnknownIndex is advanced per recorded match.
		//
		// personDict: dictionary handed to ComputePossibility for the matched words.
		// Returns: always true.

		// Patterns are tried in array order and the first accepted match wins.
		// The empty pattern terminates the list.
		final String[] patterns = { "BBCD", "BBC", "BBE", "BBZ", "BCD", "BEE", "BE", "BG", "BXD", "BZ", "CDCD", "CD",
				"EE", "FB", "Y", "XD", "" };
		// Weight of each pattern, index-aligned with patterns[].
		final double[] factor = {
				// BBCD BBC BBE BBZ BCD BEE BE BG
				0.003606, 0.000021, 0.001314, 0.000315, 0.656624, 0.000021, 0.146116, 0.009136,
				// BXD BZ CDCD CD EE FB Y XD (terminator)
				0.000042, 0.038971, 0, 0.090367, 0.000273, 0.009157, 0.034324, 0.009735, 0 };

		// Encode the tag sequence as one character per position ('A' + tag).
		// Index 0 holds a placeholder so string offsets equal tag indices.
		final StringBuilder encoded = new StringBuilder("z");
		int i;
		for (i = 1; m_nBestTag[i] > -1; i++) {
			encoded.append((char) (m_nBestTag[i] + 'A'));
		}
		final String sPOS = encoded.toString();

		int j = 1; // current position in the tag sequence
		while (j < i) {
			boolean bMatched = false;
			for (int k = 0; !bMatched && patterns[k].length() > 0; k++) {
				final int len = patterns[k].length();

				// The pattern must start exactly at j, and neither the word before
				// the match nor the word right after it may be "·".
				if (!sPOS.startsWith(patterns[k], j) || "·".equals(m_sWords[j - 1])
						|| "·".equals(m_sWords[j + len])) {
					continue;
				}

				// Rule 1 for exclusion: prefix + surname followed by role E, C or G
				// is not accepted as a (prefix + surname) match. The look-ahead is
				// bounds-checked so a match at the very end of the sequence is safe.
				if ("FB".equals(patterns[k])) {
					final int next = j + len;
					if (next < sPOS.length()) {
						final char role = sPOS.charAt(next);
						if (role == 'E' || role == 'C' || role == 'G') {
							continue;
						}
					}
				}

				// CDCD is never recorded as a name. If the concatenated words
				// contain foreign characters, skip ahead so the following patterns
				// are tried from the last role of this match.
				if ("CDCD".equals(patterns[k])) {
					final StringBuilder name = new StringBuilder();
					for (int nPos = j; nPos < j + len; nPos++) {
						name.append(m_sWords[nPos]);
					}
					if (GetForeignCharCount(name.toString()) > 0) {
						j += len - 1;
					}
					continue;
				}

				// Record the candidate and jump past it.
				m_nUnknownWords[m_nUnknownIndex][0] = m_nWordPosition[j];
				m_nUnknownWords[m_nUnknownIndex][1] = m_nWordPosition[j + len];
				// Costs are negative logs, so multiplying by the pattern factor
				// becomes adding -log(factor).
				m_dWordsPossibility[m_nUnknownIndex] = -Math.log(factor[k]) + ComputePossibility(j, len, personDict);
				m_nUnknownIndex += 1;
				j += len;
				bMatched = true;
			}
			if (!bMatched) {
				// Nothing matched here; try the next position.
				j += 1;
			}
		}
		return true;
	}

	/**
	 * Proposes candidate tags for the word at {@code index}, driven by {@code tagType}.
	 *
	 * <p>Each candidate tag is stored in {@code m_nTags[index][n]} and paired with a weight in
	 * {@code m_dFrequency[index][n]} computed as {@code 1 / (context.getFreq(0, tag) * scale [+ 1])}.
	 * Which tags are proposed depends on the tag type, the word's length and, for some branches,
	 * the result of {@code Utility.charType}, {@code Utility.isAllChinese} and
	 * {@code Utility.isAllLetter}. {@code TT_NORMAL} and unlisted tag types propose nothing.
	 *
	 * @param index position of the word in {@code m_sWords}; also the row written in both tables
	 * @return the number of candidates written for this word
	 */
	private int guessPOS(int index) {
		int n = 0; // next free candidate slot; doubles as the running count
		switch (tagType) {
		case TT_NORMAL:
			break;
		case TT_PERSON: {
			final String word = m_sWords[index];
			if (word.indexOf("××") != -1) {
				// Words containing "××" get tag 6 only.
				m_nTags[index][n] = 6;
				m_dFrequency[index][n++] = 1.0 / (double) (context.getFreq(0, 6) + 1);
				break;
			}
			m_nTags[index][n] = 0;
			m_dFrequency[index][n++] = 1.0 / (double) (context.getFreq(0, 0) + 1);
			// NOTE(review): getBytes() encodes with the platform default charset, so this
			// length can differ between platforms; the other branches use length() —
			// confirm which unit the 4 / 2 thresholds are meant for.
			final int byteLen = word.getBytes().length;
			if (byteLen >= 4 || byteLen == 2) {
				// Tag 0 is written a second time here, in addition to the entry above.
				m_nTags[index][n] = 0;
				m_dFrequency[index][n++] = 1.0 / (double) (context.getFreq(0, 0) + 1);
				if (byteLen == 2) {
					final int kind = Utility.charType(word);
					if (kind == Utility.CT_OTHER || kind == Utility.CT_CHINESE) {
						for (int tag = 1; tag <= 4; tag++) {
							m_nTags[index][n] = tag;
							m_dFrequency[index][n++] = 1.0 / (double) (context.getFreq(0, tag) + 1);
						}
					}
				}
				for (int tag = 11; tag <= 13; tag++) {
					m_nTags[index][n] = tag;
					m_dFrequency[index][n++] = 1.0 / (double) (context.getFreq(0, tag) * 8);
				}
			}
			break;
		}
		case TT_PLACE: {
			m_nTags[index][n] = 0;
			m_dFrequency[index][n++] = 1.0 / (double) (context.getFreq(0, 0) + 1);
			final String word = m_sWords[index];
			final int len = word.length();
			if (len >= 4 || len == 2) {
				if (len == 2) {
					// Tag 0 is written a second time for two-character words.
					m_nTags[index][n] = 0;
					m_dFrequency[index][n++] = 1.0 / (double) (context.getFreq(0, 0) + 1);
					final int kind = Utility.charType(word);
					if (kind == Utility.CT_OTHER || kind == Utility.CT_CHINESE) {
						for (int tag = 1; tag <= 4; tag++) {
							m_nTags[index][n] = tag;
							m_dFrequency[index][n++] = 1.0 / (double) (context.getFreq(0, tag) + 1);
						}
					}
				}
				for (int tag = 11; tag <= 13; tag++) {
					m_nTags[index][n] = tag;
					m_dFrequency[index][n++] = 1.0 / (double) (context.getFreq(0, tag) * 8);
				}
			}
			break;
		}
		case TT_TRANS_PERSON: {
			final String word = m_sWords[index];
			final int len = word.length();

			m_nTags[index][n] = 0;
			m_dFrequency[index][n++] = 1.0 / (double) (context.getFreq(0, 0) + 1);

			if (!Utility.isAllChinese(word)) {
				if (Utility.isAllLetter(word)) {
					for (int tag : new int[] { 1, 11 }) {
						m_nTags[index][n] = tag;
						m_dFrequency[index][n++] = 1.0 / (double) (context.getFreq(0, tag) + 1);
					}
					for (int tag : new int[] { 2, 3, 12, 13 }) {
						m_nTags[index][n] = tag;
						m_dFrequency[index][n++] = 1.0 / (double) (context.getFreq(0, tag) * 2 + 1);
					}
				}
			} else if (len == 2) {
				final int kind = Utility.charType(word);
				if (kind == Utility.CT_OTHER || kind == Utility.CT_CHINESE) {
					// { tag, scale } pairs, in the order they are proposed.
					final int[][] scaled = { { 1, 2 }, { 2, 2 }, { 3, 2 }, { 30, 8 }, { 11, 4 }, { 12, 4 },
							{ 13, 4 }, { 21, 2 }, { 22, 2 }, { 23, 2 } };
					for (int[] entry : scaled) {
						m_nTags[index][n] = entry[0];
						m_dFrequency[index][n++] = 1.0 / (double) (context.getFreq(0, entry[0]) * entry[1] + 1);
					}
				}
			} else if (len < 4) {
				// All-Chinese word whose length is neither 2 nor >= 4: only tag 0 is proposed.
				break;
			}
			// Every remaining case ends with tags 41..43.
			for (int tag = 41; tag <= 43; tag++) {
				m_nTags[index][n] = tag;
				m_dFrequency[index][n++] = 1.0 / (double) (context.getFreq(0, tag) * 8);
			}
			break;
		}
		default:
			break;
		}

		return n;
	}

	/**
	 * Placeholder implementation: reports 0 for every input.
	 *
	 * <p>Because the result is always 0, the {@code GetForeignCharCount(...) > 0} check in
	 * {@code PersonRecognize} never succeeds.
	 *
	 * @param personName candidate person name; currently ignored
	 * @return always 0
	 */
	int GetForeignCharCount(String personName) {
		return 0;
	}

	/**
	 * Scans the best tag sequence ({@code m_nBestTag}) for place-name candidates and records each
	 * one as an unknown word.
	 *
	 * <p>A candidate starts at a position tagged 1 or 2. Starting from a 1 it extends over any
	 * further 1s, then 2s, then 3s; starting from a 2 it extends over further 2s, then 3s. The
	 * start and end positions are stored in {@code m_nUnknownWords}, and the cost
	 * {@code ComputePossibility(...) + log(penalty)} in {@code m_dWordsPossibility}.
	 *
	 * @param coreDict not referenced by this method
	 * @param placeDict dictionary handed to {@code ComputePossibility}
	 * @return always {@code true}
	 */
	public boolean PlaceRecognize(Dictionary coreDict, Dictionary placeDict) {
		int begin = 1;
		int end = 1;
		// NOTE(review): the penalty accumulates over the whole scan and is never reset
		// between candidates — confirm this is intended.
		double penalty = 1.0;
		int cur = 1;
		while (m_nBestTag[cur] > -1) {
			final int tag = m_nBestTag[cur];
			if (tag == 1 || tag == 2) {
				begin = cur;
				end = cur + 1;
				if (tag == 1) {
					// Run of 1s: each one beyond the second adds a penalty point.
					while (m_nBestTag[end] == 1) {
						if (end > begin + 1) {
							penalty += 1.0;
						}
						end++;
					}
				} else {
					// Starting directly on a 2 costs one penalty point.
					penalty += 1.0;
				}
				// Run of 2s: no penalty.
				while (m_nBestTag[end] == 2) {
					end++;
				}
				// Run of 3s: each one beyond the first adds a penalty point.
				final int firstThree = end;
				while (m_nBestTag[end] == 3) {
					if (end > firstThree) {
						penalty += 1.0;
					}
					end++;
				}
			}
			if (end > begin) {
				m_nUnknownWords[m_nUnknownIndex][0] = m_nWordPosition[begin];
				m_nUnknownWords[m_nUnknownIndex][1] = m_nWordPosition[end];
				// NOTE(review): the length passed is end - begin + 1, so the word at
				// position `end` is included in the cost — confirm this is intended.
				m_dWordsPossibility[m_nUnknownIndex++] = ComputePossibility(begin, end - begin + 1, placeDict)
						+ Math.log(penalty);
				// Collapse the span so it is not recorded again on later iterations.
				begin = end;
			}
			// Jump past a consumed span, otherwise step one position.
			cur = (cur < end) ? end : cur + 1;
		}
		return true;
	}

	/**
	 * Sums a log-ratio cost over {@code length} consecutive positions starting at {@code startPos}.
	 *
	 * <p>For each position the term is
	 * {@code log(context.getFreq(0, tag) + 1) - log(dict.getFreq(word, tag) + 1)}, where
	 * {@code word} and {@code tag} are that position's entries in {@code m_sWords} and
	 * {@code m_nBestTag}.
	 *
	 * @param startPos index of the first position
	 * @param length number of positions to include
	 * @param dict dictionary queried for the word/tag frequency
	 * @return the accumulated sum; 0 when {@code length} is not positive
	 */
	private double ComputePossibility(int startPos, int length, Dictionary dict) {
		double total = 0;
		final int stop = startPos + length;
		int pos = startPos;
		while (pos < stop) {
			// Frequency of this word under its tag, then the tag's frequency in the context.
			final int wordTagFreq = dict.getFreq(m_sWords[pos], m_nBestTag[pos]);
			total += Math.log((double) (context.getFreq(0, m_nBestTag[pos]) + 1))
					- Math.log((double) (wordTagFreq + 1));
			pos++;
		}
		return total;
	}
}
上一页 12
💿 文件大小 6647 K
👤 上传用户 ddddong
📂 所属分类多国语言处理
🏷️ 相关标签

#java #语言 #分
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -