cresult.java

来自「基于中科院的ICTCLAS实现中文分词系统 开发工具是JAVA.经测试,效果很好」· Java 代码 · 共 778 行 · 第 1/2 页

JAVA
778
字号
						+ GFString.getChineseString(pItem[i].sWord, "gb2312")
						+ "</src>";

				if (sPOS[0] != 0) {
					result += "</any>";
				}
			}
			i++;
		}
		return true;
	}

	/**
	 * Tries to split a candidate Chinese person name into surname, optional
	 * second surname and given name.
	 *
	 * <p>The name is treated as a sequence of 2-byte characters. A 4-byte
	 * surname is tried first, then a 2-byte surname, then no surname at all,
	 * each checked against {@code personDict} with handle 1. For names longer
	 * than 6 bytes the 2 bytes after the surname are additionally tested as a
	 * second surname.
	 *
	 * @param sPersonName bytes of the candidate name; its array length is used
	 *            as the name length
	 * @param sSurname output buffer for the surname; a 0 byte is written at
	 *            index {@code nSurNameLen}, so it must hold at least 5 bytes
	 * @param sSurname2 output buffer for the second surname; {@code [0]} is
	 *            reset to 0 before any lookup
	 * @param sGivenName output buffer for the given name
	 * @param personDict dictionary used for the existence and frequency lookups
	 * @return {@code true} if the name passed all plausibility checks and the
	 *         output buffers were filled, {@code false} otherwise
	 */
	protected boolean ChineseNameSplit(byte[] sPersonName, byte[] sSurname,
			byte[] sSurname2, byte[] sGivenName, CDictionary personDict) {
		int nSurNameLen = 4;
		int nLen = sPersonName.length;
		int nFreq;
		int i = 0;
		int nCharType;
		int nFreqGiven;
		byte[] sTemp = new byte[3];
		if (nLen < 3 || nLen > 8)// Not a traditional Chinese person name
			return false;
		// Fix: the loop below reads bytes in pairs (i, i + 1). With an odd
		// length the last pair would index one past the end of the array and
		// throw ArrayIndexOutOfBoundsException, so such input is rejected here.
		if (nLen % 2 != 0)
			return false;

		while (i < nLen)// Must not include any non-Chinese char
		{
			nCharType = Utility.charType(sPersonName[i], sPersonName[i + 1]);
			if (nCharType != Final.CT_CHINESE && nCharType != Final.CT_OTHER)
				return false;
			i += 2;
		}
		sSurname2[0] = 0;// init
		// Try the longest surname first (4 bytes), then 2 bytes, then none.
		GFCommon.bytesCopy(sSurname, sPersonName, 0, nSurNameLen);
		sSurname[nSurNameLen] = 0;
		if (!personDict.IsExist(sSurname, 1)) {
			nSurNameLen = 2;
			sSurname[nSurNameLen] = 0;
			if (!personDict.IsExist(sSurname, 1)) {
				nSurNameLen = 0;
				sSurname[nSurNameLen] = 0;
			}
		}

		// Everything after the surname is the given name.
		byte[] bt = GFCommon.bytesCopy(sPersonName, nSurNameLen,
				sPersonName.length - nSurNameLen);
		GFCommon.bytesCopy(sGivenName, bt, 0, bt.length);
		if (nLen > 6) {
			bt = GFCommon.bytesCopy(sPersonName, nSurNameLen,
					sPersonName.length - nSurNameLen);
			GFCommon.bytesCopy(sTemp, bt, 0, 2);
			sTemp[2] = 0;// Get the second possible surname
			if (personDict.IsExist(sTemp, 1)) {// Hongkong women's name:
				// Surname+surname+given name
				GFCommon.bytesCopy(sSurname2, sTemp, 0, sTemp.length);
				bt = GFCommon.bytesCopy(sPersonName, nSurNameLen + 2,
						sPersonName.length - nSurNameLen - 2);
				GFCommon.bytesCopy(sGivenName, bt, 0, bt.length);
			}
		}
		nFreq = personDict.GetFrequency(sSurname, 1);
		// First character of the given name, looked up with handle 2.
		GFCommon.bytesCopy(sTemp, sGivenName, 0, 2);
		sTemp[2] = 0;
		nFreqGiven = personDict.GetFrequency(sTemp, 2);
		// NOTE(review): sGivenName.length is the capacity of the caller's
		// buffer, not the number of bytes copied into it above; whenever the
		// caller passes a buffer longer than 4 bytes that clause is always
		// true. Confirm whether the copied length was intended.
		// NOTE(review): the two getBytes() calls use the platform default
		// charset, so the reference surname/given-name bytes differ between
		// platforms. Confirm which encoding the dictionary keys use.
		if (nSurNameLen != 4
				&& ((nSurNameLen == 0 && nLen > 4)
						|| sGivenName.length > 4
						|| (Utility.GetForeignCharCount(sPersonName) >= 3
								&& nFreq < personDict.GetFrequency("张"
										.getBytes(), 1) / 40 && nFreqGiven < personDict
								.GetFrequency("华".getBytes(), 2) / 20) || (nFreq < 10 && Utility
						.GetForeignCharCount(sGivenName) == (nLen - nSurNameLen) / 2)))
			return false;

		// Single Surname+given name
		if (nLen == 4 && m_uPerson.IsGivenName(sPersonName)) {
			return false;
		}
		return true;
	}

	/**
	 * Translates a part-of-speech handle into the tag string stored at the
	 * same position of a parallel table and writes that tag's bytes to the
	 * start of {@code sPOS973}.
	 *
	 * <p>Handles pack a tag as {@code firstChar * 256 + secondChar}
	 * (24832 == 'a' * 256, 24932 == 'a' * 256 + 'd'). Judging by the method
	 * name the source tags are the PKU set and the targets the "973" set --
	 * TODO confirm.
	 *
	 * @param nHandle handle to translate
	 * @param sPOS973 output buffer; receives the tag bytes, or a single '@'
	 *            byte at index 0 when the handle is not in the table. No
	 *            terminating byte is written here.
	 * @return always {@code true}
	 */
	protected boolean PKU2973POS(int nHandle, byte[] sPOS973) {
		// Known source handles in ascending order, as needed by the search.
		final int[] sourceHandles = { 24832, 24932, 24935, 24942, 25088,
				25344, 25600, 25703, 25856, 26112, 26368, 26624, 26880,
				27136, 27392, 27648, 27904, 28160, 28263, 28274, 28275,
				28276, 28280, 28282, 28416, 28672, 28928, 29184, 29440,
				29696, 29799, 29952, 30052, 30055, 30058, 30060, 30070,
				30074, 30208, 30308, 30311, 30318, 30464, 30720, 30976,
				31232 };
		// The handles above decode, in order, to:
		// "a", "ad","ag","an","b", "c", "d", "dg","e", "f","g", "h", "i", "j",
		// "k", "l", "m", "n", "ng","nr","ns","nt","nx","nz","o", "p", "q", "r",
		// "s", "t", "tg","u", "ud","ug","uj","ul","uv","uz","v",
		// "vd","vg","vn","w", "x", "y", "z"
		// Target tag for each handle, position by position.
		final String[] targetTags = { "a", "ad", "ga", "an", "f", "c", "d",
				"d", "e", "nd", "g", "h", "i", "j", "k", "l", "m", "n", "gn",
				"nh", "ns", "ni", "ws", "nz", "o", "p", "q", "r", "nl", "nt",
				"gt", "u", "ud", "ug", "uj", "ul", "uv", "uz", "v", "vd",
				"gv", "vn", "w", "x", "u", "a" };
		/*
		 * Pairs listed in the original but not part of the tables:
		 * "Bg","gf", "Rg","gr", "Mg","gm", "Yg","u", "Ug","u", "Qg","q",
		 */

		final int position = Utility.BinarySearch(nHandle, sourceHandles,
				sourceHandles.length);
		if (position != -1) {
			final String tag = targetTags[position];
			GFCommon.bytesCopy(sPOS973, tag.getBytes(), 0, tag.length());
		} else {
			// Unknown handle: mark with '@'.
			sPOS973[0] = "@".getBytes()[0];
		}
		return true;
	}

	/**
	 * Applies a fixed sequence of merge/split rules to one tagged segmentation
	 * and writes the adjusted items to {@code pItemRet}.
	 *
	 * <p>Items are read from {@code pItem} until one whose {@code sWord[0]} is
	 * 0, and the output is terminated the same way. The array length of
	 * {@code sWord} is used as the word length. Part-of-speech handles pack a
	 * tag as {@code firstChar * 256 + secondChar} (24832 == 'a' * 256,
	 * 28274 == 'n' * 256 + 'r').
	 *
	 * <p>Rules, tried in this order for each item (first match wins):
	 * <ol>
	 * <li>nr item accepted by {@link #ChineseNameSplit}: emitted as separate nr
	 * items for surname, second surname and given name.</li>
	 * <li>m item followed by a 2-byte word and a further word that
	 * {@code Utility.strcmp} accepts as matching it (ABB): merged into one m
	 * item.</li>
	 * <li>2-byte word matching the next word (AA): merged, tagged a, v or n;
	 * may also absorb one following 2-byte word (AAB).</li>
	 * <li>2-byte v/a word matching the first 2 bytes of a following 4-byte
	 * word: merged.</li>
	 * <li>u sub-tag (uj, ud, ...): handle collapsed to plain u.</li>
	 * <li>AABB pattern: three items merged.</li>
	 * <li>ns item followed by a known suffix: merged.</li>
	 * <li>v or n item followed by the 2-byte suffix "员": merged as n.</li>
	 * <li>nx item followed by a run of nx items, dots or all-digit m items:
	 * merged into one nx item.</li>
	 * </ol>
	 * Anything not handled by a rule is copied unchanged.
	 *
	 * @param pItem input items; the u rule rewrites {@code nHandle} in place
	 * @param pItemRet output items; must already contain allocated entries with
	 *            {@code sWord} buffers large enough for the merged words
	 * @return always {@code true}
	 */
	protected boolean Adjust(TagWordResult[] pItem, TagWordResult[] pItemRet) {
		int i = 0, j = 0;
		int nLen;
		byte[] sSurName = new byte[10];
		byte[] sSurName2 = new byte[10];
		byte[] sGivenName = new byte[10];
		boolean bProcessed = false;// Have been processed
		while (pItem[i].sWord[0] != 0) {
			nLen = pItem[i].sWord.length;
			bProcessed = false;

			// Rule1: adjust person name
			if (pItem[i].nHandle == 28274
					&& ChineseNameSplit(pItem[i].sWord, sSurName, sSurName2,
							sGivenName, m_uPerson.m_dict)
					&& !"叶利钦".equals(GFString.getChineseString(pItem[i].sWord,
							"gb2312")))// 'nr'
			{// Divide name into surname and given name

				if (sSurName[0] != 0) {
					GFCommon.bytesCopy(pItemRet[j].sWord, sSurName, 0,
							sSurName.length);
					pItemRet[j++].nHandle = 28274;
				}
				if (sSurName2[0] != 0) {
					GFCommon.bytesCopy(pItemRet[j].sWord, sSurName2, 0,
							sSurName2.length);
					pItemRet[j++].nHandle = 28274;
				}
				if (sGivenName[0] != 0) {
					GFCommon.bytesCopy(pItemRet[j].sWord, sGivenName, 0,
							sGivenName.length);
					pItemRet[j++].nHandle = 28274;
				}
				bProcessed = true;
			}
			// Rule2 for overlap words ABB (e.g. 一段段, 一片片): numeral
			// followed by a doubled 2-byte word
			else if (pItem[i].nHandle == 27904
					&& pItem[i + 1].sWord.length == 2
					&& Utility.strcmp(pItem[i + 1].sWord, pItem[i + 2].sWord)) {// (pItem[i+1].nHandle/256=='q'||pItem[i+1].nHandle/256=='a')&&
				int index = 0;
				GFCommon.bytesCopy(pItemRet[j].sWord, pItem[i].sWord, index,
						pItem[i].sWord.length);
				index += pItem[i].sWord.length;
				GFCommon.bytesCopy(pItemRet[j].sWord, pItem[i + 1].sWord,
						index, pItem[i + 1].sWord.length);
				index += pItem[i + 1].sWord.length;
				GFCommon.bytesCopy(pItemRet[j].sWord, pItem[i + 2].sWord,
						index, pItem[i + 2].sWord.length);
				pItemRet[j].nHandle = 27904;
				j += 1;
				i += 2;
				bProcessed = true;
			}
			// Rule3 for overlap words AA
			else if (nLen == 2
					&& Utility.strcmp(pItem[i].sWord, pItem[i + 1].sWord)) {
				int index = 0;
				GFCommon.bytesCopy(pItemRet[j].sWord, pItem[i].sWord, index,
						pItem[i].sWord.length);
				index += pItem[i].sWord.length;
				GFCommon.bytesCopy(pItemRet[j].sWord, pItem[i + 1].sWord,
						index, pItem[i + 1].sWord.length);
				// Fix: advance the write offset past the second word as well;
				// otherwise the AAB append below lands on top of it.
				index += pItem[i + 1].sWord.length;
				// 24832=='a'*256
				pItemRet[j].nHandle = 24832;// a
				if (pItem[i].nHandle / 256 == 'v'
						|| pItem[i + 1].nHandle / 256 == 'v')// 30208=='v'*256
				{
					pItemRet[j].nHandle = 30208;
				}
				if (pItem[i].nHandle / 256 == 'n'
						|| pItem[i + 1].nHandle / 256 == 'n')// noun wins over v/a
				{
					pItemRet[j].nHandle = 'n' * 256;
				}
				i += 1;
				if (pItem[i + 1].sWord.length == 2) {// AAB, e.g. 洗/洗/脸, 蒙蒙亮
					if ((pItemRet[j].nHandle == 30208 && pItem[i + 1].nHandle / 256 == 'n')
							|| (pItemRet[j].nHandle == 24832 && pItem[i + 1].nHandle / 256 == 'a')) {
						GFCommon.bytesCopy(pItemRet[j].sWord,
								pItem[i + 1].sWord, index,
								pItem[i + 1].sWord.length);
						i += 1;
					}
				}
				j += 1;
				bProcessed = true;
			}

			// Rule 4: AAB where the second item is a 4-byte word starting with
			// the first item, e.g. 洗/洗澡
			else if (nLen == 2
					&& Utility.strncmp(pItem[i].sWord,0, pItem[i + 1].sWord, 2)
					&& pItem[i + 1].sWord.length == 4
					&& (pItem[i].nHandle / 256 == 'v' || pItem[i].nHandle == 24832))// v,a
			{
				int index = 0;
				GFCommon.bytesCopy(pItemRet[j].sWord, pItem[i].sWord, index,
						pItem[i].sWord.length);
				index += pItem[i].sWord.length;
				GFCommon.bytesCopy(pItemRet[j].sWord, pItem[i + 1].sWord,
						index, pItem[i + 1].sWord.length);
				// 24832=='a'*256
				pItemRet[j].nHandle = 24832;// 'a'
				if (pItem[i].nHandle / 256 == 'v'
						|| pItem[i + 1].nHandle / 256 == 'v')// 30208=='v'*256
				{
					pItemRet[j].nHandle = 30208;
				}

				i += 1;
				j += 1;
				bProcessed = true;
			} else if (pItem[i].nHandle / 256 == 'u'
					&& pItem[i].nHandle % 256 != 0) {// uj,ud,uv,uz,ul,ug->u
				// Fix: the test used to be "% 256 == 0", which only matched
				// plain 'u' and made this assignment a no-op. A sub-tag has a
				// non-zero low byte. The item is then copied by the
				// "not processed" branch below with the collapsed handle.
				pItem[i].nHandle = 'u' * 256;
			} else if (nLen == 2
					&& Utility.strncmp(pItem[i].sWord,0, pItem[i + 1].sWord, 2)
					&& pItem[i + 1].sWord.length == 4
					&& Utility.strncmp( pItem[i + 1].sWord,2, pItem[i + 2].sWord, 2)) {// AABB, e.g. 朴朴素素, 枝枝叶叶
				int index = 0;
				GFCommon.bytesCopy(pItemRet[j].sWord, pItem[i].sWord, index,
						pItem[i].sWord.length);
				index += pItem[i].sWord.length;
				GFCommon.bytesCopy(pItemRet[j].sWord, pItem[i + 1].sWord,
						index, pItem[i + 1].sWord.length);
				index += pItem[i + 1].sWord.length;
				GFCommon.bytesCopy(pItemRet[j].sWord, pItem[i + 2].sWord,
						index, pItem[i + 2].sWord.length);
				pItemRet[j].nHandle = pItem[i + 1].nHandle;
				i += 2;
				j += 1;
				bProcessed = true;
			} else if (pItem[i].nHandle == 28275)// ns followed by a postfix
			{
				if (m_uPlace.m_dict.IsExist(pItem[i + 1].sWord, 4)) {
					// Next word is in the place dictionary with handle 4:
					// result stays ns.
					GFCommon.bytesCopy(pItemRet[j].sWord, pItem[i].sWord, 0,
							pItem[i].sWord.length);
					GFCommon.bytesCopy(pItemRet[j].sWord, pItem[i + 1].sWord,
							pItem[i].sWord.length, pItem[i + 1].sWord.length);
					pItemRet[j].nHandle = 28275;
					i += 1;
					j += 1;
					bProcessed = true;
				} else if (pItem[i + 1].sWord.length == 2
						&& Utility.CC_Find("队".getBytes(), pItem[i + 1].sWord)) {
					// Result is tagged nt (28276).
					GFCommon.bytesCopy(pItemRet[j].sWord, pItem[i].sWord, 0,
							pItem[i].sWord.length);
					GFCommon.bytesCopy(pItemRet[j].sWord, pItem[i + 1].sWord,
							pItem[i].sWord.length, pItem[i + 1].sWord.length);
					pItemRet[j].nHandle = 28276;
					i += 1;
					j += 1;
					bProcessed = true;
				} else if (pItem[i + 1].sWord.length == 2
						&& Utility.CC_Find("语文字杯".getBytes(),
								pItem[i + 1].sWord)) {
					// Result is tagged nz (28282).
					GFCommon.bytesCopy(pItemRet[j].sWord, pItem[i].sWord, 0,
							pItem[i].sWord.length);
					GFCommon.bytesCopy(pItemRet[j].sWord, pItem[i + 1].sWord,
							pItem[i].sWord.length, pItem[i + 1].sWord.length);
					pItemRet[j].nHandle = 28282;
					i += 1;
					j += 1;
					bProcessed = true;
				} else if (pItem[i + 1].sWord.length == 2
						&& Utility.CC_Find("裔".getBytes(), pItem[i + 1].sWord)) {
					// Result is tagged n (28160).
					GFCommon.bytesCopy(pItemRet[j].sWord, pItem[i].sWord, 0,
							pItem[i].sWord.length);
					GFCommon.bytesCopy(pItemRet[j].sWord, pItem[i + 1].sWord,
							pItem[i].sWord.length, pItem[i + 1].sWord.length);
					pItemRet[j].nHandle = 28160;
					i += 1;
					j += 1;
					bProcessed = true;
				}
			} else if (pItem[i].nHandle == 30208 || pItem[i].nHandle == 28160)// v or n
			{
				if (pItem[i + 1].sWord.length == 2
						&& Utility.CC_Find("员".getBytes(), pItem[i + 1].sWord)) {
					// word + "员" becomes a noun.
					GFCommon.bytesCopy(pItemRet[j].sWord, pItem[i].sWord, 0,
							pItem[i].sWord.length);
					GFCommon.bytesCopy(pItemRet[j].sWord, pItem[i + 1].sWord,
							pItem[i].sWord.length, pItem[i + 1].sWord.length);
					pItemRet[j].nHandle = 28160;
					i += 1;
					j += 1;
					bProcessed = true;
				}
			} else if (pItem[i].nHandle == 28280) {// www/nx ./w sina/nx;
				// EIM/nx -601/m
				int index = 0;
				GFCommon.bytesCopy(pItemRet[j].sWord, pItem[i].sWord, index,
						pItem[i].sWord.length);
				index += pItem[i].sWord.length;
				pItemRet[j].nHandle = 28280;
				while (pItem[i + 1].nHandle == 28280
						|| Utility.strstr("..".getBytes(), pItem[i + 1].sWord) != -1
						|| (pItem[i + 1].nHandle == 27904 && Utility
								.IsAllNum(pItem[i + 1].sWord))) {
					// Fix: append at the running offset. The previous offset,
					// pItem[i].sWord.length, is only the length of the
					// preceding piece, so the third and later pieces overwrote
					// what had already been written.
					GFCommon.bytesCopy(pItemRet[j].sWord, pItem[i + 1].sWord,
							index, pItem[i + 1].sWord.length);
					index += pItem[i + 1].sWord.length;
					i += 1;
				}
				j += 1;
				bProcessed = true;
			}

			if (!bProcessed) {// No rule consumed the item: no adjustment
				// needed, just copy it to the final result
				GFCommon.bytesCopy(pItemRet[j].sWord, pItem[i].sWord, 0,
						pItem[i].sWord.length);
				pItemRet[j++].nHandle = pItem[i].nHandle;
			}
			i++;
		}
		pItemRet[j].sWord[0] = 0;// Set ending
		return true;
	}

	/**
	 * Accumulates a score for one tagged segmentation.
	 *
	 * <p>Walks {@code pItem} up to the first item whose {@code sWord[0]} is 0.
	 * Each item contributes its {@code nHandle}; each item that has a
	 * successor additionally contributes
	 * {@code log(contextPossibility(tag, nextTag) + 1) - log(frequency(tag) + 1)},
	 * both values being read from {@code m_POSTagger.m_context} with key 0.
	 *
	 * <p>NOTE(review): the per-item term is the tag handle itself, while the
	 * original comment describes it as logP(Wi|Ti) -- confirm that
	 * {@code nHandle} is the intended field.
	 *
	 * @param pItem items of one segmentation, terminated by an item whose
	 *            first word byte is 0
	 * @return the accumulated score
	 */
	protected double ComputePossibility(TagWordResult[] pItem) {
		double score = 0;
		for (int k = 0; pItem[k].sWord[0] != 0; k++) {
			// Per-item term.
			score += pItem[k].nHandle;
			if (pItem[k + 1].sWord[0] == 0) {
				// Last item: there is no tag transition to score.
				continue;
			}
			// Transition term, logP(Ti|Ti-1) in the original comment.
			score += Math.log((double) (m_POSTagger.m_context
					.GetContextPossibility(0, pItem[k].nHandle,
							pItem[k + 1].nHandle) + 1));
			score -= Math.log((double) (m_POSTagger.m_context.GetFrequency(0,
					pItem[k].nHandle) + 1));
		}
		return score;
	}

	/**
	 * Ranks all candidate segmentations held in {@code m_Seg} by their score
	 * and stores the adjusted results in rank order.
	 *
	 * <p>Each candidate is scored with {@link #ComputePossibility}. The
	 * candidates are then ordered from highest to lowest score with an
	 * in-place exchange sort (equal scores keep their relative order of
	 * discovery, since only strictly greater scores are swapped forward).
	 * Finally every candidate is passed through {@link #Adjust} into
	 * {@code m_pResult}, and its score is recorded in
	 * {@code m_dResultPossibility} at the same rank.
	 *
	 * @return always {@code true}
	 */
	protected boolean Sort() {
		double[] scores = new double[Final.MAX_SEGMENT_NUM];
		int[] order = new int[Final.MAX_SEGMENT_NUM];

		// Score every candidate and remember where it came from.
		int seg = 0;
		while (seg < m_Seg.m_nSegmentCount) {
			scores[seg] = ComputePossibility(m_Seg.m_pWordSeg[seg]);
			order[seg] = seg;
			seg++;
		}

		// Exchange sort, descending: slot "front" ends up holding the best
		// score among slots front..count-1.
		for (int front = 0; front < m_Seg.m_nSegmentCount; front++) {
			for (int probe = front + 1; probe < m_Seg.m_nSegmentCount; probe++) {
				if (scores[probe] > scores[front]) {
					// Swap score and origin index together.
					int heldOrder = order[front];
					double heldScore = scores[front];
					order[front] = order[probe];
					scores[front] = scores[probe];
					order[probe] = heldOrder;
					scores[probe] = heldScore;
				}
			}
		}

		// Adjust each segmentation and its tags, best candidate first, and
		// store it together with its score in the final result arrays.
		for (int rank = 0; rank < m_Seg.m_nSegmentCount; rank++) {
			Adjust(m_Seg.m_pWordSeg[order[rank]], m_pResult[rank]);
			m_dResultPossibility[rank] = scores[rank];
		}
		return true;
	}
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?