📄 csegment.java

📁 基于中科院的ICTCLAS实现中文分词系统开发工具是JAVA.经测试,效果很好
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
					if (trc.nPOS < 0)// Unknown words: P(Wi|Ci);while known
										// words:1
						dValue += trc.value;

					// Get the position index of current word in the position
					// map table
					nCurWordIndex = Utility.BinarySearch(trc.row
							* Final.MAX_SENTENCE_LEN + trc.col,
							m_npWordPosMapTable, m_nWordCount);
					nNextWordIndex = Utility.BinarySearch(trc2.row
							* Final.MAX_SENTENCE_LEN + trc2.col,
							m_npWordPosMapTable, m_nWordCount);
					aBinaryWordNet.SetElement(nCurWordIndex, nNextWordIndex,
							dValue, trc.nPOS, null);

				}
			}
		}
		return true;
	}

	/**
	 * Judges whether a numeric token can be the number part of a year
	 * expression (the caller in this file invokes it for a word followed by
	 * the atom "年").
	 * <p>
	 * All length tests are on {@code sNum.length} and assume two bytes per
	 * Chinese character (a four-character Chinese year is matched with
	 * {@code nLen == 8}). The literals are therefore encoded explicitly as
	 * GB2312 instead of with the platform default charset, which would yield
	 * three bytes per character on a UTF-8 platform and make the Chinese
	 * branches unmatchable.
	 *
	 * @param sNum
	 *            bytes of the candidate number; {@code null} or empty yields
	 *            {@code false}
	 * @return {@code true} if one of the year patterns below matches
	 */
	protected boolean IsYearTime(byte[] sNum) {
		if (sNum == null || sNum.length == 0) {
			// Nothing to judge; every pattern below needs at least two bytes.
			return false;
		}
		// Same encoding name the file passes to GFString.getChineseString.
		final java.nio.charset.Charset gb = java.nio.charset.Charset
				.forName("GB2312");
		int nLen = sNum.length;

		// First two bytes of sNum, zero-terminated (the leading character
		// when sNum starts with a double-byte character).
		byte[] sTemp = new byte[3];
		Utility.strncpy(sTemp, sNum, 2);
		sTemp[2] = 0;

		// Single-byte digits: four bytes, or two bytes whose first byte is
		// greater than '4', e.g. 1992年, 90年
		if (Utility.IsAllSingleByte(sNum)
				&& (nLen == 4 || nLen == 2 && sNum[0] > '4')) {
			return true;
		}
		// Numeric per Utility.IsAllNum: six or more bytes, or four bytes
		// whose leading character is one of the full-width digits 5-9.
		if (Utility.IsAllNum(sNum)
				&& (nLen >= 6 || nLen == 4
						&& Utility.CC_Find("５６７８９".getBytes(gb), sTemp))) {
			return true;
		}
		// Count of Chinese digit characters equals nLen / 2, i.e. every
		// two-byte character is a Chinese digit.
		if (Utility.GetCharCount("零○一二三四五六七八九壹贰叁肆伍陆柒捌玖".getBytes(gb), sNum) == nLen / 2
				&& nLen >= 3) {
			return true;
		}
		// Eight bytes with exactly two of 千/仟/零/○, e.g. 二仟零二年
		if (nLen == 8 && Utility.GetCharCount("千仟零○".getBytes(gb), sNum) == 2) {
			return true;
		}
		// A lone 千 or 仟.
		if (nLen == 2 && Utility.GetCharCount("千仟".getBytes(gb), sNum) == 1) {
			return true;
		}
		// Four bytes: one character from the first list counted over the
		// whole token, and one from the second list counted over the bytes
		// after the first character.
		if (nLen == 4
				&& Utility.GetCharCount("甲乙丙丁戊己庚辛壬癸".getBytes(gb), sNum) == 1
				&& Utility.GetCharCount("子丑寅卯辰巳午未申酉戌亥".getBytes(gb),
						GFCommon.bytesCopy(sNum, 2, sNum.length - 2)) == 1) {
			return true;
		}
		return false;
	}

	/**
	 * Builds the word list for one segmentation route and records every word
	 * in the optimum segmentation graph.
	 * <p>
	 * {@code nSegRoute[nIndex]} is read as consecutive (start, end) vertex
	 * pairs; the walk stops at the first {@code -1} entry or at a pair that
	 * is not strictly increasing. For each pair the atoms
	 * {@code m_graphSeg.m_sAtom[start..end)} are concatenated into a word.
	 * While the accumulated candidate is accepted by
	 * {@code Utility.IsAllNum} or {@code Utility.IsAllChineseNum}, the
	 * following route segment is pulled in as well, so that separated number
	 * pieces are merged into a single number. Numbers are then classified as
	 * a delimiter, a time expression or a plain number, and the word, its
	 * handle (POS code) and value are stored in
	 * {@code m_pWordSeg[nIndex][k]} and passed to
	 * {@code m_graphOptimum.SetElement}. The list is terminated with an
	 * entry whose {@code nHandle} is {@code -1}.
	 * <p>
	 * POS codes used below: 27904 = 'm'*256, 29696 = 't'*256,
	 * 30464 = 'w'*256.
	 * <p>
	 * NOTE(review): {@code fValue} and {@code nPOS} are passed to
	 * {@code GetElement} by value, so that call cannot change them; they keep
	 * whatever this method assigns. Presumably these were output parameters
	 * in the code this was ported from - confirm against {@code GetElement}.
	 * <br>
	 * NOTE(review): several tests use {@code array.length} on fixed-size or
	 * zero-terminated buffers (e.g. {@code sAtom.length == 2}, where
	 * {@code sAtom} always has {@code Final.WORD_MAXLENGTH} bytes). This
	 * looks like it was meant to be a string length - TODO confirm.
	 * <br>
	 * NOTE(review): the string literals are encoded with the platform default
	 * charset via {@code getBytes()}; the surrounding byte arithmetic looks
	 * like it assumes a two-byte Chinese encoding - TODO confirm.
	 *
	 * @param nSegRoute
	 *            segmentation routes; each row is a vertex sequence ended by
	 *            {@code -1}
	 * @param nIndex
	 *            row of {@code nSegRoute} (and of {@code m_pWordSeg}) to
	 *            process
	 * @return always {@code true}
	 */
	protected boolean GenerateWord(int[][] nSegRoute, int nIndex) {
		int i = 0, k = 0;
		int j, nStartVertex, nEndVertex, nPOS;
		byte[] sAtom = new byte[Final.WORD_MAXLENGTH];
		byte[] sNumCandidate = new byte[100];
		byte[] sCurWord = new byte[100];
		double fValue = 0;
		while (nSegRoute[nIndex][i] != -1 && nSegRoute[nIndex][i + 1] != -1
				&& nSegRoute[nIndex][i] < nSegRoute[nIndex][i + 1]) {
			nStartVertex = nSegRoute[nIndex][i];
			j = nStartVertex;// Set the start vertex
			nEndVertex = nSegRoute[nIndex][i + 1];// Set the end vertex
			nPOS = 0;
			m_graphSeg.m_segGraph.GetElement(nStartVertex, nEndVertex, fValue,
					nPOS, null);
			sAtom[0] = 0;

			// Generate the word according to the segmentation route:
			// append every atom between the two vertices to sAtom.
			int index = 0;
			while (j < nEndVertex) {
				GFCommon.bytesCopy(sAtom, m_graphSeg.m_sAtom[j], index,
						m_graphSeg.m_sAtom[j].length);
				index += m_graphSeg.m_sAtom[j].length;
				j++;
			}
			m_pWordSeg[nIndex][k].sWord[0] = 0;// Init the result ending
			GFCommon.bytesCopy(sNumCandidate, sAtom, 0, sAtom.length);

			// Merge all separate, consecutive number pieces into one number.
			// sAtom[0] != 0 guards against an empty atom (added 2002-5-9).
			while (sAtom[0] != 0
					&& (Utility.IsAllNum(sNumCandidate) || Utility
							.IsAllChineseNum(sNumCandidate))) {
				// Save the number accepted so far in the result segmentation
				GFCommon.bytesCopy(m_pWordSeg[nIndex][k].sWord, sNumCandidate,
						0, sNumCandidate.length);
				i++;// Skip to next atom now
				sAtom[0] = 0;

				// Build the next word of the route into sAtom
				index = 0;
				while (j < nSegRoute[nIndex][i + 1]) {
					GFCommon.bytesCopy(sAtom, m_graphSeg.m_sAtom[j], index,
							m_graphSeg.m_sAtom[j].length);
					index += m_graphSeg.m_sAtom[j].length;
					j++;
				}
				// Extend the candidate with that word before re-testing it
				GFCommon.bytesCopy(sNumCandidate, sAtom, index, sAtom.length);
			}
			int nLen = m_pWordSeg[nIndex][k].sWord.length;
			if (nLen == 4
					&& Utility.CC_Find("第上成±—＋∶·．／".getBytes(),
							m_pWordSeg[nIndex][k].sWord)
					|| nLen == 1
					&& Utility.strchr("+-./".getBytes(),
							m_pWordSeg[nIndex][k].sWord[0]) != -1) {
				// Only one word: a lone prefix or sign, not a real number
				GFCommon.bytesCopy(sCurWord, m_pWordSeg[nIndex][k].sWord, 0,
						m_pWordSeg[nIndex][k].sWord.length);// Record current
															// word
				i--;
			} else if (m_pWordSeg[nIndex][k].sWord[0] == 0) {
				// The merge loop was never entered: ordinary word
				GFCommon.bytesCopy(m_pWordSeg[nIndex][k].sWord, sAtom, 0,
						sAtom.length);
				// Save them in the result segmentation
				GFCommon.bytesCopy(sCurWord, sAtom, 0, sAtom.length);// Record
																		// current
																		// word
			} else {// It is a num
				// Compare the bytes with Utility.strcmp (used as an equality
				// test elsewhere in this method). The former
				// String.equals(byte[]) calls were always false, so the
				// delimiter branch could never be taken for these words.
				if (Utility.strcmp(m_pWordSeg[nIndex][k].sWord, "－－"
						.getBytes())
						|| Utility.strcmp(m_pWordSeg[nIndex][k].sWord, "—"
								.getBytes())
						|| m_pWordSeg[nIndex][k].sWord[0] == '-'
						&& m_pWordSeg[nIndex][k].sWord[1] == 0) {
					// The delimiter "－－"
					nPOS = 30464;// 'w'*256;Set the POS with 'w'
					i--;// Not num, back to previous word
				} else {// Adding time suffix

					// Get first char: one byte, or two when the first byte
					// is negative (lead byte of a double-byte character)
					byte[] sInitChar = new byte[3];
					int nCharIndex = 0;
					sInitChar[nCharIndex] = m_pWordSeg[nIndex][k].sWord[nCharIndex];
					if (sInitChar[nCharIndex] < 0) {
						nCharIndex += 1;
						sInitChar[nCharIndex] = m_pWordSeg[nIndex][k].sWord[nCharIndex];
					}
					nCharIndex += 1;
					sInitChar[nCharIndex] = '\0';
					// Previous word is a number (27904 = 'm'*256) or a time
					// (29696 = 't'*256) and this one starts with a dash,
					// e.g. 3-4月
					if (k > 0
							&& (Math.abs(m_pWordSeg[nIndex][k - 1].nHandle) == 27904 || Math
									.abs(m_pWordSeg[nIndex][k - 1].nHandle) == 29696)
							&& (Utility.strcmp(sInitChar, "—".getBytes()) || sInitChar[0] == '-')
							&& (m_pWordSeg[nIndex][k].sWord.length > nCharIndex)) {
						// Split the sInitChar from the original word: the
						// remainder becomes word k + 1, the dash stays as
						// word k with POS 'w'
						byte[] bt = GFCommon
								.bytesCopy(m_pWordSeg[nIndex][k].sWord,
										nCharIndex,
										m_pWordSeg[nIndex][k].sWord.length
												- nCharIndex);
						GFCommon.bytesCopy(m_pWordSeg[nIndex][k + 1].sWord, bt,
								0, bt.length);
						m_pWordSeg[nIndex][k + 1].dValue = m_pWordSeg[nIndex][k].dValue;
						m_pWordSeg[nIndex][k + 1].nHandle = 27904;
						m_pWordSeg[nIndex][k].sWord[nCharIndex] = 0;
						m_pWordSeg[nIndex][k].dValue = 0;
						m_pWordSeg[nIndex][k].nHandle = 30464;// 'w'*256;
						m_graphOptimum.SetElement(nStartVertex,
								nStartVertex + 1, m_pWordSeg[nIndex][k].dValue,
								m_pWordSeg[nIndex][k].nHandle,
								m_pWordSeg[nIndex][k].sWord);
						nStartVertex += 1;
						k += 1;
					}
					nLen = m_pWordSeg[nIndex][k].sWord.length;
					if (sAtom.length == 2
							&& Utility.CC_Find("月日时分秒".getBytes(), sAtom)
							|| "月份".equals(GFString.getChineseString(sAtom,
									"gb2312"))) {
						// Number followed by a time unit: append the unit
						Utility.strcat(m_pWordSeg[nIndex][k].sWord, sAtom);
						Utility.strcpy(sCurWord, "未##时".getBytes());
						nPOS = -29696;// 't'*256;Set the POS with 't'
					} else if (Utility.strcmp(sAtom, "年".getBytes())) {
						if (IsYearTime(m_pWordSeg[nIndex][k].sWord)) {
							// e.g. 1998年
							Utility.strcat(m_pWordSeg[nIndex][k].sWord, sAtom);
							Utility.strcpy(sCurWord, "未##时".getBytes());
							nPOS = -29696;// Set the POS with 't'
						} else {
							Utility.strcpy(sCurWord, "未##数".getBytes());
							nPOS = -27904;// Set the POS with 'm'
							i--;// Can not be a time word
						}
					} else {
						// e.g. 早晨/t 五点/t: inspect the last two bytes of
						// the word
						byte[] bt = GFCommon.bytesCopy(
								m_pWordSeg[nIndex][k].sWord,
								m_pWordSeg[nIndex][k].sWord.length - 2, 2);
						if (Utility.strcmp(bt, "点".getBytes())) {
							Utility.strcpy(sCurWord, "未##时".getBytes());
							nPOS = -29696;// Set the POS with 't'
						} else {
							bt = GFCommon.bytesCopy(
									m_pWordSeg[nIndex][k].sWord, nLen - 2,
									m_pWordSeg[nIndex][k].sWord.length - nLen
											+ 2);
							if (!Utility.CC_Find("∶·．／".getBytes(), bt)
									&& m_pWordSeg[nIndex][k].sWord[nLen - 1] != '.'
									&& m_pWordSeg[nIndex][k].sWord[nLen - 1] != '/') {
								Utility.strcpy(sCurWord, "未##数".getBytes());
								nPOS = -27904;// 'm'*256;Set the POS with 'm'
							} else if (nLen > sInitChar.length) {
								// Get rid of the trailing separator,
								// example: "1."
								if (m_pWordSeg[nIndex][k].sWord[nLen - 1] == '.'
										|| m_pWordSeg[nIndex][k].sWord[nLen - 1] == '/')
									m_pWordSeg[nIndex][k].sWord[nLen - 1] = 0;
								else
									m_pWordSeg[nIndex][k].sWord[nLen - 2] = 0;
								Utility.strcpy(sCurWord, "未##数".getBytes());
								nPOS = -27904;// 'm'*256;Set the POS with 'm'
								i--;
							}
						}
						i--;// Not num, back to previous word
					}
				}
				fValue = 0;
				nEndVertex = nSegRoute[nIndex][i + 1];// Ending POS changed to
														// latter
			}
			m_pWordSeg[nIndex][k].nHandle = nPOS;// Get the POS of current
													// word
			m_pWordSeg[nIndex][k].dValue = fValue;// Value stored for the
													// current word
			// Generate optimum segmentation graph according to the
			// segmentation result
			m_graphOptimum.SetElement(nStartVertex, nEndVertex, fValue, nPOS,
					sCurWord);
			i++;// Skip to next atom
			k++;// Accept next word
		}
		m_pWordSeg[nIndex][k].sWord[0] = 0;
		m_pWordSeg[nIndex][k].nHandle = -1;// Set ending
		return true;
	}

}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -