csegment.java

来自「基于中科院的ICTCLAS实现中文分词系统开发工具是JAVA.经测试,效果很好」· Java 代码 · 共 501 行 · 第 1/2 页
JAVA
501 行
package com.gftech.ictclas4j.segment;

import com.gftech.common.GFCommon;
import com.gftech.common.GFString;
import com.gftech.ictclas4j.utility.CDictionary;
import com.gftech.ictclas4j.utility.Final;
import com.gftech.ictclas4j.utility.TagWordResult;
import com.gftech.ictclas4j.utility.Utility;

public class CSegment {
	// Position map of the candidate words: BiGraphGenerate stores
	// row * Final.MAX_SENTENCE_LEN + col for each word, and BiPath2UniPath
	// decodes those values again with / and %.
	private int[] m_npWordPosMapTable;

	// Number of candidate words; BiGraphGenerate sets it from the value
	// returned by GetTail() and BiPath2UniPath uses it as an upper bound
	// for word indices.
	private int m_nWordCount;

	// Segmentation result buffer; the constructor allocates it as
	// Final.MAX_SEGMENT_NUM rows of Final.MAX_WORDS slots.
	public TagWordResult[][] m_pWordSeg;

	// Number of segmentation routes; used as the loop bound when words are
	// generated from the routes.
	public int m_nSegmentCount;

	// The optimized segmentation graph
	public CDynamicArray m_graphOptimum;

	// The segmentation graph
	public CSegGraph m_graphSeg;

	/**
	 * Creates a segmenter with an empty result buffer and freshly created
	 * segmentation graphs.
	 *
	 * The result buffer has Final.MAX_SEGMENT_NUM rows of Final.MAX_WORDS
	 * slots; the slots themselves are left null.
	 */
	public CSegment() {
		// Result buffer: one row per segmentation result
		m_pWordSeg = new TagWordResult[Final.MAX_SEGMENT_NUM][Final.MAX_WORDS];
		m_nWordCount = 0;// No candidate words recorded yet

		// Both graphs have to exist before they are used: the fields carry no
		// initializer, so calling SetRowFirst() on m_graphOptimum used to
		// throw a NullPointerException on every construction.
		// NOTE(review): assumes CDynamicArray and CSegGraph expose an
		// accessible no-arg constructor - confirm against those classes.
		m_graphOptimum = new CDynamicArray();
		m_graphSeg = new CSegGraph();
		m_graphOptimum.SetRowFirst(false);// Set row first
	}

	/**
	 * Segments on the optimum segmentation graph using the biword (bigram)
	 * link net and the N-shortest-path search.
	 *
	 * @param nResultCount
	 *            number of results requested from CNShortPath
	 * @param dSmoothingPara
	 *            data smoothing parameter handed to BiGraphGenerate
	 * @param dictBinary
	 *            the binary dictionary
	 * @param dictCore
	 *            the core dictionary
	 * @return always true
	 */
	public boolean BiOptimumSegment(int nResultCount, double dSmoothingPara,
			CDictionary dictBinary, CDictionary dictCore) {
		// The segmentation routes. Every slot starts as -1 because
		// BiPath2UniPath scans a route until it meets -1; previously only the
		// first MAX_SENTENCE_LEN / 8 slots were initialised and the rest
		// stayed 0, which is a valid word index rather than a terminator.
		int[][] nSegRoute = new int[Final.MAX_SEGMENT_NUM][Final.MAX_SENTENCE_LEN / 2];
		for (int[] route : nSegRoute) {
			java.util.Arrays.fill(route, -1);
		}

		// Holder for the biword link net. The array and its element must
		// exist before aBiwordsNet[0] is read; the former null array made
		// that access throw a NullPointerException.
		// NOTE(review): assumes CDynamicArray has an accessible no-arg
		// constructor - confirm.
		CDynamicArray[] aBiwordsNet = new CDynamicArray[] { new CDynamicArray() };
		// Generate the biword link net
		BiGraphGenerate(m_graphOptimum, aBiwordsNet[0], dSmoothingPara,
				dictBinary, dictCore);

		CNShortPath sp = new CNShortPath(aBiwordsNet, nResultCount);
		sp.ShortPath();
		// NOTE(review): m_nSegmentCount is an int passed by value, so Output()
		// cannot update the field through this argument - confirm how the
		// route count is meant to be reported back.
		sp.Output(nSegRoute, false, m_nSegmentCount);

		// NOTE(review): this is a reference assignment, so the SetEmpty() call
		// below acts on the object m_graphSeg.m_segGraph now refers to as
		// well - confirm that this is intended.
		m_graphSeg.m_segGraph = m_graphOptimum;
		m_graphOptimum.SetEmpty();// Set graph optimum empty

		for (int i = 0; i < m_nSegmentCount; i++) {
			// Path convert to unipath
			BiPath2UniPath(nSegRoute[i]);
			// Generate word according the Segmentation route
			GenerateWord(nSegRoute, i);
		}

		return true;
	}

	/**
	 * Segments a sentence using the biword (bigram) link net and the
	 * N-shortest-path search.
	 *
	 * The word net is first generated from the sentence through
	 * m_graphSeg.GenerateWordNet(sSentence, dictCore, true).
	 *
	 * @param sSentence
	 *            the sentence bytes; must not be null
	 * @param dSmoothingPara
	 *            data smoothing parameter handed to BiGraphGenerate
	 * @param dictCore
	 *            the core dictionary
	 * @param dictBinary
	 *            the binary dictionary
	 * @param nResultCount
	 *            number of results requested from CNShortPath
	 * @return always true
	 */
	public boolean BiSegment(byte[] sSentence, double dSmoothingPara,
			CDictionary dictCore, CDictionary dictBinary, int nResultCount) {
		// The segmentation routes, sized from the sentence length. Every slot
		// starts as -1 because BiPath2UniPath scans a route until it meets
		// -1; previously only the first nLen / 8 slots were initialised and
		// the rest stayed 0, which is a valid word index, not a terminator.
		int nLen = sSentence.length + 10;
		int[][] nSegRoute = new int[Final.MAX_SEGMENT_NUM][nLen / 2];
		for (int[] route : nSegRoute) {
			java.util.Arrays.fill(route, -1);
		}

		// Generate words array
		m_graphSeg.GenerateWordNet(sSentence, dictCore, true);

		// Holder for the biword link net. The array and its element must
		// exist before aBiwordsNet[0] is read; the former null array made
		// that access throw a NullPointerException.
		// NOTE(review): assumes CDynamicArray has an accessible no-arg
		// constructor - confirm.
		CDynamicArray[] aBiwordsNet = new CDynamicArray[] { new CDynamicArray() };
		// Generate the biword link net
		BiGraphGenerate(m_graphSeg.m_segGraph, aBiwordsNet[0], dSmoothingPara,
				dictBinary, dictCore);

		CNShortPath sp = new CNShortPath(aBiwordsNet, nResultCount);
		sp.ShortPath();
		// NOTE(review): m_nSegmentCount is an int passed by value, so Output()
		// cannot update the field through this argument - confirm how the
		// route count is meant to be reported back.
		sp.Output(nSegRoute, false, m_nSegmentCount);

		m_graphOptimum.SetEmpty();// Set graph optimum empty

		for (int i = 0; i < m_nSegmentCount; i++) {
			// Path convert to unipath
			BiPath2UniPath(nSegRoute[i]);
			// Generate word according the Segmentation route
			GenerateWord(nSegRoute, i);
		}

		return true;
	}

	/**
	 * Walks pItem from index 0 until an entry whose first word byte is 0 and
	 * passes every word met on the way to Utility.strcpy with sWordRet as the
	 * destination, so the last call is made with the final word of the list.
	 *
	 * @param pItem
	 *            result entries, ended by an entry whose sWord[0] is 0
	 * @param sWordRet
	 *            destination buffer; its first byte is cleared before the scan
	 * @return true when sWordRet[0] is 0 after the scan, false otherwise
	 */
	public boolean GetLastWord(TagWordResult[] pItem, byte[] sWordRet) {
		sWordRet[0] = 0;
		for (int idx = 0; pItem[idx].sWord[0] != 0; idx++) {
			Utility.strcpy(sWordRet, pItem[idx].sWord);
		}
		return sWordRet[0] == 0;
	}

	/**
	 * Counts the entries of pItem that precede the first entry whose word
	 * starts with a 0 byte.
	 *
	 * @param pItem
	 *            result entries, ended by an entry whose sWord[0] is 0
	 * @return the number of entries before that terminating entry
	 */
	public int GetResultCount(TagWordResult[] pItem) {
		int total = 0;
		for (; pItem[total].sWord[0] != 0; total++) {
			// nothing to do: the loop header does the counting
		}
		return total;
	}

	/**
	 * Runs the N-shortest-path search on the optimum segmentation graph and
	 * generates the words for every resulting route.
	 *
	 * @param nResultCount
	 *            number of results requested from CNShortPath
	 * @return always true
	 */
	public boolean OptimumSegmet(int nResultCount) {
		// One route buffer per possible segmentation result
		int[][] routes = new int[Final.MAX_SEGMENT_NUM][Final.MAX_SENTENCE_LEN / 2];

		CNShortPath shortestPaths = new CNShortPath(m_graphOptimum, nResultCount);
		shortestPaths.ShortPath();
		// NOTE(review): m_nSegmentCount is an int passed by value - confirm
		// how Output() reports the route count back.
		shortestPaths.Output(routes, false, m_nSegmentCount);

		// NOTE(review): reference assignment, so the SetEmpty() call acts on
		// the object m_graphSeg.m_segGraph now refers to as well - confirm.
		m_graphSeg.m_segGraph = m_graphOptimum;
		m_graphOptimum.SetEmpty();// Set graph optimum empty

		// Generate word according the Segmentation route
		for (int routeIndex = 0; routeIndex < m_nSegmentCount; routeIndex++) {
			GenerateWord(routes, routeIndex);
		}
		return true;
	}

	// Word Segmentation based on optimum segmentation graph
	// After unknown word recognition
	/**
	 * Segments a sentence with the N-shortest-path search on the word net
	 * generated from the core dictionary.
	 *
	 * The requested result count is now honoured, as it is in the other
	 * segmentation methods of this class; it used to be overwritten with 10
	 * unconditionally. A non-positive value still falls back to 10.
	 *
	 * @param sSentence
	 *            the sentence bytes
	 * @param dictCore
	 *            the core dictionary
	 * @param nResultCount
	 *            number of results requested from CNShortPath; values below 1
	 *            are replaced by 10
	 * @return always true
	 */
	public boolean Segment(byte[] sSentence, CDictionary dictCore,
			int nResultCount) {
		// Keep the historical value of 10 only as the fallback
		int resultCount = nResultCount > 0 ? nResultCount : 10;

		// One route buffer per possible segmentation result
		int[][] nSegRoute = new int[Final.MAX_SEGMENT_NUM][Final.MAX_SENTENCE_LEN / 2];

		m_graphSeg.m_segGraph.SetRowFirst(false);
		m_graphOptimum.SetRowFirst(false);
		m_graphSeg.GenerateWordNet(sSentence, dictCore, false);

		CNShortPath sp = new CNShortPath(m_graphSeg.m_segGraph, resultCount);
		sp.ShortPath();
		// NOTE(review): m_nSegmentCount is an int passed by value, so Output()
		// cannot update the field through this argument - confirm how the
		// route count is meant to be reported back.
		sp.Output(nSegRoute, false, m_nSegmentCount);
		m_graphOptimum.SetEmpty();// Set graph optimum empty

		for (int i = 0; i < m_nSegmentCount; i++) {
			// Generate word according the Segmentation route
			GenerateWord(nSegRoute, i);
		}

		return true;
	}

	/**
	 * Converts a biword path into a unipath in place.
	 *
	 * Each leading entry that is neither -1 nor at or above m_nWordCount is
	 * looked up in m_npWordPosMapTable and replaced by the looked-up value
	 * divided by Final.MAX_SENTENCE_LEN. If the last looked-up value is
	 * positive, its remainder modulo Final.MAX_SENTENCE_LEN is appended. The
	 * path is then closed with -1.
	 *
	 * @param npPath
	 *            the route to convert; it is modified in place
	 * @return always true
	 */
	protected boolean BiPath2UniPath(int[] npPath) {
		int pos = 0;
		int packed = -1;// Last position-map value read, -1 if none was read

		for (; npPath[pos] != -1 && npPath[pos] < m_nWordCount; pos++) {
			packed = m_npWordPosMapTable[npPath[pos]];
			npPath[pos] = packed / Final.MAX_SENTENCE_LEN;
		}

		if (packed > 0) {
			npPath[pos] = packed % Final.MAX_SENTENCE_LEN;
			pos++;
		}
		npPath[pos] = -1;
		return true;
	}

	/**
	 * <pre>
	 *      CDynamicArray &amp;aWord: the words array
	 *      CDynamicArray &amp;aBinaryWordNet:the net between words
	 *      double dSmoothingPara: the parameter of data smoothing
	 *      CDictionary &amp;DictBinary: the binary dictionary
	 *      CDictionary &amp;DictCore: the Core dictionary
	 * </pre>
	 */
	protected boolean BiGraphGenerate(CDynamicArray aWord,
			CDynamicArray aBinaryWordNet, double dSmoothingPara,
			CDictionary DictBinary, CDictionary DictCore) {

		TagArrayChain[] pTail = null;
		TagArrayChain[] pCur = null;
		TagArrayChain[] pNextWords = null;// Temp buffer
		int nWordIndex = 0, nTwoWordsFreq = 0, nCurWordIndex, nNextWordIndex;
		// nWordIndex: the index number of current word
		double dCurFreqency, dValue, dTemp;
		byte[] sTwoWords = new byte[Final.WORD_MAXLENGTH];
		m_nWordCount = aWord.GetTail(pTail);// Get tail element and return the
											// words count

		if (m_nWordCount > 0)// Word count is greater than 0
			m_npWordPosMapTable = new int[m_nWordCount];// Record the position
														// of possible words
		pCur = aWord.GetHead();
		for (TagArrayChain trc : pCur)// Set the position map of words
		{
			m_npWordPosMapTable[nWordIndex++] = trc.row
					* Final.MAX_SENTENCE_LEN + trc.col;

		}

		pCur = aWord.GetHead();
		for (TagArrayChain trc : pCur)//
		{
			if (trc.nPOS >= 0)// It's not an unknown words
				dCurFreqency = trc.value;
			else
				// Unknown words
				dCurFreqency = DictCore.GetFrequency(trc.sWord, 2);
			aWord.GetElement(trc.col, -1, pCur, pNextWords);// Get next words
															// which begin with
															// pCur->col

			for (TagArrayChain trc2 : pNextWords) {
				if (trc2.row == trc.col) {
					// Current words frequency
					Utility.strcpy(sTwoWords, trc.sWord);
					Utility.strcat(sTwoWords, Final.WORD_SEGMENTER.getBytes());
					Utility.strcat(sTwoWords, trc2.sWord);
					nTwoWordsFreq = DictBinary.GetFrequency(sTwoWords, 3);
					// Two linked Words frequency
					dTemp = (double) 1 / Final.MAX_FREQUENCE;
					// Smoothing
					dValue = -Math
							.log(dSmoothingPara
									* (1 + dCurFreqency)
									/ (Final.MAX_FREQUENCE + 80000)
									+ (1 - dSmoothingPara)
									* ((1 - dTemp) * nTwoWordsFreq
											/ (1 + dCurFreqency) + dTemp));
					// -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1
csegment.java - 源码说明

本页面展示了「基于中科院的ICTCLAS实现中文分词系统开发工具是JAVA.经测试,效果很好」中的 csegment.java 源码文件，采用 Java 编程语言编写，共 501 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与ICTCLAS相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?