cseggraph.java

来自「基于中科院的ICTCLAS实现中文分词系统开发工具是JAVA.经测试,效果很好」· Java 代码 · 共 232 行
JAVA
232 行
package com.gftech.ictclas4j.segment;

import com.gftech.common.GFCommon;
import com.gftech.ictclas4j.utility.CDictionary;
import com.gftech.ictclas4j.utility.Final;
import com.gftech.ictclas4j.utility.Utility;

/**
 * Builds the segmentation word graph for one sentence.
 *
 * <p>The sentence is first cut into atoms by {@link #AtomSegment(byte[])};
 * {@link #GenerateWordNet(byte[], CDictionary, boolean)} then adds one edge per
 * atom plus one edge per dictionary word formed by consecutive atoms to
 * {@link #m_segGraph}.
 *
 * <p>All text is handled as C-style, zero-terminated byte strings in which a
 * negative first byte introduces a two-byte character.
 * NOTE(review): string literals are converted with the platform default
 * charset ({@code getBytes()}); this presumably expects a double-byte default
 * such as GBK - confirm against the callers and the dictionary encoding.
 */
public class CSegGraph {
	// segGraph: The segmentation word graph
	// Row first array
	public CDynamicArray m_segGraph;

	// pAtoms: the buffer for returned segmented atoms
	// Such as a Chinese Char, digit, single byte, or delimiters
	public byte[][] m_sAtom = new byte[Final.MAX_SENTENCE_LEN][Final.WORD_MAXLENGTH];

	// Save the individual length (in bytes) of each atom in the array
	public int[] m_nAtomLength = new int[Final.MAX_SENTENCE_LEN];

	// pAtoms: the POS property (character type) of each atom
	public int[] m_nAtomPOS = new int[Final.MAX_SENTENCE_LEN];

	// The count of atoms
	public int m_nAtomCount;

	/**
	 * Creates the graph container and configures it as not row-first.
	 */
	public CSegGraph() {
		// The field used to be dereferenced while still null, so construction
		// always failed with a NullPointerException.
		// NOTE(review): assumes CDynamicArray has an accessible no-argument
		// constructor - confirm.
		m_segGraph = new CDynamicArray();
		m_segGraph.SetRowFirst(false);
	}

	/**
	 * Generates the segmentation word net for a sentence, i.e. lists all the
	 * possible words as edges of {@link #m_segGraph}.
	 *
	 * @param sSentence     the sentence bytes
	 * @param dictCore      core dictionary used to look up candidate words
	 * @param bOriginalFreq when {@code true} edges carry the raw frequency and
	 *                      the word bytes; when {@code false} they carry a
	 *                      log-scaled cost and no word
	 * @return always {@code true}
	 */
	public boolean GenerateWordNet(byte[] sSentence, CDictionary dictCore,
			boolean bOriginalFreq) {
		int i, j, k, nPOS;
		byte[] sWord = new byte[Final.WORD_MAXLENGTH];
		byte[] sWordMatch = new byte[Final.WORD_MAXLENGTH];
		// NOTE(review): Java passes ints by value, so GetMaxMatch/GetHandle
		// cannot report a handle or a match count through nHandleTemp and
		// nMatchCount; nMatchCount therefore stays 0 here unless CDictionary
		// offers another way to obtain it - verify against CDictionary.
		int nHandleTemp = 0;
		int nMatchCount = 0;
		int[] nMatchFreq = new int[20];
		int[] nMatchHandle = new int[20];
		int nTotalFreq;
		double dValue;

		m_nAtomCount = 0;
		m_segGraph.SetEmpty();// Set segmentation graph empty

		// Atomic segmentation: fills m_sAtom, m_nAtomLength, m_nAtomPOS, m_nAtomCount
		AtomSegment(sSentence);

		// Init the cost array: one edge (i, i+1) per atom
		for (i = 0; i < m_nAtomCount; i++) {
			if (m_nAtomPOS[i] == Final.CT_CHINESE) {// The atom is a Chinese char
				if (!bOriginalFreq)// Not original frequency
					m_segGraph.SetElement(i, i + 1, Math.log(Final.MAX_FREQUENCE), 0, null);
				else
					m_segGraph.SetElement(i, i + 1, 0, 0, m_sAtom[i]);
			} else {// Other atom
				Utility.strcpy(sWord, m_sAtom[i]);// init the word
				dValue = Final.MAX_FREQUENCE;
				switch (m_nAtomPOS[i]) {
				case Final.CT_INDEX:
				case Final.CT_NUM:
					nPOS = -27904;// -('m'*256)
					Utility.strcpy(sWord, "未##数".getBytes());
					dValue = 0;
					break;
				case Final.CT_DELIMITER:
					nPOS = 30464;// 'w'*256
					break;
				case Final.CT_LETTER:
					nPOS = -'n' * 256 - 'x';
					dValue = 0;
					Utility.strcpy(sWord, "未##串".getBytes());
					break;
				case Final.CT_SINGLE:// e.g. 12021-2129-3121
					// Compare against the atom's string length, not the fixed
					// buffer capacity (m_sAtom[i].length is always WORD_MAXLENGTH).
					if (Utility.GetCharCount("+-1234567890".getBytes(), m_sAtom[i]) == cStrLen(m_sAtom[i])) {
						nPOS = -27904;// -('m'*256)
						Utility.strcpy(sWord, "未##数".getBytes());
					} else {
						nPOS = -'n' * 256 - 'x';
						Utility.strcpy(sWord, "未##串".getBytes());
					}
					dValue = 0;
					break;
				default:
					nPOS = m_nAtomPOS[i];
					break;
				}
				// NOTE(review): sWord is reused and modified below; this is only
				// safe if SetElement copies the bytes - confirm.
				if (!bOriginalFreq)// Not original frequency
					m_segGraph.SetElement(i, i + 1, 0, nPOS, null);
				else
					m_segGraph.SetElement(i, i + 1, dValue, nPOS, sWord);
			}
		}

		// Try every start atom i and grow the candidate word atom by atom
		i = 0;
		while (i < m_nAtomCount) {
			Utility.strcpy(sWord, m_sAtom[i]);// Get the current atom
			j = i + 1;
			if (Utility.strcmp(sWord, "月".getBytes())
					&& Utility.strcmp(m_sAtom[i + 1], "份".getBytes()))// Don't split 月份
				j += 1;
			// j<=m_nAtomCount controls the end of the string
			while (j <= m_nAtomCount && dictCore.GetMaxMatch(sWord, sWordMatch, nHandleTemp)) {
				if (Utility.strcmp(sWordMatch, sWord)) {// found the current word
					nTotalFreq = 0;
					dictCore.GetHandle(sWord, nMatchCount, nMatchHandle, nMatchFreq);
					for (k = 0; k < nMatchCount; k++) {// Add the frequency
						nTotalFreq += nMatchFreq[k];
					}
					// Rule to exclude some words from being formed, e.g. 1年内, 1999年末.
					// The word must be exactly 4 bytes long; the buffer capacity
					// (sWord.length) is not the string length.
					if (cStrLen(sWord) == 4 && i >= 1
							&& (Utility.IsAllNum(m_sAtom[i - 1]) || Utility.IsAllChineseNum(m_sAtom[i - 1]))
							&& (Utility.strncmp(sWord, 0, "年".getBytes(), 2)
									|| Utility.strncmp(sWord, 0, "月".getBytes(), 2))) {
						if (Utility.CC_Find("末内中底前间初".getBytes(),
								GFCommon.bytesCopy(sWord, 2, sWord.length - 2)))
							break;
					}
					if (nMatchCount == 1) {// Only one POS for the word: store it
						if (!bOriginalFreq)// Not original frequency
							m_segGraph.SetElement(i, j,
									-Math.log(nTotalFreq + 1) + Math.log(Final.MAX_FREQUENCE),
									nMatchHandle[0], null);
						else
							m_segGraph.SetElement(i, j, nTotalFreq, nMatchHandle[0], sWord);
					} else {
						if (!bOriginalFreq)// Not original frequency
							m_segGraph.SetElement(i, j,
									-Math.log(nTotalFreq + 1) + Math.log(Final.MAX_FREQUENCE), 0, null);
						else
							m_segGraph.SetElement(i, j, nTotalFreq, 0, sWord);
					}
				}
				Utility.strcat(sWord, m_sAtom[j++]);
			}
			i += 1;// Start from the next atom
		}
		return true;
	}

	/**
	 * Segments the atomic members from the original sentence into
	 * {@link #m_sAtom}, recording each atom's byte length in
	 * {@link #m_nAtomLength}, its character type in {@link #m_nAtomPOS} and the
	 * number of atoms in {@link #m_nAtomCount}.
	 *
	 * <p>Sentence begin/end markers, Chinese chars, index numbers, delimiters
	 * and "other" chars each form one atom; consecutive chars of any other type
	 * (number, single byte, letter) are merged into one atom.
	 *
	 * @param sSentence the sentence bytes; the whole array is scanned
	 * @return always {@code true}
	 */
	protected boolean AtomSegment(byte[] sSentence) {
		final int nLen = sSentence.length;
		// Marker lengths must be measured in bytes: String.length() counts chars
		// and is shorter than the byte form for multi-byte markers.
		final byte[] beginMark = Final.SENTENCE_BEGIN.getBytes();
		final byte[] endMark = Final.SENTENCE_END.getBytes();
		int i = 0;// byte offset into the sentence
		int j = 0;// index of the atom being built
		int nCurType, nNextType;
		byte[] sChar = new byte[3];
		sChar[2] = 0;// Set the char ending
		m_sAtom[j][0] = 0;// Set the first word as empty
		m_nAtomLength[j] = 0;

		if (nLen >= beginMark.length
				&& Utility.strncmp(sSentence, 0, beginMark, beginMark.length)) {
			Utility.strcpy(m_sAtom[j], beginMark);// First atom is the sentence beginning
			m_nAtomLength[j] = beginMark.length;
			m_nAtomPOS[j] = Final.CT_SENTENCE_BEGIN;
			i += m_nAtomLength[j];
			j += 1;
			m_sAtom[j][0] = 0;// Start the next atom empty
			m_nAtomLength[j] = 0;
		}

		// Continue from the current offset. A for-loop restarting at 0 would
		// re-scan the begin marker, and its own i++ would skip a byte after
		// every char because the body already advances i.
		while (i < nLen) {
			if (i + endMark.length <= nLen
					&& Utility.strncmp(sSentence, i, endMark, endMark.length)) {
				Utility.strcpy(m_sAtom[j], endMark);// Atom is the sentence ending
				m_nAtomLength[j] = endMark.length;
				m_nAtomPOS[j] = Final.CT_SENTENCE_END;
				i += m_nAtomLength[j];
				j += 1;
				m_sAtom[j][0] = 0;// Start the next atom empty
				m_nAtomLength[j] = 0;
				continue;
			}

			sChar[0] = sSentence[i];// Get the char with first byte
			sChar[1] = 0;
			i += 1;
			if (sChar[0] < 0 && i < nLen) {// Two byte char (second byte may be missing at the very end)
				sChar[1] = sSentence[i];// Get the char with second byte
				i += 1;
			}
			Utility.strcat(m_sAtom[j], sChar);
			nCurType = Utility.charType(sChar[0], sChar[1]);
			// A digit after '.' indicates '.' is a point inside a number.
			// byteAt() yields 0 past the end, standing in for a C string terminator.
			if (sChar[0] == '.'
					&& (Utility.charType(byteAt(sSentence, i), byteAt(sSentence, i + 1)) == Final.CT_NUM
							|| (byteAt(sSentence, i) >= '0' && byteAt(sSentence, i) <= '9')))
				nCurType = Final.CT_NUM;
			// Record its property, just for convenience of continuous processing
			m_nAtomPOS[j] = nCurType;

			if (nCurType == Final.CT_CHINESE || nCurType == Final.CT_INDEX
					|| nCurType == Final.CT_DELIMITER || nCurType == Final.CT_OTHER) {
				// Chinese char, index number, delimiter and other are each one atom
				m_nAtomLength[j] = cStrLen(m_sAtom[j]);// Save its length in bytes
				j += 1;// Skip to next atom
				m_sAtom[j][0] = 0;// init
			} else {
				// Number, single char, letter: merge with following chars of the same type
				nNextType = 255;
				if (i < nLen)
					nNextType = Utility.charType(sSentence[i], byteAt(sSentence, i + 1));
				if (nNextType != nCurType || i == nLen) {
					// Reaching the end, or the next char type differs from the current one
					m_nAtomLength[j] = cStrLen(m_sAtom[j]);// Save its length in bytes
					j += 1;
					m_sAtom[j][0] = 0;// init
				}
			}
		}
		m_nAtomCount = j;// The count of segmentation atoms
		return true;
	}

	// Length of a zero-terminated byte string: bytes before the first 0, or the
	// whole buffer when it contains no 0.
	private static int cStrLen(byte[] buf) {
		int n = 0;
		while (n < buf.length && buf[n] != 0) {
			n++;
		}
		return n;
	}

	// Byte at idx, or 0 when idx lies past the end of the buffer.
	private static byte byteAt(byte[] buf, int idx) {
		return idx < buf.length ? buf[idx] : 0;
	}

}
cseggraph.java - 源码说明

本页面展示了「基于中科院的ICTCLAS实现中文分词系统开发工具是JAVA.经测试,效果很好」中的 cseggraph.java 源码文件，采用 Java 编程语言编写，共 232 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与ICTCLAS相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?