📄 cspan.java

📁 基于中科院的ICTCLAS实现中文分词系统开发工具是JAVA.经测试,效果很好
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
package com.gftech.ictclas4j.tag;

import com.gftech.common.GFCommon;
import com.gftech.common.GFString;
import com.gftech.ictclas4j.utility.CContextStat;
import com.gftech.ictclas4j.utility.CDictionary;
import com.gftech.ictclas4j.utility.Final;
import com.gftech.ictclas4j.utility.TagWordResult;
import com.gftech.ictclas4j.utility.Utility;

public class CSpan {
	// The type of tagging. The constructor sets it to TT_NORMAL and
	// SetTagType replaces it.
	private Final.TAG_TYPE m_tagType;

	// Running word-position offset; GetFrom advances it to the end position
	// of each word it stores.
	private int m_nStartPos;

	// Record the best tag chosen for each word slot. Readers scan from index 1
	// and stop at the first entry that is not greater than -1.
	private int[] m_nBestTag = new int[Final.MAX_WORDS_PER_SENTENCE];

	// Bytes of each word of the current segment, one row per word slot.
	private byte[][] m_sWords = new byte[Final.MAX_WORDS_PER_SENTENCE][Final.WORD_MAXLENGTH];

	// Position of each word slot; GetFrom sets entry i+1 to entry i plus
	// m_sWords[i].length.
	private int[] m_nWordPosition = new int[Final.MAX_WORDS_PER_SENTENCE];

	// Candidate tags per word slot. m_nTags[0][0] holds the begin tag and the
	// constructor writes -1 into m_nTags[0][1].
	private int[][] m_nTags = new int[Final.MAX_WORDS_PER_SENTENCE][Final.MAX_POS_PER_WORD];

	// NOTE(review): not referenced in the visible part of this file;
	// presumably back-pointers for the best-path search - confirm.
	private byte[][] m_nBestPrev = new byte[Final.MAX_WORDS_PER_SENTENCE][Final.MAX_POS_PER_WORD];

	// Starts at 1 in the constructor; POSTagging uses it as the upper bound
	// when copying the best tags back to the word items.
	private int m_nCurLength;

	// Cost per word slot and candidate tag; GetFrom fills it from log
	// frequencies.
	private double[][] m_dFrequency = new double[Final.MAX_WORDS_PER_SENTENCE][Final.MAX_POS_PER_WORD];

	// The number of unknown words recorded so far; also the next free row in
	// m_nUnknownWords and m_dWordsPossibility.
	public int m_nUnknownIndex;

	// The start ([n][0]) and ending ([n][1]) position of each recorded
	// unknown word.
	public int[][] m_nUnknownWords = new int[Final.MAX_UNKNOWN_PER_SENTENCE][2];

	// The possibility of each recorded unknown word.
	public double[] m_dWordsPossibility = new double[Final.MAX_UNKNOWN_PER_SENTENCE];

	// Context statistics used by LoadContext, ComputePossibility and GetFrom.
	// NOTE(review): no initializer is visible here or in the constructor;
	// confirm it is assigned before LoadContext is called.
	public CContextStat m_context;

	/**
	 * Creates a span in the default (normal POS tagging) mode with an empty
	 * word lattice.
	 *
	 * <p>The tagging type is assigned before the begin tag is chosen. The
	 * previous code tested {@code m_tagType} while it was still {@code null},
	 * so it always stored the non-normal begin tag (100) even though the type
	 * was then set to {@code TT_NORMAL}.
	 */
	public CSpan() {
		m_tagType = Final.TAG_TYPE.TT_NORMAL;// Default tagging type

		// Begin tag for normal tagging; the non-normal modes use 100.
		m_nTags[0][0] = 0;
		// -1 terminates the candidate-tag list of slot 0.
		m_nTags[0][1] = -1;
		m_dFrequency[0][0] = 0;
		m_nCurLength = 1;
		m_nUnknownIndex = 0;
		m_nStartPos = 0;
		m_nWordPosition[1] = 0;
		m_sWords[0][0] = 0;
	}

	/**
	 * Scans the best-tag sequence for place-name spans and records each one as
	 * an unknown word.
	 *
	 * <p>A span is triggered by a word tagged 1 or 2. From a tag-1 trigger the
	 * span takes the following run of tag-1 words, then a run of tag-2 words,
	 * then a run of tag-3 words. A tag-2 trigger skips the tag-1 run and adds
	 * one to the penalty. Every tag-1 word after the second and every tag-3
	 * word after the first also adds one to the penalty. The penalty is never
	 * reset, so it accumulates over the whole call.
	 *
	 * @param dictCore  core dictionary; not used by this method
	 * @param placeDict dictionary passed to {@code ComputePossibility}
	 * @return always {@code true}
	 */
	public boolean PlaceRecognize(CDictionary dictCore, CDictionary placeDict) {
		int spanStart = 1;
		int spanEnd = 1;
		int cursor = 1;
		double penalty = 1.0;

		while (m_nBestTag[cursor] > -1) {
			final int trigger = m_nBestTag[cursor];
			if (trigger == 1 || trigger == 2) {
				spanStart = cursor;
				spanEnd = spanStart + 1;
				if (trigger == 1) {
					// Further tag-1 words: the one right after the trigger is free.
					for (; m_nBestTag[spanEnd] == 1; spanEnd++) {
						if (spanEnd > spanStart + 1) {
							penalty += 1.0;
						}
					}
				} else {
					// Span opened by a tag-2 word.
					penalty += 1.0;
				}
				// Run of tag-2 words.
				while (m_nBestTag[spanEnd] == 2) {
					spanEnd++;
				}
				// Run of tag-3 words: the first one is free.
				final int tailStart = spanEnd;
				for (; m_nBestTag[spanEnd] == 3; spanEnd++) {
					if (spanEnd > tailStart) {
						penalty += 1.0;
					}
				}
			}

			if (spanEnd > spanStart) {
				m_nUnknownWords[m_nUnknownIndex][0] = m_nWordPosition[spanStart];
				m_nUnknownWords[m_nUnknownIndex][1] = m_nWordPosition[spanEnd];
				m_dWordsPossibility[m_nUnknownIndex++] = ComputePossibility(
						spanStart, spanEnd - spanStart + 1, placeDict)
						+ Math.log(penalty);
				// Collapse the span so it is not recorded again.
				spanStart = spanEnd;
			}

			cursor = (cursor < spanEnd) ? spanEnd : cursor + 1;
		}
		return true;
	}

	/**
	 * Scans the best-tag sequence for person-name role patterns and records
	 * each accepted match as an unknown word.
	 *
	 * <p>Each best tag is turned into a letter ({@code tag + 'A'}). At every
	 * position the pattern table is tried in order; the first pattern that
	 * matches and is not excluded is stored in {@code m_nUnknownWords} and
	 * {@code m_dWordsPossibility}, and scanning resumes after the match.
	 *
	 * @param personDict dictionary used for role frequencies and passed to
	 *                   {@code ComputePossibility}
	 * @return always {@code true}
	 */
	public boolean PersonRecognize(CDictionary personDict) {
		// Role letters, pre-filled with 'z' so unused slots match no pattern.
		byte[] sPOS = new byte[Final.MAX_WORDS_PER_SENTENCE];
		for (int i = 0; i < sPOS.length; i++)
			sPOS[i] = "z".getBytes()[0];

		byte[] sPersonName = new byte[100];
		// Pattern table; the empty string (length 0) terminates it.
		String[] sPatterns = { "BBCD", "BBC", "BBE", "BBZ", "BCD", "BEE", "BE",
				"BG", "BXD", "BZ", "CDCD", "CD", "EE", "FB", "Y", "XD", "" };
		// Factor per pattern, in the same order as sPatterns:
		// BBCD BBC BBE BBZ BCD BEE BE BG
		double dFactor[] = { 0.003606, 0.000021, 0.001314, 0.000315, 0.656624,
				0.000021, 0.146116, 0.009136,
				// BXD BZ CDCD CD EE FB Y XD
				0.000042, 0.038971, 0, 0.090367, 0.000273, 0.009157, 0.034324,
				0.009735, 0 };
		// About parameter (pattern, count, factor):
		/*
		 * BBCD 343 0.003606 BBC 2 0.000021 BBE 125 0.001314 BBZ 30 0.000315 BCD
		 * 62460 0.656624 BEE 0 0.000000 BE 13899 0.146116 BG 869 0.009136 BXD 4
		 * 0.000042 BZ 3707 0.038971 CD 8596 0.090367 EE 26 0.000273 FB 871
		 * 0.009157 Y 3265 0.034324 XD 926 0.009735
		 *
		 * NOTE(review): this list gives BEE a factor of 0.000000 but dFactor
		 * holds 0.000021 for it - confirm which is intended.
		 */
		// The person recognition patterns set
		// BBCD: surname + surname + given-name char 1 + given-name char 2
		// BBE: surname + surname + single-char given name
		// BBZ: surname + surname + two-char given name that forms a word
		// BCD: surname + given-name char 1 + given-name char 2
		// BE: surname + single-char given name
		// BEE: surname + single-char name + single-char name, e.g. Han Leilei
		// BG: surname + suffix
		// BXD: surname + (surname and first given-name char forming a word)
		//      + last given-name char
		// BZ: surname + two-char given name that forms a word
		// B: surname
		// CD: given-name char 1 + given-name char 2
		// EE: single-char name + single-char name
		// FB: prefix + surname
		// XD: (surname and first given-name char forming a word)
		//     + last given-name char
		// Y: surname and single-char given name forming a word
		int nPatternLen[] = { 4, 3, 3, 3, 3, 3, 2, 2, 3, 2, 4, 2, 2, 2, 1, 2, 0 };
		int i = 0;
		for (i = 1; m_nBestTag[i] > -1; i++)
			// Convert to string from POS: tag 0 -> 'A', tag 1 -> 'B', ...
			sPOS[i] = (byte) (m_nBestTag[i] + 'A');
		// i is now the index one past the last tagged word.
		sPOS[i] = 0;
		int j = 1, k, nPos;// Find the proper pattern from the first POS
		int nLittleFreqCount;// Counter for the person name role with little
								// frequency
		boolean bMatched = false;
		while (j < i) {
			bMatched = false;
			for (k = 0; !bMatched && nPatternLen[k] > 0; k++) {
				// Find the proper pattern k: the role letters starting at j
				// must match the pattern, and neither the word before the
				// match nor the word after it may start with the first byte
				// of the middle dot.
				// NOTE(review): presumably Utility.strncmp returns true when
				// the first nPatternLen[k] bytes are equal - confirm.
				// NOTE(review): getBytes() uses the platform default charset
				// and only the first byte of each word is compared.
				if (Utility.strncmp(sPatterns[k].getBytes(),0, GFCommon.bytesCopy(sPOS , j,nPatternLen[k]), nPatternLen[k]) 
						&&  m_sWords[j - 1][0]!= "·".getBytes()[0] 
						&&  m_sWords[j + nPatternLen[k]][0]!= "·".getBytes()[0])  {
					// Rule 1 for exclusion: prefix + surname followed by a
					// given-name role (E or C) or a suffix (G) - the
					// (prefix + surname) rule does not apply; try the next
					// pattern.
					if ( "FB".equals(sPatterns[k])
							&& (sPOS[j + 2] == 'E' || sPOS[j + 2] == 'C' || sPOS[j + 2] == 'G')) { 
						continue;
					}
					/*
					 * Disabled rules kept from the original implementation:
					 *
					 * if((strcmp(sPatterns[k],"BEE")==0||strcmp(sPatterns[k],"EE")==0)&&strcmp(m_sWords[j+nPatternLen[k]-1],m_sWords[j+nPatternLen[k]-2])!=0)
					 * {//Rule 2 for exclusion: surname + single name + single
					 * name, or single name + single name: if the two E
					 * characters differ the rule is void, e.g. Han Leilei
					 * continue; }
					 *
					 * if(strcmp(sPatterns[k],"B")==0&&m_nBestTag[j+1]!=12)
					 * {//Rule 3 for exclusion: if the surname is not followed
					 * by a suffix the rule is void, e.g. Chairman Jiang,
					 * Auntie Liu continue; }
					 */
					// Get the possible name
					// Record the person position in the tag sequence
					nPos = j;
					sPersonName[0] = 0;
					// Record the number of role with little frequency
					// NOTE(review): nLittleFreqCount is only read by a
					// disabled check further down.
					nLittleFreqCount = 0;
					
					// NOTE(review): index is never used.
					int index=0;
					while (nPos < j + nPatternLen[k]) {// Get the possible
														// person name
						// Tags below 4 whose frequency in personDict is
						// under LITTLE_FREQUENCY count as low-frequency roles.
						if (m_nBestTag[nPos] < 4
								&& personDict.GetFrequency(m_sWords[nPos],
										m_nBestTag[nPos]) < Final.LITTLE_FREQUENCY)
							nLittleFreqCount++;// The counter increase
						// NOTE(review): whether this appends to or overwrites
						// sPersonName depends on GFCommon.bytesCopy, which is
						// not visible here - confirm.
						GFCommon.bytesCopy(sPersonName, m_sWords[nPos],0,m_sWords[nPos].length);
						nPos += 1;
					}
					/*
					 * Disabled rule kept from the original implementation:
					 *
					 * if(IsAllForeign(sPersonName)&&personDict.GetFrequency(m_sWords[j],1)<LITTLE_FREQUENCY)
					 * {//Exclusion foreign name //Rule 2 for exclusion: if all
					 * characters are foreign-name characters the (name char 1
					 * + name char 2) rule is void j+=nPatternLen[k]-1;
					 * continue; }
					 */
					if ("CDCD".equals(sPatterns[k])) {// Rule for
																// exclusion
					// The (char 1 + char 2 + char 1 + char 2) rule is itself
					// an exclusion rule, e.g. "soprano Dilibaier performs".
					// Rule 3 for exclusion: it applies when the name contains
					// foreign-name characters; otherwise the exclusion is
					// void, e.g. "the sisters Heiniu and Bainiu took first
					// place".
					// Either way nothing is recorded for CDCD; with foreign
					// characters j is moved forward before the next pattern
					// is tried.
						if (Utility.GetForeignCharCount(sPersonName) > 0)
							j += nPatternLen[k] - 1;
						continue;
					}
					/*
					 * Disabled rules kept from the original implementation:
					 *
					 * if(strcmp(sPatterns[k],"CD")==0&&IsAllForeign(sPersonName))
					 * {// j+=nPatternLen[k]-1; continue; }
					 * if(nLittleFreqCount==nPatternLen[k]||nLittleFreqCount==3)
					 * //e.g. Mahathir; "young Zayed and his Chinese auntie Hu
					 * Cailing, invited by Ambassador Hua Liming" //The all
					 * roles appear with two lower frequency, we will ignore
					 * them continue;
					 */
					// Record the match: start and end word positions plus
					// its possibility.
					m_nUnknownWords[m_nUnknownIndex][0] = m_nWordPosition[j];
					m_nUnknownWords[m_nUnknownIndex][1] = m_nWordPosition[j
							+ nPatternLen[k]];
					m_dWordsPossibility[m_nUnknownIndex] = -Math.log(dFactor[k])
							+ ComputePossibility(j, nPatternLen[k], personDict);
					// Multiply the factor (added as a negative log)
					m_nUnknownIndex += 1;
					// Resume scanning after the matched words.
					j += nPatternLen[k];
					bMatched = true;
				}
			}
			if (!bMatched)// Not matched, add j by 1
				j += 1;
		}
		return true;
	}

	/**
	 * Runs tagging over the word items, one segment at a time (POS tagging
	 * with Hidden Markov Model).
	 *
	 * <p>For each segment, {@code GetFrom} loads words starting at the current
	 * item, {@code GetBestPOS} picks the best tags, and the result is handled
	 * according to the tag type: normal tagging writes the tags (and updated
	 * frequencies) back into {@code pWordItems}, while the recognition modes
	 * call {@code PersonRecognize} or {@code PlaceRecognize}.
	 *
	 * <p>Two fixes relative to the previous version:
	 * <ul>
	 * <li>The end-of-segment reset is applied to this instance. The old code
	 * created a new {@code CSpan} and reset that object, so this span was
	 * never reset between segments.</li>
	 * <li>The loop stops at the end of {@code pWordItems} instead of indexing
	 * past it when no terminating item (first byte 0) is present.</li>
	 * </ul>
	 *
	 * @param pWordItems  word items to tag; normal tagging updates
	 *                    {@code nHandle} and {@code dValue} in place
	 * @param dictCore    core dictionary
	 * @param dictUnknown unknown-word recognition dictionary
	 * @return always {@code true}
	 */
	public boolean POSTagging(TagWordResult[] pWordItems, CDictionary dictCore,
			CDictionary dictUnknown) {
		int i = 0;
		Reset(false);
		while (i > -1 && i < pWordItems.length && pWordItems[i].sWord[0] != 0) {
			final int nStartPos = i;// Start position of this segment
			i = GetFrom(pWordItems, nStartPos, dictCore, dictUnknown);
			GetBestPOS();
			switch (m_tagType) {
			case TT_NORMAL:// normal POS tagging
				for (int j = 1; m_nBestTag[j] != -1 && j < m_nCurLength; j++) {
					// Store the best POS tagging; slot j maps to item
					// j + nStartPos - 1.
					final TagWordResult item = pWordItems[j + nStartPos - 1];
					item.nHandle = m_nBestTag[j];
					// Exists in the core dictionary: update its frequency as
					// a POS value. Items whose dValue is not positive are
					// left alone.
					if (item.dValue > 0 && dictCore.IsExist(item.sWord, -1)) {
						item.dValue = dictCore.GetFrequency(item.sWord, m_nBestTag[j]);
					}
				}
				break;
			case TT_PERSON:// Person recognition
				PersonRecognize(dictUnknown);
				break;
			case TT_PLACE:// Place name recognition
			case TT_TRANS_PERSON:// Transliteration Person
				PlaceRecognize(dictCore, dictUnknown);
				break;
			default:
				break;
			}

			// Reset this span before loading the next segment.
			// NOTE(review): Reset is defined outside the visible part of this
			// file. `true` is assumed to mean "continue from the previous
			// segment" (keeping the unknown words recorded so far), as in
			// the upstream C++ CSpan::Reset(bool bContinue = true) - confirm.
			Reset(true);
		}
		return true;
	}

	// (Describes POSTagging above.) POS tagging with Hidden Markov Model
	/**
	 * Sets the tag type used by subsequent tagging calls.
	 *
	 * @param nType the tagging type to store; no validation is performed
	 */
	public void SetTagType(Final.TAG_TYPE nType) {
		this.m_tagType = nType;
	}

	// (Describes SetTagType above.) Set the tag type
	/**
	 * Loads the context statistics by delegating to {@code m_context.Load}.
	 *
	 * @param sFilename file name handed to {@code m_context.Load}
	 * @return whatever {@code m_context.Load} returns
	 */
	public boolean LoadContext(String sFilename) {
		final boolean loaded = this.m_context.Load(sFilename);
		return loaded;
	}

	/**
	 * Sums a per-word cost over the word slots in
	 * {@code [nStartPos, nStartPos + nLength)}.
	 *
	 * <p>For each slot the cost is
	 * {@code log(contextFreq(tag) + 1) - log(wordFreq(word, tag) + 1)}, where
	 * the tag is the slot's best tag, {@code contextFreq} comes from
	 * {@code m_context.GetFrequency(0, tag)} and {@code wordFreq} from
	 * {@code dict.GetFrequency(word, tag)}. A context-transition term that
	 * once followed is disabled and not reproduced here.
	 *
	 * @param nStartPos first word slot to include
	 * @param nLength   number of consecutive slots to include
	 * @param dict      dictionary supplying the word-with-tag frequency
	 * @return the accumulated cost; 0 when {@code nLength} is not positive
	 */
	protected double ComputePossibility(int nStartPos, int nLength,
			CDictionary dict) {
		final int stop = nStartPos + nLength;
		double total = 0;
		int slot = nStartPos;
		while (slot < stop) {
			final int tag = m_nBestTag[slot];
			// Frequency of this word carrying this tag.
			final int wordTagFreq = dict.GetFrequency(m_sWords[slot], tag);
			// Both counts are incremented by one before taking the log.
			final double tagTerm = Math.log((double) (m_context.GetFrequency(0, tag) + 1));
			final double wordTerm = Math.log((double) (wordTagFreq + 1));
			total += tagTerm - wordTerm;
			slot++;
		}
		return total;
	}

	protected int GetFrom(TagWordResult[] pWordItems, int nIndex,
			CDictionary dictCore, CDictionary dictUnknown) {
		int nCount=0;
		int[] aPOS=new int[Final.MAX_POS_PER_WORD];
		int[] aFreq=new int[Final.MAX_POS_PER_WORD];
		int nFreq=0,j,nRetPos=0,nWordsIndex=0;
		boolean  bSplit=false;//Need to split in Transliteration recognition 
	    int i=1,nPOSCount;
		byte[] sCurWord=new byte[Final.WORD_MAXLENGTH];//Current word
		nWordsIndex=i+nIndex-1;
		
		for(;i<Final.MAX_WORDS_PER_SENTENCE&&pWordItems[nWordsIndex].sWord[0]!=0;i++)
		{
			if(m_tagType==Final.TAG_TYPE.TT_NORMAL||!dictUnknown.IsExist(pWordItems[nWordsIndex].sWord,44))
	        {//store current word
				GFCommon.bytesCopy(m_sWords[i],pWordItems[nWordsIndex].sWord,0,pWordItems[nWordsIndex].sWord.length);
	   		    m_nWordPosition[i+1]=m_nWordPosition[i]+ m_sWords[i].length;
			}
			else
			{
				if(!bSplit)
				{
					GFCommon.bytesCopy(m_sWords[i],pWordItems[nWordsIndex].sWord,0,2);//store current word
					m_sWords[i][2]=0;
					bSplit=true;
				}
				else
				{
					 int nLen= pWordItems[nWordsIndex].sWord.length-2;
					 byte[] bt=GFCommon.bytesCopy(pWordItems[nWordsIndex].sWord,0,2);
					 GFCommon.bytesCopy(m_sWords[i],bt,0,nLen);//store current word
					m_sWords[i][nLen]=0;
					bSplit=false;
				}
	   		    m_nWordPosition[i+1]=m_nWordPosition[i]+ m_sWords[i].length;
			}
			//Record the position of current word
			m_nStartPos=m_nWordPosition[i+1];
			//Move the Start POS to the ending
			if(m_tagType!=Final.TAG_TYPE.TT_NORMAL)
			{
				//Get the POSs from the unknown recognition dictionary
				GFCommon.bytesCopy( sCurWord,m_sWords[i],0,m_sWords[i].length);
				if(m_tagType==Final.TAG_TYPE.TT_TRANS_PERSON&&i>0&&Utility.charType( m_sWords[i-1][0],m_sWords[i-1][1])==Final.CT_CHINESE)
				{
					if(m_sWords[i][0]=='.'&&m_sWords[i][1]==0)
						GFCommon.bytesCopy(sCurWord,"．".getBytes(),0,"．".getBytes().length);
					else if(m_sWords[i][0]=='-'&&m_sWords[i][1]==0)
						GFCommon.bytesCopy(sCurWord,"－".getBytes(),0,"－".getBytes().length);
				}
				dictUnknown.GetHandle(sCurWord, nCount,aPOS,aFreq);
				nPOSCount=nCount+1;
				for(j=0;j<nCount;j++) 
				{//Get the POS set of sCurWord in the unknown dictionary
					m_nTags[i][j]=aPOS[j];
	   				m_dFrequency[i][j]=-Math.log((double)(1+aFreq[j]))+Math.log((double)(m_context.GetFrequency(0,aPOS[j])+nPOSCount));
				}
				//Get the POS set of sCurWord in the core dictionary
				//We ignore the POS in the core dictionary and recognize them as other (0).
				//We add their frequency to get the possibility as POS 0
				if(GFString.getChineseString(m_sWords[i],"gb2312").indexOf("始##始")==0)
				{
12 下一页
💿 文件大小 9037 K
👤 上传用户 heyuyutu
📂 所属分类 Java编程
🏷️ 相关标签

#ICTCLAS #JAVA #分
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -