📄 cdictionary.java

📁 基于中科院的ICTCLAS实现中文分词系统开发工具是JAVA.经测试,效果很好
💻 JAVA
字号:
package com.gftech.ictclas4j.utility;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;

import com.gftech.common.GFCommon;
import com.gftech.common.GFString;

/**
 * Word dictionary split into {@code Final.CC_NUM} blocks.
 *
 * <p>Each block has an index table ({@link #m_IndexTable}) holding the words
 * read by {@link #Load(String, boolean)} and a modify table
 * ({@link #m_pModifyTable}) that {@link #Save(String)} merges into the output.
 * Most query/update operations are not implemented in this class yet and
 * simply return {@code false} or {@code 0}.
 */
public class CDictionary {
	/** Set to {@code true} to trace every record read by {@code Load} on stdout. */
	private static final boolean DEBUG = false;

	/** Index table of every block; filled by {@link #Load(String, boolean)}. */
	public TagIndexTable[] m_IndexTable;

	/** Modify table of every block; merged with the index table on save. */
	public TagModifyTable[] m_pModifyTable;

	/** Creates a dictionary with one empty index and modify table per block. */
	public CDictionary() {
		m_IndexTable = new TagIndexTable[Final.CC_NUM];
		m_pModifyTable = new TagModifyTable[Final.CC_NUM];
		for (int i = 0; i < Final.CC_NUM; i++) {
			m_IndexTable[i] = new TagIndexTable();
			m_pModifyTable[i] = new TagModifyTable();
		}
	}

	/** Not implemented; always returns {@code false}. */
	public boolean Optimum() {
		return false;
	}

	/** Not implemented; always returns {@code false}. */
	public boolean Merge(CDictionary dict2, int nRatio) {
		return false;
	}

	/** Not implemented; always returns {@code false}. */
	public boolean OutputChars(String sFilename) {
		return false;
	}

	/** Not implemented; always returns {@code false}. */
	public boolean Output(String sFilename) {
		return false;
	}

	/** Not implemented; always returns {@code 0}. */
	public int GetFrequency(byte[] sWord, int nHandle) {
		return 0;
	}

	/**
	 * Not implemented; always returns {@code false}.
	 *
	 * <p>NOTE(review): {@code sPOSRet} is a {@code String}, which is immutable,
	 * so it cannot carry a result back to the caller with this signature.
	 */
	public boolean GetPOSString(int nPOS, String sPOSRet) {
		return false;
	}

	/** Not implemented; always returns {@code 0}. */
	public int GetPOSValue(byte[] sPOS) {
		return 0;
	}

	/** Not implemented; always returns {@code false}. */
	public boolean GetMaxMatch(byte[] sWord, byte[] sWordRet, int npHandleRet) {
		return false;
	}

	/** Not implemented; always returns {@code false}. */
	public boolean MergePOS(int nHandle) {
		return false;
	}

	/** Not implemented; always returns {@code false}. */
	public boolean GetHandle(byte[] sWord, int pnCount, int[] pnHandle,
			int[] pnFrequency) {
		return false;
	}

	/** Not implemented; always returns {@code false}. */
	public boolean IsExist(byte[] sWord, int nHandle) {
		return false;
	}

	/** Not implemented; always returns {@code false}. */
	public boolean AddItem(byte[] sWord, int nHandle, int nFrequency) {
		return false;
	}

	/** Not implemented; always returns {@code false}. */
	public boolean DelItem(byte[] sWord, int nHandle) {
		return false;
	}

	/**
	 * Writes the dictionary to {@code sFilename}.
	 *
	 * <p>For every block the record count is written first, followed by one
	 * record per word: frequency, word length, handle (three ints) and then
	 * {@code wordLen} word bytes. When a modify table is present its entries
	 * are merged with the index table and index entries whose frequency is
	 * {@code -1} (deleted) are skipped.
	 *
	 * <p>NOTE(review): ints are written with {@code DataOutputStream.writeInt}
	 * (high byte first) while {@link #Load(String, boolean)} decodes them with
	 * {@code GFCommon.bytes2int(..., false)}; confirm both agree on byte order,
	 * otherwise a saved file cannot be loaded again.
	 *
	 * @param sFilename path of the file to create or overwrite
	 * @return {@code true} if the whole dictionary was written and the file
	 *         closed without error, {@code false} otherwise
	 */
	public boolean Save(String sFilename) {
		File file = new File(sFilename);

		// A file that does not exist yet is fine (it will be created); only
		// an existing file that cannot be written is rejected up front.
		if (file.exists() && !file.canWrite())
			return false;

		DataOutputStream out = null;
		boolean ok = false;
		try {
			out = new DataOutputStream(new FileOutputStream(file));
			for (int i = 0; i < Final.CC_NUM; i++) {
				if (m_pModifyTable != null)
					saveMergedBlock(out, i);
				else
					saveIndexBlock(out, i);
			}
			ok = true;
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (out != null) {
				try {
					out.close();
				} catch (IOException e) {
					// A failed close may mean buffered data never reached the file.
					e.printStackTrace();
					ok = false;
				}
			}
		}
		return ok;
	}

	/** Writes block {@code block} straight from the index table, with no merge. */
	private void saveIndexBlock(DataOutputStream out, int block)
			throws IOException {
		int count = m_IndexTable[block].getCount();
		out.writeInt(count);
		TagWordItem[] items = m_IndexTable[block].getWordItemHead();
		for (int j = 0; j < count; j++)
			writeItem(out, items[j].getFrequency(), items[j].getWordLen(),
					items[j].getHandle(), items[j].getWord());
	}

	/**
	 * Writes block {@code block} as the merge of its index table and its
	 * modify table, ordered by (word, handle), skipping deleted index entries.
	 */
	private void saveMergedBlock(DataOutputStream out, int block)
			throws IOException {
		int indexCount = m_IndexTable[block].getCount();
		out.writeInt(indexCount + m_pModifyTable[block].getNCount()
				- m_pModifyTable[block].getNDelete());

		TagWordItem[] items = m_IndexTable[block].getWordItemHead();
		TagWordChain[] chain = m_pModifyTable[block].getWordItemHead();
		int chainCount = liveLength(chain);
		int j = 0; // cursor in the index table
		int k = 0; // cursor in the modify table (independent of j)

		while (k < chainCount && j < indexCount) {
			int order = compareWords(chain[k].getData().getWord(),
					items[j].getWord());
			boolean modifiedFirst = order < 0
					|| (order == 0 && chain[k].getData().getHandle() < items[j]
							.getHandle());
			if (modifiedFirst) {
				// The modified entry sorts first: output it.
				writeItem(out, chain[k].getData().getFrequency(), chain[k]
						.getData().getWordLen(), chain[k].getData()
						.getHandle(), chain[k].getData().getWord());
				k++;
			} else if (items[j].getFrequency() == -1) {
				// The index entry has been removed, so skip it.
				j++;
			} else {
				// The index entry sorts first (or ties): output it. Every
				// iteration advances one cursor, so the loop always ends.
				writeItem(out, items[j].getFrequency(), items[j].getWordLen(),
						items[j].getHandle(), items[j].getWord());
				j++;
			}
		}

		// Whatever is left in the index table, minus deleted entries.
		for (; j < indexCount; j++) {
			if (items[j].getFrequency() != -1)
				writeItem(out, items[j].getFrequency(), items[j].getWordLen(),
						items[j].getHandle(), items[j].getWord());
		}
		// Whatever is left in the modify table.
		for (; k < chainCount; k++)
			writeItem(out, chain[k].getData().getFrequency(), chain[k]
					.getData().getWordLen(), chain[k].getData().getHandle(),
					chain[k].getData().getWord());
	}

	/** Number of leading non-null entries of {@code chain}; 0 for {@code null}. */
	private static int liveLength(TagWordChain[] chain) {
		int n = 0;
		if (chain != null) {
			while (n < chain.length && chain[n] != null)
				n++;
		}
		return n;
	}

	/**
	 * Compares two words byte by byte as unsigned values, treating a 0 byte,
	 * the end of the array, or a {@code null} array as the end of the word.
	 *
	 * @return a negative value, 0, or a positive value if {@code a} sorts
	 *         before, equal to, or after {@code b}
	 */
	private static int compareWords(byte[] a, byte[] b) {
		for (int i = 0;; i++) {
			int ca = (a != null && i < a.length) ? (a[i] & 0xFF) : 0;
			int cb = (b != null && i < b.length) ? (b[i] & 0xFF) : 0;
			if (ca != cb)
				return ca < cb ? -1 : 1;
			if (ca == 0)
				return 0;
		}
	}

	/**
	 * Writes one record: frequency, word length, handle, then exactly
	 * {@code wordLen} bytes of {@code word}. Only {@code wordLen} bytes are
	 * written because {@code Load} stores words with one extra trailing 0 byte.
	 *
	 * @throws IOException if writing fails or {@code word} holds fewer than
	 *         {@code wordLen} bytes
	 */
	private static void writeItem(DataOutputStream out, int frequency,
			int wordLen, int handle, byte[] word) throws IOException {
		if (wordLen > 0 && (word == null || word.length < wordLen))
			throw new IOException("word shorter than its recorded length "
					+ wordLen);
		out.writeInt(frequency);
		out.writeInt(wordLen);
		out.writeInt(handle);
		if (wordLen > 0)// String length is more than 0
			out.write(word, 0, wordLen);
	}

	/**
	 * Reads the dictionary from {@code sFilename} into {@link #m_IndexTable}.
	 *
	 * <p>Calls {@link #DelModified()} first. For every block the record count
	 * is read, then per record the frequency, word length and handle followed
	 * by the word bytes. Each word is stored with one extra trailing 0 byte.
	 * Blocks whose count is not positive keep whatever word items they had.
	 *
	 * @param sFilename path of the dictionary file
	 * @param bReset when {@code true} every frequency is stored as 0 instead
	 *        of the value read from the file
	 * @return {@code true} if the whole file was read without error,
	 *         {@code false} if it is unreadable or an I/O error occurred
	 */
	public boolean Load(String sFilename, boolean bReset) {
		File file = new File(sFilename);

		if (!file.canRead())
			return false;// fail while opening the file

		DataInputStream in = null;
		boolean ok = false;
		try {
			DelModified();

			in = new DataInputStream(new FileInputStream(file));
			for (int i = 0; i < Final.CC_NUM; i++) {
				if (DEBUG)
					System.out.println("块"+i);
				int count = readInt(in);
				if (DEBUG)
					System.out.println("  count:"+count);
				m_IndexTable[i].setCount(count);
				if (count <= 0)
					continue;

				TagWordItem[] items = new TagWordItem[count];
				for (int m = 0; m < count; m++)
					items[m] = new TagWordItem();
				m_IndexTable[i].setWordItemHead(items);

				for (int j = 0; j < count; j++) {
					int frequency = readInt(in);
					int wordLen = readInt(in);
					int handle = readInt(in);
					if (DEBUG)
						System.out.println("\n	wordLen:"+wordLen+"\n	frequency:"+frequency+
								"\n	handle:"+handle);
					if (wordLen > 0)// String length is more than 0
					{
						byte[] word = Utility.readBytes(in, wordLen);
						// One spare byte is left as a trailing 0 terminator.
						byte[] word2 = new byte[word.length + 1];
						GFCommon.bytesCopy(word2, word, 0, word.length);
						items[j].setWord(word2);
						if (DEBUG) {
							System.out.println("	word:"+GFString.bytes2hexstr(word));
							System.out.println("	word:"+GFString.getChineseString(word,"gb2312"));
						}
					}

					items[j].setFrequency(bReset ? 0 : frequency);
					items[j].setWordLen(wordLen);
					items[j].setHandle(handle);
				}
			}
			ok = true;
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (in != null) {
				try {
					in.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
		return ok;
	}

	/** Reads four bytes and decodes them exactly as the original loader did. */
	private static int readInt(DataInputStream in) throws IOException {
		return GFCommon.bytes2int(Utility.readBytes(in, 4), false);
	}

	/** Not implemented; always returns {@code 0}. */
	public int GetWordType(String sWord) {
		return 0;
	}

	/**
	 * Same as {@link #PreProcessing(byte[], int[], byte[], boolean)} but
	 * without a way to receive the computed block id: {@code nId} is passed
	 * by value, so the id is discarded. Kept for existing callers.
	 */
	public boolean PreProcessing(byte[] sWord, int nId, byte[] sWordRet,boolean bAdd)
	{
		return PreProcessing(sWord, new int[] { nId }, sWordRet, bAdd);
	}

	/**
	 * Trims spaces from {@code sWord} in place and splits it into a block id
	 * and the part of the word stored inside that block.
	 *
	 * <p>The word ends at the first 0 byte or at the end of the array. The
	 * character type is taken from the first two bytes before trimming. For a
	 * Chinese word the id is {@code Utility.CC_ID} of the first two bytes and
	 * {@code sWordRet} receives the bytes after them; for a delimiter the id
	 * is 3755 and {@code sWordRet} receives the whole word. {@code sWordRet}
	 * is 0-terminated when it has room. Other types are rejected. The
	 * original source also carried disabled branches for numbers (3756),
	 * letters (3757), single-byte characters (3758) and index marks (3759).
	 *
	 * @param sWord word bytes; leading/trailing spaces are removed in place
	 * @param nIdRet receives the block id in element 0; may be {@code null}
	 * @param sWordRet buffer receiving the stored part of the word
	 * @param bAdd unused by the enabled code paths
	 * @return {@code true} if the word is Chinese or a delimiter,
	 *         {@code false} if it is null, empty, all spaces or another type
	 * @throws IndexOutOfBoundsException if {@code sWordRet} is too small
	 */
	public boolean PreProcessing(byte[] sWord, int[] nIdRet, byte[] sWordRet,boolean bAdd)
	{
		if (sWord == null)
			return false;
		int nLen = cLength(sWord);
		if (nLen == 0)
			return false;

		// A one-byte word has no second byte; use 0, the terminator value.
		byte second = nLen > 1 ? sWord[1] : 0;
		int nType = Utility.charType(sWord[0], second);

		// Position for the delimiters
		int nEnd = nLen - 1, nBegin = 0;
		while (nEnd >= 0 && (char) sWord[nEnd] == ' ')
			nEnd -= 1;
		while (nBegin <= nEnd && (char) sWord[nBegin] == ' ')
			nBegin += 1;
		if (nBegin > nEnd)
			return false;

		int len = nEnd - nBegin + 1;
		if (len != nLen) {
			// Shift the trimmed word to the front and terminate it.
			System.arraycopy(sWord, nBegin, sWord, 0, len);
			sWord[len] = 0;
		}

		if (nType == Final.CT_CHINESE) {// Chinese word
			if (len < 2)
				return false;
			// Get the inner code of the first Chinese Char
			storeId(nIdRet, Utility.CC_ID(sWord[0], sWord[1]));
			// store the word,not store the first Chinese Char
			copyWord(sWord, 2, len - 2, sWordRet);
			return true;
		}
		if (nType == Final.CT_DELIMITER) {// Delimiter
			storeId(nIdRet, 3755);
			copyWord(sWord, 0, len, sWordRet);
			return true;
		}
		return false;// other invalid
	}

	/** Length of {@code bytes} up to the first 0 byte or the end of the array. */
	private static int cLength(byte[] bytes) {
		int n = 0;
		while (n < bytes.length && bytes[n] != 0)
			n++;
		return n;
	}

	/** Stores {@code id} in {@code idRet[0]} when the caller supplied a slot. */
	private static void storeId(int[] idRet, int id) {
		if (idRet != null && idRet.length > 0)
			idRet[0] = id;
	}

	/**
	 * Copies {@code len} bytes of {@code src} starting at {@code from} to the
	 * start of {@code dest} and appends a 0 byte when {@code dest} has room.
	 */
	private static void copyWord(byte[] src, int from, int len, byte[] dest) {
		System.arraycopy(src, from, dest, 0, len);
		if (dest.length > len)
			dest[len] = 0;
	}

	// The data for modify
	/** Not implemented; always returns {@code false}. */
	protected boolean DelModified() {
		return false;
	}

	/** Not implemented; always returns {@code false}. */
	protected boolean FindInOriginalTable(int nInnerCode, String sWord,
			int nHandle, int[] nPosRet) {
		return false;
	}

	/** Not implemented; always returns {@code false}. */
	protected boolean FindInModifyTable(int nInnerCode, String sWord,
			int nHandle, TagWordChain[] pFindRet) {
		return false;
	}

}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -