cresult.java

来自「基于中科院的ICTCLAS实现中文分词系统开发工具是JAVA.经测试,效果很好」· Java 代码 · 共 778 行 · 第 1/2 页
JAVA
778 行
package com.gftech.ictclas4j.result;

import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;

import com.gftech.common.GFCommon;
import com.gftech.common.GFString;
import com.gftech.ictclas4j.segment.CSegment;
import com.gftech.ictclas4j.tag.CSpan;
import com.gftech.ictclas4j.unknown.CUnknowWord;
import com.gftech.ictclas4j.utility.CDictionary;
import com.gftech.ictclas4j.utility.Final;
import com.gftech.ictclas4j.utility.TagWordResult;
import com.gftech.ictclas4j.utility.Utility;

public class CResult {
	// NOTE(review): none of the private collaborators below has an initializer
	// at its declaration, so each must be assigned before it is dereferenced.

	// Segmenter; Processing() calls BiSegment/BiOptimumSegment on it and reads
	// its m_pWordSeg and m_nSegmentCount.
	private CSegment m_Seg;

	// Core dictionary (loaded from data\coreDict.dct by the constructor)
	private CDictionary m_dictCore;

	// Bigram dictionary (loaded from data\BigramDict.dct by the constructor)
	private CDictionary m_dictBigram;

	private CSpan m_POSTagger;// POS tagger

	// Person name recognizer (configured with data\nr)
	private CUnknowWord m_uPerson;

	// Transliterated person name recognizer (configured with data\tr)
	private CUnknowWord m_uTransPerson;

	// Place name recognizer (configured with data\ns)
	private CUnknowWord m_uPlace;

	// 0:Only Segment;1: First Tag; 2:Second Type
	// Output() also uses this value as the index at which the POS bytes are
	// cut off.
	public int m_nOperateType;

	// 0:PKU criterion;1:973 criterion; 2: XML criterion
	public int m_nOutputFormat;

	// Smoothing parameter handed to the bigram segmentation calls
	public double m_dSmoothingPara;

	// Number of results; Processing() copies it from m_Seg.m_nSegmentCount
	public int m_nResultCount;

	// The buffer which stores the segmentation and POS results, one row per
	// candidate result (presumably ordered by possibility after Sort() -
	// confirm).
	public TagWordResult[][] m_pResult;

	// Score of each row of m_pResult; Processing() prints it as "P=Exp(...)",
	// so it is presumably a log-probability - confirm.
	public double[] m_dResultPossibility = new double[Final.MAX_SEGMENT_NUM];

	/**
	 * Creates the analyser: instantiates the segmenter, dictionaries, POS
	 * tagger and unknown-word recognizers, allocates the result buffer, loads
	 * the data files from the relative {@code data\} directory and selects the
	 * default operate type (2) and output format (0, PKU).
	 */
	public CResult() {
		// Java object fields start out as null, so every collaborator has to be
		// created before the Load/Configure calls below can be made on it.
		// NOTE(review): assumes each of these types has a no-arg constructor -
		// confirm against their declarations.
		m_Seg = new CSegment();
		m_dictCore = new CDictionary();
		m_dictBigram = new CDictionary();
		m_POSTagger = new CSpan();
		m_uPerson = new CUnknowWord();
		m_uTransPerson = new CUnknowWord();
		m_uPlace = new CUnknowWord();

		// Allocate one row of word slots per candidate result.
		// NOTE(review): the slots themselves stay null here; presumably Sort()
		// fills them before Output() reads them - confirm.
		m_pResult = new TagWordResult[Final.MAX_SEGMENT_NUM][];
		for (int i = 0; i < Final.MAX_SEGMENT_NUM; i++) {
			m_pResult[i] = new TagWordResult[Final.MAX_WORDS];
		}

		m_dictCore.Load("data\\coreDict.dct", false);
		m_POSTagger.LoadContext("data\\lexical.ctx");
		/*
		 * Alternative data set:
		 * m_dictCore.Load("data\\Dict.dct");
		 * m_POSTagger.LoadContext("data\\trainTest.ctx");
		 */
		/*
		 * A disabled dictionary-tuning sequence used to follow here: several
		 * m_dictCore.AddItem(word, tag*256, frequencyDelta) calls (the word
		 * literals were lost to an encoding problem), then
		 * m_dictCore.Optimum(); m_dictCore.Save("data\\coreDictOptimum.dct");
		 */

		m_POSTagger.SetTagType(Final.TAG_TYPE.TT_NORMAL);

		// Set the person recognition configure
		m_uPerson.Configure("data\\nr", Final.TAG_TYPE.TT_PERSON);
		// Set the place recognition configure
		m_uPlace.Configure("data\\ns", Final.TAG_TYPE.TT_PLACE);
		// Set the transliteration person recognition configure
		m_uTransPerson.Configure("data\\tr", Final.TAG_TYPE.TT_TRANS_PERSON);

		m_nOperateType = 2;// 0:Only Segment;1: First Tag; 2:Second Type
		m_nOutputFormat = 0;// 0:PKU criterion;1:973 criterion; 2: XML criterion

		m_dSmoothingPara = 0.1;// Smoothing parameter
		m_dictBigram.Load("data\\BigramDict.dct", false);
	}

	/**
	 * Segments and POS-tags one sentence, printing every intermediate result
	 * to standard output.
	 * <p>
	 * Steps: bigram segmentation, POS tagging plus person / transliterated
	 * person / place recognition on every candidate, a second (optimum) bigram
	 * segmentation, POS tagging of the new candidates, and finally
	 * {@code Sort()}, after which the rows of {@code m_pResult} are printed
	 * together with {@code m_dResultPossibility}.
	 * 
	 * @param sSentence
	 *            sentence bytes handed unchanged to the segmenter
	 * @param nCount
	 *            result count handed unchanged to the segmenter
	 * @return always {@code true}
	 */
	public boolean Processing(byte[] sSentence, int nCount) {
		// Scratch buffer that Output() renders each candidate into.
		byte[] sSegment = new byte[Final.MAX_SENTENCE_LEN * 2];

		// Unigram segment
		// m_Seg.Segment(sSentence,m_dictCore,nCount);
		// Bigram segment
		m_Seg.BiSegment(sSentence, m_dSmoothingPara, m_dictCore, m_dictBigram,
				nCount);

		// Record the number of result
		m_nResultCount = m_Seg.m_nSegmentCount;

		for (int nIndex = 0; nIndex < m_Seg.m_nSegmentCount; nIndex++) {
			m_POSTagger.POSTagging(m_Seg.m_pWordSeg[nIndex], m_dictCore,
					m_dictCore);
			Output(m_Seg.m_pWordSeg[nIndex], sSegment, false);
			// Parenthesise the 1-based number: without the parentheses the
			// string concatenation printed "01", "11", ... Decode the buffer
			// as well, a bare byte[] only prints its identity.
			System.out.println("POS Tag " + (nIndex + 1) + " "
					+ new String(sSegment));

			// Unknown-word recognition on this candidate
			m_uPerson.Recognition(m_Seg.m_pWordSeg[nIndex],
					m_Seg.m_graphOptimum, m_Seg.m_graphSeg, m_dictCore);
			m_uTransPerson.Recognition(m_Seg.m_pWordSeg[nIndex],
					m_Seg.m_graphOptimum, m_Seg.m_graphSeg, m_dictCore);
			m_uPlace.Recognition(m_Seg.m_pWordSeg[nIndex],
					m_Seg.m_graphOptimum, m_Seg.m_graphSeg, m_dictCore);
		}

		System.out.println("After person recognition.");

		// Unigram
		// m_Seg.OptimumSegmet(nCount);
		// Bigram
		m_Seg.BiOptimumSegment(nCount, m_dSmoothingPara, m_dictBigram,
				m_dictCore);

		for (int nIndex = 0; nIndex < m_Seg.m_nSegmentCount; nIndex++) {
			m_POSTagger.POSTagging(m_Seg.m_pWordSeg[nIndex], m_dictCore,
					m_dictCore);

			Output(m_Seg.m_pWordSeg[nIndex], sSegment, false);
			System.out.println("POS Tag " + (nIndex + 1) + " "
					+ new String(sSegment));
		}
		System.out.println("After Sorting.");

		Sort();// Sort the ending

		for (int nIndex = 0; nIndex < m_Seg.m_nSegmentCount; nIndex++) {
			Output(m_pResult[nIndex], sSegment, false);
			System.out.println("POS Tag " + (nIndex + 1) + "(P=Exp("
					+ m_dResultPossibility[nIndex] + ")):"
					+ new String(sSegment));
		}

		return true;
	}

	/**
	 * Splits a paragraph into sentences, runs {@link #Processing} on each one
	 * and concatenates the rendered results into {@code sResult}.
	 * <p>
	 * The paragraph is read up to its first 0 byte (or its full length when it
	 * contains none). A byte with the high bit set is combined with the
	 * following byte into one double-byte character. Sentence buffers start
	 * with {@code Final.SENTENCE_BEGIN}; a sentence is flushed whenever a
	 * sentence, sub-sentence or link separator is met, and once more at the
	 * end of the paragraph. Link separators are not part of any sentence but
	 * are copied through to the result.
	 * <p>
	 * NOTE(review): the flag and separator strings are converted with the
	 * platform default charset, as before; the double-byte test presumably
	 * expects a GB-family encoding - confirm.
	 * 
	 * @param sParagraph
	 *            paragraph bytes, optionally 0-terminated
	 * @param sResult
	 *            caller-supplied output buffer; receives the concatenated
	 *            results followed by a 0 byte when there is room. Output that
	 *            does not fit is dropped instead of overrunning the buffer.
	 * @return always {@code true}
	 */
	public boolean ParagraphProcessing(byte[] sParagraph, byte[] sResult) {
		final byte[] beginFlag = Final.SENTENCE_BEGIN.getBytes();
		final byte[] endFlag = Final.SENTENCE_END.getBytes();
		final byte[] linkSeps = Final.SEPERATOR_LINK.getBytes();
		final byte[] cSentenceSeps = Final.SEPERATOR_C_SENTENCE.getBytes();
		final byte[] eSentenceSeps = Final.SEPERATOR_E_SENTENCE.getBytes();
		final byte[] cSubSeps = Final.SEPERATOR_C_SUB_SENTENCE.getBytes();
		final byte[] eSubSeps = Final.SEPERATOR_E_SUB_SENTENCE.getBytes();

		final int nParagraphLen = lengthToNul(sParagraph);
		// Room for both flags, the whole paragraph and a terminator.
		final int nLen = sParagraph.length + 13 + beginFlag.length
				+ endFlag.length;
		byte[] sSentence = new byte[nLen];
		byte[] sSentenceResult = new byte[nLen * 3];
		// Current character: one or two bytes, always followed by a 0 byte so
		// the Utility search helpers see a terminated string.
		byte[] sChar = new byte[3];
		boolean bFirstIgnore = true;

		if (sResult.length > 0)
			sResult[0] = 0;// Init the result

		// The sentence and the result each get their own write position; the
		// single shared index used previously corrupted both buffers.
		int resultLen = 0;
		// Add a sentence begin flag
		int sentenceLen = appendBytes(sSentence, 0, beginFlag,
				beginFlag.length);

		int nPosIndex = 0;
		while (nPosIndex < nParagraphLen) {
			// Get a char
			sChar[0] = sParagraph[nPosIndex++];
			sChar[1] = 0;
			int charLen = 1;
			// Double byte char; the bounds test keeps a truncated trailing
			// lead byte from reading past the end of the paragraph.
			if (sChar[0] < 0 && nPosIndex < nParagraphLen) {
				sChar[1] = sParagraph[nPosIndex++];
				charLen = 2;
			}

			boolean isLink = Utility.strstr(linkSeps, sChar) != -1;
			boolean isSeparator = Utility.CC_Find(cSentenceSeps, sChar)
					|| Utility.CC_Find(cSubSeps, sChar)
					|| Utility.strstr(eSentenceSeps, sChar) != -1
					|| Utility.strstr(eSubSeps, sChar) != -1 || isLink;

			if (!isSeparator) {
				// Other chars are stored in the sentence buffer
				sentenceLen = appendBytes(sSentence, sentenceLen, sChar,
						charLen);
				continue;
			}

			// Reached the end of a sentence. The separator itself belongs to
			// the sentence unless it is a link separator.
			if (!isLink) {
				sentenceLen = appendBytes(sSentence, sentenceLen, sChar,
						charLen);
			}

			if (sentenceLen > 0
					&& !holdsOnly(sSentence, sentenceLen, beginFlag)) {
				if (Utility.strstr(cSubSeps, sChar) == -1
						&& Utility.strstr(eSubSeps, sChar) == -1) {
					// Add sentence ending flag
					sentenceLen = appendBytes(sSentence, sentenceLen, endFlag,
							endFlag.length);
				}
				resultLen = emitSentence(sSentence, sSentenceResult,
						bFirstIgnore, sResult, resultLen);
			}

			if (isLink) {
				// Link the result with the SEPERATOR_LINK, then start a new
				// sentence with the begin flag.
				resultLen = appendBytes(sResult, resultLen, sChar, charLen);
				sentenceLen = restartSentence(sSentence, sentenceLen,
						beginFlag, beginFlag.length);
			} else if (Utility.strstr(cSentenceSeps, sChar) != -1
					|| Utility.strstr(eSentenceSeps, sChar) != -1) {
				// Full sentence separator: start again with the begin flag
				sentenceLen = restartSentence(sSentence, sentenceLen,
						beginFlag, beginFlag.length);
			} else {
				// Sub-sentence separator: reset the current sentence and put
				// the previous end at its begin position
				sentenceLen = restartSentence(sSentence, sentenceLen, sChar,
						charLen);
			}
		}

		// Flush whatever is left after the last separator.
		if (sentenceLen > 0 && !holdsOnly(sSentence, sentenceLen, beginFlag)) {
			// Add sentence ending flag
			sentenceLen = appendBytes(sSentence, sentenceLen, endFlag,
					endFlag.length);
			// Print the text, not the array identities.
			System.out.println(new String(sSentence, 0, sentenceLen));
			resultLen = emitSentence(sSentence, sSentenceResult, bFirstIgnore,
					sResult, resultLen);
			System.out.println(new String(sSentenceResult, 0,
					lengthToNul(sSentenceResult)));
		}
		return true;
	}

	// Runs Processing/Output on a finished sentence and appends the rendered
	// text to sResult; returns the new result length.
	private int emitSentence(byte[] sSentence, byte[] sSentenceResult,
			boolean bFirstIgnore, byte[] sResult, int resultLen) {
		Processing(sSentence, 1);
		// Wipe the scratch buffer so a shorter rendering cannot be followed by
		// the tail of a previous, longer one.
		java.util.Arrays.fill(sSentenceResult, (byte) 0);
		Output(m_pResult[0], sSentenceResult, bFirstIgnore);
		return appendBytes(sResult, resultLen, sSentenceResult,
				lengthToNul(sSentenceResult));
	}

	// Clears the current sentence and seeds the buffer with the first seedLen
	// bytes of seed; returns the new sentence length.
	private static int restartSentence(byte[] sSentence, int oldLen,
			byte[] seed, int seedLen) {
		java.util.Arrays.fill(sSentence, 0,
				Math.min(oldLen + 1, sSentence.length), (byte) 0);
		return appendBytes(sSentence, 0, seed, seedLen);
	}

	// Copies as many of the first srcLen bytes of src as fit to dest at
	// destLen, 0-terminates when there is room, and returns the new length.
	private static int appendBytes(byte[] dest, int destLen, byte[] src,
			int srcLen) {
		int room = Math.max(dest.length - destLen, 0);
		int n = Math.min(srcLen, room);
		System.arraycopy(src, 0, dest, destLen, n);
		int newLen = destLen + n;
		if (newLen < dest.length)
			dest[newLen] = 0;
		return newLen;
	}

	// Returns the index of the first 0 byte, or the array length if none.
	private static int lengthToNul(byte[] bytes) {
		int n = 0;
		while (n < bytes.length && bytes[n] != 0)
			n++;
		return n;
	}

	// True when the first len bytes of buf are exactly the bytes of flag.
	private static boolean holdsOnly(byte[] buf, int len, byte[] flag) {
		if (len != flag.length)
			return false;
		for (int i = 0; i < len; i++) {
			if (buf[i] != flag[i])
				return false;
		}
		return true;
	}

	/**
	 * Runs {@link #ParagraphProcessing} over a file in chunks of up to 4 KB
	 * and writes one output line per chunk to the result file. When
	 * {@code m_nOutputFormat} is 2 the output is wrapped in an XML declaration
	 * and a {@code <result>} element.
	 * <p>
	 * NOTE(review): chunks are cut at a fixed byte count, so a double-byte
	 * character can be split across two chunks; this matches the previous
	 * behaviour.
	 * 
	 * @param sSourceFile
	 *            path of the file to read
	 * @param sResultFile
	 *            path of the file to write; created if it does not exist
	 * @return {@code false} if the source is not readable, the result exists
	 *         but is not writable, or an I/O error occurs; {@code true}
	 *         otherwise
	 */
	public boolean FileProcessing(String sSourceFile, String sResultFile) {
		final int chunkSize = 4 * 1024;

		File fpSource = new File(sSourceFile);
		// Cannot open the source file to read
		if (!fpSource.canRead())
			return false;

		File fpResult = new File(sResultFile);
		// canWrite() is also false for a file that does not exist yet, so only
		// reject a result file that exists and is not writable.
		if (fpResult.exists() && !fpResult.canWrite())
			return false;

		DataInputStream in = null;
		PrintWriter out = null;
		try {
			in = new DataInputStream(new FileInputStream(fpSource));
			out = new PrintWriter(new FileOutputStream(fpResult));
			if (m_nOutputFormat == 2)// XML format
				out
						.println("<?xml version=\"1.0\" encoding=\"gb2312\"?><result>");

			byte[] chunk = new byte[chunkSize];
			byte[] sParagraphResult = new byte[8 * 1024];
			int nLineIndex = 1;
			int filled;
			while ((filled = readChunk(in, chunk)) > 0) {
				// Hand over exactly the bytes read, so nothing left from a
				// previous, longer chunk is processed again.
				byte[] sParagraph = new byte[filled];
				System.arraycopy(chunk, 0, sParagraph, 0, filled);

				System.out.println(nLineIndex++);
				java.util.Arrays.fill(sParagraphResult, (byte) 0);
				ParagraphProcessing(sParagraph, sParagraphResult);

				// Drop the unused, zero-filled tail of the result buffer.
				int used = sParagraphResult.length;
				while (used > 0 && sParagraphResult[used - 1] == 0)
					used--;
				out.println(new String(sParagraphResult, 0, used));
			}
			if (m_nOutputFormat == 2)// XML format
				out.println("</result>");
			return true;
		} catch (FileNotFoundException e) {
			e.printStackTrace();
			return false;
		} catch (IOException e) {
			e.printStackTrace();
			return false;
		} finally {
			// Release both streams on every path.
			if (out != null)
				out.close();
			if (in != null) {
				try {
					in.close();
				} catch (IOException ignored) {
					// Nothing useful can be done if closing the input fails.
				}
			}
		}
	}

	// Fills buf from in until it is full or the stream ends; returns the
	// number of bytes read (0 at end of stream).
	private static int readChunk(DataInputStream in, byte[] buf)
			throws IOException {
		int filled = 0;
		while (filled < buf.length) {
			int n = in.read(buf, filled, buf.length - filled);
			if (n < 0)
				break;
			filled += n;
		}
		return filled;
	}

	public boolean Output(TagWordResult[] pItem, byte[] sResult,
			boolean bFirstWordIgnore) {
		int i = 0;
		byte[] sTempBuffer = new byte[Final.WORD_MAXLENGTH];
		byte[] sPOS = new byte[3];
		sPOS[2] = 0;
		sResult[0] = 0;
		int index = 0;
		String result = "";

		if (bFirstWordIgnore)// Ignore first valid
			i = 1;

		// Not sentence ending flag
		while (pItem[i].sWord[0] != 0
				&& pItem[i].nHandle != Final.CT_SENTENCE_END) {
			// Get the POS string
			if (m_nOutputFormat != 0)// Not PKU format
				PKU2973POS(pItem[i].nHandle, sPOS);
			else// PKU format
			{
				sPOS[0] = (byte) (pItem[i].nHandle / 256);
				sPOS[1] = (byte) (pItem[i].nHandle % 256);
			}
			sPOS[m_nOperateType] = 0;// Set the sPOS with operate type

			if (m_nOutputFormat == 0)// PKU format
			{
				result += GFString.getChineseString(pItem[i].sWord, "gb2312");

				if (sPOS[0] != 0)// need POS
				{
					result += GFString.getChineseString(sPOS, "gb2312");
				}
				result += "  ";
			} else if (m_nOutputFormat == 1)// 973 format
			{
				result += GFString.getChineseString(pItem[i].sWord, "gb2312")
						+ "\\";
				if (sPOS[0] != 0)// need POS
				{
					result += "[" + GFString.getChineseString(sPOS, "gb2312")
							+ "]";
				}
			} else if (m_nOutputFormat == 2)// XML format
			{
				if (sPOS[0] != 0)// POS
				{
					result += "<any type=\""
							+ GFString.getChineseString(sPOS, "gb2312") + "\">";

				}
				result += "<src>"
cresult.java - 源码说明

本页面展示了「基于中科院的ICTCLAS实现中文分词系统开发工具是JAVA.经测试,效果很好」中的 cresult.java 源码文件，采用 Java 编程语言编写，共 778 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与ICTCLAS相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码：Ctrl + C
搜索代码：Ctrl + F
全屏模式：F11
增大字号：Ctrl + =
减小字号：Ctrl + -
显示快捷键：?