⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 unknownseg.java

📁 基于中科院的ICTCLAS实现中文分词系统 开发工具是JAVA.经测试,效果很好
💻 JAVA
字号:
package com.gftech.ictclas4j.segment;

import java.util.ArrayList;

import com.gftech.ictclas4j.bean.SegGraph;
import com.gftech.ictclas4j.bean.Span;
import com.gftech.ictclas4j.bean.WordResult;
import com.gftech.ictclas4j.utility.Dictionary;
import com.gftech.ictclas4j.utility.DynamicArray;
import com.gftech.ictclas4j.utility.Utility.TAG_TYPE;

/**
 * Inserts candidate unknown words, as tagged by a {@link Span} role tagger,
 * into a segmentation graph.
 * <p>
 * {@link #configure(String, TAG_TYPE)} loads the dictionary and context files
 * and selects the POS code and placeholder word that are written into every
 * graph element this class inserts. Until it has been called, {@code unDict}
 * is {@code null}, so {@link #isGivenName(String)} cannot be used.
 */
public class UnknownSeg {

	/** Dictionary loaded from {@code fileName + ".dct"} by {@link #configure}. */
	Dictionary unDict;

	/** Role tagger; its context is loaded from {@code fileName + ".ctx"}. */
	Span roleTag;

	/** POS code stored (via {@code setPos}) in every inserted graph element. */
	int pos;

	/** Placeholder word stored (via {@code setWord}) in every inserted graph element. */
	String unknownFlags;

	/** Creates an instance with an empty role tagger and no dictionary. */
	public UnknownSeg() {
		roleTag = new Span();
	}

	/**
	 * Runs role tagging over {@code wrs} and, for each unknown word reported by
	 * the tagger, inserts an element into {@code graphOpt} when the tagger's
	 * value is lower than that of the element already present.
	 *
	 * @param wrs      word results handed to {@code roleTag.posTagging}
	 * @param graphOpt graph whose segments are read and into which new elements
	 *                 are written
	 * @param coreDict core dictionary handed to {@code roleTag.posTagging}
	 * @return always {@code true}
	 */
	public boolean recognition(ArrayList<WordResult> wrs, DynamicArray graphOpt, Dictionary coreDict) {
		int j = 0;
		int startPos = 0;
		int atomStart = 0;
		int atomEnd = 0;

		ArrayList<SegGraph> sgs = graphOpt.getSgs();
		roleTag.posTagging(wrs, coreDict, unDict);

		// j and startPos are deliberately NOT reset between iterations.
		// NOTE(review): this presumes m_nUnknownWords is ordered by ascending
		// position - confirm against Span.
		for (int i = 0; i < roleTag.m_nUnknownIndex; i++) {
			// Accumulate segment lengths until the running total reaches the
			// first bound of unknown word i; the index reached is its first atom.
			while (j < sgs.size() && startPos < roleTag.m_nUnknownWords[i][0]) {
				startPos += sgs.get(j++).getLen();
			}
			atomStart = j;

			// Keep accumulating up to the second bound to find the end atom.
			while (j < sgs.size() && startPos < roleTag.m_nUnknownWords[i][1]) {
				startPos += sgs.get(j++).getLen();
			}
			atomEnd = j;

			if (atomStart < atomEnd) {
				SegGraph sg = graphOpt.getElement(atomStart, atomEnd);
				// Only override an existing element whose value is larger than
				// the tagger's value for this word.
				// NOTE(review): when getElement returns null nothing is
				// inserted - confirm that this is intended.
				if (sg != null && sg.getValue() > roleTag.m_dWordsPossibility[i]) {
					SegGraph sg2 = new SegGraph();
					sg2.setRow(atomStart);
					sg2.setCol(atomEnd);
					sg2.setValue(roleTag.m_dWordsPossibility[i]);
					sg2.setPos(pos);
					sg2.setWord(unknownFlags);
					graphOpt.setElement(sg2);
				}
			}
		}

		return true;
	}

	/**
	 * Loads the dictionary ({@code fileName + ".dct"}) and context
	 * ({@code fileName + ".ctx"}) and selects the POS code and placeholder word
	 * for the given tag type.
	 *
	 * @param fileName path prefix shared by the two files; when {@code null}
	 *                 nothing is loaded
	 * @param type     kind of unknown word this instance handles
	 * @return {@code true} if {@code fileName} was non-null, {@code false}
	 *         otherwise
	 */
	public boolean configure(String fileName, TAG_TYPE type) {
		if (fileName != null) {
			unDict = new Dictionary();
			unDict.load(fileName + ".dct");

			roleTag = new Span();
			roleTag.loadContext(fileName + ".ctx");

			roleTag.setType(type);
			switch (type) {
			case TT_PERSON:
			// Falls through: persons and transliterated persons share the
			// same POS code and placeholder word.
			case TT_TRANS_PERSON:
				pos = -28274;// -'n'*256-'r';
				unknownFlags = "未##人";
				break;
			case TT_PLACE:
				pos = -28275;// -'n'*256-'s';
				unknownFlags = "未##地";
				break;
			default:
				// NOTE(review): unknownFlags is left untouched here, so a
				// value from an earlier configure call would persist.
				pos = 0;
				break;
			}
			return true;
		}
		return false;
	}

	/**
	 * Judges whether {@code sName} is a two-character given name rather than a
	 * surname followed by a single-character given name.
	 * <p>
	 * Two log scores are built from dictionary and context frequencies: one
	 * using tags 2 and 3 ({@code gnp}) and one using tags 1 and 4
	 * ({@code snp}). The name is accepted when {@code snp} is not greater than
	 * or equal to {@code gnp}.
	 * <p>
	 * {@link #configure(String, TAG_TYPE)} must have been called first;
	 * otherwise {@code unDict} is {@code null}.
	 *
	 * @param sName candidate name; must occupy exactly 4 bytes in GBK
	 * @return {@code true} if the two-character given-name score wins;
	 *         {@code false} if {@code sName} is {@code null}, is not 4 GBK
	 *         bytes long, or the single-name score is at least as large
	 */
	public boolean isGivenName(String sName) {
		if (sName == null) {
			return false;
		}
		// The 4-byte test only makes sense in a double-byte encoding. The
		// platform default charset may be UTF-8 (3 bytes per CJK character),
		// which would reject every two-character name, so name the charset.
		if (gbkByteLength(sName) != 4) {
			return false;
		}

		String firstChar = sName.substring(0, 1);
		String secondChar = sName.substring(1);

		// given Name Possibility
		double gnp = 0;
		// The possibility of P(Wi|Ti)
		gnp += Math.log((double) unDict.getFrequency(firstChar, 2) + 1.0);
		gnp -= Math.log(roleTag.context.getFrequency(0, 2) + 1.0);
		gnp += Math.log((double) unDict.getFrequency(secondChar, 3) + 1.0);
		gnp -= Math.log(roleTag.context.getFrequency(0, 3) + 1.0);
		// The possibility of conversion from 2 to 3
		gnp += Math.log(roleTag.context.getContextPossibility(0, 2, 3) + 1.0);
		gnp -= Math.log(roleTag.context.getFrequency(0, 2) + 1.0);

		// singleNamePossibility
		double snp = 0;
		// The possibility of P(Wi|Ti)
		snp += Math.log((double) unDict.getFrequency(firstChar, 1) + 1.0);
		snp -= Math.log(roleTag.context.getFrequency(0, 1) + 1.0);
		snp += Math.log((double) unDict.getFrequency(secondChar, 4) + 1.0);
		snp -= Math.log(roleTag.context.getFrequency(0, 4) + 1.0);
		// The possibility of conversion from 1 to 4
		snp += Math.log(roleTag.context.getContextPossibility(0, 1, 4) + 1.0);
		snp -= Math.log(roleTag.context.getFrequency(0, 1) + 1.0);

		// Legacy note: 张震||m_dict.getFrequency(sFirstChar,1)/m_dict.getFrequency(sFirstChar,2)>=10
		// The possibility being a single given name is more than being a
		// 2-char given name
		if (snp >= gnp) {
			return false;
		}
		return true;
	}

	/**
	 * Returns the number of bytes {@code s} occupies when encoded as GBK.
	 *
	 * @throws java.nio.charset.UnsupportedCharsetException if the runtime does
	 *         not provide the GBK charset
	 */
	private static int gbkByteLength(String s) {
		return s.getBytes(java.nio.charset.Charset.forName("GBK")).length;
	}

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -