
📄 Span.java

📁 A Chinese word segmentation system written in Java
💻 JAVA
📖 Page 1 of 2
package org.ictclas4j.bean;

import java.util.ArrayList;

import org.ictclas4j.utility.Utility;
import org.ictclas4j.utility.Utility.TAG_TYPE;


public class Span {

	public ContextStat context;// tag-transition (context) statistics

	TAG_TYPE tagType;// tagging mode: normal POS, person, place or transliterated-person recognition

	private int[][] m_nTags;// candidate tags per word; each row ends with -1

	int[][] m_nBestPrev;// index of the best previous tag for each candidate

	int m_nStartPos;// byte position where the next span starts

	int[] m_nBestTag;// best tag chosen for each word after backtracking

	int m_nCurLength;// number of slots currently used in the lattice

	String[] m_sWords;// words of the current span (null marks the virtual ending)

	double[][] m_dFrequency;// accumulated -log probability of each candidate tag

	public int[][] m_nUnknownWords;// [start, end] positions of recognized unknown words

	public int m_nUnknownIndex;// number of unknown words recorded so far

	public int[] m_nWordPosition;// byte position where each word starts

	public double[] m_dWordsPossibility;// possibility score of each unknown word

	public Span() {
		// Default to normal POS tagging; the original checked tagType before it
		// was assigned. reset(false) re-initializes the begin tag once the real
		// tag type has been set via setType().
		tagType = Utility.TAG_TYPE.TT_NORMAL;

		m_nTags = new int[Utility.MAX_WORDS_PER_SENTENCE][Utility.MAX_POS_PER_WORD];
		m_nTags[0][0] = 0;// Begin tag
		m_nTags[0][1] = -1;
		m_nBestPrev = new int[Utility.MAX_WORDS_PER_SENTENCE][Utility.MAX_POS_PER_WORD];
		m_nBestTag = new int[Utility.MAX_WORDS_PER_SENTENCE];
		m_sWords = new String[Utility.MAX_WORDS_PER_SENTENCE];
		m_nUnknownWords = new int[Utility.MAX_UNKNOWN_PER_SENTENCE][2];
		m_nWordPosition = new int[Utility.MAX_WORDS_PER_SENTENCE];
		m_dWordsPossibility = new double[Utility.MAX_UNKNOWN_PER_SENTENCE];
		m_dFrequency = new double[Utility.MAX_WORDS_PER_SENTENCE][Utility.MAX_POS_PER_WORD];
	}

	public boolean loadContext(String fileName) {
		if (fileName != null) {
			context = new ContextStat();
			return context.load(fileName);
		}
		return false;
	}

	public void setType(TAG_TYPE type) {
		tagType = type;
	}

	public boolean posTagging(ArrayList<WordResult> wrList, Dictionary coreDict, Dictionary unknownDict) {
		int i = 0;
		int j, nStartPos;
		reset(false);
		while (i > -1 && i < wrList.size()) {
			nStartPos = i;// Start Position
			i = getFrom(wrList, nStartPos, coreDict, unknownDict);
			getBestPOS();
			switch (tagType) {
			case TT_NORMAL:// normal POS tagging
				j = 1;
				// Store the best POS tagging
				while (j < m_nCurLength && m_nBestTag[j] != -1) {
					WordResult wr = wrList.get(j + nStartPos - 1);
					wr.setHandle(m_nBestTag[j]);
					// Words whose value is 0 (e.g. 。) are skipped; otherwise,
					// if the word exists in the core dictionary, update its
					// value to the frequency of the chosen POS
					if (wr.getValue() > 0 && coreDict.isExist(wr.getWord(), -1))
						wr.setValue(coreDict.getFreq(wr.getWord(), m_nBestTag[j]));
					j += 1;
				}
				break;
			case TT_PERSON:// Person recognition
				PersonRecognize(unknownDict);
				break;
			case TT_PLACE:// Place name recognition
			case TT_TRANS_PERSON:// Transliteration Person
				PlaceRecognize(coreDict, unknownDict);
				break;
			default:
				break;
			}
			reset();
		}
		return true;
	}

	public boolean reset(boolean isContinue) {
		if (!isContinue) {
			if (tagType != Utility.TAG_TYPE.TT_NORMAL)
				m_nTags[0][0] = 100;// Begin tag
			else
				m_nTags[0][0] = 0;// Begin tag
			m_nUnknownIndex = 0;
			m_dFrequency[0][0] = 0;
			m_nStartPos = 0;
		} else {
			// Carry over the tag and cost of the last word of the previous span
			m_nTags[0][0] = m_nTags[m_nCurLength - 1][0];
			m_dFrequency[0][0] = m_dFrequency[m_nCurLength - 1][0];
		}

		// Terminate the tag list of the first slot with -1
		m_nTags[0][1] = -1;
		m_nCurLength = 1;
		m_nWordPosition[1] = m_nStartPos;
		m_sWords[0] = null;
		return true;
	}

	public boolean reset() {
		return reset(true);
	}

	private boolean disamb() {
		int i, j, k, nMinCandidate;
		double dMinFee = 0;
		double dTmp = 0;

		for (i = 1; i < m_nCurLength; i++)// For every word
		{
			for (j = 0; m_nTags[i][j] >= 0; j++)// For every candidate tag of the word
			{
				nMinCandidate = Utility.MAX_POS_PER_WORD + 1;// no candidate chosen yet
				for (k = 0; m_nTags[i - 1][k] >= 0; k++) {
					// ConvertPOS(m_nTags[i-1][k],&nKey,&nPrevPOS);
					// ConvertPOS(m_nTags[i][j],&nKey,&nCurPOS);
					// dTmp=m_context.GetContextPossibility(nKey,nPrevPOS,nCurPOS);
					dTmp = -Math.log(context.getPossibility(0, m_nTags[i - 1][k], m_nTags[i][j]));
					dTmp += m_dFrequency[i - 1][k];// Add the fees
					// Keep the candidate with the minimum accumulated fee
					if (nMinCandidate > Utility.MAX_POS_PER_WORD || dTmp < dMinFee) {
						nMinCandidate = k;
						dMinFee = dTmp;
					}
				}
				m_nBestPrev[i][j] = nMinCandidate;// The best previous for j
				m_dFrequency[i][j] = m_dFrequency[i][j] + dMinFee;
			}
		}

		return true;
	}

	public boolean getBestPOS() {
		disamb();
		for (int i = m_nCurLength - 1, j = 0; i > 0; i--) {
			if (m_sWords[i] != null) {// Not the virtual ending
				m_nBestTag[i] = m_nTags[i][j];// Record the best POS of word i
			}
			j = m_nBestPrev[i][j];
		}
		int nEnd = m_nCurLength;// Set the end of POS tagging
		if (m_sWords[m_nCurLength - 1] == null)
			nEnd = m_nCurLength - 1;
		m_nBestTag[nEnd] = -1;
		return true;
	}

	/**
	 * Fills the lattice starting at the given word and returns the next
	 * position of a word that does not appear in unknownDict.
	 * @param wrList segmented words of the sentence
	 * @param index position in wrList to start from
	 * @param coreDict core dictionary
	 * @param unknownDict dictionary used for unknown-word recognition
	 * @return the next start position, or -1 when the end of wrList is reached
	 */
	public int getFrom(ArrayList<WordResult> wrList, int index, Dictionary coreDict, Dictionary unknownDict) {

		int[] aPOS = new int[Utility.MAX_POS_PER_WORD];
		int[] aFreq = new int[Utility.MAX_POS_PER_WORD];
		int nFreq = 0, j, nRetPos = 0, nWordsIndex = 0;
		boolean bSplit = false;// Need to split in Transliteration recognition
		int i = 1, nPOSCount;
		String sCurWord;// Current word
		nWordsIndex = index;

		for (; i < Utility.MAX_WORDS_PER_SENTENCE && nWordsIndex < wrList.size(); i++) {
			WordResult wr = wrList.get(nWordsIndex);
			String word = wr.getWord();
			if (tagType == Utility.TAG_TYPE.TT_NORMAL || !unknownDict.isExist(word, 44)) {
				// Store the current word and its byte position
				m_sWords[i] = word;
				m_nWordPosition[i + 1] = m_nWordPosition[i] + m_sWords[i].getBytes().length;
			}

			// Move the start position to the end of the current word
			m_nStartPos = m_nWordPosition[i + 1];
			if (tagType != Utility.TAG_TYPE.TT_NORMAL) {
				// Get the POSs from the unknown recognition dictionary
				sCurWord = m_sWords[i];
				if (tagType == Utility.TAG_TYPE.TT_TRANS_PERSON && i > 0
						&& Utility.charType(m_sWords[i - 1]) == Utility.CT_CHINESE) {
					if (".".equals(m_sWords[i]))
						sCurWord = ".";
					else if ("-".equals(m_sWords[i]))
						sCurWord = "-";
				}
				ArrayList<WordItem> wis = unknownDict.getHandle(sCurWord);
				nPOSCount = wis.size() + 1;
				for (j = 0; j < wis.size(); j++) {
					aPOS[j] = wis.get(j).getHandle();
					aFreq[j] = wis.get(j).getFreq();
					m_nTags[i][j] = aPOS[j];
					m_dFrequency[i][j] = -Math.log((1 + aFreq[j]));
					m_dFrequency[i][j] += Math.log((context.getFreq(0, aPOS[j]) + nPOSCount));
				}

				if ("始##始".equals(m_sWords[i])) {// sentence-begin marker
					m_nTags[i][j] = 100;
					m_dFrequency[i][j] = 0;
					j++;
				} else if ("末##末".equals(m_sWords[i])) {// sentence-end marker
					m_nTags[i][j] = 101;
					m_dFrequency[i][j] = 0;
					j++;
				} else {
					wis = coreDict.getHandle(m_sWords[i]);
					nFreq = 0;
					for (int k = 0; k < wis.size(); k++) {
						aFreq[k] = wis.get(k).getFreq();
						nFreq += aFreq[k];
					}
					if (wis.size() > 0) {
						m_nTags[i][j] = 0;
						m_dFrequency[i][j] = -Math.log((double) (1 + nFreq));
						m_dFrequency[i][j] += Math.log((double) (context.getFreq(0, 0) + nPOSCount));
						j++;
					}
				}
			} else// For normal POS tagging
			{
				j = 0;
				// Get the POSs from the unknown recognition dictionary
				if (wr.getHandle() > 0) {
					// The word has only one POS; its tag and frequency are
					// already recorded in the WordResult
					m_nTags[i][j] = wr.getHandle();
					m_dFrequency[i][j] = -Math.log(wr.getValue())
							+ Math.log((double) (context.getFreq(0, m_nTags[i][j]) + 1));

					// Not permit the value less than 0
					if (m_dFrequency[i][j] < 0)
						m_dFrequency[i][j] = 0;
					j++;
				}

				// The word has multiple POSs, we should retrieve the
				// information from Core Dictionary
				else {
					if (wr.getHandle() < 0) {// Negative handle: the single POS is stored negated
						m_nTags[i][j] = -wr.getHandle();
						m_dFrequency[i][j++] = wr.getValue();

					}
					ArrayList<WordItem> wis = coreDict.getHandle(m_sWords[i]);
					nPOSCount = wis.size();
					for (; j < wis.size(); j++) {
						// Candidate POSs and frequencies from the core dictionary
						aPOS[j] = wis.get(j).getHandle();
						aFreq[j] = wis.get(j).getFreq();
						m_nTags[i][j] = aPOS[j];
						m_dFrequency[i][j] = -Math.log(1 + aFreq[j])
								+ Math.log(context.getFreq(0, m_nTags[i][j]) + nPOSCount);
					}
				}
			}

			// The POS is still unknown, so guess it from lexical knowledge
			if (j == 0) {
				j = guessPOS(i);// Guess the POS of current word
			}
			m_nTags[i][j] = -1;// Set the ending POS

			// No ambiguity, so we can break out of the loop
			if (j == 1 && m_nTags[i][j] != Utility.CT_SENTENCE_BEGIN) {
				i++;
				m_sWords[i] = null;
				break;
			}
			if (!bSplit)
				nWordsIndex++;
		}
		if (nWordsIndex == wrList.size())
			nRetPos = -1;// Reaching ending

		if (m_nTags[i - 1][1] != -1) {
			// Append a virtual ending for ambiguous tails such as "张/华/平"
			if (tagType != Utility.TAG_TYPE.TT_NORMAL)
				m_nTags[i][0] = 101;
			else
				m_nTags[i][0] = 1;

			m_dFrequency[i][0] = 0;
			m_sWords[i] = null;// Set virtual ending
			m_nTags[i++][1] = -1;
		}
		m_nCurLength = i;// The current word count
		if (nRetPos != -1)
			return nWordsIndex + 1;// Next start position
		return -1;// Reaching ending

	}

	/**
	 * <pre>
	 *          
	 *          BBCD 343 0.003606 
	 *          BBC 2 0.000021 
