📄 postagger.java

📁 基于java语言的分词系统
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
package org.ictclas4j.segment;

import java.util.ArrayList;

import org.ictclas4j.bean.ContextStat;
import org.ictclas4j.bean.Dictionary;
import org.ictclas4j.bean.POS;
import org.ictclas4j.bean.SegNode;
import org.ictclas4j.bean.WordItem;
import org.ictclas4j.utility.DebugUtil;
import org.ictclas4j.utility.POSTag;
import org.ictclas4j.utility.Utility;
import org.ictclas4j.utility.Utility.TAG_TYPE;


/**
 * 未登录词的处理
 * 
 * @author sinboy
 * @since 2007.5.17 updated
 * 
 */
public class PosTagger {

	private Dictionary coreDict;

	private Dictionary unknownDict;

	private ContextStat context;

	private int pos;

	private TAG_TYPE tagType;

	String unknownFlags;

	/**
	 * Builds a tagger for the given tag type.
	 * <p>
	 * When {@code fileName} is {@code null} nothing is initialised and every
	 * field keeps its default value. Otherwise the context statistics are
	 * loaded from {@code fileName + ".ctx"}; for every type except
	 * {@code TT_NORMAL} a separate unknown-word dictionary is loaded from
	 * {@code fileName + ".dct"}, while {@code TT_NORMAL} reuses the core
	 * dictionary.
	 *
	 * @param type
	 *            kind of tagging this instance performs
	 * @param fileName
	 *            base path (without extension) of the model files, may be null
	 * @param coreDict
	 *            core dictionary shared with the caller
	 */
	public PosTagger(TAG_TYPE type, String fileName, Dictionary coreDict) {
		if (fileName == null) {
			return;
		}

		this.coreDict = coreDict;

		if (type == Utility.TAG_TYPE.TT_NORMAL) {
			this.unknownDict = coreDict;
		} else {
			Dictionary roleDict = new Dictionary();
			roleDict.load(fileName + ".dct");
			this.unknownDict = roleDict;
		}

		ContextStat stat = new ContextStat();
		stat.load(fileName + ".ctx");
		this.context = stat;
		this.tagType = type;

		switch (type) {
		case TT_PERSON:
			// Deliberate fall-through: ordinary and transliterated person
			// names share the same POS value and placeholder.
		case TT_TRANS_PERSON:
			this.pos = -POSTag.NOUN_PERSON;
			this.unknownFlags = "未##人";
			break;
		case TT_PLACE:
			this.pos = -POSTag.NOUN_SPACE;
			this.unknownFlags = "未##地";
			break;
		default:
			this.pos = 0;
			break;
		}
	}

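	/**
	 * From the result of the initial segmentation, finds the unregistered
	 * (out-of-vocabulary) words that make up person names, place names or
	 * other words.
	 * 
	 * @param segGraph
	 *            segmentation graph passed on to the recognizers
	 * @param sns
	 *            segment nodes to be tagged
	 * @return always {@code true}
	 */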
	public boolean recognition(SegGraph segGraph, ArrayList<SegNode> sns) {
		// Nothing is done unless both arguments and all model data are present.
		boolean ready = segGraph != null && sns != null && coreDict != null
				&& unknownDict != null && context != null;
		if (!ready) {
			return true;
		}

		// Attach the candidate tags, then pick the best one per node.
		posTag(sns);
		getBestPos(sns);

		switch (tagType) {
		case TT_PERSON:
			// Person-name recognition.
			personRecognize(segGraph, sns);
			break;
		case TT_PLACE:
		case TT_TRANS_PERSON:
			// Place names and transliterated person names share one recognizer.
			placeRecognize(segGraph, sns, coreDict);
			break;
		}

		return true;
	}

	/**
	 * Tags the given nodes and, for a {@code TT_NORMAL} tagger, assigns the
	 * result of {@code getBestTag} to every node whose POS is still 0.
	 * Does nothing when {@code sns}, the unknown-word dictionary or the
	 * context statistics are missing.
	 *
	 * @param sns
	 *            segment nodes to be tagged
	 * @return always {@code true}
	 */
	public boolean recognition(ArrayList<SegNode> sns) {
		if (sns == null || unknownDict == null || context == null) {
			return true;
		}

		posTag(sns);
		getBestPos(sns);
		DebugUtil.outputPostag(sns);

		if (tagType == Utility.TAG_TYPE.TT_NORMAL) {
			for (SegNode node : sns) {
				if (node.getPos() == 0) {
					node.setPos(getBestTag(node));
				}
			}
		}

		return true;
	}

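	/**
	 * Attaches the candidate part-of-speech tags to every node of the list
	 * and appends one extra end node. The core dictionary, the unknown-word
	 * dictionary and the context statistics are taken from this tagger's
	 * fields, not from parameters.
	 * 
	 * @param sns
	 *            result of the initial segmentation
	 */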
	public void posTag(ArrayList<SegNode> sns) {
		// Nothing can be tagged without the input list and the model data.
		if (sns == null || coreDict == null || unknownDict == null || context == null) {
			return;
		}

		// Fix: with an empty list the loop below never runs, and the end-node
		// step used to evaluate sns.get(-1), which throws
		// IndexOutOfBoundsException. An empty list is now left untouched.
		if (sns.isEmpty()) {
			return;
		}

		for (int i = 0; i < sns.size(); i++) {
			SegNode sn = sns.get(i);
			// Discard candidates left over from an earlier pass.
			sn.setAllPos(null);
			String curWord = sn.getSrcWord();

			if (tagType != Utility.TAG_TYPE.TT_NORMAL) {

				// For transliterated names, look up ASCII '.' and '-' under
				// their full-width forms when the previous word is Chinese.
				if (tagType == Utility.TAG_TYPE.TT_TRANS_PERSON && i > 0) {
					String prevWord = sns.get(i - 1).getSrcWord();
					if (Utility.charType(prevWord) == Utility.CT_CHINESE) {
						if (".".equals(curWord))
							curWord = "．";
						else if ("-".equals(curWord))
							curWord = "－";
					}
				}

				// Take every tag the unknown-word dictionary lists for the word.
				ArrayList<WordItem> wis = unknownDict.getHandle(curWord);
				for (int j = 0; wis != null && j < wis.size(); j++) {
					WordItem wi = wis.get(j);
					int tag = wi.getHandle();
					double freq = -Math.log((1 + wi.getFreq()));
					freq += Math.log((context.getFreq(0, wi.getHandle()) + wis.size() + 1));
					POS pos = new POS(tag, freq);
					sn.addPos(pos);
				}

				if (Utility.SENTENCE_BEGIN.equals(curWord))
					// Sentence-begin marker gets tag 100.
					sn.addPos(new POS(100, 0));

				else if (Utility.SENTENCE_END.equals(curWord))
					// Sentence-end marker gets tag 101.
					sn.addPos(new POS(101, 0));
				else {
					// A word known to the core dictionary also gets tag 0,
					// weighted by its total frequency over all entries.
					int nFreq = 0;
					wis = coreDict.getHandle(curWord);
					if (wis != null) {
						for (WordItem wi : wis)
							nFreq += wi.getFreq();

						if (wis.size() > 0) {
							double freq = -Math.log((double) (1 + nFreq));
							freq += Math.log((double) (context.getFreq(0, 0) + wis.size()));
							sn.addPos(new POS(0, freq));
						}
					}
				}
			} else {
				if (sn.getPos() > 0) {
					// The node already has a positive POS: keep it as the only
					// candidate, with a cost clamped at zero.
					int tag = sn.getPos();
					double value = -Math.log(sn.getValue());
					value += Math.log(context.getFreq(0, tag));
					if (value < 0)
						value = 0;
					sn.addPos(new POS(tag, value));
				} else {
					if (sn.getPos() < 0) {
						// NOTE(review): assuming setPos/getPos are plain
						// accessors, the sign is flipped to positive here and
						// then negated again, so the candidate added below
						// carries the original negative tag - confirm that
						// this is intended.
						sn.setPos(-sn.getPos());
						sn.addPos(new POS(-sn.getPos(), sn.getValue()));
					}
					// Add every tag the core dictionary lists for the word.
					ArrayList<WordItem> wis = coreDict.getHandle(curWord);
					if (wis != null) {
						for (WordItem wi : wis) {
							int tag = wi.getHandle();
							double value = -Math.log(1 + wi.getFreq());
							value += Math.log(context.getFreq(0, tag) + wis.size());
							sn.addPos(new POS(tag, value));
						}
					}
				}
			}

			// No candidate found in any dictionary: fall back to guessing.
			if (sn.getAllPos() == null)
				guessPos(tagType, sn);

			// A node whose allPos is null cannot form a word on its own, so
			// it takes the first tag of the following word.
			// NOTE(review): presumably getPosSize() returns -1 when allPos is
			// null - verify against SegNode.
			if (i - 1 >= 0 && sns.get(i - 1).getPosSize() == -1) {
				if (sn.getPosSize() > 0) {
					POS pos = new POS(sn.getAllPos().get(0).getTag(), 0);
					sns.get(i - 1).addPos(pos);
				}
			}
		}

		// Append an end node: tag 101 for the non-normal types, tag 1 for
		// TT_NORMAL.
		SegNode last = sns.get(sns.size() - 1);
		if (last != null) {
			SegNode sn = new SegNode();
			int tag = 0;
			if (tagType != Utility.TAG_TYPE.TT_NORMAL)
				tag = 101;
			else
				tag = 1;
			POS pos = new POS(tag, 0);
			sn.addPos(pos);
			sns.add(sn);
		}
	}

	/**
	 * For every POS candidate of the current word, selects the one among the
	 * previous word's N candidates that matches it best.
	 */
	private void getBestPos(ArrayList<SegNode> sns) {
		if (sns == null || context == null) {
			return;
		}

		for (int idx = 0; idx < sns.size(); idx++) {
			// Candidates of the preceding node; the first node is preceded by
			// a synthetic start candidate (tag 0 for TT_NORMAL, otherwise 100).
			ArrayList<POS> previous;
			if (idx > 0) {
				previous = sns.get(idx - 1).getAllPos();
			} else {
				int startTag = tagType != Utility.TAG_TYPE.TT_NORMAL ? 100 : 0;
				previous = new ArrayList<POS>();
				previous.add(new POS(startTag, 0));
			}

			ArrayList<POS> current = sns.get(idx).getAllPos();
			if (current == null) {
				continue;
			}

			for (POS candidate : current) {
				// Defaults used when there is no predecessor candidate, or
				// none that costs less than 1000.
				int bestIndex = 0;
				double bestCost = 1000;

				if (previous != null) {
					for (int k = 0; k < previous.size(); k++) {
						POS prior = previous.get(k);
						double cost = context.getPossibility(0, prior.getTag(), candidate.getTag());
						cost = -Math.log(cost) + prior.getFreq();
						if (cost < bestCost) {
							bestCost = cost;
							bestIndex = k;
						}
					}
				}

				// Record the index of the cheapest predecessor and add its
				// cost to this candidate's own.
				candidate.setPrev(bestIndex);
				candidate.setFreq(candidate.getFreq() + bestCost);
			}
		}

		tagBest(sns);
	}

	// 猜测该词的词性
	private int guessPos(TAG_TYPE tagType, SegNode sn) {
		int result = -1;
		if (sn != null && context != null) {
			int charType;

			String word = sn.getWord();
			double freq = 0;

			switch (tagType) {
			case TT_NORMAL:
				break;
			case TT_PERSON:
				if (word.indexOf("××") != -1) {
					freq = (double) 1 / (double) (context.getFreq(0, 6) + 1);
					sn.addPos(new POS(6, freq));
				} else {
					freq = (double) 1 / (double) (context.getFreq(0, 0) + 1);
					sn.addPos(new POS(0, freq));

					if (sn.getLen() >= 4) {
						freq = (double) 1 / (double) (context.getFreq(0, 0) + 1);
						sn.addPos(new POS(0, freq));
						freq = (double) 1 / (double) (context.getFreq(0, 11) * 8);
						sn.addPos(new POS(11, freq));
						freq = (double) 1 / (double) (context.getFreq(0, 12) * 8);
						sn.addPos(new POS(12, freq));
						freq = (double) 1 / (double) (context.getFreq(0, 13) * 8);
						sn.addPos(new POS(13, freq));
					} else if (sn.getLen() == 2) {
						freq = (double) 1 / (double) (context.getFreq(0, 0) + 1);
						sn.addPos(new POS(0, freq));
						charType = Utility.charType(word);
						if (charType == Utility.CT_OTHER || charType == Utility.CT_CHINESE) {
							freq = (double) 1 / (double) (context.getFreq(0, 1) + 1);
							sn.addPos(new POS(1, freq));
							freq = (double) 1 / (double) (context.getFreq(0, 2) + 1);
							sn.addPos(new POS(2, freq));
							freq = (double) 1 / (double) (context.getFreq(0, 3) + 1);
							sn.addPos(new POS(3, freq));
							freq = (double) 1 / (double) (context.getFreq(0, 4) + 1);
							sn.addPos(new POS(4, freq));
						}
						freq = (double) 1 / (double) (context.getFreq(0, 11) * 8);
						sn.addPos(new POS(11, freq));
						freq = (double) 1 / (double) (context.getFreq(0, 12) * 8);
						sn.addPos(new POS(12, freq));
						freq = (double) 1 / (double) (context.getFreq(0, 13) * 8);
						sn.addPos(new POS(13, freq));
					}
				}
				break;
			case TT_PLACE:
				freq = (double) 1 / (double) (context.getFreq(0, 0) + 1);
				sn.addPos(new POS(0, freq));

				if (sn.getLen() >= 4) {
					freq = (double) 1 / (double) (context.getFreq(0, 11) * 8);
					sn.addPos(new POS(11, freq));
					freq = (double) 1 / (double) (context.getFreq(0, 12) * 8);
					sn.addPos(new POS(12, freq));
					freq = (double) 1 / (double) (context.getFreq(0, 13) * 8);
					sn.addPos(new POS(13, freq));
				} else if (sn.getLen() == 2) {
					freq = (double) 1 / (double) (context.getFreq(0, 0) + 1);
					sn.addPos(new POS(0, freq));
					charType = Utility.charType(word);
					if (charType == Utility.CT_OTHER || charType == Utility.CT_CHINESE) {

						freq = (double) 1 / (double) (context.getFreq(0, 1) + 1);
						sn.addPos(new POS(1, freq));
						freq = (double) 1 / (double) (context.getFreq(0, 2) + 1);
						sn.addPos(new POS(2, freq));
						freq = (double) 1 / (double) (context.getFreq(0, 3) + 1);
						sn.addPos(new POS(3, freq));
						freq = (double) 1 / (double) (context.getFreq(0, 4) + 1);
						sn.addPos(new POS(4, freq));
					}
					freq = (double) 1 / (double) (context.getFreq(0, 11) * 8);
					sn.addPos(new POS(11, freq));
					freq = (double) 1 / (double) (context.getFreq(0, 12) * 8);
					sn.addPos(new POS(12, freq));
					freq = (double) 1 / (double) (context.getFreq(0, 13) * 8);
					sn.addPos(new POS(13, freq));
				}
				break;
			case TT_TRANS_PERSON:
				freq = (double) 1 / (double) (context.getFreq(0, 0) + 1);
				sn.addPos(new POS(0, freq));
				if (!Utility.isAllChinese(word)) {
					if (Utility.isAllLetter(word)) {
						freq = (double) 1 / (double) (context.getFreq(0, 1) + 1);
						sn.addPos(new POS(1, freq));
						freq = (double) 1 / (double) (context.getFreq(0, 11) + 1);
						sn.addPos(new POS(11, freq));
						freq = (double) 1 / (double) (context.getFreq(0, 2) * 2 + 1);
12 下一页
💿 文件大小 6647 K
👤 上传用户 ddddong
📂 所属分类多国语言处理
🏷️ 相关标签

#java #语言 #分
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -