📄 postagger.java
字号:
package org.ictclas4j.segment;
import java.util.ArrayList;
import org.ictclas4j.bean.ContextStat;
import org.ictclas4j.bean.Dictionary;
import org.ictclas4j.bean.POS;
import org.ictclas4j.bean.SegNode;
import org.ictclas4j.bean.WordItem;
import org.ictclas4j.utility.DebugUtil;
import org.ictclas4j.utility.POSTag;
import org.ictclas4j.utility.Utility;
import org.ictclas4j.utility.Utility.TAG_TYPE;
/**
* 未登录词的处理
*
* @author sinboy
* @since 2007.5.17 updated
*
*/
public class PosTagger {
// NOTE: the constructor only assigns these fields when it is given a non-null fileName.

// Core dictionary handed to the constructor; posTag looks words up in it.
private Dictionary coreDict;
// Dictionary queried by posTag for the non-TT_NORMAL modes: the constructor aliases it
// to coreDict for TT_NORMAL, otherwise loads a separate one from fileName + ".dct".
private Dictionary unknownDict;
// Context statistics loaded by the constructor from fileName + ".ctx".
private ContextStat context;
// Set by the constructor to -POSTag.NOUN_PERSON (TT_PERSON / TT_TRANS_PERSON),
// -POSTag.NOUN_SPACE (TT_PLACE), or 0 for any other mode.
private int pos;
// Tagging mode selected at construction time.
private TAG_TYPE tagType;
// Marker string set by the constructor: "未##人" for the person modes, "未##地" for the
// place mode; not assigned for other modes.
String unknownFlags;
/**
 * Creates a tagger for the given mode.
 *
 * <p>When {@code fileName} is {@code null} nothing is initialised and every field keeps
 * its default value. Otherwise the context statistics are loaded from
 * {@code fileName + ".ctx"}; for every mode except {@code TT_NORMAL} a dedicated
 * dictionary is additionally loaded from {@code fileName + ".dct"}.
 *
 * @param type     tagging mode
 * @param fileName common path prefix of the ".dct" / ".ctx" resources, may be null
 * @param coreDict core dictionary; also used as the unknown-word dictionary in
 *                 {@code TT_NORMAL} mode
 */
public PosTagger(TAG_TYPE type, String fileName, Dictionary coreDict) {
    if (fileName == null) {
        return;
    }

    this.coreDict = coreDict;

    // TT_NORMAL shares the core dictionary; every other mode owns a separate one.
    if (type != Utility.TAG_TYPE.TT_NORMAL) {
        this.unknownDict = new Dictionary();
        this.unknownDict.load(fileName + ".dct");
    } else {
        this.unknownDict = coreDict;
    }

    this.context = new ContextStat();
    this.context.load(fileName + ".ctx");
    this.tagType = type;

    switch (type) {
    case TT_PERSON:
    case TT_TRANS_PERSON:
        // Plain and transliterated person names share the same flag and POS value.
        this.pos = -POSTag.NOUN_PERSON;
        this.unknownFlags = "未##人";
        break;
    case TT_PLACE:
        this.pos = -POSTag.NOUN_SPACE;
        this.unknownFlags = "未##地";
        break;
    default:
        this.pos = 0;
        break;
    }
}
/**
 * Looks through an initial segmentation for unregistered words that make up person
 * names, place names or other words.
 *
 * <p>The nodes are first tagged ({@code posTag}) and the best tag sequence selected
 * ({@code getBestPos}); the mode-specific recogniser is then invoked. Nothing happens
 * if any argument or any of the dictionaries / context statistics is missing.
 *
 * @param segGraph segmentation graph passed on to the recogniser
 * @param sns      segmentation nodes; modified in place by the tagging step
 * @return always {@code true}
 */
public boolean recognition(SegGraph segGraph, ArrayList<SegNode> sns) {
    boolean inputsPresent = segGraph != null && sns != null;
    boolean resourcesLoaded = coreDict != null && unknownDict != null && context != null;
    if (!inputsPresent || !resourcesLoaded) {
        return true;
    }

    posTag(sns);
    getBestPos(sns);

    switch (tagType) {
    case TT_PERSON:
        // Person recognition
        personRecognize(segGraph, sns);
        break;
    case TT_PLACE:
    case TT_TRANS_PERSON:
        // Place names and transliterated person names share one recogniser.
        placeRecognize(segGraph, sns, coreDict);
        break;
    default:
        break;
    }
    return true;
}
/**
 * Tags the given nodes and, in {@code TT_NORMAL} mode, assigns the best tag to every
 * node whose POS is still 0.
 *
 * <p>Does nothing when {@code sns}, the unknown-word dictionary or the context
 * statistics are missing. The tagged nodes are passed to
 * {@code DebugUtil.outputPostag} after tagging.
 *
 * @param sns segmentation nodes; modified in place
 * @return always {@code true}
 */
public boolean recognition(ArrayList<SegNode> sns) {
    if (sns == null || unknownDict == null || context == null) {
        return true;
    }

    posTag(sns);
    getBestPos(sns);
    DebugUtil.outputPostag(sns);

    switch (tagType) {
    case TT_NORMAL:
        for (SegNode node : sns) {
            // Only nodes without an assigned POS receive the computed best tag.
            if (node.getPos() == 0) {
                node.setPos(getBestTag(node));
            }
        }
        break;
    default:
        break;
    }
    return true;
}
/**
 * Collects the candidate POS tags of every node and appends a terminating node.
 *
 * <p>For each node the previous candidates are cleared and rebuilt:
 * <ul>
 * <li>non-{@code TT_NORMAL} modes: one candidate per entry of the unknown-word
 * dictionary, plus tag 100 / 101 for the sentence begin / end markers, or tag 0
 * weighted by the word's summed core-dictionary frequency;</li>
 * <li>{@code TT_NORMAL} mode: the node's own POS when it is positive, otherwise one
 * candidate per core-dictionary entry.</li>
 * </ul>
 * Nodes that end up without candidates are handed to {@code guessPos}.
 *
 * <p>The method does nothing when {@code sns}, either dictionary or the context
 * statistics are missing. An empty list is left untouched (previously this raised an
 * {@link IndexOutOfBoundsException}).
 *
 * @param sns result of the initial segmentation; modified in place
 */
public void posTag(ArrayList<SegNode> sns) {
    if (sns == null || coreDict == null || unknownDict == null || context == null) {
        return;
    }

    for (int i = 0; i < sns.size(); i++) {
        SegNode sn = sns.get(i);
        sn.setAllPos(null);
        String curWord = sn.getSrcWord();

        if (tagType != Utility.TAG_TYPE.TT_NORMAL) {
            // Original comment: convert full-width characters to half-width.
            // NOTE(review): as written both replacements are no-ops (same literal on
            // both sides); a full-width literal was probably lost in an encoding
            // conversion - confirm against the upstream source before relying on this.
            if (tagType == Utility.TAG_TYPE.TT_TRANS_PERSON && i > 0) {
                String prevWord = sns.get(i - 1).getSrcWord();
                if (Utility.charType(prevWord) == Utility.CT_CHINESE) {
                    if (".".equals(curWord))
                        curWord = ".";
                    else if ("-".equals(curWord))
                        curWord = "-";
                }
            }

            // Every tag the unknown-word dictionary knows for the current word.
            ArrayList<WordItem> wis = unknownDict.getHandle(curWord);
            if (wis != null) {
                for (WordItem wi : wis) {
                    int tag = wi.getHandle();
                    double freq = -Math.log((1 + wi.getFreq()));
                    freq += Math.log((context.getFreq(0, tag) + wis.size() + 1));
                    sn.addPos(new POS(tag, freq));
                }
            }

            if (Utility.SENTENCE_BEGIN.equals(curWord)) {
                sn.addPos(new POS(100, 0));
            } else if (Utility.SENTENCE_END.equals(curWord)) {
                sn.addPos(new POS(101, 0));
            } else {
                // Known core-dictionary words get tag 0, weighted by their summed
                // frequency over all core entries.
                int nFreq = 0;
                wis = coreDict.getHandle(curWord);
                if (wis != null) {
                    for (WordItem wi : wis)
                        nFreq += wi.getFreq();
                    if (wis.size() > 0) {
                        double freq = -Math.log((double) (1 + nFreq));
                        freq += Math.log((double) (context.getFreq(0, 0) + wis.size()));
                        sn.addPos(new POS(0, freq));
                    }
                }
            }
        } else {
            if (sn.getPos() > 0) {
                // The node already carries a single POS: it is the only candidate.
                int tag = sn.getPos();
                double value = -Math.log(sn.getValue());
                value += Math.log(context.getFreq(0, tag));
                if (value < 0)
                    value = 0; // negative values are clamped to 0
                sn.addPos(new POS(tag, value));
            } else {
                if (sn.getPos() < 0) {
                    // NOTE(review): the POS is flipped to positive and then negated
                    // again, so the candidate is added with a negative tag. Kept as in
                    // the original; confirm whether a positive tag was intended.
                    sn.setPos(-sn.getPos());
                    sn.addPos(new POS(-sn.getPos(), sn.getValue()));
                }
                ArrayList<WordItem> wis = coreDict.getHandle(curWord);
                if (wis != null) {
                    for (WordItem wi : wis) {
                        int tag = wi.getHandle();
                        double value = -Math.log(1 + wi.getFreq());
                        value += Math.log(context.getFreq(0, tag) + wis.size());
                        sn.addPos(new POS(tag, value));
                    }
                }
            }
        }

        if (sn.getAllPos() == null)
            guessPos(tagType, sn);

        // Original comment: a node whose allPos is null cannot form a word on its own;
        // its POS follows that of the next word. Here the previous node receives the
        // first tag of the current node when its getPosSize() reports -1.
        if (i - 1 >= 0 && sns.get(i - 1).getPosSize() == -1) {
            if (sn.getPosSize() > 0) {
                POS inherited = new POS(sn.getAllPos().get(0).getTag(), 0);
                sns.get(i - 1).addPos(inherited);
            }
        }
    }

    // Append the terminating node. Skipped for an empty list: there is no last node,
    // and the former sns.get(i - 1) lookup threw for index -1.
    if (!sns.isEmpty()) {
        int endTag = tagType != Utility.TAG_TYPE.TT_NORMAL ? 101 : 1;
        SegNode endNode = new SegNode();
        endNode.addPos(new POS(endTag, 0));
        sns.add(endNode);
    }
}
/**
 * For every candidate tag of every node, picks the candidate of the preceding node
 * that combines best with it, then calls {@code tagBest}.
 *
 * <p>Each candidate stores the index of the chosen predecessor candidate and has the
 * predecessor's cost (transition cost plus accumulated frequency) added to its own
 * frequency. The first node is matched against a synthetic predecessor tagged 100
 * (non-{@code TT_NORMAL} modes) or 0 ({@code TT_NORMAL}). When no predecessor scores
 * below the initial bound of 1000, index 0 and the bound itself are used.
 *
 * @param sns tagged segmentation nodes; ignored when null or when no context
 *            statistics are loaded
 */
private void getBestPos(ArrayList<SegNode> sns) {
    if (sns == null || context == null) {
        return;
    }

    for (int idx = 0; idx < sns.size(); idx++) {
        ArrayList<POS> previous;
        if (idx > 0) {
            previous = sns.get(idx - 1).getAllPos();
        } else {
            // Synthetic start state preceding the first node.
            int startTag = tagType != Utility.TAG_TYPE.TT_NORMAL ? 100 : 0;
            previous = new ArrayList<POS>();
            previous.add(new POS(startTag, 0));
        }

        ArrayList<POS> candidates = sns.get(idx).getAllPos();
        if (candidates == null) {
            continue;
        }

        for (POS candidate : candidates) {
            int bestPrev = 0;
            double bestCost = 1000;
            for (int k = 0; previous != null && k < previous.size(); k++) {
                POS prior = previous.get(k);
                double cost = -Math.log(context.getPossibility(0, prior.getTag(), candidate.getTag()))
                        + prior.getFreq();
                if (cost < bestCost) {
                    bestCost = cost;
                    bestPrev = k;
                }
            }
            candidate.setPrev(bestPrev);
            candidate.setFreq(candidate.getFreq() + bestCost);
        }
    }

    tagBest(sns);
}
// 猜测该词的词性
private int guessPos(TAG_TYPE tagType, SegNode sn) {
int result = -1;
if (sn != null && context != null) {
int charType;
String word = sn.getWord();
double freq = 0;
switch (tagType) {
case TT_NORMAL:
break;
case TT_PERSON:
if (word.indexOf("××") != -1) {
freq = (double) 1 / (double) (context.getFreq(0, 6) + 1);
sn.addPos(new POS(6, freq));
} else {
freq = (double) 1 / (double) (context.getFreq(0, 0) + 1);
sn.addPos(new POS(0, freq));
if (sn.getLen() >= 4) {
freq = (double) 1 / (double) (context.getFreq(0, 0) + 1);
sn.addPos(new POS(0, freq));
freq = (double) 1 / (double) (context.getFreq(0, 11) * 8);
sn.addPos(new POS(11, freq));
freq = (double) 1 / (double) (context.getFreq(0, 12) * 8);
sn.addPos(new POS(12, freq));
freq = (double) 1 / (double) (context.getFreq(0, 13) * 8);
sn.addPos(new POS(13, freq));
} else if (sn.getLen() == 2) {
freq = (double) 1 / (double) (context.getFreq(0, 0) + 1);
sn.addPos(new POS(0, freq));
charType = Utility.charType(word);
if (charType == Utility.CT_OTHER || charType == Utility.CT_CHINESE) {
freq = (double) 1 / (double) (context.getFreq(0, 1) + 1);
sn.addPos(new POS(1, freq));
freq = (double) 1 / (double) (context.getFreq(0, 2) + 1);
sn.addPos(new POS(2, freq));
freq = (double) 1 / (double) (context.getFreq(0, 3) + 1);
sn.addPos(new POS(3, freq));
freq = (double) 1 / (double) (context.getFreq(0, 4) + 1);
sn.addPos(new POS(4, freq));
}
freq = (double) 1 / (double) (context.getFreq(0, 11) * 8);
sn.addPos(new POS(11, freq));
freq = (double) 1 / (double) (context.getFreq(0, 12) * 8);
sn.addPos(new POS(12, freq));
freq = (double) 1 / (double) (context.getFreq(0, 13) * 8);
sn.addPos(new POS(13, freq));
}
}
break;
case TT_PLACE:
freq = (double) 1 / (double) (context.getFreq(0, 0) + 1);
sn.addPos(new POS(0, freq));
if (sn.getLen() >= 4) {
freq = (double) 1 / (double) (context.getFreq(0, 11) * 8);
sn.addPos(new POS(11, freq));
freq = (double) 1 / (double) (context.getFreq(0, 12) * 8);
sn.addPos(new POS(12, freq));
freq = (double) 1 / (double) (context.getFreq(0, 13) * 8);
sn.addPos(new POS(13, freq));
} else if (sn.getLen() == 2) {
freq = (double) 1 / (double) (context.getFreq(0, 0) + 1);
sn.addPos(new POS(0, freq));
charType = Utility.charType(word);
if (charType == Utility.CT_OTHER || charType == Utility.CT_CHINESE) {
freq = (double) 1 / (double) (context.getFreq(0, 1) + 1);
sn.addPos(new POS(1, freq));
freq = (double) 1 / (double) (context.getFreq(0, 2) + 1);
sn.addPos(new POS(2, freq));
freq = (double) 1 / (double) (context.getFreq(0, 3) + 1);
sn.addPos(new POS(3, freq));
freq = (double) 1 / (double) (context.getFreq(0, 4) + 1);
sn.addPos(new POS(4, freq));
}
freq = (double) 1 / (double) (context.getFreq(0, 11) * 8);
sn.addPos(new POS(11, freq));
freq = (double) 1 / (double) (context.getFreq(0, 12) * 8);
sn.addPos(new POS(12, freq));
freq = (double) 1 / (double) (context.getFreq(0, 13) * 8);
sn.addPos(new POS(13, freq));
}
break;
case TT_TRANS_PERSON:
freq = (double) 1 / (double) (context.getFreq(0, 0) + 1);
sn.addPos(new POS(0, freq));
if (!Utility.isAllChinese(word)) {
if (Utility.isAllLetter(word)) {
freq = (double) 1 / (double) (context.getFreq(0, 1) + 1);
sn.addPos(new POS(1, freq));
freq = (double) 1 / (double) (context.getFreq(0, 11) + 1);
sn.addPos(new POS(11, freq));
freq = (double) 1 / (double) (context.getFreq(0, 2) * 2 + 1);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -