📄 postagger.java
字号:
sn.addPos(new POS(2, freq));
freq = (double) 1 / (double) (context.getFreq(0, 3) * 2 + 1);
sn.addPos(new POS(3, freq));
freq = (double) 1 / (double) (context.getFreq(0, 12) * 2 + 1);
sn.addPos(new POS(12, freq));
freq = (double) 1 / (double) (context.getFreq(0, 13) * 2 + 1);
sn.addPos(new POS(13, freq));
}
freq = (double) 1 / (double) (context.getFreq(0, 41) * 8);
sn.addPos(new POS(41, freq));
freq = (double) 1 / (double) (context.getFreq(0, 42) * 8);
sn.addPos(new POS(42, freq));
freq = (double) 1 / (double) (context.getFreq(0, 43) * 8);
sn.addPos(new POS(43, freq));
} else if (sn.getLen() >= 4) {
freq = (double) 1 / (double) (context.getFreq(0, 41) * 8);
sn.addPos(new POS(41, freq));
freq = (double) 1 / (double) (context.getFreq(0, 42) * 8);
sn.addPos(new POS(42, freq));
freq = (double) 1 / (double) (context.getFreq(0, 43) * 8);
sn.addPos(new POS(43, freq));
} else if (sn.getLen() == 2) {
charType = Utility.charType(word);
if (charType == Utility.CT_OTHER || charType == Utility.CT_CHINESE) {
freq = (double) 1 / (double) (context.getFreq(0, 1) * 2 + 1);
sn.addPos(new POS(1, freq));
freq = (double) 1 / (double) (context.getFreq(0, 2) * 2 + 1);
sn.addPos(new POS(2, freq));
freq = (double) 1 / (double) (context.getFreq(0, 3) * 2 + 1);
sn.addPos(new POS(3, freq));
freq = (double) 1 / (double) (context.getFreq(0, 30) * 8 + 1);
sn.addPos(new POS(30, freq));
freq = (double) 1 / (double) (context.getFreq(0, 11) * 4 + 1);
sn.addPos(new POS(11, freq));
freq = (double) 1 / (double) (context.getFreq(0, 12) * 4 + 1);
sn.addPos(new POS(12, freq));
freq = (double) 1 / (double) (context.getFreq(0, 13) * 4 + 1);
sn.addPos(new POS(13, freq));
freq = (double) 1 / (double) (context.getFreq(0, 21) * 2 + 1);
sn.addPos(new POS(21, freq));
freq = (double) 1 / (double) (context.getFreq(0, 22) * 2 + 1);
sn.addPos(new POS(22, freq));
freq = (double) 1 / (double) (context.getFreq(0, 23) * 2 + 1);
sn.addPos(new POS(23, freq));
}
freq = (double) 1 / (double) (context.getFreq(0, 41) * 8);
sn.addPos(new POS(41, freq));
freq = (double) 1 / (double) (context.getFreq(0, 42) * 8);
sn.addPos(new POS(42, freq));
freq = (double) 1 / (double) (context.getFreq(0, 43) * 8);
sn.addPos(new POS(43, freq));
}
break;
default:
break;
}
if (sn.getAllPos() != null)
result = sn.getAllPos().size();
}
return result;
}
/**
 * Person-name pattern matching.
 *
 * <pre>
 *
 * Pattern  count  weight
 * BBCD 343 0.003606
 * BBC 2 0.000021
 * BBE 125 0.001314
 * BBZ 30 0.000315
 * BCD 62460 0.656624
 * BEE 0 0.000000
 * BE 13899 0.146116
 * BG 869 0.009136
 * BXD 4 0.000042
 * BZ 3707 0.038971
 * CD 8596 0.090367
 * EE 26 0.000273
 * FB 871 0.009157
 * Y 3265 0.034324
 * XD 926 0.009735
 *
 * The person recognition patterns set
 * BBCD: surname + surname + given-name char 1 + given-name char 2;
 * BBE: surname + surname + single-char given name;
 * BBZ: surname + surname + two-char given name that forms a word;
 * BCD: surname + given-name char 1 + given-name char 2;
 * BE: surname + single-char given name;
 * BEE: surname + single-char given name + single-char given name;
 * BG: surname + suffix
 * BXD: surname + (surname and first given-name char forming a word) + last given-name char
 * BZ: surname + two-char given name that forms a word;
 * B: surname
 * CD: given-name char 1 + given-name char 2;
 * EE: single-char given name + single-char given name;
 * FB: prefix + surname
 * XD: (surname and first given-name char forming a word) + last given-name char
 * Y: surname and single-char given name forming a word
 * </pre>
 *
 * Converts the node list to a role string with word2pattern, scans it from index 1 and, for
 * every position where one of the patterns starts, inserts a candidate node covering the
 * matched span into the segment graph. Does nothing when either argument is null.
 *
 * @param segGraph graph that receives the recognised candidate nodes
 * @param sns      nodes whose best tags have already been marked
 */
private void personRecognize(SegGraph segGraph, ArrayList<SegNode> sns) {
    // Person-name recognition patterns; the empty string is the end-of-table sentinel.
    final String[] patterns = { "BBCD", "BBC", "BBE", "BBZ", "BCD", "BEE", "BE", "BG", "BXD", "BZ", "CDCD", "CD",
            "EE", "FB", "Y", "XD", "" };
    // Weight of each pattern, index-aligned with patterns.
    final double[] factor = { 0.003606, 0.000021, 0.001314, 0.000315, 0.656624, 0.000021, 0.146116, 0.009136,
            0.000042, 0.038971, 0, 0.090367, 0.000273, 0.009157, 0.034324, 0.009735, 0 };
    if (segGraph == null || sns == null) {
        return;
    }

    String sPos = word2pattern(sns);
    int j = 1;
    while (sPos != null && j < sPos.length()) {
        boolean bMatched = false;
        for (int k = 0; !bMatched && patterns[k].length() > 0; k++) {
            final String pattern = patterns[k];
            final int len = pattern.length();
            if (!sPos.startsWith(pattern, j)) {
                continue;
            }
            // A match only counts when the span is neither preceded nor followed by a middle
            // dot. The following node is compared by its word (comparing the node object
            // itself could never be equal to a String), and only when such a node exists.
            final int after = j + len;
            boolean dotBefore = "·".equals(sns.get(j - 1).getWord());
            boolean dotAfter = after < sns.size() && "·".equals(sns.get(after).getWord());
            if (dotBefore || dotAfter) {
                continue;
            }
            // Rule 1 for exclusion: prefix + surname followed by a given-name/suffix role
            // (E, C or G) invalidates the (prefix + surname) pattern.
            if ("FB".equals(pattern)) {
                String next = j + 2 < sPos.length() ? sPos.substring(j + 2, j + 3) : "";
                if ("E".equals(next) || "C".equals(next) || "G".equals(next)) {
                    continue;
                }
            }
            // Get the possible person name: only words whose pos is below 4 and whose
            // frequency in the unknown-word dictionary is below LITTLE_FREQUENCY are kept.
            StringBuilder name = new StringBuilder();
            for (int nPos = j; nPos < j + len; nPos++) {
                SegNode sn = sns.get(nPos);
                if (sn.getPos() < 4
                        && unknownDict.getFreq(sn.getWord(), sn.getPos()) < Utility.LITTLE_FREQUENCY) {
                    name.append(sn.getWord());
                }
            }
            String personName = name.toString();
            // CDCD never produces a candidate: it always moves on to the next pattern,
            // advancing j first when the name contains foreign characters.
            if ("CDCD".equals(pattern)) {
                if (GetForeignCharCount(personName) > 0) {
                    j += len - 1;
                }
                continue;
            }
            SegNode usn = new SegNode();
            usn.setRow(sns.get(j).getRow());
            usn.setCol(sns.get(j + len - 1).getCol());
            usn.setWord(unknownFlags);
            usn.setSrcWord(personName);
            double value = -Math.log(factor[k]) + computePossibility(j, len, sns);
            usn.setPos(pos);
            usn.setValue(value);
            segGraph.insert(usn, true);
            j += len;
            bMatched = true;
        }
        if (!bMatched) {
            // Not matched, add j by 1
            j += 1;
        }
    }
}
// TODO: not implemented yet. Presumably meant to count the foreign characters in a
// candidate person name (going by the method name); for now it always returns 0.
private int GetForeignCharCount(String personName) {
// Placeholder result: the argument is ignored.
return 0;
}
/**
 * Place-name pattern matching.
 *
 * For every node from index 1 onwards whose best tag is 1 or 2, scans forward while the
 * following nodes continue the sequence, concatenating their source words, and inserts a
 * candidate node covering the scanned span into the segment graph. After a head tagged 1 the
 * sequence may continue with tags 1, 2 or 3; after a head tagged 2 only with tags 2 or 3.
 * Nodes without a best tag (-1) are stepped over without contributing text.
 * Does nothing when any argument is null.
 *
 * @param segGraph graph that receives the recognised candidate nodes
 * @param sns      nodes whose best tags have already been marked
 * @param coreDict only checked for null here; not otherwise read by this method
 */
private void placeRecognize(SegGraph segGraph, ArrayList<SegNode> sns, Dictionary coreDict) {
    if (segGraph == null || sns == null || coreDict == null) {
        return;
    }
    // NOTE(review): the penalty is accumulated across the whole pass but never folded into
    // the candidate's value below - confirm whether the score was meant to include it.
    double dPanelty = 1;
    for (int i = 1; i < sns.size(); i++) {
        final int start = i;
        int end = start;
        String srcWord = sns.get(i).getSrcWord();
        final int headTag = getBestTag(sns, i);
        if (headTag == 1 || headTag == 2) {
            if (headTag == 2) {
                dPanelty += 1;
            }
            for (end = i + 1; end < sns.size(); end++) {
                int bestTag = getBestTag(sns, end);
                if (bestTag == -1) {
                    continue;
                }
                // Tag 3 always continues the sequence; tag 1 only does so after a head
                // tagged 1. Both add a penalty once past the first follower.
                boolean penalisedFollower = bestTag == 3 || (headTag == 1 && bestTag == 1);
                if (penalisedFollower) {
                    if (end > i + 1) {
                        dPanelty += 1;
                    }
                    srcWord += sns.get(end).getSrcWord();
                } else if (bestTag == 2) {
                    srcWord += sns.get(end).getSrcWord();
                } else {
                    break;
                }
            }
        }
        if (end > start) {
            SegNode newsn = new SegNode();
            newsn.setRow(sns.get(start).getRow());
            newsn.setCol(sns.get(end - 1).getCol());
            newsn.setPos(pos);
            newsn.setWord(unknownFlags);
            newsn.setSrcWord(srcWord);
            // The scored span deliberately includes the node at 'end' (length end-start+1).
            double value = computePossibility(start, end - start + 1, sns);
            newsn.setValue(value);
            segGraph.insert(newsn, true);
        }
    }
}
/**
 * Returns the best tag of the node at the given position.
 *
 * @param sns   node list, may be null
 * @param index position to look up
 * @return the node's best tag, or -1 when the list is null, the index is out of range,
 *         or the node has no candidate marked as best
 */
private int getBestTag(ArrayList<SegNode> sns, int index) {
    boolean outOfRange = sns == null || index < 0 || index >= sns.size();
    return outOfRange ? -1 : getBestTag(sns.get(index));
}
/**
 * Returns the tag of the first candidate of the node that is marked as best.
 *
 * @param sn node to inspect, may be null
 * @return the tag of the first best candidate, or -1 when the node is null, has no
 *         candidate list, or none of its candidates is marked as best
 */
private int getBestTag(SegNode sn) {
    if (sn == null) {
        return -1;
    }
    ArrayList<POS> candidates = sn.getAllPos();
    if (candidates == null) {
        return -1;
    }
    for (int i = 0; i < candidates.size(); i++) {
        POS candidate = candidates.get(i);
        if (candidate.isBest()) {
            return candidate.getTag();
        }
    }
    return -1;
}
/**
 * Judges whether the name is a two-character given name.
 *
 * Compares two log-scores built from the unknown-word dictionary and the context: one for
 * reading the two characters under tags 2 and 3 (two-character given name), one for reading
 * them under tags 1 and 4 (single given name). The name counts as a given name only when
 * the single-name score is strictly lower.
 *
 * @param sName candidate name, may be null
 * @return false when sName is null or is not exactly two non-ASCII characters, or when the
 *         single-name score is at least the given-name score; true otherwise
 */
public boolean isGivenName(String sName) {
    if (sName == null) {
        return false;
    }
    // The name must consist of exactly two double-byte (non-ASCII) characters. This is
    // checked on the characters themselves: the byte length of getBytes() depends on the
    // platform default charset (two Chinese characters are 4 bytes in GBK but 6 in UTF-8).
    if (sName.length() != 2) {
        return false;
    }
    for (int i = 0; i < sName.length(); i++) {
        char c = sName.charAt(i);
        // A surrogate half is not a complete character, so it cannot be one of the two.
        if (c <= 0x7F || Character.isHighSurrogate(c) || Character.isLowSurrogate(c)) {
            return false;
        }
    }
    String firstChar = sName.substring(0, 1);
    String secondChar = sName.substring(1);

    // given Name Possibility
    double gnp = 0;
    // The possibility of P(Wi|Ti)
    gnp += Math.log((double) unknownDict.getFreq(firstChar, 2) + 1.0);
    gnp -= Math.log(context.getFreq(0, 2) + 1.0);
    gnp += Math.log((double) unknownDict.getFreq(secondChar, 3) + 1.0);
    gnp -= Math.log(context.getFreq(0, 3) + 1.0);
    // The possibility of conversion from 2 to 3
    gnp += Math.log(context.getPossibility(0, 2, 3) + 1.0);
    gnp -= Math.log(context.getFreq(0, 2) + 1.0);

    // singleNamePossibility
    double snp = 0;
    // The possibility of P(Wi|Ti)
    snp += Math.log((double) unknownDict.getFreq(firstChar, 1) + 1.0);
    snp -= Math.log(context.getFreq(0, 1) + 1.0);
    snp += Math.log((double) unknownDict.getFreq(secondChar, 4) + 1.0);
    snp -= Math.log(context.getFreq(0, 4) + 1.0);
    // The possibility of conversion from 1 to 4
    snp += Math.log(context.getPossibility(0, 1, 4) + 1.0);
    snp -= Math.log(context.getFreq(0, 1) + 1.0);

    // When being a single given name is at least as likely as being a 2-char given name,
    // the answer is false. Written as a negation so a NaN score still yields true, exactly
    // as the original if/return pair did.
    return !(snp >= gnp);
}
/**
 * Converts the node list produced by the first segmentation pass into a person-name role
 * string: one character per node, obtained by adding the node's best tag to 'A'
 * (a node without a best tag, -1, therefore yields '@').
 *
 * @param sns node list, may be null
 * @return the role string, or null when sns is null
 */
private String word2pattern(ArrayList<SegNode> sns) {
    if (sns == null) {
        return null;
    }
    StringBuilder roles = new StringBuilder(sns.size());
    for (SegNode node : sns) {
        roles.append((char) (getBestTag(node) + 'A'));
    }
    return roles.toString();
}
/**
 * Marks the best part-of-speech candidate on each node.
 *
 * Walks the list from the last node to the first. The candidate index starts at 0 and, after
 * each node that has a candidate at that index, becomes that candidate's getPrev() value.
 * A node lacking such a candidate receives a new best candidate carrying the best tag of
 * the node after it, provided that following node is not the last one in the list.
 * Finally, when the list has more than one node and the last node's word is null, that last
 * node is removed.
 *
 * @param sns node list to tag in place, may be null
 */
private void tagBest(ArrayList<SegNode> sns) {
    if (sns == null) {
        return;
    }
    final int count = sns.size();
    int chosen = 0;
    int idx = count - 1;
    // The start and end markers are not considered.
    while (idx >= 0) {
        SegNode node = sns.get(idx);
        ArrayList<POS> candidates = node.getAllPos();
        boolean hasChosen = candidates != null && candidates.size() > chosen;
        if (hasChosen) {
            POS best = candidates.get(chosen);
            best.setBest(true);
            chosen = best.getPrev();
        } else if (idx < count - 2) {
            POS inherited = new POS(getBestTag(sns.get(idx + 1)), 0);
            inherited.setBest(true);
            node.addPos(inherited);
        }
        idx--;
    }
    // Drop the end point: it was only needed to obtain the best part of speech of the
    // final sentence-end marker word.
    if (count > 1 && sns.get(count - 1).getWord() == null) {
        sns.remove(count - 1);
    }
}
/**
 * Sums a log-score over the nodes in [startPos, startPos + length).
 *
 * For each node that has a best tag, adds log(context frequency of the node's pos + 1)
 * minus log(unknown-dictionary frequency of the node's source word under its best tag + 1).
 * Nodes without a best tag contribute nothing. The range is clamped to the end of the list,
 * so a span that runs past the last node is scored over the nodes that exist.
 *
 * @param startPos index of the first node to score
 * @param length   number of nodes requested
 * @param sns      node list, may be null
 * @return the accumulated score, or 0 when sns, unknownDict or context is null
 */
private double computePossibility(int startPos, int length, ArrayList<SegNode> sns) {
    double retValue = 0;
    if (sns == null || unknownDict == null || context == null) {
        return retValue;
    }
    // Callers may request one node past the matched span; never read beyond the list.
    final int limit = Math.min(startPos + length, sns.size());
    for (int i = startPos; i < limit; i++) {
        SegNode sn = sns.get(i);
        int bestTag = getBestTag(sn);
        if (bestTag != -1) {
            int freq = unknownDict.getFreq(sn.getSrcWord(), bestTag);
            // NOTE(review): the context frequency is looked up by sn.getPos() while the
            // word frequency uses bestTag - confirm that the two keys are meant to differ.
            double posPoss = Math.log((double) (context.getFreq(0, sn.getPos()) + 1));
            posPoss += -Math.log((double) (freq + 1));
            retValue += posPoss;
        }
    }
    return retValue;
}
/**
 * Returns the dictionary held in the unknownDict field.
 *
 * @return the unknownDict field as-is (no copy is made)
 */
public Dictionary getUnknownDict() {
return unknownDict;
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -