📄 span.java
字号:
* BBE 125 0.001314
* BBZ 30 0.000315
* BCD 62460 0.656624
* BEE 0 0.000000
* BE 13899 0.146116
* BG 869 0.009136
* BXD 4 0.000042
* BZ 3707 0.038971
* CD 8596 0.090367
* EE 26 0.000273
* FB 871 0.009157
* Y 3265 0.034324
* XD 926 0.009735
*
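* The set of person-name recognition patterns.
* Role letters, as used in the list below: B = surname, C = first character of a two-character
* given name, D = second character of a two-character given name, E = single-character given
* name, F = prefix, G = suffix, X = surname and first given-name character forming a word,
* Y = surname and single-character given name forming a word, Z = two-character given name
* forming a word.
* BBCD: surname + surname + given-name char 1 + given-name char 2;
* BBE: surname + surname + single-character given name;
* BBZ: surname + surname + two-character given name forming a word;
* BCD: surname + given-name char 1 + given-name char 2;
* BE: surname + single-character given name;
* BEE: surname + single-character given name + single-character given name; e.g. 韩磊磊
* BG: surname + suffix
* BXD: surname + (surname and first given-name character forming a word) + last given-name character
* BZ: surname + two-character given name forming a word;
* B: surname
* CD: given-name char 1 + given-name char 2;
* EE: single-character given name + single-character given name;
* FB: prefix + surname
* XD: (surname and first given-name character forming a word) + last given-name character
* Y: surname and single-character given name forming a word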
* </pre>
*/
public boolean PersonRecognize(Dictionary personDict) {
    // Scans the best tag sequence (m_nBestTag, from index 1 up to the first negative tag) for
    // the role patterns listed below. Every accepted match is appended to m_nUnknownWords /
    // m_dWordsPossibility and m_nUnknownIndex is advanced. personDict supplies the word/role
    // frequencies used by ComputePossibility. This method always returns true.

    // Candidate patterns in priority order; the empty string (length 0) terminates the table.
    final String[] patterns = { "BBCD", "BBC", "BBE", "BBZ", "BCD", "BEE", "BE", "BG", "BXD", "BZ", "CDCD", "CD",
            "EE", "FB", "Y", "XD", "" };
    // Per-pattern weight, index-aligned with patterns; -log(factor) is added to the score.
    //                       BBCD      BBC       BBE       BBZ       BCD       BEE       BE        BG
    final double[] factor = { 0.003606, 0.000021, 0.001314, 0.000315, 0.656624, 0.000021, 0.146116, 0.009136,
            // BXD    BZ        CDCD CD     EE        FB        Y         XD
            0.000042, 0.038971, 0, 0.090367, 0.000273, 0.009157, 0.034324, 0.009735, 0 };
    // Number of roles (= words) each pattern covers, index-aligned with patterns.
    final int[] patternLen = { 4, 3, 3, 3, 3, 3, 2, 2, 3, 2, 4, 2, 2, 2, 1, 2, 0 };

    // Encode each tag as a letter ('A' + tag). The leading "z" is a filler for index 0 so that
    // a character position in sPOS equals the corresponding word index.
    final StringBuilder posBuilder = new StringBuilder("z");
    int i;
    for (i = 1; m_nBestTag[i] > -1; i++) {
        posBuilder.append((char) (m_nBestTag[i] + 'A'));
    }
    final String sPOS = posBuilder.toString();

    int j = 1; // current position in the tag sequence
    while (j < i) {
        boolean bMatched = false;
        for (int k = 0; !bMatched && patternLen[k] > 0; k++) {
            // The pattern must start exactly at j, and neither the word just before the
            // candidate nor the word just after it may be the separator "·".
            if (!sPOS.startsWith(patterns[k], j) || "·".equals(m_sWords[j - 1])
                    || "·".equals(m_sWords[j + patternLen[k]])) {
                continue;
            }

            // Exclusion rule 1: prefix + surname followed by a role E, C or G is rejected as
            // an "FB" match. The look-ahead is bounds-checked and evaluated only for "FB", so
            // a one-role pattern matching the final tag can no longer index past the string.
            if ("FB".equals(patterns[k]) && j + 2 < sPOS.length()) {
                final char following = sPOS.charAt(j + 2);
                if (following == 'E' || following == 'C' || following == 'G') {
                    continue;
                }
            }

            // "CDCD" is itself an exclusion pattern: it never produces a candidate. When the
            // covered text contains foreign-name characters the scan position is moved
            // forward by patternLen - 1 before the remaining patterns are tried.
            if ("CDCD".equals(patterns[k])) {
                final StringBuilder personName = new StringBuilder();
                for (int nPos = j; nPos < j + patternLen[k]; nPos++) {
                    personName.append(m_sWords[nPos]);
                }
                if (GetForeignCharCount(personName.toString()) > 0) {
                    j += patternLen[k] - 1;
                }
                continue;
            }

            // Record the candidate span and its score, then resume scanning after the match.
            m_nUnknownWords[m_nUnknownIndex][0] = m_nWordPosition[j];
            m_nUnknownWords[m_nUnknownIndex][1] = m_nWordPosition[j + patternLen[k]];
            m_dWordsPossibility[m_nUnknownIndex] = -Math.log(factor[k])
                    + ComputePossibility(j, patternLen[k], personDict);
            m_nUnknownIndex += 1;
            j += patternLen[k];
            bMatched = true;
        }
        if (!bMatched) {
            // No pattern starts here; move on by one role.
            j += 1;
        }
    }
    return true;
}
/**
 * Fills in guessed candidate tags for the word at {@code index}, according to the current
 * {@code tagType}. Each candidate is written to {@code m_nTags[index][slot]} together with a
 * weight in {@code m_dFrequency[index][slot]} computed as
 * {@code 1 / (context.getFreq(0, tag) * scale + bias)}.
 *
 * @param index position of the word in {@code m_sWords}
 * @return the number of candidate slots written (0 for {@code TT_NORMAL} and unknown types)
 */
private int guessPOS(int index) {
    final int i = index;
    int j = 0; // next free candidate slot
    int nLen;
    switch (tagType) {
    case TT_NORMAL:
        break;
    case TT_PERSON:
        if (m_sWords[index].indexOf("××") != -1) {
            j = putGuess(i, j, 6, 1, 1);
        } else {
            j = putGuess(i, j, 0, 1, 1);
            // NOTE(review): getBytes() uses the platform default charset, so this byte length
            // (and therefore which branch runs) can differ between platforms. The TT_PLACE and
            // TT_TRANS_PERSON cases apply the same thresholds to String.length() instead.
            // Confirm which unit is intended.
            nLen = m_sWords[index].getBytes().length;
            if (nLen >= 4) {
                j = putGuess(i, j, 0, 1, 1);
                j = putGuessRun(i, j, 11, 3, 8, 0);
            } else if (nLen == 2) {
                j = putGuess(i, j, 0, 1, 1);
                if (isOtherOrChineseType(m_sWords[index])) {
                    j = putGuessRun(i, j, 1, 4, 1, 1);
                }
                j = putGuessRun(i, j, 11, 3, 8, 0);
            }
        }
        break;
    case TT_PLACE:
        j = putGuess(i, j, 0, 1, 1);
        nLen = m_sWords[index].length();
        if (nLen >= 4) {
            j = putGuessRun(i, j, 11, 3, 8, 0);
        } else if (nLen == 2) {
            j = putGuess(i, j, 0, 1, 1);
            if (isOtherOrChineseType(m_sWords[index])) {
                j = putGuessRun(i, j, 1, 4, 1, 1);
            }
            j = putGuessRun(i, j, 11, 3, 8, 0);
        }
        break;
    case TT_TRANS_PERSON:
        nLen = m_sWords[index].length();
        j = putGuess(i, j, 0, 1, 1);
        if (!Utility.isAllChinese(m_sWords[index])) {
            if (Utility.isAllLetter(m_sWords[index])) {
                // Candidate order is significant: 1, 11, 2, 3, 12, 13.
                j = putGuess(i, j, 1, 1, 1);
                j = putGuess(i, j, 11, 1, 1);
                j = putGuessRun(i, j, 2, 2, 2, 1);
                j = putGuessRun(i, j, 12, 2, 2, 1);
            }
            j = putGuessRun(i, j, 41, 3, 8, 0);
        } else if (nLen >= 4) {
            j = putGuessRun(i, j, 41, 3, 8, 0);
        } else if (nLen == 2) {
            if (isOtherOrChineseType(m_sWords[index])) {
                // Candidate order: 1-3, 30, 11-13, 21-23.
                j = putGuessRun(i, j, 1, 3, 2, 1);
                j = putGuess(i, j, 30, 8, 1);
                j = putGuessRun(i, j, 11, 3, 4, 1);
                j = putGuessRun(i, j, 21, 3, 2, 1);
            }
            j = putGuessRun(i, j, 41, 3, 8, 0);
        }
        break;
    default:
        break;
    }
    return j;
}

/**
 * Stores one guessed tag and its weight {@code 1 / (context.getFreq(0, tag) * scale + bias)}.
 * NOTE(review): with {@code bias == 0} a zero frequency produces an infinite weight.
 *
 * @return {@code slot + 1}, the next free candidate slot
 */
private int putGuess(int wordIndex, int slot, int tag, int scale, int bias) {
    m_nTags[wordIndex][slot] = tag;
    m_dFrequency[wordIndex][slot] = (double) 1 / (double) (context.getFreq(0, tag) * scale + bias);
    return slot + 1;
}

/**
 * Stores {@code count} consecutive tags starting at {@code firstTag}, in ascending order, all
 * with the same {@code scale} and {@code bias}.
 *
 * @return the next free candidate slot
 */
private int putGuessRun(int wordIndex, int slot, int firstTag, int count, int scale, int bias) {
    int next = slot;
    for (int tag = firstTag; tag < firstTag + count; tag++) {
        next = putGuess(wordIndex, next, tag, scale, bias);
    }
    return next;
}

/** Returns true when Utility.charType classifies the word as CT_OTHER or CT_CHINESE. */
private boolean isOtherOrChineseType(String word) {
    final int charType = Utility.charType(word);
    return charType == Utility.CT_OTHER || charType == Utility.CT_CHINESE;
}
/**
 * Placeholder for counting foreign-name characters in a candidate person name.
 * The current implementation ignores its argument and always reports zero.
 *
 * @param personName candidate name text (unused)
 * @return always 0
 */
int GetForeignCharCount(String personName) {
    final int foreignCharCount = 0;
    return foreignCharCount;
}
/**
 * Walks the best tag sequence (from index 1 up to the first negative tag) looking for runs
 * that start with tag 1 or tag 2, and appends each run to {@code m_nUnknownWords} /
 * {@code m_dWordsPossibility}. A run triggered by tag 1 is: tag-1 words, then tag-2 words,
 * then tag-3 words. A run triggered by tag 2 is: tag-2 words, then tag-3 words.
 *
 * @param coreDict  not used by this method
 * @param placeDict dictionary passed to {@code ComputePossibility} for scoring
 * @return always {@code true}
 */
public boolean PlaceRecognize(Dictionary coreDict, Dictionary placeDict) {
    int runStart = 1;
    int runEnd = 1;
    int cursor = 1;
    // NOTE(review): the penalty is initialised once and never reset, so increments made for
    // one candidate carry over into every later candidate in the same sequence.
    double penalty = 1.0;

    while (m_nBestTag[cursor] > -1) {
        final boolean startsWithTag1 = m_nBestTag[cursor] == 1;
        final boolean startsWithTag2 = m_nBestTag[cursor] == 2;

        if (startsWithTag1 || startsWithTag2) {
            if (startsWithTag2) {
                // A run that opens directly with tag 2 is penalised once up front.
                penalty += 1.0;
            }
            runStart = cursor;
            runEnd = runStart + 1;

            if (startsWithTag1) {
                // Further tag-1 words: each one after the first continuation adds a penalty.
                for (; m_nBestTag[runEnd] == 1; runEnd++) {
                    if (runEnd > runStart + 1) {
                        penalty += 1.0;
                    }
                }
            }

            // Any number of tag-2 words, with no penalty.
            while (m_nBestTag[runEnd] == 2) {
                runEnd++;
            }

            // Tag-3 words: the first is free, each additional one adds a penalty.
            final int firstTagThree = runEnd;
            for (; m_nBestTag[runEnd] == 3; runEnd++) {
                if (runEnd > firstTagThree) {
                    penalty += 1.0;
                }
            }
        }

        if (runEnd > runStart) {
            m_nUnknownWords[m_nUnknownIndex][0] = m_nWordPosition[runStart];
            m_nUnknownWords[m_nUnknownIndex][1] = m_nWordPosition[runEnd];
            // NOTE(review): the length passed here is runEnd - runStart + 1, so the word at
            // runEnd (the first word after the run) is included in the score.
            m_dWordsPossibility[m_nUnknownIndex++] = ComputePossibility(runStart, runEnd - runStart + 1, placeDict)
                    + Math.log(penalty);
            // Collapse the run so the same span is not recorded again on later iterations.
            runStart = runEnd;
        }

        // Jump past a consumed run, otherwise advance by a single word.
        cursor = (cursor < runEnd) ? runEnd : cursor + 1;
    }
    return true;
}
/**
 * Scores a span of words under their best tags. For every word in
 * {@code [startPos, startPos + length)} it adds
 * {@code log(context.getFreq(0, tag) + 1) - log(dict.getFreq(word, tag) + 1)}.
 *
 * @param startPos index of the first word in the span
 * @param length   number of words in the span
 * @param dict     dictionary queried for each word/tag frequency
 * @return the sum of the per-word terms (0 when {@code length} is not positive)
 */
private double ComputePossibility(int startPos, int length, Dictionary dict) {
    final int endPos = startPos + length;
    double total = 0;
    int pos = startPos;
    while (pos < endPos) {
        // Frequency of this word carrying its best tag.
        final int wordTagFreq = dict.getFreq(m_sWords[pos], m_nBestTag[pos]);
        final double tagTerm = Math.log((double) (context.getFreq(0, m_nBestTag[pos]) + 1));
        final double wordTerm = Math.log((double) (wordTagFreq + 1));
        total += tagTerm - wordTerm;
        pos++;
    }
    return total;
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -