cresult.java
来自「基于中科院的ICTCLAS实现中文分词系统 开发工具是JAVA.经测试,效果很好」· Java 代码 · 共 778 行 · 第 1/2 页
JAVA
778 行
+ GFString.getChineseString(pItem[i].sWord, "gb2312")
+ "</src>";
if (sPOS[0] != 0) {
result += "</any>";
}
}
i++;
}
return true;
}
/**
 * Splits a GB2312-encoded Chinese personal name into surname, optional
 * second surname and given name, and decides whether the split is plausible.
 *
 * The output buffers are caller-supplied and are written as zero-terminated
 * byte strings. A two-character (4-byte) surname is tried first, then a
 * one-character (2-byte) surname; if neither is in the dictionary the
 * surname is left empty.
 *
 * @param sPersonName the name bytes; the array length is taken as the name length
 * @param sSurname    receives the surname (empty if none was found)
 * @param sSurname2   receives a second surname for 8-byte names whose given part
 *                    itself starts with a known surname, otherwise empty
 * @param sGivenName  receives the given name
 * @param personDict  dictionary queried for surname / given-name entries
 * @return {@code true} if the name should be split into the parts written to
 *         the output buffers, {@code false} if it should be left untouched
 */
protected boolean ChineseNameSplit(byte[] sPersonName, byte[] sSurname,
        byte[] sSurname2, byte[] sGivenName, CDictionary personDict) {
    // The dictionary data is GB2312 (see the "gb2312" decoding used elsewhere
    // in this file), so literals must not depend on the platform charset.
    final java.nio.charset.Charset gb2312 = java.nio.charset.Charset.forName("gb2312");
    int nSurNameLen = 4;
    final int nLen = sPersonName.length;
    byte[] sTemp = new byte[3];

    if (nLen < 3 || nLen > 8) {// Not a traditional Chinese person name
        return false;
    }
    if (nLen % 2 != 0) {
        // The name is scanned as two-byte characters; an odd length would
        // read past the end of the array below.
        return false;
    }
    for (int i = 0; i < nLen; i += 2) {// Must not include non-Chinese chars
        int nCharType = Utility.charType(sPersonName[i], sPersonName[i + 1]);
        if (nCharType != Final.CT_CHINESE && nCharType != Final.CT_OTHER) {
            return false;
        }
    }

    sSurname2[0] = 0;// init
    // Try a 4-byte surname first, then shorten it in place to 2 and 0 bytes.
    GFCommon.bytesCopy(sSurname, sPersonName, 0, nSurNameLen);
    sSurname[nSurNameLen] = 0;
    if (!personDict.IsExist(sSurname, 1)) {
        nSurNameLen = 2;
        sSurname[nSurNameLen] = 0;
        if (!personDict.IsExist(sSurname, 1)) {
            nSurNameLen = 0;
            sSurname[nSurNameLen] = 0;
        }
    }

    // Everything after the surname is the given name.
    byte[] bt = GFCommon.bytesCopy(sPersonName, nSurNameLen, nLen - nSurNameLen);
    GFCommon.bytesCopy(sGivenName, bt, 0, bt.length);
    int nGivenLen = bt.length;

    if (nLen > 6) {
        GFCommon.bytesCopy(sTemp, bt, 0, 2);
        sTemp[2] = 0;// Get the second possible surname
        if (personDict.IsExist(sTemp, 1)) {
            // Hongkong women's name: Surname+surname+given name
            GFCommon.bytesCopy(sSurname2, sTemp, 0, sTemp.length);
            bt = GFCommon.bytesCopy(sPersonName, nSurNameLen + 2,
                    nLen - nSurNameLen - 2);
            GFCommon.bytesCopy(sGivenName, bt, 0, bt.length);
            nGivenLen = bt.length;
        }
    }
    // Terminate the given name so stale bytes from an earlier call on the
    // same buffer are not mistaken for part of this name.
    if (nGivenLen < sGivenName.length) {
        sGivenName[nGivenLen] = 0;
    }

    int nFreq = personDict.GetFrequency(sSurname, 1);
    GFCommon.bytesCopy(sTemp, sGivenName, 0, 2);
    sTemp[2] = 0;
    int nFreqGiven = personDict.GetFrequency(sTemp, 2);

    // Reject implausible splits. The given-name length test uses the actual
    // name length, not the capacity of the caller's buffer.
    if (nSurNameLen != 4
            && ((nSurNameLen == 0 && nLen > 4)
                    || nGivenLen > 4
                    || (Utility.GetForeignCharCount(sPersonName) >= 3
                            && nFreq < personDict.GetFrequency("张".getBytes(gb2312), 1) / 40
                            && nFreqGiven < personDict.GetFrequency("华".getBytes(gb2312), 2) / 20)
                    || (nFreq < 10
                            && Utility.GetForeignCharCount(sGivenName) == (nLen - nSurNameLen) / 2))) {
        return false;
    }
    // Single Surname+given name
    if (nLen == 4 && m_uPerson.IsGivenName(sPersonName)) {
        return false;
    }
    return true;
}
/**
 * Translates a PKU part-of-speech handle into the corresponding 973-style
 * tag and writes it to {@code sPOS973} as a zero-terminated byte string.
 *
 * A handle packs the tag letters as {@code first * 256 + second}
 * (e.g. 24932 = 'a' * 256 + 'd' for "ad"). Handles that are not in the
 * table produce the tag "@".
 *
 * @param nHandle the PKU tag handle to translate
 * @param sPOS973 caller-supplied buffer that receives the tag bytes
 * @return always {@code true}
 */
protected boolean PKU2973POS(int nHandle, byte[] sPOS973) {
    // Sorted ascending so it can be binary-searched; entry k corresponds to
    // sPOSRelated[k].
    int[] nHandleSet = { 24832, 24932, 24935, 24942, 25088, 25344, 25600,
            25703, 25856, 26112, 26368, 26624, 26880, 27136, 27392, 27648,
            27904, 28160, 28263, 28274, 28275, 28276, 28280, 28282, 28416,
            28672, 28928, 29184, 29440, 29696, 29799, 29952, 30052, 30055,
            30058, 30060, 30070, 30074, 30208, 30308, 30311, 30318, 30464,
            30720, 30976, 31232 };
    // PKU tags, in the same order as nHandleSet:
    // "a", "ad","ag","an","b", "c", "d", "dg","e", "f","g", "h", "i", "j",
    // "k", "l", "m", "n", "ng","nr","ns","nt","nx","nz","o", "p", "q", "r",
    // "s", "t", "tg","u", "ud","ug","uj","ul","uv","uz","v",
    // "vd","vg","vn","w", "x", "y", "z"
    String[] sPOSRelated = { "a", "ad", "ga", "an", "f", "c", "d", "d",
            "e", "nd", "g", "h", "i", "j", "k", "l", "m", "n", "gn", "nh",
            "ns", "ni", "ws", "nz", "o", "p", "q", "r", "nl", "nt", "gt",
            "u", "ud", "ug", "uj", "ul", "uv", "uz", "v", "vd", "gv", "vn",
            "w", "x", "u", "a" };
    /*
     * "Bg","gf", "Rg","gr", "Mg","gm", "Yg","u", "Ug","u", "Qg","q",
     */
    // Search the whole table; its size is taken from the array itself so the
    // two cannot drift apart.
    int nIndex = Utility.BinarySearch(nHandle, nHandleSet, nHandleSet.length);
    int nTagLen;
    if (nIndex == -1) {
        sPOS973[0] = "@".getBytes()[0];
        nTagLen = 1;
    } else {
        nTagLen = sPOSRelated[nIndex].length();
        GFCommon.bytesCopy(sPOS973, sPOSRelated[nIndex].getBytes(), 0, nTagLen);
    }
    // Terminate the tag so a shorter tag written into a reused buffer does not
    // keep the tail of the previous one.
    // NOTE(review): assumes callers read sPOS973 as a zero-terminated string - confirm.
    if (nTagLen < sPOS973.length) {
        sPOS973[nTagLen] = 0;
    }
    return true;
}
/**
 * Post-processes one tagged segmentation: splits person names, merges
 * reduplicated words and certain suffix patterns, and copies everything
 * else unchanged.
 *
 * Both arrays are terminated by an item whose {@code sWord[0]} is 0.
 * Tag handles pack the tag letters as {@code first * 256 + second}:
 * 24832 = a, 27904 = m, 28160 = n, 28274 = nr, 28275 = ns, 28276 = nt,
 * 28280 = nx, 28282 = nz, 30208 = v.
 *
 * Note that the handle of a "u" sub-tag item in {@code pItem} is rewritten
 * in place to plain "u".
 *
 * @param pItem    the tagged input words (sentinel-terminated)
 * @param pItemRet receives the adjusted words (sentinel-terminated)
 * @return always {@code true}
 */
protected boolean Adjust(TagWordResult[] pItem, TagWordResult[] pItemRet) {
    // Word bytes are GB2312 (see the "gb2312" decoding below), so literals
    // compared against them must not depend on the platform charset.
    final java.nio.charset.Charset gb2312 = java.nio.charset.Charset.forName("gb2312");
    int i = 0, j = 0;
    int nLen;
    byte[] sSurName = new byte[10];
    byte[] sSurName2 = new byte[10];
    byte[] sGivenName = new byte[10];
    boolean bProcessed;// Have been processed
    while (pItem[i].sWord[0] != 0) {
        nLen = pItem[i].sWord.length;
        bProcessed = false;
        // Rule1: adjust person name
        if (pItem[i].nHandle == 28274
                && ChineseNameSplit(pItem[i].sWord, sSurName, sSurName2,
                        sGivenName, m_uPerson.m_dict)
                && !"叶利钦".equals(GFString.getChineseString(pItem[i].sWord,
                        "gb2312")))// 'nr'
        {// Divide name into surname and given name
            if (sSurName[0] != 0) {
                appendWordBytes(pItemRet[j].sWord, 0, sSurName);
                pItemRet[j++].nHandle = 28274;
            }
            if (sSurName2[0] != 0) {
                appendWordBytes(pItemRet[j].sWord, 0, sSurName2);
                pItemRet[j++].nHandle = 28274;
            }
            if (sGivenName[0] != 0) {
                appendWordBytes(pItemRet[j].sWord, 0, sGivenName);
                pItemRet[j++].nHandle = 28274;
            }
            bProcessed = true;
        }
        // Rule2 for overlap words ABB 一段段、一片片
        else if (pItem[i].nHandle == 27904
                && pItem[i + 1].sWord.length == 2
                && Utility.strcmp(pItem[i + 1].sWord, pItem[i + 2].sWord)) {
            int index = appendWordBytes(pItemRet[j].sWord, 0, pItem[i].sWord);
            index = appendWordBytes(pItemRet[j].sWord, index, pItem[i + 1].sWord);
            appendWordBytes(pItemRet[j].sWord, index, pItem[i + 2].sWord);
            pItemRet[j].nHandle = 27904;
            j += 1;
            i += 2;
            bProcessed = true;
        }
        // Rule3 for overlap words AA
        else if (nLen == 2
                && Utility.strcmp(pItem[i].sWord, pItem[i + 1].sWord)) {
            int index = appendWordBytes(pItemRet[j].sWord, 0, pItem[i].sWord);
            // Advance past the second A as well, so that a following B is
            // appended after "AA" instead of overwriting the second A.
            index = appendWordBytes(pItemRet[j].sWord, index, pItem[i + 1].sWord);
            // 24832=='a'*256
            pItemRet[j].nHandle = 24832;// a
            if (pItem[i].nHandle / 256 == 'v'
                    || pItem[i + 1].nHandle / 256 == 'v')// 30208='v'*256
            {
                pItemRet[j].nHandle = 30208;
            }
            if (pItem[i].nHandle / 256 == 'n'
                    || pItem[i + 1].nHandle / 256 == 'n') {
                pItemRet[j].nHandle = 'n' * 256;
            }
            i += 1;
            if (pItem[i + 1].sWord.length == 2) {// AAB:洗/洗/脸、蒙蒙亮
                if ((pItemRet[j].nHandle == 30208 && pItem[i + 1].nHandle / 256 == 'n')
                        || (pItemRet[j].nHandle == 24832 && pItem[i + 1].nHandle / 256 == 'a')) {
                    appendWordBytes(pItemRet[j].sWord, index, pItem[i + 1].sWord);
                    i += 1;
                }
            }
            j += 1;
            bProcessed = true;
        }
        // Rule 4: AAB 洗/洗澡
        else if (nLen == 2
                && Utility.strncmp(pItem[i].sWord, 0, pItem[i + 1].sWord, 2)
                && pItem[i + 1].sWord.length == 4
                && (pItem[i].nHandle / 256 == 'v' || pItem[i].nHandle == 24832))// v,a
        {
            int index = appendWordBytes(pItemRet[j].sWord, 0, pItem[i].sWord);
            appendWordBytes(pItemRet[j].sWord, index, pItem[i + 1].sWord);
            // 24832=='a'*256
            pItemRet[j].nHandle = 24832;// 'a'
            if (pItem[i].nHandle / 256 == 'v'
                    || pItem[i + 1].nHandle / 256 == 'v')// 30208='v'*256
            {
                pItemRet[j].nHandle = 30208;
            }
            i += 1;
            j += 1;
            bProcessed = true;
        } else if (pItem[i].nHandle / 256 == 'u'
                && pItem[i].nHandle % 256 != 0) {// uj,ud,uv,uz,ul,ug->u
            // Only sub-tags (non-zero low byte) need collapsing to plain 'u';
            // the word itself is then copied by the default path below.
            pItem[i].nHandle = 'u' * 256;
        } else if (nLen == 2
                && Utility.strncmp(pItem[i].sWord, 0, pItem[i + 1].sWord, 2)
                && pItem[i + 1].sWord.length == 4
                && Utility.strncmp(pItem[i + 1].sWord, 2, pItem[i + 2].sWord, 2)) {// AABB 朴朴素素 枝枝叶叶
            int index = appendWordBytes(pItemRet[j].sWord, 0, pItem[i].sWord);
            index = appendWordBytes(pItemRet[j].sWord, index, pItem[i + 1].sWord);
            appendWordBytes(pItemRet[j].sWord, index, pItem[i + 2].sWord);
            pItemRet[j].nHandle = pItem[i + 1].nHandle;
            i += 2;
            j += 1;
            bProcessed = true;
        } else if (pItem[i].nHandle == 28275)// PostFix
        {
            // A place name (ns) followed by a suffix is merged into one word;
            // the suffix decides the tag of the merged word.
            int nMergedHandle = -1;
            boolean bMerge = false;
            if (m_uPlace.m_dict.IsExist(pItem[i + 1].sWord, 4)) {
                nMergedHandle = 28275;// ns
                bMerge = true;
            } else if (pItem[i + 1].sWord.length == 2
                    && Utility.CC_Find("队".getBytes(gb2312), pItem[i + 1].sWord)) {
                nMergedHandle = 28276;// nt
                bMerge = true;
            } else if (pItem[i + 1].sWord.length == 2
                    && Utility.CC_Find("语文字杯".getBytes(gb2312),
                            pItem[i + 1].sWord)) {
                nMergedHandle = 28282;// nz
                bMerge = true;
            } else if (pItem[i + 1].sWord.length == 2
                    && Utility.CC_Find("裔".getBytes(gb2312), pItem[i + 1].sWord)) {
                nMergedHandle = 28160;// n
                bMerge = true;
            }
            if (bMerge) {
                int index = appendWordBytes(pItemRet[j].sWord, 0, pItem[i].sWord);
                appendWordBytes(pItemRet[j].sWord, index, pItem[i + 1].sWord);
                pItemRet[j].nHandle = nMergedHandle;
                i += 1;
                j += 1;
                bProcessed = true;
            }
        } else if (pItem[i].nHandle == 30208 || pItem[i].nHandle == 28160)// v or n
        {
            if (pItem[i + 1].sWord.length == 2
                    && Utility.CC_Find("员".getBytes(gb2312), pItem[i + 1].sWord)) {
                int index = appendWordBytes(pItemRet[j].sWord, 0, pItem[i].sWord);
                appendWordBytes(pItemRet[j].sWord, index, pItem[i + 1].sWord);
                pItemRet[j].nHandle = 28160;
                i += 1;
                j += 1;
                bProcessed = true;
            }
        } else if (pItem[i].nHandle == 28280) {// www/nx ./w sina/nx;
            // EIM/nx -601/m
            int index = appendWordBytes(pItemRet[j].sWord, 0, pItem[i].sWord);
            pItemRet[j].nHandle = 28280;
            // Absorb the following run of nx words, dots and all-digit numbers.
            // The end-of-list sentinel is never absorbed, and every piece is
            // appended at the running end of the merged word.
            // NOTE(review): the dot literal is kept as found; it may originally
            // have contained a full-width dot as well - confirm.
            while (pItem[i + 1].sWord[0] != 0
                    && (pItem[i + 1].nHandle == 28280
                            || Utility.strstr("..".getBytes(), pItem[i + 1].sWord) != -1
                            || (pItem[i + 1].nHandle == 27904 && Utility
                                    .IsAllNum(pItem[i + 1].sWord)))) {
                index = appendWordBytes(pItemRet[j].sWord, index, pItem[i + 1].sWord);
                i += 1;
            }
            j += 1;
            bProcessed = true;
        }
        if (!bProcessed) {// If not processed, no adjustment is needed;
            // just copy to the final result
            appendWordBytes(pItemRet[j].sWord, 0, pItem[i].sWord);
            pItemRet[j++].nHandle = pItem[i].nHandle;
        }
        i++;
    }
    pItemRet[j].sWord[0] = 0;// Set ending
    return true;
}

/** Copies all of {@code word} into {@code dest} at {@code offset} and returns the offset just past it. */
private static int appendWordBytes(byte[] dest, int offset, byte[] word) {
    GFCommon.bytesCopy(dest, word, offset, word.length);
    return offset + word.length;
}
/**
 * Scores one sentinel-terminated tagged segmentation.
 *
 * For every word the score accumulates the word's own term and, unless the
 * word is the last one, the log of the context count for (this tag, next
 * tag) minus the log of the frequency of this tag, each smoothed by +1.
 *
 * @param pItem the tagged words, terminated by an item whose sWord[0] is 0
 * @return the accumulated score
 */
protected double ComputePossibility(TagWordResult[] pItem) {
    double dScore = 0;
    for (int k = 0; pItem[k].sWord[0] != 0; k++) {
        // NOTE(review): the per-word term is the tag handle itself, although
        // it is described as logP(Wi|Ti) - confirm this is the intended field.
        dScore += pItem[k].nHandle;

        boolean bLastWord = pItem[k + 1].sWord[0] == 0;
        if (bLastWord) {
            continue;// No transition term after the final word
        }
        // Transition term logP(Ti|Ti-1), added and subtracted separately
        dScore += Math.log((double) (m_POSTagger.m_context
                .GetContextPossibility(0, pItem[k].nHandle,
                        pItem[k + 1].nHandle) + 1));
        dScore -= Math.log((double) (m_POSTagger.m_context
                .GetFrequency(0, pItem[k].nHandle) + 1));
    }
    return dScore;
}
/**
 * Ranks all candidate segmentations by their score, highest first, then
 * adjusts each candidate and stores it with its score in the result arrays
 * in that ranked order.
 *
 * @return always {@code true}
 */
protected boolean Sort() {
    // dScore[k] is the score at rank k; nOrder[k] is the candidate owning it.
    double[] dScore = new double[Final.MAX_SEGMENT_NUM];
    int[] nOrder = new int[Final.MAX_SEGMENT_NUM];

    // Score every candidate; ranks start out in candidate order.
    for (int k = 0; k < m_Seg.m_nSegmentCount; k++) {
        dScore[k] = ComputePossibility(m_Seg.m_pWordSeg[k]);
        nOrder[k] = k;
    }

    // Exchange sort into descending score order: whenever a later slot holds
    // a strictly larger score than the current slot, swap score and owner.
    for (int top = 0; top < m_Seg.m_nSegmentCount; top++) {
        for (int probe = top + 1; probe < m_Seg.m_nSegmentCount; probe++) {
            if (dScore[probe] > dScore[top]) {
                int nHeldOrder = nOrder[top];
                double dHeldScore = dScore[top];
                nOrder[top] = nOrder[probe];
                dScore[top] = dScore[probe];
                nOrder[probe] = nHeldOrder;
                dScore[probe] = dHeldScore;
            }
        }
    }

    // Emit the adjusted segmentation and its score for each rank.
    for (int rank = 0; rank < m_Seg.m_nSegmentCount; rank++) {
        Adjust(m_Seg.m_pWordSeg[nOrder[rank]], m_pResult[rank]);
        m_dResultPossibility[rank] = dScore[rank];
    }
    return true;
}
}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?