📄 csegment.java
字号:
if (trc.nPOS < 0)// Unknown words: P(Wi|Ci);while known
// words:1
dValue += trc.value;
// Get the position index of current word in the position
// map table
nCurWordIndex = Utility.BinarySearch(trc.row
* Final.MAX_SENTENCE_LEN + trc.col,
m_npWordPosMapTable, m_nWordCount);
nNextWordIndex = Utility.BinarySearch(trc2.row
* Final.MAX_SENTENCE_LEN + trc2.col,
m_npWordPosMapTable, m_nWordCount);
aBinaryWordNet.SetElement(nCurWordIndex, nNextWordIndex,
dValue, trc.nPOS, null);
}
}
}
return true;
}
/**
 * Judges whether a number string can be the numeric part of a year
 * expression (for example the digits in front of a trailing year
 * character).
 *
 * <p>All length checks below are made on {@code sNum.length}, i.e. on the
 * byte count of the array. NOTE(review): the "length / 2" arithmetic
 * suggests two bytes per Chinese character (a GB2312-style encoding) —
 * confirm against the callers and against {@code Utility}.
 *
 * @param sNum the candidate number as raw bytes
 * @return {@code true} when one of the year patterns below matches,
 *         {@code false} otherwise
 */
protected boolean IsYearTime(byte[] sNum) {
    final int byteCount = sNum.length;

    // Copy of the first two bytes of sNum followed by a zero byte; only
    // consulted by the four-byte numeric rule below.
    final byte[] leadingPair = new byte[3];
    Utility.strncpy(leadingPair, sNum, 2);
    leadingPair[2] = 0;

    // Single-byte digits: 1992年, 90年
    if (Utility.IsAllSingleByte(sNum)) {
        if (byteCount == 4) {
            return true;
        }
        if (byteCount == 2 && sNum[0] > '4') {
            return true;
        }
    }

    // All-number strings: six bytes or more, or four bytes whose leading
    // pair is found in the digit set 5-9.
    if (Utility.IsAllNum(sNum)) {
        if (byteCount >= 6) {
            return true;
        }
        if (byteCount == 4
                && Utility.CC_Find("56789".getBytes(), leadingPair)) {
            return true;
        }
    }

    // Every character is a Chinese digit (count equals half the byte
    // length) and the string is at least three bytes long.
    final int chineseDigitCount = Utility.GetCharCount(
            "零○一二三四五六七八九壹贰叁肆伍陆柒捌玖".getBytes(), sNum);
    if (chineseDigitCount == byteCount / 2 && byteCount >= 3) {
        return true;
    }

    // 二仟零二年
    if (byteCount == 8) {
        if (Utility.GetCharCount("千仟零○".getBytes(), sNum) == 2) {
            return true;
        }
    }

    // A two-byte string containing exactly one "thousand" character.
    if (byteCount == 2) {
        if (Utility.GetCharCount("千仟".getBytes(), sNum) == 1) {
            return true;
        }
    }

    // Four bytes: exactly one character from the first list in the whole
    // string and exactly one character from the second list in the part
    // starting at byte offset 2.
    if (byteCount == 4
            && Utility.GetCharCount("甲乙丙丁戊己庚辛壬癸".getBytes(), sNum) == 1) {
        final byte[] branchSet = "子丑寅卯辰巳午未申酉戌亥".getBytes();
        final byte[] secondHalf = GFCommon.bytesCopy(sNum, 2, sNum.length - 2);
        return Utility.GetCharCount(branchSet, secondHalf) == 1;
    }

    return false;
}
/**
 * Generates the word list for one segmentation route.
 *
 * <p>Walks consecutive vertex pairs of {@code nSegRoute[nIndex]} (the route
 * is terminated by {@code -1}), concatenates the atoms between each pair
 * into a word, merges runs of consecutive number words into one number,
 * decides a POS handle for number/time words, and stores every resulting
 * word both in {@code m_pWordSeg[nIndex]} and in {@code m_graphOptimum}.
 * The entry after the last word is marked with an empty word and handle
 * {@code -1}.
 *
 * <p>POS handles used here are a POS letter multiplied by 256:
 * 30464 = 'w'*256, 27904 = 'm'*256, 29696 = 't'*256. Negative values of
 * the 'm' and 't' codes are assigned to generated number/time words.
 *
 * @param nSegRoute table of segmentation routes; each row is a list of
 *                  vertex indices terminated by -1
 * @param nIndex    index of the route (row) to convert into words
 * @return always {@code true}
 */
protected boolean GenerateWord(int[][] nSegRoute, int nIndex) {
    int i = 0, k = 0;
    int j, nStartVertex, nEndVertex, nPOS;
    byte[] sAtom = new byte[Final.WORD_MAXLENGTH];
    byte[] sNumCandidate = new byte[100];
    byte[] sCurWord = new byte[100];
    double fValue = 0;
    while (nSegRoute[nIndex][i] != -1 && nSegRoute[nIndex][i + 1] != -1
            && nSegRoute[nIndex][i] < nSegRoute[nIndex][i + 1]) {
        nStartVertex = nSegRoute[nIndex][i];
        j = nStartVertex; // Set the start vertex
        nEndVertex = nSegRoute[nIndex][i + 1]; // Set the end vertex
        nPOS = 0;
        // NOTE(review): fValue and nPOS are Java primitives and are passed
        // by value, so this call cannot update them; they stay 0 here unless
        // assigned further down. They look like intended out-parameters —
        // confirm against GetElement's signature.
        m_graphSeg.m_segGraph.GetElement(nStartVertex, nEndVertex, fValue,
                nPOS, null);
        sAtom[0] = 0;
        int index = 0;
        // Generate the word according to the segmentation route: append
        // every atom in [nStartVertex, nEndVertex) to sAtom.
        while (j < nEndVertex) {
            GFCommon.bytesCopy(sAtom, m_graphSeg.m_sAtom[j], index,
                    m_graphSeg.m_sAtom[j].length);
            index += m_graphSeg.m_sAtom[j].length;
            j++;
        }
        m_pWordSeg[nIndex][k].sWord[0] = 0; // Init the result ending
        GFCommon.bytesCopy(sNumCandidate, sAtom, 0, sAtom.length);
        // Merge all separate consecutive numbers into one number.
        // The sAtom[0] != 0 guard stops the loop once no further word was
        // built.
        while (sAtom[0] != 0
                && (Utility.IsAllNum(sNumCandidate) || Utility
                        .IsAllChineseNum(sNumCandidate))) {
            // Save the number candidate in the result segmentation
            GFCommon.bytesCopy(m_pWordSeg[nIndex][k].sWord, sNumCandidate,
                    0, sNumCandidate.length);
            i++; // Skip to next atom now
            sAtom[0] = 0;
            index = 0;
            // Build the next word of the route into sAtom
            while (j < nSegRoute[nIndex][i + 1]) {
                GFCommon.bytesCopy(sAtom, m_graphSeg.m_sAtom[j], index,
                        m_graphSeg.m_sAtom[j].length);
                index += m_graphSeg.m_sAtom[j].length;
                j++;
            }
            GFCommon.bytesCopy(sNumCandidate, sAtom, index, sAtom.length);
        }
        // NOTE(review): nLen is the array length of sWord, not the length
        // of the zero-terminated content — confirm this is what is wanted.
        int nLen = m_pWordSeg[nIndex][k].sWord.length;
        if (nLen == 4
                && Utility.CC_Find("第上成±—+∶·./".getBytes(),
                        m_pWordSeg[nIndex][k].sWord)
                || nLen == 1
                && Utility.strchr("+-./".getBytes(),
                        m_pWordSeg[nIndex][k].sWord[0]) != -1) {
            // Only one word
            GFCommon.bytesCopy(sCurWord, m_pWordSeg[nIndex][k].sWord, 0,
                    m_pWordSeg[nIndex][k].sWord.length); // Record current word
            i--;
        } else if (m_pWordSeg[nIndex][k].sWord[0] == 0) {
            // Never entered the number-merging loop above
            GFCommon.bytesCopy(m_pWordSeg[nIndex][k].sWord, sAtom, 0,
                    sAtom.length);
            // Save them in the result segmentation
            GFCommon.bytesCopy(sCurWord, sAtom, 0, sAtom.length); // Record current word
        } else { // It is a num
            // The delimiter "--". The word is a byte[], so it is compared
            // with Utility.strcmp (as elsewhere in this method);
            // String.equals(byte[]) is always false and would never match.
            if (Utility.strcmp(m_pWordSeg[nIndex][k].sWord, "--".getBytes())
                    || Utility.strcmp(m_pWordSeg[nIndex][k].sWord, "—".getBytes())
                    || m_pWordSeg[nIndex][k].sWord[0] == '-'
                    && m_pWordSeg[nIndex][k].sWord[1] == 0) {
                nPOS = 30464; // 'w'*256; Set the POS with 'w'
                i--; // Not num, back to previous word
            } else { // Adding time suffix
                // Get the first char of the word: one byte, or two bytes
                // when the first byte is negative.
                byte[] sInitChar = new byte[3];
                int nCharIndex = 0;
                sInitChar[nCharIndex] = m_pWordSeg[nIndex][k].sWord[nCharIndex];
                if (sInitChar[nCharIndex] < 0) {
                    nCharIndex += 1;
                    sInitChar[nCharIndex] = m_pWordSeg[nIndex][k].sWord[nCharIndex];
                }
                nCharIndex += 1;
                sInitChar[nCharIndex] = '\0';
                // Example: 3-4月. The previous word is a number or time word
                // (27904 = 'm'*256, 29696 = 't'*256) and this word starts
                // with a dash: split the dash off as its own 'w' word.
                if (k > 0
                        && (Math.abs(m_pWordSeg[nIndex][k - 1].nHandle) == 27904 || Math
                                .abs(m_pWordSeg[nIndex][k - 1].nHandle) == 29696)
                        && (Utility.strcmp(sInitChar, "—".getBytes()) || sInitChar[0] == '-')
                        && (m_pWordSeg[nIndex][k].sWord.length > nCharIndex)) {
                    // Split the sInitChar from the original word
                    byte[] bt = GFCommon
                            .bytesCopy(m_pWordSeg[nIndex][k].sWord,
                                    nCharIndex,
                                    m_pWordSeg[nIndex][k].sWord.length
                                            - nCharIndex);
                    GFCommon.bytesCopy(m_pWordSeg[nIndex][k + 1].sWord, bt,
                            0, bt.length);
                    m_pWordSeg[nIndex][k + 1].dValue = m_pWordSeg[nIndex][k].dValue;
                    m_pWordSeg[nIndex][k + 1].nHandle = 27904;
                    m_pWordSeg[nIndex][k].sWord[nCharIndex] = 0;
                    m_pWordSeg[nIndex][k].dValue = 0;
                    m_pWordSeg[nIndex][k].nHandle = 30464; // 'w'*256
                    m_graphOptimum.SetElement(nStartVertex,
                            nStartVertex + 1, m_pWordSeg[nIndex][k].dValue,
                            m_pWordSeg[nIndex][k].nHandle,
                            m_pWordSeg[nIndex][k].sWord);
                    nStartVertex += 1;
                    k += 1;
                }
                nLen = m_pWordSeg[nIndex][k].sWord.length;
                // NOTE(review): sAtom is allocated with Final.WORD_MAXLENGTH
                // bytes, so sAtom.length == 2 only holds if that constant is
                // 2; a content-length check was probably intended — confirm.
                if (sAtom.length == 2
                        && Utility.CC_Find("月日时分秒".getBytes(), sAtom)
                        || "月份".equals(GFString.getChineseString(sAtom,
                                "gb2312"))) {
                    // Number followed by a time unit: append the unit
                    Utility.strcat(m_pWordSeg[nIndex][k].sWord, sAtom);
                    Utility.strcpy(sCurWord, "未##时".getBytes());
                    nPOS = -29696; // 't'*256; Set the POS with 't'
                } else if (Utility.strcmp(sAtom, "年".getBytes())) {
                    if (IsYearTime(m_pWordSeg[nIndex][k].sWord)) {
                        // 1998年
                        Utility.strcat(m_pWordSeg[nIndex][k].sWord, sAtom);
                        Utility.strcpy(sCurWord, "未##时".getBytes());
                        nPOS = -29696; // Set the POS with 't'
                    } else {
                        Utility.strcpy(sCurWord, "未##数".getBytes());
                        nPOS = -27904; // Set the POS with 'm'
                        i--; // Can not be a time word
                    }
                } else {
                    // 早晨/t 五点/t
                    // Last two bytes of the word
                    byte[] bt = GFCommon.bytesCopy(
                            m_pWordSeg[nIndex][k].sWord,
                            m_pWordSeg[nIndex][k].sWord.length - 2,
                            2);
                    if (Utility.strcmp(bt, "点".getBytes())) {
                        Utility.strcpy(sCurWord, "未##时".getBytes());
                        nPOS = -29696; // Set the POS with 't'
                    } else {
                        bt = GFCommon.bytesCopy(
                                m_pWordSeg[nIndex][k].sWord, nLen - 2,
                                m_pWordSeg[nIndex][k].sWord.length - nLen
                                        + 2);
                        if (!Utility.CC_Find("∶·./".getBytes(), bt)
                                && m_pWordSeg[nIndex][k].sWord[nLen - 1] != '.'
                                && m_pWordSeg[nIndex][k].sWord[nLen - 1] != '/') {
                            Utility.strcpy(sCurWord, "未##数".getBytes());
                            nPOS = -27904; // 'm'*256; Set the POS with 'm'
                        } else if (nLen > sInitChar.length) {
                            // Get rid of the trailing delimiter, example: 1.
                            // NOTE(review): sInitChar.length is always 3 (the
                            // array size), not the first character's length.
                            if (m_pWordSeg[nIndex][k].sWord[nLen - 1] == '.'
                                    || m_pWordSeg[nIndex][k].sWord[nLen - 1] == '/')
                                m_pWordSeg[nIndex][k].sWord[nLen - 1] = 0;
                            else
                                m_pWordSeg[nIndex][k].sWord[nLen - 2] = 0;
                            Utility.strcpy(sCurWord, "未##数".getBytes());
                            nPOS = -27904; // 'm'*256; Set the POS with 'm'
                            i--;
                        }
                    }
                    i--; // Not num, back to previous word
                }
            }
            fValue = 0;
            nEndVertex = nSegRoute[nIndex][i + 1]; // Ending POS changed to latter
        }
        m_pWordSeg[nIndex][k].nHandle = nPOS; // Get the POS of current word
        m_pWordSeg[nIndex][k].dValue = fValue; // Return the frequency of current word
        // Generate optimum segmentation graph according to the segmentation
        // result
        m_graphOptimum.SetElement(nStartVertex, nEndVertex, fValue, nPOS,
                sCurWord);
        i++; // Skip to next atom
        k++; // Accept next word
    }
    m_pWordSeg[nIndex][k].sWord[0] = 0;
    m_pWordSeg[nIndex][k].nHandle = -1; // Set ending
    return true;
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -