// csegment.java
package com.gftech.ictclas4j.segment;
import com.gftech.common.GFCommon;
import com.gftech.common.GFString;
import com.gftech.ictclas4j.utility.CDictionary;
import com.gftech.ictclas4j.utility.Final;
import com.gftech.ictclas4j.utility.TagWordResult;
import com.gftech.ictclas4j.utility.Utility;
public class CSegment {
// Position map of possible words: BiGraphGenerate stores, per word index,
// row * Final.MAX_SENTENCE_LEN + col of that word.
private int[] m_npWordPosMapTable;
// Number of possible words; BiGraphGenerate sets it from aWord.GetTail(...)
// and sizes m_npWordPosMapTable with it.
private int m_nWordCount;
// The segmentation result: Final.MAX_SEGMENT_NUM rows of Final.MAX_WORDS
// slots, allocated in the constructor.
public TagWordResult[][] m_pWordSeg;
// Number of segmentation routes; the segment methods generate words for
// routes 0 .. m_nSegmentCount - 1.
public int m_nSegmentCount;
// The optimized segmentation graph
public CDynamicArray m_graphOptimum;
// The segmentation graph
public CSegGraph m_graphSeg;
/**
 * Creates a segmenter with its result buffer and graph objects allocated.
 *
 * The graph fields are instantiated here because object fields default to
 * null; calling SetRowFirst on an unassigned m_graphOptimum would throw a
 * NullPointerException before construction could finish.
 */
public CSegment() {
    // Result buffer: Final.MAX_SEGMENT_NUM rows, each with room for
    // Final.MAX_WORDS entries (the entries themselves start out null).
    m_pWordSeg = new TagWordResult[Final.MAX_SEGMENT_NUM][];
    for (int i = 0; i < Final.MAX_SEGMENT_NUM; i++) {
        m_pWordSeg[i] = new TagWordResult[Final.MAX_WORDS];
    }
    m_nWordCount = 0;
    // NOTE(review): assumes CDynamicArray and CSegGraph expose accessible
    // no-arg constructors - confirm against their declarations.
    m_graphOptimum = new CDynamicArray();
    m_graphSeg = new CSegGraph();
    m_graphOptimum.SetRowFirst(false);
}
/**
 * Segments using the bi-word link net built from the optimum graph.
 *
 * Builds the bi-word net from m_graphOptimum with BiGraphGenerate, runs
 * CNShortPath over it, then converts each returned route to a uni-word path
 * and generates its words.
 *
 * @param nResultCount   result count handed to CNShortPath
 * @param dSmoothingPara the parameter of data smoothing
 * @param dictBinary     the binary dictionary
 * @param dictCore       the core dictionary
 * @return always true
 */
public boolean BiOptimumSegment(int nResultCount, double dSmoothingPara,
        CDictionary dictBinary, CDictionary dictCore) {
    // The segmentation routes. Every slot starts as -1, the value
    // BiPath2UniPath treats as end-of-route, so a route can never run on
    // into zero-initialised slots.
    int[][] nSegRoute = new int[Final.MAX_SEGMENT_NUM][];
    for (int i = 0; i < Final.MAX_SEGMENT_NUM; i++) {
        nSegRoute[i] = new int[Final.MAX_SENTENCE_LEN / 2];
        for (int j = 0; j < nSegRoute[i].length; j++) {
            nSegRoute[i][j] = -1;
        }
    }
    // One-element holder for the bi-word net. It must be allocated before
    // element 0 is handed to BiGraphGenerate; indexing a null array throws.
    // NOTE(review): assumes CDynamicArray has a no-arg constructor - confirm.
    CDynamicArray[] aBiwordsNet = { new CDynamicArray() };
    // Generate the biword link net
    BiGraphGenerate(m_graphOptimum, aBiwordsNet[0], dSmoothingPara,
            dictBinary, dictCore);
    CNShortPath sp = new CNShortPath(aBiwordsNet, nResultCount);
    sp.ShortPath();
    // NOTE(review): m_nSegmentCount is an int passed by value, so Output
    // cannot update the field through this argument - confirm how the
    // route count is meant to be reported back.
    sp.Output(nSegRoute, false, m_nSegmentCount);
    // NOTE(review): this assignment shares one object between both fields,
    // so SetEmpty() below presumably empties m_graphSeg.m_segGraph as
    // well - confirm that is intended.
    m_graphSeg.m_segGraph = m_graphOptimum;
    m_graphOptimum.SetEmpty();// Set graph optimum empty
    for (int i = 0; i < m_nSegmentCount; i++) {
        // Path convert to unipath
        BiPath2UniPath(nSegRoute[i]);
        // Generate word according to the segmentation route
        GenerateWord(nSegRoute, i);
    }
    return true;
}
/**
 * Segments a sentence using the bi-word link net.
 *
 * Generates the word net for the sentence into m_graphSeg, builds the
 * bi-word net from it with BiGraphGenerate, runs CNShortPath over that net,
 * then converts each returned route to a uni-word path and generates its
 * words.
 *
 * @param sSentence      the sentence bytes; must not be null
 * @param dSmoothingPara the parameter of data smoothing
 * @param dictCore       the core dictionary
 * @param dictBinary     the binary dictionary
 * @param nResultCount   result count handed to CNShortPath
 * @return always true
 */
public boolean BiSegment(byte[] sSentence, double dSmoothingPara,
        CDictionary dictCore, CDictionary dictBinary, int nResultCount) {
    // The segmentation routes, sized from the sentence length. Every slot
    // starts as -1, the value BiPath2UniPath treats as end-of-route, so a
    // route can never run on into zero-initialised slots.
    int nLen = sSentence.length + 10;
    int[][] nSegRoute = new int[Final.MAX_SEGMENT_NUM][];
    for (int i = 0; i < Final.MAX_SEGMENT_NUM; i++) {
        nSegRoute[i] = new int[nLen / 2];
        for (int j = 0; j < nSegRoute[i].length; j++) {
            nSegRoute[i][j] = -1;
        }
    }
    // Generate words array
    m_graphSeg.GenerateWordNet(sSentence, dictCore, true);
    // One-element holder for the bi-word net. It must be allocated before
    // element 0 is handed to BiGraphGenerate; indexing a null array throws.
    // NOTE(review): assumes CDynamicArray has a no-arg constructor - confirm.
    CDynamicArray[] aBiwordsNet = { new CDynamicArray() };
    // Generate the biword link net
    BiGraphGenerate(m_graphSeg.m_segGraph, aBiwordsNet[0], dSmoothingPara,
            dictBinary, dictCore);
    CNShortPath sp = new CNShortPath(aBiwordsNet, nResultCount);
    sp.ShortPath();
    // NOTE(review): m_nSegmentCount is an int passed by value, so Output
    // cannot update the field through this argument - confirm how the
    // route count is meant to be reported back.
    sp.Output(nSegRoute, false, m_nSegmentCount);
    m_graphOptimum.SetEmpty();// Set graph optimum empty
    for (int i = 0; i < m_nSegmentCount; i++) {
        // Path convert to unipath
        BiPath2UniPath(nSegRoute[i]);
        // Generate word according to the segmentation route
        GenerateWord(nSegRoute, i);
    }
    return true;
}
/**
 * Copies the last word of a result list into sWordRet.
 *
 * The list is read from index 0 up to the first entry whose sWord starts
 * with a 0 byte. The scan also stops at the end of the array or at an
 * unfilled (null or empty) entry, so a list without a terminating entry no
 * longer throws.
 *
 * @param pItem    the result entries to scan
 * @param sWordRet output buffer; its first byte is cleared before the scan
 * @return true when no word was found (sWordRet is still empty), false
 *         when a word was copied into sWordRet
 */
public boolean GetLastWord(TagWordResult[] pItem, byte[] sWordRet) {
    sWordRet[0] = 0;
    for (int n = 0; n < pItem.length; n++) {
        TagWordResult item = pItem[n];
        if (item == null || item.sWord == null || item.sWord.length == 0
                || item.sWord[0] == 0) {
            break; // terminator or unfilled slot: end of the list
        }
        // Each word overwrites the previous one, so the last one remains.
        Utility.strcpy(sWordRet, item.sWord);
    }
    return sWordRet[0] == 0;
}
/**
 * Counts the words in a result list.
 *
 * Entries are counted from index 0 up to the first entry whose sWord starts
 * with a 0 byte. The scan also stops at the end of the array or at an
 * unfilled (null or empty) entry, so a list without a terminating entry no
 * longer throws.
 *
 * @param pItem the result entries to scan
 * @return the number of leading entries that hold a word
 */
public int GetResultCount(TagWordResult[] pItem) {
    int nCount = 0;
    while (nCount < pItem.length) {
        TagWordResult item = pItem[nCount];
        if (item == null || item.sWord == null || item.sWord.length == 0
                || item.sWord[0] == 0) {
            break; // terminator or unfilled slot: end of the list
        }
        nCount++;
    }
    return nCount;
}
/**
 * Segments from the optimum graph: runs CNShortPath over m_graphOptimum and
 * generates the words of every returned route.
 *
 * @param nResultCount result count handed to CNShortPath
 * @return always true
 */
public boolean OptimumSegmet(int nResultCount) {
    // Route buffer: MAX_SEGMENT_NUM rows of MAX_SENTENCE_LEN / 2 zeroed slots.
    int[][] routes = new int[Final.MAX_SEGMENT_NUM][Final.MAX_SENTENCE_LEN / 2];

    CNShortPath shortPath = new CNShortPath(m_graphOptimum, nResultCount);
    shortPath.ShortPath();
    shortPath.Output(routes, false, m_nSegmentCount);

    m_graphSeg.m_segGraph = m_graphOptimum;
    m_graphOptimum.SetEmpty(); // leave the optimum graph empty

    // Turn each route into words.
    for (int route = 0; route < m_nSegmentCount; route++) {
        GenerateWord(routes, route);
    }
    return true;
}
/**
 * Segments a sentence: generates its word net into m_graphSeg, runs
 * CNShortPath over that net and generates the words of every returned route.
 *
 * @param sSentence    the sentence bytes
 * @param dictCore     the core dictionary
 * @param nResultCount result count handed to CNShortPath; a value of 0 or
 *                     less selects the default of 10
 * @return always true
 */
public boolean Segment(byte[] sSentence, CDictionary dictCore,
        int nResultCount) {
    // Honour the caller's value; 10 is only the fallback. Overwriting the
    // argument unconditionally would make the parameter meaningless.
    if (nResultCount <= 0) {
        nResultCount = 10;
    }
    int[][] nSegRoute = new int[Final.MAX_SEGMENT_NUM][];
    for (int i = 0; i < Final.MAX_SEGMENT_NUM; i++) {
        nSegRoute[i] = new int[Final.MAX_SENTENCE_LEN / 2];
    }
    m_graphSeg.m_segGraph.SetRowFirst(false);
    m_graphOptimum.SetRowFirst(false);
    m_graphSeg.GenerateWordNet(sSentence, dictCore, false);
    CNShortPath sp = new CNShortPath(m_graphSeg.m_segGraph, nResultCount);
    sp.ShortPath();
    // NOTE(review): m_nSegmentCount is an int passed by value, so Output
    // cannot update the field through this argument - confirm how the
    // route count is meant to be reported back.
    sp.Output(nSegRoute, false, m_nSegmentCount);
    m_graphOptimum.SetEmpty();// Set graph optimum empty
    for (int i = 0; i < m_nSegmentCount; i++) {
        // Generate word according to the segmentation route
        GenerateWord(nSegRoute, i);
    }
    return true;
}
/**
 * Converts a bi-word path to a uni-word path, in place.
 *
 * Each leading entry (up to the first -1 or the first value that is not
 * below m_nWordCount) is looked up in m_npWordPosMapTable and replaced by
 * that packed position divided by Final.MAX_SENTENCE_LEN. If the last
 * packed position seen is positive, its remainder modulo
 * Final.MAX_SENTENCE_LEN is appended. The path is then terminated with -1.
 *
 * @param npPath the path to rewrite, terminated by -1
 * @return always true
 */
protected boolean BiPath2UniPath(int[] npPath) {
    int lastPacked = -1;
    int idx = 0;
    for (; npPath[idx] != -1 && npPath[idx] < m_nWordCount; idx++) {
        lastPacked = m_npWordPosMapTable[npPath[idx]];
        npPath[idx] = lastPacked / Final.MAX_SENTENCE_LEN;
    }
    if (lastPacked > 0) {
        npPath[idx] = lastPacked % Final.MAX_SENTENCE_LEN;
        idx++;
    }
    npPath[idx] = -1;
    return true;
}
/**
* <pre>
* CDynamicArray &aWord: the words array
* CDynamicArray &aBinaryWordNet:the net between words
* double dSmoothingPara: the parameter of data smoothing
* CDictionary &DictBinary: the binary dictionary
* CDictionary &DictCore: the Core dictionary
* </pre>
*/
protected boolean BiGraphGenerate(CDynamicArray aWord,
CDynamicArray aBinaryWordNet, double dSmoothingPara,
CDictionary DictBinary, CDictionary DictCore) {
TagArrayChain[] pTail = null;
TagArrayChain[] pCur = null;
TagArrayChain[] pNextWords = null;// Temp buffer
int nWordIndex = 0, nTwoWordsFreq = 0, nCurWordIndex, nNextWordIndex;
// nWordIndex: the index number of current word
double dCurFreqency, dValue, dTemp;
byte[] sTwoWords = new byte[Final.WORD_MAXLENGTH];
m_nWordCount = aWord.GetTail(pTail);// Get tail element and return the
// words count
if (m_nWordCount > 0)// Word count is greater than 0
m_npWordPosMapTable = new int[m_nWordCount];// Record the position
// of possible words
pCur = aWord.GetHead();
for (TagArrayChain trc : pCur)// Set the position map of words
{
m_npWordPosMapTable[nWordIndex++] = trc.row
* Final.MAX_SENTENCE_LEN + trc.col;
}
pCur = aWord.GetHead();
for (TagArrayChain trc : pCur)//
{
if (trc.nPOS >= 0)// It's not an unknown words
dCurFreqency = trc.value;
else
// Unknown words
dCurFreqency = DictCore.GetFrequency(trc.sWord, 2);
aWord.GetElement(trc.col, -1, pCur, pNextWords);// Get next words
// which begin with
// pCur->col
for (TagArrayChain trc2 : pNextWords) {
if (trc2.row == trc.col) {
// Current words frequency
Utility.strcpy(sTwoWords, trc.sWord);
Utility.strcat(sTwoWords, Final.WORD_SEGMENTER.getBytes());
Utility.strcat(sTwoWords, trc2.sWord);
nTwoWordsFreq = DictBinary.GetFrequency(sTwoWords, 3);
// Two linked Words frequency
dTemp = (double) 1 / Final.MAX_FREQUENCE;
// Smoothing
dValue = -Math
.log(dSmoothingPara
* (1 + dCurFreqency)
/ (Final.MAX_FREQUENCE + 80000)
+ (1 - dSmoothingPara)
* ((1 - dTemp) * nTwoWordsFreq
/ (1 + dCurFreqency) + dTemp));
// -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -