📄 segment.cs
字号:
{
nPOS = -27904;//'m'*256
sWord = "未##数";
}
else
{
nPOS = -28280; // -'n' * 256 - 'x'
sWord = "未##串";
}
dValue = 0;
break;
default:
nPOS = atomSegment[i].nPOS;//'?'*256;
break;
}
m_segGraph.SetElement(i, i + 1, new ChainContent(dValue, nPOS, sWord));//init the link with minimum
}
}
// 将所有可能的组词存入m_segGraph
for (int i = 0; i < atomSegment.Count; i++)//All the word
{
sWord = atomSegment[i].sWord;//Get the current atom
int j = i + 1;
while (j < atomSegment.Count && coreDict.GetMaxMatch(sWord, out sMaxMatchWord, out nPOSRet))
{
if (sMaxMatchWord == sWord) // 就是我们要找的词
{
WordInfo info = coreDict.GetWordInfo(sWord); // 该词可能就有多种词性
// 计算该词的所有词频之和
nTotalFreq = 0;
for (int k = 0; k < info.Count; k++)
nTotalFreq += info.Frequencies[k];
// 限制出现某些特殊词
if (sWord.Length == 2 && (sWord.StartsWith("年") || sWord.StartsWith("月")) && i >= 1 &&
(Utility.IsAllNum(atomSegment[i - 1].sWord) ||
Utility.IsAllChineseNum(atomSegment[i - 1].sWord)))
{
//1年内、1999年末
if ("末内中底前间初".IndexOf(sWord.Substring(1)) >= 0)
break;
}
// 如果该词只有一个词性,则存储,否则词性记录为 0
if (info.Count == 1)
m_segGraph.SetElement(i, j, new ChainContent(nTotalFreq, info.POSs[0], sWord));
else
m_segGraph.SetElement(i, j, new ChainContent(nTotalFreq, 0, sWord));
}
sWord += atomSegment[j++].sWord;
}
}
return m_segGraph;
}
#endregion
#region BiGraphGenerate Method
//====================================================================
// 生成两两词之间的二叉图表
//====================================================================
/// <summary>
/// Builds the bigram graph for a word graph: for every word in
/// <paramref name="aWord"/> and every word whose row equals that word's
/// column, stores one link weighted with
/// -log{a*P(Ci-1) + (1-a)*P(Ci|Ci-1)}, where a is <paramref name="smoothPara"/>.
/// </summary>
/// <param name="aWord">Word graph whose elements are walked from head to tail.</param>
/// <param name="smoothPara">Smoothing factor a of the formula above (the original note says 0 &lt; a &lt; 1).</param>
/// <param name="biDict">Dictionary queried for the frequency of the joined word pair.</param>
/// <param name="coreDict">Dictionary queried for the frequency of words whose nPOS is negative.</param>
/// <returns>The bigram graph, indexed by positions in the word position map.</returns>
public static ColumnFirstDynamicArray<ChainContent> BiGraphGenerate(
    RowFirstDynamicArray<ChainContent> aWord, double smoothPara, WordDictionary biDict, WordDictionary coreDict)
{
    ColumnFirstDynamicArray<ChainContent> biWordNet = new ColumnFirstDynamicArray<ChainContent>();
    StringBuilder pairBuilder = new StringBuilder();

    // Sorted-by-construction table of (row * MAX_SENTENCE_LEN + col) keys, one per word.
    int[] positionMap = PreparePositionMap(aWord);

    // Smoothing floor; it does not depend on the words being paired.
    double floorProbability = 1.0 / Predefine.MAX_FREQUENCE;

    for (ChainItem<ChainContent> current = aWord.GetHead(); current != null; current = current.next)
    {
        // A negative nPOS marks an unknown word (per the original notes); its
        // frequency is looked up in the core dictionary instead of the stored weight.
        double currentFrequency;
        if (current.Content.nPOS < 0)
        {
            currentFrequency = coreDict.GetFrequency(current.Content.sWord, 2);
        }
        else
        {
            currentFrequency = current.Content.eWeight;
        }

        // Successors are the words whose row equals the current word's column.
        for (ChainItem<ChainContent> successor = aWord.GetFirstElementOfRow(current.col);
             successor != null && successor.row == current.col;
             successor = successor.next)
        {
            pairBuilder.Length = 0;
            pairBuilder.Append(current.Content.sWord);
            pairBuilder.Append(Predefine.WORD_SEGMENTER);
            pairBuilder.Append(successor.Content.sWord);
            string wordPair = pairBuilder.ToString();

            // Frequency of the two linked words.
            int pairFrequency = biDict.GetFrequency(wordPair, 3);

            // -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)}; kept as a single expression so the
            // floating-point evaluation order is unchanged.
            double linkWeight = -Math.Log(smoothPara * (1.0 + currentFrequency) / (Predefine.MAX_FREQUENCE + 80000.0)
                + (1.0 - smoothPara) * ((1.0 - floorProbability) * pairFrequency / (1.0 + currentFrequency) +
                floorProbability));

            // Unknown words: P(Wi|Ci); known words contribute nothing extra.
            if (current.Content.nPOS < 0)
            {
                linkWeight += current.Content.nPOS;
            }

            // Translate both words into their indices in the position map.
            int currentIndex = Utility.BinarySearch(current.row * Predefine.MAX_SENTENCE_LEN + current.col, positionMap);
            int successorIndex = Utility.BinarySearch(successor.row * Predefine.MAX_SENTENCE_LEN + successor.col, positionMap);

            biWordNet.SetElement(currentIndex, successorIndex, new ChainContent(linkWeight, current.Content.nPOS, wordPair));
        }
    }

    return biWordNet;
}
//====================================================================
// 准备PositionMap,用于记录词的位置
//====================================================================
/// <summary>
/// Builds the word position table: one entry per element of
/// <paramref name="aWord"/>, in head-to-tail order, holding
/// row * Predefine.MAX_SENTENCE_LEN + col for that element.
/// </summary>
/// <param name="aWord">Word graph to index.</param>
/// <returns>
/// The position table, or null when GetTail reports a count that is not positive.
/// </returns>
private static int[] PreparePositionMap(RowFirstDynamicArray<ChainContent> aWord)
{
    // GetTail is used only for the element count it returns; the tail item itself is unused.
    ChainItem<ChainContent> tailItem;
    int wordCount = aWord.GetTail(out tailItem);

    int[] positionMap = null;
    if (wordCount > 0)
    {
        positionMap = new int[wordCount];
    }

    // Record the position key of every possible word.
    int slot = 0;
    for (ChainItem<ChainContent> item = aWord.GetHead(); item != null; item = item.next)
    {
        positionMap[slot] = item.row * Predefine.MAX_SENTENCE_LEN + item.col;
        slot++;
    }

    return positionMap;
}
#endregion
#region Private Static Functions
#region BiPath2LinkedArray Method
//====================================================================
// 将BiPath转换为LinkedArray
// 例如“他说的确实在理”
// BiPath:(0, 1, 2, 3, 6, 9, 11, 12)
// 0 1 2 3 4 5 6 7 8 9 10 11 12
// 始##始 他 说 的 的确 确 确实 实 实在 在 在理 理 末##末
//====================================================================
/// <summary>
/// Converts a bigram path into a linked array of word nodes. Each path entry
/// is an index into the flattened item list of <paramref name="segGraph"/>.
/// </summary>
/// <param name="biPath">Indices into the item list returned by segGraph.ToListItems().</param>
/// <param name="segGraph">Segmentation graph the path refers to.</param>
/// <param name="atomSegment">Atoms used to rebuild the surface text of placeholder words.</param>
/// <returns>One node per path entry, appended in path order.</returns>
private static WordLinkedArray BiPath2LinkedArray(int[] biPath, RowFirstDynamicArray<ChainContent> segGraph, List<AtomNode> atomSegment)
{
    List<ChainItem<ChainContent>> graphItems = segGraph.ToListItems();
    StringBuilder surface = new StringBuilder();
    WordLinkedArray linked = new WordLinkedArray();

    for (int step = 0; step < biPath.Length; step++)
    {
        ChainItem<ChainContent> entry = graphItems[biPath[step]];
        string graphWord = entry.Content.sWord;

        WordNode node = new WordNode();
        node.row = entry.row;
        node.col = entry.col;
        node.sWordInSegGraph = graphWord;
        node.theWord = new WordResult();

        // Placeholder tokens stand in for the original text, which has to be
        // reassembled from the atoms covered by the node.
        bool isPlaceholder;
        switch (graphWord)
        {
            case "未##人":
            case "未##地":
            case "未##数":
            case "未##时":
            case "未##串":
                isPlaceholder = true;
                break;
            default:
                isPlaceholder = false;
                break;
        }

        if (isPlaceholder)
        {
            surface.Length = 0;
            for (int atom = node.row; atom < node.col; atom++)
            {
                surface.Append(atomSegment[atom].sWord);
            }
            node.theWord.sWord = surface.ToString();
        }
        else
        {
            node.theWord.sWord = graphWord;
        }

        node.theWord.nPOS = entry.Content.nPOS;
        node.theWord.dValue = entry.Content.eWeight;
        linked.AppendNode(node);
    }

    return linked;
}
#endregion
#region GenerateWord Method
//====================================================================
// Generate words according to the segmentation route
//====================================================================
/// <summary>
/// Post-processes the linked word array (number merging, delimiter tagging,
/// hyphen splitting, date/time checks), then copies every node into the
/// result array and records it in <paramref name="m_graphOptimum"/>.
/// </summary>
/// <param name="uniPath">Not read by this method.</param>
/// <param name="linkedArray">Words along the chosen path; modified by the post-processing steps.</param>
/// <param name="m_graphOptimum">Graph that receives one element per resulting word.</param>
/// <returns>The resulting words in list order, or null when the linked array is empty.</returns>
private static WordResult[] GenerateWord(int[] uniPath, WordLinkedArray linkedArray, RowFirstDynamicArray<ChainContent> m_graphOptimum)
{
    if (linkedArray.Count == 0)
    {
        return null;
    }

    // Merge all separate, consecutive numbers into one number.
    MergeContinueNumIntoOne(ref linkedArray);

    // Handle the delimiter "--".
    ChangeDelimiterPOS(ref linkedArray);

    // If the previous word is a number and the current word starts with a
    // full-width or half-width hyphen and is longer than that one character,
    // the hyphen is split off from the current word.
    // Example: "3 / -4 / 月" has to become "3 / - / 4 / 月".
    SplitMiddleSlashFromDigitalWords(ref linkedArray);

    // Date/time rules, as described by the original notes:
    // 1. A number followed by one of "月、日、时、分、秒、月份" is merged with it
    //    and tagged as time.
    // 2. A number that can be a year followed by "年" is merged and tagged as
    //    time; otherwise it is a number.
    // 3. If the last character is "点", the number is treated as time.
    // 4. If the last character is none of "∶·./" nor half-width '.' '/', it is a number.
    // 5. If the last character is one of "∶·./" or half-width '.' '/' and the
    //    length is greater than 1, the last character is dropped, e.g. "1.".
    CheckDateElements(ref linkedArray);

    // Emit the result.
    WordResult[] words = new WordResult[linkedArray.Count];
    int slot = 0;
    for (WordNode node = linkedArray.first; node != null; node = node.next)
    {
        WordResult word = new WordResult();
        word.sWord = node.theWord.sWord;
        word.nPOS = node.theWord.nPOS;
        word.dValue = node.theWord.dValue;
        words[slot] = word;

        m_graphOptimum.SetElement(node.row, node.col, new ChainContent(word.dValue, word.nPOS, node.sWordInSegGraph));
        slot++;
    }

    return words;
}
#endregion
#region MergeContinueNumIntoOne Method
private static void MergeContinueNumIntoOne(ref WordLinkedArray linkedArray)
{
if (linkedArray.Count < 2)
return;
string tmp;
WordNode pCur = linkedArray.first;
WordNode pNext = pCur.next;
while (pNext != null)
{
if ((Utility.IsAllNum(pCur.theWord.sWord) || Utility.IsAllChineseNum(pCur.theWord.sWord)) &&
(Utility.IsAllNum(pNext.theWord.sWord) || Utility.IsAllChineseNum(pNext.theWord.sWord)))
{
tmp = pCur.theWord.sWord + pNext.theWord.sWord;
if (Utility.IsAllNum(tmp) || Utility.IsAllChineseNum(tmp))
{
pCur.theWord.sWord += pNext.theWord.sWord;
pCur.col = pNext.col;
pCur.next = pNext.next;
linkedArray.Count--;
pNext = pCur.next;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -