📄 span.cs
字号:
while (m_nBestTag[nEnd] == 2)
//2
nEnd++;
nTemp = nEnd;
while (m_nBestTag[nEnd] == 3)
//2
{
if (nEnd > nTemp)
dPanelty += 1.0;
nEnd++;
}
}
if (nEnd > nStart)
{
//=========== by zhenyulu: 避免上面强迫之嫌带来的负面影响
if (m_sWords[nEnd] == null)
nEnd--;
m_nUnknownWords[m_nUnknownWordsCount, 0] = m_nWordPosition[nStart];
m_nUnknownWords[m_nUnknownWordsCount, 1] = m_nWordPosition[nEnd];
m_dWordsPossibility[m_nUnknownWordsCount++] = ComputePossibility(nStart, nEnd - nStart + 1, placeDict) +
Math.Log(dPanelty);
nStart = nEnd;
}
if (i < nEnd)
i = nEnd;
else
i = i + 1;
}
return true;
}
#endregion
#region ReleaseSpan Method
/// <summary>
/// Releases the statistics held by the context object by forwarding to
/// <c>m_context.ReleaseContextStat()</c>.
/// </summary>
public void ReleaseSpan()
{
    this.m_context.ReleaseContextStat();
}
#endregion
#region SetTagType Method
//Set the tag type
/// <summary>
/// Selects the tagging mode used by subsequent tagging calls by storing
/// <paramref name="nType"/> in <c>m_tagType</c>.
/// </summary>
/// <param name="nType">The tag type to switch to.</param>
public void SetTagType(TAG_TYPE nType)
{
    this.m_tagType = nType;
}
/// <summary>
/// Switches the tagger to <c>TAG_TYPE.TT_NORMAL</c>, i.e. the same effect as
/// calling the one-argument overload with that value.
/// </summary>
public void SetTagType()
{
    m_tagType = TAG_TYPE.TT_NORMAL;
}
#endregion
#region POSTagging Method
//POS tagging with Hidden Markov Model
/// <summary>
/// Tags <paramref name="pWordItems"/> segment by segment. Each pass loads one
/// segment with <c>GetFrom</c>, runs <c>GetBestPOS</c>, and then acts according
/// to the current <c>m_tagType</c>:
/// normal mode writes the chosen tags back into the items, the other modes
/// hand over to person / place recognition.
/// </summary>
/// <param name="pWordItems">
/// Word items to process. Processing stops at the end of the array or at the
/// first item whose <c>sWord</c> is null.
/// </param>
/// <param name="dictCore">Core dictionary.</param>
/// <param name="dictUnknown">Unknown-word recognition dictionary.</param>
/// <returns>Always <c>true</c>.</returns>
public bool POSTagging(WordResult[] pWordItems, WordDictionary dictCore, WordDictionary dictUnknown)
{
    int i = 0, j, nStartPos;

    Reset(false);
    while (i > -1 && i < pWordItems.Length && pWordItems[i].sWord != null)
    {
        nStartPos = i; // index of the first item handled in this pass
        // GetFrom returns the index at which the next pass starts, or -1 once
        // the end of pWordItems has been reached.
        i = GetFrom(pWordItems, nStartPos, dictCore, dictUnknown);
        GetBestPOS();

        switch (m_tagType)
        {
            case TAG_TYPE.TT_NORMAL:
                // Normal POS tagging: copy the best tag of every loaded word
                // back into the corresponding item.
                j = 1;
                // The bounds test comes first so that m_nBestTag[j] is never
                // read once j has reached m_nCurLength.
                while (j < m_nCurLength && m_nBestTag[j] != -1)
                {
                    // Slot j of the working arrays maps to this item
                    // (the working arrays are 1-based within a segment).
                    int nItem = j + nStartPos - 1;

                    // Store the best POS tag.
                    pWordItems[nItem].nPOS = m_nBestTag[j];

                    // Only items with a positive value that exist in the core
                    // dictionary get their value replaced by the frequency of
                    // the word under the chosen tag.
                    if (pWordItems[nItem].dValue > 0 && dictCore.IsExist(pWordItems[nItem].sWord, -1))
                    {
                        pWordItems[nItem].dValue = dictCore.GetFrequency(pWordItems[nItem].sWord, m_nBestTag[j]);
                    }

                    j += 1;
                }
                break;

            case TAG_TYPE.TT_PERSON:
                // Person name recognition.
                PersonRecognize(dictUnknown);
                break;

            case TAG_TYPE.TT_PLACE:
            case TAG_TYPE.TT_TRANS_PERSON:
                // Place names and transliterated person names share the same
                // recognition routine.
                PlaceRecognize(dictCore, dictUnknown);
                break;

            default:
                break;
        }

        Reset();
    }

    return true;
}
#endregion
#region GetFrom Method
/// <summary>
/// Loads consecutive words from <paramref name="pWordItems"/>, starting at
/// <paramref name="nIndex"/>, into the working arrays (<c>m_sWords</c>,
/// <c>m_nWordPosition</c>, <c>m_nTags</c>, <c>m_dFrequency</c>), using slots
/// 1, 2, ... of those arrays. For every word the candidate tags and their
/// costs are filled in; the candidate list of each word is terminated by -1.
/// Loading stops after the first word that ends up with exactly one candidate
/// tag, when <c>Predefine.MAX_WORDS_PER_SENTENCE</c> slots are used, or when
/// the items run out. <c>m_nCurLength</c> is set to the number of slots used.
/// </summary>
/// <param name="pWordItems">Word items to read from.</param>
/// <param name="nIndex">Index in <paramref name="pWordItems"/> of the first word to load.</param>
/// <param name="dictCore">Core dictionary.</param>
/// <param name="dictUnknown">Unknown-word recognition dictionary.</param>
/// <returns>
/// The index in <paramref name="pWordItems"/> at which the next call should
/// start, or -1 when the end of the array has been reached.
/// </returns>
private int GetFrom(WordResult[] pWordItems, int nIndex, WordDictionary dictCore, WordDictionary dictUnknown)
{
    WordInfo info;
    int nFreq, j, nRetPos = 0;
    bool bSplit = false; // true while the second part of a split word is still pending
    int i, nPOSCount;
    string sCurWord; // word used for the lookup in the unknown dictionary
    int nWordsIndex = nIndex;

    for (i = 1; i < Predefine.MAX_WORDS_PER_SENTENCE && nWordsIndex < pWordItems.Length; i++)
    {
        if (m_tagType == TAG_TYPE.TT_NORMAL || !dictUnknown.IsExist(pWordItems[nWordsIndex].sWord, 44))
        {
            // Store the current word unchanged.
            m_sWords[i] = pWordItems[nWordsIndex].sWord;
        }
        else
        {
            // The word exists in the unknown dictionary under handle 44
            // (NOTE(review): meaning of 44 is not visible here - confirm).
            // It is loaded as two slots: its first character, then the rest.
            if (!bSplit)
            {
                m_sWords[i] = pWordItems[nWordsIndex].sWord.Substring(0, 1);
                bSplit = true;
            }
            else
            {
                m_sWords[i] = pWordItems[nWordsIndex].sWord.Substring(1);
                bSplit = false;
            }
        }

        // Record where the next word starts and move the running start
        // position to the end of the current word.
        m_nWordPosition[i + 1] = m_nWordPosition[i] + m_sWords[i].Length;
        m_nStartPos = m_nWordPosition[i + 1];

        if (m_tagType != TAG_TYPE.TT_NORMAL)
        {
            // Recognition modes: candidate tags come from the unknown dictionary.
            sCurWord = m_sWords[i];
            if (m_tagType == TAG_TYPE.TT_TRANS_PERSON && i > 0 && m_sWords[i - 1] != null &&
                Utility.charType(m_sWords[i - 1][0]) == Predefine.CT_CHINESE)
            {
                // NOTE(review): sCurWord already equals m_sWords[i], so both
                // assignments below leave it unchanged; possibly a different
                // punctuation variant was meant to be normalised - confirm.
                if (m_sWords[i] == ".")
                    sCurWord = ".";
                else if (m_sWords[i] == "-")
                    sCurWord = "-";
            }

            info = dictUnknown.GetWordInfo(sCurWord);
            if (info != null)
            {
                nPOSCount = info.Count + 1;
                for (j = 0; j < info.Count; j++)
                {
                    // Candidate tag and its cost:
                    // -log(1 + word frequency) + log(context frequency of the tag + nPOSCount).
                    m_nTags[i, j] = info.POSs[j];
                    m_dFrequency[i, j] = -Math.Log((double)(1 + info.Frequencies[j])) +
                                         Math.Log((double)(m_context.GetFrequency(0, info.POSs[j]) + nPOSCount));
                }
            }
            else
            {
                nPOSCount = 1;
                j = 0;
            }

            // Contribution of the core dictionary: its individual POS values
            // are ignored; the word is added once as tag 0 ("other") using the
            // sum of all its core frequencies. The two marker words get the
            // fixed tags 100 / 101 with cost 0 instead.
            if (string.Compare(m_sWords[i], "始##始") == 0)
            {
                m_nTags[i, j] = 100;
                m_dFrequency[i, j] = 0;
                j++;
            }
            else if (string.Compare(m_sWords[i], "末##末") == 0)
            {
                m_nTags[i, j] = 101;
                m_dFrequency[i, j] = 0;
                j++;
            }
            else
            {
                info = dictCore.GetWordInfo(m_sWords[i]);
                nFreq = 0;
                if (info != null)
                {
                    for (int k = 0; k < info.Count; k++)
                    {
                        nFreq += info.Frequencies[k];
                    }
                    if (info.Count > 0)
                    {
                        m_nTags[i, j] = 0;
                        m_dFrequency[i, j] = -Math.Log((double)(1 + nFreq)) + Math.Log((double)(m_context.GetFrequency(0, 0) + nPOSCount));
                        j++;
                    }
                }
            }
        }
        else
        {
            // Normal POS tagging.
            j = 0;
            if (pWordItems[nWordsIndex].nPOS > 0)
            {
                // The item already carries a single POS; use it together with
                // the value stored in the item.
                m_nTags[i, j] = pWordItems[nWordsIndex].nPOS;
                m_dFrequency[i, j] = -Math.Log(pWordItems[nWordsIndex].dValue) + Math.Log((double)(m_context.GetFrequency(0, m_nTags[i, j]) + 1));
                if (m_dFrequency[i, j] < 0)
                {
                    // Costs below 0 are not permitted.
                    m_dFrequency[i, j] = 0;
                }
                j++;
            }
            else
            {
                if (pWordItems[nWordsIndex].nPOS < 0)
                {
                    // A negative nPOS holds a negated tag; it becomes the first
                    // candidate and the item's value is taken as its cost as-is.
                    m_nTags[i, j] = -pWordItems[nWordsIndex].nPOS;
                    m_dFrequency[i, j++] = pWordItems[nWordsIndex].dValue;
                }

                // Remaining candidates come from the core dictionary. The loop
                // continues from the current j, so when a candidate was added
                // above, entry 0 of the dictionary result is not copied.
                info = dictCore.GetWordInfo(m_sWords[i]);
                if (info != null)
                {
                    nPOSCount = info.Count;
                    for (; j < info.Count; j++)
                    {
                        m_nTags[i, j] = info.POSs[j];
                        m_dFrequency[i, j] = -Math.Log(1 + info.Frequencies[j]) + Math.Log(m_context.GetFrequency(0, m_nTags[i, j]) + nPOSCount);
                    }
                }
            }
        }

        if (j == 0)
        {
            // No candidate found: let GuessPOS supply candidates for slot i
            // and report how many it produced.
            GuessPOS(i, out j);
        }

        m_nTags[i, j] = -1; // terminate the candidate list of this word

        // NOTE(review): m_nTags[i, j] was set to -1 on the previous line, so the
        // second operand only compares -1 with Predefine.CT_SENTENCE_BEGIN;
        // possibly m_nTags[i, 0] was intended - confirm before changing.
        if (j == 1 && m_nTags[i, j] != Predefine.CT_SENTENCE_BEGIN)
        {
            // Exactly one candidate, i.e. no ambiguity: the segment ends here.
            i++;
            m_sWords[i] = null;
            break;
        }

        // Stay on the same item while the second part of a split word is pending.
        if (!bSplit)
            nWordsIndex++;
    }

    if (nWordsIndex == pWordItems.Length)
        nRetPos = -1; // reached the end of the items

    if (m_nTags[i - 1, 1] != -1)
    {
        // The last loaded word still has more than one candidate, so a virtual
        // ending slot is appended (for words like "张/华/平").
        if (m_tagType != TAG_TYPE.TT_NORMAL)
            m_nTags[i, 0] = 101;
        else
            m_nTags[i, 0] = 1;
        m_dFrequency[i, 0] = 0;
        m_sWords[i] = null; // virtual ending has no word
        m_nTags[i++, 1] = -1;
    }

    m_nCurLength = i; // number of slots in use

    if (nRetPos != -1)
        return nWordsIndex + 1; // next start position

    return -1; // reached the end
}
#endregion
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -