📄 span.cs

📁 只是中科院分词系统的SharpICTCLAS分词系统
💻 CS
📖 第 1 页 / 共 3 页
字号:
上一页 1 23
               while (m_nBestTag[nEnd] == 2)
                  //2
                  nEnd++;
               nTemp = nEnd;
               while (m_nBestTag[nEnd] == 3)
               //2
               {
                  if (nEnd > nTemp)
                     dPanelty += 1.0;
                  nEnd++;
               }
            }
            if (nEnd > nStart)
            {
               //=========== by zhenyulu: 避免上面强迫之嫌带来的负面影响
               if (m_sWords[nEnd] == null)
                  nEnd--;

               m_nUnknownWords[m_nUnknownWordsCount, 0] = m_nWordPosition[nStart];
               m_nUnknownWords[m_nUnknownWordsCount, 1] = m_nWordPosition[nEnd];
               m_dWordsPossibility[m_nUnknownWordsCount++] = ComputePossibility(nStart, nEnd - nStart + 1, placeDict) +
                  Math.Log(dPanelty);
               nStart = nEnd;
            }
            if (i < nEnd)
               i = nEnd;
            else
               i = i + 1;
         }
         return true;
      }

      #endregion

      #region ReleaseSpan Method

      /// <summary>
      /// Releases the statistics held by this span's context object by
      /// forwarding to its <c>ReleaseContextStat</c> method.
      /// </summary>
      public void ReleaseSpan()
      {
         this.m_context.ReleaseContextStat();
      }

      #endregion

      #region SetTagType Method

      //Set the tag type
      /// <summary>
      /// Selects the tagging mode used by subsequent tagging calls.
      /// </summary>
      /// <param name="nType">The tag type to store in <c>m_tagType</c>.</param>
      public void SetTagType(TAG_TYPE nType)
      {
         this.m_tagType = nType;
      }

      /// <summary>
      /// Parameterless convenience overload: selects <c>TAG_TYPE.TT_NORMAL</c>.
      /// </summary>
      public void SetTagType()
      {
         this.SetTagType(TAG_TYPE.TT_NORMAL);
      }

      #endregion

      #region POSTagging Method

      //POS tagging with Hidden Markov Model
      /// <summary>
      /// Runs tagging over <paramref name="pWordItems"/> one segment at a time.
      /// Each pass loads a segment with <c>GetFrom</c>, resolves the best tag sequence
      /// with <c>GetBestPOS</c>, and then acts on the result according to the current
      /// <c>m_tagType</c>.
      /// </summary>
      /// <param name="pWordItems">
      /// The word items to tag. Processing stops at the end of the array or at the first
      /// item whose <c>sWord</c> is null. In normal mode the items' <c>nPOS</c> (and possibly
      /// <c>dValue</c>) are updated in place.
      /// </param>
      /// <param name="dictCore">Core dictionary used for word lookup and frequencies.</param>
      /// <param name="dictUnknown">Dictionary used by the unknown-word recognition modes.</param>
      /// <returns>Always <c>true</c>.</returns>
      public bool POSTagging(WordResult[] pWordItems, WordDictionary dictCore, WordDictionary dictUnknown)
      {
         int i = 0;
         Reset(false);

         // GetFrom returns -1 once the end of pWordItems has been reached.
         while (i > -1 && i < pWordItems.Length && pWordItems[i].sWord != null)
         {
            int nStartPos = i; // index in pWordItems of the first word of this segment
            i = GetFrom(pWordItems, nStartPos, dictCore, dictUnknown);
            GetBestPOS();

            switch (m_tagType)
            {
               case TAG_TYPE.TT_NORMAL:
                  // Normal POS tagging: copy the best tags back onto the word items.
                  // The bound on j is tested BEFORE m_nBestTag[j] is read so the array
                  // is never indexed past the current segment length.
                  for (int j = 1; j < m_nCurLength && m_nBestTag[j] != -1; j++)
                  {
                     // Working arrays are 1-based; slot j maps to this item index.
                     int nItem = j + nStartPos - 1;

                     pWordItems[nItem].nPOS = m_nBestTag[j];

                     // Only items with a positive dValue that exist in the core dictionary
                     // get their dValue replaced by the frequency for the chosen tag
                     // (original note: let the full stop be 0).
                     if (pWordItems[nItem].dValue > 0 && dictCore.IsExist(pWordItems[nItem].sWord, -1))
                        pWordItems[nItem].dValue = dictCore.GetFrequency(pWordItems[nItem].sWord, m_nBestTag[j]);
                  }
                  break;

               case TAG_TYPE.TT_PERSON:
                  // Person name recognition
                  PersonRecognize(dictUnknown);
                  break;

               case TAG_TYPE.TT_PLACE:
               case TAG_TYPE.TT_TRANS_PERSON:
                  // Place names and transliterated person names share this routine.
                  PlaceRecognize(dictCore, dictUnknown);
                  break;

               default:
                  break;
            }

            Reset();
         }
         return true;
      }

      #endregion

      #region GetFrom Method

      /// <summary>
      /// Loads a run of words from <paramref name="pWordItems"/>, beginning at
      /// <paramref name="nIndex"/>, into the 1-based working arrays (<c>m_sWords</c>,
      /// <c>m_nWordPosition</c>, <c>m_nTags</c>, <c>m_dFrequency</c>) together with each word's
      /// candidate tags and their costs. Loading stops after the first word that ends up with
      /// exactly one candidate tag, when <c>Predefine.MAX_WORDS_PER_SENTENCE</c> slots are used,
      /// or when <paramref name="pWordItems"/> is exhausted. <c>m_nCurLength</c> is set to the
      /// number of slots in use.
      /// </summary>
      /// <param name="pWordItems">Source word items.</param>
      /// <param name="nIndex">Index in <paramref name="pWordItems"/> of the first word to load.</param>
      /// <param name="dictCore">Core dictionary.</param>
      /// <param name="dictUnknown">Unknown-word recognition dictionary (only consulted when the tag type is not normal).</param>
      /// <returns>
      /// <c>nWordsIndex + 1</c> as the next start position, or -1 when the word index reached
      /// the end of <paramref name="pWordItems"/>.
      /// </returns>
      private int GetFrom(WordResult[] pWordItems, int nIndex, WordDictionary dictCore, WordDictionary dictUnknown)
      {
         WordInfo info;
         int nFreq = 0, j, nRetPos = 0, nWordsIndex = 0;
         bool bSplit = false; // true while the second half of a split word is still pending
         int i = 1, nPOSCount;
         string sCurWord; // current word as looked up in the unknown dictionary

         nWordsIndex = i + nIndex - 1;
         for (i = 1; i < Predefine.MAX_WORDS_PER_SENTENCE && nWordsIndex < pWordItems.Length; i++)
         {
            if (m_tagType == TAG_TYPE.TT_NORMAL || !dictUnknown.IsExist(pWordItems[nWordsIndex].sWord, 44))
            {
               // Store the current word unchanged.
               m_sWords[i] = pWordItems[nWordsIndex].sWord;
            }
            else if (!bSplit)
            {
               // The word is listed in the unknown dictionary under handle 44
               // (meaning of 44 is not visible here - TODO confirm): it is split in two.
               // First pass stores the first character only.
               m_sWords[i] = pWordItems[nWordsIndex].sWord.Substring(0, 1);
               bSplit = true;
            }
            else
            {
               // Second pass over the same item stores the remainder of the word.
               m_sWords[i] = pWordItems[nWordsIndex].sWord.Substring(1);
               bSplit = false;
            }

            // Record where the next word starts and move the start position to that point.
            m_nWordPosition[i + 1] = m_nWordPosition[i] + m_sWords[i].Length;
            m_nStartPos = m_nWordPosition[i + 1];

            if (m_tagType != TAG_TYPE.TT_NORMAL)
            {
               // Unknown-word recognition: take candidate tags from the unknown dictionary.
               sCurWord = m_sWords[i];
               if (m_tagType == TAG_TYPE.TT_TRANS_PERSON && i > 0 && m_sWords[i - 1] != null &&
                  Utility.charType(m_sWords[i - 1][0]) == Predefine.CT_CHINESE)
               {
                  // After a Chinese character, look up "." and "-" by their full-width forms.
                  if (m_sWords[i] == ".")
                     sCurWord = "．";
                  else if (m_sWords[i] == "-")
                     sCurWord = "－";
               }

               info = dictUnknown.GetWordInfo(sCurWord);
               if (info != null)
               {
                  nPOSCount = info.Count + 1;
                  for (j = 0; j < info.Count; j++)
                  {
                     // One candidate per tag of sCurWord in the unknown dictionary.
                     m_nTags[i, j] = info.POSs[j];
                     m_dFrequency[i, j] = -Math.Log((double)(1 + info.Frequencies[j])) +
                        Math.Log((double)(m_context.GetFrequency(0, info.POSs[j]) + nPOSCount));
                  }
               }
               else
               {
                  nPOSCount = 1;
                  j = 0;
               }

               // Core-dictionary tags are ignored individually: the word is treated as
               // "other" (tag 0) and its core frequencies are summed to cost that tag.
               if (string.Compare(m_sWords[i], "始##始") == 0)
               {
                  // Sentence-begin sentinel word gets tag 100 at zero cost.
                  m_nTags[i, j] = 100;
                  m_dFrequency[i, j] = 0;
                  j++;
               }
               else if (string.Compare(m_sWords[i], "末##末") == 0)
               {
                  // Sentence-end sentinel word gets tag 101 at zero cost.
                  m_nTags[i, j] = 101;
                  m_dFrequency[i, j] = 0;
                  j++;
               }
               else
               {
                  info = dictCore.GetWordInfo(m_sWords[i]);
                  nFreq = 0;
                  if (info != null)
                  {
                     for (int k = 0; k < info.Count; k++)
                     {
                        nFreq += info.Frequencies[k];
                     }
                     if (info.Count > 0)
                     {
                        m_nTags[i, j] = 0;
                        m_dFrequency[i, j] = -Math.Log((double)(1 + nFreq)) + Math.Log((double)(m_context.GetFrequency(0, 0) + nPOSCount));
                        j++;
                     }
                  }
               }
            }
            else
            {
               // Normal POS tagging.
               j = 0;
               if (pWordItems[nWordsIndex].nPOS > 0)
               {
                  // The item already carries a single tag and its value; use them directly.
                  m_nTags[i, j] = pWordItems[nWordsIndex].nPOS;
                  m_dFrequency[i, j] = -Math.Log(pWordItems[nWordsIndex].dValue) + Math.Log((double)(m_context.GetFrequency(0, m_nTags[i, j]) + 1));
                  if (m_dFrequency[i, j] < 0)
                     // Values below 0 are not permitted.
                     m_dFrequency[i, j] = 0;
                  j++;
               }
               else
               {
                  if (pWordItems[nWordsIndex].nPOS < 0)
                  {
                     // A negative nPOS carries a tag (negated) whose cost is dValue as-is.
                     m_nTags[i, j] = -pWordItems[nWordsIndex].nPOS;
                     m_dFrequency[i, j++] = pWordItems[nWordsIndex].dValue;
                  }

                  // Remaining candidates come from the core dictionary.
                  // NOTE(review): when slot 0 was filled just above, this loop starts at
                  // j = 1 and therefore never copies info.POSs[0]; kept as in the original.
                  info = dictCore.GetWordInfo(m_sWords[i]);
                  if (info != null)
                  {
                     nPOSCount = info.Count;
                     for (; j < info.Count; j++)
                     {
                        m_nTags[i, j] = info.POSs[j];
                        m_dFrequency[i, j] = -Math.Log(1 + info.Frequencies[j]) + Math.Log(m_context.GetFrequency(0, m_nTags[i, j]) + nPOSCount);
                     }
                  }
               }
            }

            if (j == 0)
            {
               // No candidate tag found: guess from lexical knowledge.
               GuessPOS(i, out j);
            }

            m_nTags[i, j] = -1; // terminate this word's candidate list

            // NOTE(review): m_nTags[i, j] was set to -1 on the line above, so the second
            // operand only depends on whether CT_SENTENCE_BEGIN equals -1. It may have been
            // meant to read m_nTags[i, 0]; left unchanged because a fix would alter tagging
            // results - confirm against the reference implementation.
            if (j == 1 && m_nTags[i, j] != Predefine.CT_SENTENCE_BEGIN)
            {
               // Exactly one candidate: no ambiguity, so the segment ends here.
               i++;
               m_sWords[i] = null;
               break;
            }

            // Stay on the same item while the second half of a split word is pending.
            if (!bSplit)
               nWordsIndex++;
         }

         if (nWordsIndex == pWordItems.Length)
            nRetPos = -1; // reached the end of the input

         if (m_nTags[i - 1, 1] != -1)
         {
            // The last loaded word is still ambiguous (e.g. words like "张/华/平"):
            // append a virtual ending slot so the segment is closed.
            if (m_tagType != TAG_TYPE.TT_NORMAL)
               m_nTags[i, 0] = 101;
            else
               m_nTags[i, 0] = 1;

            m_dFrequency[i, 0] = 0;
            m_sWords[i] = null; // virtual ending has no word
            m_nTags[i++, 1] = -1;
         }

         m_nCurLength = i; // number of slots in use for this segment

         if (nRetPos != -1)
            return nWordsIndex + 1; // next start position
         return -1; // reached the end of the input
      }

      #endregion

   }
}
上一页 1 23
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -