📄 segment.cs

📁 只是中科院分词系统的SharpICTCLAS分词系统
💻 CS
📖 第 1 页 / 共 3 页
字号:
                     {
                        nPOS = -27904;//'m'*256
                        sWord = "未##数";
                     }
                     else
                     {
                        nPOS = -28280; // -'n' * 256 - 'x'
                        sWord = "未##串";
                     }
                     dValue = 0;
                     break;
                  default:
                     nPOS = atomSegment[i].nPOS;//'?'*256;
                     break;
               }
               m_segGraph.SetElement(i, i + 1, new ChainContent(dValue, nPOS, sWord));//init the link with minimum
            }
         }

         // 将所有可能的组词存入m_segGraph
         for (int i = 0; i < atomSegment.Count; i++)//All the word
         {
            sWord = atomSegment[i].sWord;//Get the current atom
            int j = i + 1;

            while (j < atomSegment.Count && coreDict.GetMaxMatch(sWord, out sMaxMatchWord, out nPOSRet))
            {
               if (sMaxMatchWord == sWord)  // 就是我们要找的词
               {
                  WordInfo info = coreDict.GetWordInfo(sWord); // 该词可能就有多种词性

                  // 计算该词的所有词频之和
                  nTotalFreq = 0;
                  for (int k = 0; k < info.Count; k++)
                     nTotalFreq += info.Frequencies[k];

                  // 限制出现某些特殊词
                  if (sWord.Length == 2 && (sWord.StartsWith("年") || sWord.StartsWith("月")) && i >= 1 &&
                     (Utility.IsAllNum(atomSegment[i - 1].sWord) ||
                     Utility.IsAllChineseNum(atomSegment[i - 1].sWord)))
                  {
                     //1年内、1999年末
                     if ("末内中底前间初".IndexOf(sWord.Substring(1)) >= 0)
                        break;
                  }

                  // 如果该词只有一个词性，则存储，否则词性记录为 0
                  if (info.Count == 1)
                     m_segGraph.SetElement(i, j, new ChainContent(nTotalFreq, info.POSs[0], sWord));
                  else
                     m_segGraph.SetElement(i, j, new ChainContent(nTotalFreq, 0, sWord));
               }

               sWord += atomSegment[j++].sWord;
            }
         }
         return m_segGraph;
      }

      #endregion

      #region BiGraphGenerate Method

      //====================================================================
      // 生成两两词之间的二叉图表
      //====================================================================
      public static ColumnFirstDynamicArray<ChainContent> BiGraphGenerate(
         RowFirstDynamicArray<ChainContent> aWord, double smoothPara, WordDictionary biDict, WordDictionary coreDict)
      {
         ColumnFirstDynamicArray<ChainContent> aBiWordNet = new ColumnFirstDynamicArray<ChainContent>();

         ChainItem<ChainContent> pCur, pNextWords;
         int nTwoWordsFreq = 0, nCurWordIndex, nNextWordIndex;
         double dCurFreqency, dValue, dTemp;
         string sTwoWords;
         StringBuilder sb = new StringBuilder();

         //Record the position map of possible words
         int[] m_npWordPosMapTable = PreparePositionMap(aWord);

         pCur = aWord.GetHead();
         while (pCur != null)
         {
            if (pCur.Content.nPOS >= 0)
               //It's not an unknown words
               dCurFreqency = pCur.Content.eWeight;
            else
               //Unknown words
               dCurFreqency = coreDict.GetFrequency(pCur.Content.sWord, 2);

            //Get next words which begin with pCur.col（注：很特殊的对应关系）
            pNextWords = aWord.GetFirstElementOfRow(pCur.col);

            while (pNextWords != null && pNextWords.row == pCur.col)
            {
               sb.Remove(0, sb.Length);
               sb.Append(pCur.Content.sWord);
               sb.Append(Predefine.WORD_SEGMENTER);
               sb.Append(pNextWords.Content.sWord);

               sTwoWords = sb.ToString();

               //Two linked Words frequency
               nTwoWordsFreq = biDict.GetFrequency(sTwoWords, 3);

               //Smoothing
               dTemp = 1.0 / Predefine.MAX_FREQUENCE;

               //-log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1
               dValue = -Math.Log(smoothPara * (1.0 + dCurFreqency) / (Predefine.MAX_FREQUENCE + 80000.0)
                 + (1.0 - smoothPara) * ((1.0 - dTemp) * nTwoWordsFreq / (1.0 + dCurFreqency) +
                 dTemp));

               //Unknown words: P(Wi|Ci);while known words:1
               if (pCur.Content.nPOS < 0)
                  dValue += pCur.Content.nPOS;

               //Get the position index of current word in the position map table
               nCurWordIndex = Utility.BinarySearch(pCur.row * Predefine.MAX_SENTENCE_LEN + pCur.col, m_npWordPosMapTable);
               nNextWordIndex = Utility.BinarySearch(pNextWords.row * Predefine.MAX_SENTENCE_LEN + pNextWords.col, m_npWordPosMapTable);

               aBiWordNet.SetElement(nCurWordIndex, nNextWordIndex, new ChainContent(dValue, pCur.Content.nPOS, sTwoWords));

               pNextWords = pNextWords.next; //Get next word
            }
            pCur = pCur.next;
         }

         return aBiWordNet;
      }

      //====================================================================
      // 准备PositionMap，用于记录词的位置
      //====================================================================
      private static int[] PreparePositionMap(RowFirstDynamicArray<ChainContent> aWord)
      {
         int[] m_npWordPosMapTable;
         ChainItem<ChainContent> pTail, pCur;
         int nWordIndex = 0, m_nWordCount;

         //Get tail element and return the words count
         m_nWordCount = aWord.GetTail(out pTail);

         if (m_nWordCount > 0)
            m_npWordPosMapTable = new int[m_nWordCount];
         else
            m_npWordPosMapTable = null;

         //Record the  position of possible words
         pCur = aWord.GetHead();
         while (pCur != null)
         {
            m_npWordPosMapTable[nWordIndex++] = pCur.row * Predefine.MAX_SENTENCE_LEN + pCur.col;
            pCur = pCur.next;
         }

         return m_npWordPosMapTable;
      }

      #endregion

      #region Private Static Functions

      #region BiPath2LinkedArray Method

      //====================================================================
      // 将BiPath转换为LinkedArray
      // 例如“他说的确实在理”
      // BiPath：（0, 1, 2, 3, 6, 9, 11, 12）
      //    0    1   2   3   4     5   6     7   8     9   10    11  12
      // 始##始  他  说  的  的确  确  确实  实  实在  在  在理  理  末##末
      //====================================================================
      private static WordLinkedArray BiPath2LinkedArray(int[] biPath, RowFirstDynamicArray<ChainContent> segGraph, List<AtomNode> atomSegment)
      {
         List<ChainItem<ChainContent>> list = segGraph.ToListItems();
         StringBuilder sb = new StringBuilder();

         WordLinkedArray result = new WordLinkedArray();

         for (int i = 0; i < biPath.Length; i++)
         {
            WordNode node = new WordNode();

            node.row = list[biPath[i]].row;
            node.col = list[biPath[i]].col;
            node.sWordInSegGraph = list[biPath[i]].Content.sWord;

            node.theWord = new WordResult();
            if (node.sWordInSegGraph == "未##人" || node.sWordInSegGraph == "未##地" ||
               node.sWordInSegGraph == "未##数" || node.sWordInSegGraph == "未##时" || node.sWordInSegGraph == "未##串")
            {
               sb.Remove(0, sb.Length);
               for (int j = node.row; j < node.col; j++)
                  sb.Append(atomSegment[j].sWord);

               node.theWord.sWord = sb.ToString();
            }
            else
               node.theWord.sWord = list[biPath[i]].Content.sWord;

            node.theWord.nPOS = list[biPath[i]].Content.nPOS;
            node.theWord.dValue = list[biPath[i]].Content.eWeight;

            result.AppendNode(node);
         }

         return result;
      }

      #endregion

      #region GenerateWord Method

      //====================================================================
      // Generate Word according the segmentation route
      //====================================================================
      private static WordResult[] GenerateWord(int[] uniPath, WordLinkedArray linkedArray, RowFirstDynamicArray<ChainContent> m_graphOptimum)
      {
         if (linkedArray.Count == 0)
            return null;

         //--------------------------------------------------------------------
         //Merge all seperate continue num into one number
         MergeContinueNumIntoOne(ref linkedArray);

         //--------------------------------------------------------------------
         //The delimiter "－－"
         ChangeDelimiterPOS(ref linkedArray);

         //--------------------------------------------------------------------
         //如果前一个词是数字，当前词以“－”或“-”开始，并且不止这一个字符，
         //那么将此“－”符号从当前词中分离出来。
         //例如 “3 / -4 / 月”需要拆分成“3 / - / 4 / 月”
         SplitMiddleSlashFromDigitalWords(ref linkedArray);

         //--------------------------------------------------------------------
         //1、如果当前词是数字，下一个词是“月、日、时、分、秒、月份”中的一个，则合并,且当前词词性是时间
         //2、如果当前词是可以作为年份的数字，下一个词是“年”，则合并，词性为时间，否则为数字。
         //3、如果最后一个汉字是"点" ，则认为当前数字是时间
         //4、如果当前串最后一个汉字不是"∶·．／"和半角的'.''/'，那么是数
         //5、当前串最后一个汉字是"∶·．／"和半角的'.''/'，且长度大于1，那么去掉最后一个字符。例如"1."
         CheckDateElements(ref linkedArray);

         //--------------------------------------------------------------------
         //输出结果
         WordResult[] result = new WordResult[linkedArray.Count];

         WordNode pCur = linkedArray.first;
         int i = 0;
         while (pCur != null)
         {
            WordResult item = new WordResult();
            item.sWord = pCur.theWord.sWord;
            item.nPOS = pCur.theWord.nPOS;
            item.dValue = pCur.theWord.dValue;
            result[i] = item;

            m_graphOptimum.SetElement(pCur.row, pCur.col, new ChainContent(item.dValue, item.nPOS, pCur.sWordInSegGraph));

            pCur = pCur.next;
            i++;
         }

         return result;
      }

      #endregion

      #region MergeContinueNumIntoOne Method

      private static void MergeContinueNumIntoOne(ref WordLinkedArray linkedArray)
      {
         if (linkedArray.Count < 2)
            return;

         string tmp;
         WordNode pCur = linkedArray.first;
         WordNode pNext = pCur.next;

         while (pNext != null)
         {
            if ((Utility.IsAllNum(pCur.theWord.sWord) || Utility.IsAllChineseNum(pCur.theWord.sWord)) &&
               (Utility.IsAllNum(pNext.theWord.sWord) || Utility.IsAllChineseNum(pNext.theWord.sWord)))
            {
               tmp = pCur.theWord.sWord + pNext.theWord.sWord;
               if (Utility.IsAllNum(tmp) || Utility.IsAllChineseNum(tmp))
               {
                  pCur.theWord.sWord += pNext.theWord.sWord;
                  pCur.col = pNext.col;
                  pCur.next = pNext.next;
                  linkedArray.Count--;
                  pNext = pCur.next;
💿 文件大小 2421 K
👤 上传用户 wait2010
📂 所属分类人工智能/神经网络
🏷️ 相关标签

#SharpICTCLAS #分
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -