⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 segment.cs

📁 只是中科院分词系统的SharpICTCLAS分词系统
💻 CS
📖 第 1 页 / 共 3 页
字号:
                  continue;
               }
            }

            pCur = pCur.next;
            pNext = pNext.next;
         }
      }

      #endregion

      #region ChangeDelimiterPOS Method

      /// <summary>
      /// Walks every node of the list and re-tags dash delimiters: a word that is exactly
      /// "--", "—" or "-" gets POS 30464 ('w' * 256) and its dValue reset to 0.
      /// </summary>
      /// <param name="linkedArray">Word list whose nodes are updated in place.</param>
      private static void ChangeDelimiterPOS(ref WordLinkedArray linkedArray)
      {
         const int delimiterPOS = 30464; // 'w' * 256

         for (WordNode node = linkedArray.first; node != null; node = node.next)
         {
            switch (node.theWord.sWord)
            {
               case "--":
               case "—":
               case "-":
                  node.theWord.nPOS = delimiterPOS;
                  node.theWord.dValue = 0;
                  break;
            }
         }
      }

      #endregion

      #region SplitMiddleSlashFromDigitalWords Method

      /// <summary>
      /// Splits a leading dash off a numeric word that follows a number or time word.
      /// When the previous word's POS is 27904 ('m' * 256) or 29696 ('t' * 256) in absolute
      /// value, and the current word is numeric, longer than one character and starts with
      /// a dash, the dash stays in the current node (re-tagged 'w') and the remainder moves
      /// to a new node inserted right after it.
      /// Example: "3 / -4 / 月" becomes "3 / - / 4 / 月".
      /// </summary>
      /// <param name="linkedArray">Word list to update in place; its Count grows by one per split.</param>
      private static void SplitMiddleSlashFromDigitalWords(ref WordLinkedArray linkedArray)
      {
         if (linkedArray.Count < 2)
            return;

         WordNode pPre = linkedArray.first;
         WordNode pCur = linkedArray.first.next;

         while (pCur != null)
         {
            string word = pCur.theWord.sWord;

            // The length test runs before the first character is read, so an empty word
            // can no longer cause an out-of-range index.
            // NOTE(review): both characters of the "--" literal are identical in this copy
            // of the file; the original header listed two dash forms, so a full-width
            // variant may have been lost - confirm against upstream.
            if ((Math.Abs(pPre.theWord.nPOS) == 27904 || Math.Abs(pPre.theWord.nPOS) == 29696) &&
               word.Length > 1 && "--".IndexOf(word[0]) >= 0 &&
               (Utility.IsAllNum(word) || Utility.IsAllChineseNum(word)))
            {
               // New node holds everything after the dash and is tagged as a number.
               WordNode newNode = new WordNode();
               newNode.row = pCur.row + 1;
               newNode.col = pCur.col; // must be read before pCur.col is shrunk below
               newNode.sWordInSegGraph = word.Substring(1);
               WordResult theWord = new WordResult();
               theWord.sWord = newNode.sWordInSegGraph;
               theWord.nPOS = 27904; // 'm' * 256
               theWord.dValue = pCur.theWord.dValue;
               newNode.theWord = theWord;

               // Current node keeps only the dash and becomes a delimiter.
               pCur.col = pCur.row + 1;
               pCur.theWord.sWord = word.Substring(0, 1);
               pCur.theWord.nPOS = 30464; // 'w' * 256
               pCur.theWord.dValue = 0;

               // Link the new node in right after the current one.
               newNode.next = pCur.next;
               pCur.next = newNode;

               linkedArray.Count++;
            }

            // After a split this steps onto the freshly inserted node, with the dash node
            // as its predecessor.
            pCur = pCur.next;
            pPre = pPre.next;
         }
      }

      #endregion

      #region CheckDateElements Method

      /// <summary>
      /// Tags numeric words as time or number and merges them with a following date/time unit.
      /// For each node (except the last) whose word is all digits or all Chinese numerals:
      /// 1. If the next word is one of "月", "日", "时", "分", "秒" or "月份", the two nodes are
      ///    merged and the result is tagged as time ('t').
      /// 2. If the next word is "年" and IsYearTime accepts the number, the two nodes are
      ///    merged and tagged as time; otherwise the number is tagged 'm'.
      /// 3. If the word ends with "点", it is tagged as time.
      /// 4. If the last character is not one of "∶·././", the word is tagged as a number.
      /// 5. Otherwise, if the word is longer than one character, the trailing character is
      ///    dropped (e.g. "1.") and the word is tagged as a number.
      /// </summary>
      /// <param name="linkedArray">Word list to update in place; its Count shrinks by one per merge.</param>
      private static void CheckDateElements(ref WordLinkedArray linkedArray)
      {
         if (linkedArray.Count < 2)
            return;

         WordNode pCur = linkedArray.first;
         WordNode pNext = pCur.next;

         while (pNext != null)
         {
            string curWord = pCur.theWord.sWord;

            if (Utility.IsAllNum(curWord) || Utility.IsAllChineseNum(curWord))
            {
               string nextWord = pNext.theWord.sWord;
               bool isTimeUnit = (nextWord.Length == 1 && "月日时分秒".IndexOf(nextWord) != -1) ||
                                 (nextWord.Length == 2 && nextWord == "月份");
               bool isYearSuffix = !isTimeUnit && nextWord == "年";

               if (isTimeUnit || (isYearSuffix && IsYearTime(curWord)))
               {
                  // Rules 1 and 2: absorb the unit into the current node and unlink pNext.
                  pCur.theWord.sWord += nextWord;
                  pCur.col = pNext.col;
                  pCur.sWordInSegGraph = "未##时";
                  pCur.theWord.nPOS = -29696; // -('t' * 256)
                  pCur.next = pNext.next;
                  pNext = pCur.next;
                  linkedArray.Count--;
               }
               else if (isYearSuffix)
               {
                  // Rule 2, fallback: followed by "年" but not usable as a year.
                  pCur.sWordInSegGraph = "未##数";
                  pCur.theWord.nPOS = -27904; // -('m' * 256)
               }
               else if (curWord.EndsWith("点"))
               {
                  // Rule 3.
                  pCur.sWordInSegGraph = "未##时";
                  pCur.theWord.nPOS = -29696; // -('t' * 256)
               }
               else
               {
                  string lastChar = curWord[curWord.Length - 1].ToString();

                  // NOTE(review): "∶·././" repeats "./" in this copy of the file; the
                  // original header mentions both full-width and half-width forms, so the
                  // full-width pair may have been lost - confirm against upstream.
                  if ("∶·././".IndexOf(lastChar) == -1)
                  {
                     // Rule 4.
                     pCur.sWordInSegGraph = "未##数";
                     pCur.theWord.nPOS = -27904; // -('m' * 256)
                  }
                  else if (curWord.Length > 1)
                  {
                     // Rule 5: drop the trailing separator; col is left unchanged.
                     pCur.theWord.sWord = curWord.Substring(0, curWord.Length - 1);

                     pCur.sWordInSegGraph = "未##数";
                     pCur.theWord.nPOS = -27904; // -('m' * 256)
                  }
               }
            }

            // pNext always equals pCur.next here. After a merge that consumed the last
            // node it is null, so stop instead of dereferencing it.
            pCur = pNext;
            if (pCur == null)
               break;
            pNext = pCur.next;
         }
      }

      #endregion

      #region IsYearTime Method

      /// <summary>
      /// Decides whether <paramref name="sNum"/> is a number that can stand for a year.
      /// </summary>
      /// <param name="sNum">The numeric word to test (without the trailing "年").</param>
      /// <returns>true when one of the year patterns below matches; otherwise false.</returns>
      private static bool IsYearTime(string sNum)
      {
         int length = sNum.Length;

         // Digit years such as "1992" or "90": four digits, or two digits whose first
         // character appears in "5678956789".
         // NOTE(review): that literal repeats "56789" in this copy of the file; a
         // full-width digit run may have been lost - confirm against upstream.
         if (Utility.IsAllNum(sNum))
         {
            if (length == 4)
               return true;

            if (length == 2 && "5678956789".IndexOf(sNum[0]) != -1)
               return true;
         }

         // Two or more characters with GetCharCount over the Chinese digit set equal to
         // the length (presumably: every character is a Chinese digit - see Utility).
         int chineseDigitCount = Utility.GetCharCount("零○一二三四五六七八九壹贰叁肆伍陆柒捌玖", sNum);
         if (chineseDigitCount == length && length >= 2)
            return true;

         switch (length)
         {
            case 4:
               // e.g. "二仟零二"
               return Utility.GetCharCount("千仟零○", sNum) == 2;

            case 1:
               return Utility.GetCharCount("千仟", sNum) == 1;

            case 2:
               // One character from the first set followed by one from the second.
               return Regex.IsMatch(sNum, "^[甲乙丙丁戊己庚辛壬癸][子丑寅卯辰巳午未申酉戌亥]$");

            default:
               return false;
         }
      }

      #endregion

      #endregion

      #region Events

      /// <summary>
      /// Invokes OnSegmentEvent with this instance as sender; does nothing when it is null.
      /// </summary>
      /// <param name="e">Event data passed through to the handlers.</param>
      private void SendEvents(SegmentEventArgs e)
      {
         if (OnSegmentEvent == null)
            return;

         OnSegmentEvent(this, e);
      }

      /// <summary>
      /// Reports the AtomSegment stage: every node's sWord followed by ", ", then "\r\n".
      /// </summary>
      /// <param name="nodes">Atom nodes whose words are listed in order.</param>
      private void OnAtomSegment(List<AtomNode> nodes)
      {
         StringBuilder text = new StringBuilder();

         foreach (AtomNode node in nodes)
            text.AppendFormat("{0}, ", node.sWord);

         text.Append("\r\n");

         SegmentEventArgs args = new SegmentEventArgs(SegmentStage.AtomSegment, text.ToString());
         SendEvents(args);
      }

      /// <summary>
      /// Reports the GenSegGraph stage with the graph's ToString() output as the message.
      /// </summary>
      /// <param name="segGraph">Segmentation graph to report.</param>
      private void OnGenSegGraph(RowFirstDynamicArray<ChainContent> segGraph)
      {
         string graphText = segGraph.ToString();
         SegmentEventArgs args = new SegmentEventArgs(SegmentStage.GenSegGraph, graphText);
         SendEvents(args);
      }

      /// <summary>
      /// Reports the GenBiSegGraph stage with the graph's ToString() output as the message.
      /// </summary>
      /// <param name="biGraph">Graph to report.</param>
      private void OnGenBiSegGraph(ColumnFirstDynamicArray<ChainContent> biGraph)
      {
         string graphText = biGraph.ToString();
         SegmentEventArgs args = new SegmentEventArgs(SegmentStage.GenBiSegGraph, graphText);
         SendEvents(args);
      }

      /// <summary>
      /// Reports the NShortPath stage: one line per path, each word followed by ", ".
      /// Placeholder words ("未##人", "未##地", "未##数", "未##时", "未##串") are replaced by
      /// the atoms of atomSegment from the item's row up to (not including) its col.
      /// </summary>
      /// <param name="paths">Paths given as indices into the graph's item list.</param>
      /// <param name="segGraph">Graph whose ToListItems() result the indices refer to.</param>
      private void OnNShortPath(List<int[]> paths, RowFirstDynamicArray<ChainContent> segGraph)
      {
         List<ChainItem<ChainContent>> items = segGraph.ToListItems();
         StringBuilder text = new StringBuilder();

         foreach (int[] path in paths)
         {
            foreach (int itemIndex in path)
            {
               ChainItem<ChainContent> item = items[itemIndex];
               string word = item.Content.sWord;

               switch (word)
               {
                  case "未##人":
                  case "未##地":
                  case "未##数":
                  case "未##时":
                  case "未##串":
                     // Spell the placeholder out from the underlying atoms.
                     for (int k = item.row; k < item.col; k++)
                        text.Append(atomSegment[k].sWord);
                     text.Append(", ");
                     break;

                  default:
                     text.AppendFormat("{0}, ", word);
                     break;
               }
            }

            text.Append("\r\n");
         }

         SegmentEventArgs args = new SegmentEventArgs(SegmentStage.NShortPath, text.ToString());
         SendEvents(args);
      }

      /// <summary>
      /// Reports the BeforeOptimize stage: one line per word array, each sWord followed by ", ".
      /// </summary>
      /// <param name="m_pWordSeg">Word arrays to list, one output line per array.</param>
      private void OnBeforeOptimize(List<WordResult[]> m_pWordSeg)
      {
         StringBuilder text = new StringBuilder();

         foreach (WordResult[] words in m_pWordSeg)
         {
            foreach (WordResult word in words)
               text.AppendFormat("{0}, ", word.sWord);

            text.Append("\r\n");
         }

         SegmentEventArgs args = new SegmentEventArgs(SegmentStage.BeforeOptimize, text.ToString());
         SendEvents(args);
      }

      /// <summary>
      /// Reports the OptimumSegment stage with the graph's ToString() output as the message.
      /// </summary>
      /// <param name="m_graphOptimum">Graph to report.</param>
      private void OnOptimumSegment(RowFirstDynamicArray<ChainContent> m_graphOptimum)
      {
         string graphText = m_graphOptimum.ToString();
         SegmentEventArgs args = new SegmentEventArgs(SegmentStage.OptimumSegment, graphText);
         SendEvents(args);
      }

      /// <summary>
      /// Reports the given graph's ToString() output under the GenBiSegGraph stage.
      /// </summary>
      /// <param name="biOptGraph">Graph to report.</param>
      private void OnGenBiOptimumSegGraph(ColumnFirstDynamicArray<ChainContent> biOptGraph)
      {
         // NOTE(review): this reports SegmentStage.GenBiSegGraph, the same stage used by
         // OnGenBiSegGraph - confirm whether a dedicated stage value was intended.
         string graphText = biOptGraph.ToString();
         SegmentEventArgs args = new SegmentEventArgs(SegmentStage.GenBiSegGraph, graphText);
         SendEvents(args);
      }

      #endregion
   }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -