📄 span.cs

📁 只是中科院分词系统的SharpICTCLAS分词系统
💻 CS
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
/***********************************************************************************
 * ICTCLAS简介：计算所汉语词法分析系统ICTCLAS
 *              Institute of Computing Technology, Chinese Lexical Analysis System
 *              功能有：中文分词；词性标注；未登录词识别。
 *              分词正确率高达97.58%(973专家评测结果)，
 *              未登录词识别召回率均高于90%，其中中国人名的识别召回率接近98%;
 *              处理速度为31.5Kbytes/s。
 * 著作权：  Copyright(c)2002-2005中科院计算所 职务著作权人：张华平
 * 遵循协议：自然语言处理开放资源许可证1.0
 * Email: zhanghp@software.ict.ac.cn
 * Homepage:www.i3s.ac.cn
 * 
 *----------------------------------------------------------------------------------
 * 
 * Copyright (c) 2000, 2001
 *     Institute of Computing Tech.
 *     Chinese Academy of Sciences
 *     All rights reserved.
 *
 * This file is the confidential and proprietary property of
 * Institute of Computing Tech. and the possession or use of this file requires
 * a written license from the author.
 * Author:   Kevin Zhang
 *          (zhanghp@software.ict.ac.cn)
 * 
 *----------------------------------------------------------------------------------
 * 
 * SharpICTCLAS：.net平台下的ICTCLAS
 *               是由河北理工大学经管学院吕震宇根据Free版ICTCLAS改编而成，
 *               并对原有代码做了部分重写与调整
 * 
 * Email: zhenyulu@163.com
 * Blog: http://www.cnblogs.com/zhenyulu
 * 
 ***********************************************************************************/
using System;
using System.Collections.Generic;
using System.Text;

namespace SharpICTCLAS
{
   public class Span
   {
      #region Public Fields

      // Number of unknown words recorded so far; reset to 0 by the
      // constructor and by Reset(false).
      public int m_nUnknownWordsCount;
      // Start and end position of each unknown word: one row per unknown
      // word, two columns (presumably [n, 0] = start and [n, 1] = end — TODO confirm).
      public int[,] m_nUnknownWords = new int[Predefine.MAX_UNKNOWN_PER_SENTENCE, 2];
      // Possibility value of each unknown word, one entry per unknown word.
      public double[] m_dWordsPossibility = new double[Predefine.MAX_UNKNOWN_PER_SENTENCE];
      // Context statistics; loaded by LoadContext and queried by Disamb
      // for the possibility of one tag following another.
      public ContextStat m_context = new ContextStat();

      #endregion

      #region Private Fields

      // The type of tagging; selects the begin tag written to m_nTags[0, 0]
      // (0 for TT_NORMAL, 100 otherwise).
      private TAG_TYPE m_tagType;
      // Start position copied into m_nWordPosition[1] on every Reset.
      private int m_nStartPos;

      private int[] m_nBestTag = new int[Predefine.MAX_WORDS_PER_SENTENCE];
      // Best tag chosen for each word by GetBestPOS; the sequence is
      // terminated by a -1 entry.

      // Word text per position; a null entry marks a virtual (non-word) slot.
      private string[] m_sWords = new string[Predefine.MAX_WORDS_PER_SENTENCE];
      // Position of each word (presumably an offset into the sentence — TODO confirm).
      private int[] m_nWordPosition = new int[Predefine.MAX_WORDS_PER_SENTENCE];
      // Candidate tags per word; each row is terminated by a negative value.
      private int[,] m_nTags = new int[Predefine.MAX_WORDS_PER_SENTENCE, Predefine.MAX_POS_PER_WORD];
      // For each (word, tag) pair: index of the best tag of the previous word, filled by Disamb.
      private int[,] m_nBestPrev = new int[Predefine.MAX_WORDS_PER_SENTENCE, Predefine.MAX_POS_PER_WORD];
      // For each (word, tag) pair: the fee (cost); Disamb adds the cheapest
      // fee of the path leading to the pair.
      private double[,] m_dFrequency = new double[Predefine.MAX_WORDS_PER_SENTENCE, Predefine.MAX_POS_PER_WORD];
      // Number of slots currently in use, including the begin slot 0
      // (so it is 1 right after construction or Reset).
      private int m_nCurLength;

      #endregion

      #region 构造函数

      /// <summary>
      /// Creates a span in its initial state: normal tagging type, a single
      /// begin slot (index 0) and no unknown words.
      /// </summary>
      public Span()
      {
         // Decide the tagging type FIRST. The begin tag depends on it, and
         // reading m_tagType before this assignment would only see the
         // enum's zero value, which is not guaranteed to be TT_NORMAL.
         m_tagType = TAG_TYPE.TT_NORMAL; //Default tagging type

         // Begin tag for normal tagging (the same value Reset(false) writes
         // when m_tagType is TT_NORMAL).
         m_nTags[0, 0] = 0;
         // A negative tag terminates the candidate list of slot 0.
         m_nTags[0, 1] = -1;
         m_dFrequency[0, 0] = 0;

         m_nCurLength = 1; // only the begin slot is in use
         m_nUnknownWordsCount = 0;
         m_nStartPos = 0;
         m_nWordPosition[1] = 0;
         m_sWords[0] = null; // the begin slot carries no word text
      }

      #endregion

      #region Disamb Method

      /// <summary>
      /// Forward pass of tag disambiguation. For every candidate tag of every
      /// word (slots 1 .. m_nCurLength - 1) it finds the candidate tag of the
      /// previous word with the smallest accumulated fee, stores that tag's
      /// index in m_nBestPrev and adds the fee to m_dFrequency.
      /// </summary>
      /// <returns>Always true.</returns>
      private bool Disamb()
      {
         int i, j, k, nMinCandidate;
         double dMinFee = 0, dTmp;
         bool bHasCandidate;

         for (i = 1; i < m_nCurLength; i++)
         //For every word
         {
            // The scan stops at the negative end flag; the explicit bound
            // keeps it inside the array when a row has no end flag.
            for (j = 0; j < Predefine.MAX_POS_PER_WORD && m_nTags[i, j] >= 0; j++)
            //For every candidate tag of the word
            {
               // Sentinel stored when the previous word has no candidate at all.
               nMinCandidate = Predefine.MAX_POS_PER_WORD + 1;
               bHasCandidate = false;

               for (k = 0; k < Predefine.MAX_POS_PER_WORD && m_nTags[i - 1, k] >= 0; k++)
               {
                  // Transition fee = -log(possibility of tag j following tag k) ...
                  dTmp = -Math.Log(m_context.GetContextPossibility(0, m_nTags[i - 1, k], m_nTags[i, j]));
                  // ... plus the fee already accumulated on the previous tag.
                  dTmp += m_dFrequency[i - 1, k];

                  // The first candidate is always taken; later ones only when
                  // strictly cheaper. An explicit flag is used instead of
                  // comparing nMinCandidate with a fixed number, so the choice
                  // does not depend on the value of MAX_POS_PER_WORD or on how
                  // large the chosen index k is.
                  if (!bHasCandidate || dTmp < dMinFee)
                  {
                     bHasCandidate = true;
                     nMinCandidate = k;
                     dMinFee = dTmp;
                  }
               }

               m_nBestPrev[i, j] = nMinCandidate; //The best previous for j
               m_dFrequency[i, j] = m_dFrequency[i, j] + dMinFee;
            }
         }
         return true;
      }

      #endregion

      #region Reset Method

      /// <summary>
      /// Resets the span in "continue" mode, i.e. the same as Reset(true).
      /// </summary>
      /// <returns>The result of Reset(true).</returns>
      private bool Reset()
      {
         const bool bContinueByDefault = true;
         return Reset(bContinueByDefault);
      }

      /// <summary>
      /// Re-initialises the span so that only the begin slot (index 0) is in use.
      /// </summary>
      /// <param name="bContinue">
      /// true: the begin slot inherits the first tag and fee of the last slot
      /// that was in use; false: the begin slot gets the default begin tag for
      /// the current tagging type, and the unknown-word count, fee and start
      /// position are cleared.
      /// </param>
      /// <returns>Always true.</returns>
      private bool Reset(bool bContinue)
      {
         if (bContinue)
         {
            // Carry over the tag and fee of the last used slot.
            int nLast = m_nCurLength - 1;
            m_nTags[0, 0] = m_nTags[nLast, 0];
            m_dFrequency[0, 0] = m_dFrequency[nLast, 0];
         }
         else
         {
            // Fresh start: begin tag is 0 for normal tagging, 100 otherwise.
            m_nTags[0, 0] = (m_tagType == TAG_TYPE.TT_NORMAL) ? 0 : 100;
            m_nUnknownWordsCount = 0;
            m_dFrequency[0, 0] = 0;
            m_nStartPos = 0;
         }

         // -1 is the end flag of the begin slot's candidate list.
         m_nTags[0, 1] = -1;
         m_nCurLength = 1;
         m_nWordPosition[1] = m_nStartPos;
         m_sWords[0] = null;
         return true;
      }

      #endregion

      #region LoadContext Method

      /// <summary>
      /// Loads the context statistics held in m_context from the given file.
      /// </summary>
      /// <param name="sFilename">File name passed unchanged to m_context.Load.</param>
      /// <returns>Whatever m_context.Load returns.</returns>
      public bool LoadContext(string sFilename)
      {
         bool bLoaded = m_context.Load(sFilename);
         return bLoaded;
      }

      #endregion

      #region GetBestPOS Method

      /// <summary>
      /// Runs Disamb and then walks backwards from the last used slot along the
      /// m_nBestPrev links, writing the chosen tag of every real word into
      /// m_nBestTag. The tag sequence is closed with a -1 entry.
      /// </summary>
      /// <returns>Always true.</returns>
      private bool GetBestPOS()
      {
         Disamb();

         // Backtrace: start at tag index 0 of the last slot and follow the
         // best-previous links down to slot 1.
         int nTagIndex = 0;
         int nWordIndex = m_nCurLength - 1;
         while (nWordIndex > 0)
         {
            // Slots without word text are virtual and get no tag recorded.
            if (m_sWords[nWordIndex] != null)
            {
               m_nBestTag[nWordIndex] = m_nTags[nWordIndex, nTagIndex];
            }
            nTagIndex = m_nBestPrev[nWordIndex, nTagIndex];
            nWordIndex--;
         }

         // Place the -1 terminator on the virtual last slot when there is
         // one, otherwise just past the last slot.
         int nLast = m_nCurLength - 1;
         int nTerminator = (m_sWords[nLast] == null) ? nLast : m_nCurLength;
         m_nBestTag[nTerminator] = -1;
         return true;
      }

      #endregion

      #region PersonRecognize Method

      public bool PersonRecognize(WordDictionary personDict)
      {
         StringBuilder sb = new StringBuilder();

         int i;
         string sPOS = "z", sPersonName;
         string[] sPatterns = { "BBCD", "BBC", "BBE", "BBZ", "BCD", "BEE", "BE", "BG", "BXD", "BZ", "CDCD", "CD", "EE", "FB", "Y", "XD", "" };
         double[] dFactor =   { 0.003606, 0.000021, 0.001314, 0.000315, 0.656624, 0.000021, 0.146116, 0.009136, 
            0.000042, 0.038971, 0, 0.090367, 0.000273, 0.009157, 0.034324, 0.009735, 0 };

         /*------------------------------------
         About parameter:
         
         BBCD  343      0.003606
         BBC   2        0.000021
         BBE   125      0.001314
         BBZ   30       0.000315
         BCD   62460    0.656624
         BEE   0        0.000000
         BE    13899    0.146116
         BG    869      0.009136
         BXD   4        0.000042
         BZ    3707     0.038971
         CD    8596     0.090367
         EE    26       0.000273
         FB    871      0.009157
         Y     3265     0.034324
         XD    926      0.009735

         The person recognition patterns set
         BBCD:姓+姓+名1+名2;
         BBE: 姓+姓+单名;
         BBZ: 姓+姓+双名成词;
         BCD: 姓+名1+名2;
         BE:  姓+单名;
         BEE: 姓+单名+单名;韩磊磊
         BG:  姓+后缀
         BXD: 姓+姓双名首字成词+双名末字
         BZ:  姓+双名成词;
         B:   姓
         CD:  名1+名2;
         EE:  单名+单名;
         FB:  前缀+姓
         XD:  姓双名首字成词+双名末字
         Y:   姓单名成词
         ------------------------------------*/

         int[] nPatternLen = { 4, 3, 3, 3, 3, 3, 2, 2, 3, 2, 4, 2, 2, 2, 1, 2, 0 };

         //Convert to string from POS
         sb.Append('z');
         for (i = 1; m_nBestTag[i] > -1; i++)
            sb.Append(Convert.ToChar(m_nBestTag[i] + Convert.ToInt32('A')));

         sPOS = sb.ToString();

         int j = 1, k, nPos; //Find the proper pattern from the first POS
         int nLittleFreqCount; //Counter for the person name role with little frequecy
         bool bMatched = false;
         while (j < i)
         {
            bMatched = false;
            for (k = 0; !bMatched && nPatternLen[k] > 0; k++)
            {
               if (string.Compare(sPatterns[k], 0, sPOS, j, nPatternLen[k]) == 0 &&
                  string.Compare(m_sWords[j - 1], "·") != 0 && string.Compare(m_sWords[j + nPatternLen[k]], "·") != 0)
               {
                  //Find the proper pattern k
                  if (string.Compare(sPatterns[k], "FB") == 0 && (sPOS[j + 2] == 'E' || sPOS[j + 2] == 'C' || sPOS[j + 2] == 'G'))
                  {
                     //Rule 1 for exclusion:前缀+姓+名1(名2): 规则(前缀+姓)失效；
                     continue;
                  }

                  /*			
                  if((strcmp(sPatterns[k],"BEE")==0||strcmp(sPatterns[k],"EE")==0)&&strcmp(m_sWords[j+nPatternLen[k]-1],m_sWords[j+nPatternLen[k]-2])!=0)
                  {//Rule 2 for exclusion:姓+单名+单名:单名+单名 若EE对应的字不同，规则失效.如：韩磊磊
                  continue;
                  }

                  if(strcmp(sPatterns[k],"B")==0&&m_nBestTag[j+1]!=12)
                  {//Rule 3 for exclusion: 若姓后不是后缀，规则失效.如：江主席、刘大娘
                  continue;
                  }
                   */
                  //Get the possible name

                  nPos = j; //Record the person position in the tag sequence
                  sPersonName = null;
                  nLittleFreqCount = 0; //Record the number of role with little frequency
                  while (nPos < j + nPatternLen[k])
                  {
                     //Get the possible person name
                     //
                     if (m_nBestTag[nPos] < 4 && personDict.GetFrequency(m_sWords[nPos], m_nBestTag[nPos]) < Predefine.LITTLE_FREQUENCY)
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -