⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 span.cs

📁 只是中科院分词系统的SharpICTCLAS分词系统
💻 CS
📖 第 1 页 / 共 3 页
字号:
                        nLittleFreqCount++;
                     //The counter increase
                     sPersonName += m_sWords[nPos];
                     nPos += 1;
                  }
                  /*
                  if(IsAllForeign(sPersonName)&&personDict.GetFrequency(m_sWords[j],1)<LITTLE_FREQUENCY)
                  {//Exclusion foreign name
                  //Rule 2 for exclusion:若均为外国人名用字 规则(名1+名2)失效
                  j+=nPatternLen[k]-1;
                  continue;
                  }
                   */
                  if (string.Compare(sPatterns[k], "CDCD") == 0)
                  {
                     //Rule for exclusion
                     //规则(名1+名2+名1+名2)本身是排除规则:女高音歌唱家迪里拜尔演唱
                     //Rule 3 for exclusion:含外国人名用字 规则适用
                     //否则,排除规则失效:黑妞白妞姐俩拔了头筹。
                     if (Utility.GetForeignCharCount(sPersonName) > 0)
                        j += nPatternLen[k] - 1;
                     continue;
                  }
                  /*
                  if(strcmp(sPatterns[k],"CD")==0&&IsAllForeign(sPersonName))
                  {//
                  j+=nPatternLen[k]-1;
                  continue;
                  }
                  if(nLittleFreqCount==nPatternLen[k]||nLittleFreqCount==3)
                  //马哈蒂尔;小扎耶德与他的中国阿姨胡彩玲受华黎明大使之邀,
                  //The all roles appear with two lower frequecy,we will ignore them
                  continue;
                   */
                  m_nUnknownWords[m_nUnknownWordsCount, 0] = m_nWordPosition[j];
                  m_nUnknownWords[m_nUnknownWordsCount, 1] = m_nWordPosition[j + nPatternLen[k]];
                  m_dWordsPossibility[m_nUnknownWordsCount] = -Math.Log(dFactor[k]) + ComputePossibility(j, nPatternLen[k], personDict);
                  //Mutiply the factor 
                  m_nUnknownWordsCount += 1;
                  j += nPatternLen[k];
                  bMatched = true;
               }
            }
            if (!bMatched)
               //Not matched, add j by 1
               j += 1;
         }
         return true;
      }

      #endregion

      #region GuessPOS Method

      //Guess the candidate POS tags for the word at index nIndex
      /// <summary>
      /// Fills in guessed candidate tags (and a frequency-derived weight for each) for the
      /// word at <paramref name="nIndex"/>, for which no tag is otherwise known.
      /// Which candidates are produced depends on the current <c>m_tagType</c>, on the length
      /// of the word and, for single-character words, on its character type.
      /// The numeric tags are role codes whose meaning is defined outside this method.
      /// </summary>
      /// <param name="nIndex">Index of the word in <c>m_sWords</c>; also the row written in
      /// <c>m_nTags</c> and <c>m_dFrequency</c>.</param>
      /// <param name="pSubIndex">Receives the number of candidate tags written for the word
      /// (0 when <c>m_tagType</c> is not handled).</param>
      /// <returns>Always <c>true</c>.</returns>
      private bool GuessPOS(int nIndex, out int pSubIndex)
      {
         int j = 0; // next free candidate slot for this word
         int nLen, nCharType;
         string sWord;

         switch (m_tagType)
         {
            case TAG_TYPE.TT_NORMAL:
               // A word without any known POS is treated as a plain string ("x"), frequency 0.
               m_nTags[nIndex, j] = Utility.GetPOSValue("x");
               m_dFrequency[nIndex, j++] = 0;
               break;

            case TAG_TYPE.TT_PERSON:
               sWord = m_sWords[nIndex];
               // Ordinal search: a culture-sensitive IndexOf can report a match at index 0 for
               // strings made only of ignorable characters, which would mis-tag the word as 6.
               if ("××".IndexOf(sWord, System.StringComparison.Ordinal) != -1)
               {
                  AppendGuessedTag(nIndex, ref j, 6, 1, 1);
               }
               else
               {
                  AppendGuessedTag(nIndex, ref j, 0, 1, 1);
                  nLen = sWord.Length;
                  if (nLen >= 2)
                  {
                     // NOTE(review): tag 0 is appended a second time here. It looks redundant,
                     // but removing it changes pSubIndex, so it is kept for compatibility.
                     AppendGuessedTag(nIndex, ref j, 0, 1, 1);
                     AppendGuessedTag(nIndex, ref j, 11, 8, 0);
                     AppendGuessedTag(nIndex, ref j, 12, 8, 0);
                     AppendGuessedTag(nIndex, ref j, 13, 8, 0);
                  }
                  else if (nLen == 1)
                  {
                     // NOTE(review): second tag 0 entry, kept for compatibility (see above).
                     AppendGuessedTag(nIndex, ref j, 0, 1, 1);
                     nCharType = Utility.charType(sWord[0]);
                     if (nCharType == Predefine.CT_OTHER || nCharType == Predefine.CT_CHINESE)
                     {
                        AppendGuessedTag(nIndex, ref j, 1, 1, 1);
                        AppendGuessedTag(nIndex, ref j, 2, 1, 1);
                        AppendGuessedTag(nIndex, ref j, 3, 1, 1);
                        AppendGuessedTag(nIndex, ref j, 4, 1, 1);
                     }
                     AppendGuessedTag(nIndex, ref j, 11, 8, 0);
                     AppendGuessedTag(nIndex, ref j, 12, 8, 0);
                     AppendGuessedTag(nIndex, ref j, 13, 8, 0);
                  }
               }
               break;

            case TAG_TYPE.TT_PLACE:
               sWord = m_sWords[nIndex];
               AppendGuessedTag(nIndex, ref j, 0, 1, 1);
               nLen = sWord.Length;
               if (nLen >= 2)
               {
                  AppendGuessedTag(nIndex, ref j, 11, 8, 0);
                  AppendGuessedTag(nIndex, ref j, 12, 8, 0);
                  AppendGuessedTag(nIndex, ref j, 13, 8, 0);
               }
               else if (nLen == 1)
               {
                  // NOTE(review): tag 0 is appended a second time; kept for compatibility
                  // because it affects pSubIndex.
                  AppendGuessedTag(nIndex, ref j, 0, 1, 1);
                  nCharType = Utility.charType(sWord[0]);
                  if (nCharType == Predefine.CT_OTHER || nCharType == Predefine.CT_CHINESE)
                  {
                     AppendGuessedTag(nIndex, ref j, 1, 1, 1);
                     AppendGuessedTag(nIndex, ref j, 2, 1, 1);
                     AppendGuessedTag(nIndex, ref j, 3, 1, 1);
                     AppendGuessedTag(nIndex, ref j, 4, 1, 1);
                  }
                  AppendGuessedTag(nIndex, ref j, 11, 8, 0);
                  AppendGuessedTag(nIndex, ref j, 12, 8, 0);
                  AppendGuessedTag(nIndex, ref j, 13, 8, 0);
               }
               break;

            case TAG_TYPE.TT_TRANS_PERSON:
               sWord = m_sWords[nIndex];
               nLen = sWord.Length;

               AppendGuessedTag(nIndex, ref j, 0, 1, 1);

               if (!Utility.IsAllChinese(sWord))
               {
                  if (Utility.IsAllLetter(sWord))
                  {
                     AppendGuessedTag(nIndex, ref j, 1, 1, 1);
                     AppendGuessedTag(nIndex, ref j, 11, 1, 1);
                     AppendGuessedTag(nIndex, ref j, 2, 2, 1);
                     AppendGuessedTag(nIndex, ref j, 3, 2, 1);
                     AppendGuessedTag(nIndex, ref j, 12, 2, 1);
                     AppendGuessedTag(nIndex, ref j, 13, 2, 1);
                  }
                  AppendGuessedTag(nIndex, ref j, 41, 8, 0);
                  AppendGuessedTag(nIndex, ref j, 42, 8, 0);
                  AppendGuessedTag(nIndex, ref j, 43, 8, 0);
               }
               else if (nLen >= 2)
               {
                  AppendGuessedTag(nIndex, ref j, 41, 8, 0);
                  AppendGuessedTag(nIndex, ref j, 42, 8, 0);
                  AppendGuessedTag(nIndex, ref j, 43, 8, 0);
               }
               else if (nLen == 1)
               {
                  nCharType = Utility.charType(sWord[0]);
                  if (nCharType == Predefine.CT_OTHER || nCharType == Predefine.CT_CHINESE)
                  {
                     AppendGuessedTag(nIndex, ref j, 1, 2, 1);
                     AppendGuessedTag(nIndex, ref j, 2, 2, 1);
                     AppendGuessedTag(nIndex, ref j, 3, 2, 1);
                     AppendGuessedTag(nIndex, ref j, 30, 8, 1);
                     AppendGuessedTag(nIndex, ref j, 11, 4, 1);
                     AppendGuessedTag(nIndex, ref j, 12, 4, 1);
                     AppendGuessedTag(nIndex, ref j, 13, 4, 1);
                     AppendGuessedTag(nIndex, ref j, 21, 2, 1);
                     AppendGuessedTag(nIndex, ref j, 22, 2, 1);
                     AppendGuessedTag(nIndex, ref j, 23, 2, 1);
                  }
                  AppendGuessedTag(nIndex, ref j, 41, 8, 0);
                  AppendGuessedTag(nIndex, ref j, 42, 8, 0);
                  AppendGuessedTag(nIndex, ref j, 43, 8, 0);
               }
               break;

            default:
               break;
         }

         pSubIndex = j;
         return true;
      }

      /// <summary>
      /// Writes one guessed candidate into slot <paramref name="nSlot"/> of word
      /// <paramref name="nWord"/> and advances the slot. The stored weight is
      /// <c>1.0 / (m_context.GetFrequency(0, nTag) * nScale + nOffset)</c>.
      /// </summary>
      /// <param name="nWord">Row in <c>m_nTags</c> / <c>m_dFrequency</c>.</param>
      /// <param name="nSlot">Column to write; incremented after the write.</param>
      /// <param name="nTag">Tag value to store and to look up in <c>m_context</c>.</param>
      /// <param name="nScale">Multiplier applied to the context frequency.</param>
      /// <param name="nOffset">Value added to the scaled frequency.
      /// NOTE(review): with an offset of 0 a zero frequency makes the divisor 0, so the
      /// stored weight is not finite; confirm callers tolerate that.</param>
      private void AppendGuessedTag(int nWord, ref int nSlot, int nTag, int nScale, int nOffset)
      {
         m_nTags[nWord, nSlot] = nTag;
         m_dFrequency[nWord, nSlot] = 1.0 / (m_context.GetFrequency(0, nTag) * nScale + nOffset);
         nSlot++;
      }

      #endregion

      #region ComputePossibility Method

      /// <summary>
      /// Sums a log-ratio score over the words in
      /// [<paramref name="nStartPos"/>, <paramref name="nStartPos"/> + <paramref name="nLength"/>).
      /// For every word the term added is
      /// <c>log(contextFrequency(bestTag) + 1) - log(dictFrequency(word, bestTag) + 1)</c>,
      /// where the best tag comes from <c>m_nBestTag</c>.
      /// No tag-to-tag (context transition) term is included.
      /// </summary>
      /// <param name="nStartPos">Index of the first word to score.</param>
      /// <param name="nLength">Number of consecutive words to score.</param>
      /// <param name="dict">Dictionary queried for the frequency of each word under its best tag.</param>
      /// <returns>The accumulated score; 0 when <paramref name="nLength"/> is not positive.</returns>
      private double ComputePossibility(int nStartPos, int nLength, WordDictionary dict)
      {
         double total = 0;
         int stop = nStartPos + nLength;

         for (int pos = nStartPos; pos < stop; pos++)
         {
            // How often this word carries its best tag according to the dictionary.
            int wordFreq = dict.GetFrequency(m_sWords[pos], m_nBestTag[pos]);

            // How often that tag occurs overall in the context statistics (+1 smoothing on both).
            double logTagCount = Math.Log((double)(m_context.GetFrequency(0, m_nBestTag[pos]) + 1));
            double logWordCount = Math.Log((double)(wordFreq + 1));

            total += logTagCount - logWordCount;
         }

         return total;
      }

      #endregion

      #region PlaceRecognize Method

      public bool PlaceRecognize(WordDictionary dictCore, WordDictionary placeDict)
      {
         int nStart = 1, nEnd = 1, i = 1, nTemp;
         double dPanelty = 1.0; //Panelty value
         while (m_nBestTag[i] > -1)
         {
            if (m_nBestTag[i] == 1)
            //1 Trigger the recognition procession
            {
               nStart = i;
               nEnd = nStart + 1;
               //=========== by zhenyulu: 此处nEnd = nStart + 1;有些强迫之嫌,因此后面处理了一下
               while (m_nBestTag[nEnd] == 1)
               //
               {
                  if (nEnd > nStart + 1)
                     dPanelty += 1.0;
                  nEnd++;
               }
               while (m_nBestTag[nEnd] == 2)
                  //2,12,22
                  nEnd++;
               nTemp = nEnd;
               while (m_nBestTag[nEnd] == 3)
               {
                  if (nEnd > nTemp)
                     dPanelty += 1.0;
                  nEnd++;
               }
            }
            else if (m_nBestTag[i] == 2)
            //1,11,21 Trigger the recognition
            {
               dPanelty += 1.0;
               nStart = i;
               nEnd = nStart + 1;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -