⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 worddictionary.cs

📁 只是中科院分词系统的SharpICTCLAS分词系统
💻 CS
📖 第 1 页 / 共 3 页
字号:
                  indexTable[i].WordItems[j].nFrequency = (nRatio * indexTable[i].WordItems[j].nFrequency + dict2.indexTable[i].WordItems[k].nFrequency) / (nRatio + 1);
                  j += 1;
                  k += 1;
               }
               //Get next word in the current dictionary
               else if (nCmpValue < 0)
               {
                  indexTable[i].WordItems[j].nFrequency = (nRatio * indexTable[i].WordItems[j].nFrequency) / (nRatio + 1);
                  j += 1;
               }
               else
               //Get next word in the second dictionary
               {
                  if (dict2.indexTable[i].WordItems[k].nFrequency > (nRatio + 1) / 10)
                  {
                     sWord = string.Format("{0}{1}", Utility.CC_ID2Char(i).ToString(), dict2.indexTable[i].WordItems[k].sWord);
                     AddItem(sWord, dict2.indexTable[i].WordItems[k].nPOS, dict2.indexTable[i].WordItems[k].nFrequency / (nRatio + 1));
                  }
                  k += 1;
               }
            }

            //words in current dictionary are left
            while (j < indexTable[i].nCount)
            {
               indexTable[i].WordItems[j].nFrequency = (nRatio * indexTable[i].WordItems[j].nFrequency) / (nRatio + 1);
               j += 1;
            }

            //words in Dict2 are left
            while (k < dict2.indexTable[i].nCount)
            {
               if (dict2.indexTable[i].WordItems[k].nFrequency > (nRatio + 1) / 10)
               {
                  sWord = string.Format("{0}{1}", Utility.CC_ID2Char(i).ToString(), dict2.indexTable[i].WordItems[k].sWord);
                  AddItem(sWord, dict2.indexTable[i].WordItems[k].nPOS, dict2.indexTable[i].WordItems[k].nFrequency / (nRatio + 1));
               }
               k += 1;
            }
         }
         return true;
      }

      #endregion

      #region Optimum Method

      //====================================================================
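      //Delete a word item when either
      //(1)its POS is 'x', 'g' or 'qg' (30720, 26368, 29031), or
      //(2)its frequency is 0 and the following item is the same word whose POS,
      //   with the low byte cleared, equals this item's POS (this POS is the parent set)
      //for example "江泽民/n/0" will be deleted, because "江泽民/nr/0" is more detailed and correct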
      //====================================================================
      /// <summary>
      /// Walks every slot of the original table and deletes entries that are made
      /// redundant by the entry that follows them in the same slot.
      /// </summary>
      /// <remarks>
      /// An entry is deleted when its successor is visited, if its POS is 30720 ('x'),
      /// 26368 ('g') or 29031 ('qg'), or if its frequency is 0 and the successor is the same
      /// word whose POS with the low byte cleared equals this entry's POS.
      /// NOTE(review): the last entry of a slot has no successor and is therefore never
      /// tested - confirm this is intended.
      /// </remarks>
      /// <returns>Always true.</returns>
      public bool Optimum()
      {
         for (int nId = 0; nId < Predefine.CC_NUM; nId++)
         {
            //Snapshot of the entry seen on the previous pass; reset for every slot
            string sLastWord = null;
            int nLastPOS = 0;
            int nLastFreq = -1;

            //nCount is re-read on every pass because DelItem runs inside the loop
            for (int nItem = 0; nItem < indexTable[nId].nCount; nItem++)
            {
               string sThisWord = string.Format("{0}{1}", Utility.CC_ID2Char(nId).ToString(), indexTable[nId].WordItems[nItem].sWord);

               //POS 'x', 'g' and 'qg' are always dropped
               bool bDropLast = nLastPOS == 30720 || nLastPOS == 26368 || nLastPOS == 29031;
               if (!bDropLast)
               {
                  //Zero-frequency entry whose POS is the parent of the current entry's POS
                  bDropLast = sLastWord == sThisWord
                     && nLastFreq == 0
                     && (indexTable[nId].WordItems[nItem].nPOS / 256) * 256 == nLastPOS;
               }

               if (bDropLast)
               {
                  DelItem(sLastWord, nLastPOS);
               }

               //Read the current entry only after DelItem, exactly as the table now stands
               sLastWord = sThisWord;
               nLastPOS = indexTable[nId].WordItems[nItem].nPOS;
               nLastFreq = indexTable[nId].WordItems[nItem].nFrequency;
            }
         }
         return true;
      }

      #endregion

      #region Private Functions

      #region PreProcessing Method

      //====================================================================
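      // Func Name  : PreProcessing
      // Description: Trim the word and split it into a first-character index and the text to store
      // Parameters : sWord: the word (trimmed in place)
      //              nId: index derived from the first character (3755 for delimiters)
      //              sWordRet: the text stored under that index
      // Returns    : true for Chinese words and delimiters, false otherwise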
      //====================================================================
      /// <summary>
      /// Trims <paramref name="sWord"/> and splits it into the table index of its first
      /// character and the text that is stored under that index.
      /// </summary>
      /// <param name="sWord">Word to classify; replaced by its trimmed form unless it is null.</param>
      /// <param name="nId">The CC_ID of the first character for Chinese words, 3755 for delimiters, 0 on failure.</param>
      /// <param name="sWordRet">The word without its first character for Chinese words, the whole word for delimiters, "" on failure.</param>
      /// <returns>true when the word is Chinese or a delimiter; false for null, empty, blank or any other input.</returns>
      private bool PreProcessing(ref string sWord, out int nId, out string sWordRet)
      {
         //Null cannot be trimmed; treat it as invalid input instead of throwing
         if (sWord != null)
         {
            sWord = sWord.Trim();

            //The length must be checked BEFORE the first character is read,
            //otherwise an empty or blank word raises IndexOutOfRangeException
            if (sWord.Length != 0)
            {
               int nType = Utility.charType(sWord[0]);

               //Chinese word
               if (nType == Predefine.CT_CHINESE)
               {
                  //Get the inner code of the first Chinese Char
                  byte[] byteArray = Utility.String2ByteArray(sWord);
                  nId = Utility.CC_ID(byteArray[0], byteArray[1]);

                  //store the word without its first Chinese Char
                  sWordRet = sWord.Substring(1);
                  return true;
               }

               //Delimiter
               if (nType == Predefine.CT_DELIMITER)
               {
                  //All delimiters share the fixed index 3755 and are stored whole
                  nId = 3755;
                  sWordRet = sWord;
                  return true;
               }
            }
         }

         nId = 0;
         sWordRet = "";
         return false; //other invalid
      }

      #endregion

      #region FindInOriginalTable Method

      //====================================================================
      // Func Name  : FindInOriginalTable
      // Description: binary-search the original table for the word with the given POS
      // Parameters : nInnerCode: the inner code of the first Chinese char
      //              sWord: the word
      //              nPOS: the POS to match, or -1 to match any POS
      //              nPosRet: the matched position; when nothing matches, the midpoint
      //                       of the final search bounds minus one
      // Returns    : success or fail
      //====================================================================
      /// <summary>
      /// Binary-searches slot <paramref name="nInnerCode"/> of the original table for
      /// <paramref name="sWord"/> with the requested POS.
      /// </summary>
      /// <param name="nInnerCode">Index of the slot in indexTable to search.</param>
      /// <param name="sWord">Word text to look for.</param>
      /// <param name="nPOS">POS that must match exactly, or -1 to accept any POS.</param>
      /// <param name="nPosRet">
      /// On success the matched index (the first entry of the run holding this word when
      /// nPOS is -1). On failure the midpoint recomputed from the final bounds, minus one.
      /// </param>
      /// <returns>true when a matching entry exists.</returns>
      private bool FindInOriginalTable(int nInnerCode, string sWord, int nPOS, out int nPosRet)
      {
         WordItem[] pItems = indexTable[nInnerCode].WordItems;

         int nStart = 0, nEnd = indexTable[nInnerCode].nCount - 1;
         int nMid = (nStart + nEnd) / 2, nCmpValue;

         //Binary search
         while (nStart <= nEnd)
         {
            nCmpValue = Utility.CCStringCompare(pItems[nMid].sWord, sWord);
            if (nCmpValue == 0 && (pItems[nMid].nPOS == nPOS || nPOS == -1))
            {
               if (nPOS == -1)
               {
                  //Not very strict match: step back to the first entry holding this word.
                  //The scan uses the same comparer as the search itself, so it stops
                  //exactly at the boundary of the run the search landed in.
                  while (nMid > 0 && Utility.CCStringCompare(pItems[nMid - 1].sWord, sWord) == 0)
                     nMid--;
               }
               nPosRet = nMid;
               return true;//find it
            }
            else if (nCmpValue < 0 || (nCmpValue == 0 && pItems[nMid].nPOS < nPOS && nPOS != -1))
            {
               nStart = nMid + 1;
            }
            else
            {
               //Only remaining case: the entry sorts after the target (by word, or by POS on a tie)
               nEnd = nMid - 1;
            }
            nMid = (nStart + nEnd) / 2;
         }

         //Get the previous position
         nPosRet = nMid - 1;
         return false;
      }

      //====================================================================
      // Func Name  : FindInOriginalTable
      // Description: binary-search the original table for the word with the given POS
      // Parameters : nInnerCode: the inner code of the first Chinese char
      //              sWord: the word
      //              nPOS: the POS to match, or -1 to match any POS
      // Returns    : success or fail
      //====================================================================
      /// <summary>
      /// Reports whether slot <paramref name="nInnerCode"/> of the original table holds
      /// <paramref name="sWord"/> with the requested POS.
      /// </summary>
      /// <param name="nInnerCode">Index of the slot in indexTable to search.</param>
      /// <param name="sWord">Word text to look for.</param>
      /// <param name="nPOS">POS that must match exactly, or -1 to accept any POS.</param>
      /// <returns>true when a matching entry exists.</returns>
      private bool FindInOriginalTable(int nInnerCode, string sWord, int nPOS)
      {
         WordItem[] pEntries = indexTable[nInnerCode].WordItems;
         int nLow = 0;
         int nHigh = indexTable[nInnerCode].nCount - 1;

         //Binary search over the closed range [nLow, nHigh]
         while (nLow <= nHigh)
         {
            int nProbe = (nLow + nHigh) / 2;

            //Order by word first; an exact POS request breaks ties by POS
            int nOrder = Utility.CCStringCompare(pEntries[nProbe].sWord, sWord);
            if (nOrder == 0 && nPOS != -1)
            {
               if (pEntries[nProbe].nPOS < nPOS)
                  nOrder = -1;
               else if (pEntries[nProbe].nPOS > nPOS)
                  nOrder = 1;
            }

            if (nOrder == 0)
               return true;//find it

            if (nOrder < 0)
               nLow = nProbe + 1;
            else
               nHigh = nProbe - 1;
         }
         return false;
      }

      #endregion

      #region FindInModifyTable Method

      //====================================================================
      // Func Name  : FindInModifyTable
      // Description: judge whether the word with the given POS exists in the modified table
      // Parameters : nInnerCode: the inner code of the first Chinese char
      //              sWord: the word
      //              nPOS: the POS to match
      //              pFindRet: the node preceding the position where the scan stopped
      //                        (null when it stopped at the head or there is no modified table)
      // Returns    : success or fail
      //====================================================================
      /// <summary>
      /// Scans the modified-table chain of slot <paramref name="nInnerCode"/> for
      /// <paramref name="sWord"/> with exactly the POS <paramref name="nPOS"/>.
      /// </summary>
      /// <param name="nInnerCode">Index of the slot in modifyTable to scan.</param>
      /// <param name="sWord">Word text to look for.</param>
      /// <param name="nPOS">POS the node must carry.</param>
      /// <param name="pFindRet">
      /// The node just before the one the scan stopped on; null when the scan stopped on the
      /// head, the chain is empty, or modifyTable is null.
      /// </param>
      /// <returns>true when the node the scan stopped on holds the word with that POS.</returns>
      /// <remarks>
      /// NOTE(review): ordering uses Utility.CCStringCompare while equality uses a
      /// case-insensitive string.Compare - confirm both agree with how the chain is built.
      /// </remarks>
      private bool FindInModifyTable(int nInnerCode, string sWord, int nPOS, out WordChain pFindRet)
      {
         WordChain pBefore = null;
         bool bFound = false;

         if (modifyTable != null)
         {
            WordChain pNode = modifyTable[nInnerCode].pWordItemHead;

            //Skip every node ordered before the target (by word, then by POS on equal words)
            for (; pNode != null; pNode = pNode.next)
            {
               bool bWordBefore = Utility.CCStringCompare(pNode.data.sWord, sWord) < 0;
               if (!bWordBefore)
               {
                  bool bSameWordLowerPOS = string.Compare(pNode.data.sWord, sWord, true) == 0 && pNode.data.nPOS < nPOS;
                  if (!bSameWordLowerPOS)
                     break;
               }
               pBefore = pNode;
            }

            //The scan stopped on the node itself only if word and POS both match
            bFound = pNode != null && string.Compare(pNode.data.sWord, sWord, true) == 0 && pNode.data.nPOS == nPOS;
         }

         pFindRet = pBefore;
         return bFound;
      }

      //====================================================================
      // Func Name  : FindInModifyTable
      // Description: judge whether the word (with any POS) exists in the modified table
      // Parameters : nInnerCode: the inner code of the first Chinese char
      //              sWord: the word
      //              pFindRet: the node preceding the position where the scan stopped
      //                        (null when it stopped at the head or there is no modified table)
      // Returns    : success or fail
      //====================================================================
      /// <summary>
      /// Scans the modified-table chain of slot <paramref name="nInnerCode"/> for
      /// <paramref name="sWord"/>, ignoring POS.
      /// </summary>
      /// <param name="nInnerCode">Index of the slot in modifyTable to scan.</param>
      /// <param name="sWord">Word text to look for.</param>
      /// <param name="pFindRet">
      /// The node just before the one the scan stopped on; null when the scan stopped on the
      /// head, the chain is empty, or modifyTable is null.
      /// </param>
      /// <returns>true when the node the scan stopped on holds the word (case-insensitive compare).</returns>
      private bool FindInModifyTable(int nInnerCode, string sWord, out WordChain pFindRet)
      {
         WordChain pBefore = null;
         bool bFound = false;

         if (modifyTable != null)
         {
            WordChain pNode = modifyTable[nInnerCode].pWordItemHead;

            //Skip every node whose word orders before the target
            for (; pNode != null && Utility.CCStringCompare(pNode.data.sWord, sWord) < 0; pNode = pNode.next)
            {
               pBefore = pNode;
            }

            bFound = pNode != null && string.Compare(pNode.data.sWord, sWord, true) == 0;
         }

         pFindRet = pBefore;
         return bFound;
      }

      //====================================================================
      // Func Name  : FindInModifyTable
      // Description: judge whether the word with the given POS exists in the modified table
      // Parameters : nInnerCode: the inner code of the first Chinese char
      //              sWord: the word
      //              nPOS: the POS to match; a negative value matches any POS
      // Returns    : success or fail
      //====================================================================
      /// <summary>
      /// Reports whether the modified-table chain of slot <paramref name="nInnerCode"/>
      /// holds <paramref name="sWord"/> with the requested POS.
      /// </summary>
      /// <param name="nInnerCode">Index of the slot in modifyTable to scan.</param>
      /// <param name="sWord">Word text to look for.</param>
      /// <param name="nPOS">POS the node must carry; a negative value accepts any POS.</param>
      /// <returns>true when the node the scan stops on matches; false otherwise or when modifyTable is null.</returns>
      private bool FindInModifyTable(int nInnerCode, string sWord, int nPOS)
      {
         if (modifyTable == null)
            return false;

         WordChain pNode = modifyTable[nInnerCode].pWordItemHead;

         //Skip every node ordered before the target (by word, then by POS on equal words)
         while (pNode != null)
         {
            bool bWordBefore = Utility.CCStringCompare(pNode.data.sWord, sWord) < 0;
            if (!bWordBefore)
            {
               bool bSameWordLowerPOS = string.Compare(pNode.data.sWord, sWord, true) == 0 && pNode.data.nPOS < nPOS;
               if (!bSameWordLowerPOS)
                  break;
            }
            pNode = pNode.next;
         }

         //The node exists
         return pNode != null
            && string.Compare(pNode.data.sWord, sWord, true) == 0
            && (pNode.data.nPOS == nPOS || nPOS < 0);
      }

      #endregion

      #region FindFirstMatchItemInOrgTbl Method

      //====================================================================
      // Func Name  : FindFirstMatchItemInOrgTbl
      // Description: find the position of the first item matching (int nInnerCode, string sWord)
      //====================================================================
      /// <summary>
      /// Finds the first entry in slot <paramref name="nInnerCode"/> of the original table
      /// whose word equals <paramref name="sWord"/>, regardless of POS.
      /// </summary>
      /// <param name="nInnerCode">Index of the slot in indexTable to search.</param>
      /// <param name="sWord">Word text to look for.</param>
      /// <param name="nPosRet">Index of the first matching entry; 0 for an empty word; -1 when nothing matches.</param>
      /// <returns>true when a match exists or the word is empty.</returns>
      private bool FindFirstMatchItemInOrgTbl(int nInnerCode, string sWord, out int nPosRet)
      {
         WordItem[] pItems = indexTable[nInnerCode].WordItems;

         int nStart = 0, nEnd = indexTable[nInnerCode].nCount - 1;
         int nMid = (nStart + nEnd) / 2, nCmpValue;

         //NOTE(review): an empty word is reported as found at position 0 without
         //looking at the table - confirm callers rely on this
         if (sWord.Length == 0)
         {
            nPosRet = 0;
            return true;
         }

         while (nStart <= nEnd)
         {
            nCmpValue = Utility.CCStringCompare(pItems[nMid].sWord, sWord);
            if (nCmpValue == 0)
            {
               //Get the first item which matches the current word. The scan uses the same
               //comparer that produced the hit, so the hit itself always belongs to the run
               //and the returned index can never end up past it.
               while (nMid > 0 && Utility.CCStringCompare(pItems[nMid - 1].sWord, sWord) == 0)
                  nMid--;

               nPosRet = nMid;
               return true;
            }
            else if (nCmpValue < 0)
               nStart = nMid + 1;
            else
               nEnd = nMid - 1;

            nMid = (nStart + nEnd) / 2;
         }

         nPosRet = -1;
         return false;
      }

      #endregion

      #endregion

   }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -