📄 worddictionary.cs

📁 只是中科院分词系统的SharpICTCLAS分词系统
💻 CS
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
/***********************************************************************************
 * ICTCLAS简介：计算所汉语词法分析系统ICTCLAS
 *              Institute of Computing Technology, Chinese Lexical Analysis System
 *              功能有：中文分词；词性标注；未登录词识别。
 *              分词正确率高达97.58%(973专家评测结果)，
 *              未登录词识别召回率均高于90%，其中中国人名的识别召回率接近98%;
 *              处理速度为31.5Kbytes/s。
 * 著作权：  Copyright(c)2002-2005中科院计算所 职务著作权人：张华平
 * 遵循协议：自然语言处理开放资源许可证1.0
 * Email: zhanghp@software.ict.ac.cn
 * Homepage:www.i3s.ac.cn
 * 
 *----------------------------------------------------------------------------------
 * 
 * Copyright (c) 2000, 2001
 *     Institute of Computing Tech.
 *     Chinese Academy of Sciences
 *     All rights reserved.
 *
 * This file is the confidential and proprietary property of
 * Institute of Computing Tech. and the possession or use of this file requires
 * a written license from the author.
 * Author:   Kevin Zhang
 *          (zhanghp@software.ict.ac.cn)
 * 
 *----------------------------------------------------------------------------------
 * 
 * SharpICTCLAS：.net平台下的ICTCLAS
 *               是由河北理工大学经管学院吕震宇根据Free版ICTCLAS改编而成，
 *               并对原有代码做了部分重写与调整
 * 
 * Email: zhenyulu@163.com
 * Blog: http://www.cnblogs.com/zhenyulu
 * 
 ***********************************************************************************/
using System;
using System.Text;
using System.IO;

namespace SharpICTCLAS
{
   public class WordDictionary
   {
      // Starts out true; Load sets it to false as soon as it allocates a new indexTable.
      public bool bReleased = true;

      // Original dictionary data: Load allocates Predefine.CC_NUM entries, each holding
      // an item count (nCount) and the corresponding WordItems array.
      public IndexTableItem[] indexTable;
      // Pending changes on top of indexTable. Stays null until AddItem/DelItem allocate it;
      // Save merges it with indexTable whenever it is non-null.
      public ModifyTableItem[] modifyTable;

      #region Load Method

      // Convenience overload: loads the dictionary and keeps the frequencies
      // stored in the file (bReset = false).
      public bool Load(string sFilename)
      {
         bool resetFrequencies = false;
         bool loaded = Load(sFilename, resetFrequencies);
         return loaded;
      }

      //====================================================================
      // Func Name  : Load
      // Description: Load the dictionary from the file .dct into indexTable.
      //              The file is read as Predefine.CC_NUM consecutive blocks. Each block
      //              starts with an Int32 item count, followed by that many records of
      //              (Int32 frequency, Int32 word length, Int32 POS, word bytes).
      // Parameters : sFilename: the file name
      //              bReset   : when true, every loaded item gets frequency 0
      //                         instead of the value stored in the file
      // Returns    : success or fail. Any exception (missing file, truncated data, ...)
      //              is reported on the console and turned into a false result.
      // Note       : indexTable is replaced and bReleased is cleared before reading
      //              starts, so after a failed load indexTable may be partially filled.
      //====================================================================
      public bool Load(string sFilename, bool bReset)
      {
         try
         {
            // "new FileStream" either returns an instance or throws, so no null check is
            // needed; the using statements close reader and stream on every exit path.
            using (FileStream fileStream = new FileStream(sFilename, FileMode.Open, FileAccess.Read))
            using (BinaryReader binReader = new BinaryReader(fileStream, Encoding.GetEncoding("gb2312")))
            {
               indexTable = new IndexTableItem[Predefine.CC_NUM];
               bReleased = false;

               for (int i = 0; i < Predefine.CC_NUM; i++)
               {
                  // Number of words stored in block i
                  IndexTableItem block = new IndexTableItem();
                  indexTable[i] = block;
                  block.nCount = binReader.ReadInt32();

                  // Empty block: WordItems is left unallocated
                  if (block.nCount <= 0)
                     continue;

                  block.WordItems = new WordItem[block.nCount];

                  for (int j = 0; j < block.nCount; j++)
                  {
                     // The three header fields must be read in this order
                     int frequency = binReader.ReadInt32();
                     int wordLength = binReader.ReadInt32();
                     int pos = binReader.ReadInt32();

                     WordItem word = new WordItem();

                     // A non-positive length means no word bytes follow the header
                     if (wordLength > 0)
                        word.sWord = Utility.ByteArray2String(binReader.ReadBytes(wordLength));
                     else
                        word.sWord = "";

                     // Reset the frequency when requested
                     word.nFrequency = bReset ? 0 : frequency;
                     word.nWordLen = wordLength;
                     word.nPOS = pos;

                     // Stored only after all fields are set, so this also works
                     // should WordItem be a value type
                     block.WordItems[j] = word;
                  }
               }
            }
            return true;
         }
         catch (Exception e)
         {
            Console.WriteLine(e.Message);
            return false;
         }
      }
      #endregion

      #region Save Method

      //====================================================================
      // Func Name  : Save
      // Description: Save the dictionary as the file .dct.
      //              Writes Predefine.CC_NUM blocks. When modifyTable exists, every
      //              block is merged with its modifyTable entry while being written;
      //              otherwise the indexTable block is written as it is.
      // Parameters : sFilename: the file name
      // Returns    : success or fail. Returns false without touching the file when
      //              no dictionary is loaded (indexTable is null); any exception
      //              while writing is reported on the console and returns false.
      //====================================================================
      public bool Save(string sFilename)
      {
         // FileMode.Create truncates an existing file, so refuse to start
         // when there is nothing to write.
         if (indexTable == null)
            return false;

         try
         {
            // "new FileStream" either returns an instance or throws, so no null check is
            // needed; the using statements close writer and stream on every exit path.
            using (FileStream outputFile = new FileStream(sFilename, FileMode.Create, FileAccess.Write))
            using (BinaryWriter writer = new BinaryWriter(outputFile, Encoding.GetEncoding("gb2312")))
            {
               // Walk through all data blocks
               for (int i = 0; i < Predefine.CC_NUM; i++)
               {
                  // If modifications exist, merge indexTable and modifyTable merge-sort style
                  // while saving (ordered by sWord first, then by part of speech)
                  if (modifyTable != null)
                     MergeAndSaveIndexTableItem(writer, indexTable[i], modifyTable[i]);
                  else
                     // Otherwise write the indexTable block directly
                     SaveIndexTableItem(writer, indexTable[i]);
               }
            }
            return true;
         }
         catch (Exception e)
         {
            // Report the reason the same way Load does instead of swallowing it
            Console.WriteLine(e.Message);
            return false;
         }
      }

      // Writes one dictionary block, merging the original items (item.WordItems) with the
      // pending additions (the modifyItem.pWordItemHead chain) in a single ordered pass.
      // Original items whose nFrequency is -1 are treated as deleted and are not written.
      // The block starts with the resulting item count:
      //    item.nCount + modifyItem.nCount - modifyItem.nDelete
      private void MergeAndSaveIndexTableItem(BinaryWriter writer, IndexTableItem item, ModifyTableItem modifyItem)
      {
         int j = 0;                                   // position in item.WordItems
         WordChain pCur = modifyItem.pWordItemHead;   // position in the modify chain

         // Number of valid items after the modification
         writer.Write(item.nCount + modifyItem.nCount - modifyItem.nDelete);

         // Merge while both sources still have items
         while (pCur != null && j < item.nCount)
         {
            WordItem original = item.WordItems[j];

            // The added word sorts before the original one (smaller word, or same word
            // with a smaller nPOS): write the added word first.
            if (Utility.CCStringCompare(pCur.data.sWord, original.sWord) < 0 ||
               ((pCur.data.sWord == original.sWord) &&
               (pCur.data.nPOS < original.nPOS)))
            {
               SaveWordItem(writer, pCur.data);
               pCur = pCur.next;
            }
            // nFrequency == -1 means the original word has been deleted: skip it
            else if (original.nFrequency == -1)
               j++;
            // In every remaining case write the original word and move on. This is a plain
            // "else" on purpose: a second explicit comparison here left one case (same word
            // and same nPOS in both tables) in which neither cursor advanced, so the loop
            // never terminated.
            else
            {
               SaveWordItem(writer, original);
               j++;
            }
         }

         // Remaining original items (the modify chain is exhausted here).
         // Each item is tested with its own index so deleted ones are skipped correctly.
         for (; j < item.nCount; j++)
            if (item.WordItems[j].nFrequency != -1)
               SaveWordItem(writer, item.WordItems[j]);

         // Remaining added items (no-op when the chain is already exhausted)
         for (; pCur != null; pCur = pCur.next)
            SaveWordItem(writer, pCur.data);
      }

      // Writes one unmodified block: the item count followed by
      // the first nCount entries of item.WordItems, in order.
      private void SaveIndexTableItem(BinaryWriter writer, IndexTableItem item)
      {
         int total = item.nCount;
         writer.Write(total);

         int index = 0;
         while (index < total)
         {
            SaveWordItem(writer, item.WordItems[index]);
            index++;
         }
      }

      // Writes a single word record: nFrequency, nWordLen and nPOS as Int32 values,
      // followed by the gb2312-encoded bytes of sWord when nWordLen is positive.
      private void SaveWordItem(BinaryWriter writer, WordItem item)
      {
         writer.Write(item.nFrequency);
         writer.Write(item.nWordLen);
         writer.Write(item.nPOS);

         // No word bytes follow the header for a non-positive length
         if (item.nWordLen <= 0)
            return;

         Encoding gb2312 = Encoding.GetEncoding("gb2312");
         byte[] wordBytes = gb2312.GetBytes(item.sWord);
         writer.Write(wordBytes);
      }

      #endregion

      #region AddItem Method
      //====================================================================
      // Func Name  : AddItem
      // Description: Add a word item to the dictionary.
      //              - Found in indexTable: its frequency is increased; if it was marked
      //                deleted (nFrequency == -1) it is revived with nFrequency instead.
      //              - Found in modifyTable: its frequency is increased.
      //              - Otherwise a new node is linked into the modifyTable chain.
      // Parameters : sWord: the word
      //              nPOS: the part of speech (handle) of the word
      //              nFrequency: the frequency
      // Returns    : success or fail (fails only when PreProcessing rejects the word)
      //====================================================================
      public bool AddItem(string sWord, int nPOS, int nFrequency)
      {
         int nPos, nFoundPos;
         WordChain pRet, pTemp, pNext;
         string sWordAdd;

         // Pre-processing (per the original comment: strips surrounding blanks);
         // yields the block index nPos and the word text sWordAdd to store
         if (!PreProcessing(ref sWord, out nPos, out sWordAdd))
            return false;

         if (FindInOriginalTable(nPos, sWordAdd, nPOS, out nFoundPos))
         {
            //The word exists in the original table, so add the frequency
            //Operation in the index table and its items
            if (indexTable[nPos].WordItems[nFoundPos].nFrequency == -1)
            {
               //The word item has been removed: revive it
               indexTable[nPos].WordItems[nFoundPos].nFrequency = nFrequency;

               // The table must be allocated together with its elements; allocating
               // only the array left modifyTable[nPos] null and the next line threw.
               if (modifyTable == null)
               {
                  modifyTable = new ModifyTableItem[Predefine.CC_NUM];
                  for (int i = 0; i < Predefine.CC_NUM; i++)
                     modifyTable[i] = new ModifyTableItem();
               }

               modifyTable[nPos].nDelete -= 1;
            }
            else
               indexTable[nPos].WordItems[nFoundPos].nFrequency += nFrequency;
            return true;
         }

         //The item does not exist in the index table.
         //Next, check whether it exists in the modify data region:
         //if it does, change the frequency, otherwise add a new item
         if (modifyTable == null)
         {
            modifyTable = new ModifyTableItem[Predefine.CC_NUM];
            for (int i = 0; i < Predefine.CC_NUM; i++)
               modifyTable[i] = new ModifyTableItem();
         }

         // As used below, pRet is the node preceding the match / insertion point,
         // or null when that point is the head of the chain.
         if (FindInModifyTable(nPos, sWordAdd, nPOS, out pRet))
         {
            if (pRet != null)
               pRet = pRet.next;
            else
               pRet = modifyTable[nPos].pWordItemHead;

            pRet.data.nFrequency += nFrequency;
            return true;
         }

         //Build the new node and link it into the modify chain at the proper position
         pTemp = new WordChain();
         pTemp.data = new WordItem();
         pTemp.data.nPOS = nPOS;
         pTemp.data.nWordLen = Utility.GetWordLength(sWordAdd);
         pTemp.data.sWord = sWordAdd;
         pTemp.data.nFrequency = nFrequency;

         if (pRet != null)
         {
            pNext = pRet.next; //Remember the successor before relinking
            pRet.next = pTemp; //Link the node into the chain
         }
         else
         {
            pNext = modifyTable[nPos].pWordItemHead;
            modifyTable[nPos].pWordItemHead = pTemp; //The new node becomes the head
         }
         pTemp.next = pNext; //Keep the rest of the chain attached, or nodes would be lost

         modifyTable[nPos].nCount++; //One more added item in this block
         return true;
      }

      #endregion

      #region DelItem Method

      public bool DelItem(string sWord, int nPOS)
      {
         string sWordDel;
         int nPos, nFoundPos, nTemp;
         WordChain pPre, pTemp, pCur;

         if (!PreProcessing(ref sWord, out nPos, out sWordDel))
            return false;

         if (FindInOriginalTable(nPos, sWordDel, nPOS, out nFoundPos))
         {
            //Not prepare the buffer
            if (modifyTable == null)
               modifyTable = new ModifyTableItem[Predefine.CC_NUM];

            indexTable[nPos].WordItems[nFoundPos].nFrequency = -1;
            modifyTable[nPos].nDelete += 1;

            //Remove all items which word is sWordDel,ignoring the handle
            if (nPOS == -1)
            {
               nTemp = nFoundPos + 1; //Check its previous position
               while (nTemp < indexTable[nPos].nCount &&
                  string.Compare(indexTable[nPos].WordItems[nFoundPos].sWord, sWordDel) == 0)
               {
                  indexTable[nPos].WordItems[nTemp].nFrequency = -1;
                  modifyTable[nPos].nDelete += 1;
                  nTemp += 1;
               }
            }
            return true;
         }
12 3 下一页
💿 文件大小 2421 K
👤 上传用户 wait2010
📂 所属分类人工智能/神经网络
🏷️ 相关标签

#SharpICTCLAS #分
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -