📄 worddictionary.cs
字号:
/***********************************************************************************
* ICTCLAS简介:计算所汉语词法分析系统ICTCLAS
* Institute of Computing Technology, Chinese Lexical Analysis System
* 功能有:中文分词;词性标注;未登录词识别。
* 分词正确率高达97.58%(973专家评测结果),
* 未登录词识别召回率均高于90%,其中中国人名的识别召回率接近98%;
* 处理速度为31.5Kbytes/s。
* 著作权: Copyright(c)2002-2005中科院计算所 职务著作权人:张华平
* 遵循协议:自然语言处理开放资源许可证1.0
* Email: zhanghp@software.ict.ac.cn
* Homepage:www.i3s.ac.cn
*
*----------------------------------------------------------------------------------
*
* Copyright (c) 2000, 2001
* Institute of Computing Tech.
* Chinese Academy of Sciences
* All rights reserved.
*
* This file is the confidential and proprietary property of
* Institute of Computing Tech. and the possession or use of this file requires
* a written license from the author.
* Author: Kevin Zhang
* (zhanghp@software.ict.ac.cn)
*
*----------------------------------------------------------------------------------
*
* SharpICTCLAS:.net平台下的ICTCLAS
* 是由河北理工大学经管学院吕震宇根据Free版ICTCLAS改编而成,
* 并对原有代码做了部分重写与调整
*
* Email: zhenyulu@163.com
* Blog: http://www.cnblogs.com/zhenyulu
*
***********************************************************************************/
using System;
using System.Text;
using System.IO;
namespace SharpICTCLAS
{
public class WordDictionary
{
// Starts as true; Load sets it to false once it has allocated indexTable.
// NOTE(review): where it is set back to true is not visible in this part of the file.
public bool bReleased = true;
// Word entries read from the dictionary file. Load allocates Predefine.CC_NUM slots,
// each holding the words read for that slot.
public IndexTableItem[] indexTable;
// Pending additions and deletions that Save merges with indexTable when writing.
// Stays null until AddItem or DelItem first needs it.
public ModifyTableItem[] modifyTable;
#region Load Method
/// <summary>
/// Loads the dictionary from <paramref name="sFilename"/>, keeping the word
/// frequencies stored in the file (no frequency reset).
/// </summary>
/// <param name="sFilename">Path of the dictionary file to read.</param>
/// <returns>Whatever the two-argument overload returns for a non-resetting load.</returns>
public bool Load(string sFilename)
{
    const bool resetFrequencies = false;
    return Load(sFilename, resetFrequencies);
}
/// <summary>
/// Loads the dictionary from a binary .dct file into <c>indexTable</c>.
/// </summary>
/// <remarks>
/// Layout as read here: for each of the <c>Predefine.CC_NUM</c> slots an Int32
/// word count, then for every word three Int32 values (frequency, word length,
/// part of speech) followed by "word length" bytes holding the word itself.
/// </remarks>
/// <param name="sFilename">Path of the dictionary file to read.</param>
/// <param name="bReset">
/// When true every loaded word gets frequency 0 instead of the stored value.
/// </param>
/// <returns>
/// true on success; false when any exception occurs (its message is written
/// to the console). On failure <c>indexTable</c> may be partially filled.
/// </returns>
public bool Load(string sFilename, bool bReset)
{
    int frequency, wordLength, pos; // frequency, word length and part of speech of the current word

    try
    {
        // 'using' closes the reader and the stream on every exit path. The
        // FileStream constructor either returns an object or throws, so the
        // former null check was unreachable and has been dropped.
        using (FileStream fileStream = new FileStream(sFilename, FileMode.Open, FileAccess.Read))
        using (BinaryReader binReader = new BinaryReader(fileStream, Encoding.GetEncoding("gb2312")))
        {
            indexTable = new IndexTableItem[Predefine.CC_NUM];
            bReleased = false;

            for (int i = 0; i < Predefine.CC_NUM; i++)
            {
                // Number of words stored in this slot
                indexTable[i] = new IndexTableItem();
                indexTable[i].nCount = binReader.ReadInt32();
                if (indexTable[i].nCount <= 0)
                    continue; // empty slot: WordItems is left unassigned

                indexTable[i].WordItems = new WordItem[indexTable[i].nCount];
                for (int j = 0; j < indexTable[i].nCount; j++)
                {
                    indexTable[i].WordItems[j] = new WordItem();

                    frequency = binReader.ReadInt32();
                    wordLength = binReader.ReadInt32();
                    pos = binReader.ReadInt32();

                    // The word bytes are present only when the stored length is positive
                    if (wordLength > 0)
                        indexTable[i].WordItems[j].sWord = Utility.ByteArray2String(binReader.ReadBytes(wordLength));
                    else
                        indexTable[i].WordItems[j].sWord = "";

                    // Optionally discard the stored frequency
                    indexTable[i].WordItems[j].nFrequency = bReset ? 0 : frequency;
                    indexTable[i].WordItems[j].nWordLen = wordLength;
                    indexTable[i].WordItems[j].nPOS = pos;
                }
            }
        }

        return true;
    }
    catch (Exception e)
    {
        // Failure is reported through the return value; the message is only logged.
        Console.WriteLine(e.Message);
        return false;
    }
}
#endregion
#region Save Method
/// <summary>
/// Writes the dictionary to a binary .dct file, creating or overwriting it.
/// </summary>
/// <remarks>
/// When <c>modifyTable</c> is non-null each slot is written as the merge of
/// <c>indexTable</c> and <c>modifyTable</c>; otherwise the slot is written
/// straight from <c>indexTable</c>.
/// </remarks>
/// <param name="sFilename">Path of the dictionary file to write.</param>
/// <returns>
/// true on success; false when any exception occurs (its message is written
/// to the console, matching what Load does).
/// </returns>
public bool Save(string sFilename)
{
    try
    {
        // 'using' closes the writer and the stream on every exit path. The
        // FileStream constructor either returns an object or throws, so the
        // former null check was unreachable and has been dropped.
        using (FileStream outputFile = new FileStream(sFilename, FileMode.Create, FileAccess.Write))
        using (BinaryWriter writer = new BinaryWriter(outputFile, Encoding.GetEncoding("gb2312")))
        {
            // One block per slot, Predefine.CC_NUM blocks in total
            for (int i = 0; i < Predefine.CC_NUM; i++)
            {
                if (modifyTable != null)
                    // Pending modifications exist: merge both tables while writing
                    // (ordered by word first, then by part of speech)
                    MergeAndSaveIndexTableItem(writer, indexTable[i], modifyTable[i]);
                else
                    // Nothing was modified: write the index table slot as is
                    SaveIndexTableItem(writer, indexTable[i]);
            }
        }

        return true;
    }
    catch (Exception e)
    {
        // Previously the failure reason was discarded; report it the same way Load does.
        Console.WriteLine(e.Message);
        return false;
    }
}
/// <summary>
/// Writes one slot of the dictionary as the ordered merge of its original
/// entries and its pending modifications.
/// </summary>
/// <remarks>
/// The count written first is <c>item.nCount + modifyItem.nCount - modifyItem.nDelete</c>.
/// Original entries whose frequency is -1 are treated as deleted and are not written.
/// </remarks>
/// <param name="writer">Destination of the binary output.</param>
/// <param name="item">The slot's entries from the index table.</param>
/// <param name="modifyItem">The slot's pending additions (a linked chain) and delete counter.</param>
private void MergeAndSaveIndexTableItem(BinaryWriter writer, IndexTableItem item, ModifyTableItem modifyItem)
{
    // Number of entries that remain valid after applying the modifications
    int nCount = item.nCount + modifyItem.nCount - modifyItem.nDelete;
    writer.Write(nCount);

    WordChain pCur = modifyItem.pWordItemHead;
    int j = 0;

    // Merge step: walk both sequences, always emitting the smaller entry
    // (compared by word first, then by part of speech).
    while (pCur != null && j < item.nCount)
    {
        int order = Utility.CCStringCompare(pCur.data.sWord, item.WordItems[j].sWord);
        bool sameWord = pCur.data.sWord == item.WordItems[j].sWord;

        if (order < 0 || (sameWord && pCur.data.nPOS < item.WordItems[j].nPOS))
        {
            // The added entry sorts first: write it and advance in the chain
            SaveWordItem(writer, pCur.data);
            pCur = pCur.next;
        }
        else if (item.WordItems[j].nFrequency == -1)
        {
            // Frequency -1 marks a deleted original entry: skip it
            j++;
        }
        else
        {
            // The original entry sorts first. A plain 'else' (instead of re-testing
            // "greater than") also covers an exact word/POS tie, which previously
            // matched no branch and left the loop spinning forever.
            SaveWordItem(writer, item.WordItems[j]);
            j++;
        }
    }

    // Remaining original entries, still skipping deleted ones. The check must
    // look at element i (the one about to be written); it used to test element j.
    for (int i = j; i < item.nCount; i++)
    {
        if (item.WordItems[i].nFrequency != -1)
            SaveWordItem(writer, item.WordItems[i]);
    }

    // Remaining added entries. At most one of the two tail loops does any work,
    // because the merge loop only stops once one side is exhausted.
    while (pCur != null)
    {
        SaveWordItem(writer, pCur.data);
        pCur = pCur.next;
    }
}
/// <summary>
/// Writes one unmodified slot: its entry count followed by each of its entries.
/// </summary>
/// <param name="writer">Destination of the binary output.</param>
/// <param name="item">The slot to write.</param>
private void SaveIndexTableItem(BinaryWriter writer, IndexTableItem item)
{
    int total = item.nCount;
    writer.Write(total);

    int index = 0;
    while (index < total)
    {
        SaveWordItem(writer, item.WordItems[index]);
        index++;
    }
}
/// <summary>
/// Writes a single word entry: frequency, word length and part of speech as
/// three integers, then the gb2312-encoded word when the length is positive.
/// </summary>
/// <param name="writer">Destination of the binary output.</param>
/// <param name="item">The entry to write.</param>
private void SaveWordItem(BinaryWriter writer, WordItem item)
{
    writer.Write(item.nFrequency);
    writer.Write(item.nWordLen);
    writer.Write(item.nPOS);

    // Entries with a non-positive length carry no word bytes
    if (item.nWordLen <= 0)
        return;

    byte[] encodedWord = Encoding.GetEncoding("gb2312").GetBytes(item.sWord);
    writer.Write(encodedWord);
}
#endregion
#region AddItem Method
/// <summary>
/// Adds a word with the given part-of-speech handle to the dictionary, or
/// raises its frequency when that word/handle pair is already present.
/// </summary>
/// <param name="sWord">The word to add; it is passed through <c>PreProcessing</c> first.</param>
/// <param name="nPOS">Part-of-speech handle stored with the word.</param>
/// <param name="nFrequency">
/// Frequency assigned to a new or previously deleted entry, or added to the
/// frequency of an existing entry.
/// </param>
/// <returns>false when <c>PreProcessing</c> rejects the word; otherwise true.</returns>
public bool AddItem(string sWord, int nPOS, int nFrequency)
{
    int nPos, nFoundPos;
    WordChain pRet, pTemp, pNext;
    string sWordAdd;

    // Pre-processing yields the slot index (nPos) and the word to store (sWordAdd)
    if (!PreProcessing(ref sWord, out nPos, out sWordAdd))
        return false;

    if (FindInOriginalTable(nPos, sWordAdd, nPOS, out nFoundPos))
    {
        // The word exists in the original table: update it in place
        if (indexTable[nPos].WordItems[nFoundPos].nFrequency == -1)
        {
            // Frequency -1 marks a deleted entry: revive it and undo the delete count
            indexTable[nPos].WordItems[nFoundPos].nFrequency = nFrequency;
            if (modifyTable == null)
            {
                // Create every element as well, exactly like the allocation further
                // down; allocating only the array left modifyTable[nPos] unset
                // before it was dereferenced on the next statement.
                modifyTable = new ModifyTableItem[Predefine.CC_NUM];
                for (int i = 0; i < Predefine.CC_NUM; i++)
                    modifyTable[i] = new ModifyTableItem();
            }
            modifyTable[nPos].nDelete -= 1;
        }
        else
            indexTable[nPos].WordItems[nFoundPos].nFrequency += nFrequency;
        return true;
    }

    // Not in the original table: look in the modify table next. If the word is
    // there, raise its frequency; otherwise insert a new chain node.
    if (modifyTable == null)
    {
        modifyTable = new ModifyTableItem[Predefine.CC_NUM];
        for (int i = 0; i < Predefine.CC_NUM; i++)
            modifyTable[i] = new ModifyTableItem();
    }

    if (FindInModifyTable(nPos, sWordAdd, nPOS, out pRet))
    {
        // NOTE(review): pRet appears to be the node preceding the match, with
        // null meaning the match is the chain head - confirm in FindInModifyTable.
        if (pRet != null)
            pRet = pRet.next;
        else
            pRet = modifyTable[nPos].pWordItemHead;
        pRet.data.nFrequency += nFrequency;
        return true;
    }

    // Build the new chain node
    pTemp = new WordChain();
    pTemp.data = new WordItem();
    pTemp.data.nPOS = nPOS;
    pTemp.data.nWordLen = Utility.GetWordLength(sWordAdd);
    pTemp.data.sWord = sWordAdd;
    pTemp.data.nFrequency = nFrequency;
    pTemp.next = null;

    if (pRet != null)
    {
        // Insert after pRet
        pNext = pRet.next;
        pRet.next = pTemp;
    }
    else
    {
        // Insert as the new head of the chain
        pNext = modifyTable[nPos].pWordItemHead;
        modifyTable[nPos].pWordItemHead = pTemp;
    }
    pTemp.next = pNext; // re-attach the rest of the chain, otherwise those nodes are lost
    modifyTable[nPos].nCount++;
    return true;
}
#endregion
#region DelItem Method
public bool DelItem(string sWord, int nPOS)
{
string sWordDel;
int nPos, nFoundPos, nTemp;
WordChain pPre, pTemp, pCur;
if (!PreProcessing(ref sWord, out nPos, out sWordDel))
return false;
if (FindInOriginalTable(nPos, sWordDel, nPOS, out nFoundPos))
{
//Not prepare the buffer
if (modifyTable == null)
modifyTable = new ModifyTableItem[Predefine.CC_NUM];
indexTable[nPos].WordItems[nFoundPos].nFrequency = -1;
modifyTable[nPos].nDelete += 1;
//Remove all items which word is sWordDel,ignoring the handle
if (nPOS == -1)
{
nTemp = nFoundPos + 1; //Check its previous position
while (nTemp < indexTable[nPos].nCount &&
string.Compare(indexTable[nPos].WordItems[nFoundPos].sWord, sWordDel) == 0)
{
indexTable[nPos].WordItems[nTemp].nFrequency = -1;
modifyTable[nPos].nDelete += 1;
nTemp += 1;
}
}
return true;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -