📄 default.aspx.cs

📁 实现中文切词功能,应用lucene.net和中科院的切词算法
💻 CS
字号:
using System;
using System.Data;
using System.Configuration;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;

using Lucene.Net;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using System.Collections.Generic;//list
using SharpICTCLAS;//word那个  
//using System.Web.SessionState;
using System.Text;
using System.IO;//词库的读取

public partial class _Default : System.Web.UI.Page 
{
    /// <summary>
    /// Page load event handler. Currently performs no work; the page's behavior
    /// lives in the button click handlers.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    protected void Page_Load(object sender, EventArgs e)
    {

    }
    /// <summary>
    /// Lucene analyzer that segments text with <c>ICTCLASTokenizer</c> and then
    /// post-processes the tokens through StandardFilter, LowerCaseFilter and a
    /// StopFilter driven by <see cref="stopword"/>.
    /// </summary>
    public class ICTCLASAnalyzer : Analyzer
    {
        /// <summary>
        /// Words dropped from the token stream: common English function words
        /// plus a few Chinese words and the full-width comma.
        /// </summary>
        public static readonly string[] stopword =
        {
            "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "s", "such", "t", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with", "我", "我们", "是", "的", "，"
        };

        /// <summary>
        /// Builds the token pipeline for the given reader.
        /// </summary>
        /// <param name="fieldName">Field name supplied by Lucene; not used here.</param>
        /// <param name="reader">Source of the text to analyze.</param>
        /// <returns>The stop-word-filtered, lower-cased token stream.</returns>
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            // Read inside-out: tokenizer -> StandardFilter -> LowerCaseFilter -> StopFilter.
            // NOTE(review): per the original author's note StandardFilter cleans up
            // English plural/possessive suffixes and dots — confirm against the
            // Lucene.Net version in use.
            return new StopFilter(
                new LowerCaseFilter(
                    new StandardFilter(
                        new ICTCLASTokenizer(reader))),
                stopword);
        }
    }
    /// <summary>
    /// Lucene tokenizer backed by the SharpICTCLAS word segmenter. The whole input
    /// is read and segmented eagerly in the constructor; <see cref="Next"/> then
    /// hands out the pre-computed words one at a time.
    /// </summary>
    /// <remarks>
    /// Offsets are produced by accumulating word lengths, so they are relative to
    /// the text after "\r\n" removal and assume the segmenter's words concatenate
    /// back to that text.
    /// </remarks>
    class ICTCLASTokenizer : Tokenizer
    {
        // Second argument handed to WordSegment.Segment.
        // NOTE(review): presumably the number of candidate segmentations requested — confirm.
        int nKind = 6;

        // Segmentation output; only the first entry (result[0]) is consumed.
        List<WordResult[]> result;

        // Start offset of the next token to emit.
        int startIndex = 0;

        // Index of the next word to emit. Starts at 1 and Next() stops before the
        // last element, so the first and last entries of result[0] are skipped.
        // NOTE(review): presumably these are sentence begin/end markers added by
        // the segmenter — confirm.
        int i = 1;

        private string sentence;

        /// <summary>
        /// Reads all text from <paramref name="reader"/>, strips "\r\n" sequences and
        /// segments the remainder using the dictionaries in the "Data" directory.
        /// </summary>
        /// <param name="reader">Text source; consumed to the end.</param>
        /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
        public ICTCLASTokenizer(System.IO.TextReader reader)
        {
            if (reader == null)
            {
                throw new ArgumentNullException("reader");
            }

            this.input = reader;
            sentence = input.ReadToEnd();
            sentence = sentence.Replace("\r\n", "");

            // NOTE(review): Environment.CurrentDirectory is the process working
            // directory, which in a hosted web application is not necessarily the
            // site root — confirm the "Data" folder is actually found at runtime.
            string DictPath = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;

            // NOTE(review): the dictionaries are initialized for every tokenizer
            // instance; consider sharing one WordSegment if it proves safe to reuse.
            WordSegment wordSegment = new WordSegment();
            wordSegment.InitWordSegment(DictPath);

            result = wordSegment.Segment(sentence, nKind);
        }

        /// <summary>
        /// Returns the next segmented word as a token, or null when exhausted.
        /// </summary>
        /// <returns>
        /// A token whose start offset is inclusive and whose end offset is exclusive
        /// (one past the last character), as Lucene expects; null at end of stream
        /// or when segmentation produced nothing.
        /// </returns>
        public override Lucene.Net.Analysis.Token Next()
        {
            // Guard against an empty/absent segmentation instead of indexing blindly.
            if (result == null || result.Count == 0 || result[0] == null)
            {
                return null;
            }

            WordResult[] words = result[0];
            if (i >= words.Length - 1)
            {
                return null;
            }

            string word = words[i].sWord;

            // End offset is exclusive: start + length (the original subtracted 1,
            // yielding an inclusive end that disagrees with Lucene's convention).
            int endIndex = startIndex + word.Length;
            Lucene.Net.Analysis.Token token = new Lucene.Net.Analysis.Token(word, startIndex, endIndex);

            startIndex = endIndex;
            i++;
            return token;
        }

    }
    /// <summary>
    /// Convenience entry point: logs the text to the console stream, wraps it in a
    /// StringReader and delegates to the TextReader overload.
    /// </summary>
    /// <param name="text">Text to tokenize.</param>
    /// <param name="verbose">Forwarded unchanged to the TextReader overload.</param>
    internal void Test(System.String text, bool verbose)
    {
        System.Console.Out.WriteLine(" Tokenizing string: " + text);

        System.IO.StringReader source = new System.IO.StringReader(text);
        // The character count stands in for the "bytes" argument of the overload.
        long length = text.Length;
        Test(source, verbose, length);
    }
    /// <summary>
    /// Tokenizes everything from <paramref name="reader"/> with
    /// <c>ICTCLASAnalyzer</c>, optionally dumps each token into TextBox2, and writes
    /// timing/throughput statistics to the response.
    /// </summary>
    /// <param name="reader">Source of the text to tokenize.</param>
    /// <param name="verbose">When true, each token's string form is appended to TextBox2.</param>
    /// <param name="bytes">Size of the input, used only for the throughput figure.</param>
    internal void Test(System.IO.TextReader reader, bool verbose, long bytes)
    {
        // Start timing before the stream is created: ICTCLASTokenizer performs the
        // whole segmentation inside its constructor, so timing only the Next() loop
        // would measure almost nothing.
        System.Diagnostics.Stopwatch stopwatch = System.Diagnostics.Stopwatch.StartNew();

        ICTCLASAnalyzer analyzer = new ICTCLASAnalyzer();
        TokenStream stream = analyzer.TokenStream(null, reader); // field name is not needed

        int count = 0; // number of tokens produced
        StringBuilder dump = new StringBuilder();
        for (Lucene.Net.Analysis.Token t = stream.Next(); t != null; t = stream.Next())
        {
            if (verbose)
            {
                dump.Append(t.ToString());
            }
            count++;
        }

        stopwatch.Stop();

        if (verbose)
        {
            // Single assignment instead of repeated "+=" on the control's Text.
            TextBox2.Text += dump.ToString();
        }

        // Real milliseconds; the original used DateTime ticks (100 ns units) while
        // labelling and computing everything as if they were milliseconds.
        double time = stopwatch.Elapsed.TotalMilliseconds;

        // Avoid NaN/Infinity in the report when nothing was tokenized or the run
        // was too fast to measure.
        double microsPerToken = count > 0 ? (time * 1000.0) / count : 0.0;
        double megabytesPerHour = time > 0.0 ? (bytes * 1000.0 * 60.0 * 60.0) / (time * 1000000.0) : 0.0;

        Response.Write("<br>" + time.ToString("0.###") + " 毫秒查找 " + count + " 关键字" + "<br>");
        Response.Write(microsPerToken + " 微秒/关键字" + "<br>");
        Response.Write(megabytesPerHour + " 兆字节/小时");
    }

    /// <summary>
    /// Click handler for Button1: tokenizes the text in TextBox1 and shows the
    /// tokens in TextBox2. Failures are logged and reported in TextBox2.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    protected void Button1_Click(object sender, EventArgs e)
    {
        string strtest = TextBox1.Text;

        TextBox2.Text = "";
        try
        {
            Test(strtest, true);
        }
        catch (System.Exception etest)
        {
            // Report the exception's own type; the original used e.GetType(),
            // which is the EventArgs parameter, not the caught exception.
            System.Console.Out.WriteLine(" caught a " + etest.GetType() + "\n with message: " + etest.Message + etest.ToString());

            // Console output is not visible to the page user, so surface the
            // failure in the result box instead of leaving it blank.
            TextBox2.Text = " caught a " + etest.GetType() + "\n with message: " + etest.Message;
        }
    }


    // Original author's note: the code below this point did not appear to be executed.





    // Directory holding the dictionary data files: the "Data" folder under the
    // process's current directory, with a trailing directory separator.
    // Static initializers run in textual order, so this is set before the
    // fields below that concatenate it.
    public static string DictPath = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;
    // Full paths of individual dictionary files under DictPath. None of these
    // four is referenced by active code visible in this file (coreDictFile
    // appears only in commented-out code).
    public static string coreDictFile = DictPath + "coreDict.dct";
    public static string biDictFile = DictPath + "BigramDict.dct";
    public static string contextFile = DictPath + "nr.ctx";
    // NOTE(review): the field is named nrFile but points at "tr.dct" — confirm
    // whether "nr.dct" was intended.
    public static string nrFile = DictPath + "tr.dct";
    /// <summary>
    /// Click handler for Button2: loads the core dictionary, adds the word
    /// "设计模式" as a noun, saves the result as coreDictNew.dct, reloads it and
    /// prints the entries for the character '设' before and after the change.
    /// </summary>
    /// <remarks>
    /// Progress messages go to the console stream, as in the console sample this
    /// was adapted from. A commented-out earlier draft took the word from TextBox3
    /// and reported progress through Response.Write.
    /// </remarks>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    protected void Button2_Click(object sender, EventArgs e)
    {
        // NOTE(review): Environment.CurrentDirectory is the process working
        // directory, which in a hosted web application is not necessarily the
        // site root — confirm the "Data" folder is actually found at runtime.
        string dictPath = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;
        Console.WriteLine("正在读入字典，请稍候...");

        WordDictionary dict = new WordDictionary();
        // NOTE(review): the result of Load/Save is not checked here — confirm how
        // these calls report failure.
        dict.Load(dictPath + "coreDict.dct");
        ShowWordsInfo(dict, '设');

        Console.WriteLine("\r\n向字典库插入“设计模式”一词...");
        // Third argument 10 is passed through to AddItem.
        // NOTE(review): presumably the word frequency — confirm.
        dict.AddItem("设计模式", Utility.GetPOSValue("n"), 10);

        Console.WriteLine("\r\n修改完成，将字典写入磁盘文件coreDictNew.dct，请稍候...");
        dict.Save(dictPath + "coreDictNew.dct");

        Console.WriteLine("\r\n打开已写入的字典，请稍候...");
        dict.Load(dictPath + "coreDictNew.dct");
        ShowWordsInfo(dict, '设');

        // The "press Enter to exit" prompt and Console.ReadLine() were removed:
        // a request handler must not wait on standard input.
    }

    /// <summary>
    /// Prints to the console every dictionary entry filed under the index slot of
    /// character <paramref name="c"/>: word length, frequency, part of speech and
    /// the word itself.
    /// </summary>
    /// <param name="dict">Dictionary whose index table is read.</param>
    /// <param name="c">Character whose index slot is listed.</param>
    public static void ShowWordsInfo(WordDictionary dict, char c)
    {
        // Map the character to its slot in the dictionary's index table.
        int ccid = Utility.CC_ID(c);

        Console.WriteLine("====================================\r\n汉字:{0}, ID ：{1}\r\n", Utility.CC_ID2Char(ccid), ccid);
        Console.WriteLine("  词长  频率  词性   词");

        int row = 0;
        while (row < dict.indexTable[ccid].nCount)
        {
            // The leading character is printed in parentheses, followed by sWord.
            string line = string.Format("{0,5} {1,6} {2,5}  ({3}){4}",
                dict.indexTable[ccid].WordItems[row].nWordLen,
                dict.indexTable[ccid].WordItems[row].nFrequency,
                Utility.GetPOSString(dict.indexTable[ccid].WordItems[row].nPOS),
                Utility.CC_ID2Char(ccid),
                dict.indexTable[ccid].WordItems[row].sWord);
            Console.WriteLine(line);
            row++;
        }
    }
    //protected void TextBox4_TextChanged(object sender, EventArgs e)
    //{
    //   //System.String[] stopword = new System.String[] { "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "s", "such", "t", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with", "我", "我们", "是", "的", "，" };
    //   //stopword &= TextBox4.Text.Trim().ToString();

       


    //    string[] arr = { TextBox4.Text, "wo" };
       
    //}


}
💿 文件大小 2785 K
👤 上传用户 laoniu
📂 所属分类其他
🏷️ 相关标签

#lucene #net #算法
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -