📄 default.aspx.cs
字号:
using System;
using System.Data;
using System.Configuration;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using Lucene.Net;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using System.Collections.Generic;//list
using SharpICTCLAS;//word那个
//using System.Web.SessionState;
using System.Text;
using System.IO;//词库的读取
public partial class _Default : System.Web.UI.Page
{
/// <summary>
/// Page load event handler. It currently performs no work; everything this
/// page does is triggered from the button click handlers below.
/// </summary>
/// <param name="sender">The object that raised the event.</param>
/// <param name="e">Event data (unused).</param>
protected void Page_Load(object sender, EventArgs e)
{
}
/// <summary>
/// Lucene analyzer that segments text with <see cref="ICTCLASTokenizer"/> and then
/// post-processes the tokens with StandardFilter, LowerCaseFilter and a StopFilter
/// driven by <see cref="stopword"/>.
/// </summary>
public class ICTCLASAnalyzer : Analyzer
{
    /// <summary>
    /// Words removed from the token stream by the StopFilter: common English
    /// function words, a few Chinese words and a comma.
    /// </summary>
    public static readonly string[] stopword =
    {
        "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
        "if", "in", "into", "is", "it", "no", "not", "of", "on", "or",
        "s", "such", "t", "that", "the", "their", "then", "there", "these", "they",
        "this", "to", "was", "will", "with",
        "我", "我们", "是", "的", ","
    };

    /// <summary>
    /// Builds the token stream for one field.
    /// </summary>
    /// <param name="fieldName">Name of the field being analyzed; not used by this analyzer.</param>
    /// <param name="reader">Source of the text to analyze.</param>
    /// <returns>The segmenter output wrapped in the standard, lower-case and stop-word filters.</returns>
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        // Segment first, then normalize, then drop stop words - the same order
        // the filters were stacked in originally.
        Tokenizer segmenter = new ICTCLASTokenizer(reader);
        TokenStream normalized = new LowerCaseFilter(new StandardFilter(segmenter));
        return new StopFilter(normalized, stopword);
    }
}
/// <summary>
/// Lucene tokenizer backed by the SharpICTCLAS word segmenter. The whole input is
/// read and segmented in the constructor; <see cref="Next"/> then hands out the
/// words of the first segmentation result one by one.
/// </summary>
/// <remarks>
/// Offsets are relative to the text after "\r\n" sequences have been removed,
/// not to the raw reader content.
/// NOTE(review): the dictionary directory is resolved from
/// Environment.CurrentDirectory, which in a web application is not necessarily
/// the site directory - confirm the "Data" folder is found at runtime.
/// NOTE(review): a WordSegment is created and initialized for every tokenizer
/// instance; consider sharing one if initialization proves expensive.
/// </remarks>
class ICTCLASTokenizer : Tokenizer
{
    // Number of candidate segmentations requested; only result[0] is consumed.
    int nKind = 6;
    // Segmentation results returned by WordSegment.Segment.
    List<WordResult[]> result;
    // Start offset (inclusive) of the next token to emit.
    int startIndex = 0;
    // End offset (exclusive) of the most recently emitted token.
    int endIndex = 0;
    // Index of the next word in result[0]. Starts at 1 and stops before the last
    // entry - presumably those two are begin/end markers added by the segmenter
    // (TODO confirm against SharpICTCLAS).
    int i = 1;
    private string sentence;

    /// <summary>
    /// Reads all text from <paramref name="reader"/> and segments it immediately.
    /// </summary>
    /// <param name="reader">Source text; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
    public ICTCLASTokenizer(System.IO.TextReader reader)
    {
        if (reader == null)
        {
            throw new ArgumentNullException("reader");
        }

        this.input = reader;
        sentence = input.ReadToEnd();
        sentence = sentence.Replace("\r\n", "");

        // Directory holding the segmenter's dictionary files.
        string DictPath = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;
        WordSegment wordSegment = new WordSegment();
        wordSegment.InitWordSegment(DictPath);
        result = wordSegment.Segment(sentence, nKind);
    }

    /// <summary>
    /// Returns the next word as a Lucene token, or null when the words are exhausted
    /// (or when segmentation produced no result at all).
    /// </summary>
    public override Lucene.Net.Analysis.Token Next()
    {
        // Segmentation may yield nothing (e.g. for empty input); treat that as end of stream
        // instead of failing on result[0].
        if (result == null || result.Count == 0 || result[0] == null)
        {
            return null;
        }

        WordResult[] words = result[0];
        if (i >= words.Length - 1)
        {
            return null;
        }

        string word = words[i].sWord;
        // Lucene end offsets are exclusive (one past the last character), so a
        // one-character token at position 0 is reported as (0,1) and the next
        // token starts exactly where this one ends.
        endIndex = startIndex + word.Length;
        Lucene.Net.Analysis.Token token = new Lucene.Net.Analysis.Token(word, startIndex, endIndex);
        startIndex = endIndex;
        i++;
        return token;
    }
}
/// <summary>
/// Entry point of the tokenizing demo: logs the input string to the console and
/// forwards it, wrapped in a StringReader, to the reader-based overload.
/// </summary>
/// <param name="text">Text to tokenize.</param>
/// <param name="verbose">Passed through unchanged; the reader-based overload uses it to decide whether each token is echoed.</param>
internal void Test(System.String text, bool verbose)
{
    Console.Out.WriteLine(" Tokenizing string: " + text);

    // The reader is created before the length is read, matching the original
    // evaluation order. The length (a character count) is what the overload
    // receives as its "bytes" argument.
    StringReader source = new StringReader(text);
    long charCount = text.Length;
    Test(source, verbose, charCount);
}
/// <summary>
/// Tokenizes the reader content with <see cref="ICTCLASAnalyzer"/>, optionally echoes
/// every token into TextBox2, and writes timing statistics to the response.
/// </summary>
/// <param name="reader">Source of the text to tokenize.</param>
/// <param name="verbose">When true, each token's string form is appended to TextBox2.</param>
/// <param name="bytes">Size of the input used for the throughput figure (callers in this file pass the character count).</param>
internal void Test(System.IO.TextReader reader, bool verbose, long bytes)
{
    // Start timing before the stream is created: the tokenizer segments the whole
    // input in its constructor, so that is where most of the work happens.
    System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();

    ICTCLASAnalyzer analyzer = new ICTCLASAnalyzer();
    TokenStream stream = analyzer.TokenStream(null, reader); // field name is not used by the analyzer

    int count = 0; // number of tokens produced
    System.Text.StringBuilder echoed = new System.Text.StringBuilder();
    for (Lucene.Net.Analysis.Token t = stream.Next(); t != null; t = stream.Next())
    {
        if (verbose)
        {
            // Token.ToString() includes the term text and its start/end offsets.
            echoed.Append(t.ToString());
        }
        count++;
    }
    timer.Stop();

    if (verbose)
    {
        // Append once instead of growing the control's text inside the loop.
        TextBox2.Text += echoed.ToString();
    }

    // Real milliseconds; the labels below and the derived figures assume this unit.
    double time = timer.Elapsed.TotalMilliseconds;
    Response.Write("<br>" + time + " 毫秒查找 " + count + " 关键字" + "<br>");
    if (count > 0)
    {
        // ms * 1000 / tokens = microseconds per token.
        Response.Write((time * 1000.0) / count + " 微秒/关键字" + "<br>");
    }
    if (time > 0)
    {
        // (size per ms) scaled to one hour, divided by 10^6 = megabytes per hour.
        Response.Write((bytes * 1000.0 * 60.0 * 60.0) / (time * 1000000.0) + " 兆字节/小时");
    }
}
/// <summary>
/// Click handler for Button1: clears TextBox2 and runs the tokenizing demo on the
/// text entered in TextBox1, echoing every token.
/// </summary>
/// <param name="sender">The object that raised the event.</param>
/// <param name="e">Event data (unused).</param>
protected void Button1_Click(object sender, EventArgs e)
{
    string strtest = TextBox1.Text;
    TextBox2.Text = "";
    try
    {
        Test(strtest, true);
    }
    catch (System.Exception etest)
    {
        // Report the type of the exception itself; the handler's EventArgs
        // parameter "e" has nothing to do with the failure.
        System.Console.Out.WriteLine(" caught a " + etest.GetType() + "\n with message: " + etest.Message + etest.ToString());

        // Console output is not visible to the page user, so surface a short,
        // HTML-encoded summary in the response as well.
        Response.Write("<br>" + Server.HtmlEncode(etest.GetType() + ": " + etest.Message) + "<br>");
    }
}
// Note from the original author: everything below this point appears never to run.
//
// Dictionary file locations. DictPath must be declared first because the other
// static initializers are built from it. None of these fields is referenced by
// executable code in the visible part of this file; the methods here build
// their own local DictPath instead.
// NOTE(review): nrFile points at "tr.dct" although its name (and the "nr.ctx"
// context file next to it) suggests "nr.dct" - confirm which file is intended.
public static string DictPath = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;
public static string coreDictFile = DictPath + "coreDict.dct";
public static string biDictFile = DictPath + "BigramDict.dct";
public static string contextFile = DictPath + "nr.ctx";
public static string nrFile = DictPath + "tr.dct";
/// <summary>
/// Click handler for Button2: loads the core dictionary, lists the entries for the
/// character '设', adds the word "设计模式" as a noun, saves the result as
/// coreDictNew.dct, reloads that file and lists the entries again.
/// </summary>
/// <remarks>
/// Progress is written to the console, as in the code this was adapted from, so
/// it is not visible in the rendered page.
/// </remarks>
/// <param name="sender">The object that raised the event.</param>
/// <param name="e">Event data (unused).</param>
protected void Button2_Click(object sender, EventArgs e)
{
    string DictPath = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;
    string coreDictPath = DictPath + "coreDict.dct";
    string newDictPath = DictPath + "coreDictNew.dct";

    Console.WriteLine("正在读入字典,请稍候...");
    WordDictionary dict = new WordDictionary();
    // Stop if the dictionary cannot be read; ShowWordsInfo would otherwise
    // dereference an index table that was never populated.
    if (!dict.Load(coreDictPath))
    {
        Console.WriteLine("读取字典失败: " + coreDictPath);
        return;
    }
    ShowWordsInfo(dict, '设');

    Console.WriteLine("\r\n向字典库插入“设计模式”一词...");
    dict.AddItem("设计模式", Utility.GetPOSValue("n"), 10);

    Console.WriteLine("\r\n修改完成,将字典写入磁盘文件coreDictNew.dct,请稍候...");
    dict.Save(newDictPath);

    Console.WriteLine("\r\n打开已写入的字典,请稍候...");
    if (!dict.Load(newDictPath))
    {
        Console.WriteLine("读取字典失败: " + newDictPath);
        return;
    }
    ShowWordsInfo(dict, '设');

    // The original console sample ended with a "press Enter to exit" prompt and
    // Console.ReadLine(); a web request handler must not wait on standard input,
    // so both were removed.
}
/// <summary>
/// Writes to the console every dictionary entry stored under the index slot of
/// character <paramref name="c"/>: word length, frequency, part of speech and the
/// word text.
/// </summary>
/// <param name="dict">Loaded dictionary whose index table is read.</param>
/// <param name="c">Character whose slot in the index table is listed.</param>
public static void ShowWordsInfo(WordDictionary dict, char c)
{
    // Slot of the character in the dictionary's index table.
    int slot = Utility.CC_ID(c);

    Console.WriteLine("====================================\r\n汉字:{0}, ID :{1}\r\n", Utility.CC_ID2Char(slot), slot);
    Console.WriteLine(" 词长 频率 词性 词");

    int row = 0;
    while (row < dict.indexTable[slot].nCount)
    {
        // The stored word text is printed after its leading character, which is
        // recovered from the slot id and shown in parentheses.
        Console.WriteLine("{0,5} {1,6} {2,5} ({3}){4}",
            dict.indexTable[slot].WordItems[row].nWordLen,
            dict.indexTable[slot].WordItems[row].nFrequency,
            Utility.GetPOSString(dict.indexTable[slot].WordItems[row].nPOS),
            Utility.CC_ID2Char(slot),
            dict.indexTable[slot].WordItems[row].sWord);
        row++;
    }
}
//protected void TextBox4_TextChanged(object sender, EventArgs e)
//{
// //System.String[] stopword = new System.String[] { "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "s", "such", "t", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with", "我", "我们", "是", "的", "," };
// //stopword &= TextBox4.Text.Trim().ToString();
// string[] arr = { TextBox4.Text, "wo" };
//}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -