📄 textanalyzer.cs
字号:
using System;
using System.Runtime.InteropServices;
using System.IO;
using System.Collections;
/// <summary>
/// -- 切词器 --
/// 功能:
/// 1.把一段任意长度的文本按照中文语言语法和习惯切分词汇,返回标准的切词字段序列
/// 2.对切词字段进行分析,提取有意义的实词,返回这段文本的关键词汇列表
/// </summary>
public class TextAnalyzer {
/// <summary>
/// Gets a value indicating whether the text segmentation library is currently loaded.
/// </summary>
public static bool Ready {
    get {
        return _ready;
    }
}
/// <summary>
/// Loads the native segmentation library and reads the stop-word list.
/// </summary>
/// <remarks>
/// <see cref="Ready"/> becomes true only when <c>ICTCLAS_Init</c> reports success.
/// The stop-word list is read from "stoplist.txt" (one entry per line) whether or not
/// the native library loaded; a relative path is resolved against the current directory.
/// </remarks>
public static void Init() {
    if (ICTCLAS_Init()) {
        _ready = true;
    }

    // The stop-word list is an external configuration file that can be edited at any
    // time, so it is re-read on every initialization.
    STOP = new ArrayList();

    // "using" guarantees the file handle is released even if reading throws part-way.
    using (StreamReader reader = new StreamReader("stoplist.txt", System.Text.Encoding.Default)) {
        string line;
        while ((line = reader.ReadLine()) != null) {
            STOP.Add(line);
        }
    }
}
/// <summary>
/// Unloads the native segmentation library and marks the analyzer as not ready.
/// </summary>
public static void Exit() {
    // Release the native library first; its return value is intentionally not inspected.
    ICTCLAS_Exit();

    // From here on Process() returns null until Init() succeeds again.
    _ready = false;
}
/// <summary>
/// Segments a piece of text and returns the tagged token string produced by the native library.
/// </summary>
/// <param name="text">Text to analyze.</param>
/// <returns>
/// Space-separated "word/tag" tokens (for example "中国/n 人民/n"), an empty string when
/// the trimmed text is empty, or null when <paramref name="text"/> is null or the
/// segmentation library has not been loaded.
/// </returns>
public static string Process(string text) {
    if (text == null) {
        return null;
    }
    // Nothing can be segmented until the native library has been loaded.
    if (!_ready) {
        return null;
    }

    // Pre-processing: blank out unsupported symbols and punctuation, then map each
    // FULL character to the HALF character at the same index.
    string prepare = text.Trim();
    for (int i = 0; i < UNKNOWN.Length; i++) prepare = prepare.Replace(UNKNOWN[i], ' ');
    for (int i = 0; i < PUNCT.Length; i++) prepare = prepare.Replace(PUNCT[i], ' ');
    for (int i = 0; i < FULL.Length; i++) prepare = prepare.Replace(FULL[i], HALF[i]);

    // The text is handed to the native library in segments of 300 characters.
    const int SegmentLength = 300;
    // The native call receives no buffer length, so the output buffer is sized generously.
    // NOTE(review): 4096 is a defensive guess; confirm the worst-case output size
    // against the ICTCLAS documentation.
    const int OutputBufferSize = 4096;

    System.Text.Encoding encoding = System.Text.Encoding.Default;
    System.Text.StringBuilder result = new System.Text.StringBuilder();

    unsafe {
        // Allocated once, outside the loop: stackalloc memory is only released when the
        // method returns, so allocating per segment would grow the stack with text length.
        sbyte* outPtr = stackalloc sbyte[OutputBufferSize];

        for (int pos = 0; pos < prepare.Length; pos += SegmentLength) {
            int count = Math.Min(SegmentLength, prepare.Length - pos);
            string segment = prepare.Substring(pos, count);

            // Encode into an array with one extra byte so the input is always
            // NUL-terminated for the native code (new byte[] is zero-initialized).
            byte[] inBytes = new byte[encoding.GetByteCount(segment) + 1];
            encoding.GetBytes(segment, 0, segment.Length, inBytes, 0);

            // Start from an empty C string so that a call which writes nothing does
            // not surface the previous segment's output.
            outPtr[0] = 0;
            fixed (byte* inPtr = inBytes) {
                ICTCLAS_ParagraphProcess((sbyte*)inPtr, outPtr);
            }

            // Bounded scan for the terminator so decoding never reads past the buffer.
            int length = 0;
            while (length < OutputBufferSize && outPtr[length] != 0) {
                length++;
            }
            string segmentResult = new string(outPtr, 0, length, encoding).Trim();
            if (segmentResult.Length == 0) {
                continue;
            }

            // Keep segments separated; otherwise the last token of one segment fuses
            // with the first token of the next.
            if (result.Length > 0) {
                result.Append(' ');
            }
            result.Append(segmentResult);
        }
    }
    return result.ToString();
}
/// <summary>
/// Analyzes a piece of text and returns the words accepted as meaningful.
/// </summary>
/// <param name="text">Text to analyze.</param>
/// <returns>
/// One array element per accepted word, in order of appearance. A word is accepted when
/// its tag starts with one of a, f, g, j, l, n or v (case-insensitive) and it is not in
/// the stop-word list. Returns an empty array when <c>Process</c> yields no result
/// (null text, or the segmentation library is not loaded).
/// </returns>
public static string[] Parse(string text) {
    string result = Process(text);
    // Process returns null for null input or an unloaded library; report "no words"
    // instead of failing with a NullReferenceException.
    if (result == null) {
        return new string[0];
    }

    ArrayList list = new ArrayList();
    foreach (string token in result.Split(' ')) {
        string item = token.Trim();

        // Every usable token has the form "word/tag"; skip blanks, untagged tokens
        // and tokens with an empty word part.
        int slash = item.IndexOf('/');
        if (slash <= 0) {
            continue;
        }
        // A trailing slash means the tag is missing.
        if (slash + 1 >= item.Length) {
            continue;
        }

        // Only the first letter of the tag decides whether the word is kept.
        char category = char.ToLowerInvariant(item[slash + 1]);
        if ("afgjlnv".IndexOf(category) < 0) {
            continue;
        }

        string value = item.Substring(0, slash);
        if (STOP.Contains(value)) {
            continue;
        }
        list.Add(value);
    }
    return (string[])list.ToArray(typeof(string));
}
// Tracks whether ICTCLAS_Init has reported success; set by Init() and cleared by Exit().
private static bool _ready = false;
// Three private native entry points imported from ICTCLAS.dll.
// NOTE(review): the native prototypes are not visible here. If they return a 1-byte
// C++ bool, the default 4-byte BOOL marshalling of these return values may need
// [return: MarshalAs(UnmanagedType.I1)] -- confirm against the ICTCLAS header.
[DllImport("ICTCLAS.dll", CharSet=CharSet.Auto)]
private static extern bool ICTCLAS_Init();
[DllImport("ICTCLAS.dll", CharSet=CharSet.Auto)]
private static extern bool ICTCLAS_Exit();
// paragraph: input buffer passed as a raw pointer; result: caller-allocated output
// buffer. No buffer lengths are passed, so both are presumably NUL-terminated
// C strings -- verify against the ICTCLAS header.
[DllImport("ICTCLAS.dll", CharSet=CharSet.Auto)]
unsafe private static extern bool ICTCLAS_ParagraphProcess(sbyte* paragraph, sbyte* result);
//四个私有的停用符号,替换符号集合
private const string UNKNOWN = "─ ━ ▔-—ˉ ̄‐ー―" +
"┄ ┅ ┈ ┉﹉﹊→←﹍﹎∣│︳︱|▕┃┆┇┊┋↑↓" +
"∕/╱↗↙\﹨╲↘↖<﹤≮≤《》≥≯﹥>∧∨^ˇ" +
"╳×√ ▼▽◢◣◥◤☉〒⊿▁▂▃▄▅▆▇█▎▍▌▋▊▉■" +
"┌┍┎┏ ┐┑┒┓└┕┖┗┘┙┚┛├┝┞┟┠" +
"┡┢┣ ┤┥┦┧┨┩┪┫┬┯┰┳┱┲┭┮" +
"┴┷┸┻┵┶┹┺┼┿╂╋┽┾╀╁╃" +
"╄╅╆╇╈╉╊═║╒╓╔╘╙╚ ╕╖╗" +
"╛╜╝╞╟╠¬¦╡╢╣╤╥╦╧╨╩╪╫╬" +
"∟﹁﹂﹃﹄︻︼︵︶︹︺︿﹀︽︾︷︸≈﹋﹌﹏" +
"︴‖∥﹢﹣﹦+=⊥「」()〇╭╮╯╰﹙﹚﹛﹜﹝﹞⌒" +
"∠△▲◇◆〈〉⊙○●◎□▓#〝〞゛゜ヽヾ〆ゝゞˊˋ˙–" +
"′″"〃‵'∵∴∷¨∶…~`·‥︰﹐﹒﹔﹕" +
"㊣㎎㎏㎜㎝㎞㎡㏄㏎㏑㏒㏕℡㈱﹖﹗﹟﹠﹡﹩﹪﹫℅℉≒≦≧" +
"々〔〕『』〖〗【】±÷∑∏∪∩∈∫∮≡≌∽∝≠∞" +
"♂♀°℃$¤¢£‰§№☆★※〓!¥%&*?@[]{}。,;:‘’“”、" +
"
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -