⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 textanalyzer.cs

📁 集成了中科院切词技术的中文切词工具
💻 CS
字号:
using System;
using System.Runtime.InteropServices;
using System.IO;
using System.Collections;


/// <summary>
/// -- 切词器 --
/// 功能:
///		1.把一段任意长度的文本按照中文语言语法和习惯切分词汇,返回标准的切词字段序列
///		2.对切词字段进行分析,提取有意义的实词,返回这段文本的关键词汇列表
/// </summary>
public class TextAnalyzer {
	
	/// <summary>
	/// Gets whether the native segmentation library is currently loaded.
	/// The flag is set by <see cref="Init"/> when the native initialisation
	/// succeeds and cleared by <see cref="Exit"/>.
	/// </summary>
	public static bool Ready {
		get {
			return _ready;
		}
	}

	
	/// <summary>
	/// Loads the native segmentation library and (re)reads the stop-word list.
	/// </summary>
	/// <remarks>
	/// The ready flag is only raised when the native initialisation reports
	/// success; it is never lowered here. The stop-word list is read from
	/// "stoplist.txt" (relative path, so it is resolved against the process's
	/// current directory), one entry per line, using the system default encoding.
	/// Any I/O exception from opening or reading that file propagates to the caller.
	/// </remarks>
	public static void Init() {
		if (ICTCLAS_Init()) {
			_ready = true;
		}

		// The stop-word list is an external configuration file that may be edited
		// at any time, so it is read afresh on every initialisation.
		STOP = new ArrayList();

		// 'using' guarantees the file handle is released even if reading throws.
		using (StreamReader reader = new StreamReader("stoplist.txt", System.Text.Encoding.Default)) {
			string line;
			while ((line = reader.ReadLine()) != null) {
				STOP.Add(line);
			}
		}
	}

	/// <summary>
	/// Unloads the native segmentation library and marks the analyzer as not ready.
	/// </summary>
	public static void Exit() {
		// The native call's return value is not inspected; the ready flag is
		// cleared unconditionally afterwards.
		ICTCLAS_Exit();

		_ready = false;
	}

	/// <summary>
	/// Segments a piece of text and returns the tagged token sequence produced
	/// by the native library.
	/// </summary>
	/// <param name="text">Text to analyse.</param>
	/// <returns>
	/// The tagged result, e.g. "中国/n 人民/n"; an empty string when the text is
	/// empty after trimming; <c>null</c> when <paramref name="text"/> is null or
	/// the segmentation library has not been loaded.
	/// </returns>
	public static string Process(string text) {
		// Number of characters handed to the native library per call.
		const int SegmentLength = 300;
		// Lower bound for the native output buffer (the size used historically).
		const int MinOutputBytes = 2048;

		if (text == null) {
			return null;
		}
		// Nothing can be done until the segmentation library has been loaded.
		if (!_ready) {
			return null;
		}

		// Pre-processing: blank out unsupported symbols and punctuation, then map
		// each character of FULL to the character at the same index in HALF.
		string prepare = text.Trim();
		for (int i = 0; i < UNKNOWN.Length; i++) prepare = prepare.Replace(UNKNOWN[i], ' ');
		for (int i = 0; i < PUNCT.Length; i++) prepare = prepare.Replace(PUNCT[i], ' ');
		for (int i = 0; i < FULL.Length; i++) prepare = prepare.Replace(FULL[i], HALF[i]);

		System.Text.Encoding encoding = System.Text.Encoding.Default;
		System.Text.StringBuilder result = new System.Text.StringBuilder();

		for (int pos = 0; pos < prepare.Length; pos += SegmentLength) {
			string segment = prepare.Substring(pos, Math.Min(SegmentLength, prepare.Length - pos));

			// Encode into an exactly-sized buffer with one extra zero byte so the
			// native side always sees a NUL-terminated string. (MemoryStream.GetBuffer
			// returns the raw capacity-sized buffer and only terminates by accident.)
			int byteCount = encoding.GetByteCount(segment);
			byte[] input = new byte[byteCount + 1];
			encoding.GetBytes(segment, 0, segment.Length, input, 0);

			// The output holds every input byte plus a "/tag" suffix and separators
			// per token, so scale it with the input instead of using a fixed 2048
			// bytes. NOTE(review): 8x is a conservative guess at the worst-case
			// expansion -- confirm against the ICTCLAS documentation.
			// A managed array is zero-initialised, so a failed native call yields "".
			byte[] output = new byte[Math.Max(MinOutputBytes, byteCount * 8 + 16)];

			string tagged;
			unsafe {
				fixed (byte* inPtr = input) {
					fixed (byte* outPtr = output) {
						ICTCLAS_ParagraphProcess((sbyte*)inPtr, (sbyte*)outPtr);
						// Decodes up to the first NUL using the system default ANSI code page.
						tagged = new string((sbyte*)outPtr);
					}
				}
			}

			tagged = tagged.Trim();
			if (tagged.Length == 0) {
				continue;
			}

			// Keep a separator between segments; without it the last token of one
			// segment and the first token of the next would fuse together.
			if (result.Length > 0) {
				result.Append(' ');
			}
			result.Append(tagged);
		}
		return result.ToString();
	}

	/// <summary>
	/// Analyses a piece of text and returns the words accepted as meaningful.
	/// </summary>
	/// <remarks>
	/// A token is kept when its tag (the part after the first '/') starts with one
	/// of the letters a, f, g, j, l, n or v (case-insensitive) and its word is not
	/// contained in the stop-word list.
	/// </remarks>
	/// <param name="text">Text to analyse.</param>
	/// <returns>
	/// An array with one word per element. The array is empty when
	/// <see cref="Process"/> yields no result (null text or library not loaded).
	/// </returns>
	public static string[] Parse(string text) {
		// First letters of the tags that are accepted.
		const string AcceptedTags = "afgjlnv";

		ArrayList list = new ArrayList();
		string result = Process(text);
		// Process returns null for null input or when the library is not loaded.
		if (result == null) {
			return new string[0];
		}

		string[] tokens = result.Split(' ');
		foreach (string token in tokens) {
			string item = token.Trim();
			if (item.Length == 0) continue;

			// Tokens have the form "word/tag"; anything without a slash is skipped.
			int slash = item.IndexOf('/');
			if (slash == -1) continue;

			string value = item.Substring(0, slash);
			// A token with nothing before the slash carries no word.
			if (value.Length == 0) continue;

			// Only the first letter of the tag decides whether the token is kept.
			char tag = slash + 1 < item.Length ? char.ToLower(item[slash + 1]) : '\0';
			if (AcceptedTags.IndexOf(tag) == -1) continue;

			if (STOP != null && STOP.Contains(value)) continue;
			list.Add(value);
		}
		return (string[])list.ToArray(typeof(string));
	}


	// True once ICTCLAS_Init has reported success; reset to false by Exit().
	private static bool _ready = false;

	// Three private native entry points exported by ICTCLAS.dll.
	// NOTE(review): the bool return values use default P/Invoke marshaling (4-byte
	// Win32 BOOL). If the native functions return a 1-byte C++ bool, they would need
	// [return: MarshalAs(UnmanagedType.I1)] -- confirm against the ICTCLAS header.
	[DllImport("ICTCLAS.dll", CharSet=CharSet.Auto)]
	private static extern bool ICTCLAS_Init();
	[DllImport("ICTCLAS.dll", CharSet=CharSet.Auto)]
	private static extern bool ICTCLAS_Exit();
	// Both arguments are raw byte pointers: 'paragraph' is the input text and
	// 'result' is a caller-allocated buffer that receives the tagged output.
	[DllImport("ICTCLAS.dll", CharSet=CharSet.Auto)]
	unsafe private static extern bool ICTCLAS_ParagraphProcess(sbyte* paragraph, sbyte* result);
   


	//四个私有的停用符号,替换符号集合
	private const string UNKNOWN = "─ ━ ▔-—ˉ ̄‐ー―" +
		"┄ ┅ ┈ ┉﹉﹊→←﹍﹎∣│︳︱|▕┃┆┇┊┋↑↓" +
		"∕/╱↗↙\﹨╲↘↖<﹤≮≤《》≥≯﹥>∧∨^ˇ" +
		"╳×√ ▼▽◢◣◥◤☉〒⊿▁▂▃▄▅▆▇█▎▍▌▋▊▉■" +
		"┌┍┎┏ ┐┑┒┓└┕┖┗┘┙┚┛├┝┞┟┠" +
		"┡┢┣ ┤┥┦┧┨┩┪┫┬┯┰┳┱┲┭┮" +
		"┴┷┸┻┵┶┹┺┼┿╂╋┽┾╀╁╃" +
		"╄╅╆╇╈╉╊═║╒╓╔╘╙╚ ╕╖╗" +
		"╛╜╝╞╟╠¬¦╡╢╣╤╥╦╧╨╩╪╫╬" +
		"∟﹁﹂﹃﹄︻︼︵︶︹︺︿﹀︽︾︷︸≈﹋﹌﹏" +
		"︴‖∥﹢﹣﹦+=⊥「」()〇╭╮╯╰﹙﹚﹛﹜﹝﹞⌒" +
		"∠△▲◇◆〈〉⊙○●◎□▓#〝〞゛゜ヽヾ〆ゝゞˊˋ˙–" +
		"′″"〃‵'∵∴∷¨∶…~`·‥︰﹐﹒﹔﹕" +
		"㊣㎎㎏㎜㎝㎞㎡㏄㏎㏑㏒㏕℡㈱﹖﹗﹟﹠﹡﹩﹪﹫℅℉≒≦≧" +
		"々〔〕『』〖〗【】±÷∑∏∪∩∈∫∮≡≌∽∝≠∞" +
		"♂♀°℃$¤¢£‰§№☆★※〓!¥%&*?@[]{}。,;:‘’“”、" +
		"

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -