// File: dictionary.cs
// (The "font size:" label that followed the file name was code-viewer page UI, not source.)
using System;
using System.Runtime.InteropServices;
using System.IO;
using System.Collections;
/// <summary>
/// Dictionary (vocabulary) of a training corpus.
/// Training extracts every distinct term that occurs in the samples and records,
/// for each term, the number of samples containing it (its document frequency, DF).
/// For a new text, the per-term training data is used to build the text's feature vector.
///
/// The class itself is an ArrayList whose elements are Word objects.
/// </summary>
[Serializable()]
public class Dictionary : ArrayList {
    /// <summary>
    /// Number of samples passed to the most recent completed Train call.
    /// </summary>
    private int _size;

    /// <summary>
    /// Text of every term, stored as plain strings (not Word objects) in the same
    /// order as the entries of the Dictionary itself, so a term can be located by text.
    /// </summary>
    ArrayList _backup;

    /// <summary>
    /// Creates an empty, untrained dictionary.
    /// </summary>
    public Dictionary() {
        _size = 0;
        _backup = new ArrayList();
    }

    /// <summary>
    /// Trains the dictionary, replacing any previous content.
    /// </summary>
    /// <param name="samples">Array holding the text of every training sample.</param>
    /// <exception cref="ArgumentNullException"><paramref name="samples"/> is null.</exception>
    public void Train(string[] samples) {
        if (samples == null) {
            throw new ArgumentNullException("samples");
        }

        // Stay "not ready" while rebuilding, so a failure part-way through does not
        // leave Ready == true on top of a half-built vocabulary.
        _size = 0;
        this.Clear();
        _backup.Clear();

        // Hashed lookups replace the linear ArrayList scans the nested loops used to do.
        Hashtable wordsByText = new Hashtable();   // term text -> its Word entry
        Hashtable seenInSample = new Hashtable();  // terms already counted for the current sample

        foreach (string sample in samples) {
            seenInSample.Clear();
            string[] values = TextAnalyzer.Parse(sample);
            foreach (string value in values) {
                // Hashtable keys cannot be null; a null token carries no term text anyway.
                if (value == null) {
                    continue;
                }
                // Count each term at most once per sample: Count is a document frequency.
                if (seenInSample.ContainsKey(value)) {
                    continue;
                }
                seenInSample.Add(value, null);

                Word known = wordsByText[value] as Word;
                if (known != null) {
                    known.Count++;
                } else {
                    Word added = new Word(value);
                    added.Count = 1;
                    this.Add(added);
                    wordsByText.Add(value, added);
                }
            }
        }

        // Ordering is whatever Word's own comparison defines (Word is declared elsewhere).
        this.Sort();

        // After sorting, give every word an ID equal to its position and rebuild the
        // parallel list of term texts in that same order.
        int position = 0;
        foreach (Word entry in this) {
            _backup.Add(entry.Value);
            entry.ID = position;
            position++;
        }

        _size = samples.Length;
    }

    /// <summary>
    /// Evaluates a new text and computes its feature vector.
    /// </summary>
    /// <param name="text">The new text.</param>
    /// <returns>
    /// A vector with one component per dictionary term. Each component is the term's
    /// occurrence count in the text multiplied by ln(sampleCount / DF + 0.01), and the
    /// whole vector is then scaled to unit Euclidean length. Terms absent from the text
    /// stay 0. Returns null when the dictionary has not been trained (Ready is false).
    /// </returns>
    public double[] Test(string text) {
        if (!Ready) return null;

        string[] values = TextAnalyzer.Parse(text);
        double[] vector = new double[this.Count];

        // Term frequency: count occurrences of every known term; unknown terms are ignored.
        foreach (string value in values) {
            Word match = this[value];   // single lookup instead of one for the test and one for the use
            if (match == null) continue;
            vector[match.ID] += 1;
        }

        // Weight each non-zero component and accumulate the squared length.
        double normal = 0.0;
        for (int i = 0; i < vector.Length; i++) {
            if (vector[i] == 0) continue;
            vector[i] = vector[i] * Math.Log((double)_size / (double)(this[i].Count) + 0.01);
            normal += vector[i] * vector[i];
        }

        // Normalise. The square root is loop-invariant, so compute it once. When every
        // component is 0 the loop body never divides, so a zero length is harmless.
        double length = Math.Sqrt(normal);
        for (int i = 0; i < vector.Length; i++) {
            if (vector[i] == 0) continue;
            vector[i] /= length;
        }
        return vector;
    }

    /// <summary>
    /// Indexer: returns the word stored at the given ID (position) in the dictionary,
    /// or null when the ID is out of range.
    /// </summary>
    public new Word this[int id] {
        get {
            if (id >= 0 && id < this.Count) return base[id] as Word;
            else return null;
        }
    }

    /// <summary>
    /// Indexer: returns the word whose text equals the given value,
    /// or null when the dictionary has no such term.
    /// </summary>
    public Word this[string value] {
        get {
            // One IndexOf scan covers both "is it present" and "where is it";
            // the int indexer already maps -1 (not found) to null.
            return this[_backup.IndexOf(value)];
        }
    }

    /// <summary>
    /// True once the dictionary has been trained with at least one sample.
    /// </summary>
    public bool Ready {get {return _size != 0;}}
}
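// Code-viewer page residue (not part of the program) - keyboard shortcut help:
//   Copy code:          Ctrl + C
//   Search code:        Ctrl + F
//   Full-screen mode:   F11
//   Toggle theme:       Ctrl + Shift + D
//   Show shortcuts:     ?
//   Increase font size: Ctrl + =
//   Decrease font size: Ctrl + -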