📄 unit1.~pas
字号:
if not self.ADOTable1.IsEmpty then
begin
self.ADOTable1.First;
while not self.ADOTable1.Eof do
begin
DocsClass[ADOTable1.FieldByName('DocIndex').AsInteger]:=
ADOTable1.FieldByName('ClassIndex').AsInteger;
self.ADOTable1.Next;
end;
end;
self.ADOTable1.Active:=false;
end;
// Form destruction handler: releases the slFileNames object.
// slFileNames is declared and created outside this block (not visible here);
// presumably a TStringList of data-set file names -- confirm at its creation site.
procedure TForm1.FormDestroy(Sender: TObject);
begin
slFileNames.Free;
end;
// Builds an inverted index (word -> list of document numbers) from the files
// <exe dir>\DataSet\1.txt .. 600.txt and stores it in the 'WordsTable' table
// through ADOTable1 (fields 'Words' and 'Docs').
//
// Tokenisation: every run of characters terminated by '/' is one word. Text
// after the last '/' of a file is not indexed, and line breaks inside the
// file are kept as part of the surrounding word (DelRepeatWords applies the
// same '/' rule when documents are classified).
//
// For each word the 'Docs' field receives the document numbers that contain
// it, each followed by a comma, e.g. '3,17,204,'.
// The resulting word list and posting lists are also shown in memo1 / memo2.
procedure TForm1.btnReverseClick(Sender: TObject);
var DataDir:string;
  DocIndex,j:integer;
  slDoc,slDocWords:TStringList;
  AWord,DocText:string;
  slDocsIndex,slAllWords:TStringList;
  WordIndex:integer;
begin
  DataDir:=Extractfilepath(application.ExeName)+'DataSet';
  // Pre-nil so the finally block can free safely even if a Create fails.
  slDoc:=nil;
  slDocWords:=nil;
  slDocsIndex:=nil;
  slAllWords:=nil;
  try
    slDoc:=TStringList.Create;
    slDocWords:=TStringList.Create;
    slDocsIndex:=TStringList.Create;
    slAllWords:=TStringList.Create;
    // Invert the documents
    for docindex:=1 to 600 do
    begin
      self.StatusBar1.SimpleText:='正在处理文档......'+inttostr(DocIndex)+'/600';
      application.ProcessMessages;
      slDoc.LoadFromFile(DataDir+'\'+inttostr(DocIndex)+'.txt');
      // TStrings.Text rebuilds the whole string on every read, so fetch it
      // once per document instead of once (or more) per character.
      DocText:=slDoc.Text;
      // Collect the distinct words of this document.
      slDocWords.Clear;
      AWord:='';
      for j:=1 to length(DocText) do
      begin
        if DocText[j]='/' then
        begin
          if slDocWords.IndexOf(AWord)=-1 then
            slDocWords.Append(AWord);
          AWord:='';
        end
        else
          AWord:=AWord+DocText[j];
      end;
      // Merge them into the global index; slAllWords[k] and slDocsIndex[k]
      // are parallel lists (word / its posting list).
      for j:=0 to slDocWords.Count-1 do
      begin
        AWord:=slDocWords.Strings[j];
        WordIndex:=slAllWords.IndexOf(AWord);
        if WordIndex=-1 then
        begin
          slAllWords.Append(AWord);
          WordIndex:=slAllWords.Count-1;
          slDocsIndex.Append('');
        end;
        slDocsIndex.Strings[WordIndex]:=slDocsIndex.Strings[WordIndex]+inttostr(DocIndex)+',';
      end;
    end;
    // Save to the database
    self.ADOTable1.TableName:='WordsTable';
    self.ADOTable1.Active:=true;
    try
      // An empty table gets one placeholder row first; it is overwritten by
      // the first word in the loop below.
      if self.ADOTable1.IsEmpty then
      begin
        self.ADOTable1.Append;
        self.ADOTable1.FieldByName('Words').AsString:='a';
        self.ADOTable1.FieldByName('Docs').AsString:='a';
        self.ADOTable1.Post;
      end;
      self.ADOTable1.First;
      // Existing rows are overwritten in order; once the cursor runs past the
      // last row, new rows are appended.
      // NOTE(review): if the table already holds more rows than there are
      // words, the surplus rows are left untouched -- confirm this is intended.
      for WordIndex:=0 to slAllWords.Count-1 do
      begin
        self.StatusBar1.SimpleText:='正在保存到数据库.....'+inttostr(WordIndex+1)+'/'+inttostr(slAllWords.Count);
        application.ProcessMessages;
        if self.ADOTable1.Eof then
          self.ADOTable1.Append
        else
          self.ADOTable1.Edit;
        self.ADOTable1.FieldByName('Words').AsString:=slAllWords.Strings[WordIndex];
        self.ADOTable1.FieldByName('Docs').AsString:=slDocsIndex.Strings[WordIndex];
        self.ADOTable1.Post;
        self.ADOTable1.Next;
      end;
    finally
      // Close the table even when a database operation raised.
      self.ADOTable1.Active:=false;
    end;
    memo1.Text:=slAllWords.Text;
    memo2.Text:=slDocsIndex.Text;
  finally
    slDoc.Free;
    slDocWords.Free;
    slAllWords.Free;
    slDocsIndex.Free;
  end;
end;
// Splits a '/'-terminated word sequence and removes repeated words.
//
// words  : text in which every word is followed by '/'.
// Result : the distinct words in order of first appearance, in
//          TStringList.Text form (one word per line, each line terminated by
//          a line break).
//
// Characters after the last '/' are discarded, and an empty segment ('//')
// yields an empty word. Duplicate detection uses TStringList.IndexOf, so it
// follows that list's default comparison rules.
function DelRepeatWords(words:string):string;
var
  i,tokenStart:integer;
  token:string;
  sl:tstringlist;
begin
  sl:=tstringlist.Create;
  try
    tokenStart:=1;
    for i:=1 to length(words) do
      if words[i]='/' then
      begin
        // Slice the word in one step instead of growing it char by char.
        token:=Copy(words,tokenStart,i-tokenStart);
        if sl.IndexOf(token)=-1 then
          sl.Append(token);
        tokenStart:=i+1;
      end;
    result:=sl.Text;
  finally
    // Release the list even if an exception is raised above.
    sl.Free;
  end;
end;
// Naive Bayes text classification experiment over the 600-document data set.
//
// Steps:
//   1. Load the inverted index (word -> document list) from 'WordsTable'.
//   2. Randomly hold out 60 documents (10%) as the test set.
//   3. Count training documents per class and, for every word, the number of
//      training documents of each class that contain it.
//   4. Classify each test document and report the accuracy in memo1.
//
// Class labels come from DocsClass, which is filled outside this block;
// classes 1..5 are the ones scored here.
// Scores are accumulated as sums of logarithms rather than products of
// probabilities, so documents with many words do not underflow to zero.
// NOTE(review): Randomize is not called in this block; confirm it is called
// elsewhere, otherwise every run selects the same test set.
procedure TForm1.btnNaiveClick(Sender: TObject);
const infinitesimal=0.0001;// small constant that keeps a word probability from being 0
var
  i,j,k,DocIndex,WordIndex,ClassIndex:integer;
  slDocsIndex,slAllWords,slADoc:TStringList;
  s,strDocIndex,DataDir:string;
  TestDocs:array[0..60] of integer;// document numbers of the test set (slots 1..60 used)
  TestDocsClass:array[0..60] of integer;// predicted class per test document; slot 0 = number correct
  ClassDocCount:array[0..5] of integer;// training documents per class; slot 0 = total
  TestDocClassProb:array[0..5] of double;// log-score per class; slot 0 = best score so far
  WordDF:array of array of integer;// per word, per class: training-document frequency

  // Returns true when DocIndex is one of the held-out test documents.
  function IsInTestDocs(DocIndex:integer):boolean;
  var i:integer;
  begin
    result:=false;
    for i:=1+low(TestDocs) to high(TestDocs) do
    begin
      if TestDocs[i]=DocIndex then
      begin
        result:=true;
        break;
      end;
    end;
  end;

begin
  // Same data directory as the indexing step, independent of the current directory.
  DataDir:=ExtractFilePath(application.ExeName)+'DataSet';
  // Pre-nil so the finally block can free safely even if a Create fails.
  slDocsIndex:=nil;
  slAllWords:=nil;
  slADoc:=nil;
  try
    slDocsIndex:=TStringlist.Create;
    slAllWords:=TStringlist.Create;
    slADoc:=TStringList.Create;

    // Load the inverted index from the database.
    self.ADOTable1.TableName:='WordsTable';
    self.ADOTable1.Active:=true;
    try
      if self.ADOTable1.IsEmpty then
      begin
        showmessage('数据库中没有文档集,请先倒排文档');
        exit;// both finally blocks still run: table closed, lists freed
      end;
      memo1.Clear;
      memo2.Clear;
      self.ADOTable1.First;
      while not self.ADOTable1.Eof do
      begin
        slAllWords.Append(ADOTable1.FieldByName('Words').AsString);
        slDocsIndex.Append(ADOTable1.FieldByName('Docs').AsString);
        self.StatusBar1.SimpleText:='正载入文档集......'+inttostr(slAllWords.Count)+'/'+inttostr(ADOTable1.RecordCount);
        application.ProcessMessages;
        self.ADOTable1.Next;
      end;
    finally
      self.ADOTable1.Active:=false;
    end;

    // Size the frequency table from the words actually loaded.
    setlength(WordDF,slAllWords.Count,6);
    for WordIndex:=0 to slAllWords.Count-1 do
      for k:=0 to 5 do
        WordDF[WordIndex][k]:=0;

    // Randomly pick 10% of the documents as the test set (numbers go into TestDocs).
    self.StatusBar1.SimpleText:='随机划分训练与测试集......';
    // Clear first: IsInTestDocs scans every slot, and document numbers start
    // at 1, so 0 can never match a real document.
    for i:=low(TestDocs) to high(TestDocs) do
      TestDocs[i]:=0;
    s:='';
    for i:=1+low(TestDocs) to high(TestDocs) do
    begin
      DocIndex:=1+random(600);
      while IsInTestDocs(DocIndex) do
        DocIndex:=1+random(600);
      TestDocs[i]:=DocIndex;
      s:=s+inttostr(DocIndex)+',';
    end;
    memo1.Lines.Append('测试集文档:'+s);

    // Count the training documents of each class.
    for ClassIndex:=low(ClassDocCount) to high(ClassDocCount) do
      ClassDocCount[ClassIndex]:=0;
    for DocIndex:=1 to 600 do
    begin
      if not IsInTestDocs(DocIndex) then
        ClassDocCount[DocsClass[DocIndex]]:=ClassDocCount[DocsClass[DocIndex]]+1;
    end;
    for ClassIndex:=1 to high(ClassDocCount) do
      ClassDocCount[0]:=ClassDocCount[0]+ClassDocCount[ClassIndex];
    s:='训练集中共有文档'+inttostr(ClassDocCount[0])+'篇,其中';
    for ClassIndex:=1+low(ClassDocCount) to high(ClassDocCount) do
      s:=s+'类'+inttostr(ClassIndex)+'为'+inttostr(ClassDocCount[ClassIndex])+'篇,';
    memo1.Lines.Append('');
    memo1.Lines.Append(s);

    // Compute the per-class document frequency of every word.
    for WordIndex:=0 to slAllWords.Count-1 do
    begin
      self.StatusBar1.SimpleText:='正在统计词的文档频率......'+inttostr(WordIndex);
      application.ProcessMessages;
      // Posting list format: document numbers, each followed by ','.
      strDocIndex:=slDocsIndex.Strings[WordIndex];
      s:='';
      for i:=1 to length(strDocIndex) do
      begin
        if strDocIndex[i]=',' then
        begin
          DocIndex:=strtointdef(s,0);
          // Ignore entries that do not parse to a document number in 1..600.
          if (DocIndex>=1) and (DocIndex<=600) and (not IsInTestDocs(DocIndex)) then
          begin
            ClassIndex:=DocsClass[DocIndex];
            WordDF[WordIndex][ClassIndex]:=WordDF[WordIndex][ClassIndex]+1;
          end;
          s:='';
        end
        else
          s:=s+strDocIndex[i];
      end;
    end;
    memo1.Lines.Append('');
    memo1.Lines.Append('完成词文档频率统计');

    // Classify the test documents.
    memo1.Lines.Append('');
    memo1.Lines.Append('正在分类测试文档......');
    self.StatusBar1.SimpleText:='';
    TestDocsClass[0]:=0;// slot 0 accumulates the number of correct classifications
    for i:=1+low(TestDocs) to high(TestDocs) do
    begin
      DocIndex:=TestDocs[i];
      slADoc.LoadFromFile(DataDir+'\'+inttostr(DocIndex)+'.txt');
      slADoc.Text:=DelRepeatWords(slADoc.Text);

      // Log-score of the document for each class: sum over its known words of
      // ln(infinitesimal + df(word,class)/docs(class)).
      for k:=1 to 5 do
        TestDocClassProb[k]:=0.0;
      for j:=0 to slADoc.Count-1 do
      begin
        WordIndex:=slAllWords.IndexOf(slADoc.Strings[j]);
        if WordIndex=-1 then
          continue;
        for k:=1 to 5 do
          if ClassDocCount[k]>0 then
            TestDocClassProb[k]:=TestDocClassProb[k]
              +Ln(infinitesimal+WordDF[WordIndex][k]/ClassDocCount[k]);
      end;

      // Add the class prior exactly once per class (proportional to the class
      // size), then keep the best-scoring class. A class without training
      // documents has probability 0 and is never chosen. Ties go to the
      // lower class number.
      ClassIndex:=0;
      for k:=1 to 5 do
      begin
        if ClassDocCount[k]=0 then
          continue;
        TestDocClassProb[k]:=TestDocClassProb[k]+Ln(ClassDocCount[k]);
        if (ClassIndex=0) or (TestDocClassProb[0]<TestDocClassProb[k]) then
        begin
          TestDocClassProb[0]:=TestDocClassProb[k];
          ClassIndex:=k;
        end;
      end;

      // Record the predicted class and update the correct count.
      // Test document i has number TestDocs[i]; its true class is
      // DocsClass[TestDocs[i]] and its predicted class is TestDocsClass[i].
      TestDocsClass[i]:=ClassIndex;
      if ClassIndex=DocsClass[DocIndex] then
        TestDocsClass[0]:=TestDocsClass[0]+1;
      memo1.Lines.Append('文档'+inttostr(DocIndex)+'应属于类'+inttostr(DocsClass[DocIndex])+
        ',被分到类'+inttostr(ClassIndex));
      application.ProcessMessages;
    end;
    // high-low = 60 = number of test documents (slot 0 is not a document).
    memo1.Lines.Append('正确分类数量:'+inttostr(TestDocsClass[0])+
      ',分类正确率:'+floattostr(TestDocsClass[0]/(high(TestDocs)-low(TestDocs))));
  finally
    slADoc.Free;
    slDocsIndex.Free;
    slAllWords.Free;
  end;
end;
end.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -