📄 unit1.~pas

📁 整个实验是在 Windows 环境下使用 Delphi 完成的。选取了 600 篇文档。
💻 ~PAS
📖 第 1 页 / 共 2 页
字号:
上一页 12
    if not self.ADOTable1.IsEmpty then
    begin
        self.ADOTable1.First;
        while not self.ADOTable1.Eof do
        begin
            DocsClass[ADOTable1.FieldByName('DocIndex').AsInteger]:=
                ADOTable1.FieldByName('ClassIndex').AsInteger;
            self.ADOTable1.Next;
        end;
    end;
    self.ADOTable1.Active:=false;
end;

// Form teardown handler: releases the slFileNames object.
// NOTE(review): slFileNames is declared and created outside this block;
// it is assumed to be a plain object-reference variable/field - confirm.
procedure TForm1.FormDestroy(Sender: TObject);
begin
    // FreeAndNil frees the object and also clears the reference, so code that
    // runs later during teardown sees nil instead of a dangling pointer.
    FreeAndNil(slFileNames);
end;
// Builds an inverted index over the document collection and stores it in the
// 'WordsTable' table.
//
// Input : <exe dir>\DataSet\1.txt .. <exe dir>\DataSet\600.txt, each a text
//         file whose words are terminated by '/'.
// Output: one table row per distinct word ('Words') with the comma-terminated
//         list of documents containing it ('Docs', e.g. '3,17,42,').
//         The word list and the document lists are also shown in memo1/memo2.
//
// Text after the last '/' of a file is ignored (as before). Line-break
// characters are dropped from words and empty words are skipped, because the
// classifier looks words up line by line and could never match a key that
// contains a line break.
procedure TForm1.btnReverseClick(Sender: TObject);
const
    DocCount=600;// number of documents in the DataSet folder
var DataDir:string;
    DocIndex,j:integer;
    slDoc,slDocWords:TStringList;
    DocText,AWord:string;
    slDocsIndex,slAllWords:TStringList;
    WordIndex,StaleCount:integer;

begin
    DataDir:=Extractfilepath(application.ExeName)+'DataSet';
    // Pre-set to nil so the finally block can free them unconditionally.
    slDoc:=nil;
    slDocWords:=nil;
    slDocsIndex:=nil;
    slAllWords:=nil;
    try
        slDoc:=TStringList.Create;
        slDocWords:=TStringList.Create;
        slDocsIndex:=TStringList.Create;
        slAllWords:=TStringList.Create;
        // Invert the documents
        for DocIndex:=1 to DocCount do
        begin
            self.StatusBar1.SimpleText:='正在处理文档......'+inttostr(DocIndex)+'/'+inttostr(DocCount);
            application.ProcessMessages;
            slDoc.LoadFromFile(DataDir+'\'+inttostr(DocIndex)+'.txt');
            // TStrings.Text rebuilds the whole string on every read, so fetch
            // it once instead of once per character.
            DocText:=slDoc.Text;
            // Collect the distinct words of this document.
            slDocWords.Clear;
            AWord:='';
            for j:=1 to length(DocText) do
            begin
                if DocText[j]='/' then
                begin
                    if (AWord<>'') and (slDocWords.IndexOf(AWord)=-1) then
                        slDocWords.Append(AWord);
                    AWord:='';
                end
                else if (DocText[j]<>#13) and (DocText[j]<>#10) then
                    AWord:=AWord+DocText[j];
            end;
            // Add this document to the posting list of each of its words.
            // slAllWords[k] and slDocsIndex[k] are parallel entries.
            for j:=0 to slDocWords.Count-1 do
            begin
                AWord:=slDocWords.Strings[j];
                WordIndex:=slAllWords.IndexOf(AWord);
                if WordIndex=-1 then
                begin
                    slAllWords.Append(AWord);
                    WordIndex:=slAllWords.Count-1;
                    slDocsIndex.Append('');
                end;
                slDocsIndex.Strings[WordIndex]:=slDocsIndex.Strings[WordIndex]+inttostr(DocIndex)+',';
            end;
        end;
        // Save to the database: overwrite existing rows in order, append once
        // the end of the table is reached. (An empty table reports Eof right
        // away, so no placeholder row is needed.)
        self.ADOTable1.TableName:='WordsTable';
        self.ADOTable1.Active:=true;
        self.ADOTable1.First;
        for WordIndex:=0 to slAllWords.Count-1 do
        begin
            self.StatusBar1.SimpleText:='正在保存到数据库.....'+inttostr(WordIndex+1)+'/'+inttostr(slAllWords.Count);
            application.ProcessMessages;
            if self.ADOTable1.Eof then
                self.ADOTable1.Append
            else
                self.ADOTable1.Edit;
            self.ADOTable1.FieldByName('Words').AsString:=slAllWords.Strings[WordIndex];
            self.ADOTable1.FieldByName('Docs').AsString:=slDocsIndex.Strings[WordIndex];
            self.ADOTable1.Post;
            self.ADOTable1.Next;
        end;
        // Rows left over from an earlier, larger index would otherwise be
        // loaded by the classifier as if they were current. Count them first
        // and then delete exactly that many from the end; this does not rely
        // on where Delete leaves the cursor.
        StaleCount:=0;
        while not self.ADOTable1.Eof do
        begin
            inc(StaleCount);
            self.ADOTable1.Next;
        end;
        for j:=1 to StaleCount do
        begin
            self.ADOTable1.Last;
            self.ADOTable1.Delete;
        end;

        self.ADOTable1.Active:=false;
        memo1.Text:=slAllWords.Text;
        memo2.Text:=slDocsIndex.Text;
    finally
        // Runs on success and on any exception (e.g. a missing document file).
        self.ADOTable1.Active:=false;
        slDoc.Free;
        slDocWords.Free;
        slAllWords.Free;
        slDocsIndex.Free;
    end;
end;

// Splits words on '/' and returns the distinct tokens in first-seen order,
// one per line (TStringList.Text format, i.e. every token is followed by a
// line break).
//
// - Only text terminated by '/' counts as a token; anything after the last
//   '/' is ignored.
// - Empty tokens (e.g. from '//') are kept, once.
// - Duplicates are detected with TStringList.IndexOf, so the comparison
//   follows the list's default case-sensitivity setting.
function DelRepeatWords(words:string):string;
var
    i,TokenStart:integer;
    Token:string;
    sl:tstringlist;
begin
    sl:=tstringlist.Create;
    try
        TokenStart:=1;// index of the first character of the current token
        for i:=1 to length(words) do
        begin
            if words[i]='/' then
            begin
                // Slice the token out in one step instead of growing a string
                // one character at a time.
                Token:=Copy(words,TokenStart,i-TokenStart);
                if sl.IndexOf(Token)=-1 then
                    sl.Append(Token);
                TokenStart:=i+1;
            end;
        end;
        result:=sl.Text;
    finally
        sl.Free;// released even if an exception is raised above
    end;
end;

// Runs one naive Bayes train/test experiment over the indexed documents.
//
// Steps:
//   1. Load the inverted index (word -> document list) from 'WordsTable'.
//   2. Randomly pick 60 of the 600 documents as the test set; the rest is the
//      training set.
//   3. Count training documents per class and, for every word, the number of
//      training documents of each class that contain it.
//   4. Classify every test document and report the predicted class, the
//      expected class (DocsClass) and the overall accuracy in memo1.
//
// The class score is  ln(N_c) + sum over words of ln(infinitesimal + df(w,c)/N_c),
// where N_c is the number of training documents of class c. Summing logarithms
// ranks the classes exactly like multiplying the probabilities, but cannot
// underflow to zero for long documents. The class prior is applied once.
// Classes without training documents are skipped.
procedure TForm1.btnNaiveClick(Sender: TObject);
const
    infinitesimal=0.0001;// smoothing term; avoids a zero probability for unseen word/class pairs
    DocCount=600;// documents are numbered 1..DocCount
    ClassCount=5;// classes are numbered 1..ClassCount
    TestCount=60;// size of the test set (10% of the documents)
var
    i,j,DocIndex,WordIndex,ClassIndex,BestClass:integer;
    slDocsIndex,slAllWords,slADoc:TStringList;
    s,strDocIndex,DataDir:string;
    TestDocs:array[0..TestCount] of integer;// document numbers of the test set; element 0 is unused
    TestDocsClass:array[0..TestCount] of integer;// predicted class per test document; element 0 counts correct predictions
    ClassDocCount:array[0..ClassCount] of integer;// training documents per class; element 0 holds the total
    TestDocClassProb:array[0..ClassCount] of double;// log score of the current test document per class; element 0 is unused
    WordDF:array of array of integer;// WordDF[word][class]: training documents of that class containing the word

    function IsInTestDocs(DocIndex:integer):boolean;
    // Returns true when DocIndex is a test document, false when it is a
    // training document.
    var i:integer;
    begin
        result:=false;
        for i:=1+low(TestDocs) to high(TestDocs) do
        begin
            if TestDocs[i]=DocIndex then
            begin
                result:=true;
                break;
            end;
        end;
    end;
begin
    // Pre-set to nil so the finally block can free them unconditionally.
    slDocsIndex:=nil;
    slAllWords:=nil;
    slADoc:=nil;
    try
        self.ADOTable1.TableName:='WordsTable';
        self.ADOTable1.Active:=true;
        if self.ADOTable1.IsEmpty then
        begin
            showmessage('数据库中没有文档集，请先倒排文档');
            exit;// the finally block below still closes the table
        end;
        slDocsIndex:=TStringlist.Create;
        slAllWords:=TStringlist.Create;
        memo1.Clear;
        memo2.Clear;
        // Load the inverted index; slAllWords[k] and slDocsIndex[k] are parallel.
        self.ADOTable1.First;
        while not self.ADOTable1.Eof do
        begin
            slAllWords.Append(ADOTable1.FieldByName('Words').AsString);
            slDocsIndex.Append(ADOTable1.FieldByName('Docs').AsString);
            self.StatusBar1.SimpleText:='正载入文档集......'+inttostr(slAllWords.Count)+'/'+inttostr(ADOTable1.RecordCount);
            application.ProcessMessages;
            self.ADOTable1.Next;
        end;
        self.ADOTable1.Active:=false;
        // Size the frequency table from the rows actually loaded rather than
        // from RecordCount, and zero it explicitly.
        setlength(WordDF,slAllWords.Count,ClassCount+1);
        for WordIndex:=0 to slAllWords.Count-1 do
            for ClassIndex:=0 to ClassCount do
                WordDF[WordIndex][ClassIndex]:=0;
        // Randomly choose 10% of the documents as the test set and record
        // their numbers in TestDocs.
        self.StatusBar1.SimpleText:='随机划分训练与测试集......';
        // IsInTestDocs scans the whole array while it is still being filled,
        // so clear it first (0 is not a valid document number).
        for i:=low(TestDocs) to high(TestDocs) do
            TestDocs[i]:=0;
        s:='';
        for i:=1+low(TestDocs) to high(TestDocs) do
        begin
            DocIndex:=1+random(DocCount);
            while IsInTestDocs(DocIndex) do
                DocIndex:=1+random(DocCount);
            TestDocs[i]:=DocIndex;
            s:=s+inttostr(DocIndex)+',';
        end;
        memo1.Lines.Append('测试集文档：'+s);
        // Count the training documents of each class.
        for ClassIndex:=low(ClassDocCount) to high(ClassDocCount) do
            ClassDocCount[ClassIndex]:=0;
        for DocIndex:=1 to DocCount do
        begin
            ClassIndex:=DocsClass[DocIndex];
            // Ignore documents whose class label is outside 1..ClassCount.
            if (ClassIndex>=1) and (ClassIndex<=ClassCount) and (not IsInTestDocs(DocIndex)) then
                inc(ClassDocCount[ClassIndex]);
        end;
        for ClassIndex:=1 to high(ClassDocCount) do
            ClassDocCount[0]:=ClassDocCount[0]+ClassDocCount[ClassIndex];
        s:='训练集中共有文档'+inttostr(ClassDocCount[0])+'篇，其中';
        for ClassIndex:=1+low(ClassDocCount) to high(ClassDocCount) do
            s:=s+'类'+inttostr(ClassIndex)+'为'+inttostr(ClassDocCount[ClassIndex])+'篇，';
        memo1.Lines.Append('');
        memo1.Lines.Append(s);
        // Compute the per-class document frequency of every word.
        for WordIndex:=0 to slAllWords.Count-1 do
        begin
            self.StatusBar1.SimpleText:='正在统计词的文档频率......'+inttostr(WordIndex);
            application.ProcessMessages;
            strDocIndex:=slDocsIndex.Strings[WordIndex];
            s:='';
            for i:=1 to length(strDocIndex) do
            begin
                if strDocIndex[i]=',' then
                begin
                    DocIndex:=strtointdef(s,0);
                    s:='';
                    // Skip malformed or out-of-range document numbers
                    // (strtointdef yields 0 for unparsable text).
                    if (DocIndex<1) or (DocIndex>DocCount) then
                        continue;
                    ClassIndex:=DocsClass[DocIndex];
                    if (ClassIndex<1) or (ClassIndex>ClassCount) then
                        continue;
                    if not IsInTestDocs(DocIndex) then
                        inc(WordDF[WordIndex][ClassIndex]);
                end
                else
                    s:=s+strDocIndex[i];
            end;
        end;
        memo1.Lines.Append('');
        memo1.Lines.Append('完成词文档频率统计');
        // Classify the test documents.
        memo1.Lines.Append('');
        memo1.Lines.Append('正在分类测试文档......');
        self.StatusBar1.SimpleText:='';
        // Same folder the index is built from, independent of the current directory.
        DataDir:=Extractfilepath(application.ExeName)+'DataSet';
        TestDocsClass[0]:=0;// element 0 accumulates the number of correct predictions
        slADoc:=TSTringList.Create;
        for i:=1+low(TestDocs) to high(TestDocs) do
        begin
            DocIndex:=TestDocs[i];
            slADoc.LoadFromFile(DataDir+'\'+inttostr(DocIndex)+'.txt');
            // After this assignment slADoc holds one distinct word per line.
            slADoc.Text:=DelRepeatWords(slADoc.Text);
            // Start every class score with the log of its prior (applied once).
            for ClassIndex:=1 to ClassCount do
            begin
                if ClassDocCount[ClassIndex]>0 then
                    TestDocClassProb[ClassIndex]:=Ln(ClassDocCount[ClassIndex])
                else
                    TestDocClassProb[ClassIndex]:=0;// class is skipped below
            end;
            // Add the log probability of every known word of the document.
            for j:=0 to slADoc.Count-1 do
            begin
                WordIndex:=slAllWords.IndexOf(slADoc.Strings[j]);
                if WordIndex=-1 then
                    continue;// word is not in the index
                for ClassIndex:=1 to ClassCount do
                begin
                    if ClassDocCount[ClassIndex]>0 then
                        TestDocClassProb[ClassIndex]:=TestDocClassProb[ClassIndex]+
                            Ln(infinitesimal+WordDF[WordIndex][ClassIndex]/ClassDocCount[ClassIndex]);
                end;
            end;
            // Pick the class with the highest score; on a tie the lowest
            // class number wins. BestClass stays 0 if no class has training
            // documents.
            BestClass:=0;
            for ClassIndex:=1 to ClassCount do
            begin
                if ClassDocCount[ClassIndex]=0 then
                    continue;
                if (BestClass=0) or (TestDocClassProb[BestClass]<TestDocClassProb[ClassIndex]) then
                    BestClass:=ClassIndex;
            end;
            // TestDocsClass[i] is the predicted class of the i-th test
            // document, whose document number is TestDocs[i]; its expected
            // class is DocsClass[TestDocs[i]].
            TestDocsClass[i]:=BestClass;
            if (BestClass<>0) and (BestClass=DocsClass[DocIndex]) then
                inc(TestDocsClass[0]);
            memo1.Lines.Append('文档'+inttostr(DocIndex)+'应属于类'+inttostr(DocsClass[DocIndex])+
                    '，被分到类'+inttostr(BestClass));
            application.ProcessMessages;
        end;
        memo1.Lines.Append('正确分类数量：'+inttostr(TestDocsClass[0])+
            '，分类正确率：'+floattostr(TestDocsClass[0]/(high(TestDocs)-low(TestDocs))));
    finally
        // Runs on success, on the early exit and on any exception.
        self.ADOTable1.Active:=false;
        slADoc.Free;
        slDocsIndex.Free;
        slAllWords.Free;
    end;
end;

end.
上一页 12
💿 文件大小 2797 K
👤 上传用户 yxm_325
📂 所属分类多国语言处理
🏷️ 相关标签

#Windows #delphi #600 #实验
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -