📄 ucalcweight.~pas
字号:
unit UCalcWeight;
{
///////////////////////////////
计算权重的单元
//////////////////////////////
}
interface
uses Classes,UDic,UDocuments,Math,SysUtils;
Type
// Weight of a single word.
// PWordWeight is the pointer form stored in TDocWeight's internal TList.
PWordWeight = ^TWordWeight;
TWordWeight = record
Word : TWordType; // the word; TWordType is declared outside this unit
Weight : real; // weight assigned to the word
end;
// Weights of all keywords of one document.
//PDocWeight = ^TDocWeight;
TDocWeight = class
private
FItems : TList; // holds PWordWeight pointers allocated with New
FID : integer; // document ID
private
// Read the object from / write the object to a stream
procedure LoadFromStream(Stream : TStream);
procedure SaveToStream(Stream : TStream);
// Index of a word in the list, -1 when the word is not present
Function IndexOf(Word:string):integer;
public
// Dispose every stored record and empty the list
procedure Clear();
// Weight of a word, -1 when the word is not present
Function Weight(Word : string):real;overload;
// Pointer to the word/weight record at the given list index
Function WordWeight_Index(Index : integer):PWordWeight;
// Set the weight of a word; the word is added when it does not exist yet
procedure AddWeight(Word : string; Weight : real);
// Number of words stored
Function Count():integer;
public
// Document ID (read-only)
property DocID : Integer read FID;
public
constructor Create(DocID : integer);
Destructor Destroy();override;
end;
// Weights of all documents.
TDocWeights = Class
private
FItems : TList; // holds TDocWeight instances
private
// Read the collection from / write the collection to a stream
procedure LoadFromStream(Stream : TStream);
procedure SaveToStream(Stream : TStream);
// Index of the entry with the given document ID, -1 when there is none
Function IndexOf(DocID : integer):Integer;
public
// Remove all entries
procedure Clear();
// Weight object of the document with the given ID, nil when there is none
Function Weight_ID(DocID : integer):TDocWeight;
// Weight object at the given list index
Function Weight_Index(Index : integer):TDocWeight;
// Weight of a word inside a document, -1 when the document is unknown
Function Weight_IDWord(DocID : integer;Word : string):Real;
// Set the weight of a word inside a document; the document entry is
// created when it does not exist yet
procedure AddWeight(DocID : integer; Word : string ;weight : real);
// Number of documents stored
Function Count():integer;
public
// Save the whole collection to a file
Function Save(FileName : string):Boolean;
// Load all weights from a file
Function Load(FileName : string):Boolean;
public
Constructor Create();
Destructor Destroy();override;
end;
// Weight calculator: combines a keyword list (TVSMList) and a document
// source (TAbstractDocs) and writes the results into a TDocWeights.
TCalcWeight = class
private
FDocWeights : TDocWeights; // receives the computed weights
FVSMList : TVSMList; // keyword list; type declared outside this unit
FDocs : TAbstractDocs; // document source; type declared outside this unit
public
// Compute the weights; returns False when the document count is not positive
Function Calc():Boolean;
public
Property DocWeights:TDocWeights Read FDocWeights Write FDocWeights;
property VSMList : TVSMList Read FVSMList Write FVSMList;
property Docs : TAbstractDocs read FDocs Write FDocs;
public
// The parameterless constructor leaves all three fields unassigned;
// set them through the properties above before calling Calc.
constructor Create();overload;
constructor Create(docw:TDocWeights;vsmList : TVSMList;docs : TAbstractDocs);overload;
end;
implementation
{ TCalcWeight }
{ Computes a normalised tf*idf style weight for every (document, keyword)
  pair and stores it in FDocWeights.

  For keyword k with document frequency df and N documents in total:
    idf(k)   = Log2(N / df + 0.01)
    norm(k)  = Sqrt( Sum over the df postings of (tf_i * idf(k))^2 )
    weight   = tf * idf(k) / norm(k)

  Returns True when the calculation ran, False when there are no documents
  or when the document source, keyword list or result container has not
  been assigned (the parameterless constructor leaves them nil). }
function TCalcWeight.Calc: Boolean;
var
  Index : integer;
  i : integer;
  tf : integer;
  df : integer;
  Weight : real;
  IdfSquared : real;
  DicParams : TList;
  Params : TParams;
  UnitaryDenominator : array of real; // normalisation factor per keyword
  FDocCount : integer;
begin
  Result := False;
  // Guard against use before the collaborators were assigned; without this
  // the first method call below dereferences nil.
  if (FDocs = nil) or (FVSMList = nil) or (FDocWeights = nil) then
    Exit;
  FDocCount := FDocs.GetDocCount();
  if FDocCount <= 0 then
    Exit;

  DicParams := TList.Create();
  try
    // Pass 1: collect the per-keyword parameters and normalisation factors.
    SetLength(UnitaryDenominator, FVSMList.Count);
    for Index := 0 to FVSMList.Count - 1 do
    begin
      Params := FVSMList.VSM(Index).CalcParam();
      // NOTE(review): the objects returned by CalcParam are not freed in
      // this method - confirm that their owner releases them.
      DicParams.Add(Params);
      df := Params.df;
      UnitaryDenominator[Index] := 0.0;
      if df > 0 then
      begin
        // The idf term does not depend on the posting, so compute it once.
        IdfSquared := Power(Log2(FDocCount / df + 0.01), 2);
        for i := 0 to df - 1 do
          UnitaryDenominator[Index] := UnitaryDenominator[Index]
            + Power(Params.tf_Index(i), 2) * IdfSquared;
      end;
      UnitaryDenominator[Index] := Power(UnitaryDenominator[Index], 0.5);
    end;

    // Pass 2: walk the document list and weight every keyword it contains.
    FDocs.FirstDoc;
    while not FDocs.EndDoc do
    begin
      for Index := 0 to FVSMList.Count - 1 do
      begin
        Params := TParams(DicParams[Index]);
        tf := Params.tf_ID(FDocs.GetID);
        // tf = -1 means the keyword is reported as absent from this document.
        if tf <> -1 then
        begin
          df := Params.df;
          if UnitaryDenominator[Index] > 0 then
            Weight := (tf * Log2(FDocCount / df + 0.01)) / UnitaryDenominator[Index]
          else
            // A zero normalisation factor would otherwise raise a
            // floating point error (0/0); treat it as "no weight".
            Weight := 0.0;
          // Dividing additionally by the document length
          // (Length(FDocs.getText())) would arguably give a better weight:
          //   Weight := Weight / DocLen;
          FDocWeights.AddWeight(FDocs.GetID, FVSMList.GetWords(Index), Weight);
        end;
      end;
      FDocs.NextDoc;
    end;
    Result := True;
  finally
    DicParams.Free;
  end;
end;
{ Creates a calculator that is ready to run: stores the result container,
  the keyword list and the document source. None of them is owned by the
  calculator (this class declares no destructor). }
constructor TCalcWeight.Create(docw: TDocWeights; vsmList: TVSMList;
  docs: TAbstractDocs);
begin
  inherited Create;
  Self.FDocs := docs;
  Self.FVSMList := vsmList;
  Self.FDocWeights := docw;
end;
{ Creates an unconfigured calculator; DocWeights, VSMList and Docs must be
  assigned through the properties afterwards. }
constructor TCalcWeight.Create;
begin
  inherited Create;
end;
{ TDocWeight }
{ Releases every word/weight record held in the list and empties the list. }
procedure TDocWeight.Clear;
var
  Slot : integer;
  Entry : PWordWeight;
begin
  Slot := 0;
  while Slot < FItems.Count do
  begin
    Entry := PWordWeight(FItems[Slot]);
    Dispose(Entry);
    Inc(Slot);
  end;
  FItems.Clear;
end;
{ Number of word/weight records currently stored for this document. }
function TDocWeight.Count: integer;
begin
  Result := Self.FItems.Count;
end;
{ Creates an empty weight list for the document identified by DocID. }
constructor TDocWeight.Create(DocID: integer);
begin
  inherited Create;
  FID := DocID;
  FItems := TList.Create;
end;
{ Disposes all stored records, then the list itself. }
destructor TDocWeight.Destroy;
begin
  Clear;
  FreeAndNil(FItems);
  inherited Destroy;
end;
{ Linear search for Word; returns the list index of the first record whose
  Word equals it, or -1 when no record matches. }
function TDocWeight.IndexOf(Word: string): integer;
var
  Slot : integer;
begin
  for Slot := 0 to FItems.Count - 1 do
    if PWordWeight(FItems[Slot])^.Word = Word then
    begin
      Result := Slot;
      Exit;
    end;
  Result := -1;
end;
{ Replaces the current content with data read from Stream.
  Layout (as written by SaveToStream): record count (Integer), that many
  raw TWordWeight records, then the document ID (Integer).

  ReadBuffer is used instead of Read so that a truncated or empty stream
  raises EReadError; with plain Read a short read went unnoticed and the
  loop ran on an uninitialised count / record.

  NOTE(review): records are read as raw bytes, which is only valid if
  TWordType is a fixed-size, non-managed type - confirm in its unit. }
procedure TDocWeight.LoadFromStream(Stream: TStream);
var
  iCount : integer;
  WordWeight : TWordWeight;
  i : integer;
begin
  Clear();
  Stream.ReadBuffer(iCount, SizeOf(Integer));
  for i := 0 to iCount - 1 do
  begin
    Stream.ReadBuffer(WordWeight, SizeOf(TWordWeight));
    AddWeight(WordWeight.Word, WordWeight.Weight);
  end;
  // The document ID is stored after the records.
  Stream.ReadBuffer(FID, SizeOf(Integer));
end;
{ Writes this document's weights to Stream: record count (Integer), the raw
  TWordWeight records, then the document ID (Integer).

  WriteBuffer is used instead of Write so that an incomplete write (for
  example a full disk) raises EWriteError instead of silently producing a
  corrupt file.

  NOTE(review): records are written as raw bytes, which is only valid if
  TWordType is a fixed-size, non-managed type - confirm in its unit. }
procedure TDocWeight.SaveToStream(Stream: TStream);
var
  iCount : integer;
  P : PWordWeight;
  i : integer;
begin
  iCount := FItems.Count;
  Stream.WriteBuffer(iCount, SizeOf(Integer));
  for i := 0 to iCount - 1 do
  begin
    P := PWordWeight(FItems[i]);
    Stream.WriteBuffer(P^, SizeOf(TWordWeight));
  end;
  Stream.WriteBuffer(FID, SizeOf(Integer));
end;
{ Sets the weight of Word. An existing record is updated in place; when the
  word is not present a new record is allocated and appended. }
procedure TDocWeight.AddWeight(Word: string; Weight: real);
var
  Entry : PWordWeight;
  Slot : Integer;
begin
  Slot := IndexOf(Word);
  if Slot < 0 then
  begin
    // Unknown word: allocate a fresh record and append it.
    New(Entry);
    Entry^.Word := Word;
    Entry^.Weight := Weight;
    FItems.Add(Entry);
  end
  else
    WordWeight_Index(Slot)^.Weight := Weight;
end;
{ Returns the weight stored for Word, or -1 when the word is not present. }
function TDocWeight.Weight(Word: string): real;
var
  Slot : integer;
begin
  Slot := IndexOf(Word);
  if Slot < 0 then
    Result := -1
  else
    Result := PWordWeight(FItems[Slot])^.Weight;
end;
{ Returns the record pointer stored at list position Index. The index is
  passed straight to the underlying TList. }
function TDocWeight.WordWeight_Index(Index: integer): PWordWeight;
begin
  Result := PWordWeight(FItems.Items[Index]);
end;
{ TDocWeights }
{ Frees every TDocWeight held in the collection and empties the list.

  The list itself must be cleared with FItems.Clear; calling Clear() here
  would re-enter this method, freeing the same objects again and recursing
  until the stack overflows. }
procedure TDocWeights.Clear;
var
  I : integer;
begin
  for I := 0 to FItems.Count - 1 do
    TDocWeight(FItems[I]).Free;
  FItems.Clear;
end;
{ Number of document entries currently stored. }
function TDocWeights.Count: integer;
begin
  Result := Self.FItems.Count;
end;
{ Creates an empty collection of document weights. }
constructor TDocWeights.Create;
begin
  inherited Create;
  FItems := TList.Create;
end;
{ Releases all entries through Clear, then the list itself. }
destructor TDocWeights.Destroy;
begin
  Clear;
  FreeAndNil(FItems);
  inherited Destroy;
end;
{ Linear search for the entry whose DocID equals the argument; returns the
  list index of the first match, or -1 when there is none. }
function TDocWeights.IndexOf(DocID: integer): Integer;
var
  Slot : integer;
begin
  for Slot := 0 to FItems.Count - 1 do
    if TDocWeight(FItems[Slot]).DocID = DocID then
    begin
      Result := Slot;
      Exit;
    end;
  Result := -1;
end;
{ Loads the collection from the file FileName via LoadFromStream.
  Returns True on success; failures surface as exceptions (for example when
  the file cannot be opened), so False is never actually returned. }
function TDocWeights.Load(FileName: string): Boolean;
var
  Source : TStream;
begin
  Result := False;
  Source := TFileStream.Create(FileName, fmOpenRead);
  try
    Self.LoadFromStream(Source);
    Result := True;
  finally
    // Always close the file, also when loading raised.
    Source.Free;
  end;
end;
{ Reads document entries from Stream and appends them to the collection.
  Layout: entry count (Integer) followed by that many TDocWeight blocks.

  ReadBuffer makes a truncated or empty stream raise EReadError instead of
  looping on an uninitialised count. An entry that fails to load is freed
  before the exception propagates, so it does not leak.

  NOTE(review): existing entries are kept (nothing is cleared first), so
  loading twice into the same object appends duplicates - confirm whether
  callers rely on that. }
procedure TDocWeights.LoadFromStream(Stream: TStream);
var
  icount : integer;
  i : integer;
  DocWeight : TDocWeight;
begin
  Stream.ReadBuffer(icount, SizeOf(Integer));
  for i := 0 to icount - 1 do
  begin
    // The ID passed here is a placeholder; the entry reads its own ID
    // from the stream.
    DocWeight := TDocWeight.Create(-1);
    try
      DocWeight.LoadFromStream(Stream);
      FItems.Add(DocWeight);
    except
      DocWeight.Free;
      raise;
    end;
  end;
end;
{ Writes the collection to the file FileName (created or overwritten) via
  SaveToStream. Returns True on success; failures surface as exceptions, so
  False is never actually returned. }
function TDocWeights.Save(FileName: string): Boolean;
var
  Target : TStream;
begin
  Result := False;
  Target := TFileStream.Create(FileName, fmCreate);
  try
    Self.SaveToStream(Target);
    Result := True;
  finally
    // Always close the file, also when writing raised.
    Target.Free;
  end;
end;
{ Writes the collection to Stream: entry count (Integer) followed by each
  TDocWeight in list order.

  WriteBuffer is used for the count so that an incomplete write raises
  EWriteError instead of silently producing a corrupt file. }
procedure TDocWeights.SaveToStream(Stream: TStream);
var
  icount : integer;
  i : integer;
begin
  icount := FItems.Count;
  Stream.WriteBuffer(icount, SizeOf(icount));
  for i := 0 to icount - 1 do
    TDocWeight(FItems[i]).SaveToStream(Stream);
end;
{ Sets the weight of Word inside document DocID. When the document has no
  entry yet, one is created, given the weight and appended to the list. }
procedure TDocWeights.AddWeight(DocID: integer; Word: string; weight: real);
var
  Target : TDocWeight;
begin
  Target := Weight_ID(DocID);
  if Target = nil then
  begin
    // First weight for this document: build its entry, then register it.
    Target := TDocWeight.Create(DocID);
    Target.AddWeight(Word, weight);
    FItems.Add(Target);
    Exit;
  end;
  Target.AddWeight(Word, weight);
end;
{ Returns the weight of Word inside document DocID. The result is -1 when
  the document has no entry; otherwise it is whatever TDocWeight.Weight
  returns for the word. }
function TDocWeights.Weight_IDWord(DocID: integer; Word: string): Real;
var
  Entry : TDocWeight;
begin
  Entry := Weight_ID(DocID);
  if Entry = nil then
    Result := -1
  else
    Result := Entry.Weight(Word);
end;
{ Returns the weight object of document DocID, or nil when the collection
  holds no entry with that ID. }
function TDocWeights.Weight_ID(DocID: integer): TDocWeight;
var
  Slot : integer;
begin
  Result := nil;
  Slot := IndexOf(DocID);
  if Slot >= 0 then
    Result := TDocWeight(FItems[Slot]);
end;
{ Returns the weight object stored at list position Index. The index is
  passed straight to the underlying TList. }
function TDocWeights.Weight_Index(Index: integer): TDocWeight;
begin
  Result := TDocWeight(FItems.Items[Index]);
end;
end.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -