⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 ucalcweight.~pas

📁 用于中文分词的算法,包括正向分词和逆向分词
💻 ~PAS
字号:
unit UCalcWeight;
{
///////////////////////////////
 计算权重的单元
//////////////////////////////
}
interface
uses Classes,UDic,UDocuments,Math,SysUtils;

Type
  // Weight of a single word: pairs a keyword with its computed weight.
  PWordWeight = ^TWordWeight;
  TWordWeight = record
    // The keyword itself (TWordType is declared outside this unit).
    Word : TWordType;
    // The weight assigned to Word.
    Weight : real;
  end;

  // Weights of all keywords that belong to one document.
  //PDocWeight = ^TDocWeight;
  TDocWeight = class
  private
    // List of PWordWeight records; entries are allocated in AddWeight and
    // released in Clear.
    FItems : TList;
    // Identifier of the document these weights belong to.
    FID : integer;
  private
    // Read / write the word weights and the document id from / to a stream.
    procedure LoadFromStream(Stream : TStream);
    procedure SaveToStream(Stream : TStream);
    // Returns the list index of Word, or -1 when the word is not stored.
    Function IndexOf(Word:string):integer;
  public
    // Disposes every stored word weight and empties the list.
    procedure Clear();
    // Returns the weight of Word, or -1 when the word is not stored.
    Function Weight(Word : string):real;overload;
    // Returns a pointer to the word/weight record stored at position Index.
    Function WordWeight_Index(Index : integer):PWordWeight;
    // Sets the weight of Word; the word is added when it does not exist yet.
    procedure AddWeight(Word : string; Weight : real);
    // Number of words stored.
    Function Count():integer;
  public
    // Document id (read-only).
    property DocID : Integer read FID;
  public
    constructor Create(DocID : integer);
    Destructor Destroy();override;
  end;

  // Weights of all documents: a collection of TDocWeight objects.
  TDocWeights = Class
  private
    // List of TDocWeight instances; Clear frees them.
    FItems : TList;
  private
    // Read / write all document weights from / to a stream.
    procedure LoadFromStream(Stream : TStream);
    procedure SaveToStream(Stream : TStream);
    // Returns the list index of the document with DocID, or -1 if absent.
    Function  IndexOf(DocID : integer):Integer;
  public
    // Removes all document weights.
    procedure Clear();
    // Returns the weight object of the document with DocID, or nil if absent.
    Function Weight_ID(DocID : integer):TDocWeight; 
    // Returns the weight object stored at position Index.
    Function Weight_Index(Index : integer):TDocWeight;
    // Returns the weight of Word in document DocID, or -1 when the document
    // or the word is not stored.
    Function Weight_IDWord(DocID : integer;Word : string):Real;
    // Sets the weight of Word in document DocID, creating the document entry
    // when it does not exist yet.
    procedure AddWeight(DocID : integer; Word : string ;weight : real); 
    // Number of documents stored.
    Function Count():integer;
  public
    // Saves all weights to the file FileName.
    Function Save(FileName : string):Boolean;
    // Loads all weights from the file FileName.
    Function Load(FileName : string):Boolean;
  public
    Constructor Create();
    Destructor Destroy();override;
  end;

  // Weight calculator: combines a keyword list (VSMList) and a document set
  // (Docs) and writes the resulting weights into DocWeights.
  TCalcWeight = class
  private
     // Destination for the computed weights.
     FDocWeights : TDocWeights;
     // Keyword list supplying the per-keyword parameters used by Calc.
     FVSMList : TVSMList;
     // Document set that Calc iterates over.
     FDocs : TAbstractDocs;
  public
    // Runs the weight calculation; see the implementation for the formula.
    Function Calc():Boolean;
  public
    Property DocWeights:TDocWeights Read FDocWeights Write FDocWeights;
    property VSMList : TVSMList Read FVSMList Write FVSMList;
    property Docs : TAbstractDocs read FDocs Write FDocs;
  public
    // Parameterless constructor; the three properties must be assigned
    // afterwards.
    constructor Create();overload;
    // Constructor that stores the three collaborators directly.
    constructor Create(docw:TDocWeights;vsmList : TVSMList;docs : TAbstractDocs);overload;
  end;
implementation


{ TCalcWeight }

// Computes a normalised tf * log2(N/df + 0.01) weight for every keyword of
// VSMList in every document of Docs that contains it, and stores each result
// in DocWeights.
//
// Returns True when the calculation ran. Returns False when there are no
// documents, or when DocWeights, VSMList or Docs has not been assigned (for
// example after the parameterless constructor was used without setting the
// properties) - previously that case dereferenced nil.
function TCalcWeight.Calc: Boolean;
var
  Index : integer;
  i : integer;
  tf : integer;
  df : integer;
  Weight : real;
  DicParams : TList;
  Params : TParams;
  UnitaryDenominator : array of real; // normalisation factor per keyword
  ParamsCount : integer;
  FDocCount : integer;
  DocLen : integer;
  IdfSquared : Extended;
begin
  Result := False;
  // Nothing can be computed without all three collaborators.
  if (not Assigned(FDocs)) or (not Assigned(FVSMList)) or (not Assigned(FDocWeights)) then
    Exit;

  FDocCount := FDocs.GetDocCount();
  if FDocCount > 0 then
  begin
    DicParams := TList.Create();
    try
      // Pass 1: collect the parameters of every keyword and compute its
      // normalisation factor sqrt(sum(tf_i^2 * idf^2)).
      SetLength(UnitaryDenominator,FVSMList.Count);
      for Index := 0 to FVSMList.Count -1 do
      begin
        Params := FVSMList.VSM(Index).CalcParam();
        DicParams.Add(Params);
        ParamsCount := Params.df;
        UnitaryDenominator[Index] := 0.0;
        if ParamsCount > 0 then
        begin
          // The idf term does not depend on i, so it is evaluated once per
          // keyword. The guard keeps the division out of the df = 0 case,
          // exactly as the original loop (which then never executed) did.
          IdfSquared := Power(Log2(FDocCount/ParamsCount+0.01),2);
          for i := 0 to ParamsCount-1 do
          begin
            UnitaryDenominator[Index] := UnitaryDenominator[Index] + (Power(Params.tf_Index(i),2))*IdfSquared;
          end;
        end;
        UnitaryDenominator[Index] := Power(UnitaryDenominator[Index],0.5);
      end;

      // Pass 2: walk the document list and compute the weights.
      FDocs.FirstDoc;
      while not FDocs.EndDoc do
      begin
        // Document length. Only the alternative formulas below use it.
        // NOTE(review): the getText call is kept because it may have side
        // effects on the document cursor - confirm before removing.
        DocLen := Length(FDocs.getText());
        for Index := 0 to FVSMList.Count -1 do
        begin
          Params := TParams(DicParams[Index]);
          tf := Params.tf_ID(FDocs.GetID);
          if tf <> -1 then
          begin
            // A weight is only computed when the document contains the keyword.
            df := Params.df;
            // Weight according to the formula given in the documentation.
            Weight := ((tf * Log2(FDocCount/df+0.01))/UnitaryDenominator[Index]);

            // Dividing additionally by the document length would arguably
            // give a more reasonable weight:
            //weight := ((tf * Log2(FDocCount/df+0.01))/UnitaryDenominator[Index])/DocLen;
            //weight := ((tf * Log2(FDocCount/df+0.01))/DocLen);
            FDocWeights.AddWeight(FDocs.GetID,FVSMList.GetWords(Index),Weight);
          end;
        end;
        FDocs.NextDoc;
      end;
      Result := True;
    finally
      // TList.Free clears the list itself; the TParams objects are not freed
      // here, matching the original behaviour.
      DicParams.Free;
    end;
  end;
end;

// Creates a calculator wired to its three collaborators: the destination
// weight collection, the keyword list and the document set.
constructor TCalcWeight.Create(docw: TDocWeights; vsmList: TVSMList;
  docs: TAbstractDocs);
begin
  inherited Create;
  // The objects are only referenced, not copied.
  FDocs := docs;
  FVSMList := vsmList;
  FDocWeights := docw;
end;

// Creates a calculator with no collaborators assigned; DocWeights, VSMList
// and Docs have to be set through their properties afterwards.
constructor TCalcWeight.Create;
begin
  inherited Create;
end;

{ TDocWeight }

// Releases every word/weight record held by this document and empties the
// list.
procedure TDocWeight.Clear;
var
  Slot : integer;
begin
  // Entries are allocated with New in AddWeight, so they are released with
  // Dispose through the typed pointer.
  for Slot := FItems.Count - 1 downto 0 do
    Dispose(PWordWeight(FItems[Slot]));
  FItems.Clear;
end;

// Number of word/weight entries stored for this document.
function TDocWeight.Count: integer;
begin
  Result := FItems.Count;
end;

// Creates an empty weight container for the document identified by DocID.
constructor TDocWeight.Create(DocID: integer);
begin
  inherited Create;
  FID := DocID;
  FItems := TList.Create;
end;

// Disposes all stored entries, then the list that held them.
destructor TDocWeight.Destroy;
begin
  Clear;
  FItems.Free;
  inherited Destroy;
end;

// Linear search for Word. Returns the position of the first matching entry,
// or -1 when the word is not stored.
function TDocWeight.IndexOf(Word: string): integer;
var
  Slot : integer;
begin
  for Slot := 0 to FItems.Count - 1 do
    if PWordWeight(FItems[Slot])^.Word = Word then
    begin
      Result := Slot;
      Exit;
    end;
  Result := -1;
end;

// Replaces the current content with data read from Stream.
// Layout: entry count (Integer), that many raw TWordWeight records, then the
// document id (Integer) - the mirror image of SaveToStream.
//
// Uses ReadBuffer so that a truncated or short stream raises EReadError
// instead of leaving the count, the records or the id undefined.
// NOTE(review): reading TWordWeight as raw bytes is only valid when TWordType
// is a non-managed type (e.g. a short string) - confirm against UDic.
procedure TDocWeight.LoadFromStream(Stream: TStream);
var
  iCount : integer;
  WordWeight : TWordWeight;
  i : integer;
begin
  Clear();
  Stream.ReadBuffer(iCount,SizeOf(Integer));
  for i := 0 to iCount-1 do
  begin
    Stream.ReadBuffer(WordWeight,SizeOf(TWordWeight));
    AddWeight(WordWeight.Word,WordWeight.Weight);
  end;
  // The document id is stored after the entries.
  Stream.ReadBuffer(FID,SizeOf(Integer));
end;

// Writes this document's weights to Stream.
// Layout: entry count (Integer), every TWordWeight record as raw bytes, then
// the document id (Integer).
//
// Uses WriteBuffer so that an incomplete write raises EWriteError instead of
// silently producing a truncated file.
procedure TDocWeight.SaveToStream(Stream: TStream);
var
  iCount : integer;
  P : PWordWeight;
  i : integer;
begin
  iCount := FItems.Count;
  Stream.WriteBuffer(iCount,SizeOf(Integer));
  for i := 0 to iCount-1 do
  begin
    P := PWordWeight(FItems[i]);
    Stream.WriteBuffer(P^,SizeOf(TWordWeight));
  end;
  // The document id is stored after the entries.
  Stream.WriteBuffer(FID,SizeOf(Integer));
end;

// Stores Weight for Word: overwrites the weight of an existing entry, or
// allocates and appends a new entry when the word is not stored yet.
procedure TDocWeight.AddWeight(Word: string; Weight: real);
var
  Slot : Integer;
  Entry : PWordWeight;
begin
  Slot := IndexOf(Word);
  if Slot < 0 then
  begin
    // Unknown word: create a new record (released later by Clear).
    New(Entry);
    Entry^.Word := Word;
    Entry^.Weight := Weight;
    FItems.Add(Entry);
  end
  else
    PWordWeight(FItems[Slot])^.Weight := Weight;
end;

// Returns the weight stored for Word, or -1 when the word is not stored.
function TDocWeight.Weight(Word: string): real;
var
  Slot : integer;
begin
  Slot := IndexOf(Word);
  if Slot < 0 then
    Result := -1
  else
    Result := PWordWeight(FItems[Slot])^.Weight;
end;

// Returns the pointer to the word/weight record stored at position Index.
// Index is passed straight to the list, which performs the bounds check.
function TDocWeight.WordWeight_Index(Index: integer): PWordWeight;
begin
  // An untyped list item is assignment-compatible with the typed pointer.
  Result := FItems[Index];
end;

{ TDocWeights }

// Frees every TDocWeight held by the collection and empties the list.
procedure TDocWeights.Clear;
var
  I : integer;
begin
  for I := 0 to FItems.Count - 1 do
  begin
    TDocWeight(FItems[I]).Free;
  end;
  // Empty the list itself. The original called Clear() here, which recursed
  // into this procedure without end and freed the same objects again.
  FItems.Clear;
end;

// Number of documents that have weights stored.
function TDocWeights.Count: integer;
begin
  Result := FItems.Count;
end;

// Creates an empty collection of document weights.
constructor TDocWeights.Create;
begin
  inherited Create;
  FItems := TList.Create;
end;

// Releases all stored document weights, then the list that held them.
destructor TDocWeights.Destroy;
begin
  Clear;
  FItems.Free;
  inherited Destroy;
end;

// Linear search for the document with the given id. Returns the position of
// the first match, or -1 when no such document is stored.
function TDocWeights.IndexOf(DocID: integer): Integer;
var
  Slot : integer;
begin
  for Slot := 0 to FItems.Count - 1 do
    if TDocWeight(FItems[Slot]).DocID = DocID then
    begin
      Result := Slot;
      Exit;
    end;
  Result := -1;
end;

// Opens FileName for reading and loads the document weights from it.
// Returns True once loading has finished; failures to open or read the file
// are not caught here and propagate as exceptions.
function TDocWeights.Load(FileName: string): Boolean;
var
  Source : TStream;
begin
  Result := False;
  Source := TFileStream.Create(FileName, fmOpenRead);
  try
    LoadFromStream(Source);
    Result := True;
  finally
    // The file is closed whether or not loading succeeded.
    Source.Free;
  end;
end;

// Reads a document count (Integer) followed by that many TDocWeight blocks
// from Stream and appends them to the collection. Existing entries are kept,
// as before.
//
// Uses ReadBuffer so that a truncated stream raises EReadError instead of
// leaving the count undefined, and frees a partially loaded TDocWeight when
// loading it raises.
procedure TDocWeights.LoadFromStream(Stream: TStream);
var
  icount: integer;
  i : integer;
  DocWeight : TDocWeight;
begin
  Stream.ReadBuffer(icount,SizeOf(Integer));
  for i := 0 to icount-1 do
  begin
    // The real document id is read by TDocWeight.LoadFromStream; -1 is only
    // a placeholder.
    DocWeight := TDocWeight.Create(-1);
    try
      DocWeight.LoadFromStream(Stream);
      FItems.Add(DocWeight);
    except
      // Not yet owned by the list: release it before re-raising.
      DocWeight.Free;
      raise;
    end;
  end;
end;

// Creates (or overwrites) FileName and writes all document weights to it.
// Returns True once saving has finished; failures to create or write the
// file are not caught here and propagate as exceptions.
function TDocWeights.Save(FileName: string): Boolean;
var
  Target : TStream;
begin
  Result := False;
  Target := TFileStream.Create(FileName, fmCreate);
  try
    SaveToStream(Target);
    Result := True;
  finally
    // The file is closed whether or not saving succeeded.
    Target.Free;
  end;
end;

// Writes the document count (Integer) followed by every TDocWeight block to
// Stream.
//
// Uses WriteBuffer so that an incomplete write raises EWriteError instead of
// silently producing a truncated file.
procedure TDocWeights.SaveToStream(Stream: TStream);
var
  icount : integer;
  i : integer;
begin
  icount := FItems.Count;
  Stream.WriteBuffer(icount,SizeOf(icount));
  for i := 0 to icount-1 do
  begin
    TDocWeight(FItems[i]).SaveToStream(Stream);
  end;
end;

// Sets the weight of Word in document DocID. A TDocWeight entry is created
// and appended to the collection when the document is not stored yet.
procedure TDocWeights.AddWeight(DocID: integer; Word: string; weight: real);
var
  Target : TDocWeight;
  IsNew : Boolean;
begin
  Target := Weight_ID(DocID);
  IsNew := Target = nil;
  if IsNew then
    Target := TDocWeight.Create(DocID);
  Target.AddWeight(Word, weight);
  // A freshly created entry joins the list only after it received its weight.
  if IsNew then
    FItems.Add(Target);
end;

// Returns the weight of Word in document DocID. The result is -1 when the
// document is not stored; otherwise it is whatever TDocWeight.Weight returns
// for the word.
function TDocWeights.Weight_IDWord(DocID: integer; Word: string): Real;
var
  Owner : TDocWeight;
begin
  Owner := Weight_ID(DocID);
  if Owner = nil then
    Result := -1
  else
    Result := Owner.Weight(Word);
end;

// Returns the weight object of the document with the given id, or nil when
// no such document is stored.
function TDocWeights.Weight_ID(DocID: integer): TDocWeight;
var
  Slot : integer;
begin
  Result := nil;
  Slot := IndexOf(DocID);
  if Slot >= 0 then
    Result := TDocWeight(FItems[Slot]);
end;

// Returns the weight object stored at position Index.
// Index is passed straight to the list, which performs the bounds check.
function TDocWeights.Weight_Index(Index: integer): TDocWeight;
var
  Item : Pointer;
begin
  Item := FItems[Index];
  Result := TDocWeight(Item);
end;

end.
 

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -