⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 udic.pas

📁 用于中文分词的算法。包括逆向分词和反向分词
💻 PAS
📖 第 1 页 / 共 2 页
字号:
unit UDic;

{
////////////////////////////////

  字典类----用于保存关键词各种参数,以及向量

////////////////////////////////
}

interface
uses SysUtils,Classes,ADODB;
const
  //最大的词长度
  MAX_WORD_LENGTH = 10;
type
  //词类型
  TWordType = string[MAX_WORD_LENGTH];
   //向量的记录
  PVector = ^TVector;
  TVector = record
    FID : integer;
    FPos : integer;
  end;
  //权重的参数
  PParam = ^TParam;
  TParam = Record
    FID : integer;
    tf : integer;
  end;
  //计算权重的参数
  TParams = class
  private
    FItems : TList;
  private
    procedure Clear();
    Function IndexOf(FID : integer):integer;
  public
    //根据文档号去除词频
    Function tf_ID(DocID : integer):Integer;
    //根据索引号取词频
    Function tf_Index(Index : integer):integer;
    //添加词频
    Procedure Addtf(FID : integer;tf : integer);
    //取得文档的频率
    Function df():Integer;
    //取得参数的数量
    Function Count():integer;
    //根据索引取出文档号
    Function DocID(Index : integer):Integer;
  public
    Constructor Create();
    Destructor Destroy();override;
  end;
  //向量空间
  TVSM = Class
  private
     FVectors : TList;
     FParams : TParams;
  private

     procedure SaveToStream(stream : TStream);
     procedure LoadFromStream(stream : TStream);
     //返回词在某个文档中的频数
     Function tf(id : integer):integer;
     //返回包含这个词的文档数
     Function df():integer;
     //通过文档号和位置求索引
     Function IndexOf(ID,Pos : integer):integer;
  public
     procedure Clear();
     //procedure AddVector(vector:TVector);overload;
     procedure AddVector(id,pos : integer);overload;
     Function Vector(Index : integer):PVector;
     Function Count():integer;
     //添加向量,文档号没有重复
     procedure AddVSM(ParamVSM : TVSM);
     //计算返回参数
     Function CalcParam():TParams;
     //科隆
     procedure CloneTo(PVSM : TVSM);
     //文档号唯一化
     procedure SigleDocID();
     //返回最多的那个文档号
     Function MaxDocID():integer;
  public
     Constructor Create();overload;
     Constructor Create(ID : integer; Pos : integer);overload;
     Destructor Destroy(); override;
  end;
  //向量空间列表
  TVSMList = Class
  private
    FKeyWords : TStrings;
  private

     procedure SaveToStream(stream : TStream);
     procedure LoadFromStream(stream : TStream);
  public
    procedure Clear();
    //添加一个向量
    procedure Add(Word : string;ID,Pos : integer);overload;
    procedure Add(Word : string);overload;
    //保存向量到文件
    procedure SaveVector(FileName:string);
    //从文件重读取向量
    procedure ReadVector(FileName:string);
    //
    Function Count():integer;
    //取一个向量
    Function VSM(index : integer):TVSM;overload;
    //取一个向量
    Function VSM(Word : string):TVSM;overload;
    //取一个关键字
    Function GetWords(index : integer):String;

  public
    Constructor Create();
    Destructor Destroy(); override;
  end;

 //抽象的字典
  TAbstractDic = class
  private
    FKeyWords : TStrings;
    //FVSMList : TVSMList;
  protected
   //添加一个新词
    procedure AddNew(word : string);
    //添加一个词
    procedure Add(Word : string);
    //
    Procedure Sort();
  public
    //加载文档列表
    procedure LoadDic();virtual;abstract;
  public
    //测试字符串是否是一个词
    Function IsWords(AWord:String):Boolean;virtual;
  //public
   // property VSMList : TVSMList Read FVSMList ;//Write FVSMList;
  public
    Constructor Create();
    Destructor Destroy(); override;
  end;
  
  //文件字典
  TFileDic = Class(TAbstractDic)
  private
    FDicFile : string;
  public
     //加载字典
     procedure LoadDic();override;
  public
     Constructor Create(DicFile:string);
  end;

  //数据库字典
  TDBDic = class(TAbstractDic)
  private
    FConnectionString : string;
    FTableName : string;
    FFieldName : string;
    FConnect : TAdoConnection;
    FQuery : TADoQuery;
  private
    //连接数据库
     Function ConnDB():Boolean;
     //生成Sql语言
     Function MakeSql():String;
     //查询
     Function Query():boolean;
  private
    procedure LoadDic();override;
  public
    constructor Create(ConnectionString : string;TableName : string;FieldName :string);
  end;


implementation
uses UCalcWeight;
{ TVSM }

procedure TVSM.AddVector(id, pos: integer);
var
  PV : PVector;
begin
  if IndexOf(ID,pos)=-1 then
  begin
    new(PV);
    PV.FID := id;
    PV.FPos := pos;
    FVectors.Add(PV);
  end;
end;

{procedure TVSM.AddVector(vector: TVector);
begin
  AddVector(vector.FID,vector.FPos)
end;
}
function TVSM.CalcParam: TParams;
var
  Index : integer;
begin
   FParams.Clear();
   For Index := 0 to FVectors.Count-1 do
   begin
     FParams.Addtf(PVector(FVectors[Index]).FID,0);
   end;
   For Index := 0 to FParams.Count -1 do
   begin
      FParams.Addtf(FParams.DocID(Index),tf(FParams.DocID(Index))); 
   end;
   result := FParams;
end;

procedure TVSM.Clear;
var
  i : integer;
begin
  For i := 0 to FVectors.Count -1 do
  begin
    Dispose(PVector(FVectors[i]));
  end;
  FVectors.Clear();
end;

constructor TVSM.Create;
begin
  FVectors := TList.Create;
  FParams := TParams.Create();
end;

function TVSM.Count: integer;
begin
  result := Self.FVectors.Count ;
end;

constructor TVSM.Create(ID, Pos: integer);
begin
  FVectors := TList.Create;
  FParams := TParams.Create();
  Self.AddVector(ID,Pos);
end;

destructor TVSM.Destroy;
begin
  FParams.Free;
  Clear();
  FVectors.Free;   
  inherited;
end;

function TVSM.df: integer;
var
  i : integer;
  DocIDs : TList;
  TmpID :integer;
begin
  DocIDs := TList.Create ;
  For i := 0 to FVectors.Count -1 do
  begin
     TmpID := PVector(FVectors).FID;
     if DocIDs.IndexOf(Pointer(TmpID)) < 0 then
     begin
       DocIDs.Add(Pointer(TmpID)); 
     end;
  end;
  Result := DocIDs.Count ;
  DocIDs.Free;
end;

procedure TVSM.LoadFromStream(stream: TStream);
var
  Count : integer;
  i : integer;
  ID : integer;
  Pos : integer;
begin
  Stream.Read(Count,SizeOf(integer));
  Clear();
  For i:=0 to Count-1 do
  begin
    Stream.Read(ID,SizeOf(integer));
    Stream.Read(Pos, SizeOf(integer));
    AddVector(ID,Pos);
  end;   
end;

procedure TVSM.SaveToStream(stream: TStream);
var
  Cnt : integer;
  i : integer;
  ID : integer;
  Pos : integer;
begin
  Cnt := FVectors.Count ;
  stream.Write(Cnt,sizeOf(integer));
  for i:=0 to Cnt-1 do
  begin
    ID := PVector(FVectors[i]).FID ;
    Pos := PVector(FVectors[i]).FPos;
    stream.Write(ID,SizeOf(integer));
    Stream.Write(Pos,SizeOf(integer));
  end;
  
end;

function TVSM.tf(id: integer): integer;
var
  i : integer;
begin
  Result := 0;
  For i:=0 to FVectors.Count -1 do
  begin
    if(id = PVector(FVectors[i]).FID) then
    begin
      Inc(Result);
    end;
  end;
end;

function TVSM.Vector(Index: integer): PVector;
begin
  result := PVector(FVectors[index]);
end;

function TVSM.IndexOf(ID, Pos: integer): integer;
var
  Index : integer;
begin
  Result := -1;
  For Index :=0 to FVectors.Count -1 do
  begin
    if(ID = PVector(FVectors[index]).FID)and(Pos = PVector(FVectors[index]).FPos)then
    begin
      Result := Index;
      break;
    end;
  end;

end;

procedure TVSM.AddVSM(ParamVSM: TVSM);
var
  i : integer;
begin
  if ParamVSM=nil then exit;
  For i:=0 to ParamVSM.Count-1 do
  begin
    //if self.tf(ParamVSM.Vector(i).FID)=0 then
    AddVector(ParamVSM.Vector(i).FID,ParamVSM.Vector(i).FPos);
  end;
end;

procedure TVSM.CloneTo(PVSM: TVSM);
var
  Stream : TStream;
begin
  Stream := TMemoryStream.Create ;
  try
    SaveToStream(Stream);
    Stream.Position := 0;
    PVSM.LoadFromStream(Stream);
  finally
    Stream.Free;
  end;
end;

procedure TVSM.SigleDocID;
var
  I,J : integer;
  ID : integer;
  P : PVector;
  cnt : integer;
begin
 //标志那些重复
  For I := 0 to Count -1 do
  begin
    ID := Self.Vector(I).FID;
    if ID <> -1 then
    begin
      For J := I+1 to Count-1 do
      begin
        if ID = Self.Vector(J).FID then
        begin
          Self.Vector(J).FID := -1;
        end;
      end;
    end;
  end;
  //删除那些重复的
  cnt := Count-1;
  For I := cnt downTo 0 do
  begin
    if Vector(I).FID =-1 then

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -