⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 udic.~pas

📁 用于中文分词的算法。包括逆向分词和反向分词
💻 ~PAS
字号:
unit UDic;
{
////////////////////////////////

  字典类----用于保存关键词各种参数,以及向量

////////////////////////////////
}
interface
uses SysUtils,Classes,ADODB;
const
  //最大的词长度
  MAX_WORD_LENGTH = 10;
type
  //词类型
  TWordType = string[MAX_WORD_LENGTH];
   //向量的记录
  PVector = ^TVector;
  TVector = record
    FID : integer;
    FPos : integer;
  end;
  //权重的参数
  PParam = ^TParam;
  TParam = Record
    FID : integer;
    tf : integer;
  end;
  //计算权重的参数
  TParams = class
  private
    FItems : TList;
  private
    procedure Clear();
    Function IndexOf(FID : integer):integer;
  public
    //根据文档号去除词频
    Function tf_ID(DocID : integer):Integer;
    //根据索引号取词频
    Function tf_Index(Index : integer):integer;
    //添加词频
    Procedure Addtf(FID : integer;tf : integer);
    //取得文档的频率
    Function df():Integer;
    //取得参数的数量
    Function Count():integer;
    //根据索引取出文档号
    Function DocID(Index : integer):Integer;
  public
    Constructor Create();
    Destructor Destroy();override;
  end;
  //向量空间
  TVSM = Class
  private
     FVectors : TList;
     FParams : TParams;
  private
     procedure Clear();
     procedure SaveToStream(stream : TStream);
     procedure LoadFromStream(stream : TStream);
     //返回词在某个文档中的频数
     Function tf(id : integer):integer;
     //返回包含这个词的文档数
     Function df():integer;
     //通过文档号和位置求索引
     Function IndexOf(ID,Pos : integer):integer;
  public
     //procedure AddVector(vector:TVector);overload;
     procedure AddVector(id,pos : integer);overload;
     Function Vector(Index : integer):PVector;
     Function Count():integer;
     //计算返回参数
     Function CalcParam():TParams;
  public
     Constructor Create();overload;
     Constructor Create(ID : integer; Pos : integer);overload;
     Destructor Destroy(); override;
  end;
  //向量空间列表
  TVSMList = Class
  private
    FKeyWords : TStrings;
  private
     procedure Clear();
     procedure SaveToStream(stream : TStream);
     procedure LoadFromStream(stream : TStream);
  public
    //添加一个向量
    procedure Add(Word : string;ID,Pos : integer);overload;
    procedure Add(Word : string);overload;
    //保存向量到文件
    procedure SaveVector(FileName:string);
    //从文件重读取向量
    procedure ReadVector(FileName:string);
    //
    Function Count():integer;
    //取一个向量
    Function VSM(index : integer):TVSM;
    //取一个关键字
    Function GetWords(index : integer):String;
  public
    Constructor Create();
    Destructor Destroy(); override;
  end;

 //抽象的字典
  TAbstractDic = class
  private
    FKeyWords : TStrings;
    //FVSMList : TVSMList;
  protected
   //添加一个新词
    procedure AddNew(word : string);
    //添加一个词
    procedure Add(Word : string);
    //
    Procedure Sort();
  public
    //加载文档列表
    procedure LoadDic();virtual;abstract;
  public
    //测试字符串是否是一个词
    Function IsWords(AWord:String):Boolean;virtual;
  //public
   // property VSMList : TVSMList Read FVSMList ;//Write FVSMList;
  public
    Constructor Create();
    Destructor Destroy(); override;
  end;
  
  //文件字典
  TFileDic = Class(TAbstractDic)
  private
    FDicFile : string;
  public
     //加载字典
     procedure LoadDic();override;
  public
     Constructor Create(DicFile:string);
  end;

  //数据库字典
  TDBDic = class(TAbstractDic)
  private
    FConnectionString : string;
    FTableName : string;
    FFieldName : string;
    FConnect : TAdoConnection;
    FQuery : TADoQuery;
  private
    //连接数据库
     Function ConnDB():Boolean;
     //生成Sql语言
     Function MakeSql():String;
     //查询
     Function Query():boolean;
  private
    procedure LoadDic();override;
  public
    constructor Create(ConnectionString : string;TableName : string;FieldName :string);
  end;


implementation
uses UCalcWeight;
{ TVSM }

procedure TVSM.AddVector(id, pos: integer);
var
  PV : PVector;
begin
  if IndexOf(ID,pos)=-1 then
  begin
    new(PV);
    PV.FID := id;
    PV.FPos := pos;
    FVectors.Add(PV);
  end;
end;

{procedure TVSM.AddVector(vector: TVector);
begin
  AddVector(vector.FID,vector.FPos)
end;
}
function TVSM.CalcParam: TParams;
var
  Index : integer;
begin
   FParams.Clear();
   For Index := 0 to FVectors.Count-1 do
   begin
     FParams.Addtf(PVector(FVectors[Index]).FID,0);
   end;
   For Index := 0 to FParams.Count -1 do
   begin
      FParams.Addtf(FParams.DocID(Index),tf(FParams.DocID(Index))); 
   end;
   result := FParams;
end;

procedure TVSM.Clear;
var
  i : integer;
begin
  For i := 0 to FVectors.Count -1 do
  begin
    Dispose(PVector(FVectors[i]));
  end;
  FVectors.Clear();
end;

constructor TVSM.Create;
begin
  FVectors := TList.Create;
  FParams := TParams.Create();
end;

function TVSM.Count: integer;
begin
  result := Self.FVectors.Count ;
end;

constructor TVSM.Create(ID, Pos: integer);
begin
  FVectors := TList.Create;
  FParams := TParams.Create();
  Self.AddVector(ID,Pos);
end;

destructor TVSM.Destroy;
begin
  FParams.Free;
  Clear();
  FVectors.Free;   
  inherited;
end;

function TVSM.df: integer;
var
  i : integer;
  DocIDs : TList;
  TmpID :integer;
begin
  DocIDs := TList.Create ;
  For i := 0 to FVectors.Count -1 do
  begin
     TmpID := PVector(FVectors).FID;
     if DocIDs.IndexOf(Pointer(TmpID)) < 0 then
     begin
       DocIDs.Add(Pointer(TmpID)); 
     end;
  end;
  Result := DocIDs.Count ;
  DocIDs.Free;
end;

procedure TVSM.LoadFromStream(stream: TStream);
var
  Count : integer;
  i : integer;
  ID : integer;
  Pos : integer;
begin
  Stream.Read(Count,SizeOf(integer));
  Clear();
  For i:=0 to Count-1 do
  begin
    Stream.Read(ID,SizeOf(integer));
    Stream.Read(Pos, SizeOf(integer));
    AddVector(ID,Pos);
  end;   
end;

procedure TVSM.SaveToStream(stream: TStream);
var
  Count : integer;
  i : integer;
  ID : integer;
  Pos : integer;
begin
  Count := FVectors.Count ;
  stream.Write(Count,sizeOf(integer));
  for i:=0 to Count-1 do
  begin
    ID := PVector(FVectors[i]).FID ;
    Pos := PVector(FVectors[i]).FPos;
    stream.Write(ID,SizeOf(integer));
    Stream.Write(Pos,SizeOf(integer));
  end;
  
end;

function TVSM.tf(id: integer): integer;
var
  i : integer;
begin
  Result := 0;
  For i:=0 to FVectors.Count -1 do
  begin
    if(id = PVector(FVectors[i]).FID) then
    begin
      Inc(Result);
    end;
  end;
end;

function TVSM.Vector(Index: integer): PVector;
begin
  result := PVector(FVectors[index]);
end;

function TVSM.IndexOf(ID, Pos: integer): integer;
var
  Index : integer;
begin
  Result := -1;
  For Index :=0 to FVectors.Count -1 do
  begin
    if(ID = PVector(FVectors[index]).FID)and(Pos = PVector(FVectors[index]).FPos)then
    begin
      Result := Index;
      break;
    end;
  end;

end;

{ TAbstractDic }

procedure TAbstractDic.Add(Word: string);
begin
  begin
    AddNew(Word);  
  end;
end;

procedure TAbstractDic.AddNew(word: string);
begin
  FKeyWords.Add(word);
end;

constructor TAbstractDic.Create;
begin
  FKeyWords := TStringList.Create();
end;

destructor TAbstractDic.Destroy;
begin
  FKeyWords.Free;
end;

function TAbstractDic.IsWords(AWord: String): Boolean;
var
  Index : Integer;
begin
  Result := false;
  Index := FKeyWords.IndexOf(AWord);
  if Index >=0 then
  begin
    //TVSM(FKeyWords.Objects[Index]).AddVector(ID,Pos);
    Result := true;
  end;
end;
procedure TAbstractDic.Sort;
begin
  TStringList(FKeyWords).Sorted := true;
end;

{ TFileDic }

constructor TFileDic.Create(DicFile: string);
begin
  Inherited Create();
  FDicFile := DicFile;
end;

procedure TFileDic.LoadDic;
var
  F : TextFile;
  tmpStr : String;
begin
  inherited;
  if FileExists(FDicFile) then
  begin
    AssignFile(F,FDicFile) ;
    try
      Reset(F);
      While not Eof(F) do
      begin
        readln(F,tmpStr);
        AddNew(tmpStr);
      end;
      Sort();
    finally
      CloseFile(F);
    end;
  end;
end;

{ TDBDic }

function TDBDic.ConnDB: Boolean;
begin
    try
    FConnect.ConnectionString := FConnectionString;
    FConnect.Connected := true;
    FQuery.Connection := FConnect;
    Result := true;
  Except
    Result := false;
  end;
end;

constructor TDBDic.Create(ConnectionString, TableName, FieldName: string);
begin
  Inherited Create();
  FConnectionString := ConnectionString;
  FTableName := TableName;
  FFieldName := FieldName;
  FConnect := TAdoConnection.Create(nil);
  FQuery := TAdoQuery.Create(nil);
end;

procedure TDBDic.LoadDic;
begin
  inherited;
  if ConnDB() then
  begin
    if Query() then
    begin
      if not FQuery.IsEmpty then
      begin
        While not FQuery.Eof do
        begin
          self.AddNew(FQuery.Fields[0].asString); 
          FQuery.Next;
        end;
      end;
    end;
  end; 
end;

function TDBDic.MakeSql: String;
begin
  Result := 'SELECT '+FFieldName+' FROM '+FTableName;
end;

function TDBDic.Query: boolean;
begin
  FQuery.Close;
  FQuery.SQL.Clear;
  FQuery.SQL.Text := MakeSql();
  try
    FQuery.Open;
    Result := true;
  except
    Result := false;
  end;
end;

{ TParams }

procedure TParams.Addtf(FID, tf: integer);
var
  Index : integer;
  P : PParam;
begin
  Index := IndexOf(FID);
 if Index <> -1 then
 begin
   P := PParam(FItems[Index]);
   P.tf := tf;
 end
 else
 begin
   new(P);
   P.FID := FID;
   p.tf := tf;
   FItems.Add(P);
 end;
end;

procedure TParams.Clear;
var
  Index : integer;
  P : PParam;
begin
  For Index :=0 to FItems.Count-1 do
  begin
    P := PParam(FItems[Index]);
    Dispose(P);
  end;
  FItems.Clear();
end;

function TParams.Count: integer;
begin
  result := FItems.Count ;
end;

constructor TParams.Create;
begin
  FItems := TList.Create();
end;

destructor TParams.Destroy;
begin
  Clear();
  FItems.Free;
  inherited;
end;

function TParams.df: Integer;
begin
  result := FItems.Count;
end;

function TParams.DocID(Index: integer): Integer;
begin
  Result := PParam(FItems[Index]).FID ;
end;

function TParams.IndexOf(FID: integer): integer;
var
  P : PParam;
  I : integer;
begin
  Result := -1;
  For I:=0 to FItems.Count-1 do
  begin
    P := PParam(FItems[I]);
    if (P.FID = FID) then
    begin
       Result := I;
       break;
    end;
  end;
end;

function TParams.tf_ID(DocID: integer): Integer;
var
  Index : integer;
begin
  Index := IndexOf(DocID);
  result := tf_Index(Index);
end;

function TParams.tf_Index(Index: integer): integer;
begin
  result := -1;
  if (Index >= 0) and (Index < FItems.Count) then
  begin
    Result := PParam(FItems[Index]).tf ;
  end;
end;

{ TVSMList }

procedure TVSMList.Add(Word: string; ID, Pos: integer);
var
  Index : integer;
begin
  Index := FKeyWords.IndexOf(Word);
  if(Index < 0)then
  begin
    FKeyWords.AddObject(Word,TVSM.Create(ID,Pos));
  end
  else
  begin
    TVSM(FKeyWords.Objects[Index]).AddVector(ID,Pos); 
  end;
end;

procedure TVSMList.Add(Word: string);
begin
  if(FKeyWords.IndexOf(Word)<0) then
  begin
    FKeyWords.AddObject(Word,TVSM.Create()); 
  end;
end;

procedure TVSMList.Clear;
var
  i : integer;
begin
  For i:=0 to FKeyWords.Count -1 do
  begin
    TVSM(FKeyWords.Objects[i]).Free;
  end;
  FKeyWords.Clear();
end;

function TVSMList.Count: integer;
begin
  result := FKeyWords.Count;
end;

constructor TVSMList.Create;
begin
  FKeyWords := TStringList.Create();
end;

destructor TVSMList.Destroy;
begin
  Clear();
  FKeyWords.Free;
  inherited;
end;

function TVSMList.GetWords(index: integer): String;
begin
  result := FKeyWords[index];
end;

procedure TVSMList.LoadFromStream(stream: TStream);
var
  Count : integer;
  WordLen : integer;
  Word : TWordType;
  i : integer;
begin
  Stream.Read(Count,sizeof(integer));
  Clear();
  for i:=0 to Count-1 do
  begin
    Stream.Read(Word,SizeOf(TWordType));
    //Stream.Read(Word,WordLen);
    Self.Add(Word);
    TVSM(FKeyWords.Objects[i]).LoadFromStream(Stream);
  end;

end;

procedure TVSMList.ReadVector(FileName: string);
var
  stream : TStream;
begin
  stream := TFileStream.Create(FileName,fmOpenRead);
  try
    self.LoadFromStream(stream);
  finally
    Stream.Free;
  end; 
end;

procedure TVSMList.SaveToStream(stream: TStream);
var
  Count : integer;
  WordLen : integer;
  Word : TWordType;
  i : integer;
begin
  Count := FKeyWords.Count;
  Stream.Write(Count,SizeOf(integer));
  For i:=0 to Count-1 do
  begin
    //wordLen := Length(FKeyWords[i]);
    Word := FKeyWords[i];
    Stream.Write(Word,SizeOf(TWordType));
    TVSM(FKeyWords.Objects[i]).SaveToStream(stream);
  end;

end;

procedure TVSMList.SaveVector(FileName: string);
var
  Stream : TStream;
begin
  Stream := TFileStream.Create(FileName,fmCreate);
  try
    SaveToStream(Stream);
  finally
    stream.Free;
  end;
end;

function TVSMList.VSM(index: integer): TVSM;
begin
  result := TVSM(FKeyWords.Objects[index]);
end;

end.

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -