📄 udic.pas
字号:
unit UDic;
{
////////////////////////////////
字典类----用于保存关键词各种参数,以及向量
////////////////////////////////
}
interface
uses SysUtils,Classes,ADODB;
const
//最大的词长度
MAX_WORD_LENGTH = 10;
type
//词类型
TWordType = string[MAX_WORD_LENGTH];
//向量的记录
PVector = ^TVector;
TVector = record
FID : integer;
FPos : integer;
end;
//权重的参数
PParam = ^TParam;
TParam = Record
FID : integer;
tf : integer;
end;
//计算权重的参数
TParams = class
private
FItems : TList;
private
procedure Clear();
Function IndexOf(FID : integer):integer;
public
//根据文档号去除词频
Function tf_ID(DocID : integer):Integer;
//根据索引号取词频
Function tf_Index(Index : integer):integer;
//添加词频
Procedure Addtf(FID : integer;tf : integer);
//取得文档的频率
Function df():Integer;
//取得参数的数量
Function Count():integer;
//根据索引取出文档号
Function DocID(Index : integer):Integer;
public
Constructor Create();
Destructor Destroy();override;
end;
//向量空间
TVSM = Class
private
FVectors : TList;
FParams : TParams;
private
procedure SaveToStream(stream : TStream);
procedure LoadFromStream(stream : TStream);
//返回词在某个文档中的频数
Function tf(id : integer):integer;
//返回包含这个词的文档数
Function df():integer;
//通过文档号和位置求索引
Function IndexOf(ID,Pos : integer):integer;
public
procedure Clear();
//procedure AddVector(vector:TVector);overload;
procedure AddVector(id,pos : integer);overload;
Function Vector(Index : integer):PVector;
Function Count():integer;
//添加向量,文档号没有重复
procedure AddVSM(ParamVSM : TVSM);
//计算返回参数
Function CalcParam():TParams;
//科隆
procedure CloneTo(PVSM : TVSM);
//文档号唯一化
procedure SigleDocID();
//返回最多的那个文档号
Function MaxDocID():integer;
public
Constructor Create();overload;
Constructor Create(ID : integer; Pos : integer);overload;
Destructor Destroy(); override;
end;
//向量空间列表
TVSMList = Class
private
FKeyWords : TStrings;
private
procedure SaveToStream(stream : TStream);
procedure LoadFromStream(stream : TStream);
public
procedure Clear();
//添加一个向量
procedure Add(Word : string;ID,Pos : integer);overload;
procedure Add(Word : string);overload;
//保存向量到文件
procedure SaveVector(FileName:string);
//从文件重读取向量
procedure ReadVector(FileName:string);
//
Function Count():integer;
//取一个向量
Function VSM(index : integer):TVSM;overload;
//取一个向量
Function VSM(Word : string):TVSM;overload;
//取一个关键字
Function GetWords(index : integer):String;
public
Constructor Create();
Destructor Destroy(); override;
end;
//抽象的字典
TAbstractDic = class
private
FKeyWords : TStrings;
//FVSMList : TVSMList;
protected
//添加一个新词
procedure AddNew(word : string);
//添加一个词
procedure Add(Word : string);
//
Procedure Sort();
public
//加载文档列表
procedure LoadDic();virtual;abstract;
public
//测试字符串是否是一个词
Function IsWords(AWord:String):Boolean;virtual;
//public
// property VSMList : TVSMList Read FVSMList ;//Write FVSMList;
public
Constructor Create();
Destructor Destroy(); override;
end;
//文件字典
TFileDic = Class(TAbstractDic)
private
FDicFile : string;
public
//加载字典
procedure LoadDic();override;
public
Constructor Create(DicFile:string);
end;
//数据库字典
TDBDic = class(TAbstractDic)
private
FConnectionString : string;
FTableName : string;
FFieldName : string;
FConnect : TAdoConnection;
FQuery : TADoQuery;
private
//连接数据库
Function ConnDB():Boolean;
//生成Sql语言
Function MakeSql():String;
//查询
Function Query():boolean;
private
procedure LoadDic();override;
public
constructor Create(ConnectionString : string;TableName : string;FieldName :string);
end;
implementation
uses UCalcWeight;
{ TVSM }
procedure TVSM.AddVector(id, pos: integer);
var
PV : PVector;
begin
if IndexOf(ID,pos)=-1 then
begin
new(PV);
PV.FID := id;
PV.FPos := pos;
FVectors.Add(PV);
end;
end;
{procedure TVSM.AddVector(vector: TVector);
begin
AddVector(vector.FID,vector.FPos)
end;
}
function TVSM.CalcParam: TParams;
var
Index : integer;
begin
FParams.Clear();
For Index := 0 to FVectors.Count-1 do
begin
FParams.Addtf(PVector(FVectors[Index]).FID,0);
end;
For Index := 0 to FParams.Count -1 do
begin
FParams.Addtf(FParams.DocID(Index),tf(FParams.DocID(Index)));
end;
result := FParams;
end;
procedure TVSM.Clear;
var
i : integer;
begin
For i := 0 to FVectors.Count -1 do
begin
Dispose(PVector(FVectors[i]));
end;
FVectors.Clear();
end;
constructor TVSM.Create;
begin
FVectors := TList.Create;
FParams := TParams.Create();
end;
function TVSM.Count: integer;
begin
result := Self.FVectors.Count ;
end;
constructor TVSM.Create(ID, Pos: integer);
begin
FVectors := TList.Create;
FParams := TParams.Create();
Self.AddVector(ID,Pos);
end;
destructor TVSM.Destroy;
begin
FParams.Free;
Clear();
FVectors.Free;
inherited;
end;
function TVSM.df: integer;
var
i : integer;
DocIDs : TList;
TmpID :integer;
begin
DocIDs := TList.Create ;
For i := 0 to FVectors.Count -1 do
begin
TmpID := PVector(FVectors).FID;
if DocIDs.IndexOf(Pointer(TmpID)) < 0 then
begin
DocIDs.Add(Pointer(TmpID));
end;
end;
Result := DocIDs.Count ;
DocIDs.Free;
end;
procedure TVSM.LoadFromStream(stream: TStream);
var
Count : integer;
i : integer;
ID : integer;
Pos : integer;
begin
Stream.Read(Count,SizeOf(integer));
Clear();
For i:=0 to Count-1 do
begin
Stream.Read(ID,SizeOf(integer));
Stream.Read(Pos, SizeOf(integer));
AddVector(ID,Pos);
end;
end;
procedure TVSM.SaveToStream(stream: TStream);
var
Cnt : integer;
i : integer;
ID : integer;
Pos : integer;
begin
Cnt := FVectors.Count ;
stream.Write(Cnt,sizeOf(integer));
for i:=0 to Cnt-1 do
begin
ID := PVector(FVectors[i]).FID ;
Pos := PVector(FVectors[i]).FPos;
stream.Write(ID,SizeOf(integer));
Stream.Write(Pos,SizeOf(integer));
end;
end;
function TVSM.tf(id: integer): integer;
var
i : integer;
begin
Result := 0;
For i:=0 to FVectors.Count -1 do
begin
if(id = PVector(FVectors[i]).FID) then
begin
Inc(Result);
end;
end;
end;
function TVSM.Vector(Index: integer): PVector;
begin
result := PVector(FVectors[index]);
end;
function TVSM.IndexOf(ID, Pos: integer): integer;
var
Index : integer;
begin
Result := -1;
For Index :=0 to FVectors.Count -1 do
begin
if(ID = PVector(FVectors[index]).FID)and(Pos = PVector(FVectors[index]).FPos)then
begin
Result := Index;
break;
end;
end;
end;
procedure TVSM.AddVSM(ParamVSM: TVSM);
var
i : integer;
begin
if ParamVSM=nil then exit;
For i:=0 to ParamVSM.Count-1 do
begin
//if self.tf(ParamVSM.Vector(i).FID)=0 then
AddVector(ParamVSM.Vector(i).FID,ParamVSM.Vector(i).FPos);
end;
end;
procedure TVSM.CloneTo(PVSM: TVSM);
var
Stream : TStream;
begin
Stream := TMemoryStream.Create ;
try
SaveToStream(Stream);
Stream.Position := 0;
PVSM.LoadFromStream(Stream);
finally
Stream.Free;
end;
end;
procedure TVSM.SigleDocID;
var
I,J : integer;
ID : integer;
P : PVector;
cnt : integer;
begin
//标志那些重复
For I := 0 to Count -1 do
begin
ID := Self.Vector(I).FID;
if ID <> -1 then
begin
For J := I+1 to Count-1 do
begin
if ID = Self.Vector(J).FID then
begin
Self.Vector(J).FID := -1;
end;
end;
end;
end;
//删除那些重复的
cnt := Count-1;
For I := cnt downTo 0 do
begin
if Vector(I).FID =-1 then
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -