📄 uparser.pas
字号:
unit UParser;
{
//////////////////////////////////////////
词分析类
这个分析类的速度很慢,应该还能优化
//////////////////////////////////////////
}
interface
uses Classes,UDic;
Type
//分析器
TParser = class
private
FDic : TAbstractDic;
FVSMList : TVSMList;
private
FID : integer;
private
//分割纯中文字符--正向最大分割
Function SegmentChinese_Positive(iPos:integer;Text:String):string;
//分割纯中文字符--反向最大分割
Function SegmentChinese_Negative(iPos:integer;Text:String):string;
public
//分割处理函数
Function Segment(Text : string;bDirection:boolean):string;
public
property VSMList : TVSMList read FVSMList write FVSMList;
//property DocID : integer read FID write FID;
public
//分析函数
procedure Parser(TextID : integer; Text:String);
public
Constructor Create(Dic:TAbstractDic;VSMList : TVSMList);
Destructor Destroy();override;
end;
implementation
{ TParser }
constructor TParser.Create(Dic:TAbstractDic;VSMList : TVSMList);
begin
FDic := Dic;
FVSMList := VSMList;
FID := -1;
end;
destructor TParser.Destroy;
begin
inherited;
end;
procedure TParser.Parser(TextID: integer; Text: String);
begin
FID := TextID;
//这里进行正反两次分词
Segment(Text,true);
Segment(Text,false);
end;
function TParser.SegmentChinese_Negative(iPos: integer; Text: String): string;
var
s1,s2 : String;
len,wLen : integer;
w : string;
IsW : boolean;
pos : integer;
begin
Pos := iPos+Length(Text);
s1 := Text;
while s1<>'' do
begin
len := Length(s1);
wLen := MAX_WORD_LENGTH;
if len < MAX_WORD_LENGTH then wLen := len;
//取出一个词,对这个词进行分析
w := Copy(s1,len-wLen+1,wlen);
IsW := FDic.IsWords(W);
while (wlen>2) and(IsW=False) do
begin
dec(wlen,2);
w := Copy(w,3,wlen);
IsW := FDic.IsWords(W);
end;
//词分析完毕,如果是关键词,那么就添加,到向量中去
Pos := Pos - wLen;
if IsW and (FID<>-1) then
begin
FVSMList.Add(w,FID,Pos);
end;
s2 := w+'/'+s2;
s1 := Copy(s1,0,len-wLen);
end;
Result := s2;
end;
Function TParser.SegmentChinese_Positive(iPos:integer;Text: String):String;var
s1,s2 : String;
len : integer;
w : string;
IsW : boolean;
pos : integer;
begin
Pos := iPos;
s1 := Text;
while s1<>'' do
begin
len := Length(s1);
if len > MAX_WORD_LENGTH then len := MAX_WORD_LENGTH;
//取出一个词,对这个词进行分析
w := Copy(s1,0,len);
IsW := FDic.IsWords(W);
while (len>2) and(IsW=False) do
begin
dec(len,2);
w := Copy(w,0,len);
IsW := FDic.IsWords(W);
end;
//词分析完毕,如果是关键词,那么就添加,到向量中去
if IsW and (FID<>-1) then
begin
FVSMList.Add(w,FID,Pos);
end;
s2 :=s2+ w+'/';
Pos := Pos + Length(W);
s1 := Copy(s1,length(w)+1,Length(s1)-length(w));
end;
Result := s2;
end;
Function TParser.Segment(Text: string;bDirection:boolean):string;var
s1,s2,tmpStr : string;
i,len : integer;
ch : char;
pos : integer;
begin
s1 := text;
pos := 0;
while s1<>'' do
begin //首先去除空格,换行,段落符号 i := 1; len := length(s1); while (i <= len) and (s1[i] = ' ') or (s1[i] =#10) or (s1[i] =#13) do inc(i); if (i > len) then begin break; end; if (i > 1) then begin pos := pos+i; s1 := copy(s1,i,len-i+1); end; ///这里处理ASCII字符 ch := s1[1]; if(ch < #128)then begin i := 1; len := length(s1); while(i<=len) and (s1[i]< #128) and (s1[i] <> ' ') and (s1[i]<>#10) and (s1[i]<> #13)do begin inc(i); end; s2 := s2+Copy(s1,0,i-1) + '/'; if (i <= len)then begin pos := pos+i; s1 := Copy(s1,i,len-i+1); continue; end else begin break; end; end else if (ch < #176) then // 中文标点等非汉字字符 begin i := 1; len := length(s1); while(i<=len) and (s1[i]<#176) and (s1[i]>=#161) and (not ((s1[i]=#161) and ((s1[i+1]>=#162) and (s1[i+1]<=#168)))) and (not ((s1[i]=#161) and ((s1[i+1]>=#171) and (s1[i+1]<=#191)))) and (not ((s1[i]=#163) and ((s1[i+1]=#172) or (s1[i+1]=#161)) or (s1[i+1]=#168) or (s1[i+1]=#169) or (s1[i+1]=#186) or (s1[i+1]=#187) or (s1[i+1]=#191))) do begin inc(i,2); // 假定没有半个汉字 end; if (i=1) then i:=i+2; // 不处理中文空格 if not((ch=#161) and (s1[i]=#161))then begin if (i <= len)then // 其他的非汉字双字节字符可能连续输出 s2 :=s2 + Copy(s1,0,i-1) + '/' else break; // yhf end; //准备下一次的处理 if (i <= len)then begin pos := pos+i; s1:=Copy(s1,i,len-i+1); end else break; continue; end; // 以下处理汉字串 i := 1; len := length(s1); while(i<=len) and (s1[i]>=#176) do inc(i,2); tmpStr := Copy(s1,0,i-1); if(bDirection) then s2 :=s2+SegmentChinese_Positive(pos,tmpStr) else s2 :=s2+SegmentChinese_Negative(pos,tmpStr); if (i <= len)then begin pos := pos+i; s1 :=Copy(s1,i,len-i+1); end else break; end; Result := s2;end;end.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -