📄 ucndiv.pas
字号:
unit uCnDiv;
interface
uses
SysUtils, StrUtils, Classes;
const
StopWord: packed array[0..50] of string[2] = ('-','—',';','/',' ','。',
',',':','、','!','?','的','了','在','你','我','她','它','他','到','是',
'《','〉','》','〈','~','【','】','〖','〗','×','∥','■','▲',
'○','●','→','─','·','…','★','“','”','‘','’','(',')','—',')',
'们','并');
type
TDictItem = record
diText: string;
diFreq: Integer;
end;
PDictItem = ^TDictItem;
TDict = class
private
FCount: Integer;
FData: TList;
FMaxWordLen: Integer;
function GetItem(AIndex: Integer): PDictItem;
protected
public
constructor Create;
destructor Destroy; override;
function Find(const AStr: string): Integer;
procedure LoadFromFile(const AFile: string);
property Count: Integer read FCount;
property Items[AIndex: Integer]: PDictItem read GetItem;
property MaxWordLen: Integer read FMaxWordLen;
end;
TCnDiv = class
private
FDict: TDict;
protected
function IsStopWord(const AStr: string): Boolean;
procedure Step1(const AStr: string; AWordList: TStrings);
procedure Step2(const AStr: string; AWordList: TStrings);
public
constructor Create;
destructor Destroy; override;
procedure Process(const AStr: string; AWordList: TStrings);
end;
implementation
{ TDict }
constructor TDict.Create;
begin
inherited Create;
FData:=TList.Create;
FCount:=0;
end;
destructor TDict.Destroy;
begin
FData.Free;
inherited;
end;
function TDict.Find(const AStr: string): Integer;
var
i: Integer;
PItem: PDictItem;
begin
Result:=-1;
for i:=0 to FData.Count - 1 do
begin
PItem:=PDictItem(FData[i]);
if AnsiSameText(AStr,PItem^.diText) then
begin
Result:=PItem^.diFreq;
Break;
end;
end;
end;
function TDict.GetItem(AIndex: Integer): PDictItem;
begin
Result:=PDictItem(FData[AIndex]);
end;
procedure TDict.LoadFromFile(const AFile: string);
const
chDelimiter = ',';
var
F: TextFile;
i, tmpPos: Integer;
Line, tmpWord, tmpFreq: string;
PNewItem: PDictItem;
begin
AssignFile(F,AFile);
Reset(F);
FCount:=0;
FMaxWordLen:=0;
FData.Clear;
while not Eof(F) do
begin
Readln(F,Line);
tmpPos:=Pos(chDelimiter,Line);
tmpWord:=LeftBStr(Line,tmpPos - 1);
tmpFreq:=RightBStr(Line,Length(Line) - tmpPos);
New(PNewItem);
PNewItem^.diText:=tmpWord;
PNewItem^.diFreq:=StrToIntDef(tmpFreq,1);
FData.Add(PNewItem);
Inc(FCount);
if Length(tmpWord) > FMaxWordLen then
FMaxWordLen:=Length(tmpWord);
end;
CloseFile(F);
end;
{ TCnDiv }
constructor TCnDiv.Create;
begin
inherited Create;
FDict:=TDict.Create;
FDict.LoadFromFile('dict\cn.dat');
end;
destructor TCnDiv.Destroy;
begin
FDict.Free;
inherited;
end;
function TCnDiv.IsStopWord(const AStr: string): Boolean;
var
i: Integer;
begin
Result:=False;
for i:=Low(StopWord) to High(StopWord) do
if AnsiSameText(AStr,Stopword[i]) then
begin
Result:=True;
Break;
end;
end;
procedure TCnDiv.Process(const AStr: string; AWordList: TStrings);
var
i: Integer;
begin
if not Assigned(AWordList) then Exit;
if AStr = '' then Exit;
{for i:=0 to FDict.Count - 1 do
begin
AWordList.Add(FDict.Items[i]^.diText);
end;}
Step1(AStr,AWordList);
end;
procedure TCnDiv.Step1(const AStr: string; AWordList: TStrings);
var
ix: Integer;
tmpWord, tmpChr: string;
begin
ix:=1;
tmpWord:='';
while ix <= Length(AStr) do
begin
tmpChr:=MidBStr(AStr,ix,CharLength(AStr,ix));
if not IsStopWord(tmpChr) then
tmpWord:=tmpWord + tmpChr
else
begin
if tmpWord <> '' then
begin
Step2(tmpWord,AWordList);
tmpWord:='';
end;
end;
ix:=NextCharIndex(AStr,ix);
end;
end;
procedure TCnDiv.Step2(const AStr: string; AWordList: TStrings);
var
ix: Integer;
tmpWord, tmpChr: string;
begin
//AWordList.Add(AStr);
ix:=1;
tmpWord:='';
while ix <= Length(AStr) do
begin
tmpChr:=MidBStr(AStr,ix,CharLength(AStr,ix));
tmpWord:=tmpWord + tmpChr;
if FDict.Find(tmpWord) > -1 then
begin
AWordList.Add(tmpWord);
tmpWord:='';
end
else if Length(tmpWord) > FDict.MaxWordLen then
begin
tmpChr:=LeftBStr(tmpWord,CharLength(AStr,1));
AWordList.Add(tmpChr);
ix:=ix - Length(tmpWord) + Length(tmpChr);
tmpWord:='';
end;
ix:=NextCharIndex(AStr,ix);
end;
end;
end.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -