📄 ucndiv.pas
字号:
unit uCnDiv;
interface
uses
SysUtils, StrUtils, Classes;
const
StopWord: packed array[0..50] of string[2] = ('-','—',';','/',' ','。',
',',':','、','!','?','的','了','在','你','我','她','它','他','到','是',
'《','〉','》','〈','~','【','】','〖','〗','×','∥','■','▲',
'○','●','→','─','·','…','★','“','”','‘','’','(',')','—',')',
'们',' ');
type
TDictItem = record
diText: string;
diFreq: Integer;
end;
PDictItem = ^TDictItem;
TDict = class
private
FCount: Integer;
FData: array[Byte] of TList;
FMaxWordLen: Integer;
protected
public
constructor Create;
destructor Destroy; override;
function Find(const AStr: string): Integer;
procedure Load(const APath: string);
property Count: Integer read FCount;
property MaxWordLen: Integer read FMaxWordLen;
end;
TCnDiv = class
private
FDict: TDict;
protected
function IsStopWord(const AStr: string): Boolean;
procedure Step1(const AStr: string; AWordList: TStrings);
procedure Step2(const AStr: string; AWordList: TStrings);
public
constructor Create;
destructor Destroy; override;
procedure Process(const AStr: string; AWordList: TStrings);
end;
implementation
{ TDict }
constructor TDict.Create;
var
i: Integer;
begin
inherited Create;
for i:=Low(FData) to High(FData) do
FData[i]:=TList.Create;
FCount:=0;
end;
destructor TDict.Destroy;
var
i: Integer;
begin
for i:=Low(FData) to High(FData) do
FData[i].Free;
inherited;
end;
function TDict.Find(const AStr: string): Integer;
var
i: Integer;
PItem: PDictItem;
Data: TList;
begin
Result:=-1;
Data:=FData[Ord(AStr[1])];
for i:=0 to Data.Count - 1 do
begin
PItem:=PDictItem(Data[i]);
if AnsiSameText(AStr,PItem^.diText) then
begin
Result:=PItem^.diFreq;
Break;
end;
end;
end;
procedure TDict.Load(const APath: string);
const
chDelimiter = ',';
var
F: TextFile;
i, tmpPos: Integer;
Line, tmpWord, tmpFreq, FileName: string;
PNewItem: PDictItem;
begin
FMaxWordLen:=0;
FCount:=0;
for i:=Low(FData) to High(FData) do
begin
FileName:=Format('%s%s',[IncludeTrailingPathDelimiter(APath),
FormatFloat('000',i)]);
AssignFile(F,FileName);
Reset(F);
FData[i].Clear;
while not Eof(F) do
begin
Readln(F,Line);
tmpPos:=Pos(chDelimiter,Line);
tmpWord:=LeftBStr(Line,tmpPos - 1);
tmpFreq:=RightBStr(Line,Length(Line) - tmpPos);
New(PNewItem);
PNewItem^.diText:=tmpWord;
PNewItem^.diFreq:=StrToIntDef(tmpFreq,1);
FData[i].Add(PNewItem);
Inc(FCount);
if Length(tmpWord) > FMaxWordLen then
FMaxWordLen:=Length(tmpWord);
end;
CloseFile(F);
end;
end;
{ TCnDiv }
constructor TCnDiv.Create;
begin
inherited Create;
FDict:=TDict.Create;
FDict.Load('dict\');
end;
destructor TCnDiv.Destroy;
begin
FDict.Free;
inherited;
end;
function TCnDiv.IsStopWord(const AStr: string): Boolean;
var
i: Integer;
begin
Result:=False;
for i:=Low(StopWord) to High(StopWord) do
if AnsiSameText(AStr,Stopword[i]) then
begin
Result:=True;
Break;
end;
end;
procedure TCnDiv.Process(const AStr: string; AWordList: TStrings);
var
i: Integer;
begin
if not Assigned(AWordList) then Exit;
if AStr = '' then Exit;
{for i:=0 to FDict.Count - 1 do
begin
AWordList.Add(FDict.Items[i]^.diText);
end;}
Step1(AStr,AWordList);
end;
procedure TCnDiv.Step1(const AStr: string; AWordList: TStrings);
var
ix: Integer;
tmpWord, tmpChr, NotCnWord: string;
begin
ix:=1;
tmpWord:='';
NotCnWord:='';
while ix <= Length(AStr) do
begin
tmpChr:=MidBStr(AStr,ix,CharLength(AStr,ix));
if not IsStopWord(tmpChr) then
begin
//如果发现非中文字符就将它作为分隔符
if ByteType(tmpChr,1) = mbSingleByte then
begin
if tmpWord <> '' then
begin
Step2(tmpWord,AWordList);
tmpWord:='';
end;
//通过检查字节类型,将非中文部分提取出来
while (ByteType(tmpChr,1) = mbSingleByte) and (not IsStopWord(tmpChr)) do
begin
NotCnWord:=NotCnWord + tmpChr;
Inc(ix);
tmpChr:=MidBStr(AStr,ix,CharLength(AStr,ix));
end;
AWordList.Add(NotCnWord);
NotCnWord:='';
if tmpChr <> '' then
begin
Dec(ix);
tmpChr:='';
end;
end
else
tmpWord:=tmpWord + tmpChr;
end
else
begin
if tmpWord <> '' then
begin
Step2(tmpWord,AWordList);
tmpWord:='';
AWordList.Add(tmpChr);
end;
end; //end if
ix:=NextCharIndex(AStr,ix);
end; //end while
if tmpWord <> '' then
AWordList.Add(tmpWord);
end;
procedure TCnDiv.Step2(const AStr: string; AWordList: TStrings);
var
ix: Integer;
tmpWord, tmpChr: string;
begin
//AWordList.Add(AStr);
ix:=1;
tmpWord:='';
while ix <= Length(AStr) do
begin
tmpChr:=MidBStr(AStr,ix,CharLength(AStr,ix));
tmpWord:=tmpWord + tmpChr;
if FDict.Find(tmpWord) > -1 then
begin
AWordList.Add(tmpWord);
tmpWord:='';
end
else if Length(tmpWord) > FDict.MaxWordLen then
begin
tmpChr:=LeftBStr(tmpWord,CharLength(AStr,1));
AWordList.Add(tmpChr);
ix:=ix - Length(tmpWord) + Length(tmpChr);
tmpWord:='';
end;
ix:=NextCharIndex(AStr,ix);
end;
if tmpWord <> '' then
AWordList.Add(tmpWord);
end;
end.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -