📄 ucndiv.pas

📁 简单的汉语分词算法
💻 PAS
字号:
unit uCnDiv;

interface

uses
  SysUtils, StrUtils, Classes;

const
  StopWord: packed array[0..50] of string[2] = ('－','—','；','／','　','。',
    '，','：','、','！','？','的','了','在','你','我','她','它','他','到','是',
    '《','〉','》','〈','～','【','】','〖','〗','×','∥','■','▲',
    '○','●','→','─','·','…','★','“','”','‘','’','（','）','—',')',
    '们',' ');

type
  TDictItem = record
    diText: string;
    diFreq: Integer;
  end;
  PDictItem = ^TDictItem;

  TDict = class
  private
    FCount: Integer;
    FData: array[Byte] of TList;
    FMaxWordLen: Integer;
  protected

  public
    constructor Create;
    destructor Destroy; override;
    function Find(const AStr: string): Integer;
    procedure Load(const APath: string);
    property Count: Integer read FCount;
    property MaxWordLen: Integer read FMaxWordLen;
  end;

  TCnDiv = class
  private
    FDict: TDict;
  protected
    function IsStopWord(const AStr: string): Boolean;
    procedure Step1(const AStr: string; AWordList: TStrings);
    procedure Step2(const AStr: string; AWordList: TStrings);
  public
   constructor Create;
   destructor Destroy; override;
   procedure Process(const AStr: string; AWordList: TStrings);
  end;

implementation

{ TDict }

constructor TDict.Create;
var
  i: Integer;
begin
  inherited Create;
  for i:=Low(FData) to High(FData) do
    FData[i]:=TList.Create;
  FCount:=0;
end;

destructor TDict.Destroy;
var
  i: Integer;
begin
  for i:=Low(FData) to High(FData) do
    FData[i].Free;
  inherited;
end;

function TDict.Find(const AStr: string): Integer;
var
  i: Integer;
  PItem: PDictItem;
  Data: TList;
begin
  Result:=-1;
  Data:=FData[Ord(AStr[1])];
  for i:=0 to Data.Count - 1 do
  begin
    PItem:=PDictItem(Data[i]);
    if AnsiSameText(AStr,PItem^.diText) then
    begin
      Result:=PItem^.diFreq;
      Break;
    end;
  end;
end;

procedure TDict.Load(const APath: string);
const
  chDelimiter = ',';
var
  F: TextFile;
  i, tmpPos: Integer;
  Line, tmpWord, tmpFreq, FileName: string;
  PNewItem: PDictItem;
begin
  FMaxWordLen:=0;
  FCount:=0;
  for i:=Low(FData) to High(FData) do
  begin
    FileName:=Format('%s%s',[IncludeTrailingPathDelimiter(APath),
      FormatFloat('000',i)]);
    AssignFile(F,FileName);
    Reset(F);
    FData[i].Clear;
    while not Eof(F) do
    begin
      Readln(F,Line);
      tmpPos:=Pos(chDelimiter,Line);
      tmpWord:=LeftBStr(Line,tmpPos - 1);
      tmpFreq:=RightBStr(Line,Length(Line) - tmpPos);
      New(PNewItem);
      PNewItem^.diText:=tmpWord;
      PNewItem^.diFreq:=StrToIntDef(tmpFreq,1);
      FData[i].Add(PNewItem);
      Inc(FCount);
      if Length(tmpWord) > FMaxWordLen then
        FMaxWordLen:=Length(tmpWord);
    end;
    CloseFile(F);
  end;
end;

{ TCnDiv }

constructor TCnDiv.Create;
begin
  inherited Create;
  FDict:=TDict.Create;
  FDict.Load('dict\');
end;

destructor TCnDiv.Destroy;
begin
  FDict.Free;
  inherited;
end;

function TCnDiv.IsStopWord(const AStr: string): Boolean;
var
  i: Integer;
begin
  Result:=False;
  for i:=Low(StopWord) to High(StopWord) do
    if AnsiSameText(AStr,Stopword[i]) then
    begin
      Result:=True;
      Break;
    end;
end;

procedure TCnDiv.Process(const AStr: string; AWordList: TStrings);
var
  i: Integer;
begin
  if not Assigned(AWordList) then Exit;
  if AStr = '' then Exit;
  {for i:=0 to FDict.Count - 1 do
  begin
    AWordList.Add(FDict.Items[i]^.diText);
  end;}
  Step1(AStr,AWordList);
end;

procedure TCnDiv.Step1(const AStr: string; AWordList: TStrings);
var
  ix: Integer;
  tmpWord, tmpChr, NotCnWord: string;
begin
  ix:=1;
  tmpWord:='';
  NotCnWord:='';
  while ix <= Length(AStr) do
  begin
    tmpChr:=MidBStr(AStr,ix,CharLength(AStr,ix));
    if not IsStopWord(tmpChr) then
    begin
      //如果发现非中文字符就将它作为分隔符
      if ByteType(tmpChr,1) = mbSingleByte then
      begin
        if tmpWord <> '' then
        begin
          Step2(tmpWord,AWordList);
          tmpWord:='';
        end;
        //通过检查字节类型，将非中文部分提取出来
        while (ByteType(tmpChr,1) = mbSingleByte) and (not IsStopWord(tmpChr)) do
        begin
          NotCnWord:=NotCnWord + tmpChr;
          Inc(ix);
          tmpChr:=MidBStr(AStr,ix,CharLength(AStr,ix));
        end;
        AWordList.Add(NotCnWord);
        NotCnWord:='';
        if tmpChr <> '' then
        begin
          Dec(ix);
          tmpChr:='';
        end;
      end
      else
        tmpWord:=tmpWord + tmpChr;
    end
    else
    begin
      if tmpWord <> '' then
      begin
        Step2(tmpWord,AWordList);
        tmpWord:='';
        AWordList.Add(tmpChr);
      end;
    end; //end if
    ix:=NextCharIndex(AStr,ix);
  end; //end while
  if tmpWord <> '' then
    AWordList.Add(tmpWord);
end;

procedure TCnDiv.Step2(const AStr: string; AWordList: TStrings);
var
  ix: Integer;
  tmpWord, tmpChr: string;
begin
  //AWordList.Add(AStr);
  ix:=1;
  tmpWord:='';
  while ix <= Length(AStr) do
  begin
    tmpChr:=MidBStr(AStr,ix,CharLength(AStr,ix));
    tmpWord:=tmpWord + tmpChr;
    if FDict.Find(tmpWord) > -1 then
    begin
      AWordList.Add(tmpWord);
      tmpWord:='';
    end
    else if Length(tmpWord) > FDict.MaxWordLen then
    begin
      tmpChr:=LeftBStr(tmpWord,CharLength(AStr,1));
      AWordList.Add(tmpChr);
      ix:=ix - Length(tmpWord) + Length(tmpChr);
      tmpWord:='';
    end;
    ix:=NextCharIndex(AStr,ix);
  end;
  if tmpWord <> '' then
    AWordList.Add(tmpWord);
end;

end.
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -