⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 ucndiv.pas

📁 Delphi实现的简单中文分词
💻 PAS
字号:
unit uCnDiv;

interface

uses
  SysUtils, StrUtils, Classes;

const
  StopWord: packed array[0..50] of string[2] = ('-','—',';','/',' ','。',
    ',',':','、','!','?','的','了','在','你','我','她','它','他','到','是',
    '《','〉','》','〈','~','【','】','〖','〗','×','∥','■','▲',
    '○','●','→','─','·','…','★','“','”','‘','’','(',')','—',')',
    '们','并');

type
  TDictItem = record
    diText: string;
    diFreq: Integer;
  end;
  PDictItem = ^TDictItem;

  TDict = class
  private
    FCount: Integer;
    FData: TList;
    FMaxWordLen: Integer;
    function GetItem(AIndex: Integer): PDictItem;
  protected

  public
    constructor Create;
    destructor Destroy; override;
    function Find(const AStr: string): Integer;
    procedure LoadFromFile(const AFile: string);
    property Count: Integer read FCount;
    property Items[AIndex: Integer]: PDictItem read GetItem;
    property MaxWordLen: Integer read FMaxWordLen;
  end;

  TCnDiv = class
  private
    FDict: TDict;
  protected
    function IsStopWord(const AStr: string): Boolean;
    procedure Step1(const AStr: string; AWordList: TStrings);
    procedure Step2(const AStr: string; AWordList: TStrings);
  public
   constructor Create;
   destructor Destroy; override;
   procedure Process(const AStr: string; AWordList: TStrings);
  end;

implementation

{ TDict }

constructor TDict.Create;
begin
  inherited Create;
  FData:=TList.Create;
  FCount:=0;
end;

destructor TDict.Destroy;
begin
  FData.Free;
  inherited;
end;

function TDict.Find(const AStr: string): Integer;
var
  i: Integer;
  PItem: PDictItem;
begin
  Result:=-1;
  for i:=0 to FData.Count - 1 do
  begin
    PItem:=PDictItem(FData[i]);
    if AnsiSameText(AStr,PItem^.diText) then
    begin
      Result:=PItem^.diFreq;
      Break;
    end;
  end;
end;

function TDict.GetItem(AIndex: Integer): PDictItem;
begin
  Result:=PDictItem(FData[AIndex]);
end;

procedure TDict.LoadFromFile(const AFile: string);
const
  chDelimiter = ',';
var
  F: TextFile;
  i, tmpPos: Integer;
  Line, tmpWord, tmpFreq: string;
  PNewItem: PDictItem;
begin
  AssignFile(F,AFile);
  Reset(F);
  FCount:=0;
  FMaxWordLen:=0;
  FData.Clear;
  while not Eof(F) do
  begin
    Readln(F,Line);
    tmpPos:=Pos(chDelimiter,Line);
    tmpWord:=LeftBStr(Line,tmpPos - 1);
    tmpFreq:=RightBStr(Line,Length(Line) - tmpPos);
    New(PNewItem);
    PNewItem^.diText:=tmpWord;
    PNewItem^.diFreq:=StrToIntDef(tmpFreq,1);
    FData.Add(PNewItem);
    Inc(FCount);
    if Length(tmpWord) > FMaxWordLen then
      FMaxWordLen:=Length(tmpWord);
  end;
  CloseFile(F);
end;

{ TCnDiv }

constructor TCnDiv.Create;
begin
  inherited Create;
  FDict:=TDict.Create;
  FDict.LoadFromFile('dict\cn.dat');
end;

destructor TCnDiv.Destroy;
begin
  FDict.Free;
  inherited;
end;

function TCnDiv.IsStopWord(const AStr: string): Boolean;
var
  i: Integer;
begin
  Result:=False;
  for i:=Low(StopWord) to High(StopWord) do
    if AnsiSameText(AStr,Stopword[i]) then
    begin
      Result:=True;
      Break;
    end;
end;

procedure TCnDiv.Process(const AStr: string; AWordList: TStrings);
var
  i: Integer;
begin
  if not Assigned(AWordList) then Exit;
  if AStr = '' then Exit;
  {for i:=0 to FDict.Count - 1 do
  begin
    AWordList.Add(FDict.Items[i]^.diText);
  end;}
  Step1(AStr,AWordList);
end;

procedure TCnDiv.Step1(const AStr: string; AWordList: TStrings);
var
  ix: Integer;
  tmpWord, tmpChr: string;
begin
  ix:=1;
  tmpWord:='';
  while ix <= Length(AStr) do
  begin
    tmpChr:=MidBStr(AStr,ix,CharLength(AStr,ix));
    if not IsStopWord(tmpChr) then
      tmpWord:=tmpWord + tmpChr
    else
    begin
      if tmpWord <> '' then
      begin
        Step2(tmpWord,AWordList);
        tmpWord:='';
      end;
    end;
    ix:=NextCharIndex(AStr,ix);
  end;
end;

procedure TCnDiv.Step2(const AStr: string; AWordList: TStrings);
var
  ix: Integer;
  tmpWord, tmpChr: string;
begin
  //AWordList.Add(AStr);
  ix:=1;
  tmpWord:='';
  while ix <= Length(AStr) do
  begin
    tmpChr:=MidBStr(AStr,ix,CharLength(AStr,ix));
    tmpWord:=tmpWord + tmpChr;
    if FDict.Find(tmpWord) > -1 then
    begin
      AWordList.Add(tmpWord);
      tmpWord:='';
    end
    else if Length(tmpWord) > FDict.MaxWordLen then
    begin
      tmpChr:=LeftBStr(tmpWord,CharLength(AStr,1));
      AWordList.Add(tmpChr);
      ix:=ix - Length(tmpWord) + Length(tmpChr);
      tmpWord:='';
    end;
    ix:=NextCharIndex(AStr,ix);
  end;
end;

end.

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -