⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 hmmseg.pas

📁 基于隐马尔科夫模型的分词标注程序
💻 PAS
📖 第 1 页 / 共 5 页
字号:
    if CompareMem(result.w, key, len) and (result.cat = cat) then exit;
    result := result.next;
  end;
end;

function TSegHashTable.existword(key: PWideChar; len: integer; var h: TSegHashRec): boolean;
begin
  h := existword(key, len);
  result := h <> nil
end;
{
function TSegHashTable.existword(key: PWideChar; len, cat: integer; var h: TSegHashRec): boolean;
begin
  h := existword(key, len, cat);
  result := h <> nil
end;
}
function TSegHashTable.existword(key: PWideChar; len, cat: integer; var extra: integer): boolean;
var h: TSegHashRec;
begin
  len := len shl 1;
  h := ht[word(_hash(PChar(key), len))];
  extra := -1;
  while h <> nil do
  begin
    if CompareMem(h.w, key, len) then
    begin
      if extra = -1 then extra := h.extra;
      if (h.cat = cat) then begin result := true; exit; end;
    end;
    h := h.next;
  end;
  result := false;
end;

function pku_code_to_index(const S: string): integer;
var
  L, H, I, C: Integer;
begin
  Result := -1;
  L := 2;
  H := NCATS+1;
  while L <= H do
  begin
    I := (L + H) shr 1;
    C := CompareStr(pku_pos_codes[I], S);
    if C < 0 then L := I + 1 else
    begin
      if C = 0 then
      begin
        Result := I;
        exit
      end;
      H := I - 1;
    end;
  end;
end;

function tagname(c: integer): string; forward;
function tagnamex(c: integer): string; forward;

function TSegmentor.LoadCoreDic(const datadir: string): TSegHashTable;
const segdicfile={$ifdef segv2}'segdic.txt'{$else}'segdic.old'{$endif};
var
  f: TLineStream;
  i, j, count, cat, len, nline, len1: integer;
  w{, w0}: WideString;
  s: string;
  w0: PWideChar;
  filename: string;
  is_chinese_name, is_foreign_name: boolean;
  with_l: boolean; ncat, lcount, total_count: integer;///
begin  filename := datadir + segdicfile;  if very_verbose then writeln(erroutput, 'loading ', filename);
  //OutputDebugString(PChar(Filename+crln));

  try
    f := TLineStream.Create(filename);
  except
    error('cannot find ' + filename);
  end;
  result := TSegHashTable.Create;
  nline := 0;
  while f.wReadLn(w) do
  begin    inc(nline);
    for i := 1 to length(w) do
      if w[i] = ' ' then break;
    //w0 := copy(w, 1, i-1);
    len1 := i-1;
    getmem(w0, i shl 1);
    Move(w[1], w0^, len1 shl 1); w0[len1] := #0; //lstrcpynw(w0, PWideChar(w), i);

    // format: 艰苦 an 2 ad 6 a 67
    len := length(w);
    inc(i);
    j := i;
    with_l := false; ncat := 0; total_count := 0;///
    repeat
      while (j <= len) and (w[j] <> ' ') do inc(j);
      if j > len then error(segdicfile + ': ' + w);
      s := copy(w, i, j-i);
      is_chinese_name := s = 'nR';
      if is_chinese_name then s[2] :=  'r';
      is_foreign_name := s = 'nr';
      cat := code2index(s);
      if cat = -1 then error(format('%s at line %d: unknown cat: %s', [segdicfile, nline, s]));
      inc(j);
      i := j;
      while (j <= len) and (w[j] <> ' ') do inc(j);
      try
        count := StrToInt(copy(w, i, j-i));
      except
        error(format('%s at line %d: bad integer: %s', [segdicfile, nline, s]));
      end;
      if is_chinese_name
      then result.hashrec(PWideChar(w0), count, cat).extra :=  _xName // w0 shared!
      else if is_foreign_name
      then result.hashrec(PWideChar(w0), count, cat).extra :=  _xForeignName // w0 shared!
      else result.hashrec(PWideChar(w0), count, cat); // w0 shared!
      inc(j);
      i := j;
      if cat=18 then begin with_l := true; lcount := count end; ///
      inc(ncat);///
      inc(total_count, count); ///
    until j > len;
    {if with_l then ///
      if (lcount = 0) or //
         (lcount > 0) and (ncat = 2) and (total_count = lcount)
      then
      else writeln(w);}
    if iscc(w0^) then
    begin
      if longest_word[w0^] < len1
      then longest_word[w0^] := len1;
    end
  end;
  f.Free;
end;

var
  bigram: comlib.THashTable; // utf8-encoded! 2007-4-19

procedure TSegmentor.LoadBigram(const datadir: string);
const mybigramfile={$ifdef segv2}'mybigram.txt'{$else}'mybigram.old'{$endif};
var
  f: TLineStream;
  w2: WideString;
  s, s1, s2: string;
  c, nline: integer;
  h: TSegHashRec;
  filename: string;
begin  filename := datadir + mybigramfile;  if very_verbose then writeln(erroutput, 'loading ', filename);
  //OutputDebugString(PChar(Filename+crln));
  try
    f := TLineStream.Create(filename);
  except
    error('cannot find ' + filename);
  end;
  bigram := comlib.THashTable.Create(4096);
  nline := 0;
  while f.ReadLn(s) do
  begin
    inc(nline);
    split(s, ' ', s1, s2);
    {$ifdef segv2}
    c := pku_code_to_index(s2);
    {$else}
    c := StrToIntDef(s2, 0);
    if c = lq_nr then continue;
    {$endif}
    bigram.HashRec(PChar(s1), length(s1), true).data := c;
    w2 := utf8decode(copy(s1, pos(',', s1) + 1, maxint)); //20070419
    h := dic.existword(PWideChar(w2), length(w2), c);
    if h=nil
    then error(mybigramfile + '(line ' + IntToStr(nline) + '): '
            + {$ifdef linux}utf8encode{$endif}(w2) + ' ' + s2)
    else h.extra := _xBigram;
  end;
  f.Free;
end;

procedure TSegmentor.LoadZsuffix(const datadir: string);
const myzfile={$ifdef segv2}'zsuffix.txt'{$else}'zsuffix.old'{$endif};
var
  f: TLineStream;
  w: WideString;
  h: TSegHashRec;
  c: char;  filename: string;
begin  filename := datadir + myzfile;  if very_verbose then writeln(erroutput, 'loading ', filename);
  //OutputDebugString(PChar(Filename+crln));
  try
    f := TLineStream.Create(filename);
  except
    error('cannot find ' + filename);
  end;
  while f.wReadLn(w) do
  begin    {$ifdef segv2}
    c := char(w[length(w)]);
    if not (c in ['s','t','z', 'k', 'D', 'T', 'j', 'm', 'R']) then continue;
    setlength(w, length(w)-3);
    {$endif}
    h := dic.existword(PWideChar(w), length(w));
    if h = nil
    then begin
           if verbose then writeln(ErrOutput, 'z not in dic: ', {$ifdef linux}Utf8Encode{$endif}(w))
         end  
    else {$ifdef segv2}
         with h do
         case c of
           's': extra := extra or _xNS;
           't': extra := extra or _xNT;
           'z': extra := extra or _xNZ;
           'k': extra := extra or _xK;
           'D': extra := extra or _xDate;
           'T': extra := extra or _xTime;
           'j': extra := extra or _xJ;
           'm': extra := extra or _xNum;
           'R': extra := extra or _xRenVerb or _xNSuf;
         end
         {$else}
         with h do extra := extra or _xNS;
         {$endif}
  end;
  f.Free;
end;

procedure TSegmentor.LoadNprefix(const datadir: string);
var
  f: TLineStream;
  w: WideString;
  h: TSegHashRec;  i, len: integer;
  filename: string;
begin  filename := datadir + 'nprefix.txt';  if very_verbose then writeln(erroutput, 'loading ', filename);
  //OutputDebugString(PChar(Filename+crln));
  try
    f := TLineStream.Create(filename);
  except
    error('cannot find ' + filename);
  end;
  while f.wreadln(w) do
  begin
    i := pos(' ', w);    if i <= 0 then error(w);
    h := dic.existword(PWideChar(w), i-1);
    if h = nil
    then begin
           if verbose then writeln(ErrOutput, 'npre not in dic: ', w)
         end  
    else with h do extra := extra or _xNPre;
  end;
  f.Free;
end;

procedure TSegmentor.Load_name_suffix(const datadir: string);
var
  f: TLineStream;
  w: WideString;
  h: TSegHashRec;
  filename: string;
begin  filename := datadir + 'nsuffix.txt';  if very_verbose then writeln(erroutput, 'loading ', filename);
  //OutputDebugString(PChar(Filename+crln));
  try
    f := TLineStream.Create(filename);
  except
    error('cannot find ' + filename);
  end;
  while f.wReadLn(w) do
  begin
    h := dic.existword(PWideChar(w), length(w));
    if h = nil
    then begin
           if verbose then writeln(ErrOutput, 'nsuf not in dic: ', w)
         end  
    else with h do extra := extra or _xNSuf;
  end;
  f.Free;
end;

procedure TSegmentor.Load_VVO(const datadir: string);
var
  f: TLineStream;
  w: WideString;
  h: TSegHashRec;
  filename: string;
begin  filename := datadir + 'vvo.txt';  if very_verbose then writeln(erroutput, 'loading ', filename);
  //OutputDebugString(PChar(Filename+crln));
  try
    f := TLineStream.Create(filename);
  except
    error('cannot find ' + filename);
  end;
  while f.wReadLn(w) do
  begin
    h := dic.existword(PWideChar(w), length(w));
    if h = nil
    then begin
           if verbose then writeln(ErrOutput, 'vvo not in dic: ', w)
         end  
    else with h do extra := extra or _xVVO;
  end;
  f.Free;
end;

procedure TSegmentor.Load_AA(const datadir: string);
var
  f: TLineStream;
  w1: WideString;
  h: TSegHashRec;
  filename: string;
begin  filename := datadir + 'aa.txt';  if very_verbose then writeln(erroutput, 'loading ', filename);
  //OutputDebugString(PChar(Filename+crln));
  try
    f := TLineStream.Create(filename);
  except
    error('cannot find ' + filename);
  end;
  while f.wReadLn(w1) do
  begin
    h := dic.existword(PWideChar(w1), length(w1)-1);
    if h = nil
    then begin
           if verbose then writeln(ErrOutput, 'aa not in dic: ', w1)
         end  
    else with h do
         case chr(ord(w1[length(w1)])) of
          '1': extra := extra or _xAA;
          '2': extra := extra or _xABB;
          '4': extra := extra or _xAABB;
         end
  end;
  f.Free;
end;

function lq_maxfind(pw: PWideChar; pwlen: integer; var len: integer; var h: TSegHashRec): boolean;
begin
  result := false;
  if iscc(pw^)
  then len := seg_x.longest_word[pw^]
  else len := 3;
  if len > pwlen then len := pwlen;
  while len > 0 do
  begin
    h := lqdic.existword(pw, len);
    if h <> nil then begin result := true; exit; end;
    dec(len);
  end;
end;

type
  PRepeatInfo = ^TRepeatInfo;
  TRepeatInfo = record
    cat: integer; {-1: not set}
    index, len: byte;
  end;

function isRepeatWord2(pw: PWideChar; var len: integer; r: PRepeatInfo): boolean;
var
  h: TSegHashRec;
  extra: integer;
begin
  result := false;
  if (pw[0] = pw[1])
  then if (pw[2] <> #0) and
          (pw[2] = pw[3]) and // 平平淡淡
          lqdic.existword(@pw[1], 2, h) and (h.extra and _xAABB <> 0)
       then begin
              len := 4;
              result := true;
              if lqdic.existword(@pw[1], 2, lq_a) <> nil // inefficient but I am lazy
              then r.cat := lq_z
              else r.cat := -1;
              r.index := 1;
              r.len := 2;
            end
       else begin
              if (pw[2] <> #0) then
                if lqdic.existword(@pw[1], 2, lq_v, extra) and (extra and _xVVO <> 0) { 必须是“散散步”这样的v+n离合动词。“建建筑”不对 }
                then begin
                       result := true;
                       len := 3;
                       r.cat := lq_v;
                       r.index := 1;
                       r.len := 2;
                     end
                else if lqdic.existword(pw, 1, lq_a, extra) and (extra and _xAA <> 0) // 浓浓
                then begin
                       result := true;
                       len := 2;
                       r.cat := lq_z; // lq_a ??
                       r.index := 0;
                       r.len := 1;
                     end
                else if lqdic.existword(pw, 1, lq_o) <> nil // 啪啪啪
                then begin
                       result := true;
                       len := 2;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -