⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 hmmseg.pas

📁 基于隐马尔科夫模型的分词标注程序
💻 PAS
📖 第 1 页 / 共 5 页
字号:
unit hmmseg;
{$J+}

// 删除了姓"兵"

interface

uses {$ifdef win32}windows, {$endif}comlib, mystr;

// HMM seg/tag besed on liuqun's resource, but my algorithm

{  2003-10-16 决定今天开始实现 HMM 分词标注算法。
   词图是一个关键的数据结构。
}
const
  MAXPRED = 16; // 够不够?
  pos_foreign_name = 1;
  pos_chinese_name = 2;
  pos_chinese_name_double = 3; // 注意,对于词典中的人名,=0
  pos_chinese_name_special_surname_last = 4; // 老梁,小张
  pos_chinese_name_special_surname_first = 5; // 梁某,张总
  {$ifdef segv2}NCATS = 56;{$else}NCATS = 48;{$endif}
  {比 isNumberChar2 多了 “第”}{$ifdef linux}
  Type4_NumberChars_j: PWideChar = '第零兆%分之半几俩卅两点%‰-/'; // 多 deleted to bend to PEOPLE DAILY
  Type4_NumberChars_f: PWideChar = '第零%分之半幾倆卅兩點%‰-/'; // 多 deleted to bend to PEOPLE DAILY{$else}  Type4_NumberChars_j: PWideChar = '第零〇兆%分之半几俩卅两点%‰-/'; // 多 deleted to bend to PEOPLE DAILY
  Type4_NumberChars_f: PWideChar = '第零〇%分之半幾倆卅兩點%‰-/'; // 多 deleted to bend to PEOPLE DAILY{$endif}
      //数, or 數 deleted
  D: array[0..4] of PWideChar =
  (
    '0.123456789',
    '0.·123456789',
    '○O一二三四五六七八九十百廿千万亿',  //千萬億 deleted from type 4
    '壹贰叁肆伍陆柒捌玖拾伯仟',
    nil // set dynamically
  );
  PUNS = ':;()[],."!<>{}=';
  dt_Date = 1;
  dt_Time = 2;

type
  TSegHashRec = class
    w: PWideChar;
    count: integer; // 在训练语料中的出现次数
    cat: integer;
    next: TSegHashRec;
    extra: integer;  // search for _xBigram...
  end;

  TSegHashTable = class
    ht: array[0..65535] of TSegHashRec;
    nrec: integer; { number of records }
    function hashrec(key: PWideChar; count, cat: integer): TSegHashRec;
    function existword(key: PWideChar; len: integer): TSegHashRec; overload;
    function existword(key: PWideChar; len: integer; var h: TSegHashRec): boolean; overload;
    function existword(key: PWideChar; len, cat: integer): TSegHashRec; overload;
    //function existword(key: PWideChar; len, cat: integer; var h: TSegHashRec): boolean; overload;
    function existword(key: PWideChar; len, cat: integer; var extra: integer): boolean; overload;
//    destructor Destroy; override;
//    procedure Clear;
//    procedure LoadFromFile(const filename: string);
  end;

  TTagMode = (tag_normal, tag_no, tag_namex, tag_neon, tag_only);
  TSegmentor = class
    dic: TSegHashTable;
    C1_count: array[0..NCATS+1] of integer;
    C1_C2: array[0..NCATS+1, 0..NCATS+1] of integer;
    {$ifdef g3}
    C1_C2_C3: array[0..NCATS+1, 0..NCATS+1, 0..NCATS+1] of integer;
    {$endif}
    longest_word: array [MinCC..MaxCC] of byte;
    CC, jcc: TNameFlags;
    np0, np1, np2, np3: TProb;
    constructor Create(const datadir: string; traditional: boolean);
    procedure InitMatrix(const datadir: string);
    function Prob2(c1, c2: integer): real; // P(c2|c1) 转移概率
{$ifdef g3}
    function Prob3(c0, c1, c2: integer): real; // P(c2|c0,c1) 转移概率
{$endif}    
    function GenProb(pw: PWideChar; len, cat: integer): real; // P(w|c) 生成概率
    function LoadCoreDic(const datadir: string): TSegHashTable;
    procedure LoadBigram(const datadir: string);
    procedure LoadZsuffix(const datadir: string);
    procedure LoadNprefix(const datadir: string);
    procedure Load_name_suffix(const datadir: string);
    procedure Load_VVO(const datadir: string);
    procedure Load_AA(const datadir: string);
  end;
  TSegLang = (Chinese_S, Chinese_T);

var
  tag_scheme: (tag_pku, tag_ldc, tag_863);
  do_mwu, escape_space: boolean;
  mwu: TWideStringList;
  tagmode: TTagMode;
  //mwu_attr: array of Integer;
  seg_unit_count: integer;
  segged_word_count: integer;
  do_nt, split_surname, show_foreign_name: boolean;
  calc_prob, show_prob: boolean;
  seg_f, seg_j: TSegmentor;
  very_verbose: boolean = false;
  very_very_verbose: boolean = false;  names_: widestring = '';  segtagdir: string;  msr_mode: boolean = false;  segdatadir: string;  strict_name: boolean = false;  just_tag: boolean = false;procedure InitSegmentors;
procedure SelectSegmentor(_traditional: boolean);
procedure InitSegmentor1(lang: TSegLang);
function SegTag(pw: PWideChar; len: integer): string;
{$ifdef dll}
type
  SegRec = record
    start, len, cat: integer;
  end;
  PSegRec = ^SegRec;
var
  _x_nwords: integer;
  _x_buf, _xxbuf: PSegRec;
function segtag_1(pw: PWideChar; len: integer; buf: PSegRec; bufsize: integer): integer; stdcall;
function is_cat(cat: integer; catstr: PChar; len: integer): boolean; stdcall;
function tag2str(cats: PChar; ncats: integer; catstr: PChar; len: integer): boolean; stdcall;
function code2index(const s: string): integer; stdcall;
{$endif}
procedure True_UW(verbose: boolean);
procedure calc_prob2(gram: integer; const c1s, c2s, c3s: string);
function pku_code_to_index(const S: string): integer;

const
{ 863 评测标记:
普通名词:n	时间名词:nt	方位名词:nd	处所名词:nl
人名:nh	地名:ns	团体、机构、组织的专名:ni
其它专名:nz    动词:v	        形容词:a	区别词:b
副词:d         数词:m         量词:q	        代词;r
介词:p         连词:c         叹词:e	        拟声词:o
助词:u	        前接成分:h     后接成分:k    习用语:i
简称:j	        语素字:g       非语素字:x    标点:wp
字符串:ws

要从 pku 转换:
  t  nt
  f  nd
  s  nl
  l  i
  w  wp
  y  u
  z  a
  nr nh
  nt ni
  u. u
  ad d
  .g g
  an n
  nx ws
  vd d
  vn n
}
  // because of data sparseness, some pos tags are equated with others
  // but let's wait to see if this really improves peformance
  equ_pos: array[0..1] of record t1, t2: string end = (
    (t1: 'dy'; t2: 'd'),
    (t1: 'rn'; t2: 'n')
  );

  pos863: array[2..NCATS+1] of string = (
  {$ifdef segv2}
     'a', 'a', 'g', 'a', 'b', 'b', 'c', 'd', 'd','d', 'e', 'nd', 'h', //14
     'i', 'j', 'k', 'l', 'm', 'm', 'n', 'g', 'nh', 'nh', 'ns', 'ni', 'ws', //27
     'nz', 'o', 'p', 'q', 'r', 'r', 'n', 'r', 'r', 'r', 'nl', 'nt',    //39
     'nt', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'v', 'd', 'g', 'v', //51
     'r', 'r', 'wp', 'u', 'u', 'a');
  {$else}
     'a', 'a', 'g', 'a', 'b', 'g', 'c', 'd', 'g', 'e', 'nd', 'h',   //13
     'i', 'j', 'k', 'l', 'm', 'g', 'n', 'g', 'nh', 'ns', 'ni', 'ws',  //25
     'nz', 'o', 'p', 'q', 'r', 'g', 's', 'nt', 'g', 'u', 'u', 'u',   //37
     'u', 'u', 'u', 'u', 'v', 'v', 'g', 'v', 'wp', 'u', 'g', 'z');
  {$endif}
  pku_pos_codes: array[2..NCATS+1] of string = (
  {$ifdef segv2}
     'a', 'ad', 'ag', 'an', 'b', 'bg', 'c', 'd', 'dg','dy', 'e', 'f', 'h', //14
     'i', 'j', 'k', 'l', 'm', 'mg', 'n', 'ng', 'nr', 'nrf', 'ns', 'nt', 'nx', //27
     'nz', 'o', 'p', 'q', 'rd', 'rg', 'rn', 'rr', 'ry', 'rz', 's', 't',    //39
     'tg', 'u', 'ud', 'ug', 'uj', 'ul', 'uv', 'uz', 'v', 'vd', 'vg', 'vn', //51
     'vry', 'vrz', 'w', 'y', 'yg', 'z');
     // rg in ancient chinese, means 大/ad 破/v 之/rg
     // ng 表示年号
  {$else}
     'a', 'ad', 'ag', 'an', 'b', 'bg', 'c', 'd', 'dg', 'e', 'f', 'h',   //13
     'i', 'j', 'k', 'l', 'm', 'mg', 'n', 'ng', 'nr', 'ns', 'nt', 'nx',  //25
     'nz', 'o', 'p', 'q', 'r', 'rg', 's', 't', 'tg', 'u', 'ud', 'ug',   //37
     'uj', 'ul', 'uv', 'uz', 'v', 'vd', 'vg', 'vn', 'w', 'y', 'yg', 'z');
  {$endif}
  posldc: array[2..NCATS+1] of string = (
     'VA', 'AD', 'VA', 'NN', 'JJ', 'JJ', 'CC', 'AD', 'AD','AD', 'IJ', 'LC', 'h', //14
     'i', 'NN', 'k', 'DEC', 'CD', 'CD', 'NN', 'NN', 'NR', 'NR', 'NR', 'NR', 'FW', //27
     'NR', 'ON', 'P', 'M', 'DT', 'PN', 'NN', 'PN', 'PN', 'DT', 'NN', 'NT',    //39
     'NT', 'u', 'DER', 'AS', 'DEG', 'AS', 'DEV', 'AS', 'VV', 'VV', 'VV', 'NN', //51
     'VV', 'VV', 'PU', 'SP', 'SP', 'JJ');

  lq_sbegin = 0;
  lq_send = 1;
  lq_a = 2;
  lq_b = 6;
  lq_c = 8;
  lq_d = 9;
  {$ifdef segv2}
  lq_f = 13;
  lq_h = 14;
  lq_j = 16;
  lq_k = 17;
  lq_DEC = 18; //!!
  lq_m = 19;
  lq_n = 21;
  lq_ng = 22;
  lq_nr = 23;
  lq_nrf = 24;
  lq_ns = 25;
  lq_nt = 26;
  lq_nx = 27;
  lq_nz = 28;
  lq_o = 29;
  lq_p = 30;
  lq_q = 31;
  lq_r = 37; // lq_rz
  lq_rz = 37; 
  lq_s = 38;
  lq_t = 39;
  lq_tg = 40;
  lq_u = 41;
  lq_uj = 44; // 的
  lq_ul = 45;
  lq_uz = 47;
  lq_v = 48;
  lq_vn = 51;
  lq_w = 54;
  lq_y = 55;
  lq_z = 57;
  {$else}
  lq_f = 12;
  lq_m = 18;
  lq_n = 20;
  lq_ng = 21;
  lq_nr = 22;
  lq_nrf = lq_nr;
  lq_ns = 23;
  lq_nx = 25;
  lq_nz = 26;
  lq_p = 28;
  lq_q = 29;
  lq_r = 30;
  lq_t = 33;
  lq_v = 42;
  lq_vn = 45;
  lq_w = 46;
  lq_z = 49;
  {$endif}

  lq_x = NCATS+2;
  lq_g = NCATS+3;

implementation

uses
  SysUtils,
  Math,
  StrUtils,
  IniFiles,
  segapi;

type
  TPOS_PRED = record
      pos: integer; // 采用刘群的词性代码
      lex: TSegHashRec;
      predNode: byte;
      best_pred_cat_index: byte;
      x: integer; { 这个词的其他信息,如是不是外国人名 }
      score: real;
  end;
  TStateNode = record
    predList: array [0..MAXPRED] of TPOS_PRED;
    npred: byte; // 对每一种词性,记录一个最佳(路径最短)的前驱
  end;
  Ttime_f_jt = array[0..3] of string;
const
  time_f_jt: Ttime_f_jt = ('前','后', '以前', '以后');
  time_f_ft: Ttime_f_jt = ('前','後', '以前', '以後');
var
  wGraph: array[0..MAXSEGWORDS] of TStateNode;
  slen, Total_Count: integer;
  traditional: boolean;
  Type4CharsHead, month_day_hour, WrongLastNumberChars: PWideChar;
  dian, shi: WideChar;
  c_sijian, c_zone_1: string;
  time_f: Ttime_f_jt;
  transliterated_foreign_names: TSimpleWideStringList;
  foreign_names, foreign_org_names: TSimpleStringList;

procedure TSegmentor.InitMatrix(const datadir: string);
const c1c2file={$ifdef segv2}'c1c2.txt'{$else}'lq_c1c2.txt'{$endif};
var
  f: text;
  i, j, k: integer;
begin
  assign(f, datadir + c1c2file);
  try
    reset(f);
  except
    error('cannot find ' + c1c2file);
  end;
  Total_count := 0;
  for i := 0 to NCATS+1 do
  begin
    read(f, C1_count[i]);
    {$ifdef segv2}if c1_count[i] = 0 then inc(c1_count[i]);{$endif} // vrz
    inc(Total_Count, C1_count[i]);
  end;
  for i := 0 to NCATS+1 do for j := 0 to NCATS+1 do read(f, C1_C2[i, j]);
  close(f);
  {$ifdef g3}
  C1_C2[0, 0] := c1_count[0]; // lq_sbegin = 0
  assign(f, datadir + 'c1c2c3.txt');
  try
    reset(f);
  except
    error('cannot find c1c2c3.txt');
  end;
  for i := 0 to NCATS+1 do
    for j := 0 to NCATS+1 do
      for k := 0 to NCATS+1 do
        read(f, C1_C2_C3[i, j, k]);
  close(f);
  {$endif}
end;

const
  _xBigram = 1;  // 1: word bigram determines the cat of second word
  _xNS = 2;      // 2: n]ns
  _xNPre = 4;    // 4: name prefix
  _xNSuf = 8;    // 8: name suffix
  _xVVO = 16;    // 16: vvo
  _xNT = 32;     // 32: n]nt
  _xNZ = 64;     // 64: n]nz
  _xAA = 128;
  _xABB = 256;
  _xK = 512;
  _xDate = 1024;
  _xTime = 2048;
  _xJ = 4096;
  _xNum = 8192;
  _xAABB = 16384;
  _xRenVerb = 32768;
  _xName  = 65536;
  _xForeignName  = 131072;

var
  seg_x: TSegmentor;
  lqdic: TSegHashTable;

function TSegHashTable.hashrec(key: PWideChar; count, cat: integer): TSegHashRec;
var
  h1: word;
  len: integer;

  function InsertKey(var hr: TSegHashRec): TSegHashRec;
  begin
    if hr = nil { empty bucket }
    then begin
           hr := TSegHashRec.Create;
           //getmem(hr.w, (len + 1) shl 1);
           //lstrcpyw(hr.w, key);
           hr.w := key;
           hr.count := count;
           hr.cat := cat;
           hr.next := nil;
           hr.extra := 0;
           result := hr;
           inc(nrec);
       end
    else if CompareMem(hr.w, key, (len+1) shl 1) and (hr.cat = cat)
    then inc(hr.count, count) {check repetition first }
    else result := InsertKey(hr.next)
  end;
begin
  len := wstrlen(key);
  h1 := word(_hash(PChar(key), len shl 1));
  result := InsertKey(ht[h1]);
end;

function TSegHashTable.existword(key: PWideChar; len: integer): TSegHashRec;
begin
  len := len shl 1;
  result := ht[word(_hash(PChar(key), len))];
  while result <> nil do
  begin
    if CompareMem(result.w, key, len) {and (key[len]=#0)} then exit;
    result := result.next;
  end;
end;

function TSegHashTable.existword(key: PWideChar; len, cat: integer): TSegHashRec;
begin
  len := len shl 1;
  result := ht[word(_hash(PChar(key), len))];
  while result <> nil do
  begin

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -