⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 hmmseg.pas

📁 基于隐马尔科夫模型的分词标注程序
💻 PAS
📖 第 1 页 / 共 5 页
字号:
                   then result := 1 / Total_Count
                   else begin
                          result := name_prob(pw, len)/256{512{32}; // 根据“分割成词”
                          if result = 0 then result := 1 / Total_Count/5; //??应该用1/C1_Count[i1]/5 ??
                        end;
                     // 刘群的词典里有很多这样的 nr
                   if do_mwu then
                   begin
                     move(pw^, ww, len shl 1);
                     ww[len] := #0;
                     if mwu.Find(ww, len) then result := result * 4;
                   end;
                 end
            else result := 1/C1_Count[cat]/5
       else result := 1/C1_Count[cat]/5
  else begin result := h.count/C1_Count[cat]; {if cat = lq_nrf then result := result / 10 {何长工}end;
  if calc_prob then writeln(WideCharLenToString(pw, len), '/', tagnamex(cat), '=',result:6:4);
end;

function get_ldc_word_tag(pw: PWideChar; len, cat, next_cat: integer): string;
var
  i: integer;
  tag: string;
  u: array[0..255] of char;
begin
{$ifdef linux}
  UnicodeToUtf8(u, sizeof(u), pw, len);
  result := u;
{$else}
  result := WideCharLenToString(pw, len);
{$endif}
  tag := tagname(cat);
  case cat of
   lq_u:
    if oneof(result, ['等', '等等'])
    then tag := 'ETC'
    else if result = '所'
    then tag := 'MSP'
    else if result = '之'
    then tag := 'DEG'
    else tag := 'LC';
   lq_c: if oneof(result, ['虽然','因为','如果','若','尽管','假如','若是','要是','只有','即使','只要','一旦'])
         then tag := 'CS'
         else if oneof(result, ['但','但是'])
         then tag := 'AD';
   lq_m: if wstrscan('第首', pw^) <> nil then tag := 'OD' else
         if oneof(result, ['全部']) then tag := 'DT';
   lq_v: if oneof(result, ['是','为','非']) then tag := 'VC' else
         if oneof(result, ['有','没','没有','无']) then tag := 'VE';
   lq_p: if oneof(result, ['把','将']) then tag := 'BA' else
         if oneof(result, ['被','给']) then
           if next_cat = lq_v
           then tag := 'SB'
           else tag := 'LB';
   lq_a: if next_cat in [lq_n, lq_m, lq_vn] then tag := 'JJ' else tag := 'VA';
   lq_d: if (next_cat = lq_v) and (result = '没有') then tag := 'VV'; 
   lq_b: if (result = '所有') then tag := 'DT'; 
   lq_rz: if next_cat in [lq_v, lq_d, lq_p] then tag := 'PN';
   lq_f: if (next_cat = lq_q) and oneof(result, ['上']) then tag := 'DT' else
         if (next_cat = lq_m) and (result = '最后') then tag := 'JJ';
  end;
  result := result + '/' + tag;
end;

function get_word(pw: PWideChar; len: integer): string;
var
  i: integer;
  w: array[0..255] of WideChar;
  u: array[0..255] of char;
begin
{$ifdef linux}
  UnicodeToUtf8(u, sizeof(u), pw, len);
  result := u;
{$else}
  result := WideCharLenToString(pw, len);
{$endif}
  if not escape_space then exit;
  // check for ' ', '/', '\'
  i := 0;
  while i < len do
    if ord(pw[i]) in [ord(' '), ord('/'), ord('\')] then break else inc(i);
  if i = len then exit;

  move(pw^, w, len);
  repeat
    case w[i] of
    ' ': begin move(w[i+1], w[i+2], len-1-i); w[i] := '\'; inc(i); w[i+1] := 's'; inc(len); end;
    '/': begin move(w[i+1], w[i+2], len-1-i); w[i] := '\'; inc(i); w[i+1] := 'f'; inc(len); end;
    '\': begin move(w[i+1], w[i+2], len-1-i); w[i] := '\'; inc(i); w[i+1] := '\'; inc(len); end;
    end;
    inc(i);
  until i >= len;
end;

function utf8encodelen(pw: PWideChar; len: integer): utf8string;
var
  u: array[0..255] of char;
begin
  UnicodeToUtf8(u, sizeof(u), pw, len);
  result := u;
end;

function constraining_bigram(pw: PWideChar; p0, n0, len: integer; c2: integer): boolean;
var
  len2: integer;
  w: array[0..31] of WideChar;
  h: comlib.PHashRec;
begin
  // 根据“把/p 这/r 篇/q 报道/v 编辑/v 一/m 下/q”调整
  //                p0  n0
  // “报道”应为名词

  len2 := n0-p0;
  if len2 + len >= 30 then begin result := false; exit end;
  move(pw[p0], w, len2 shl 1);
  w[len2] := ',';
  inc(len2);
  move(pw[n0], w[len2], len shl 1);
  inc(len2, len);
  h := bigram.hashrec(utf8encodelen(w, len2), false);
  result := (h <> nil) and (h.data = c2);
end;

function tagnamex(c: integer): string;
begin
  case c of
   0: result := '<s>';
   1: result := '</s>';
   else result := tagname(c)
  end;
end;

function dt_type(pw: PWideChar; len: integer): integer;
var
  h: TSegHashRec;
begin
  h := lqdic.existword(pw, len);
  if (h <> nil) then
    if (h.extra and _xDate <> 0)
    then result := dt_Date
    else if (h.extra and _xTime <> 0)
    then result := dt_Time
    else if (len=1) and (wstrscan('春夏秋冬', pw^) <> nil)
         then result := dt_Date
         else result := 0
  else if wstrscan('年月日天份', pw[len-1]) <> nil
       then result := dt_Date
       else if (pw[len-1]=shi) or (pw[len-1]=dian) or (wstrscan('半分晚夜', pw[len-1]) <> nil)
       then result := dt_Time
       else result := 0
end;

function find_time(const s1, s2: string; var j: integer): boolean;
begin
  if s1 = c_sijian then begin j := 1; result := true end else
  if oneof(s1, [c_zone_1,'西部']) and (s2 = c_sijian)
  then begin j := 2; result := true end
  else result := false
end;

function istime(pw: PWideChar; len: integer; var time_len: integer): boolean;
// 2004-09-01 08:35
// 2004-08-22 11:35:07
var ndash, ncolon: integer;
begin
  result := false;
  time_len := 0;
  ndash := 0; ncolon := 0;
  while (time_len < len) do
  begin
    if pw[time_len] < #128
    then case ord(pw[time_len]) of
           ord('0')..ord('9'), ord(' '): ;
           ord('-'): inc(ndash);
           ord(':'): inc(ncolon);
           else exit;
         end
    else exit;
    inc(time_len);
  end;
  result := (ndash in [0,2]) and (ncolon > 0);
  if result then
  begin
    repeat dec(time_len) until pw[time_len] <> ' ';
    inc(time_len);
  end;
end;

function isfraction(pw: PWideChar; len: integer; var time_len: integer): boolean;
// 1/3
var nslash: integer;
begin
  result := false;
  time_len := 0;
  nslash := 0;
  while (time_len < len) do
  begin
    if pw[time_len] < #128
    then case ord(pw[time_len]) of
           ord('0')..ord('9'): ;
           ord('/'): inc(nslash);
           else exit;
         end
    else exit;
    inc(time_len);
  end;
  result := (nslash = 1);
  if result then
  begin
    repeat dec(time_len) until pw[time_len] <> ' ';
    inc(time_len);
  end;
end;


var names_loaded: boolean = false;
    freq0, freq1, freq2, freq3: array[mincc..maxcc] of integer;
    total0, total1, total2, total3: integer;

procedure load_names;
var f: text;
    w,w2: widestring;
    lineno, i:integer;
begin
  guarded_reset(f, segtagdir + 'n');
  while not eof(f) do
  begin
    readln(f, w);
    inc(lineno);
    { check validness }
    for i := 1 to length(w) do
    if (w[i] = ' ') or isCC(w[i])
    then
    else begin
           writeln('bad cc in line ', lineno, ': ', w);
           halt;
         end;

    if w[2] = ' ' then
    begin
      inc(freq0[w[1]]); inc(total0);
      w2 := copy(w, 3, 2);
    end else
    begin
      w2 := copy(w, 4, 2);
{      s := copy(w, 1, 2);
      if n2.indexof(s) = -1 then
      begin
        n2.add(s);
        write(s, '  ');
      end;}
     end;

    case length(w2) of
      1: begin inc(freq3[w2[1]]); inc(total3); end;
      2: begin
           inc(freq1[w2[1]]); inc(total1);
           inc(freq2[w2[2]]); inc(total2);
         end
    end
  end;
  close(f);
end;

function new_gen_prob(pw: PWideChar; len: integer): real;
begin
  if not names_loaded then begin load_names; names_loaded := true end;
  case len of
    2: result := freq0[pw^]/total0*freq3[(pw+1)^]/total3;
    3: result := freq0[pw^]/total0*freq1[(pw+1)^]/total1*freq2[(pw+2)^]/total2;
    4: result :=               0.5*freq1[(pw+2)^]/total1*freq2[(pw+3)^]/total2;
    else error('bad names!');
  end;
end;

(*
var
  allcc: array[MinCC..MaxCC] of integer;
  total_freq: integer;*)
function gen_name(pw: PWideChar; len: integer): real;
(*const inited: boolean = false;
var s: string;
    w: widechar;
    f: text;
    wch: WideChar;
    count: integer;*)
var h, h1: TSegHashRec;
begin
  // p := name_prob(name|史晓东) * p(史晓东)/p(name)
  result := name_prob(pw, len)/512;  //17:36 2004-11-11
  //result := new_gen_prob(pw, len)/64; {tested 1,2,4,8,...256}
    // this one is slightly better than the above formula for person name recognition
    // but for other NE, slightly inferior results are obtained
   // uniform prob as 3e-10 is not good, about 75% f-measure
  if (len = 3) and lqdic.existword(pw+1, 2, h) then
  begin
    len := 0; h1 := h;
    repeat
      inc(len, h.count);
      h := h.next;
    until (h = nil) or not CompareMem(h.w, h1.w, 2);
    result := result / (len+1)/50;
  end;
  (*
  if not inited then
  begin
    guarded_reset(f, segtagdir + 'cc.txt');
    fillchar(cc, sizeof(cc), 0);
    while not eof(f) do
    begin
      readln(f, s);
      if s[3] = ' ' then
      begin
        MultiByteToWideChar(0, 0, PChar(s), 2,  @w, 1); { 1 wide char }
        allcc[w] := StrToInt(copy(s, 4, maxint));
      end;
    end;
    close(f);
    total_freq := 0;
    for wch := MinCC to MaxCC do inc(total_freq, allcc[wch]);
    inited := true
  end;
  dec(len);
  while (len >= 0) do
  begin
    count := allcc[pw^];
    if count = 0 then inc(count);
    result := result * count / total_freq; break;
    dec(len);
  end;*)
end;

function SegTag1(pw: PWideChar; len: integer): string;
var
  i, j, k, n0, best: integer;
  min_score, score, p, p0: real;
  {$ifdef g3}c0, {$endif}_n0, c1, c2: integer;
  s: string;
  ww: array[0..31] of WideChar;
  h: TSegHashRec;
  usebigram: boolean;
  words: array[0..255] of record
    start, len: integer;
    cat: integer;
    x: integer;
  end;
  nwords: integer;
  name_ok, name_suffix, nt_found: boolean;
label
  label_time, labeltime;
begin
  if calc_prob
  then CreateWGraph2(pw, len)
  else if just_tag
  then CreateWGraph3(pw, len)
  else begin
         slen := len;
         CreateWGraph(pw, len);
       end;
  // Vertibi 算法?
  for i := 1 to slen do
  with wGraph[i] do
  begin
    if npred = 0 then continue; // 比如一串数字
    //mywriteln(format('at %d', [i]));
    for j := 0 to npred - 1 do
    begin
      n0 := predList[j].predNode;
      min_score := MaxDouble;
      c2 := predList[j].pos;
      usebigram := false;
      name_suffix := false;
      if very_very_verbose then write(get_word(@pw[n0], i-n0) + '/'+tagnamex(c2));
      // 正规的HMM 中,生成概率与前一个词无关
      if (c2 = lq_nr) and (predList[j].x in [pos_foreign_name, pos_chinese_name,
            pos_chinese_name_double, pos_chinese_name_special_surname_first,
            pos_chinese_name_special_surname_last])
      then begin
             case predList[j].x of
              pos_foreign_name: p := (i-n0)*ln(0.0009);
                // 根据“韦斯利·哈里斯”, "路易斯安娜"
                // 调整,比这个数值小则不能识别为一个完整人名
                // 以上调整后为0.0006
                // 根据语料库改为0.001
              pos_chinese_name_special_surname_last:
                p := ln(np0[pw[n0+1]] * 0.00002);  //根据“老梁顿感不妙”调整
              pos_chinese_name_special_surname_first:
                p := ln(np0[pw[n0]] * 0.00003);    //根据“钱总可以说是我们推荐的”调整
             else begin
                    // the following is mathematically WRONG!!!
                    // should use p := name_prob(name|史晓东) * p(史晓东)/p(name)
                    p := gen_name(@pw[n0], i-n0);
                    // but the result is inferior!!
                    //p := name_prob(@pw[n0], i-n0)/{256}512{1024{2048}; // 根据“分割成词”
                      // 512 是根据人民日报语料库调整,当时
                      // 如选256: F=97.98%
                      //     512:  F=97.99%
                      //     1024:F=97.99%  2004/08/18
                    if p = 0 then p := 1 / Total_Count/5; //??应该用1/C1_Count[i1]/5 ??
                    if (i-n0=3) and (lqdic.existword(@pw[n0], 2) <> nil)
                    then p := p / 2; {阳光射进来}
                    p := ln(p);
                    //writeln(WideCharLenToString(@pw[n0], i-n0));
                  end;
             end;

             // check name prefix and suffix
             //k := wGraph[n0].best_pred_cat_index;
             _n0 := wGra

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -