hmmseg.pas

来自「基于隐马尔科夫模型的分词标注程序」· PAS 代码 · 共 2,061 行 · 第 1/5 页
PAS
2,061 行
           // 但是，“哈里逊·福特不得” 是错误的。在外国人名识别算法没有改变以前，
           // 暂且不允许最后两个字是词。
           if isCC(pw[len-3]) and lq_MaxFind(@pw[len-2], pwlen - (len-2), j, h) and (j > 1)
           then dec(len, 2);
         end;
         if len > 1 then  // 不允许单个汉字的外国名字
           // 外国地名？
           if is_foreign_place_name(pw, len)
           then AddWGraphEdge1(i, i+len, lq_ns, 0)
           else AddWGraphEdge1(i, i+len, lq_nr, pos_foreign_name);

         (*if with_dot and (len >= 5) then
         begin
                 inc(i, len-1);
                 inc(pw, len-1);
                 continue   // 不要再探索其他可能性了！
                end;*)
         (*if len > 1 then AddWGraphEdge1(i, i+len, lq_nr, pos_foreign_name);
         if with_dot and (len >= 5) and (i >= max_i) then  // not 总\理纳瓦兹·谢里夫
         begin
           // 但是，“哈里逊·福特不得” 是错误的。在外国人名识别算法没有改变以前，
           // 暂且不允许最后两个字是词。
           if isCC(pw[len-3]) and lq_MaxFind(@pw[len-2], j, h) and (j > 1)
           then
           else begin
                 inc(i, len-1);
                 inc(pw, len-1);
                 continue   // 不要再探索其他可能性了！
                end
         end;*)
       end;
  // recognizes Japanese names
  if is_jpn_person_name(pw, len) then
    AddWGraphEdge1(i, i+len, lq_nr, pos_foreign_name);
  end;

  // 未登录词识别
  if do_mwu then
    if mwu.MaxFind(pw, j) then
    begin  // "神舟五号"；"神舟"都被识别
      len := wstrlen(mwu[j]);
      if len > 2 then
        move(pw^, ww, (len - 1) shl 1);
      repeat
        if Integer(mwu.Objects[j]) <> lq_nr // 我的人名识别程序应该已经加入了
        then AddWGraphEdge1(i, i+len, Integer(mwu.Objects[j]), 0);
        repeat
          dec(len);
          ww[len] := #0;
        until (len <= 1) or mwu.Find(ww, j);
      until len <= 1;
    end;

  if isRepeatWord2(pw, j, @r)
  then begin
         if lq_MaxFind(pw, pwlen, len, h) and (len >= j) { “慢慢地”是更长的词 }
         then begin
                j := len;
                if not AddWGraphEdge(i, i+j, h) then j := 1; // 嘀嘀咕咕，
              end
         else begin
                // 把 i..i+j 这个词加到词图
                // 叠词的词性一般是已知的，除了 AABB 有可能表示动词(搂搂抱抱)，
                //   形容词（清清楚楚），名词（是是非非）以外
                if r.cat <> -1
                then AddWGraphEdge1(i, i+j, r.cat, 0)
                else begin
                       h := lqdic.existword(@pw[r.index], r.len);
                       assert(h <> nil);
                       AddWGraphEdge(i, i+j, h);
                     end;
              end
       end
  else if isNumberWord2(pw, j) { a number composed of more than 2 double-byte cc? }
  then begin
         // 十五大！
         if lq_MaxFind(pw, pwlen, len, h) and (len > j)
         then begin j:=len; AddWGraphEdge(i, i+j, h) end
         else
         // the original treatment is just AddWGraphEdge1(i, i+j, lq_m, 0);
         // now I try to conform to PKU People's Daily Corpus
         if (pw[j] <> #0) and ((wstrscan('年月日分', pw[j]) <> nil) or (pw[j]=shi))
            //and not (lq_MaxFind(@pw[j], len, h) and (len > 1))
         then if (pw[j] = '年')
              then if (j=4) and (pw[j+1] <> '度')
                   then begin inc(j); AddWGraphEdge1(i, i+j, lq_t, 0); end
                   else AddWGraphEdge1(i, i+j, lq_m, 0)
              else if (pw[j] = '分')   // 这里有很大的改进机会: 8点30 分， 五点十 分
                   then begin
                          if (j>=3) and ((pw[j-3]=dian) or (pw[j-2]=dian)) or (i>0) and ((pw-1)^ = shi)
                          then begin inc(j); AddWGraphEdge1(i, i+j, lq_t, 0); end
                          else AddWGraphEdge1(i, i+j, lq_m, 0);
                        end
              else begin
                     if (pw[j] = '月') and (pw[j+1] = '份') then inc(j);
                     if (pw[j] = '日')
                     then begin if (j<=3) and (pw[j+1] <> '元') then inc(j); end
                     else inc(j);
                     AddWGraphEdge1(i, i+j, lq_t, 0);
                   end
         else AddWGraphEdge1(i, i+j, lq_m, 0);
       end
  else if lq_MaxFind(pw, pwlen, j, h) then
  begin
    if (j = 1) and isNumberChar2(pw^, len) and (len=2) and
       (pw[j] <> #0) and (wstrscan('日时時', pw[j]) <> nil) // 一月一日
    then begin
           inc(j);
           AddWGraphEdge1(i, i+j, lq_t, 0);
         end
    else begin
           { get valid shorter words }
           max_i := i+j;
           AddWGraphEdge(i, i+j, h);
           if (h.extra and _xK <> 0) and (i>=2) then Add_Compound(i, j, h);

           if isCC(pw^) then  // you don't add '-' of '--'
           while (j > 1) do
           begin
             dec(j);
             h := lqdic.existword(pw, j);
             if h <> nil then AddWGraphEdge(i, i+j, h);
           end;
         end
  end
  else if IsFullWidthLetter(pw, j) then AddWGraphEdge1(i, i+j, lq_nx, 0)
  else if pw[0] < #128 then
  begin
    j := 1;
    if blankchar(pw[0]) then
    begin
      while blankchar(pw[j]) do inc(j);
      AddWGraphEdge1(i, i+j, lq_nx, 0);
    end
    // 标点符号应该作为单独一个单词
    else if (wstrscan(PUNS, pw[0]) <> nil)
    then AddWGraphEdge1(i, i+j, lq_w, 0)
    else begin
            while (pw[j] <> #0) and (pw[j] < #128) and
                  ((pw[j] = '.') and isdigit(pw[j+1]) or (wstrscan(PUNS, pw[j]) = nil))
            do inc(j);
            (*
            if (pw[j-1] = '"') {or (pw[j-1] = ')') and (j > 1)} then dec(j);
            if (j > 1) and (wstrscan(',.;:', pw[j-1]) <> nil) and isalpha(WideChar2Char(pw[j-2])) then dec(j);
            *)
            if (j = 1) and not isalpha(WideChar2Char(pw[0]))
            then if isdigit(pw[0])
                 then begin
                        if (pw[1] <> #0) and (wstrscan(month_day_hour, pw[1]) <> nil) //and // 1月1日
                            //not (lq_MaxFind(@pw[1], len, h) and (len > 1)) 10月/底
                        then begin
                               if (pw[j] = '月') and (pw[j+1] = '份') then inc(j)
                               else if (pw[j] = shi) and (pw[j+1] = '半') then inc(j);

                               inc(j);
                               AddWGraphEdge1(i, i+j, lq_t, 0);
                             end
                        else AddWGraphEdge1(i, i+j, lq_m, 0)
                      end
                 else AddWGraphEdge1(i, i+j, lq_w, 0)
            else // check English names
                 if isenglishname(pw, j, etype, ext_type)
                 then AddWGraphEdge1(i, i+j, etype, ext_type)
                 else AddWGraphEdge1(i, i+j, lq_nx, 0);
         end
  end
  else { assume a one-cc word }
  begin
    h := lqdic.existword(pw, 1);
    if h <> nil
    then begin AddWGraphEdge(i, i+1, h);
               // ghy
               if wstrscan(PWideChar(names_), pw^) <> nil
               then AddWGraphEdge1(i, i+1, lq_nr, 0);
         end
    else AddWGraphEdge1(i, i+1, lq_x, 0);
    j := 1;
  end;
  inc(i, j);
  inc(pw, j);
  dec(pwlen, j);
  end;
  inc(slen);
  with wGraph[slen] do
  begin
    npred := 1;
    predList[0].pos := lq_send;
    predList[0].predNode := slen-1;
    predList[0].lex := nil;
  end;
end;

procedure CreateWGraph2(var pw: PWideChar; len: integer);
var
  j: integer;
  pwx, pw1, pw2: PWideChar;
begin
  for j := 1 to len do wGraph[j].npred := 0;
  with wGraph[0] do
  begin
    npred := 1;
    predList[0].pos := lq_sbegin;
    predList[0].score := 0;
  end;

  pwx := pw;
  pw1 := pw;
  pw2 := pw;
  slen := 0;
  // format: 迈向/v  充满/v  希望/n  的/u  新/a  世纪/n
  repeat
    while (pw2^ <> #0) and (pw2^ <> '/') do inc(pw2);
    if pw2^ = #0 then break;
    // pw1..pw2 is a word
    j := pw2-pw1;
    move(pw1^, pwx^, j shl 1);
    inc(pwx, j);
    pw1 := pw2+1;
    while (pw2^ <> #0) and (pw2^ <> ' ') do inc(pw2);
    AddWGraphEdge1(slen, slen+j, pku_code_to_index(WideCharLenToString(pw1, pw2-pw1)), 0);
    while (pw2^ <> #0) and (pw2^ = ' ') do inc(pw2);
    inc(slen, j);
    pw1 := pw2;
  until pw1^ = #0;
  inc(slen);
  with wGraph[slen] do
  begin
    npred := 1;
    predList[0].pos := lq_send;
    predList[0].predNode := slen-1;
    predList[0].lex := nil;
  end;
end;

procedure CreateWGraph3(var pw: PWideChar; len: integer);
var
  j: integer;
  pwx, pw1, pw2: PWideChar;
  h: TSegHashRec;
begin
  for j := 1 to len do wGraph[j].npred := 0;
  with wGraph[0] do
  begin
    npred := 1;
    predList[0].pos := lq_sbegin;
    predList[0].score := 0;
  end;

  pwx := pw;
  pw1 := pw;
  pw2 := pw;
  slen := 0;
  // format: 迈向  充满  希望  的  新  世纪
  repeat
    while (pw2^ <> #0) and (pw2^ <> ' ') do inc(pw2);
    // pw1..pw2 is a word
    j := pw2-pw1;
    move(pw1^, pwx^, j shl 1);
    inc(pwx, j);
    h := lqdic.existword(pw1, j);
    if h <> nil
    then AddWGraphEdge(slen, slen+j, h)
    else AddWGraphEdge1(slen, slen+j, lq_n, 0); // assume UNK is noun!

    while (pw2^ <> #0) and (pw2^ = ' ') do inc(pw2);
    inc(slen, j);

    pw1 := pw2;
  until pw1^ = #0;
  inc(slen);
  with wGraph[slen] do
  begin
    npred := 1;
    predList[0].pos := lq_send;
    predList[0].predNode := slen-1;
    predList[0].lex := nil;
  end;
end;

function tagname(c: integer): string;
begin
    case c of
      lq_g: result := 'g';
      lq_x: result := 'x';
      else case tag_scheme of
             tag_pku: result := pku_pos_codes[c];
             tag_ldc: result := posldc[c];
             tag_863: result := pos863[c];
           end
    end;
end;

function TSegmentor.Prob2(c1, c2: integer): real; // P(c2|c1) 转移概率
var
  h: integer;
begin
  {if tag_scheme = tag_ldc then
    begin
      if c1 = lq_vn then c1 := lq_n;
      if c2 = lq_vn then c2 := lq_n;
    end;}
  if (c1 >= NCATS+2) or (c2 >= NCATS+2) then begin result := 1/Total_Count; exit end;
  h := C1_C2[c1, c2];
  if h = 0
  then result := 1/Total_Count
  else result := h/C1_Count[c1];
  result := result * 0.9 + C1_Count[c2] / Total_Count * 0.1;
  {if tag_scheme = tag_ldc then
    if c2 = lq_vn
    then result := result + Prob2(c1, lq_n)
    else if c1 = lq_vn
    then result := result + Prob2(lq_n, c2);}
  //OutputDebugString(PChar(format('%s,%s=%5.3f',[tagnamex(c1), tagnamex(c2),result])));
  if calc_prob then writeln(tagnamex(c1), ',', tagnamex(c2), '=',result:5:3);
end;

function code2index(const s: string): integer;
begin
  if s = 'g' then result := lq_g else
  if s = 'x' then result := lq_x else
  result := pku_code_to_index(s);
end;

function is_cat(cat: integer; catstr: PChar; len: integer): boolean; stdcall;
var s, t: string;
begin
 setstring(s, catstr, len);
 t := tagname(cat);
 result := s = t;
 if not result then
    result := (t[1] = 'r') and (s='r') or
              (t = 'v') and (s = 'aux') or
              (t = 'rd') and (s = 'det');
end;

function tag2str(cats: PChar; ncats: integer; catstr: PChar; len: integer): boolean; stdcall;
var s: string;
    i: integer;
begin
  s := '';
  for i := 0 to ncats-1 do s := s + tagname(ord(cats[i])) + ' ';
  result := length(s) < len;
  if result then strpcopy(catstr, s);
end;

procedure calc_prob2(gram: integer; const c1s, c2s, c3s: string);
var c1, c2, c3: integer;
    p: real;
begin
  if (c1s = '') or (c2s = '') then error('bad cat!');
  if (gram = 3) and (c3s = '') then error('bad cat!');
  c1 := code2index(c1s); c2 := code2index(c2s);
  if (c1 = -1) or (c2 = -1) then error('bad cat!');
  if (gram = 3) then
  begin
    c3 := code2index(c3s);
    if c3 = -1 then error('bad cat!');
  end;
  if gram = 2
  then begin
         p := seg_x.Prob2(c1, c2);
         writeln(c1s, ',', c2s, '=', p:5:3, ' -logp=', -ln(p):5:3);
       end
  else begin
        {$ifdef g3}
         p := seg_x.Prob3(c1, c2, c3);
         writeln(c1s, ',', c2s, '=', c3s, '=', p:5:3, ' -logp=', -ln(p):5:3);
        {$else}
         writeln('not implemented');
        {$endif}
       end;
  halt;
end;

{$ifdef g3}
function TSegmentor.Prob3(c0, c1, c2: integer): real; // P(c2|c0,c1) 转移概率
const
  trigram_weight = 0.9;
var
  h: integer;
begin
  {if tag_scheme = tag_ldc then
    begin
      if c0 = lq_vn then c0 := lq_n;
      if c1 = lq_vn then c1 := lq_n;
      if c2 = lq_vn then c2 := lq_n;
    end;}
  if (c0 >= NCATS+2) or (c1 >= NCATS+2) or (c2 >= NCATS+2)
  then begin result := 1/Total_Count/5;{5,10 same} exit end;{??}

  if C1_C2_C3[c0, c1, c2] = 0
  then result := 1/Total_Count*10 {1, 2,5 is better than 10}
  else result := C1_C2_C3[c0, c1, c2]/C1_C2[c0, c1];
  result := result * trigram_weight + prob2(c1, c2) * (1-trigram_weight);
end;
{$endif}

function TSegmentor.GenProb(pw: PWideChar; len, cat: integer): real; // P(w|c) 生成概率
var
  h: TSegHashRec;
  ww: array[0..31] of WideChar;
begin
  // ?? 叠词还不准
  case cat of
   lq_send: begin result := 1; exit end; // 产生 </s>
   lq_x, { x 字 } lq_g: { g } begin result := 1 / Total_Count; exit end;
  end;
  h := lqdic.existword(pw, len, cat);
  if (h = nil) or (h.count = 0) // 训练语料库里没出现
  then if cat = lq_nr
       then if len > 1
            then begin
                   if (h <> nil) or not iscc(pw^) // 现在允许英文人名
hmmseg.pas - 源码说明

本页面展示了「基于隐马尔科夫模型的分词标注程序」中的 hmmseg.pas 源码文件，采用 PAS 编程语言编写，共 2,061 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与模型相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?