hmmseg.pas

来自「基于隐马尔科夫模型的分词标注程序」· PAS 代码 · 共 2,061 行 · 第 1/5 页
PAS
2,061 行
                       while pw[len] = pw^ do inc(len);
                       r.cat := lq_o;
                       r.index := 0;
                       r.len := 1;
                     end  {
                else if lqdic.existword(pw, 1, lq_q) <> nil // 艘艘
                then begin
                       result := true;
                       len := 2;
                       r.cat := lq_q;
                       r.index := 0;
                       r.len := 1;
                     end   }
            end
  else if (pw^ = '一') and (pw[1] <> #0) and (pw[1] = pw[2])
       then begin
              if lqdic.existword(@pw[1], 1, lq_q) <> nil  // 一排排
              then begin
                     result := true;
                     len := 3;
                     r.cat := lq_m; // 北大认为“一排排”是数词。限定词也许更合适!!!
                                    // 对HMM 来说，影响生成概率
                     r.index := 0;
                     r.len := 1;
                   end
            end
  else if (pw[1] <> #0) and (pw[1] = pw[2])   {孤单单}
       then begin
              if lqdic.existword(pw, 2, lq_a, extra) and (extra and _xABB <> 0)
              then begin
                     result := true;
                     len := 3;
                     r.cat := lq_z; // lq_a ?
                     r.index := 0;
                     r.len := 2;
                   end
            end;
end;

function AddWGraphEdge(i, j: byte; h: TSegHashRec): boolean;
var
  w: PWideChar;
  k: integer;
label
  rr;
begin
  // 对“瞩目中华”，因为“瞩”不出现在词典里，所有加“目”没有意义
  if wGraph[i].npred = 0 then begin result := false; exit; end;
  result := true;
  w := h.w;
  with wGraph[j] do
  repeat
    if npred >= MAXPRED then exit;
     {if tag_scheme = tag_ldc then
      if h.cat = lq_vn
      then h.cat := lq_n; //??}

    k := 0;
    while k < npred do
      if predList[k].pos = h.cat then
      begin
        // put the cat at the end of the same cat
        repeat inc(k) until (k >= npred) or (predList[k].pos <> h.cat);
        if (k < npred)
        then {if predList[k].predNode = i
             then goto rr
             else} move(predList[k], predList[k+1], (npred - k) * sizeof(predList[0]));
        break
      end
      else inc(k);
    predList[k].pos := h.cat;
    predList[k].predNode := i;
    predList[k].x := 0;
    predList[k].lex := h;
    //mywriteln(format('%d,%d: %s %s', [i, j, lqInt2cat(h.cat), WideCharToString(h.w)]));
    // ??? count???
    inc(npred);
rr:
    h := h.next;
  until (h = nil) or (WideStringCompare(w, h.w) <> 0);
end;

procedure AddWGraphEdge1(i, j: byte; cat, extra_info: integer);
var
  k: integer;
begin
  if wGraph[i].npred = 0 then exit;
  with wGraph[j] do
  begin
    if npred >= MAXPRED then exit;
    k := 0;
    while k < npred do
      if predList[k].pos = cat then
      begin
        // put the cat at the end of the same cat
        repeat inc(k) until (k >= npred) or (predList[k].pos <> cat);
        if (k < npred)
        then move(predList[k], predList[k+1], (npred - k) * sizeof(predList[0]));
        break
      end
      else inc(k);
    predList[k].pos := cat;
    predList[k].predNode := i;
    predList[k].x := extra_info;
    predList[k].lex := nil;
    //mywriteln(format('%d,%d: %s', [i, j, lqInt2cat(cat)]));
    // ??? count???
    inc(npred);
  end
end;

procedure Add_Compound(i, j: byte; h: TSegHashRec);
var
  k: integer;
begin
  with wGraph[i] do
    for k := 0 to npred - 1 do
      if predList[k].pos in [lq_n, lq_ns, lq_b, lq_vn, lq_nz] then
      if i+j-predList[k].predNode > 2 then {受/骗/者 将不会变成 受/骗者}
      begin
        AddWGraphEdge1(predList[k].predNode, i+j, lq_n, 0);
        break
      end
end;

function isNumberChar2x(ch: WideChar; var charType: integer): boolean;
var i: integer;
begin
  for i := low(D) to High(D) do
    if wstrscan(D[i], ch) <> nil then
    begin
      charType := i;
      result := true;
      exit
    end;
  result := false;
end;

function isNumberWord2(pw: PWideChar; var len: integer): boolean;
{数字的识别应该用有限自动机，现在暂且如此做}
var
  charType, charType2, j: integer;
  h: TSegHashRec;
begin
  len := 0;
  if (pw^ <> #0) and isNumberChar2x(pw^, charType) then
  begin
    inc(len);
    case charType of
     4: if (pw^ = '第') then
        begin
          if not isNumberChar2(pw[1], charType) or (charType = 4)
          then begin result := false; exit end;
          inc(len);
        end
        else if pw^ = #$FF0D {'－'} then
        begin
          if not (isNumberChar2(pw[1], charType) and (charType = 1))
          then begin result := false; exit end;
          inc(len);
        end
        else if pw^ = dian {点} then begin result := false; exit end
        else if pw^ = '分' then begin result := false; exit end;
     0: if (pw^ = '.') and not (isdigit((pw+1)^)) then
        begin result := false; exit end;
     1: if (wstrscan('·', pw^) <> nil) and not (isdigit((pw+1)^)) then
        begin result := false; exit end;
    end;

    if (charType = 4) and (wstrscan(Type4CharsHead, pw^) <> nil)
    then charType := 2; {handle 两千斤}
    while (pw[len] <> #0) do
      if wstrscan('％%‰', pw[len]) <> nil
      then begin inc(len); break end
      else if isNumberChar2x(pw[len], charType2) and // 300多年：原来是isNumberChar2x
         ((charType2 = charType) or
          (charType2 = 4) and not ((pw[len]='分') and (pw[len+1]<>'之')) and
                              not ((pw[len]='之') and (pw[len-1]<>'分')) and
                              not ((pw[len]='千') and (wstrscan('万亿0', pw[len-1])<>nil))
          )
           {数字类型必须相同，“六828千”不是一个数词}
      then inc(len)
      else if (wstrscan({'∶:：}'∶·.．', pw[len]) <> nil) and isNumberChar(pw[len+1])
           then inc(len, 2)
      else if (WSTRSCAN(',/', pw[len]) <> NIL) and (charType = 0) and isdigit(pw[len+1])
           then inc(len, 2)
           else break;
    { 最后一个字不能是有些字 }
    if (WStrScan(WrongLastNumberChars, pw[len-1]) <> nil)
    then dec(len)
    else if isdigit(pw[len-1]) and isalpha(pw[len])
    then begin result := false; exit end // 3d 
    {else if lq_maxfind(@pw[len-1], j, h) and (j>1)// 五亿千瓦
    then begin writeln(widecharlentostring(pw, len+1));dec(len)end}
    {else if (pw[len] <> #0) and (pw[len-1] < #128) and (pw[len] < #128) and (pw[len] <> '"')
         then begin
                if wstrscan(')(,', pw[len]) = nil then
                begin
                  result := false;
                  exit
                end
              end; { 863-306 }
  end;
  result := (len > 1) or (len = 1) and (charType in [0,1]) ;
end;

function isNumberWord3(pw: PWideChar; len: integer): boolean;
var
  charType: integer;
begin
  while (len > 0) do
  begin
    dec(len);
    if not IsNumberChar2x(pw[len], charType) then begin result := false; exit end;
  end;
  result := true
end;

function recognize_foreign_name(pw: PWideChar; var with_dot: boolean): integer;
var
  len: integer;
begin
  len := 1;
  while isCC(pw[len]) and (ForeignNameCharMiddle in CC[pw[len]]) do inc(len);
  if not isCC(pw[len]) then dec(len);
  while (len > 0) and not (ForeignNameCharLast in CC[pw[len]]) do dec(len);
  if len = 0 then begin result := 0; exit; end;

  with_dot := false;
  // pw[0..len] is a possible name
  // a fragment?
  if (wstrscan('·˙—', pw[len+1]) <> nil) and
     IsCC(pw[len+2]) and
     (ForeignNameCharFirst in CC[pw[len+2]])
  then begin
         result := len + 2 + recognize_foreign_name(@pw[len+2], with_dot);
         with_dot := true
       end
  else result := len + 1;
end;

function used_after_name(pw: PWideChar): boolean;
var h: TSegHashRec;
begin
  h := lqdic.existword(pw, 2);
  result := (h <> nil) and (h.extra and _xNSuf <> 0);
end;

function possible_name(i: integer; pw: PWideChar; len: integer): boolean;
var
  d: real;
  k: integer;
begin        {
  if (i > 0) then
  with wGraph[i] do
  for k := 0 to npred-1 do
    if predList[k].pos = cat then
           {// check name prefix and suffix
             _n0 := wGraph[n0].predList[k].predNode;
             h := lqdic.existword(@pw[_n0], n0-_n0);
             if (h <> nil) and (h.extra and _xNPre <> 0) then p := p*1.2;}
  (*if used_after_name(@pw[len])
  then case len of
         2: d := 0.00000025; {储值: 0.00000165 成词:0.00000239}
         3: d := 0.00000000006; // 双发多
       end
  else *)
       if (pw[len] <> #0) and (wstrscan('·', pw[len]) <> nil)
       then d := 0.0000025
       else
       case len of
         2: if pw^ = (pw+1)^
            then d := 0.000025
            else d := 0.0000025; {d := 0.0000050;   {0.0000025; {储值: 0.00000165 成词:0.00000239}
         3: if lqdic.existword(pw+1, 2) <> nil
            then d := 0.0000006
            else d := 0.0000000006; {d := 0.0000000012;{0.0000000006; // 双发多}
       end;
  if strict_name then d := d * 1000;
  if (len = 3) and (lqdic.existword(pw, 2) <> nil) then d := d * 1000;
  result := name_prob(pw, len) > d;
end;

function is_foreign_place_name(pw: PWideChar; len: integer): boolean;
var w: WideChar;
begin
  w := pw[len];
  pw[len] := #0;
  result := transliterated_foreign_names.IndexOf(pw) <> -1;
  pw[len] := w;
end;

function isenglishname(pw: PWideChar; len: integer; var etype, ext_type: integer): boolean;
var i, count, j: integer;
    ew: array[0..4] of string;
    s: string;
begin
  result := false;
  for i:=0 to len-1 do if not (ord(pw[i]) in [32, ord('a')..ord('z'), ord('A')..ord('Z')]) then exit;
  s := UpperCase(WideCharLenToString(pw, len));
  if foreign_org_names.Find(s, j) or (pos('.COM', s) > 0) then
  begin
    result := true;
    etype := lq_nt;
    ext_type := 0;
    exit;
  end;
  tokenize(s, ew, count, 5);
  if count >= 5 then exit;
  for i:=0 to count-1 do
    if not foreign_names.Find(ew[i], j) then exit;
  etype := lq_nr;
  ext_type := pos_foreign_name;
  result := true
end;

procedure CreateWGraph(pw: PWideChar; pwlen: integer);
var
  i, j, len, max_i, etype, ext_type: integer;
  r: TRepeatInfo;
  h: TSegHashRec;
  ww: array[0..31] of WideChar;
  with_dot: boolean;
label
  nameloop;
begin
  for i := 1 to slen do wGraph[i].npred := 0;
  with wGraph[0] do
  begin
    npred := 1;
    predList[0].pos := lq_sbegin;
    predList[0].score := 0;
  end;
  i := 0;
  max_i := 0;
  while i < slen do
  begin
  if isCC(pw^) then
  begin
  //Add_Chinese_Names;
  if isDoubleSurname(pw)
  then begin
         if isCC(pw[2])
         then begin
                if isCC(pw[3]) and (np1[pw[2]] <> 0) and (np2[pw[3]] <> 0)
                then begin
                       // 可以构成双名
                       AddWGraphEdge1(i, i+4, lq_nr, pos_chinese_name_double);
                     end;
                // 可能构成单名？
                if np3[pw[2]] <> 0
                then begin
                       AddWGraphEdge1(i, i+3, lq_nr, pos_chinese_name_double);
                       inc(i, 3);
                       inc(pw, 3);
                     end
                else begin // 皇甫姑娘
                       AddWGraphEdge1(i, i+2, lq_nrf, 0);
                       inc(i, 2);
                       inc(pw, 2);
                     end
              end
         else begin
                AddWGraphEdge1(i, i+2, lq_nrf, 0);
                inc(i, 2);
                inc(pw, 2);
              end;
         continue;
       end
  else if (0 in CC[pw^])
  then begin
         // 老张，小沈
         if (i > 0) and (wstrscan('老小阿', (pw-1)^) <> nil)
         then AddWGraphEdge1(i-1, i+1, lq_nr, pos_chinese_name_special_surname_last);

         if isCC(pw[1])
         then begin
                if isCC(pw[2]) and ( possible_name(i, pw, 3) or
                                     (pw[1] = pw[2]) and (pw[1] = '某') )
                then begin
                       // 可以构成双名
                       if not used_after_name(@pw[1]) then
                       begin
                         AddWGraphEdge1(i, i+3, lq_nr, pos_chinese_name);
                         if wstrscan('山区村乡镇县市路岛', pw[2]) <> nil then
                           AddWGraphEdge1(i, i+3, lq_ns, 0); // 简单的地名识别
                       end;

                       {if (0 in CC[pw[1]]) and (np0[pw^] > 0.01) and (name_prob(pw+1, 3) > 0.0000005)
                       then AddWGraphEdge1(i, i+4, lq_nr, 0); // 陈方安生}
                     end;
                // 可能构成单名？
                if (wstrscan('老总總某家氏×' {母父犯}, pw[1]) <> nil)
                then AddWGraphEdge1(i, i+2, lq_nr, pos_chinese_name_special_surname_first)
                else if possible_name(i, pw, 2)
                then AddWGraphEdge1(i, i+2, lq_nr, pos_chinese_name);
              end
         else if pw[1] = #$00D7 {×}
              then if pw[1] = pw[2]
                   then AddWGraphEdge1(i, i+3, lq_nr, pos_chinese_name)
                   else AddWGraphEdge1(i, i+2, lq_nr, pos_chinese_name)
       end;

  //Add_Foreign_Names;
  if (ForeignNameCharFirst in CC[pw^])
  then begin // 这里要注意我的外国人名用字表肯定会被突破
         len := recognize_foreign_name(pw, with_dot);
         // 阿姆斯特朗登月时 => 阿姆斯特朗登, 加以简单处理
         if (len > 5) and isCC(pw[len-2]) and (ForeignNameCharLast in CC[pw[len-2]]) then
           AddWGraphEdge1(i, i+len-1, lq_nr, pos_foreign_name);

         if (len >= 4) and (i >= max_i) then  // not 总\理纳瓦兹·谢里夫
         begin
hmmseg.pas - 源码说明

本页面展示了「基于隐马尔科夫模型的分词标注程序」中的 hmmseg.pas 源码文件，采用 PAS 编程语言编写，共 2,061 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与模型相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?