⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 segtag.pas

📁 基于隐马尔科夫模型的分词标注程序
💻 PAS
📖 第 1 页 / 共 2 页
字号:
  POS_REC = record
    position, len: word;
    tag: integer;
  end;
function convert_sent_to_position(const w: string; var p: array of POS_REC; var k: integer; want_orig: boolean): string;
var
  len, i, j, m: integer;
begin
  len := length(w);
  j := 1;
  i := 2;
  k := 0;
  p[0].position := 0;
  if want_orig then result := '';
  // format: 迈向/v  充满/v  希望/n  的/u  新/a  世纪/n
  repeat
    while (i <= len) and not (w[i] in ['/',' ']) do inc(i);
    // j..i is a word, except when '/' may be part of the word
    if w[i] = '/' then
    begin
      m := i + 1;
      while (m <= len) and (w[m] <> ' ') do
      begin
        if w[m] = '/' then i := m;
        inc(m);
      end
    end;
    while w[j] = '[' do inc(j);
    if want_orig then result := result + copy(w, j, i-j);         // word
    if k >= high(p) then exit;
    p[k].len := i-j;
    if i > len then begin inc(k); break; end;
    if w[i] = ' ' then p[k].tag := 0 else // only segmentation
    begin
      inc(i); j := i;
      while (i <= len) and not (ord(w[i]) in [ord(' '), ord(']')]) do inc(i);
      if tag_scheme = tag_pku
      then p[k].tag := pku_code_to_index(copy(w, j, i-j))
      else if tag_scheme = tag_ldc
      then p[k].tag := ldc_hm.getid(copy(w, j, i-j))
      else error('bad tagging scheme');
    end;
    inc(k); p[k].position := p[k-1].position + p[k-1].len;
    if (i<=len) and (w[i] = ']') then while (i <= len) and (w[i] <> ' ') do inc(i);
    inc(i);
    while (i <= len) and (w[i] = ' ') do inc(i);
    j := i;
    inc(i);
  until j > len;
end;

function common_words(var p1, p2: array of POS_REC; len1, len2: integer; var common_tags: integer): integer;
var
  i, k: integer;
begin
  // assume p1 is our result, p2 is standard
  result := 0;
  common_tags := 0;
  k := 0;
  i := 0;
  repeat
    while (i < len1) and (p1[i].position < p2[k].position) do inc(i);
    if i >= len1 then exit;
    while (k < len2) and (p1[i].position > p2[k].position) do inc(k);
    if k >= len2 then exit;
    if (p1[i].position = p2[k].position) and (p1[i].len = p2[k].len) then 
    begin
      inc(result);
      if p1[i].tag = p2[k].tag then inc(common_tags);
    end;
    inc(i); inc(k)
  until false;
end;

var
  p1, p2: array[0..1023] of POS_REC;
  nline, len1, len2, ok_total, my_total, my_correct, my_correct_tag, ok, ok_tag: integer;
  ok_total_1, my_total_1, my_correct_1: integer;
  s, outfilename, s1, s2: string;
  w: WideString;
  f: TLineStream;
  g: text;
  p, r: real;
  segrates: TMyStringList;
begin
  segrates := TMyStringList.Create;
  if FileExists('segrate.txt') then segrates.LoadFromFile('segrate.txt');
  s := segrates.Values[filename];
  if s = ''
  then ok_total_1 := 0
  else begin
         split(s, ' ', s1, s2);
         ok_total_1 := StrToIntDef(s1, 0);
         split(s2, ' ', s, s1);
         my_total_1 := StrToIntDef(s, 0);
         my_correct_1 := StrToIntDef(s1, 0);
       end;

  try
    if user_designated_codepage = ' ' then SelectSegmentor(not is_jt(ReadFileHeadAsWideChar(filename, 8192)));
    f := TLineStream.Create(filename);
  except
    error('cannot open ' + filename);
  end;
  ok_total := 0; my_total := 0; my_correct := 0; my_correct_tag := 0; nline := 0;
  if verbose then
  begin
    outfilename := ExtractFilePath(filename) + '_' + ExtractFileName(filename);
    assign(g, outfilename);
    rewrite(g);
  end;
  while f.ReadLn(s) do
  begin
    inc(nline);
    if f.isUnicode = _t_utf8 then s := utf8toansi(s);
    if s = '' then continue;
    if verbose and (nline mod 1000 = 0) then write(#13, nline);
    w := convert_sent_to_position(s, p2, len2, true);  // the standard
    inc(ok_total, len2);
    s := {segtag}SegTag_and_MWU(PWideChar(w), length(w));

    convert_sent_to_position(s, p1, len1, false);
    inc(my_total, len1);
    ok := common_words(p1,p2,len1,len2, ok_tag);
    inc(my_correct, ok);
    inc(my_correct_tag, ok_tag);
    if verbose then writeln(g, {s}w);
  end;
  if verbose then writeln(#13, nline);

  if ok_total_1 > 0 then
  begin
    if my_correct_1=0 then p := 1e-20 else p := my_correct_1/my_total_1;
    if my_correct_1=0 then r := 1e-20 else r := my_correct_1/ok_total_1;
    writeln(format('good_total=%d, my_total=%d, my_correct=%d'#13#10'precision=%5.3f%%, recall=%5.3f%%, F=%5.3f%%',
      [ok_total_1, my_total_1, my_correct_1, p*100, r*100, 2*p*r/(p+r)*100]));
    writeln;
  end;

  if my_correct=0 then p := 1e-20 else p := my_correct/my_total;
  if my_correct=0 then r := 1e-20 else r := my_correct/ok_total;{$ifdef win32}
  cwriteln(Foreground_Green + FOREGROUND_INTENSITY,{$else}  writeln({$endif}    format('good_total=%d, my_total=%d, my_correct=%d, tag_correct=%d'#13#10'seg precision=%5.3f%%, recall=%5.3f%%, F=%5.3f%%',
    [ok_total, my_total, my_correct, my_correct_tag, p*100, r*100, 2*p*r/(p+r)*100]));

  if my_correct_tag=0 then p := 1e-20 else p := my_correct_tag/my_total;
  if my_correct_tag=0 then r := 1e-20 else r := my_correct_tag/ok_total;
{$ifdef win32}
  cwriteln(Foreground_Green + FOREGROUND_INTENSITY,{$else}  writeln({$endif}    format('tag precision=%5.3f%%, recall=%5.3f%%, F=%5.3f%%',
    [p*100, r*100, 2*p*r/(p+r)*100]));

  f.Free;
  if verbose then begin close(g); writeln(ErrOutput, {'writing result to '}'writing source to ', outfilename); end;

  segrates.Values[filename] := format('%d %d %d', [ok_total, my_total, my_correct]);
  segrates.SaveToFile('segrate.txt');
  halt;
end;

procedure seg1(const w: WideString);
begin
  if (seg_j <> nil) then SelectSegmentor(not is_jt(PWideChar(w)));
  writeln(segtag(PWideChar(w), length(w)));
  halt;
end;

procedure namex(const filename: string);
var
  f: TLineStream;
  s, source, target: String;
  c: char;
  i, j, len, totalfile: integer;
  clk: Clock; // t1: dword;
begin
  clk.Start; //t1 := gettickcount;
  try
    f := TLineStream.Create(filename);
  except
    error('cannot open ' + filename);
  end;
  //ChDir(ExtractFileDir(filename));
  totalfile := 0;
  while f.ReadLn(s) do
  begin
    if (s='') then break; // 2005-4-11
    if not (s[1] in ['S','T']) then error('NE script file format!');
    // get two file names, allowing spaces if quoted by "
    len := length(s);
    i := 2;
    while (i <= len) and (s[i] = ' ') do inc(i);
    if i > len then error('file format!');
    if s[i] = '"' then c := '"' else c := ' ';
    j := i;
    while (i <= len) and (s[i] <> c) do inc(i);
    if c = '"' then inc(i);
    if i > len then error('file format!');
    source := strip_quotes(copy(s, j, i-j));
    while (i <= len) and (s[i] = ' ') do inc(i);
    target := strip_quotes(copy(s, i, maxint));
    SelectSegmentor(s[1] = 'T');
    segfile(source, target, false);
    inc(totalfile);
  end;
  writeln(errOutput, format(#13#10'%s %s %s using %d ms', [sp_form(totalfile, 'file'), sp_form(seg_unit_count, 'sentence'), sp_form(segged_word_count, 'word'),    {gettickcount-t1}Clk.TimeElapsed_MS]));
  halt;
end;

var
  clk: clock; //t1: dword;
  i, ok, totalfile: integer;
  x, filename, dir, outfile, path, c1: string;
  f: TSearchRec;
  seg1sent, auto_detect, scriptfile: boolean;
  calc_bigram: integer = 0;
begin
  if paramcount = 0 then help;
  benchmarking := false;
  seg1sent := false;
  user_designated_codepage := ' ';
  very_verbose := false;
  i := 1;
  mwu_count := -1;
  segdatadir := '';
  while i <= paramcount do
  begin
    x := ParamStr(i);
    if copy(x, 1, 2) = '-u' then
    begin
      do_mwu := true;
      if length(x) > 2 then
        mwu_count := StrToIntDef(copy(x,3,5), mwu_count);
    end
    else
    if x = '-s' then tagmode := tag_no else
    if x = '-n' then tagmode := tag_neon else
    if x = '-t' then tagmode := tag_only else
    if x = '-8' then tag_scheme := tag_863 else
    if x = '-l' then begin tag_scheme := tag_ldc; ldc_hm := THashMap.Create(64); end else
    if x = '-i' then ignoreSpace := true else
    if x = '-b' then benchmarking := true else
    if x = '-p' then split_surname := true else
    if x = '-z' then do_nt := true else
    if x = '-d' then calc_prob := true else
    if x = '-h' then show_prob := true else
    if x = '-m' then begin msr_mode := true; do_nt := true end else
    if x = '-f' then show_foreign_name := true else
    if x = '-e' then escape_space := true else
    if x = '-S' then user_designated_codepage := 'S' else
    if x = '-T' then user_designated_codepage := 'T' else
    if x = '-G' then user_designated_codepage := 'G' else
    if x = '-x' then tagmode := tag_namex else
    if x = '-X' then begin tagmode := tag_namex; scriptfile := true end else
    if x = '-U' then utf8_output := true else
    if x = '-1' then begin seg1sent := true; inc(i); filename := ParamStr(i) end else
    if x = '-2' then begin c1 := paramstr(i+1); filename := paramstr(i+2); inc(i, 2); calc_bigram := 2 end else
    if x = '-3' then begin c1 := paramstr(i+1); filename := paramstr(i+2); outfile := paramstr(i+3); inc(i, 3); calc_bigram := 3 end else
    if x = '-v' then verbose := true else
    if x = '-vv' then very_verbose := true else
    if x = '-vvv' then very_very_verbose := true else
    if x = '-D' then begin inc(i); segdatadir := ParamStr(i) end else
    if x = '-N' then strict_name := true else
    if x = '-g' then just_tag := true else
    if filename = '' then filename := x
    else dir := x;
    inc(i);
  end;

  if filename = '' then help;
  if dir <> '' then
    if DirectoryExists(dir)
    then begin
           if not (dir[length(dir)] in [':', PathDelim]) then dir := dir + PathDelim
         end
    else begin
           outfile := dir; dir := '';
           if pos('*', filename) > 0 then
             warn(outfile + ' will only contain results for the last matching file!');
         end;

  if very_verbose then verbose := true;
  if very_verbose then writeln(errOutput, 'try to init segmentors.');

  auto_detect := user_designated_codepage = ' ';
  if auto_detect and (segdatadir <> '') then error('please specicify sublanguage');
  case user_designated_codepage of
    'S': InitSegmentor1(Chinese_S);
    'T': InitSegmentor1(Chinese_T);
    'G': begin
           segdatadir := 'ghy';
           InitSegmentor1(Chinese_T);
         end;
    else if calc_bigram <> 0
         then error('use -T or -S to specify sublanguage!')
         else InitSegmentors;
  end;

  if very_verbose then writeln(errOutput, 'inited segmentors.');
  if calc_bigram <> 0 then calc_prob2(calc_bigram, c1, filename, outfile);
  {
  filename := '据此,广州市要求,今年全市国内生产总值较上一年增长百分之十三;工业增加值增长' +
'百分之十三点五(工业总产值增长百分之十五);农业增加值增长百分之五点五(农业总产'+
'值增长百分之六点五);第三产业增加值增长百分之十三点五;市属全社会固定资产投资增'+
'长百分之十一;社会消费品零售总额增长百分之十五;外贸出口增长百分之五;进口增长百'+
'分之十;实际利用外资增长百分之八;零售物价总水平控制在百分之六以内;预算内财政收'+
'入增长百分之十;科技进步对工业经济增长贡献率在去年基础上再提高一点八个百分点;城'+
'镇登记失业率控制在百分之三以下;人口自然增长率控制在百分之八点二以下。';
}
{filename := '丁午寿议员亦会根据《释义及通则条例》动议另一项决议案,议决把2003年2月19日'+
'提交立法会会议省览的《2003年进出口(一般)(修订)规例》;《2003年进出口(登记)'+
'(修订)规例》;《2003年进出口(移离物品)(修订)规例》;《2003年储备商品(进出'+
'口及储备存货管制(修订)规例》;《〈2001年应课税品(修订)条例〉(2001年第19号)'+
'2003年(生效日期)公告》;以及《〈2002年进出口(电子交易)条例〉(2002年第24号)'+
'2003年(生效日期)公告》的修订期限延展至2003年4月9日的会议。';}

  if seg1sent then seg1(filename);
  if benchmarking then benchmark(filename);
  if tagmode = tag_namex then
  begin
    do_nt := true;
    if scriptfile then namex(filename);
  end;

  total_cc := 0;
  clk.Start; //t1 := gettickcount;
  totalfile := 0;
  path := extractfilepath(filename);
  ok := findfirst(filename, faAnyFile, f);
  while ok = 0 do
  begin
    if f.attr and fadirectory = 0 then
    begin
      if dir = ''
      then segfile(path + f.name, outfile, auto_detect)
      else segfile(path + f.name, dir + f.Name, auto_detect);
      inc(totalfile);
    end;
    ok := findnext(f);
  end;
  findclose(f);
  clk.Stop; //t1 := gettickcount-t1;
  writeln(errOutput, format(#13#10'%s %s %s using %d ms, %s/s', [sp_form(totalfile, 'file'), sp_form(seg_unit_count, 'sentence'),
    sp_form(segged_word_count, 'word'), {t1}clk.TimeElapsed_MS, GetClearMemorySize(total_cc*1000 div max({t1}clk.TimeElapsed_MS,1))]));
  {$ifdef win32}if very_verbose then writeln(errOutput, 'used ', GetClearMemorySize(MemUsed));{$endif}
end.
{$endif}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -