📄 segtag.pas
字号:
POS_REC = record
position, len: word;
tag: integer;
end;
function convert_sent_to_position(const w: string; var p: array of POS_REC; var k: integer; want_orig: boolean): string;
var
len, i, j, m: integer;
begin
len := length(w);
j := 1;
i := 2;
k := 0;
p[0].position := 0;
if want_orig then result := '';
// format: 迈向/v 充满/v 希望/n 的/u 新/a 世纪/n
repeat
while (i <= len) and not (w[i] in ['/',' ']) do inc(i);
// j..i is a word, except when '/' may be part of the word
if w[i] = '/' then
begin
m := i + 1;
while (m <= len) and (w[m] <> ' ') do
begin
if w[m] = '/' then i := m;
inc(m);
end
end;
while w[j] = '[' do inc(j);
if want_orig then result := result + copy(w, j, i-j); // word
if k >= high(p) then exit;
p[k].len := i-j;
if i > len then begin inc(k); break; end;
if w[i] = ' ' then p[k].tag := 0 else // only segmentation
begin
inc(i); j := i;
while (i <= len) and not (ord(w[i]) in [ord(' '), ord(']')]) do inc(i);
if tag_scheme = tag_pku
then p[k].tag := pku_code_to_index(copy(w, j, i-j))
else if tag_scheme = tag_ldc
then p[k].tag := ldc_hm.getid(copy(w, j, i-j))
else error('bad tagging scheme');
end;
inc(k); p[k].position := p[k-1].position + p[k-1].len;
if (i<=len) and (w[i] = ']') then while (i <= len) and (w[i] <> ' ') do inc(i);
inc(i);
while (i <= len) and (w[i] = ' ') do inc(i);
j := i;
inc(i);
until j > len;
end;
function common_words(var p1, p2: array of POS_REC; len1, len2: integer; var common_tags: integer): integer;
var
i, k: integer;
begin
// assume p1 is our result, p2 is standard
result := 0;
common_tags := 0;
k := 0;
i := 0;
repeat
while (i < len1) and (p1[i].position < p2[k].position) do inc(i);
if i >= len1 then exit;
while (k < len2) and (p1[i].position > p2[k].position) do inc(k);
if k >= len2 then exit;
if (p1[i].position = p2[k].position) and (p1[i].len = p2[k].len) then
begin
inc(result);
if p1[i].tag = p2[k].tag then inc(common_tags);
end;
inc(i); inc(k)
until false;
end;
var
p1, p2: array[0..1023] of POS_REC;
nline, len1, len2, ok_total, my_total, my_correct, my_correct_tag, ok, ok_tag: integer;
ok_total_1, my_total_1, my_correct_1: integer;
s, outfilename, s1, s2: string;
w: WideString;
f: TLineStream;
g: text;
p, r: real;
segrates: TMyStringList;
begin
segrates := TMyStringList.Create;
if FileExists('segrate.txt') then segrates.LoadFromFile('segrate.txt');
s := segrates.Values[filename];
if s = ''
then ok_total_1 := 0
else begin
split(s, ' ', s1, s2);
ok_total_1 := StrToIntDef(s1, 0);
split(s2, ' ', s, s1);
my_total_1 := StrToIntDef(s, 0);
my_correct_1 := StrToIntDef(s1, 0);
end;
try
if user_designated_codepage = ' ' then SelectSegmentor(not is_jt(ReadFileHeadAsWideChar(filename, 8192)));
f := TLineStream.Create(filename);
except
error('cannot open ' + filename);
end;
ok_total := 0; my_total := 0; my_correct := 0; my_correct_tag := 0; nline := 0;
if verbose then
begin
outfilename := ExtractFilePath(filename) + '_' + ExtractFileName(filename);
assign(g, outfilename);
rewrite(g);
end;
while f.ReadLn(s) do
begin
inc(nline);
if f.isUnicode = _t_utf8 then s := utf8toansi(s);
if s = '' then continue;
if verbose and (nline mod 1000 = 0) then write(#13, nline);
w := convert_sent_to_position(s, p2, len2, true); // the standard
inc(ok_total, len2);
s := {segtag}SegTag_and_MWU(PWideChar(w), length(w));
convert_sent_to_position(s, p1, len1, false);
inc(my_total, len1);
ok := common_words(p1,p2,len1,len2, ok_tag);
inc(my_correct, ok);
inc(my_correct_tag, ok_tag);
if verbose then writeln(g, {s}w);
end;
if verbose then writeln(#13, nline);
if ok_total_1 > 0 then
begin
if my_correct_1=0 then p := 1e-20 else p := my_correct_1/my_total_1;
if my_correct_1=0 then r := 1e-20 else r := my_correct_1/ok_total_1;
writeln(format('good_total=%d, my_total=%d, my_correct=%d'#13#10'precision=%5.3f%%, recall=%5.3f%%, F=%5.3f%%',
[ok_total_1, my_total_1, my_correct_1, p*100, r*100, 2*p*r/(p+r)*100]));
writeln;
end;
if my_correct=0 then p := 1e-20 else p := my_correct/my_total;
if my_correct=0 then r := 1e-20 else r := my_correct/ok_total;{$ifdef win32}
cwriteln(Foreground_Green + FOREGROUND_INTENSITY,{$else} writeln({$endif} format('good_total=%d, my_total=%d, my_correct=%d, tag_correct=%d'#13#10'seg precision=%5.3f%%, recall=%5.3f%%, F=%5.3f%%',
[ok_total, my_total, my_correct, my_correct_tag, p*100, r*100, 2*p*r/(p+r)*100]));
if my_correct_tag=0 then p := 1e-20 else p := my_correct_tag/my_total;
if my_correct_tag=0 then r := 1e-20 else r := my_correct_tag/ok_total;
{$ifdef win32}
cwriteln(Foreground_Green + FOREGROUND_INTENSITY,{$else} writeln({$endif} format('tag precision=%5.3f%%, recall=%5.3f%%, F=%5.3f%%',
[p*100, r*100, 2*p*r/(p+r)*100]));
f.Free;
if verbose then begin close(g); writeln(ErrOutput, {'writing result to '}'writing source to ', outfilename); end;
segrates.Values[filename] := format('%d %d %d', [ok_total, my_total, my_correct]);
segrates.SaveToFile('segrate.txt');
halt;
end;
procedure seg1(const w: WideString);
begin
if (seg_j <> nil) then SelectSegmentor(not is_jt(PWideChar(w)));
writeln(segtag(PWideChar(w), length(w)));
halt;
end;
procedure namex(const filename: string);
var
f: TLineStream;
s, source, target: String;
c: char;
i, j, len, totalfile: integer;
clk: Clock; // t1: dword;
begin
clk.Start; //t1 := gettickcount;
try
f := TLineStream.Create(filename);
except
error('cannot open ' + filename);
end;
//ChDir(ExtractFileDir(filename));
totalfile := 0;
while f.ReadLn(s) do
begin
if (s='') then break; // 2005-4-11
if not (s[1] in ['S','T']) then error('NE script file format!');
// get two file names, allowing spaces if quoted by "
len := length(s);
i := 2;
while (i <= len) and (s[i] = ' ') do inc(i);
if i > len then error('file format!');
if s[i] = '"' then c := '"' else c := ' ';
j := i;
while (i <= len) and (s[i] <> c) do inc(i);
if c = '"' then inc(i);
if i > len then error('file format!');
source := strip_quotes(copy(s, j, i-j));
while (i <= len) and (s[i] = ' ') do inc(i);
target := strip_quotes(copy(s, i, maxint));
SelectSegmentor(s[1] = 'T');
segfile(source, target, false);
inc(totalfile);
end;
writeln(errOutput, format(#13#10'%s %s %s using %d ms', [sp_form(totalfile, 'file'), sp_form(seg_unit_count, 'sentence'), sp_form(segged_word_count, 'word'), {gettickcount-t1}Clk.TimeElapsed_MS]));
halt;
end;
var
clk: clock; //t1: dword;
i, ok, totalfile: integer;
x, filename, dir, outfile, path, c1: string;
f: TSearchRec;
seg1sent, auto_detect, scriptfile: boolean;
calc_bigram: integer = 0;
begin
if paramcount = 0 then help;
benchmarking := false;
seg1sent := false;
user_designated_codepage := ' ';
very_verbose := false;
i := 1;
mwu_count := -1;
segdatadir := '';
while i <= paramcount do
begin
x := ParamStr(i);
if copy(x, 1, 2) = '-u' then
begin
do_mwu := true;
if length(x) > 2 then
mwu_count := StrToIntDef(copy(x,3,5), mwu_count);
end
else
if x = '-s' then tagmode := tag_no else
if x = '-n' then tagmode := tag_neon else
if x = '-t' then tagmode := tag_only else
if x = '-8' then tag_scheme := tag_863 else
if x = '-l' then begin tag_scheme := tag_ldc; ldc_hm := THashMap.Create(64); end else
if x = '-i' then ignoreSpace := true else
if x = '-b' then benchmarking := true else
if x = '-p' then split_surname := true else
if x = '-z' then do_nt := true else
if x = '-d' then calc_prob := true else
if x = '-h' then show_prob := true else
if x = '-m' then begin msr_mode := true; do_nt := true end else
if x = '-f' then show_foreign_name := true else
if x = '-e' then escape_space := true else
if x = '-S' then user_designated_codepage := 'S' else
if x = '-T' then user_designated_codepage := 'T' else
if x = '-G' then user_designated_codepage := 'G' else
if x = '-x' then tagmode := tag_namex else
if x = '-X' then begin tagmode := tag_namex; scriptfile := true end else
if x = '-U' then utf8_output := true else
if x = '-1' then begin seg1sent := true; inc(i); filename := ParamStr(i) end else
if x = '-2' then begin c1 := paramstr(i+1); filename := paramstr(i+2); inc(i, 2); calc_bigram := 2 end else
if x = '-3' then begin c1 := paramstr(i+1); filename := paramstr(i+2); outfile := paramstr(i+3); inc(i, 3); calc_bigram := 3 end else
if x = '-v' then verbose := true else
if x = '-vv' then very_verbose := true else
if x = '-vvv' then very_very_verbose := true else
if x = '-D' then begin inc(i); segdatadir := ParamStr(i) end else
if x = '-N' then strict_name := true else
if x = '-g' then just_tag := true else
if filename = '' then filename := x
else dir := x;
inc(i);
end;
if filename = '' then help;
if dir <> '' then
if DirectoryExists(dir)
then begin
if not (dir[length(dir)] in [':', PathDelim]) then dir := dir + PathDelim
end
else begin
outfile := dir; dir := '';
if pos('*', filename) > 0 then
warn(outfile + ' will only contain results for the last matching file!');
end;
if very_verbose then verbose := true;
if very_verbose then writeln(errOutput, 'try to init segmentors.');
auto_detect := user_designated_codepage = ' ';
if auto_detect and (segdatadir <> '') then error('please specicify sublanguage');
case user_designated_codepage of
'S': InitSegmentor1(Chinese_S);
'T': InitSegmentor1(Chinese_T);
'G': begin
segdatadir := 'ghy';
InitSegmentor1(Chinese_T);
end;
else if calc_bigram <> 0
then error('use -T or -S to specify sublanguage!')
else InitSegmentors;
end;
if very_verbose then writeln(errOutput, 'inited segmentors.');
if calc_bigram <> 0 then calc_prob2(calc_bigram, c1, filename, outfile);
{
filename := '据此,广州市要求,今年全市国内生产总值较上一年增长百分之十三;工业增加值增长' +
'百分之十三点五(工业总产值增长百分之十五);农业增加值增长百分之五点五(农业总产'+
'值增长百分之六点五);第三产业增加值增长百分之十三点五;市属全社会固定资产投资增'+
'长百分之十一;社会消费品零售总额增长百分之十五;外贸出口增长百分之五;进口增长百'+
'分之十;实际利用外资增长百分之八;零售物价总水平控制在百分之六以内;预算内财政收'+
'入增长百分之十;科技进步对工业经济增长贡献率在去年基础上再提高一点八个百分点;城'+
'镇登记失业率控制在百分之三以下;人口自然增长率控制在百分之八点二以下。';
}
{filename := '丁午寿议员亦会根据《释义及通则条例》动议另一项决议案,议决把2003年2月19日'+
'提交立法会会议省览的《2003年进出口(一般)(修订)规例》;《2003年进出口(登记)'+
'(修订)规例》;《2003年进出口(移离物品)(修订)规例》;《2003年储备商品(进出'+
'口及储备存货管制(修订)规例》;《〈2001年应课税品(修订)条例〉(2001年第19号)'+
'2003年(生效日期)公告》;以及《〈2002年进出口(电子交易)条例〉(2002年第24号)'+
'2003年(生效日期)公告》的修订期限延展至2003年4月9日的会议。';}
if seg1sent then seg1(filename);
if benchmarking then benchmark(filename);
if tagmode = tag_namex then
begin
do_nt := true;
if scriptfile then namex(filename);
end;
total_cc := 0;
clk.Start; //t1 := gettickcount;
totalfile := 0;
path := extractfilepath(filename);
ok := findfirst(filename, faAnyFile, f);
while ok = 0 do
begin
if f.attr and fadirectory = 0 then
begin
if dir = ''
then segfile(path + f.name, outfile, auto_detect)
else segfile(path + f.name, dir + f.Name, auto_detect);
inc(totalfile);
end;
ok := findnext(f);
end;
findclose(f);
clk.Stop; //t1 := gettickcount-t1;
writeln(errOutput, format(#13#10'%s %s %s using %d ms, %s/s', [sp_form(totalfile, 'file'), sp_form(seg_unit_count, 'sentence'),
sp_form(segged_word_count, 'word'), {t1}clk.TimeElapsed_MS, GetClearMemorySize(total_cc*1000 div max({t1}clk.TimeElapsed_MS,1))]));
{$ifdef win32}if very_verbose then writeln(errOutput, 'used ', GetClearMemorySize(MemUsed));{$endif}
end.
{$endif}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -