📄 segtag.pas
字号:
{
今日开始试图改造分词标注程序,
既可以生成目前以pku为基础的标注集
又可以生成滨州汉语树库的标注集
主要的思路:
设计一个新的更细的标注集,是两个标注集的细化,然后可以简单地通过合并某些
词类来生成两个标记集。
首先:
pku的标记集里面有些是没有意义的,如l之类。这些需要尽快去除。
}
{$ifndef dll}{$apptype console}{$endif}
{$ifdef dll}
library mandel_segtag;
{$endif}
uses
{$ifndef linux}windows,{$endif}
SysUtils,
Classes,
Math,
comlib,
mystr,
neww,
hmmseg in 'hmmseg.pas';
var
mwu_count: integer;
verbose, ignoreSpace, benchmarking: boolean;
total_cc: int64;
utf8_output: boolean = false;
user_designated_codepage: char;
ldc_hm: THashMap;
const
copyrightstr = 'segtag 1.21 copyright (c) Mandel Shi 2007/5/7' {$ifdef linux}+' (linux)'{$endif};
procedure help;
begin
//writeln('segtag 0.14 copyright (c) Mandel Shi 2004/8/18');
writeln(copyrightstr);
writeln(' This program uses partial resources of ICTCLAS build 0705, ');
writeln(' which is converted by Xiang Xiaowen');
writeln;
writeln('segtag [options] filename [outfilename|outdir]');
writeln(' -u turn on Unknown Word Recognition');
writeln(' -z chunk proper names aggressively');
writeln(' -v verbose mode');
writeln(' -s just seg, no tagging');
writeln(' -m MSR-seg compatible mode');
writeln(' -n use neon-style segmentation separator');
writeln(' -t output tag only');
writeln(' -i ignore spaces in source text');
writeln(' -p split Chinese surname and given name (ICTCLAS compatible mode)');
writeln(' -e escape spaces and slashes');
writeln(' -x 863 named entity recognition for one file');
writeln(' -X 863 named entity recognition for a script file');
writeln(' -S file is in Simplified Chinese');
writeln(' -T file is in Traditional Chinese');
writeln(' -G file is in Ancient Chinese');
writeln(' -D designate the segdata subdirectory');
writeln(' -U output in utf8 (default on linux)');
writeln(' -N stricter name recognition');
writeln(' -l use ldc penn tree Chinese tagset');
writeln('options used by me (others don''t use, please):');
writeln(' -b benchmark the segmentation precision and recall (-v to obtain source)');
writeln(' -8 use 863 tagset instead of pku''s (obsolete)');
writeln(' -h show score');
writeln(' -d show score of tagged sentence');
writeln(' -f show foreign names');
writeln(' -1 segment one sentence');
writeln(' -2 show bigram prob');
writeln(' -3 show trigram prob');
writeln(' -vv very verbose mode');
writeln(' -vvv very very verbose mode');
writeln(' e.g.');
writeln(' segtag test1.txt test1_seg.txt');
writeln(' segtag -s -z test*.txt d:\result');
halt;
end;
procedure delete_space(w: PWideChar; var len: integer);
var w1: PWideChar;
begin
while w^ <> #0 do
begin
if w^ = ' '
then begin
w1 := w;
repeat inc(w1) until w1^ <> ' ';
dec(len, w1-w);
wstrcpy(w, w1);
end;
inc(w)
end
end;
function SegTag_and_MWU(pw: PWideChar; len: integer): string;
begin
if do_mwu then
begin
mwu := TWideStringList.Create;
// 最少频率 4;最长词长 7: 符拉迪沃斯托克
recognizeMWU(pw, 7, 4, mwu, true, verbose);
// delete known words from lexicon
True_UW(verbose);
end;
result := segtag(pw, len);
if do_mwu then mwu.Free;
end;
procedure segfile(const filename: string; outfile: string; auto_detect: boolean);
var
nline, i, fsize, spaces: integer;
s, w1, w2: PWideChar;
f: TBufStream;
g: text;
wc: WideChar;
ft: boolean;
t_w, w_x: WideString;
b: word;
ii: TLineStream;
begin if very_verbose then writeln(erroutput, 'segmenting ', filename);
if auto_detect then
begin
try
s := ReadFileHeadAsWideChar(filename, 8192); except
writeln('cannot open ', filename);
exit;
end;
ft := not is_jt(s);
if verbose then
if ft then writeln(ErrOutput, filename, ' is Traditional Chinese') else writeln(ErrOutput, filename, ' is Simplified Chinese');
SelectSegmentor(ft);
FreeMem(s);
end;
fsize := mygetfilesize(filename);
if do_mwu and (fsize < 400000) then // 400K is too big
begin
if very_verbose then writeln(errOutput, 'MTU detection...');
mwu := TWideStringList.Create;
if mwu_count = -1
then mwu_count := 4 * trunc(log2(fsize));
if verbose then writeln(ErrOutput, 'at most ', mwu_count, ' UW');
// 最少频率 4;最长词长 6
s := ReadFileAsWideChar(filename);
recognizeMWU(s, 6, 4, mwu, true, verbose);
// 删去一些意义不大的
for i:= mwu.Count-1 downto mwu_count do
begin
freemem(mwu[i]);
dispose(mwu.List^[i]);
mwu.List^[i] := nil
end;
if mwu.Count > mwu_count then mwu.Count := mwu_count;
// delete known words from lexicon
True_UW(verbose);
FreeMem(s);
end;
try
if tagmode = tag_namex
then begin
if outfile = '' then
begin
outfile := 'namex.txt';
writeln(erroutput, 'assume the output file is "namex.txt"')
end;
f := TBufStream.Create(outfile, fmCreate, 8192);
b := $FEFF; // UNICODE BOM
f.Write(b, 2);
end
else begin
assign(g, outfile);
rewrite(g)
end;
except
writeln(ErrOutput, 'cannot create ' + outfile);
exit;
end;
if very_verbose then writeln(errOutput, 'begin segmenting...');
nline := 0;
ii := TLineStream.Create(filename);
while ii.wReadLn(w_x) do
begin
inc(nline);
if verbose and (nline mod 100 = 0) then write(Erroutput, #13, nline);
if w_x = '' then
begin
if tagmode = tag_namex
then f.write(wcrln, 4)
else writeln(g);
continue;
end;
s := PWideChar(w_x);
w1 := s;
// trim leading spaces
spaces := 0;
while (w1^ = #$3000) or (w1^ = ' ') or (w1^ = #9) do begin inc(w1); inc(spaces) end;
if (tagmode = tag_namex) and (spaces>0) then f.Write(PWideChar(w1-spaces)^, spaces shl 1); // the standard says so
spaces:=0;
w2 := s + length(w_x) - 1;
if w2^ = #13 then dec(w2);
while (w2 >= w1) and ( (w2^ = #13) or (w2^ = #$3000) or (w2^ = ' ') or (w2^ = #9) or (w2^ >=#$E000) and (w2^ <= #$F8FF)) do begin dec(w2); inc(spaces) end;
// ^ private use area
if w2 >= w1
then begin
inc(w2);
wc := w2^; // otherwise the following line will truncate s for UNIX files!
w2^ := #0;
i := w2 - w1;
if ignoreSpace then delete_space(w1, i);
try
if tagmode = tag_namex
then begin
t_w := segtag(w1, i);
if spaces > 0 // the standard says so
then begin
w2^ := wc;
t_w := t_w + copy(w2, 0, spaces);
end;
t_w := t_w + crln;
f.Write(PWideChar(t_w)^, length(t_w) shl 1)
end
else if utf8_output
then writeln(g, ansitoutf8(segtag(w1, i)))
else writeln(g, segtag(w1, i));
except
writeln(ErrOutput, filename, '? ', widecharlentostring(w1, i));
end;
w2^ := wc;
end
else if tagmode = tag_namex
then begin if spaces>0 then f.Write(PWideChar(w2)^, spaces shl 1); f.Write(wcrln^, 4) end
else writeln(g);
end;
ii.Free;
if tagmode = tag_namex then f.Free else close(g);
if verbose then writeln(Erroutput, #13, filename, ' ', sp_form(nline, 'line'));
inc(total_cc, fsize);
if do_mwu and (fsize < 400000) then mwu.Free;
end;
{$ifdef dll}
var lang: TSegLang = Chinese_S;
procedure set_verbose(verbose_level: integer); stdcall;
begin
case verbose_level of
1: verbose := true;
2: begin verbose := true; very_verbose := true end;
else begin verbose := false; very_verbose := false end;
end
end;
procedure set_segtag_dir(dir: PChar); stdcall;
begin
segtagdir := dir;
end;
procedure set_segtagdata_dir(dir: PChar); stdcall;
begin
segdatadir := dir;
end;
procedure set_traditional; stdcall;
begin
lang := Chinese_T
end;
procedure GetCopyrightString(buf: PChar); stdcall;
begin
strpcopy(buf, copyrightstr);
end;
function init_seg(errmsg: PChar): boolean; stdcall;
{$J+}
const inited: boolean = false;
begin
result := true;
if inited then exit;
try
InitSegmentor1(lang); // Simplified Chinese
inited := true;
except
on e: Exception do
begin
StrPCopy(errmsg, e.Message);
result := false;
end;
end;
end;
procedure set_tagmode(newtagmode: TTagMode); stdcall;
begin
tagmode := newtagmode;
end;
function seg_file(filename, outfile: PChar; RecongnizeUnknownWords: boolean; max_uw: integer): boolean; stdcall;
begin
mwu_count := max_uw;
do_mwu := RecongnizeUnknownWords;
try
segfile(filename, outfile, false);
result := true;
except
result := false;
end
end;
function seg_sent(pw: PWideChar; len: integer; outbuf: PChar; buflen: integer): boolean; stdcall;
var s: string;
w1, w2: PWideChar;
begin
try
w1 := pw;
w2 := pw + len - 1;
while (w1^ = #$3000) or (w1^ = ' ') or (w1^ = #9) or (w1^ >=#$E000) and (w1^ <= #$F8FF) do inc(w1);
while (w2 >= w1) and ( (w2^ = #13) or (w2^ = #$3000) or (w2^ = ' ') or (w2^ = #9) or (w2^ = #10) or(w2^ >=#$E000) and (w2^ <= #$F8FF)) do dec(w2);
// ^ private use area
if w2 >= w1
then begin
inc(w2);
w2^ := #0;
len := w2 - w1;
if ignoreSpace then delete_space(w1, len);
s := segtag(w1, len);
//writeln(s, length(s), '==', buflen);
result := length(s) < buflen;
if result then strpcopy(outbuf, s);
end
else outbuf^ := #0;
except
result := false;
end
end;
exports
init_seg, seg_file, seg_sent, segtag_1, set_tagmode, set_traditional,
set_verbose, set_segtag_dir, set_segtagdata_dir, is_cat, tag2str, code2index;
end.
{$else}
procedure benchmark(const filename: string);
type
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -