📄 hmmseg.pas
字号:
if CompareMem(result.w, key, len) and (result.cat = cat) then exit;
result := result.next;
end;
end;
function TSegHashTable.existword(key: PWideChar; len: integer; var h: TSegHashRec): boolean;
begin
h := existword(key, len);
result := h <> nil
end;
{
function TSegHashTable.existword(key: PWideChar; len, cat: integer; var h: TSegHashRec): boolean;
begin
h := existword(key, len, cat);
result := h <> nil
end;
}
function TSegHashTable.existword(key: PWideChar; len, cat: integer; var extra: integer): boolean;
var h: TSegHashRec;
begin
len := len shl 1;
h := ht[word(_hash(PChar(key), len))];
extra := -1;
while h <> nil do
begin
if CompareMem(h.w, key, len) then
begin
if extra = -1 then extra := h.extra;
if (h.cat = cat) then begin result := true; exit; end;
end;
h := h.next;
end;
result := false;
end;
function pku_code_to_index(const S: string): integer;
var
L, H, I, C: Integer;
begin
Result := -1;
L := 2;
H := NCATS+1;
while L <= H do
begin
I := (L + H) shr 1;
C := CompareStr(pku_pos_codes[I], S);
if C < 0 then L := I + 1 else
begin
if C = 0 then
begin
Result := I;
exit
end;
H := I - 1;
end;
end;
end;
function tagname(c: integer): string; forward;
function tagnamex(c: integer): string; forward;
function TSegmentor.LoadCoreDic(const datadir: string): TSegHashTable;
const segdicfile={$ifdef segv2}'segdic.txt'{$else}'segdic.old'{$endif};
var
f: TLineStream;
i, j, count, cat, len, nline, len1: integer;
w{, w0}: WideString;
s: string;
w0: PWideChar;
filename: string;
is_chinese_name, is_foreign_name: boolean;
with_l: boolean; ncat, lcount, total_count: integer;///
begin filename := datadir + segdicfile; if very_verbose then writeln(erroutput, 'loading ', filename);
//OutputDebugString(PChar(Filename+crln));
try
f := TLineStream.Create(filename);
except
error('cannot find ' + filename);
end;
result := TSegHashTable.Create;
nline := 0;
while f.wReadLn(w) do
begin inc(nline);
for i := 1 to length(w) do
if w[i] = ' ' then break;
//w0 := copy(w, 1, i-1);
len1 := i-1;
getmem(w0, i shl 1);
Move(w[1], w0^, len1 shl 1); w0[len1] := #0; //lstrcpynw(w0, PWideChar(w), i);
// format: 艰苦 an 2 ad 6 a 67
len := length(w);
inc(i);
j := i;
with_l := false; ncat := 0; total_count := 0;///
repeat
while (j <= len) and (w[j] <> ' ') do inc(j);
if j > len then error(segdicfile + ': ' + w);
s := copy(w, i, j-i);
is_chinese_name := s = 'nR';
if is_chinese_name then s[2] := 'r';
is_foreign_name := s = 'nr';
cat := code2index(s);
if cat = -1 then error(format('%s at line %d: unknown cat: %s', [segdicfile, nline, s]));
inc(j);
i := j;
while (j <= len) and (w[j] <> ' ') do inc(j);
try
count := StrToInt(copy(w, i, j-i));
except
error(format('%s at line %d: bad integer: %s', [segdicfile, nline, s]));
end;
if is_chinese_name
then result.hashrec(PWideChar(w0), count, cat).extra := _xName // w0 shared!
else if is_foreign_name
then result.hashrec(PWideChar(w0), count, cat).extra := _xForeignName // w0 shared!
else result.hashrec(PWideChar(w0), count, cat); // w0 shared!
inc(j);
i := j;
if cat=18 then begin with_l := true; lcount := count end; ///
inc(ncat);///
inc(total_count, count); ///
until j > len;
{if with_l then ///
if (lcount = 0) or //
(lcount > 0) and (ncat = 2) and (total_count = lcount)
then
else writeln(w);}
if iscc(w0^) then
begin
if longest_word[w0^] < len1
then longest_word[w0^] := len1;
end
end;
f.Free;
end;
var
bigram: comlib.THashTable; // utf8-encoded! 2007-4-19
procedure TSegmentor.LoadBigram(const datadir: string);
const mybigramfile={$ifdef segv2}'mybigram.txt'{$else}'mybigram.old'{$endif};
var
f: TLineStream;
w2: WideString;
s, s1, s2: string;
c, nline: integer;
h: TSegHashRec;
filename: string;
begin filename := datadir + mybigramfile; if very_verbose then writeln(erroutput, 'loading ', filename);
//OutputDebugString(PChar(Filename+crln));
try
f := TLineStream.Create(filename);
except
error('cannot find ' + filename);
end;
bigram := comlib.THashTable.Create(4096);
nline := 0;
while f.ReadLn(s) do
begin
inc(nline);
split(s, ' ', s1, s2);
{$ifdef segv2}
c := pku_code_to_index(s2);
{$else}
c := StrToIntDef(s2, 0);
if c = lq_nr then continue;
{$endif}
bigram.HashRec(PChar(s1), length(s1), true).data := c;
w2 := utf8decode(copy(s1, pos(',', s1) + 1, maxint)); //20070419
h := dic.existword(PWideChar(w2), length(w2), c);
if h=nil
then error(mybigramfile + '(line ' + IntToStr(nline) + '): '
+ {$ifdef linux}utf8encode{$endif}(w2) + ' ' + s2)
else h.extra := _xBigram;
end;
f.Free;
end;
procedure TSegmentor.LoadZsuffix(const datadir: string);
const myzfile={$ifdef segv2}'zsuffix.txt'{$else}'zsuffix.old'{$endif};
var
f: TLineStream;
w: WideString;
h: TSegHashRec;
c: char; filename: string;
begin filename := datadir + myzfile; if very_verbose then writeln(erroutput, 'loading ', filename);
//OutputDebugString(PChar(Filename+crln));
try
f := TLineStream.Create(filename);
except
error('cannot find ' + filename);
end;
while f.wReadLn(w) do
begin {$ifdef segv2}
c := char(w[length(w)]);
if not (c in ['s','t','z', 'k', 'D', 'T', 'j', 'm', 'R']) then continue;
setlength(w, length(w)-3);
{$endif}
h := dic.existword(PWideChar(w), length(w));
if h = nil
then begin
if verbose then writeln(ErrOutput, 'z not in dic: ', {$ifdef linux}Utf8Encode{$endif}(w))
end
else {$ifdef segv2}
with h do
case c of
's': extra := extra or _xNS;
't': extra := extra or _xNT;
'z': extra := extra or _xNZ;
'k': extra := extra or _xK;
'D': extra := extra or _xDate;
'T': extra := extra or _xTime;
'j': extra := extra or _xJ;
'm': extra := extra or _xNum;
'R': extra := extra or _xRenVerb or _xNSuf;
end
{$else}
with h do extra := extra or _xNS;
{$endif}
end;
f.Free;
end;
procedure TSegmentor.LoadNprefix(const datadir: string);
var
f: TLineStream;
w: WideString;
h: TSegHashRec; i, len: integer;
filename: string;
begin filename := datadir + 'nprefix.txt'; if very_verbose then writeln(erroutput, 'loading ', filename);
//OutputDebugString(PChar(Filename+crln));
try
f := TLineStream.Create(filename);
except
error('cannot find ' + filename);
end;
while f.wreadln(w) do
begin
i := pos(' ', w); if i <= 0 then error(w);
h := dic.existword(PWideChar(w), i-1);
if h = nil
then begin
if verbose then writeln(ErrOutput, 'npre not in dic: ', w)
end
else with h do extra := extra or _xNPre;
end;
f.Free;
end;
procedure TSegmentor.Load_name_suffix(const datadir: string);
var
f: TLineStream;
w: WideString;
h: TSegHashRec;
filename: string;
begin filename := datadir + 'nsuffix.txt'; if very_verbose then writeln(erroutput, 'loading ', filename);
//OutputDebugString(PChar(Filename+crln));
try
f := TLineStream.Create(filename);
except
error('cannot find ' + filename);
end;
while f.wReadLn(w) do
begin
h := dic.existword(PWideChar(w), length(w));
if h = nil
then begin
if verbose then writeln(ErrOutput, 'nsuf not in dic: ', w)
end
else with h do extra := extra or _xNSuf;
end;
f.Free;
end;
procedure TSegmentor.Load_VVO(const datadir: string);
var
f: TLineStream;
w: WideString;
h: TSegHashRec;
filename: string;
begin filename := datadir + 'vvo.txt'; if very_verbose then writeln(erroutput, 'loading ', filename);
//OutputDebugString(PChar(Filename+crln));
try
f := TLineStream.Create(filename);
except
error('cannot find ' + filename);
end;
while f.wReadLn(w) do
begin
h := dic.existword(PWideChar(w), length(w));
if h = nil
then begin
if verbose then writeln(ErrOutput, 'vvo not in dic: ', w)
end
else with h do extra := extra or _xVVO;
end;
f.Free;
end;
procedure TSegmentor.Load_AA(const datadir: string);
var
f: TLineStream;
w1: WideString;
h: TSegHashRec;
filename: string;
begin filename := datadir + 'aa.txt'; if very_verbose then writeln(erroutput, 'loading ', filename);
//OutputDebugString(PChar(Filename+crln));
try
f := TLineStream.Create(filename);
except
error('cannot find ' + filename);
end;
while f.wReadLn(w1) do
begin
h := dic.existword(PWideChar(w1), length(w1)-1);
if h = nil
then begin
if verbose then writeln(ErrOutput, 'aa not in dic: ', w1)
end
else with h do
case chr(ord(w1[length(w1)])) of
'1': extra := extra or _xAA;
'2': extra := extra or _xABB;
'4': extra := extra or _xAABB;
end
end;
f.Free;
end;
function lq_maxfind(pw: PWideChar; pwlen: integer; var len: integer; var h: TSegHashRec): boolean;
begin
result := false;
if iscc(pw^)
then len := seg_x.longest_word[pw^]
else len := 3;
if len > pwlen then len := pwlen;
while len > 0 do
begin
h := lqdic.existword(pw, len);
if h <> nil then begin result := true; exit; end;
dec(len);
end;
end;
type
PRepeatInfo = ^TRepeatInfo;
TRepeatInfo = record
cat: integer; {-1: not set}
index, len: byte;
end;
function isRepeatWord2(pw: PWideChar; var len: integer; r: PRepeatInfo): boolean;
var
h: TSegHashRec;
extra: integer;
begin
result := false;
if (pw[0] = pw[1])
then if (pw[2] <> #0) and
(pw[2] = pw[3]) and // 平平淡淡
lqdic.existword(@pw[1], 2, h) and (h.extra and _xAABB <> 0)
then begin
len := 4;
result := true;
if lqdic.existword(@pw[1], 2, lq_a) <> nil // inefficient but I am lazy
then r.cat := lq_z
else r.cat := -1;
r.index := 1;
r.len := 2;
end
else begin
if (pw[2] <> #0) then
if lqdic.existword(@pw[1], 2, lq_v, extra) and (extra and _xVVO <> 0) { 必须是“散散步”这样的v+n离合动词。“建建筑”不对 }
then begin
result := true;
len := 3;
r.cat := lq_v;
r.index := 1;
r.len := 2;
end
else if lqdic.existword(pw, 1, lq_a, extra) and (extra and _xAA <> 0) // 浓浓
then begin
result := true;
len := 2;
r.cat := lq_z; // lq_a ??
r.index := 0;
r.len := 1;
end
else if lqdic.existword(pw, 1, lq_o) <> nil // 啪啪啪
then begin
result := true;
len := 2;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -