📄 hmmseg.pas
字号:
while pw[len] = pw^ do inc(len);
r.cat := lq_o;
r.index := 0;
r.len := 1;
end {
else if lqdic.existword(pw, 1, lq_q) <> nil // 艘艘
then begin
result := true;
len := 2;
r.cat := lq_q;
r.index := 0;
r.len := 1;
end }
end
else if (pw^ = '一') and (pw[1] <> #0) and (pw[1] = pw[2])
then begin
if lqdic.existword(@pw[1], 1, lq_q) <> nil // 一排排
then begin
result := true;
len := 3;
r.cat := lq_m; // 北大认为“一排排”是数词。限定词也许更合适!!!
// 对HMM 来说,影响生成概率
r.index := 0;
r.len := 1;
end
end
else if (pw[1] <> #0) and (pw[1] = pw[2]) {孤单单}
then begin
if lqdic.existword(pw, 2, lq_a, extra) and (extra and _xABB <> 0)
then begin
result := true;
len := 3;
r.cat := lq_z; // lq_a ?
r.index := 0;
r.len := 2;
end
end;
end;
function AddWGraphEdge(i, j: byte; h: TSegHashRec): boolean;
var
w: PWideChar;
k: integer;
label
rr;
begin
// 对“瞩目中华”,因为“瞩”不出现在词典里,所有加“目”没有意义
if wGraph[i].npred = 0 then begin result := false; exit; end;
result := true;
w := h.w;
with wGraph[j] do
repeat
if npred >= MAXPRED then exit;
{if tag_scheme = tag_ldc then
if h.cat = lq_vn
then h.cat := lq_n; //??}
k := 0;
while k < npred do
if predList[k].pos = h.cat then
begin
// put the cat at the end of the same cat
repeat inc(k) until (k >= npred) or (predList[k].pos <> h.cat);
if (k < npred)
then {if predList[k].predNode = i
then goto rr
else} move(predList[k], predList[k+1], (npred - k) * sizeof(predList[0]));
break
end
else inc(k);
predList[k].pos := h.cat;
predList[k].predNode := i;
predList[k].x := 0;
predList[k].lex := h;
//mywriteln(format('%d,%d: %s %s', [i, j, lqInt2cat(h.cat), WideCharToString(h.w)]));
// ??? count???
inc(npred);
rr:
h := h.next;
until (h = nil) or (WideStringCompare(w, h.w) <> 0);
end;
procedure AddWGraphEdge1(i, j: byte; cat, extra_info: integer);
var
k: integer;
begin
if wGraph[i].npred = 0 then exit;
with wGraph[j] do
begin
if npred >= MAXPRED then exit;
k := 0;
while k < npred do
if predList[k].pos = cat then
begin
// put the cat at the end of the same cat
repeat inc(k) until (k >= npred) or (predList[k].pos <> cat);
if (k < npred)
then move(predList[k], predList[k+1], (npred - k) * sizeof(predList[0]));
break
end
else inc(k);
predList[k].pos := cat;
predList[k].predNode := i;
predList[k].x := extra_info;
predList[k].lex := nil;
//mywriteln(format('%d,%d: %s', [i, j, lqInt2cat(cat)]));
// ??? count???
inc(npred);
end
end;
procedure Add_Compound(i, j: byte; h: TSegHashRec);
var
k: integer;
begin
with wGraph[i] do
for k := 0 to npred - 1 do
if predList[k].pos in [lq_n, lq_ns, lq_b, lq_vn, lq_nz] then
if i+j-predList[k].predNode > 2 then {受/骗/者 将不会变成 受/骗者}
begin
AddWGraphEdge1(predList[k].predNode, i+j, lq_n, 0);
break
end
end;
function isNumberChar2x(ch: WideChar; var charType: integer): boolean;
var i: integer;
begin
for i := low(D) to High(D) do
if wstrscan(D[i], ch) <> nil then
begin
charType := i;
result := true;
exit
end;
result := false;
end;
function isNumberWord2(pw: PWideChar; var len: integer): boolean;
{数字的识别应该用有限自动机,现在暂且如此做}
var
charType, charType2, j: integer;
h: TSegHashRec;
begin
len := 0;
if (pw^ <> #0) and isNumberChar2x(pw^, charType) then
begin
inc(len);
case charType of
4: if (pw^ = '第') then
begin
if not isNumberChar2(pw[1], charType) or (charType = 4)
then begin result := false; exit end;
inc(len);
end
else if pw^ = #$FF0D {'-'} then
begin
if not (isNumberChar2(pw[1], charType) and (charType = 1))
then begin result := false; exit end;
inc(len);
end
else if pw^ = dian {点} then begin result := false; exit end
else if pw^ = '分' then begin result := false; exit end;
0: if (pw^ = '.') and not (isdigit((pw+1)^)) then
begin result := false; exit end;
1: if (wstrscan('·', pw^) <> nil) and not (isdigit((pw+1)^)) then
begin result := false; exit end;
end;
if (charType = 4) and (wstrscan(Type4CharsHead, pw^) <> nil)
then charType := 2; {handle 两千斤}
while (pw[len] <> #0) do
if wstrscan('%%‰', pw[len]) <> nil
then begin inc(len); break end
else if isNumberChar2x(pw[len], charType2) and // 300多年:原来是isNumberChar2x
((charType2 = charType) or
(charType2 = 4) and not ((pw[len]='分') and (pw[len+1]<>'之')) and
not ((pw[len]='之') and (pw[len-1]<>'分')) and
not ((pw[len]='千') and (wstrscan('万亿0', pw[len-1])<>nil))
)
{数字类型必须相同,“六828千”不是一个数词}
then inc(len)
else if (wstrscan({'∶::}'∶·..', pw[len]) <> nil) and isNumberChar(pw[len+1])
then inc(len, 2)
else if (WSTRSCAN(',/', pw[len]) <> NIL) and (charType = 0) and isdigit(pw[len+1])
then inc(len, 2)
else break;
{ 最后一个字不能是有些字 }
if (WStrScan(WrongLastNumberChars, pw[len-1]) <> nil)
then dec(len)
else if isdigit(pw[len-1]) and isalpha(pw[len])
then begin result := false; exit end // 3d
{else if lq_maxfind(@pw[len-1], j, h) and (j>1)// 五亿千瓦
then begin writeln(widecharlentostring(pw, len+1));dec(len)end}
{else if (pw[len] <> #0) and (pw[len-1] < #128) and (pw[len] < #128) and (pw[len] <> '"')
then begin
if wstrscan(')(,', pw[len]) = nil then
begin
result := false;
exit
end
end; { 863-306 }
end;
result := (len > 1) or (len = 1) and (charType in [0,1]) ;
end;
function isNumberWord3(pw: PWideChar; len: integer): boolean;
var
charType: integer;
begin
while (len > 0) do
begin
dec(len);
if not IsNumberChar2x(pw[len], charType) then begin result := false; exit end;
end;
result := true
end;
function recognize_foreign_name(pw: PWideChar; var with_dot: boolean): integer;
var
len: integer;
begin
len := 1;
while isCC(pw[len]) and (ForeignNameCharMiddle in CC[pw[len]]) do inc(len);
if not isCC(pw[len]) then dec(len);
while (len > 0) and not (ForeignNameCharLast in CC[pw[len]]) do dec(len);
if len = 0 then begin result := 0; exit; end;
with_dot := false;
// pw[0..len] is a possible name
// a fragment?
if (wstrscan('·˙—', pw[len+1]) <> nil) and
IsCC(pw[len+2]) and
(ForeignNameCharFirst in CC[pw[len+2]])
then begin
result := len + 2 + recognize_foreign_name(@pw[len+2], with_dot);
with_dot := true
end
else result := len + 1;
end;
function used_after_name(pw: PWideChar): boolean;
var h: TSegHashRec;
begin
h := lqdic.existword(pw, 2);
result := (h <> nil) and (h.extra and _xNSuf <> 0);
end;
function possible_name(i: integer; pw: PWideChar; len: integer): boolean;
var
d: real;
k: integer;
begin {
if (i > 0) then
with wGraph[i] do
for k := 0 to npred-1 do
if predList[k].pos = cat then
{// check name prefix and suffix
_n0 := wGraph[n0].predList[k].predNode;
h := lqdic.existword(@pw[_n0], n0-_n0);
if (h <> nil) and (h.extra and _xNPre <> 0) then p := p*1.2;}
(*if used_after_name(@pw[len])
then case len of
2: d := 0.00000025; {储值: 0.00000165 成词:0.00000239}
3: d := 0.00000000006; // 双发多
end
else *)
if (pw[len] <> #0) and (wstrscan('·', pw[len]) <> nil)
then d := 0.0000025
else
case len of
2: if pw^ = (pw+1)^
then d := 0.000025
else d := 0.0000025; {d := 0.0000050; {0.0000025; {储值: 0.00000165 成词:0.00000239}
3: if lqdic.existword(pw+1, 2) <> nil
then d := 0.0000006
else d := 0.0000000006; {d := 0.0000000012;{0.0000000006; // 双发多}
end;
if strict_name then d := d * 1000;
if (len = 3) and (lqdic.existword(pw, 2) <> nil) then d := d * 1000;
result := name_prob(pw, len) > d;
end;
function is_foreign_place_name(pw: PWideChar; len: integer): boolean;
var w: WideChar;
begin
w := pw[len];
pw[len] := #0;
result := transliterated_foreign_names.IndexOf(pw) <> -1;
pw[len] := w;
end;
function isenglishname(pw: PWideChar; len: integer; var etype, ext_type: integer): boolean;
var i, count, j: integer;
ew: array[0..4] of string;
s: string;
begin
result := false;
for i:=0 to len-1 do if not (ord(pw[i]) in [32, ord('a')..ord('z'), ord('A')..ord('Z')]) then exit;
s := UpperCase(WideCharLenToString(pw, len));
if foreign_org_names.Find(s, j) or (pos('.COM', s) > 0) then
begin
result := true;
etype := lq_nt;
ext_type := 0;
exit;
end;
tokenize(s, ew, count, 5);
if count >= 5 then exit;
for i:=0 to count-1 do
if not foreign_names.Find(ew[i], j) then exit;
etype := lq_nr;
ext_type := pos_foreign_name;
result := true
end;
procedure CreateWGraph(pw: PWideChar; pwlen: integer);
var
i, j, len, max_i, etype, ext_type: integer;
r: TRepeatInfo;
h: TSegHashRec;
ww: array[0..31] of WideChar;
with_dot: boolean;
label
nameloop;
begin
for i := 1 to slen do wGraph[i].npred := 0;
with wGraph[0] do
begin
npred := 1;
predList[0].pos := lq_sbegin;
predList[0].score := 0;
end;
i := 0;
max_i := 0;
while i < slen do
begin
if isCC(pw^) then
begin
//Add_Chinese_Names;
if isDoubleSurname(pw)
then begin
if isCC(pw[2])
then begin
if isCC(pw[3]) and (np1[pw[2]] <> 0) and (np2[pw[3]] <> 0)
then begin
// 可以构成双名
AddWGraphEdge1(i, i+4, lq_nr, pos_chinese_name_double);
end;
// 可能构成单名?
if np3[pw[2]] <> 0
then begin
AddWGraphEdge1(i, i+3, lq_nr, pos_chinese_name_double);
inc(i, 3);
inc(pw, 3);
end
else begin // 皇甫姑娘
AddWGraphEdge1(i, i+2, lq_nrf, 0);
inc(i, 2);
inc(pw, 2);
end
end
else begin
AddWGraphEdge1(i, i+2, lq_nrf, 0);
inc(i, 2);
inc(pw, 2);
end;
continue;
end
else if (0 in CC[pw^])
then begin
// 老张,小沈
if (i > 0) and (wstrscan('老小阿', (pw-1)^) <> nil)
then AddWGraphEdge1(i-1, i+1, lq_nr, pos_chinese_name_special_surname_last);
if isCC(pw[1])
then begin
if isCC(pw[2]) and ( possible_name(i, pw, 3) or
(pw[1] = pw[2]) and (pw[1] = '某') )
then begin
// 可以构成双名
if not used_after_name(@pw[1]) then
begin
AddWGraphEdge1(i, i+3, lq_nr, pos_chinese_name);
if wstrscan('山区村乡镇县市路岛', pw[2]) <> nil then
AddWGraphEdge1(i, i+3, lq_ns, 0); // 简单的地名识别
end;
{if (0 in CC[pw[1]]) and (np0[pw^] > 0.01) and (name_prob(pw+1, 3) > 0.0000005)
then AddWGraphEdge1(i, i+4, lq_nr, 0); // 陈方安生}
end;
// 可能构成单名?
if (wstrscan('老总總某家氏×' {母父犯}, pw[1]) <> nil)
then AddWGraphEdge1(i, i+2, lq_nr, pos_chinese_name_special_surname_first)
else if possible_name(i, pw, 2)
then AddWGraphEdge1(i, i+2, lq_nr, pos_chinese_name);
end
else if pw[1] = #$00D7 {×}
then if pw[1] = pw[2]
then AddWGraphEdge1(i, i+3, lq_nr, pos_chinese_name)
else AddWGraphEdge1(i, i+2, lq_nr, pos_chinese_name)
end;
//Add_Foreign_Names;
if (ForeignNameCharFirst in CC[pw^])
then begin // 这里要注意我的外国人名用字表肯定会被突破
len := recognize_foreign_name(pw, with_dot);
// 阿姆斯特朗登月时 => 阿姆斯特朗登, 加以简单处理
if (len > 5) and isCC(pw[len-2]) and (ForeignNameCharLast in CC[pw[len-2]]) then
AddWGraphEdge1(i, i+len-1, lq_nr, pos_foreign_name);
if (len >= 4) and (i >= max_i) then // not 总\理纳瓦兹·谢里夫
begin
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -