📄 hmmseg.pas
字号:
// 但是,“哈里逊·福特不得” 是错误的。在外国人名识别算法没有改变以前,
// 暂且不允许最后两个字是词。
if isCC(pw[len-3]) and lq_MaxFind(@pw[len-2], pwlen - (len-2), j, h) and (j > 1)
then dec(len, 2);
end;
if len > 1 then // 不允许单个汉字的外国名字
// 外国地名?
if is_foreign_place_name(pw, len)
then AddWGraphEdge1(i, i+len, lq_ns, 0)
else AddWGraphEdge1(i, i+len, lq_nr, pos_foreign_name);
(*if with_dot and (len >= 5) then
begin
inc(i, len-1);
inc(pw, len-1);
continue // 不要再探索其他可能性了!
end;*)
(*if len > 1 then AddWGraphEdge1(i, i+len, lq_nr, pos_foreign_name);
if with_dot and (len >= 5) and (i >= max_i) then // not 总\理纳瓦兹·谢里夫
begin
// 但是,“哈里逊·福特不得” 是错误的。在外国人名识别算法没有改变以前,
// 暂且不允许最后两个字是词。
if isCC(pw[len-3]) and lq_MaxFind(@pw[len-2], j, h) and (j > 1)
then
else begin
inc(i, len-1);
inc(pw, len-1);
continue // 不要再探索其他可能性了!
end
end;*)
end;
// recognizes Japanese names
if is_jpn_person_name(pw, len) then
AddWGraphEdge1(i, i+len, lq_nr, pos_foreign_name);
end;
// 未登录词识别
if do_mwu then
if mwu.MaxFind(pw, j) then
begin // "神舟五号";"神舟"都被识别
len := wstrlen(mwu[j]);
if len > 2 then
move(pw^, ww, (len - 1) shl 1);
repeat
if Integer(mwu.Objects[j]) <> lq_nr // 我的人名识别程序应该已经加入了
then AddWGraphEdge1(i, i+len, Integer(mwu.Objects[j]), 0);
repeat
dec(len);
ww[len] := #0;
until (len <= 1) or mwu.Find(ww, j);
until len <= 1;
end;
if isRepeatWord2(pw, j, @r)
then begin
if lq_MaxFind(pw, pwlen, len, h) and (len >= j) { “慢慢地”是更长的词 }
then begin
j := len;
if not AddWGraphEdge(i, i+j, h) then j := 1; // 嘀嘀咕咕,
end
else begin
// 把 i..i+j 这个词加到词图
// 叠词的词性一般是已知的,除了 AABB 有可能表示动词(搂搂抱抱),
// 形容词(清清楚楚),名词(是是非非)以外
if r.cat <> -1
then AddWGraphEdge1(i, i+j, r.cat, 0)
else begin
h := lqdic.existword(@pw[r.index], r.len);
assert(h <> nil);
AddWGraphEdge(i, i+j, h);
end;
end
end
else if isNumberWord2(pw, j) { a number composed of more than 2 double-byte cc? }
then begin
// 十五大!
if lq_MaxFind(pw, pwlen, len, h) and (len > j)
then begin j:=len; AddWGraphEdge(i, i+j, h) end
else
// the original treatment is just AddWGraphEdge1(i, i+j, lq_m, 0);
// now I try to conform to PKU People's Daily Corpus
if (pw[j] <> #0) and ((wstrscan('年月日分', pw[j]) <> nil) or (pw[j]=shi))
//and not (lq_MaxFind(@pw[j], len, h) and (len > 1))
then if (pw[j] = '年')
then if (j=4) and (pw[j+1] <> '度')
then begin inc(j); AddWGraphEdge1(i, i+j, lq_t, 0); end
else AddWGraphEdge1(i, i+j, lq_m, 0)
else if (pw[j] = '分') // 这里有很大的改进机会: 8点30 分, 五点十 分
then begin
if (j>=3) and ((pw[j-3]=dian) or (pw[j-2]=dian)) or (i>0) and ((pw-1)^ = shi)
then begin inc(j); AddWGraphEdge1(i, i+j, lq_t, 0); end
else AddWGraphEdge1(i, i+j, lq_m, 0);
end
else begin
if (pw[j] = '月') and (pw[j+1] = '份') then inc(j);
if (pw[j] = '日')
then begin if (j<=3) and (pw[j+1] <> '元') then inc(j); end
else inc(j);
AddWGraphEdge1(i, i+j, lq_t, 0);
end
else AddWGraphEdge1(i, i+j, lq_m, 0);
end
else if lq_MaxFind(pw, pwlen, j, h) then
begin
if (j = 1) and isNumberChar2(pw^, len) and (len=2) and
(pw[j] <> #0) and (wstrscan('日时時', pw[j]) <> nil) // 一月一日
then begin
inc(j);
AddWGraphEdge1(i, i+j, lq_t, 0);
end
else begin
{ get valid shorter words }
max_i := i+j;
AddWGraphEdge(i, i+j, h);
if (h.extra and _xK <> 0) and (i>=2) then Add_Compound(i, j, h);
if isCC(pw^) then // you don't add '-' of '--'
while (j > 1) do
begin
dec(j);
h := lqdic.existword(pw, j);
if h <> nil then AddWGraphEdge(i, i+j, h);
end;
end
end
else if IsFullWidthLetter(pw, j) then AddWGraphEdge1(i, i+j, lq_nx, 0)
else if pw[0] < #128 then
begin
j := 1;
if blankchar(pw[0]) then
begin
while blankchar(pw[j]) do inc(j);
AddWGraphEdge1(i, i+j, lq_nx, 0);
end
// 标点符号应该作为单独一个单词
else if (wstrscan(PUNS, pw[0]) <> nil)
then AddWGraphEdge1(i, i+j, lq_w, 0)
else begin
while (pw[j] <> #0) and (pw[j] < #128) and
((pw[j] = '.') and isdigit(pw[j+1]) or (wstrscan(PUNS, pw[j]) = nil))
do inc(j);
(*
if (pw[j-1] = '"') {or (pw[j-1] = ')') and (j > 1)} then dec(j);
if (j > 1) and (wstrscan(',.;:', pw[j-1]) <> nil) and isalpha(WideChar2Char(pw[j-2])) then dec(j);
*)
if (j = 1) and not isalpha(WideChar2Char(pw[0]))
then if isdigit(pw[0])
then begin
if (pw[1] <> #0) and (wstrscan(month_day_hour, pw[1]) <> nil) //and // 1月1日
//not (lq_MaxFind(@pw[1], len, h) and (len > 1)) 10月/底
then begin
if (pw[j] = '月') and (pw[j+1] = '份') then inc(j)
else if (pw[j] = shi) and (pw[j+1] = '半') then inc(j);
inc(j);
AddWGraphEdge1(i, i+j, lq_t, 0);
end
else AddWGraphEdge1(i, i+j, lq_m, 0)
end
else AddWGraphEdge1(i, i+j, lq_w, 0)
else // check English names
if isenglishname(pw, j, etype, ext_type)
then AddWGraphEdge1(i, i+j, etype, ext_type)
else AddWGraphEdge1(i, i+j, lq_nx, 0);
end
end
else { assume a one-cc word }
begin
h := lqdic.existword(pw, 1);
if h <> nil
then begin AddWGraphEdge(i, i+1, h);
// ghy
if wstrscan(PWideChar(names_), pw^) <> nil
then AddWGraphEdge1(i, i+1, lq_nr, 0);
end
else AddWGraphEdge1(i, i+1, lq_x, 0);
j := 1;
end;
inc(i, j);
inc(pw, j);
dec(pwlen, j);
end;
inc(slen);
with wGraph[slen] do
begin
npred := 1;
predList[0].pos := lq_send;
predList[0].predNode := slen-1;
predList[0].lex := nil;
end;
end;
procedure CreateWGraph2(var pw: PWideChar; len: integer);
var
j: integer;
pwx, pw1, pw2: PWideChar;
begin
for j := 1 to len do wGraph[j].npred := 0;
with wGraph[0] do
begin
npred := 1;
predList[0].pos := lq_sbegin;
predList[0].score := 0;
end;
pwx := pw;
pw1 := pw;
pw2 := pw;
slen := 0;
// format: 迈向/v 充满/v 希望/n 的/u 新/a 世纪/n
repeat
while (pw2^ <> #0) and (pw2^ <> '/') do inc(pw2);
if pw2^ = #0 then break;
// pw1..pw2 is a word
j := pw2-pw1;
move(pw1^, pwx^, j shl 1);
inc(pwx, j);
pw1 := pw2+1;
while (pw2^ <> #0) and (pw2^ <> ' ') do inc(pw2);
AddWGraphEdge1(slen, slen+j, pku_code_to_index(WideCharLenToString(pw1, pw2-pw1)), 0);
while (pw2^ <> #0) and (pw2^ = ' ') do inc(pw2);
inc(slen, j);
pw1 := pw2;
until pw1^ = #0;
inc(slen);
with wGraph[slen] do
begin
npred := 1;
predList[0].pos := lq_send;
predList[0].predNode := slen-1;
predList[0].lex := nil;
end;
end;
procedure CreateWGraph3(var pw: PWideChar; len: integer);
var
j: integer;
pwx, pw1, pw2: PWideChar;
h: TSegHashRec;
begin
for j := 1 to len do wGraph[j].npred := 0;
with wGraph[0] do
begin
npred := 1;
predList[0].pos := lq_sbegin;
predList[0].score := 0;
end;
pwx := pw;
pw1 := pw;
pw2 := pw;
slen := 0;
// format: 迈向 充满 希望 的 新 世纪
repeat
while (pw2^ <> #0) and (pw2^ <> ' ') do inc(pw2);
// pw1..pw2 is a word
j := pw2-pw1;
move(pw1^, pwx^, j shl 1);
inc(pwx, j);
h := lqdic.existword(pw1, j);
if h <> nil
then AddWGraphEdge(slen, slen+j, h)
else AddWGraphEdge1(slen, slen+j, lq_n, 0); // assume UNK is noun!
while (pw2^ <> #0) and (pw2^ = ' ') do inc(pw2);
inc(slen, j);
pw1 := pw2;
until pw1^ = #0;
inc(slen);
with wGraph[slen] do
begin
npred := 1;
predList[0].pos := lq_send;
predList[0].predNode := slen-1;
predList[0].lex := nil;
end;
end;
function tagname(c: integer): string;
begin
case c of
lq_g: result := 'g';
lq_x: result := 'x';
else case tag_scheme of
tag_pku: result := pku_pos_codes[c];
tag_ldc: result := posldc[c];
tag_863: result := pos863[c];
end
end;
end;
function TSegmentor.Prob2(c1, c2: integer): real; // P(c2|c1) 转移概率
var
h: integer;
begin
{if tag_scheme = tag_ldc then
begin
if c1 = lq_vn then c1 := lq_n;
if c2 = lq_vn then c2 := lq_n;
end;}
if (c1 >= NCATS+2) or (c2 >= NCATS+2) then begin result := 1/Total_Count; exit end;
h := C1_C2[c1, c2];
if h = 0
then result := 1/Total_Count
else result := h/C1_Count[c1];
result := result * 0.9 + C1_Count[c2] / Total_Count * 0.1;
{if tag_scheme = tag_ldc then
if c2 = lq_vn
then result := result + Prob2(c1, lq_n)
else if c1 = lq_vn
then result := result + Prob2(lq_n, c2);}
//OutputDebugString(PChar(format('%s,%s=%5.3f',[tagnamex(c1), tagnamex(c2),result])));
if calc_prob then writeln(tagnamex(c1), ',', tagnamex(c2), '=',result:5:3);
end;
function code2index(const s: string): integer;
begin
if s = 'g' then result := lq_g else
if s = 'x' then result := lq_x else
result := pku_code_to_index(s);
end;
function is_cat(cat: integer; catstr: PChar; len: integer): boolean; stdcall;
var s, t: string;
begin
setstring(s, catstr, len);
t := tagname(cat);
result := s = t;
if not result then
result := (t[1] = 'r') and (s='r') or
(t = 'v') and (s = 'aux') or
(t = 'rd') and (s = 'det');
end;
function tag2str(cats: PChar; ncats: integer; catstr: PChar; len: integer): boolean; stdcall;
var s: string;
i: integer;
begin
s := '';
for i := 0 to ncats-1 do s := s + tagname(ord(cats[i])) + ' ';
result := length(s) < len;
if result then strpcopy(catstr, s);
end;
procedure calc_prob2(gram: integer; const c1s, c2s, c3s: string);
var c1, c2, c3: integer;
p: real;
begin
if (c1s = '') or (c2s = '') then error('bad cat!');
if (gram = 3) and (c3s = '') then error('bad cat!');
c1 := code2index(c1s); c2 := code2index(c2s);
if (c1 = -1) or (c2 = -1) then error('bad cat!');
if (gram = 3) then
begin
c3 := code2index(c3s);
if c3 = -1 then error('bad cat!');
end;
if gram = 2
then begin
p := seg_x.Prob2(c1, c2);
writeln(c1s, ',', c2s, '=', p:5:3, ' -logp=', -ln(p):5:3);
end
else begin
{$ifdef g3}
p := seg_x.Prob3(c1, c2, c3);
writeln(c1s, ',', c2s, '=', c3s, '=', p:5:3, ' -logp=', -ln(p):5:3);
{$else}
writeln('not implemented');
{$endif}
end;
halt;
end;
{$ifdef g3}
function TSegmentor.Prob3(c0, c1, c2: integer): real; // P(c2|c0,c1) 转移概率
const
trigram_weight = 0.9;
var
h: integer;
begin
{if tag_scheme = tag_ldc then
begin
if c0 = lq_vn then c0 := lq_n;
if c1 = lq_vn then c1 := lq_n;
if c2 = lq_vn then c2 := lq_n;
end;}
if (c0 >= NCATS+2) or (c1 >= NCATS+2) or (c2 >= NCATS+2)
then begin result := 1/Total_Count/5;{5,10 same} exit end;{??}
if C1_C2_C3[c0, c1, c2] = 0
then result := 1/Total_Count*10 {1, 2,5 is better than 10}
else result := C1_C2_C3[c0, c1, c2]/C1_C2[c0, c1];
result := result * trigram_weight + prob2(c1, c2) * (1-trigram_weight);
end;
{$endif}
function TSegmentor.GenProb(pw: PWideChar; len, cat: integer): real; // P(w|c) 生成概率
var
h: TSegHashRec;
ww: array[0..31] of WideChar;
begin
// ?? 叠词还不准
case cat of
lq_send: begin result := 1; exit end; // 产生 </s>
lq_x, { x 字 } lq_g: { g } begin result := 1 / Total_Count; exit end;
end;
h := lqdic.existword(pw, len, cat);
if (h = nil) or (h.count = 0) // 训练语料库里没出现
then if cat = lq_nr
then if len > 1
then begin
if (h <> nil) or not iscc(pw^) // 现在允许英文人名
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -