📄 hmmseg.pas
字号:
then result := 1 / Total_Count
else begin
result := name_prob(pw, len)/256{512{32}; // 根据“分割成词”
if result = 0 then result := 1 / Total_Count/5; //??应该用1/C1_Count[i1]/5 ??
end;
// 刘群的词典里有很多这样的 nr
if do_mwu then
begin
move(pw^, ww, len shl 1);
ww[len] := #0;
if mwu.Find(ww, len) then result := result * 4;
end;
end
else result := 1/C1_Count[cat]/5
else result := 1/C1_Count[cat]/5
else begin result := h.count/C1_Count[cat]; {if cat = lq_nrf then result := result / 10 {何长工}end;
if calc_prob then writeln(WideCharLenToString(pw, len), '/', tagnamex(cat), '=',result:6:4);
end;
function get_ldc_word_tag(pw: PWideChar; len, cat, next_cat: integer): string;
var
i: integer;
tag: string;
u: array[0..255] of char;
begin
{$ifdef linux}
UnicodeToUtf8(u, sizeof(u), pw, len);
result := u;
{$else}
result := WideCharLenToString(pw, len);
{$endif}
tag := tagname(cat);
case cat of
lq_u:
if oneof(result, ['等', '等等'])
then tag := 'ETC'
else if result = '所'
then tag := 'MSP'
else if result = '之'
then tag := 'DEG'
else tag := 'LC';
lq_c: if oneof(result, ['虽然','因为','如果','若','尽管','假如','若是','要是','只有','即使','只要','一旦'])
then tag := 'CS'
else if oneof(result, ['但','但是'])
then tag := 'AD';
lq_m: if wstrscan('第首', pw^) <> nil then tag := 'OD' else
if oneof(result, ['全部']) then tag := 'DT';
lq_v: if oneof(result, ['是','为','非']) then tag := 'VC' else
if oneof(result, ['有','没','没有','无']) then tag := 'VE';
lq_p: if oneof(result, ['把','将']) then tag := 'BA' else
if oneof(result, ['被','给']) then
if next_cat = lq_v
then tag := 'SB'
else tag := 'LB';
lq_a: if next_cat in [lq_n, lq_m, lq_vn] then tag := 'JJ' else tag := 'VA';
lq_d: if (next_cat = lq_v) and (result = '没有') then tag := 'VV';
lq_b: if (result = '所有') then tag := 'DT';
lq_rz: if next_cat in [lq_v, lq_d, lq_p] then tag := 'PN';
lq_f: if (next_cat = lq_q) and oneof(result, ['上']) then tag := 'DT' else
if (next_cat = lq_m) and (result = '最后') then tag := 'JJ';
end;
result := result + '/' + tag;
end;
function get_word(pw: PWideChar; len: integer): string;
var
i: integer;
w: array[0..255] of WideChar;
u: array[0..255] of char;
begin
{$ifdef linux}
UnicodeToUtf8(u, sizeof(u), pw, len);
result := u;
{$else}
result := WideCharLenToString(pw, len);
{$endif}
if not escape_space then exit;
// check for ' ', '/', '\'
i := 0;
while i < len do
if ord(pw[i]) in [ord(' '), ord('/'), ord('\')] then break else inc(i);
if i = len then exit;
move(pw^, w, len);
repeat
case w[i] of
' ': begin move(w[i+1], w[i+2], len-1-i); w[i] := '\'; inc(i); w[i+1] := 's'; inc(len); end;
'/': begin move(w[i+1], w[i+2], len-1-i); w[i] := '\'; inc(i); w[i+1] := 'f'; inc(len); end;
'\': begin move(w[i+1], w[i+2], len-1-i); w[i] := '\'; inc(i); w[i+1] := '\'; inc(len); end;
end;
inc(i);
until i >= len;
end;
function utf8encodelen(pw: PWideChar; len: integer): utf8string;
var
u: array[0..255] of char;
begin
UnicodeToUtf8(u, sizeof(u), pw, len);
result := u;
end;
function constraining_bigram(pw: PWideChar; p0, n0, len: integer; c2: integer): boolean;
var
len2: integer;
w: array[0..31] of WideChar;
h: comlib.PHashRec;
begin
// 根据“把/p 这/r 篇/q 报道/v 编辑/v 一/m 下/q”调整
// p0 n0
// “报道”应为名词
len2 := n0-p0;
if len2 + len >= 30 then begin result := false; exit end;
move(pw[p0], w, len2 shl 1);
w[len2] := ',';
inc(len2);
move(pw[n0], w[len2], len shl 1);
inc(len2, len);
h := bigram.hashrec(utf8encodelen(w, len2), false);
result := (h <> nil) and (h.data = c2);
end;
function tagnamex(c: integer): string;
begin
case c of
0: result := '<s>';
1: result := '</s>';
else result := tagname(c)
end;
end;
function dt_type(pw: PWideChar; len: integer): integer;
var
h: TSegHashRec;
begin
h := lqdic.existword(pw, len);
if (h <> nil) then
if (h.extra and _xDate <> 0)
then result := dt_Date
else if (h.extra and _xTime <> 0)
then result := dt_Time
else if (len=1) and (wstrscan('春夏秋冬', pw^) <> nil)
then result := dt_Date
else result := 0
else if wstrscan('年月日天份', pw[len-1]) <> nil
then result := dt_Date
else if (pw[len-1]=shi) or (pw[len-1]=dian) or (wstrscan('半分晚夜', pw[len-1]) <> nil)
then result := dt_Time
else result := 0
end;
function find_time(const s1, s2: string; var j: integer): boolean;
begin
if s1 = c_sijian then begin j := 1; result := true end else
if oneof(s1, [c_zone_1,'西部']) and (s2 = c_sijian)
then begin j := 2; result := true end
else result := false
end;
function istime(pw: PWideChar; len: integer; var time_len: integer): boolean;
// 2004-09-01 08:35
// 2004-08-22 11:35:07
var ndash, ncolon: integer;
begin
result := false;
time_len := 0;
ndash := 0; ncolon := 0;
while (time_len < len) do
begin
if pw[time_len] < #128
then case ord(pw[time_len]) of
ord('0')..ord('9'), ord(' '): ;
ord('-'): inc(ndash);
ord(':'): inc(ncolon);
else exit;
end
else exit;
inc(time_len);
end;
result := (ndash in [0,2]) and (ncolon > 0);
if result then
begin
repeat dec(time_len) until pw[time_len] <> ' ';
inc(time_len);
end;
end;
function isfraction(pw: PWideChar; len: integer; var time_len: integer): boolean;
// 1/3
var nslash: integer;
begin
result := false;
time_len := 0;
nslash := 0;
while (time_len < len) do
begin
if pw[time_len] < #128
then case ord(pw[time_len]) of
ord('0')..ord('9'): ;
ord('/'): inc(nslash);
else exit;
end
else exit;
inc(time_len);
end;
result := (nslash = 1);
if result then
begin
repeat dec(time_len) until pw[time_len] <> ' ';
inc(time_len);
end;
end;
var names_loaded: boolean = false;
freq0, freq1, freq2, freq3: array[mincc..maxcc] of integer;
total0, total1, total2, total3: integer;
procedure load_names;
var f: text;
w,w2: widestring;
lineno, i:integer;
begin
guarded_reset(f, segtagdir + 'n');
while not eof(f) do
begin
readln(f, w);
inc(lineno);
{ check validness }
for i := 1 to length(w) do
if (w[i] = ' ') or isCC(w[i])
then
else begin
writeln('bad cc in line ', lineno, ': ', w);
halt;
end;
if w[2] = ' ' then
begin
inc(freq0[w[1]]); inc(total0);
w2 := copy(w, 3, 2);
end else
begin
w2 := copy(w, 4, 2);
{ s := copy(w, 1, 2);
if n2.indexof(s) = -1 then
begin
n2.add(s);
write(s, ' ');
end;}
end;
case length(w2) of
1: begin inc(freq3[w2[1]]); inc(total3); end;
2: begin
inc(freq1[w2[1]]); inc(total1);
inc(freq2[w2[2]]); inc(total2);
end
end
end;
close(f);
end;
function new_gen_prob(pw: PWideChar; len: integer): real;
begin
if not names_loaded then begin load_names; names_loaded := true end;
case len of
2: result := freq0[pw^]/total0*freq3[(pw+1)^]/total3;
3: result := freq0[pw^]/total0*freq1[(pw+1)^]/total1*freq2[(pw+2)^]/total2;
4: result := 0.5*freq1[(pw+2)^]/total1*freq2[(pw+3)^]/total2;
else error('bad names!');
end;
end;
(*
var
allcc: array[MinCC..MaxCC] of integer;
total_freq: integer;*)
function gen_name(pw: PWideChar; len: integer): real;
(*const inited: boolean = false;
var s: string;
w: widechar;
f: text;
wch: WideChar;
count: integer;*)
var h, h1: TSegHashRec;
begin
// p := name_prob(name|史晓东) * p(史晓东)/p(name)
result := name_prob(pw, len)/512; //17:36 2004-11-11
//result := new_gen_prob(pw, len)/64; {tested 1,2,4,8,...256}
// this one is slightly better than the above formula for person name recognition
// but for other NE, slightly inferior results are obtained
// uniform prob as 3e-10 is not good, about 75% f-measure
if (len = 3) and lqdic.existword(pw+1, 2, h) then
begin
len := 0; h1 := h;
repeat
inc(len, h.count);
h := h.next;
until (h = nil) or not CompareMem(h.w, h1.w, 2);
result := result / (len+1)/50;
end;
(*
if not inited then
begin
guarded_reset(f, segtagdir + 'cc.txt');
fillchar(cc, sizeof(cc), 0);
while not eof(f) do
begin
readln(f, s);
if s[3] = ' ' then
begin
MultiByteToWideChar(0, 0, PChar(s), 2, @w, 1); { 1 wide char }
allcc[w] := StrToInt(copy(s, 4, maxint));
end;
end;
close(f);
total_freq := 0;
for wch := MinCC to MaxCC do inc(total_freq, allcc[wch]);
inited := true
end;
dec(len);
while (len >= 0) do
begin
count := allcc[pw^];
if count = 0 then inc(count);
result := result * count / total_freq; break;
dec(len);
end;*)
end;
function SegTag1(pw: PWideChar; len: integer): string;
var
i, j, k, n0, best: integer;
min_score, score, p, p0: real;
{$ifdef g3}c0, {$endif}_n0, c1, c2: integer;
s: string;
ww: array[0..31] of WideChar;
h: TSegHashRec;
usebigram: boolean;
words: array[0..255] of record
start, len: integer;
cat: integer;
x: integer;
end;
nwords: integer;
name_ok, name_suffix, nt_found: boolean;
label
label_time, labeltime;
begin
if calc_prob
then CreateWGraph2(pw, len)
else if just_tag
then CreateWGraph3(pw, len)
else begin
slen := len;
CreateWGraph(pw, len);
end;
// Vertibi 算法?
for i := 1 to slen do
with wGraph[i] do
begin
if npred = 0 then continue; // 比如一串数字
//mywriteln(format('at %d', [i]));
for j := 0 to npred - 1 do
begin
n0 := predList[j].predNode;
min_score := MaxDouble;
c2 := predList[j].pos;
usebigram := false;
name_suffix := false;
if very_very_verbose then write(get_word(@pw[n0], i-n0) + '/'+tagnamex(c2));
// 正规的HMM 中,生成概率与前一个词无关
if (c2 = lq_nr) and (predList[j].x in [pos_foreign_name, pos_chinese_name,
pos_chinese_name_double, pos_chinese_name_special_surname_first,
pos_chinese_name_special_surname_last])
then begin
case predList[j].x of
pos_foreign_name: p := (i-n0)*ln(0.0009);
// 根据“韦斯利·哈里斯”, "路易斯安娜"
// 调整,比这个数值小则不能识别为一个完整人名
// 以上调整后为0.0006
// 根据语料库改为0.001
pos_chinese_name_special_surname_last:
p := ln(np0[pw[n0+1]] * 0.00002); //根据“老梁顿感不妙”调整
pos_chinese_name_special_surname_first:
p := ln(np0[pw[n0]] * 0.00003); //根据“钱总可以说是我们推荐的”调整
else begin
// the following is mathematically WRONG!!!
// should use p := name_prob(name|史晓东) * p(史晓东)/p(name)
p := gen_name(@pw[n0], i-n0);
// but the result is inferior!!
//p := name_prob(@pw[n0], i-n0)/{256}512{1024{2048}; // 根据“分割成词”
// 512 是根据人民日报语料库调整,当时
// 如选256: F=97.98%
// 512: F=97.99%
// 1024:F=97.99% 2004/08/18
if p = 0 then p := 1 / Total_Count/5; //??应该用1/C1_Count[i1]/5 ??
if (i-n0=3) and (lqdic.existword(@pw[n0], 2) <> nil)
then p := p / 2; {阳光射进来}
p := ln(p);
//writeln(WideCharLenToString(@pw[n0], i-n0));
end;
end;
// check name prefix and suffix
//k := wGraph[n0].best_pred_cat_index;
_n0 := wGra
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -