📄 hmmseg.pas
字号:
unit hmmseg;
{$J+}
// 删除了姓"兵"
interface
uses {$ifdef win32}windows, {$endif}comlib, mystr;
// HMM seg/tag besed on liuqun's resource, but my algorithm
{ 2003-10-16 决定今天开始实现 HMM 分词标注算法。
词图是一个关键的数据结构。
}
const
MAXPRED = 16; // 够不够?
pos_foreign_name = 1;
pos_chinese_name = 2;
pos_chinese_name_double = 3; // 注意,对于词典中的人名,=0
pos_chinese_name_special_surname_last = 4; // 老梁,小张
pos_chinese_name_special_surname_first = 5; // 梁某,张总
{$ifdef segv2}NCATS = 56;{$else}NCATS = 48;{$endif}
{比 isNumberChar2 多了 “第”}{$ifdef linux}
Type4_NumberChars_j: PWideChar = '第零兆%分之半几俩卅两点%‰-/'; // 多 deleted to bend to PEOPLE DAILY
Type4_NumberChars_f: PWideChar = '第零%分之半幾倆卅兩點%‰-/'; // 多 deleted to bend to PEOPLE DAILY{$else} Type4_NumberChars_j: PWideChar = '第零〇兆%分之半几俩卅两点%‰-/'; // 多 deleted to bend to PEOPLE DAILY
Type4_NumberChars_f: PWideChar = '第零〇%分之半幾倆卅兩點%‰-/'; // 多 deleted to bend to PEOPLE DAILY{$endif}
//数, or 數 deleted
D: array[0..4] of PWideChar =
(
'0.123456789',
'0.·123456789',
'○O一二三四五六七八九十百廿千万亿', //千萬億 deleted from type 4
'壹贰叁肆伍陆柒捌玖拾伯仟',
nil // set dynamically
);
PUNS = ':;()[],."!<>{}=';
dt_Date = 1;
dt_Time = 2;
type
TSegHashRec = class
w: PWideChar;
count: integer; // 在训练语料中的出现次数
cat: integer;
next: TSegHashRec;
extra: integer; // search for _xBigram...
end;
TSegHashTable = class
ht: array[0..65535] of TSegHashRec;
nrec: integer; { number of records }
function hashrec(key: PWideChar; count, cat: integer): TSegHashRec;
function existword(key: PWideChar; len: integer): TSegHashRec; overload;
function existword(key: PWideChar; len: integer; var h: TSegHashRec): boolean; overload;
function existword(key: PWideChar; len, cat: integer): TSegHashRec; overload;
//function existword(key: PWideChar; len, cat: integer; var h: TSegHashRec): boolean; overload;
function existword(key: PWideChar; len, cat: integer; var extra: integer): boolean; overload;
// destructor Destroy; override;
// procedure Clear;
// procedure LoadFromFile(const filename: string);
end;
TTagMode = (tag_normal, tag_no, tag_namex, tag_neon, tag_only);
TSegmentor = class
dic: TSegHashTable;
C1_count: array[0..NCATS+1] of integer;
C1_C2: array[0..NCATS+1, 0..NCATS+1] of integer;
{$ifdef g3}
C1_C2_C3: array[0..NCATS+1, 0..NCATS+1, 0..NCATS+1] of integer;
{$endif}
longest_word: array [MinCC..MaxCC] of byte;
CC, jcc: TNameFlags;
np0, np1, np2, np3: TProb;
constructor Create(const datadir: string; traditional: boolean);
procedure InitMatrix(const datadir: string);
function Prob2(c1, c2: integer): real; // P(c2|c1) 转移概率
{$ifdef g3}
function Prob3(c0, c1, c2: integer): real; // P(c2|c0,c1) 转移概率
{$endif}
function GenProb(pw: PWideChar; len, cat: integer): real; // P(w|c) 生成概率
function LoadCoreDic(const datadir: string): TSegHashTable;
procedure LoadBigram(const datadir: string);
procedure LoadZsuffix(const datadir: string);
procedure LoadNprefix(const datadir: string);
procedure Load_name_suffix(const datadir: string);
procedure Load_VVO(const datadir: string);
procedure Load_AA(const datadir: string);
end;
TSegLang = (Chinese_S, Chinese_T);
var
tag_scheme: (tag_pku, tag_ldc, tag_863);
do_mwu, escape_space: boolean;
mwu: TWideStringList;
tagmode: TTagMode;
//mwu_attr: array of Integer;
seg_unit_count: integer;
segged_word_count: integer;
do_nt, split_surname, show_foreign_name: boolean;
calc_prob, show_prob: boolean;
seg_f, seg_j: TSegmentor;
very_verbose: boolean = false;
very_very_verbose: boolean = false; names_: widestring = ''; segtagdir: string; msr_mode: boolean = false; segdatadir: string; strict_name: boolean = false; just_tag: boolean = false;procedure InitSegmentors;
procedure SelectSegmentor(_traditional: boolean);
procedure InitSegmentor1(lang: TSegLang);
function SegTag(pw: PWideChar; len: integer): string;
{$ifdef dll}
type
SegRec = record
start, len, cat: integer;
end;
PSegRec = ^SegRec;
var
_x_nwords: integer;
_x_buf, _xxbuf: PSegRec;
function segtag_1(pw: PWideChar; len: integer; buf: PSegRec; bufsize: integer): integer; stdcall;
function is_cat(cat: integer; catstr: PChar; len: integer): boolean; stdcall;
function tag2str(cats: PChar; ncats: integer; catstr: PChar; len: integer): boolean; stdcall;
function code2index(const s: string): integer; stdcall;
{$endif}
procedure True_UW(verbose: boolean);
procedure calc_prob2(gram: integer; const c1s, c2s, c3s: string);
function pku_code_to_index(const S: string): integer;
const
{ 863 评测标记:
普通名词:n 时间名词:nt 方位名词:nd 处所名词:nl
人名:nh 地名:ns 团体、机构、组织的专名:ni
其它专名:nz 动词:v 形容词:a 区别词:b
副词:d 数词:m 量词:q 代词;r
介词:p 连词:c 叹词:e 拟声词:o
助词:u 前接成分:h 后接成分:k 习用语:i
简称:j 语素字:g 非语素字:x 标点:wp
字符串:ws
要从 pku 转换:
t nt
f nd
s nl
l i
w wp
y u
z a
nr nh
nt ni
u. u
ad d
.g g
an n
nx ws
vd d
vn n
}
// because of data sparseness, some pos tags are equated with others
// but let's wait to see if this really improves peformance
equ_pos: array[0..1] of record t1, t2: string end = (
(t1: 'dy'; t2: 'd'),
(t1: 'rn'; t2: 'n')
);
pos863: array[2..NCATS+1] of string = (
{$ifdef segv2}
'a', 'a', 'g', 'a', 'b', 'b', 'c', 'd', 'd','d', 'e', 'nd', 'h', //14
'i', 'j', 'k', 'l', 'm', 'm', 'n', 'g', 'nh', 'nh', 'ns', 'ni', 'ws', //27
'nz', 'o', 'p', 'q', 'r', 'r', 'n', 'r', 'r', 'r', 'nl', 'nt', //39
'nt', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'v', 'd', 'g', 'v', //51
'r', 'r', 'wp', 'u', 'u', 'a');
{$else}
'a', 'a', 'g', 'a', 'b', 'g', 'c', 'd', 'g', 'e', 'nd', 'h', //13
'i', 'j', 'k', 'l', 'm', 'g', 'n', 'g', 'nh', 'ns', 'ni', 'ws', //25
'nz', 'o', 'p', 'q', 'r', 'g', 's', 'nt', 'g', 'u', 'u', 'u', //37
'u', 'u', 'u', 'u', 'v', 'v', 'g', 'v', 'wp', 'u', 'g', 'z');
{$endif}
pku_pos_codes: array[2..NCATS+1] of string = (
{$ifdef segv2}
'a', 'ad', 'ag', 'an', 'b', 'bg', 'c', 'd', 'dg','dy', 'e', 'f', 'h', //14
'i', 'j', 'k', 'l', 'm', 'mg', 'n', 'ng', 'nr', 'nrf', 'ns', 'nt', 'nx', //27
'nz', 'o', 'p', 'q', 'rd', 'rg', 'rn', 'rr', 'ry', 'rz', 's', 't', //39
'tg', 'u', 'ud', 'ug', 'uj', 'ul', 'uv', 'uz', 'v', 'vd', 'vg', 'vn', //51
'vry', 'vrz', 'w', 'y', 'yg', 'z');
// rg in ancient chinese, means 大/ad 破/v 之/rg
// ng 表示年号
{$else}
'a', 'ad', 'ag', 'an', 'b', 'bg', 'c', 'd', 'dg', 'e', 'f', 'h', //13
'i', 'j', 'k', 'l', 'm', 'mg', 'n', 'ng', 'nr', 'ns', 'nt', 'nx', //25
'nz', 'o', 'p', 'q', 'r', 'rg', 's', 't', 'tg', 'u', 'ud', 'ug', //37
'uj', 'ul', 'uv', 'uz', 'v', 'vd', 'vg', 'vn', 'w', 'y', 'yg', 'z');
{$endif}
posldc: array[2..NCATS+1] of string = (
'VA', 'AD', 'VA', 'NN', 'JJ', 'JJ', 'CC', 'AD', 'AD','AD', 'IJ', 'LC', 'h', //14
'i', 'NN', 'k', 'DEC', 'CD', 'CD', 'NN', 'NN', 'NR', 'NR', 'NR', 'NR', 'FW', //27
'NR', 'ON', 'P', 'M', 'DT', 'PN', 'NN', 'PN', 'PN', 'DT', 'NN', 'NT', //39
'NT', 'u', 'DER', 'AS', 'DEG', 'AS', 'DEV', 'AS', 'VV', 'VV', 'VV', 'NN', //51
'VV', 'VV', 'PU', 'SP', 'SP', 'JJ');
lq_sbegin = 0;
lq_send = 1;
lq_a = 2;
lq_b = 6;
lq_c = 8;
lq_d = 9;
{$ifdef segv2}
lq_f = 13;
lq_h = 14;
lq_j = 16;
lq_k = 17;
lq_DEC = 18; //!!
lq_m = 19;
lq_n = 21;
lq_ng = 22;
lq_nr = 23;
lq_nrf = 24;
lq_ns = 25;
lq_nt = 26;
lq_nx = 27;
lq_nz = 28;
lq_o = 29;
lq_p = 30;
lq_q = 31;
lq_r = 37; // lq_rz
lq_rz = 37;
lq_s = 38;
lq_t = 39;
lq_tg = 40;
lq_u = 41;
lq_uj = 44; // 的
lq_ul = 45;
lq_uz = 47;
lq_v = 48;
lq_vn = 51;
lq_w = 54;
lq_y = 55;
lq_z = 57;
{$else}
lq_f = 12;
lq_m = 18;
lq_n = 20;
lq_ng = 21;
lq_nr = 22;
lq_nrf = lq_nr;
lq_ns = 23;
lq_nx = 25;
lq_nz = 26;
lq_p = 28;
lq_q = 29;
lq_r = 30;
lq_t = 33;
lq_v = 42;
lq_vn = 45;
lq_w = 46;
lq_z = 49;
{$endif}
lq_x = NCATS+2;
lq_g = NCATS+3;
implementation
uses
SysUtils,
Math,
StrUtils,
IniFiles,
segapi;
type
TPOS_PRED = record
pos: integer; // 采用刘群的词性代码
lex: TSegHashRec;
predNode: byte;
best_pred_cat_index: byte;
x: integer; { 这个词的其他信息,如是不是外国人名 }
score: real;
end;
TStateNode = record
predList: array [0..MAXPRED] of TPOS_PRED;
npred: byte; // 对每一种词性,记录一个最佳(路径最短)的前驱
end;
Ttime_f_jt = array[0..3] of string;
const
time_f_jt: Ttime_f_jt = ('前','后', '以前', '以后');
time_f_ft: Ttime_f_jt = ('前','後', '以前', '以後');
var
wGraph: array[0..MAXSEGWORDS] of TStateNode;
slen, Total_Count: integer;
traditional: boolean;
Type4CharsHead, month_day_hour, WrongLastNumberChars: PWideChar;
dian, shi: WideChar;
c_sijian, c_zone_1: string;
time_f: Ttime_f_jt;
transliterated_foreign_names: TSimpleWideStringList;
foreign_names, foreign_org_names: TSimpleStringList;
procedure TSegmentor.InitMatrix(const datadir: string);
const c1c2file={$ifdef segv2}'c1c2.txt'{$else}'lq_c1c2.txt'{$endif};
var
f: text;
i, j, k: integer;
begin
assign(f, datadir + c1c2file);
try
reset(f);
except
error('cannot find ' + c1c2file);
end;
Total_count := 0;
for i := 0 to NCATS+1 do
begin
read(f, C1_count[i]);
{$ifdef segv2}if c1_count[i] = 0 then inc(c1_count[i]);{$endif} // vrz
inc(Total_Count, C1_count[i]);
end;
for i := 0 to NCATS+1 do for j := 0 to NCATS+1 do read(f, C1_C2[i, j]);
close(f);
{$ifdef g3}
C1_C2[0, 0] := c1_count[0]; // lq_sbegin = 0
assign(f, datadir + 'c1c2c3.txt');
try
reset(f);
except
error('cannot find c1c2c3.txt');
end;
for i := 0 to NCATS+1 do
for j := 0 to NCATS+1 do
for k := 0 to NCATS+1 do
read(f, C1_C2_C3[i, j, k]);
close(f);
{$endif}
end;
const
_xBigram = 1; // 1: word bigram determines the cat of second word
_xNS = 2; // 2: n]ns
_xNPre = 4; // 4: name prefix
_xNSuf = 8; // 8: name suffix
_xVVO = 16; // 16: vvo
_xNT = 32; // 32: n]nt
_xNZ = 64; // 64: n]nz
_xAA = 128;
_xABB = 256;
_xK = 512;
_xDate = 1024;
_xTime = 2048;
_xJ = 4096;
_xNum = 8192;
_xAABB = 16384;
_xRenVerb = 32768;
_xName = 65536;
_xForeignName = 131072;
var
seg_x: TSegmentor;
lqdic: TSegHashTable;
function TSegHashTable.hashrec(key: PWideChar; count, cat: integer): TSegHashRec;
var
h1: word;
len: integer;
function InsertKey(var hr: TSegHashRec): TSegHashRec;
begin
if hr = nil { empty bucket }
then begin
hr := TSegHashRec.Create;
//getmem(hr.w, (len + 1) shl 1);
//lstrcpyw(hr.w, key);
hr.w := key;
hr.count := count;
hr.cat := cat;
hr.next := nil;
hr.extra := 0;
result := hr;
inc(nrec);
end
else if CompareMem(hr.w, key, (len+1) shl 1) and (hr.cat = cat)
then inc(hr.count, count) {check repetition first }
else result := InsertKey(hr.next)
end;
begin
len := wstrlen(key);
h1 := word(_hash(PChar(key), len shl 1));
result := InsertKey(ht[h1]);
end;
function TSegHashTable.existword(key: PWideChar; len: integer): TSegHashRec;
begin
len := len shl 1;
result := ht[word(_hash(PChar(key), len))];
while result <> nil do
begin
if CompareMem(result.w, key, len) {and (key[len]=#0)} then exit;
result := result.next;
end;
end;
function TSegHashTable.existword(key: PWideChar; len, cat: integer): TSegHashRec;
begin
len := len shl 1;
result := ht[word(_hash(PChar(key), len))];
while result <> nil do
begin
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -