📄 segmenter.pl

📁 中文分词算法。Perl语言编写。wordlist.txt为词库。
💻 PL
字号:
#!/usr/bin/perl

# Read in the lexicon
#
# Fills %cwords from wordlist.txt (one entry per line):
#   1 => the key is a complete lexicon entry
#   2 => the key is only the leading 4 bytes of some 6-byte entry and is
#        not itself an entry seen so far (a later line that lists it as a
#        real entry overwrites the 2 with 1)
# Only 6-byte entries register a prefix; the original file carried
# commented-out variants that also registered the 4/6/8-byte prefixes of
# 8- and 10-byte entries, and those remain disabled here.
#
# A lexical handle and a named loop variable are used so that loading this
# file does not clobber the caller's $_ or a global WRDS handle.
open(my $wrds, '<', "wordlist.txt") or die "Can't open wordlist: $!\n";
while (my $word = <$wrds>) {
    chomp $word;
    $word =~ s/\r\z//;        # tolerate CRLF line endings
    next if $word eq "";      # a blank line is not a word

    $cwords{$word} = 1;
    if (length($word) == 6) {
        my $prefix = substr($word, 0, 4);
        $cwords{$prefix} = 2 unless exists $cwords{$prefix};
    }
}
close($wrds);

# Numbers
#
# %cnumbers holds every 2-byte unit of $numbers; allnum() and segmentline()
# consult it to keep a run of number characters together as one token.
# NOTE(review): the 2-byte stride only yields whole characters when this
# file is stored in a double-byte encoding (presumably GB2312/GBK) --
# confirm the file encoding.
$numbers  = "零○一二三四五六七八九十百千万亿０１２３４５６７８９．点第";
$numbers .= "多半数几俩卅两壹贰叁肆伍陆柒捌玖拾伯仟";
# The counter is lexical so it does not leak into the package namespace.
for (my $n = 0; $n < length($numbers); $n += 2) {
    $cnumbers{substr($numbers, $n, 2)} = 1;
}

# Wide ASCII words
#
# %cascii holds every 2-byte unit of $wascii (full-width Latin letters plus
# two punctuation marks); allwascii() and segmentline() consult it to keep
# a run of such characters together as one token.
# NOTE(review): the 2-byte stride only yields whole characters when this
# file is stored in a double-byte encoding (presumably GB2312/GBK) --
# confirm the file encoding.
$wascii =  "ａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ．";
$wascii .= "ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺ－";
# The counter is lexical so it does not leak into the package namespace.
for (my $n = 0; $n < length($wascii); $n += 2) {
    $cascii{substr($wascii, $n, 2)} = 1;
}

# Foreign name transliteration characters
#
# %cforeign holds every 2-byte unit of $foreign; allforeign() and
# segmentline() consult it to keep a run of these characters together as
# one token. Some characters occur more than once in the table; that is
# harmless because they are only used as hash keys.
# NOTE(review): the 2-byte stride only yields whole characters when this
# file is stored in a double-byte encoding (presumably GB2312/GBK) --
# confirm the file encoding.
$foreign =  "阿克拉加内亚斯贝巴尔姆爱兰尤利西詹乔伊费杰罗纳布可夫福赫勒柯特";
$foreign .= "劳伦坦史芬尼根登都伯林伍泰胥黎俄科索沃金森奥霍瓦茨普蒂塞维大利";
$foreign .= "格莱德冈萨雷墨哥弗库澳马哈多兹戈乌奇切诺戴里诸塞吉基延科达塔博";
$foreign .= "卡雅来莫波艾哈迈蓬安卢什比摩曼乃休合赖米那迪凯莱温帕桑佩蒙博托";
$foreign .= "谢格泽洛及希卜鲁匹齐兹印古埃努烈达累法贾图喀土穆腓基冉休盖耶沙";
$foreign .= "逊宾麦华万";
# The counter is lexical so it does not leak into the package namespace.
for (my $n = 0; $n < length($foreign); $n += 2) {
    $cforeign{substr($foreign, $n, 2)} = 1;
}

# Chinese surnames
#
# %csurname and %uncommoncsurname hold every 2-byte unit of $surname and
# $uncommonsurname respectively. add_ChineseNames() treats a token found in
# either table as the possible start of a personal name; a token from the
# "uncommon" table is joined with a following 4-byte token only when that
# token is not itself a lexicon word.
# NOTE(review): the 2-byte stride only yields whole characters when this
# file is stored in a double-byte encoding (presumably GB2312/GBK) --
# confirm the file encoding.
$surname  = "艾安敖白班包宝保鲍贝毕边卞柏卜蔡曹岑柴昌常陈成程迟池褚楚";
$surname .= "储淳崔戴刀邓狄刁丁董窦杜端段樊范方房斐费丰封冯凤伏福傅盖甘";
$surname .= "高戈耿龚宫勾苟辜谷古顾官关管桂郭韩杭郝禾何贺赫衡洪侯胡花";
$surname .= "华黄霍稽姬吉纪季贾简翦姜江蒋焦晋金靳荆居康柯空孔匡邝况赖蓝";
$surname .= "郎朗劳乐雷冷黎李理厉利励连廉练良梁廖林凌刘柳隆龙楼娄卢吕鲁";
$surname .= "陆路伦罗洛骆麻马麦满茅毛梅孟米苗缪闵明莫牟穆倪聂牛钮农潘庞";
$surname .= "裴彭皮朴平蒲溥浦戚祁齐钱强乔秦丘邱仇裘屈瞿权冉饶任荣容阮";
$surname .= "瑞芮萨赛沙单商邵佘申沈盛石史寿舒斯宋苏孙邰谭谈汤唐陶滕";
$surname .= "田佟仝屠涂万汪王危韦魏卫蔚温闻翁巫邬伍武吴奚习夏鲜冼";
$surname .= "项萧解谢辛邢幸熊徐许宣薛荀颜阎言严彦晏燕杨阳姚叶蚁易殷银尹";
$surname .= "应英游尤於鱼虞俞余禹喻郁尉元袁岳云臧曾查翟詹湛张章招赵甄";
$surname .= "郑钟周诸朱竺祝庄卓宗邹祖左";
$uncommonsurname = "车和全时水同文席于";
# The counters are lexical so they do not leak into the package namespace.
for (my $n = 0; $n < length($surname); $n += 2) {
    $csurname{substr($surname, $n, 2)} = 1;
}
for (my $n = 0; $n < length($uncommonsurname); $n += 2) {
    $uncommoncsurname{substr($uncommonsurname, $n, 2)} = 1;
}

# Two-character (compound) surnames. Each one is registered both as a
# surname and as a complete lexicon word (value 1), so that segmentline()
# keeps it together as a single token.
foreach my $compound (
    "东郭", "公孙", "皇甫", "慕容", "欧阳", "单于",
    "司空", "司马", "司徒", "澹台", "诸葛",
) {
    $csurname{$compound} = 1;
    $cwords{$compound}   = 1;
}

# Not in name
#
# %cnotname holds every 2-byte unit of $notname (common function words and
# punctuation). add_ChineseNames() refuses to treat a token as part of a
# personal name when the whole token is a key of this table.
# NOTE(review): the 2-byte stride only yields whole characters when this
# file is stored in a double-byte encoding (presumably GB2312/GBK) --
# confirm the file encoding.
$notname  = "的说对在和是被最所那这有将会与於他为";
$notname .= "、：，。★〖〗（）⊙～【】—·？！“”　";
# The counter is lexical so it does not leak into the package namespace.
for (my $n = 0; $n < length($notname); $n += 2) {
    $cnotname{substr($notname, $n, 2)} = 1;
}


# Private helper for add_ChineseNames: starting at $pos, collect the maximal
# run of whitespace characters ($want_space == 1) or non-whitespace
# characters ($want_space == 0). Returns (run, position just past the run).
sub _scan_run {
    my ($text, $pos, $want_space) = @_;
    my $end = length($text);
    my $run = "";
    while ($pos < $end) {
        my $ch = substr($text, $pos, 1);
        my $is_space = ($ch =~ /\s/) ? 1 : 0;
        last if $is_space != $want_space;
        $run .= $ch;
        $pos++;
    }
    return ($run, $pos);
}

# Re-join personal names in an already segmented line.
#
# $tmpline is a line whose tokens are separated by whitespace. Whenever a
# token is a key of %csurname or %uncommoncsurname, the next two tokens are
# examined:
#   * surname + 2-byte token + 2-byte token -> all three are joined;
#   * surname + 2-byte token                -> those two are joined;
#   * surname + 4-byte token                -> those two are joined (for an
#     "uncommon" surname only when the 4-byte token is not a lexicon word).
# A token that is not entirely non-ASCII (isChinese) or that is a key of
# %cnotname never takes part in a name. Every other character is copied
# through unchanged.
#
# Side effect: each joined name is stored in %cwords with value 1, and the
# surname + first-given-character prefix of a three-character name is
# stored with value 2 unless that key already exists, so an entry that is
# already a full word is not demoted to a prefix marker.
#
# NOTE(review): %cnotname keys are 2-byte units, so the %cnotname test on a
# 4-byte token can never match as written -- confirm whether each character
# of that token was meant to be checked.
#
# Returns the rewritten line.
sub add_ChineseNames {
    my ($tmpline) = @_;
    my $tlen    = length($tmpline);
    my $newline = "";
    my $m       = 0;

    while ($m < $tlen) {
        my $tchar = substr($tmpline, $m, 1);
        if ($tchar =~ /^\s$/) {
            # Whitespace between tokens is copied as is.
            $newline .= $tchar;
            $m++;
            next;
        }

        my ($currtoken, $spaces, $currtoken2, $spaces2, $currtoken3);
        ($currtoken, $m) = _scan_run($tmpline, $m, 0);

        my $common   = defined $csurname{$currtoken};
        my $uncommon = defined $uncommoncsurname{$currtoken};
        if (!$common and !$uncommon) {
            $newline .= $currtoken;
            next;
        }

        # Found a surname: read "spaces token spaces token" after it.
        ($spaces,     $m) = _scan_run($tmpline, $m, 1);
        ($currtoken2, $m) = _scan_run($tmpline, $m, 0);
        ($spaces2,    $m) = _scan_run($tmpline, $m, 1);
        ($currtoken3, $m) = _scan_run($tmpline, $m, 0);

        my $usable2 = (isChinese($currtoken2)
                       and !defined $cnotname{$currtoken2});
        my $single2 = ($usable2 and length($currtoken2) == 2);
        my $double2 = ($usable2 and length($currtoken2) == 4);
        my $single3 = (isChinese($currtoken3)
                       and length($currtoken3) == 2
                       and !defined $cnotname{$currtoken3});

        if ($single2 and $single3) {
            # surname + two single-character tokens
            my $full  = $currtoken . $currtoken2 . $currtoken3;
            my $short = $currtoken . $currtoken2;
            $newline .= $full;
            $cwords{$full} = 1;
            # short version for checking; never demote an existing entry
            $cwords{$short} = 2 unless exists $cwords{$short};
        } elsif ($single2) {
            # surname + one single-character token
            $newline .= $currtoken . $currtoken2 . $spaces2 . $currtoken3;
            $cwords{$currtoken . $currtoken2} = 1;
        } elsif ($double2
                 and ($common or (($cwords{$currtoken2} || 0) != 1))) {
            # surname + one two-character token; an uncommon surname only
            # qualifies when that token is not itself a lexicon word
            my $short = $currtoken . substr($currtoken2, 0, 2);
            $newline .= $currtoken . $currtoken2 . $spaces2 . $currtoken3;
            $cwords{$currtoken . $currtoken2} = 1;
            # short version to check; never demote an existing entry
            $cwords{$short} = 2 unless exists $cwords{$short};
        } else {
            # Not a name: put everything back exactly as it was read.
            $newline .= $currtoken . $spaces . $currtoken2
                      . $spaces2 . $currtoken3;
        }
    }

    return $newline;
}


#sub cword_start {
#    my($tword) = @_;
#    if (grep(/^$tword/, @cwordlist) > 0) {
#	return 1;
#    } else {
#	return 0;
#    }
#}

# Return 1 when $cchar contains no 7-bit ASCII character (no code point
# below 128), 0 otherwise. An empty or undefined argument yields 1, as
# there is nothing in it that disqualifies it; callers pair this test with
# a length check.
#
# A single pattern match is used instead of a counted loop so that the
# package variable $b (reserved for sort comparators) is left alone, and so
# that a character above 0xFF is not wrapped to its low byte and mistaken
# for ASCII.
sub isChinese {
    my ($cchar) = @_;
    return 1 unless defined $cchar;
    return ($cchar =~ /[\x00-\x7F]/) ? 0 : 1;
}


# Return 1 when every 2-byte unit of $localnum is a key of %cnumbers,
# 0 otherwise. An odd trailing byte forms a 1-byte unit, which is looked up
# like any other and so normally fails. An empty string yields 1.
# Working variables are lexical so the call has no effect on package
# variables.
sub allnum {
    my ($localnum) = @_;
    for (my $k = 0; $k < length($localnum); $k += 2) {
        return 0 unless defined $cnumbers{substr($localnum, $k, 2)};
    }
    return 1;
}

# Return 1 when every 2-byte unit of $localstr is a key of %cascii,
# 0 otherwise. An odd trailing byte forms a 1-byte unit, which is looked up
# like any other and so normally fails. An empty string yields 1.
# Working variables are lexical so the call has no effect on package
# variables.
sub allwascii {
    my ($localstr) = @_;
    for (my $k = 0; $k < length($localstr); $k += 2) {
        return 0 unless defined $cascii{substr($localstr, $k, 2)};
    }
    return 1;
}

# Return 1 when every 2-byte unit of $localstr is a key of %cforeign,
# 0 otherwise. An odd trailing byte forms a 1-byte unit, which is looked up
# like any other and so normally fails. An empty string yields 1.
# Working variables are lexical so the call has no effect on package
# variables.
sub allforeign {
    my ($localstr) = @_;
    for (my $k = 0; $k < length($localstr); $k += 2) {
        return 0 unless defined $cforeign{substr($localstr, $k, 2)};
    }
    return 1;
}


# Segment one line of text into space-separated words.
#
# The line is scanned byte by byte. A byte above 127 is taken as the first
# byte of a 2-byte character; such characters are accumulated into the
# current word for as long as one of these holds, tested in this order:
#   1. the word plus the character is a complete lexicon word
#      (%cwords value 1);
#   2. the word consists of number characters and so does the character;
#   3. the word consists of wide-ASCII characters and so does the character;
#   4. the word consists of foreign-name characters, so does the character,
#      and the character plus the one after it is neither a lexicon word
#      nor a registered prefix;
#   5. the word plus the character is a registered prefix (%cwords value 2)
#      and adding the following character gives a lexicon word or prefix.
# Otherwise the current word is written out followed by a space and the
# character starts a new word. Bytes up to 127 end the current word and are
# copied through unchanged.
#
# The word still pending when the line ends is written out as well, so a
# line that ends in a 2-byte character (no trailing newline) does not lose
# its last word.
#
# The sub is declared without a prototype: it takes one argument, and an
# empty prototype would make a call with an argument fail to compile
# wherever the declaration is visible at compile time.
#
# Returns the segmented line after add_ChineseNames() has re-joined
# personal names.
sub segmentline {
    my ($line) = @_;

    my $chinaccum = "";     # word being accumulated
    my $outline   = "";     # segmented output so far
    my $linelen   = length($line);

    for (my $i = 0; $i < $linelen; $i++) {
        my $char1 = substr($line, $i, 1);

        if (unpack("C", $char1) > 127) {
            my $chinchar = substr($line, $i, 2);

            if ($chinaccum eq "") {
                # First 2-byte character after plain text: separate it
                # from what came before, unless it opens the line.
                $outline .= " " unless $i == 0;
                $chinaccum = $chinchar;
            } else {
                my $candidate = $chinaccum . $chinchar;
                my $flag      = $cwords{$candidate} || 0;
                # this character together with the one after it
                my $pair_flag = $cwords{substr($line, $i, 4)} || 0;
                # the character after this one ("" at end of line)
                my $next_char = ($i + 2 < $linelen)
                              ? substr($line, $i + 2, 2)
                              : "";
                my $longer    = $cwords{$candidate . $next_char} || 0;

                my $extend =
                       $flag == 1                       # is in lexicon
                    || (allnum($chinaccum)
                        && defined $cnumbers{$chinchar})
                    || (allwascii($chinaccum)
                        && defined $cascii{$chinchar})
                    || (allforeign($chinaccum)
                        && defined $cforeign{$chinchar}
                        && $pair_flag != 1 && $pair_flag != 2)
                    || ($flag == 2                      # starts a word
                        && ($longer == 1 || $longer == 2));

                if ($extend) {
                    $chinaccum .= $chinchar;
                } else {
                    $outline .= $chinaccum . " ";
                    $chinaccum = $chinchar;  # start anew
                }
            }
            $i++;   # step over the second byte of the character
        } else {
            # Plain ascii text: attach any accumulated word, then the byte.
            if ($chinaccum ne "") {
                $outline .= $chinaccum . " ";
                $chinaccum = "";
            }
            $outline .= $char1;
        }
    }

    # Flush the word that is still pending at end of line.
    $outline .= $chinaccum if $chinaccum ne "";

    return add_ChineseNames($outline);
}

1;
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -