📄 pub_splitword_www.php
字号:
<?php
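/*******************************
//DedeCMS word segmentation algorithm - www.dedecms.com
//Segmentation with part-of-speech flags. Usage:
//  $tryNumName enables recognition of numerals/quantifiers and personal names,
//  $tryDiff enables ambiguity resolution
//$strok = $sp->SplitRMM(string $str,bool $tryNumName,bool $tryDiff)
//The dictionary used inside DedeCms was selected from popular words and is not suited to general-purpose segmentation
//This algorithm has been simplified; some of the disambiguation rules were removed
********************************/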
class SplitWord
{
// Word dictionary filled by the constructor: RankDic[byte length of word][word] = second field of the dictionary line.
var $RankDic = Array();
// Lookup table of single-character surnames (keys are two-byte strings), filled by the constructor.
var $OneNameDic = Array();
// Lookup table of two-character surnames, filled by the constructor.
var $TwoNameDic = Array();
var $NewWord = Array();
// Text currently being segmented (set by SetSource()).
var $SourceString = '';
// Accumulated segmentation result.
var $ResultString = '';
var $SplitChar = ' '; // separator placed between output tokens
var $SplitLen = 4; // segments of at most this many bytes are kept without dictionary matching
// Regex alternation of characters that are split off / not absorbed into names.
var $EspecialChar = "和|的|是";
var $NewWordLimit = "在|的|与|或|就|你|我|他|她|有|了|是|其|能|对|地";
// Common measure words; more can be added here as needed.
// Intent (per the original author): if a segment starts with one of these and the
// previous segment is a numeral, the two are joined into a single token.
var $CommonUnit = "年|月|日|时|分|秒|点|元|百|千|万|亿|位|辆";
// NOTE(review): this list has no "B" and contains "s " with a trailing space - looks like typos, confirm before relying on it.
var $CnNumber = "0|1|2|3|4|5|6|7|8|9|+|-|%|.|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s |t|u|v|w|x|y|z|A|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z";
// Regex alternation of Chinese numeral characters.
var $CnSgNum = "一|二|三|四|五|六|七|八|九|十|百|千|万|亿|数";
var $MaxLen = 13; // longest dictionary word is 7 Chinese characters; the value is the highest byte index of such a word
var $MinLen = 3; // shortest word is 2 Chinese characters; the value is the highest byte index of such a word
// The two surname source strings that follow are consumed and then unset by the constructor.
var $CnTwoName = "端木 南宫 谯笪 轩辕 令狐 钟离 闾丘 长孙 鲜于 宇文 司徒 司空 上官 欧阳 公孙 西门 东门 左丘 东郭 呼延 慕容 司马 夏侯 诸葛 东方 赫连 皇甫 尉迟 申屠";
var $CnOneName = "赵钱孙李周吴郑王冯陈褚卫蒋沈韩杨朱秦尤许何吕施张孔曹严华金魏陶姜戚谢邹喻柏水窦章云苏潘葛奚范彭郎鲁韦昌马苗凤花方俞任袁柳酆鲍史唐费廉岑薛雷贺倪汤滕殷罗毕郝邬安常乐于时傅皮卡齐康伍余元卜顾孟平黄穆萧尹姚邵堪汪祁毛禹狄米贝明臧计伏成戴谈宋茅庞熊纪舒屈项祝董粱杜阮蓝闵席季麻强贾路娄危江童颜郭梅盛林刁钟徐邱骆高夏蔡田樊胡凌霍虞万支柯咎管卢莫经房裘缪干解应宗宣丁贲邓郁单杭洪包诸左石崔吉钮龚程嵇邢滑裴陆荣翁荀羊於惠甄魏加封芮羿储靳汲邴糜松井段富巫乌焦巴弓牧隗谷车侯宓蓬全郗班仰秋仲伊宫宁仇栾暴甘钭厉戎祖武符刘姜詹束龙叶幸司韶郜黎蓟薄印宿白怀蒲台从鄂索咸籍赖卓蔺屠蒙池乔阴郁胥能苍双闻莘党翟谭贡劳逄姬申扶堵冉宰郦雍郤璩桑桂濮牛寿通边扈燕冀郏浦尚农温别庄晏柴翟阎充慕连茹习宦艾鱼容向古易慎戈廖庚终暨居衡步都耿满弘匡国文寇广禄阙东殴殳沃利蔚越夔隆师巩厍聂晁勾敖融冷訾辛阚那简饶空曾沙须丰巢关蒯相查后江游竺";
//------------------------------
//php4构造函数
//------------------------------
/**
 * PHP 4 style constructor: forwards its argument to __construct().
 *
 * @param bool $loaddic passed through unchanged to __construct()
 */
function SplitWord($loaddic=true)
{
    $this->__construct($loaddic);
}
//------------------------------
//php5构造函数
//------------------------------
/**
 * PHP 5 constructor.
 *
 * When $loaddic is true this builds the surname lookup tables from
 * $CnOneName / $CnTwoName (both are unset afterwards) and loads the word
 * dictionary from data/dede_wwwdic.csv next to this file into $RankDic.
 * Pass false when segmentation is not needed, to skip all loading.
 *
 * A missing or unreadable dictionary file raises one E_USER_WARNING and
 * leaves $RankDic empty instead of passing an invalid handle to
 * fgets()/fclose().
 *
 * @param bool $loaddic whether to load the surname tables and the dictionary
 */
function __construct($loaddic=true)
{
    if(!$loaddic) return;

    // Single-character surnames: the list is consumed two bytes at a time
    // (one double-byte character per entry). A dangling odd byte is ignored.
    // The isset() guards keep a repeated call harmless after the unset() below.
    if(isset($this->CnOneName)){
        $nameBytes = $this->CnOneName;
        $nameLen = strlen($nameBytes);
        for($i=0;$i+1<$nameLen;$i+=2){
            $this->OneNameDic[$nameBytes[$i].$nameBytes[$i+1]] = 1;
        }
    }
    // Two-character surnames are stored space separated.
    if(isset($this->CnTwoName)){
        foreach(explode(" ",$this->CnTwoName) as $n){ $this->TwoNameDic[$n] = 1; }
    }
    unset($this->CnTwoName);
    unset($this->CnOneName);

    // Preload the dictionary so later lookups are plain array accesses.
    $dicfile = dirname(__FILE__)."/data/dede_wwwdic.csv";
    $fp = is_readable($dicfile) ? fopen($dicfile,'r') : false;
    if($fp === false){
        trigger_error("SplitWord: cannot open dictionary file ".$dicfile, E_USER_WARNING);
        return;
    }
    // Compare against false explicitly so a line such as "0" does not end the loop early.
    while(($line = fgets($fp,64)) !== false){
        $ws = explode(' ',$line);
        // Skip lines without a second, space separated field.
        if(!isset($ws[1])) continue;
        $this->RankDic[strlen($ws[0])][$ws[0]] = $ws[1];
    }
    fclose($fp);
}
//--------------------------
//Release resources
//--------------------------
/**
 * Drops the loaded word dictionary by unsetting the RankDic property.
 */
function Clear()
{
    unset($this->RankDic);
}
//----------------------------
//设置源字符串
//----------------------------
/**
 * Stores the text to segment and resets the previous result.
 *
 * The text is passed through ReviseString() and trimmed before it is
 * stored in $SourceString; $ResultString is reset to an empty string.
 *
 * @param string $str text to segment
 */
function SetSource($str)
{
    $revised = $this->ReviseString($str);
    $this->ResultString = "";
    $this->SourceString = trim($revised);
}
//-----------------------------
//检查字符串是否不存在中文
//-----------------------------
/**
 * Tells whether a string does NOT start with a high (> 0x80) byte.
 *
 * Only the first byte is inspected; the coarse split has already grouped
 * the text, so checking every byte is unnecessary.
 *
 * @param string $str segment to test
 * @return bool|string true when the first byte is <= 0x80, false when it is
 *                     above 0x80, and "" for an empty string
 */
function NotGBK($str)
{
    if($str=="") return "";
    return ord($str[0]) <= 0x80;
}
//-----------------------------
//RMM分词算法
//-----------------------------
/**
 * Segments text and returns the tokens joined by $this->SplitChar.
 *
 * The text is first coarsely split by ReviseString() into space separated
 * segments, which are then processed from last to first:
 *  - segments not starting with a high byte, lone bytes, book-title and
 *    punctuation segments are copied unchanged;
 *  - short segments (at most $SplitLen bytes) get a trailing $EspecialChar
 *    character split off and, when they start with a $CommonUnit character,
 *    are joined to the preceding segment;
 *  - longer segments are handed to RunRMM().
 *
 * Both alternation lists are wrapped in a group before anchoring. Without
 * the group, "^" or "$" applies to a single alternative only, so any short
 * segment merely containing one of the characters was treated as a match.
 *
 * @param string $str        text to segment; when empty, the text given earlier to SetSource() is used
 * @param bool   $tryNumName forwarded to RunRMM()
 * @param bool   $tryDiff    forwarded to RunRMM()
 * @return string the segmented text, or "" when there is nothing to segment
 */
function SplitRMM($str="",$tryNumName=true,$tryDiff=true)
{
    if($str!="") $this->SetSource(trim($str));
    if($this->SourceString=="") return "";

    // Coarse split of the text.
    $this->SourceString = $this->ReviseString($this->SourceString);
    $spwords = explode(" ",$this->SourceString);
    $spc = $this->SplitChar;

    // Loop-invariant patterns; the (?:...) group makes the anchor cover every alternative.
    $tailPattern = "/(?:".$this->EspecialChar.")$/";
    $unitPattern = "/^(?:".$this->CommonUnit.")/";

    for($i=count($spwords)-1;$i>=0;$i--){
        if(trim($spwords[$i])=="") continue;

        // Segment does not start with a high byte: keep as is.
        if($this->NotGBK($spwords[$i])){
            $this->ResultString = $spwords[$i].$spc.$this->ResultString;
            continue;
        }
        // A single high byte cannot form a double-byte character: keep as is.
        if(!isset($spwords[$i][1])){
            $this->ResultString = $spwords[$i].$spc.$this->ResultString;
            continue;
        }

        $c = $spwords[$i][0].$spwords[$i][1];
        $n = hexdec(bin2hex($c));
        // Book-title opener or a leading character in the punctuation range: keep as is.
        if($c=="《" || ($n>0xA13F && $n < 0xAA40)){
            $this->ResultString = $spwords[$i].$spc.$this->ResultString;
            continue;
        }

        // Ordinary phrase longer than the keep-as-is length: dictionary matching.
        if(strlen($spwords[$i]) > $this->SplitLen){
            $this->ResultString = $this->RunRMM($spwords[$i],$tryNumName,$tryDiff).$spc.$this->ResultString;
            continue;
        }

        // Short phrase: split off a trailing special character, if any.
        if(preg_match($tailPattern,$spwords[$i],$regs)){
            $spwords[$i] = preg_replace($tailPattern,"",$spwords[$i]).$spc.$regs[0];
        }
        // Join with the preceding segment when this one starts with a common unit.
        if(!preg_match($unitPattern,$spwords[$i]) || $i==0){
            $this->ResultString = $spwords[$i].$spc.$this->ResultString;
        }else{
            $this->ResultString = $spwords[$i-1].$spwords[$i].$spc.$this->ResultString;
            $i--; // the preceding segment has been consumed
        }
    }

    // Collapse runs of spaces into one.
    $this->ResultString = preg_replace("/ {1,}/"," ",$this->ResultString);
    return $this->ResultString;
}
//------------------------
//对常规数量词进行识别
//------------------------
/**
 * Re-joins a space separated token string using $this->SplitChar.
 *
 * Empty tokens are dropped and every remaining token is prefixed with the
 * separator, so a non-empty result starts with the separator.
 *
 * @param string $str space separated tokens
 * @return string the re-joined tokens, or "" for empty input
 */
function ParNumber($str)
{
    if($str == "") return "";
    $joined = "";
    foreach(explode(' ',$str) as $token){
        if($token=="") continue;
        $joined .= $this->SplitChar.$token;
    }
    return $joined;
}
//-------------------------------
//进行名字识别和其它数词识别
//--------------------------------
/**
 * Joins words into a string while merging numerals with units and
 * surnames with the characters that follow them.
 *
 * $WordArray is walked from its last element to its first, so element
 * $i-1 is the word that follows element $i in the output.
 *  - A word matching $CnSgNum absorbs the next word when that word STARTS
 *    with a $CommonUnit character; otherwise it absorbs any directly
 *    following words that also match $CnSgNum.
 *  - A 4-byte word found in $TwoNameDic absorbs up to two following 2-byte words.
 *  - A 2-byte word found in $OneNameDic absorbs up to two following 2-byte
 *    words, stopping at one that matches $EspecialChar.
 *  - Everything else is emitted as its own token.
 *
 * The unit alternation is grouped so "^" anchors every alternative, and
 * the separator is quoted before being used in a pattern.
 *
 * @param array $WordArray words in reverse output order
 * @return string tokens joined by $this->SplitChar, without a leading separator
 */
function ParOther($WordArray)
{
    $spc = $this->SplitChar;
    $numPattern = "/".$this->CnSgNum."/";
    $unitPattern = "/^(?:".$this->CommonUnit.")/";
    $stopPattern = "/".$this->EspecialChar."/";
    $rsStr = "";

    for($i=count($WordArray)-1;$i>=0;$i--)
    {
        $word = $WordArray[$i];

        // Numerals / quantities
        if(preg_match($numPattern,$word)){
            $rsStr .= $spc.$word;
            if($i>0 && preg_match($unitPattern,$WordArray[$i-1])){
                $rsStr .= $WordArray[$i-1];
                $i--;
            }else{
                while($i>0 && preg_match($numPattern,$WordArray[$i-1])){
                    $rsStr .= $WordArray[$i-1];
                    $i--;
                }
            }
            continue;
        }

        // Two-character surname
        if(strlen($word)==4 && isset($this->TwoNameDic[$word]))
        {
            $rsStr .= $spc.$word;
            if($i>0 && strlen($WordArray[$i-1])==2){
                $rsStr .= $WordArray[$i-1];
                $i--;
                if($i>0 && strlen($WordArray[$i-1])==2){
                    $rsStr .= $WordArray[$i-1];
                    $i--;
                }
            }
        }
        // Single-character surname
        else if(strlen($word)==2 && isset($this->OneNameDic[$word]))
        {
            $rsStr .= $spc.$word;
            if($i>0 && strlen($WordArray[$i-1])==2){
                // A special character right after the surname is not part of a name.
                if(preg_match($stopPattern,$WordArray[$i-1])) continue;
                $rsStr .= $WordArray[$i-1];
                $i--;
                if($i>0 && strlen($WordArray[$i-1])==2 &&
                   !preg_match($stopPattern,$WordArray[$i-1]))
                {
                    $rsStr .= $WordArray[$i-1];
                    $i--;
                }
            }
        }
        // Ordinary word
        else{
            $rsStr .= $spc.$word;
        }
    }

    // Drop the separator that precedes the first token.
    $rsStr = preg_replace("/^".preg_quote($spc,"/")."/","",$rsStr);
    return $rsStr;
}
//对全中文字符串进行逆向匹配方式分解
/**
 * Splits one segment into words by matching from the end of the string
 * towards the start (reverse matching), then post-processes the words.
 *
 * Positions are byte offsets; the loop steps in units of two bytes, i.e.
 * it presumably expects $str to consist of double-byte characters only
 * (NOTE(review): confirm against ReviseString(), which is not shown here).
 *
 * @param string $str        segment to split
 * @param bool   $tryNumName when true the collected words are joined by ParOther()
 *                           (number and name recognition); otherwise they are simply
 *                           joined with $this->SplitChar
 * @param bool   $tryDiff    when true the trimmed result is passed through TestDiff()
 * @return string the joined words
 */
function RunRMM($str,$tryNumName=true,$tryDiff=true)
{
$spc = $this->SplitChar;
$spLen = strlen($str);
$rsStr = "";
// NOTE(review): $okWord and $tmpWord are never read in this method.
$okWord = "";
$tmpWord = "";
// Words are appended while scanning backwards, so this array ends up in reverse text order.
$WordArray = Array();
// Reverse dictionary matching; $i is the byte index of the last unprocessed byte.
for($i=($spLen-1);$i>=0;)
{
// Only the shortest possible word (or less) is left at the start of the string.
if($i<=$this->MinLen){
if($i==1){
// Exactly one two-byte character remains.
$WordArray[] = substr($str,0,2);
}else
{
// Try the leading MinLen+1 bytes as one word, otherwise emit them as two two-byte pieces.
// NOTE(review): every $i other than 1 lands here, not only $i == MinLen - presumably
// $str always has an even byte length; confirm.
$w = substr($str,0,$this->MinLen+1);
if($this->IsWord($w)){
$WordArray[] = $w;
}else{
$WordArray[] = substr($str,2,2);
$WordArray[] = substr($str,0,2);
}
}
// The start of the string has been consumed: leave the loop.
$i = -1; break;
}
// More than the minimum is left: try candidates ending at $i, longest first.
if($i>=$this->MaxLen) $maxPos = $this->MaxLen;
else $maxPos = $i;
$isMatch = false;
// $j is the candidate's highest byte index (length - 1); shrink by two bytes per step.
for($j=$maxPos;$j>=0;$j=$j-2){
$w = substr($str,$i-$j,$j+1);
if($this->IsWord($w)){
$WordArray[] = $w;
// Continue with the byte just before the matched word.
$i = $i-$j-1;
$isMatch = true;
break;
}
}
// No candidate matched: emit the last two bytes as a word of their own.
if(!$isMatch){
if($i>1) {
$WordArray[] = $str[$i-1].$str[$i];
$i = $i-2;
}
}
}//End For
// Name and quantity recognition
if($tryNumName)
{ $rsStr = $this->ParOther($WordArray); }
else{
// Plain join: walk the array backwards to restore text order; each word is prefixed with the separator.
$wlen = count($WordArray)-1;
for($i=$wlen;$i>=0;$i--){
$rsStr .= $spc.$WordArray[$i];
}
}
// Ambiguity resolution
if($tryDiff) $rsStr = $this->TestDiff(trim($rsStr));
return $rsStr;
}
//----------------------------
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -