📄 segment.class.php
字号:
<?php
/**
 * Dictionary-based Chinese word segmenter operating on GBK bytes.
 *
 * Typical use: construct (loads the hash dictionary), set_text(), get_words().
 *
 * Text is handled byte-wise: every character whose first byte is >= 0x81 is
 * taken to be two bytes long. The multi-byte string literals in this class are
 * compared against such two-byte slices, so they only match when this file is
 * stored in a two-byte encoding (the default dictionary is dict_gbk.dat).
 */
class segment
{
    var $tagdic = array();
    // Single-character surnames (keys), built by the constructor from $cnonename.
    var $onenamedic = array();
    // Two-character surnames (keys), built by the constructor from $cntwoname.
    var $twonamedic = array();
    // Dictionary file contents as consecutive 65536-byte chunks.
    var $hashdic = array();
    // Tokens produced by get_words(); $pchar is the index of the last one.
    var $result = array();
    var $inputstring = '';
    var $resultstring = '';
    // Per-instance cache of words already found in the dictionary.
    var $wordcache = array();
    // Byte length up to which a multi-byte word is kept whole (not looked up).
    var $splitlen = 4;
    // NOTE(review): no longer consulted. get_words() used to rewrite short
    // words with this list, but that rewrite left a trailing particle unchanged
    // and duplicated a particle that occurred elsewhere in the word.
    var $especialchar = '和|的|是|在';
    var $newwordlimit = '在|的|与|或|就|你|我|他|她|有|了|是|其|能|对|地';
    // Unit words; a token starting with one is glued to the preceding token.
    var $commonunit = '年|月|日|时|分|秒|点|元|百|千|万|亿|位|辆|条|个';
    var $pchar = 0;
    // Characters replaced by the character at the same index in $fnums.
    // NOTE(review): initstring() compares these with two-byte slices, so the
    // single-byte entries below can never match; presumably they were
    // full-width forms before this file was extracted - confirm upstream.
    var $cnnumber = array('0','1','2','3','4','5','6','7','8','9','+','-','*','%','.','=','/','[',']','{','}','(',')','~','Ⅰ','Ⅱ','Ⅲ','Ⅳ','Ⅴ','Ⅵ','Ⅶ','Ⅷ','Ⅸ','¥');
    // Characters dropped from the input by initstring().
    var $trimchars = array(',','。','?','!',':','、','▲','△','▼','▽','★','☆','◆','◇','■','□','●','○','⊙','㊣','◎','▂','▁','▃','▄','▅','▆','▇','█','▏','▎','▍','▌','▋','▊','◢','◣','◥','◤','▲','▼','♀','♂','卍','※');
    // One replacement character per $cnnumber entry. The final character is a
    // plain '$': a backslash before it would be kept literally in a
    // single-quoted string and shift the last entry onto '\'.
    var $fnums = '0123456789+-*%.=/[]{}()~123456789$';
    // Alternation of numeral characters used by matchother().
    var $cnsgnum = '0|1|2|3|4|5|6|7|8|9|零|一|二|三|四|五|六|七|八|九|十|百|千|万|亿|数';
    // Dictionary lookups try byte lengths from $minlen+1 to $maxlen+1.
    var $maxlen = 13;
    var $minlen = 3;
    var $cntwoname = '端木 南宫 谯笪 轩辕 令狐 钟离 闾丘 长孙 鲜于 宇文 司徒 司空 上官 欧阳 公孙 西门 东门 左丘 东郭 呼延 慕容 司马 夏侯 诸葛 东方 赫连 皇甫 尉迟 申屠';
    var $cnonename = '赵钱孙李周吴郑王冯陈褚卫蒋沈韩杨朱秦尤许何吕施张孔曹严华金魏陶姜戚谢邹喻柏水窦章云苏潘葛奚范彭郎鲁韦昌马苗凤花方俞任袁柳酆鲍史唐费廉岑薛雷贺倪汤滕殷罗毕郝邬安常乐于时傅皮卡齐康伍余元卜顾孟平黄穆萧尹姚邵堪汪祁毛禹狄米贝明臧计伏成戴谈宋茅庞熊纪舒屈项祝董粱杜阮蓝闵席季麻强贾路娄危江童颜郭梅盛林刁钟徐邱骆高夏蔡田樊胡凌霍虞万支柯咎管卢莫经房裘缪干解应宗宣丁贲邓郁单杭洪包诸左石崔吉钮龚程嵇邢滑裴陆荣翁荀羊於惠甄魏加封芮羿储靳汲邴糜松井段富巫乌焦巴弓牧隗谷车侯宓蓬全郗班仰秋仲伊宫宁仇栾暴甘钭厉戎祖武符刘姜詹束龙叶幸司韶郜黎蓟薄印宿白怀蒲台从鄂索咸籍赖卓蔺屠蒙池乔阴郁胥能苍双闻莘党翟谭贡劳逄姬申扶堵冉宰郦雍郤璩桑桂濮牛寿通边扈燕冀郏浦尚农温别庄晏柴翟阎充慕连茹习宦艾鱼容向古易慎戈廖庚终暨居衡步都耿满弘匡国文寇广禄阙东殴殳沃利蔚越夔隆师巩厍聂晁勾敖融冷訾辛阚那简饶空曾沙须丰巢关蒯相查后江游竺';

    /**
     * PHP 4 style constructor, kept for callers that invoke it by name.
     *
     * @param string $dictfile dictionary path; empty selects the default
     */
    function segment($dictfile = '')
    {
        $this->__construct($dictfile);
    }

    /**
     * Builds the surname tables and loads the dictionary file into memory.
     *
     * If the file cannot be opened an E_USER_WARNING is raised and the
     * dictionary stays empty, in which case isword() reports every word as
     * unknown.
     *
     * @param string $dictfile dictionary path; empty selects
     *                         dict/dict_gbk.dat next to this file
     */
    function __construct($dictfile = '')
    {
        // The source strings are unset once consumed, so only build the tables
        // while they still exist (guards against a second constructor call).
        if (isset($this->cnonename)) {
            $namelen = strlen($this->cnonename);
            for ($i = 0; $i < $namelen; $i += 2) {
                $this->onenamedic[substr($this->cnonename, $i, 2)] = 1;
            }
        }
        if (isset($this->cntwoname)) {
            foreach (explode(' ', $this->cntwoname) as $n) {
                $this->twonamedic[$n] = 1;
            }
        }
        unset($this->cnonename, $this->cntwoname);

        if (!$dictfile) $dictfile = dirname(__FILE__).'/dict/dict_gbk.dat';
        $this->hashdic = array();
        $this->wordcache = array();
        $fp = @fopen($dictfile, 'rb');
        if ($fp === false) {
            trigger_error('segment: cannot open dictionary file '.$dictfile, E_USER_WARNING);
            return;
        }
        // Chunk k holds file bytes [k*65536, (k+1)*65536); read_u24() relies on it.
        while (!feof($fp)) {
            $chunk = fread($fp, 65536);
            if ($chunk === false || $chunk === '') break;
            $this->hashdic[] = $chunk;
        }
        fclose($fp);
    }

    /**
     * Computes the dictionary slot and check code of a word.
     *
     * A byte with the high bit set is combined with the byte after it; other
     * bytes are used on their own. Hashing stops at the first NUL byte.
     *
     * @param string $word
     * @return array 'hash_pos' => byte position of the word's 3-byte slot,
     *               'pincode'  => value expected in that slot
     */
    function word_hash($word)
    {
        $word = (string)$word;
        $len = strlen($word);
        $hashcode = $pincode = 1;
        for ($i = 0; $i < $len; $i++) {
            $c = ord($word[$i]);
            if ($c == 0) break;
            if ($c & 0x80) {
                // A lead byte with nothing after it is paired with 0.
                $i++;
                $t = $i < $len ? ord($word[$i]) : 0;
                $hashcode *= ((($c & 0x7f) << 8) | $t);
                $pincode *= $t;
            } else {
                $hashcode *= $c;
                $pincode *= $c;
            }
            // Reduced after every character, so both values stay non-negative.
            $hashcode = abs($hashcode) % 261223;
            $pincode = abs($pincode) % 8285839;
        }
        $hashcode += 47;
        $pincode++;
        return array('hash_pos' => $hashcode * 3, 'pincode' => $pincode);
    }

    /**
     * Releases the in-memory dictionary and the word cache.
     */
    function close()
    {
        $this->hashdic = array();
        $this->wordcache = array();
    }

    /**
     * Sets the text to segment.
     *
     * When the CHARSET constant is 'utf-8' the text is first converted to GBK.
     * HTML tags are stripped and the text is normalised by initstring().
     *
     * @param string $text
     */
    function set_text($text)
    {
        if ($this->uses_utf8()) $text = iconv('utf-8', 'gbk', $text);
        $text = strip_tags($text);
        $this->inputstring = trim($this->initstring($text));
        $this->resultstring = '';
    }

    /**
     * Segments the text given to set_text().
     *
     * @param int $method 0 = forward maximum matching, 1 = forward minimum
     *                    matching; with any other value, words longer than
     *                    $splitlen bytes produce no tokens
     * @return string tokens joined by single spaces, converted back to UTF-8
     *                when CHARSET is 'utf-8'
     */
    function get_words($method = 1)
    {
        $this->result = array();
        $this->pchar = -1;
        $unitprefix = '/^(?:'.$this->commonunit.')/';
        foreach (explode(' ', $this->inputstring) as $word) {
            if (trim($word) == '') continue;
            if (!(ord($word[0]) & 0x80)) {
                // Single-byte run: always its own token. The former numeric
                // special case depended on $resultstring, which is always
                // empty, so it never did anything else.
                $this->result[++$this->pchar] = $word;
                continue;
            }
            $c = substr($word, 0, 2);
            $n = hexdec(bin2hex($c));
            if ($c === '《' || ($n > 0xa13f && $n < 0xaa40)) {
                // Starts with a character from the 0xA140-0xAA3F range: keep as is.
                $this->result[++$this->pchar] = $word;
            } elseif (strlen($word) <= $this->splitlen) {
                // Short word: glue to the previous token when it starts with a
                // unit word, otherwise keep it as its own token.
                if ($this->pchar >= 0 && preg_match($unitprefix, $word)) {
                    $this->result[$this->pchar] .= $word;
                } else {
                    $this->result[++$this->pchar] = $word;
                }
            } elseif ($method == 0) {
                // forward maximum matching
                $this->seg_mm($word);
            } elseif ($method == 1) {
                // forward minimum matching
                $this->seg_nm($word);
            }
        }
        $text = implode(' ', array_filter($this->result, 'is_ok'));
        if ($this->uses_utf8()) $text = iconv('gbk', 'utf-8', $text);
        return $text;
    }

    /**
     * Forward maximum matching: at each position the longest dictionary word
     * wins; with no match a single two-byte character is emitted.
     * The pieces are appended to $result through matchother().
     *
     * @param string $str run of two-byte characters
     */
    function seg_mm($str)
    {
        $slen = strlen($str);
        $maxpos = $slen - $this->minlen - 1;
        $wordarray = array();
        for ($i = 0; $i < $slen;) {
            if ($i >= $maxpos) {
                $wordarray = array_merge($wordarray, $this->seg_tail($str, $i));
                break;
            }
            $maxlenght = min($this->maxlen + 1, $slen - $i);
            $matched = false;
            for ($j = $maxlenght; $j >= $this->minlen + 1; $j -= 2) {
                $w = substr($str, $i, $j);
                if ($this->isword($w)) {
                    $wordarray[] = $w;
                    $i += $j;
                    $matched = true;
                    break;
                }
            }
            if (!$matched) {
                $wordarray[] = substr($str, $i, 2);
                $i += 2;
            }
        }
        $this->matchother($wordarray);
        return;
    }

    /**
     * Forward minimum matching: at each position the shortest dictionary word
     * wins; with no match a single two-byte character is emitted.
     * The pieces are appended to $result through matchother().
     *
     * @param string $str run of two-byte characters
     */
    function seg_nm($str)
    {
        $slen = strlen($str);
        $maxpos = $slen - $this->minlen - 1;
        $wordarray = array();
        for ($i = 0; $i < $slen;) {
            if ($i >= $maxpos) {
                $wordarray = array_merge($wordarray, $this->seg_tail($str, $i));
                break;
            }
            $maxlenght = min($this->maxlen + 1, $slen - $i);
            $matched = false;
            for ($j = $this->minlen + 1; $j <= $maxlenght; $j += 2) {
                $w = substr($str, $i, $j);
                if ($this->isword($w)) {
                    $wordarray[] = $w;
                    $i += $j;
                    $matched = true;
                    break;
                }
            }
            if (!$matched) {
                $wordarray[] = substr($str, $i, 2);
                $i += 2;
            }
        }
        $this->matchother($wordarray);
        return;
    }

    /**
     * Splits the tail of a run, i.e. the part at most $minlen+1 bytes long.
     * Shared by seg_mm() and seg_nm().
     *
     * @param string $str the whole run
     * @param int    $i   byte offset where the tail starts
     * @return array the tail as one dictionary word, or as two-byte pieces
     */
    function seg_tail($str, $i)
    {
        $slen = strlen($str);
        if ($this->minlen == 1) {
            return array(substr($str, $slen - $this->minlen - 1, 2));
        }
        $w = substr($str, $i, $this->minlen + 1);
        if ($this->isword($w)) {
            return array($w);
        }
        $pieces = array();
        // A trailing odd byte is left out.
        for (; $i <= $slen - 2; $i += 2) {
            $pieces[] = substr($str, $i, 2);
        }
        return $pieces;
    }

    /**
     * Appends words to $result, merging numerals with what follows them:
     * a word containing a numeral absorbs the next word when that starts with
     * a unit word, otherwise it absorbs every directly following word that
     * also contains a numeral.
     *
     * @param array $wordarray words in text order
     */
    function matchother($wordarray)
    {
        $last = count($wordarray) - 1;
        $numeral = '/'.$this->cnsgnum.'/';
        $unitprefix = '/^(?:'.$this->commonunit.')/';
        for ($i = 0; $i <= $last; $i++) {
            $this->result[++$this->pchar] = $wordarray[$i];
            if (!preg_match($numeral, $wordarray[$i])) continue;
            if ($i < $last && preg_match($unitprefix, $wordarray[$i + 1])) {
                $this->result[$this->pchar] .= $wordarray[++$i];
            } else {
                // Strictly below $last so that $wordarray[$i + 1] always exists.
                while ($i < $last && preg_match($numeral, $wordarray[$i + 1])) {
                    $this->result[$this->pchar] .= $wordarray[++$i];
                }
            }
        }
    }

    /**
     * Tells whether a word is present in the loaded dictionary.
     *
     * The 3-byte value in the word's slot either equals the word's pincode, or
     * has bit 23 set and then holds (in its low 23 bits) the position of a
     * collision chain. A chain entry is a 3-byte code; when its bit 23 is set
     * the next 3 bytes hold the position of the following entry.
     *
     * @param string $inputword
     * @return bool
     */
    function isword($inputword)
    {
        $inputword = (string)$inputword;
        if (isset($this->wordcache[$inputword])) return true;
        $hash = $this->word_hash($inputword);
        $slot = $this->read_u24($hash['hash_pos']);
        if ($slot == $hash['pincode']) {
            $this->wordcache[$inputword] = 1;
            return true;
        }
        if ($slot & 0x800000) {
            $pos = $slot & 0x7fffff;
            do {
                $code = $this->read_u24($pos);
                if (($code & 0x7fffff) == $hash['pincode']) {
                    $this->wordcache[$inputword] = 1;
                    return true;
                }
                if (!($code & 0x800000)) break;
                // A next position of 0 ends the chain.
                $pos = $this->read_u24($pos + 3);
            } while ($pos);
        }
        return false;
    }

    /**
     * Reads a 3-byte little-endian integer at an absolute dictionary position.
     * Each byte is located separately, so values crossing a 64 KB chunk
     * boundary are read correctly; bytes outside the loaded data read as 0.
     *
     * @param int $pos byte position in the dictionary
     * @return int
     */
    function read_u24($pos)
    {
        $value = 0;
        for ($k = 2; $k >= 0; $k--) {
            $p = $pos + $k;
            $seg = $p >> 16;
            $off = $p & 0xffff;
            $byte = isset($this->hashdic[$seg][$off]) ? ord($this->hashdic[$seg][$off]) : 0;
            $value = ($value << 8) | $byte;
        }
        return $value;
    }

    /**
     * Normalises raw text into space-separated runs for get_words().
     *
     * Spaces are inserted between single-byte word characters, single-byte
     * symbols and two-byte characters; characters listed in $trimchars are
     * dropped and those in $cnnumber are replaced through $fnums.
     *
     * $prechar records the kind of the previous character: 0 = none or
     * whitespace, 1 = single-byte word character, 2 = two-byte character,
     * 3 = symbol.
     *
     * @param string $str
     * @return string
     */
    function initstring($str)
    {
        $spc = ' ';
        $str = (string)$str;
        $slen = strlen($str);
        if ($slen == 0) return '';
        $okstr = '';
        $prechar = 0;
        for ($i = 0; $i < $slen; $i++) {
            $oc = ord($str[$i]);
            if ($oc < 0x81) {
                if ($oc < 33) {
                    // Whitespace and control bytes end the current run.
                    // NOTE(review): CR adds no separator but LF does. The old
                    // LF test compared a character with the number 10 and was
                    // always true; it is dropped here with unchanged results.
                    if ($prechar != 0 && $oc != 13) $okstr .= $spc;
                    $prechar = 0;
                    continue;
                }
                // Word characters: * + , - . / 0-9 : @ A-Z _ a-z
                $wordchar = ($oc >= 42 && $oc <= 58) || ($oc >= 64 && $oc <= 90)
                    || ($oc >= 97 && $oc <= 122) || $oc == 95;
                if (!$wordchar) {
                    // Any other symbol becomes a token of its own.
                    $okstr .= ($prechar == 0 ? '' : $spc).$str[$i];
                    $prechar = 3;
                } elseif ($prechar == 2 || $prechar == 3) {
                    $okstr .= $spc.$str[$i];
                    $prechar = 1;
                } else {
                    $okstr .= $str[$i];
                    // NOTE(review): ':', 'C' and 'E' force a space before the
                    // next word character ("CAT" becomes "C AT"); the intent is
                    // unclear, so the behaviour is kept as found.
                    $prechar = ($oc == 58 || $oc == 67 || $oc == 69) ? 3 : 1;
                }
                continue;
            }

            // Lead byte of a two-byte character.
            if ($prechar != 0 && $prechar != 2) $okstr .= $spc;
            if (!isset($str[$i + 1])) continue; // dangling lead byte at the end
            $c = $str[$i].$str[$i + 1];
            $i++; // the trail byte is consumed on every path below

            $idx = array_search($c, $this->cnnumber, true);
            if ($idx !== false) {
                $okstr .= $this->fnums[$idx];
                $prechar = 2;
                continue;
            }
            if (in_array($c, $this->trimchars, true)) continue;

            $n = (ord($c[0]) << 8) | ord($c[1]);
            if ($n > 0xa13f && $n < 0xaa40) {
                if ($c === '《') {
                    $okstr .= ($prechar != 0 ? $spc : '').' 《';
                    $prechar = 2;
                } elseif ($c === '》') {
                    $okstr .= '》 ';
                    $prechar = 3;
                } else {
                    $okstr .= ($prechar != 0 ? $spc : '').$c;
                    $prechar = 3;
                }
            } else {
                $okstr .= $c;
                $prechar = 2;
            }
        }
        return $okstr;
    }

    /**
     * Tells whether the CHARSET constant is defined and names UTF-8.
     *
     * @return bool
     */
    function uses_utf8()
    {
        return defined('CHARSET') && strtolower(CHARSET) == 'utf-8';
    }
}
/**
 * array_filter() callback used by segment::get_words().
 *
 * @param mixed $str a result entry
 * @return bool false only when the entry compares equal to a single space
 */
function is_ok($str)
{
    $isblank = ($str == ' ');
    return !$isblank;
}
?>
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -