⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 segment.class.php

📁 Phpcms2008 是一款基于 PHP+Mysql 架构的网站内容管理系统
💻 PHP
字号:
<?php
/**
 * Dictionary based word segmenter working on GBK encoded byte strings.
 *
 * Typical use: construct (loads the hash dictionary), set_text(), get_words().
 * All string handling is byte-wise: a byte >= 0x81 is treated as the lead byte
 * of a two-byte character.
 *
 * The regular-expression properties ($especialchar, $commonunit, $cnsgnum) are
 * embedded verbatim between '/' delimiters, so custom values must not contain
 * an unescaped '/'.
 */
class segment
{
	var $tagdic = array();      // not used by any method of this class
	var $onenamedic = array();  // keys: 2-byte slices of $cnonename (filled by the constructor)
	var $twonamedic = array();  // keys: entries of $cntwoname (filled by the constructor)
	var $hashdic = array();     // dictionary file content as consecutive 65536-byte chunks
	var $result = array();      // words produced by the last get_words() call
	var $inputstring = '';      // normalised text prepared by set_text()
	var $resultstring = '';     // reset by set_text(); declared to avoid a dynamic property
	var $splitlen = 4; // tokens of at most this many bytes are kept whole
	var $especialchar = '和|的|是|在';
	var $newwordlimit = '在|的|与|或|就|你|我|他|她|有|了|是|其|能|对|地';
	var $commonunit = '年|月|日|时|分|秒|点|元|百|千|万|亿|位|辆|条|个';
	var $pchar = 0;             // index of the last entry written to $result
	// symbols rewritten by initstring(); entry i is replaced by $fnums[i]
	var $cnnumber = array('0','1','2','3','4','5','6','7','8','9','+','-','*','%','.','=','/','[',']','{','}','(',')','~','Ⅰ','Ⅱ','Ⅲ','Ⅳ','Ⅴ','Ⅵ','Ⅶ','Ⅷ','Ⅸ','¥');
	// characters dropped by initstring()
	var $trimchars = array(',','。','?','!',':','、','▲','△','▼','▽','★','☆','◆','◇','■','□','●','○','⊙','㊣','◎','▂','▁','▃','▄','▅','▆','▇','█','▏','▎','▍','▌','▋','▊','◢','◣','◥','◤','▲','▼','♀','♂','卍','※');
	// one replacement character per $cnnumber entry; the last one is a plain '$'
	// (the former '\$' was two characters inside single quotes, mapping the
	// last symbol to a backslash)
	var $fnums = '0123456789+-*%.=/[]{}()~123456789$';
	var $cnsgnum = '0|1|2|3|4|5|6|7|8|9|零|一|二|三|四|五|六|七|八|九|十|百|千|万|亿|数';
	var $maxlen = 13;           // longest dictionary candidate is $maxlen+1 bytes
	var $minlen = 3;            // shortest dictionary candidate is $minlen+1 bytes
	var $cntwoname = '端木 南宫 谯笪 轩辕 令狐 钟离 闾丘 长孙 鲜于 宇文 司徒 司空 上官 欧阳 公孙 西门 东门 左丘 东郭 呼延 慕容 司马 夏侯 诸葛 东方 赫连 皇甫 尉迟 申屠';
	var $cnonename = '赵钱孙李周吴郑王冯陈褚卫蒋沈韩杨朱秦尤许何吕施张孔曹严华金魏陶姜戚谢邹喻柏水窦章云苏潘葛奚范彭郎鲁韦昌马苗凤花方俞任袁柳酆鲍史唐费廉岑薛雷贺倪汤滕殷罗毕郝邬安常乐于时傅皮卡齐康伍余元卜顾孟平黄穆萧尹姚邵堪汪祁毛禹狄米贝明臧计伏成戴谈宋茅庞熊纪舒屈项祝董粱杜阮蓝闵席季麻强贾路娄危江童颜郭梅盛林刁钟徐邱骆高夏蔡田樊胡凌霍虞万支柯咎管卢莫经房裘缪干解应宗宣丁贲邓郁单杭洪包诸左石崔吉钮龚程嵇邢滑裴陆荣翁荀羊於惠甄魏加封芮羿储靳汲邴糜松井段富巫乌焦巴弓牧隗谷车侯宓蓬全郗班仰秋仲伊宫宁仇栾暴甘钭厉戎祖武符刘姜詹束龙叶幸司韶郜黎蓟薄印宿白怀蒲台从鄂索咸籍赖卓蔺屠蒙池乔阴郁胥能苍双闻莘党翟谭贡劳逄姬申扶堵冉宰郦雍郤璩桑桂濮牛寿通边扈燕冀郏浦尚农温别庄晏柴翟阎充慕连茹习宦艾鱼容向古易慎戈廖庚终暨居衡步都耿满弘匡国文寇广禄阙东殴殳沃利蔚越夔隆师巩厍聂晁勾敖融冷訾辛阚那简饶空曾沙须丰巢关蒯相查后江游竺';
	
	/**
	 * PHP 4 style constructor, kept for backward compatibility.
	 *
	 * @param string $dictfile dictionary path; empty selects the bundled file
	 */
	function segment($dictfile = '')
	{
		$this->__construct($dictfile);
	}

	/**
	 * Builds the name lookup tables and loads the dictionary file.
	 *
	 * When the dictionary cannot be opened a warning is raised and $hashdic
	 * stays empty, so isword() reports every candidate as unknown.
	 *
	 * @param string $dictfile dictionary path; empty selects dict/dict_gbk.dat
	 *                         next to this file
	 */
	function __construct($dictfile = '')
	{
		if(isset($this->cnonename))
		{
			// every 2-byte slice of the list becomes a key
			foreach(str_split($this->cnonename, 2) as $name)
			{
				$this->onenamedic[$name] = 1;
			}
		}
		if(isset($this->cntwoname))
		{
			foreach(explode(' ', $this->cntwoname) as $name)
			{
				$this->twonamedic[$name] = 1;
			}
		}
		// the source strings are no longer needed once indexed
		unset($this->cnonename, $this->cntwoname);

		if(!$dictfile) $dictfile = dirname(__FILE__).'/dict/dict_gbk.dat';
		$this->hashdic = array();
		$fp = @fopen($dictfile, 'rb');
		if(!$fp)
		{
			trigger_error('segment: unable to open dictionary file '.$dictfile, E_USER_WARNING);
			return;
		}
		while(!feof($fp))
		{
			$chunk = fread($fp, 65536);
			if($chunk === false || $chunk === '') break;
			$this->hashdic[] = $chunk;
		}
		fclose($fp);
	}

	/**
	 * Computes the dictionary slot and check code of a word.
	 *
	 * Hashing stops at the end of the string or at the first NUL lead byte.
	 *
	 * @param string $word byte string to hash
	 * @return array 'hash_pos' => byte offset of the slot, 'pincode' => check code
	 */
	function word_hash($word)
	{
		$word = (string)$word;
		$len = strlen($word);
		$hashcode = $pincode = 1;
		for($i = 0; $i < $len; $i++)
		{
			$c = ord($word[$i]);
			if($c == 0) break;
			if($c & 0x80)
			{
				// two-byte character; a missing trail byte counts as 0
				$i++;
				$t = $i < $len ? ord($word[$i]) : 0;
				$hashcode *= ((($c & 0x7f) << 8) | $t);
				$pincode *= $t;
			}
			else
			{
				$hashcode *= $c;
				$pincode *= $c;
			}
			$hashcode = abs($hashcode) % 261223;
			$pincode = abs($pincode) % 8285839;
		}
		return array('hash_pos' => ($hashcode + 47) * 3, 'pincode' => $pincode + 1);
	}

	/**
	 * Releases the loaded dictionary.
	 */
	function close()
	{
		// keep the property defined so later isword() calls simply find nothing
		$this->hashdic = array();
	}

	/**
	 * Stores the text to segment.
	 *
	 * The text is converted from UTF-8 to GBK when the CHARSET constant says
	 * UTF-8, stripped of tags and normalised by initstring().
	 *
	 * @param string $text raw input text
	 */
	function set_text($text)
	{
		if($this->_is_utf8()) $text = iconv('utf-8', 'gbk', $text);
		$text = strip_tags((string)$text);
		$this->inputstring = trim($this->initstring($text));
		$this->resultstring = '';
	}

	/**
	 * Segments the text given to set_text().
	 *
	 * @param int $method 0 = try the longest dictionary candidate first
	 *                    (seg_mm), 1 = try the shortest first (seg_nm); with any
	 *                    other value tokens longer than $splitlen are skipped
	 * @return string words separated by single spaces, converted back to UTF-8
	 *                when CHARSET says UTF-8
	 */
	function get_words($method = 1)
	{
		$this->result = array();
		$this->pchar = -1;
		$unitpattern = '/^'.$this->commonunit.'/';
		foreach(explode(' ', $this->inputstring) as $word)
		{
			if(trim($word) === '') continue;

			// Tokens starting with a single-byte character are emitted as they
			// are. (The former unit look-ahead branch could never run, and
			// would have appended the token as well.)
			if(!(ord($word[0]) & 0x80))
			{
				$this->result[++$this->pchar] = $word;
				continue;
			}

			$c = substr($word, 0, 2);
			$n = hexdec(bin2hex($c));
			if($c === '《' || ($n > 0xa13f && $n < 0xaa40))
			{
				// symbol block or opening title mark: keep the token whole
				$this->result[++$this->pchar] = $word;
				continue;
			}

			if(strlen($word) <= $this->splitlen)
			{
				// NOTE(review): '$' anchors only the last alternative, so the
				// other characters match anywhere in the token and are then
				// appended again; kept as in the original -- confirm intent.
				if(preg_match('/'.$this->especialchar.'$/D', $word, $regs))
				{
					$word = preg_replace('/'.preg_quote($regs[0], '/').'$/D', '', $word).$regs[0];
				}
				// NOTE(review): '^' anchors only the first alternative of
				// $commonunit; kept as in the original -- confirm intent.
				if($this->pchar < 0 || !preg_match($unitpattern, $word))
				{
					$this->result[++$this->pchar] = $word;
				}
				else
				{
					// unit word: glue it to the preceding entry
					$this->result[$this->pchar] .= $word;
				}
			}
			elseif($method == 0)
			{
				$this->seg_mm($word);
			}
			elseif($method == 1)
			{
				$this->seg_nm($word);
			}
		}
		$text = implode(' ', array_filter($this->result, 'is_ok'));
		if($this->_is_utf8()) $text = iconv('gbk', 'utf-8', $text);
		return $text;
	}

	/**
	 * Splits a token trying the longest dictionary candidate first and hands
	 * the pieces to matchother().
	 *
	 * @param string $str token made of two-byte characters
	 */
	function seg_mm($str)
	{
		$slen = strlen($str);
		$maxpos = $slen - $this->minlen - 1;
		$shortest = $this->minlen + 1;
		$wordarray = array();
		$i = 0;
		while($i < $slen)
		{
			if($i >= $maxpos)
			{
				$wordarray = array_merge($wordarray, $this->_seg_tail($str, $i));
				break;
			}
			$longest = min($this->maxlen + 1, $slen - $i);
			$matched = false;
			for($j = $longest; $j >= $shortest; $j -= 2)
			{
				$w = substr($str, $i, $j);
				if($this->isword($w))
				{
					$wordarray[] = $w;
					$i += $j;
					$matched = true;
					break;
				}
			}
			if(!$matched)
			{
				// no dictionary word starts here: emit one character
				$wordarray[] = substr($str, $i, 2);
				$i += 2;
			}
		}
		$this->matchother($wordarray);
		return;
	}

	/**
	 * Splits a token trying the shortest dictionary candidate first and hands
	 * the pieces to matchother().
	 *
	 * @param string $str token made of two-byte characters
	 */
	function seg_nm($str)
	{
		$slen = strlen($str);
		$maxpos = $slen - $this->minlen - 1;
		$shortest = $this->minlen + 1;
		$wordarray = array();
		$i = 0;
		while($i < $slen)
		{
			if($i >= $maxpos)
			{
				$wordarray = array_merge($wordarray, $this->_seg_tail($str, $i));
				break;
			}
			$longest = min($this->maxlen + 1, $slen - $i);
			$matched = false;
			for($j = $shortest; $j <= $longest; $j += 2)
			{
				$w = substr($str, $i, $j);
				if($this->isword($w))
				{
					$wordarray[] = $w;
					$i += $j;
					$matched = true;
					break;
				}
			}
			if(!$matched)
			{
				// no dictionary word starts here: emit one character
				$wordarray[] = substr($str, $i, 2);
				$i += 2;
			}
		}
		$this->matchother($wordarray);
		return;
	}

	/**
	 * Appends words to $result, gluing number words to a following unit word
	 * or to following number words.
	 *
	 * @param array $wordarray list of words (0-based, consecutive keys)
	 */
	function matchother($wordarray)
	{
		$last = count($wordarray) - 1;
		$numpattern = '/'.$this->cnsgnum.'/';
		// NOTE(review): '^' anchors only the first alternative; kept as in the
		// original -- confirm intent.
		$unitpattern = '/^'.$this->commonunit.'/';
		for($i = 0; $i <= $last; $i++)
		{
			$this->result[++$this->pchar] = $wordarray[$i];
			if(!preg_match($numpattern, $wordarray[$i])) continue;

			if($i < $last && preg_match($unitpattern, $wordarray[$i + 1]))
			{
				$this->result[$this->pchar] .= $wordarray[++$i];
			}
			else
			{
				// bounded by $last so the look-ahead never leaves the array
				while($i < $last && preg_match($numpattern, $wordarray[$i + 1]))
				{
					$this->result[$this->pchar] .= $wordarray[++$i];
				}
			}
		}
	}

	/**
	 * Tells whether a byte string is present in the loaded dictionary.
	 *
	 * Positive answers are remembered in a static cache that is shared by all
	 * instances and survives close().
	 *
	 * @param string $inputword candidate word
	 * @return bool true when the dictionary contains the word
	 */
	function isword($inputword)
	{
		static $iswordarray = array();
		if(isset($iswordarray[$inputword])) return true;

		$hash = $this->word_hash($inputword);
		$key = $this->_dict_uint24($hash['hash_pos']);
		if($hash['pincode'] == $key)
		{
			$iswordarray[$inputword] = 1;
			return true;
		}
		// without the top bit the slot has no collision chain
		if(!($key & 0x800000)) return false;

		$pos = $key & 0x7fffff;
		do
		{
			$code = $this->_dict_uint24($pos);
			if(($code & 0x7fffff) == $hash['pincode'])
			{
				$iswordarray[$inputword] = 1;
				return true;
			}
			// top bit clear: this was the last entry of the chain
			if(!($code & 0x800000)) break;
			// the 3 bytes after the code hold the offset of the next entry
			$pos = $this->_dict_uint24($pos + 3);
		}
		while($pos);
		return false;
	}

	/**
	 * Normalises text into space separated tokens.
	 *
	 * Control characters and blanks become separators, runs of different
	 * character classes are separated by spaces, $cnnumber symbols are mapped
	 * through $fnums and $trimchars are dropped. Processing stops at the first
	 * NUL lead byte.
	 *
	 * @param string $str GBK encoded text
	 * @return string normalised text
	 */
	function initstring($str)
	{
		$str = (string)$str;
		$spc = ' ';
		$slen = strlen($str);
		if($slen == 0) return '';
		$okstr = '';
		$i = 0;
		// class of the previous output: 0 none/blank, 1 ASCII word character,
		// 2 two-byte text, 3 symbol
		$prechar = 0;
		while($i < $slen && ($oc = ord($str[$i])))
		{
			if($oc < 0x81)
			{
				if($oc < 33)
				{
					// Blank/control character: separator, except carriage
					// return. (The former extra test compared the character
					// itself with the integer 10 and was always true.)
					if($prechar != 0 && $oc != 13) $okstr .= $spc;
					$prechar = 0;
					$i++;
					continue;
				}
				elseif(($oc!=44)&&($oc<42 ||$oc>58)&&($oc<64 ||$oc>90)&&($oc<67 ||$oc>70)&&($oc<97 ||$oc>122)&&$oc!=95)
				{
					// single-byte symbol: always a token of its own
					$okstr .= ($prechar == 0) ? $str[$i] : $spc.$str[$i];
					$prechar = 3;
				}
				else
				{
					if($prechar == 2 || $prechar == 3)
					{
						$okstr .= $spc.$str[$i];
						$prechar = 1;
					}
					else
					{
						$okstr .= $str[$i];
						// ':' (58), 'C' (67) and 'E' (69) force a break before
						// the next character
						$prechar = ($oc == 58 || $oc == 67 || $oc == 69) ? 3 : 1;
					}
				}
			}
			else
			{
				if($prechar != 0 && $prechar != 2) $okstr .= $spc;
				if(isset($str[$i + 1]))
				{
					$c = $str[$i].$str[$i + 1];
					$idx = array_search($c, $this->cnnumber, true);
					if($idx !== false)
					{
						$okstr .= $this->fnums[$idx];
						$prechar = 2;
						$i += 2;
						continue;
					}
					if(in_array($c, $this->trimchars, true))
					{
						$i += 2;
						continue;
					}
					$n = hexdec(bin2hex($c));
					if($n > 0xa13f && $n < 0xaa40)
					{
						if($c == '《')
						{
							if($prechar != 0) $okstr .= $spc.' 《';
							else $okstr .= ' 《';
							$prechar = 2;
						}
						elseif($c == '》')
						{
							$okstr .= '》 ';
							$prechar = 3;
						}
						else
						{
							if($prechar != 0) $okstr .= $spc.$c;
							else $okstr .= $c;
							$prechar = 3;
						}
					}
					else
					{
						$okstr .= $c;
						$prechar = 2;
					}
					$i++;
				}
				// a lone lead byte at the end of the text is dropped
			}
			$i++;
		}
		return $okstr;
	}

	/**
	 * Tells whether the CHARSET constant is defined and names UTF-8.
	 *
	 * @return bool
	 */
	function _is_utf8()
	{
		return defined('CHARSET') && strtolower(CHARSET) == 'utf-8';
	}

	/**
	 * Segments the last few bytes of a token (shared by seg_mm and seg_nm).
	 *
	 * @param string $str token being segmented
	 * @param int $i byte offset where the tail starts
	 * @return array words covering the tail
	 */
	function _seg_tail($str, $i)
	{
		$slen = strlen($str);
		if($this->minlen == 1)
		{
			return array(substr($str, $slen - $this->minlen - 1, 2));
		}
		$w = substr($str, $i, $this->minlen + 1);
		if($this->isword($w))
		{
			return array($w);
		}
		$words = array();
		for(; $i <= $slen - 2; $i += 2)
		{
			$words[] = substr($str, $i, 2);
		}
		return $words;
	}

	/**
	 * Reads a little-endian 24-bit integer from the dictionary.
	 *
	 * Every byte is located through its absolute offset, so a value that
	 * crosses a 65536-byte chunk boundary is read correctly. Bytes outside the
	 * loaded data count as 0.
	 *
	 * @param int $pos absolute byte offset in the dictionary
	 * @return int
	 */
	function _dict_uint24($pos)
	{
		$value = 0;
		for($k = 0; $k < 3; $k++)
		{
			$p = $pos + $k;
			$chunk = $p >> 16;
			$offset = $p & 0xffff;
			if(isset($this->hashdic[$chunk][$offset]))
			{
				$value |= ord($this->hashdic[$chunk][$offset]) << (8 * $k);
			}
		}
		return $value;
	}
}

/**
 * Filter callback: rejects values that loosely equal a single space.
 *
 * @param mixed $str value to test
 * @return bool false when $str == ' ', true otherwise
 */
function is_ok($str)
{
	if($str == ' ')
	{
		return false;
	}
	return true;
}
?>

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -