📄 pub_splitword_www.php
字号:
//自动摘要功能
//$keyword是指定的关键字或GetIndexText返回的内容
//建议不要用太多的关键字
//----------------------------
// Automatic summary helper (unfinished).
// NOTE(review): the parameters $str, $keyword and $strlen are never read;
// the body works on $this->SourceString and $this->keywords instead, the
// alternation string it builds is never used, and nothing is returned.
// The only lasting effect is that $this->SourceString is replaced by its
// ReviseString() form.
function AutoDescription($str,$keyword,$strlen)
{
    // Normalise the stored source text in place, then split both inputs on spaces.
    $this->SourceString = $this->ReviseString($this->SourceString);
    $spwords = explode(" ",$this->SourceString);
    $keywords = explode(" ",$this->keywords);

    // Collect every usable keyword as a parenthesised regex alternative.
    $alternatives = array();
    foreach($keywords as $word)
    {
        // Skip empty pieces produced by consecutive spaces.
        if($word === "") continue;
        // Skip keywords that start with a high byte and are shorter than 3 bytes.
        if(ord($word[0])>0x80 && strlen($word)<3) continue;
        $alternatives[] = "($word)";
    }
    $regstr = implode("|",$alternatives);

    // Placeholder loop: the per-word processing was never written.
    foreach($spwords as $piece)
    {
    }
}
//----------------------------------
//对分词结果进行消歧处理
//----------------------------------
// Disambiguation pass over an already segmented, space separated string.
//
// Each token is compared with its right-hand neighbour and the pair may be
// re-split when the dictionary (IsWord / GetRank) prefers another boundary.
// All lengths below are byte lengths; the original comments treat 2 bytes as
// one character (presumably a double-byte encoding such as GBK - confirm).
//
// $str    segmented text, tokens separated by one or more spaces
// returns the re-segmented text; every emitted token is prefixed with
//         $this->SplitChar (so the result starts with a separator), or ""
//         when the input is empty or a single space
function TestDiff($str){
    $str = preg_replace("/ {1,}/"," ",$str);
    if($str == ""||$str == " ") return "";
    $ws = explode(' ',$str);
    $wlen = count($ws);
    $spc = $this->SplitChar;
    $reStr = "";
    for($i=0;$i<$wlen;$i++){
        // The last token has no right-hand neighbour: emit it unchanged.
        if($i>=($wlen-1)) {
            $reStr .= $spc.$ws[$i];
            continue;
        }

        $cur = $ws[$i];
        $next = $ws[$i+1];
        $curLen = strlen($cur);
        $nextLen = strlen($next);

        // Reduplication rule: two identical neighbours are glued together.
        // Strict comparison, so numeric tokens such as "1" and "1.0" are not
        // mistaken for a repeated word.
        if($cur===$next){
            $reStr .= $spc.$cur.$next;
            $i++;
            continue;
        }

        // A 2-byte token followed by a 3..7-byte token: try to attach the
        // first 4, then the first 2, bytes of the neighbour to the token.
        if($curLen==2 && $nextLen<8 && $nextLen>2){
            $addw = $cur.$next;
            $t = 6;
            $testok = false;
            while($t>=4){
                $w = substr($addw,0,$t);
                // Accept the merge only when it is a dictionary word ranked
                // more than twice as high as the neighbour on its own.
                if($this->IsWord($w)
                && ($this->GetRank($w) > $this->GetRank($next)*2) ){
                    // $w consumed ($t-2) bytes of the neighbour; the rest of
                    // the neighbour starts right after them.
                    $limitW = (string)substr($next,$t-2);
                    if($limitW!="") $reStr .= $spc.$w.$spc.$limitW;
                    else $reStr .= $spc.$w;
                    $testok = true;
                    break;
                }
                $t = $t-2;
            }
            if(!$testok) $reStr .= $spc.$cur;
            else $i++; // the neighbour has been consumed
        }
        // Both tokens are 3..7 bytes long: cross-boundary ambiguity handling.
        else if($curLen>2 && $curLen<8
        && $nextLen>2 && $nextLen<8)
        {
            $t21 = substr($next,0,2);
            $t22 = substr($next,0,4);
            // Current token plus the first 2 bytes of the neighbour is a word.
            if($this->IsWord($cur.$t21)){
                if($curLen==6||$nextLen==6){
                    $reStr .= $spc.$cur.$t21.$spc.substr($next,2,$nextLen-2);
                    $i++;
                }else{
                    $reStr .= $spc.$cur;
                }
            }
            // Neighbour is 6 bytes: try to move its first 4 bytes to the left.
            else if($nextLen==6){
                if($this->IsWord($cur.$t22)){
                    $reStr .= $spc.$cur.$t22.$spc.$next[4].$next[5];
                    $i++;
                }else{ $reStr .= $spc.$cur; }
            }
            // Neighbour is 4 bytes: move its first 2 bytes to the left when the
            // merged word clearly outranks the neighbour.
            else if($nextLen==4){
                $addw = $cur.$next;
                $t = $nextLen-2;
                $testok = false;
                while($t>0){
                    $w = substr($addw,0,$curLen+$t);
                    if($this->IsWord($w)
                    && ($this->GetRank($w) > $this->GetRank($next)*2) )
                    {
                        $limitW = substr($next,$t,$nextLen-$t);
                        if($limitW!="") $reStr .= $spc.$w.$spc.$limitW;
                        else $reStr .= $spc.$w;
                        $testok = true;
                        break;
                    }
                    $t = $t-2;
                }
                if(!$testok) $reStr .= $spc.$cur;
                else $i++;
            }
            else
            { $reStr .= $spc.$cur; }
        }
        // Any other length combination is emitted unchanged.
        else{
            $reStr .= $spc.$cur;
        }
    }//End For
    return $reStr;
}
//---------------------------------
//判断词典里是否存在某个词
//---------------------------------
// Dictionary membership test.
// Returns true when $okWord is no longer (in bytes) than $this->MaxLen and
// has an entry in $this->RankDic, which is indexed first by byte length and
// then by the word itself; returns false otherwise.
function IsWord($okWord){
    $byteLen = strlen($okWord);
    return !($byteLen > $this->MaxLen)
        && isset($this->RankDic[$byteLen][$okWord]);
}
//------------------------------
//整理字符串(对标点符号,中英文混排等初步处理)
//------------------------------
// Pre-segmentation clean-up: walks $str byte by byte and inserts
// $this->SplitChar wherever the character class changes (blank / ASCII word
// character / multibyte character / symbol).
// Bytes below 0x81 are handled as single-byte characters; any other byte is
// read together with the following byte as one 2-byte character
// (presumably GBK - TODO confirm the expected input encoding).
// $str    raw text
// returns the text with separators inserted, or '' for an empty input
function ReviseString($str)
{
$spc = $this->SplitChar;
$slen = strlen($str);
if($slen==0) return '';
$okstr = '';
$prechar = 0; // class of the previous character: 0-blank 1-ASCII word char 2-multibyte (Chinese) 3-symbol
for($i=0;$i<$slen;$i++){
if(ord($str[$i]) < 0x81)
{
// ASCII whitespace / control byte: emit one separator unless the previous character was already blank
if(ord($str[$i]) < 33){
//$str[$i]!="\r"&&$str[$i]!="\n"
if($prechar!=0) $okstr .= $spc;
$prechar=0;
continue;
}else if(preg_match("/[^0-9a-zA-Z@\.%#:\\/\\&_-]/",$str[$i]))
{
// Any ASCII character outside the listed set (letters, digits, @ . % # : / & _ -)
// is a symbol: it is separated from whatever precedes it.
if($prechar==0)
{ $okstr .= $str[$i]; $prechar=3;}
else
{ $okstr .= $spc.$str[$i]; $prechar=3;}
}else
{
// ASCII word character: start a new token after a multibyte character or a symbol
if($prechar==2||$prechar==3)
{ $okstr .= $spc.$str[$i]; $prechar=1;}
else
{
// NOTE(review): "/@#%:/" is a four-character literal sequence, so it can never
// match the single character tested here and the else branch always runs.
// A character class may have been intended - confirm before changing, since
// that would alter how strings such as e-mail addresses are split.
if(preg_match("/@#%:/",$str[$i])){ $okstr .= $str[$i]; $prechar=3; }
else { $okstr .= $str[$i]; $prechar=1; }
}
}
}
else{
// Previous character was neither blank nor multibyte: insert a separator first
if($prechar!=0 && $prechar!=2) $okstr .= $spc;
// Read the 2-byte character; a trailing lead byte with no partner is not emitted
// (the separator above may already have been appended).
if(isset($str[$i+1])){
$c = $str[$i].$str[$i+1];
// Characters matching $this->CnNumber are replaced by whatever GetAlabNum() returns for them
if(preg_match("/".$this->CnNumber."/",$c))
{ $okstr .= $this->GetAlabNum($c); $prechar = 2; $i++; continue; }
// Numeric value of the 2-byte code; 0xA140..0xAA3F is handled as punctuation
// (presumably the symbol area of the double-byte code page - confirm)
$n = hexdec(bin2hex($c));
if($n>0xA13F && $n < 0xAA40)
{
// NOTE(review): $c is exactly two bytes, so the two bracket literals below can
// only match if this file is stored in an encoding where they occupy two
// bytes - confirm the file encoding.
if($c=="《"){
if($prechar!=0) $okstr .= $spc." 《";
else $okstr .= " 《";
$prechar = 2;
}
else if($c=="》"){
$okstr .= "》 ";
$prechar = 3;
}
else{
if($prechar!=0) $okstr .= $spc.$c;
else $okstr .= $c;
$prechar = 3;
}
}
else{
// Ordinary multibyte character: appended without a separator
$okstr .= $c;
$prechar = 2;
}
// Skip the second byte of the pair
$i++;
}
}// multibyte character
}// end of loop
return $okstr;
}
//-----------------------------------------
//尝试识别新词,字符串参数为已经分词处理的串
//----------------------------------------
// New-word discovery hook (not implemented).
// $str     already segmented text
// $maxlen  currently unused
// returns  $str unchanged
function FindNewWord($str,$maxlen=6)
{
    return $str;
}
//----------------------------------------------
//除去字串中的重复词,生成索引字符串,字符串参数为已经分词处理的串
//--------------------------------------------------
// Build an index string from segmented text: duplicates are removed and the
// remaining words are ordered by how often they occur (most frequent first).
//
// $okstr   segmented text, words separated by single spaces
// $ilen    -1 (default): keep every word whose GetRank() value exceeds 500;
//          otherwise: append words while the result (each word followed by a
//          space) stays shorter than $ilen bytes, stopping at the first word
//          that does not fit
// returns  the selected words joined by spaces, or "" when nothing qualifies
function GetIndexText($okstr,$ilen=-1)
{
    if($okstr=="") return "";

    // Occurrence count per word. This must start as an array: since PHP 7.1 an
    // empty string is no longer converted to an array on offset assignment.
    $wks = array();
    foreach(explode(" ",$okstr) as $w)
    {
        $w = trim($w);
        // Drop tokens shorter than 2 bytes
        if(strlen($w)<2) continue;
        // Drop tokens made only of digits, ':' and '-' (numbers, times, dates)
        if(!preg_match("/[^0-9:-]/",$w)) continue;
        // Drop 2-byte tokens that start with a high byte
        if(strlen($w)==2&&ord($w[0])>0x80) continue;
        if(isset($wks[$w])) $wks[$w]++;
        else $wks[$w] = 1;
    }

    // Most frequent words first
    arsort($wks);

    $okstr = "";
    foreach($wks as $w=>$v)
    {
        if($ilen==-1)
        {
            if($this->GetRank($w)>500) $okstr .= $w." ";
        }
        else
        {
            if((strlen($okstr)+strlen($w)+1)<$ilen) $okstr .= $w." ";
            else break;
        }
    }
    return trim($okstr);
}
//---------------------
//获得词的词频
//--------------------
// Look up the value stored for $w in $this->RankDic (indexed by byte length,
// then by word). Returns 0 when the word has no entry.
function GetRank($w){
    $byteLen = strlen($w);
    return isset($this->RankDic[$byteLen][$w]) ? $this->RankDic[$byteLen][$w] : 0;
}
//----------------------------
//把全角数字或英文单词转为半角
//---------------------------
// Translate one table entry to the character at the same position of the
// half-width string below.
// $fnum    the character to translate (expected to be a string)
// returns  the mapped character, or $fnum unchanged when it is not in the table
//
// The lookup is strict, so a numeric string such as "00" or "1.0" is no
// longer treated as equal to the digit entries "0" / "1".
// NOTE(review): in this copy of the file the table entries look identical to
// their half-width targets (and the 's' entry carries a trailing space);
// presumably the original held full-width forms - confirm the file encoding.
function GetAlabNum($fnum)
{
    $nums = array("0","1","2","3","4","5","6",
    "7","8","9","+","-","%",".",
    "a","b","c","d","e","f","g","h","i","j","k","l","m",
    "n","o","p","q","r","s ","t","u","v","w","x","y","z",
    "A","B","C","D","E","F","G","H","I","J","K","L","M",
    "N","O","P","Q","R","S","T","U","V","W","X","Y","Z");
    $fnums = "0123456789+-%.abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";

    // Position of the first identical table entry, or false when absent
    $pos = array_search($fnum,$nums,true);
    if($pos === false) return $fnum;
    return $fnums[$pos];
}
}//End Class
?>
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -