⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 pub_splitword_www.php

📁 强大的PHP内容管理系统尽量不要让站长把时间都花费在为您修正说明上。压缩包解压
💻 PHP
📖 第 1 页 / 共 2 页
字号:
  //自动摘要功能
  //$keyword是指定的关键字或GetIndexText返回的内容
  //建议不要用太多的关键字
  //----------------------------
  // Automatic-summary entry point (currently an unfinished stub).
  //
  // What the body does today:
  //   * re-normalises $this->SourceString in place through ReviseString();
  //   * splits that text and $this->keywords on single spaces;
  //   * assembles a "(kw1)|(kw2)|..." alternation from the usable keywords.
  // The alternation is never applied and nothing is returned.
  //
  // NOTE(review): the parameters $str, $keyword and $strlen are not read by
  // this body; it works on $this->SourceString / $this->keywords instead.
  function AutoDescription($str,$keyword,$strlen)
  {
  	$this->SourceString = $this->ReviseString($this->SourceString);

  	// Space-separated segments of the normalised text.
  	$textParts = explode(" ",$this->SourceString);

  	// Collect one "(keyword)" group per usable keyword.
  	$groups = array();
  	foreach(explode(" ",$this->keywords) as $word)
  	{
  		if($word=="") continue;
  		// Skip keywords that start with a high byte and are shorter than 3 bytes.
  		if(ord($word[0])>0x80 && strlen($word)<3) continue;
  		$groups[] = "($word)";
  	}
  	$regstr = implode("|",$groups);

  	foreach($textParts as $part)
  	{
  		// Per-segment processing has not been written yet.
  	}
  }
  
  //----------------------------------
  //对分词结果进行消歧处理
  //----------------------------------
  // Re-examines adjacent words of an already segmented string and moves the
  // boundary between them when the dictionary favours a different split.
  //
  // $str   : segmented text, words separated by spaces (runs are collapsed).
  // return : the adjusted words, each one prefixed with $this->SplitChar
  //          (the result therefore starts with the separator); "" when the
  //          input is empty or a single space.
  //
  // All lengths are byte lengths from strlen(). Per the original comments a
  // 2-byte word is a one-character word and 4/6-byte words are two/three
  // character words, i.e. a double-byte text encoding is assumed.
  //
  // Rules, applied to each word and its successor:
  //   1. identical neighbours are glued together (reduplication);
  //   2. a one-character word followed by a 3..7 byte word: try to merge the
  //      single character with the head of the next word;
  //   3. two 3..7 byte words: try to move the head of the next word onto the
  //      end of the current one;
  //   4. anything else is emitted unchanged.
  // A merge in rules 2 and 3 (last case) is only accepted when the merged
  // candidate is a dictionary word whose rank is more than twice the rank of
  // the following word.
  function TestDiff($str){
  	$str = preg_replace("/ {1,}/"," ",$str);
  	if($str == ""||$str == " ") return "";
  	$ws = explode(' ',$str);
  	$wlen = count($ws);
  	$spc = $this->SplitChar;
  	$reStr = "";
  	for($i=0;$i<$wlen;$i++){
  		// The last word has no successor to compare against: emit it as is.
  		if($i>=($wlen-1)) {
  			$reStr .= $spc.$ws[$i];
  			continue;
  		}

  		$cur = $ws[$i];
  		$next = $ws[$i+1];
  		$curLen = strlen($cur);
  		$nextLen = strlen($next);

  		// Rule 1 - reduplication. Strict comparison so that numeric-looking
  		// tokens such as "1" and "1.0" are not treated as the same word.
  		if($cur===$next){
  			$reStr .= $spc.$cur.$next;
  			$i++; continue;
  		}

  		// Rule 2 - one-character word followed by a short multi-character word.
  		if($curLen==2 && $nextLen<8 && $nextLen>2){
  			$addw = $cur.$next;
  			$testok = false;
  			// Try the longer candidate (6 bytes) first, then the 4-byte one.
  			for($t=6;$t>=4;$t-=2){
  				$w = substr($addw,0,$t);
  				if($this->IsWord($w)
  				&& ($this->GetRank($w) > $this->GetRank($next)*2) ){
  					// $w swallowed the first ($t-2) bytes of the next word; whatever
  					// follows them is the remainder. The cast covers PHP versions
  					// where substr() returns false for an offset past the end.
  					$limitW = (string)substr($next,$t-2);
  					if($limitW!=="") $reStr .= $spc.$w.$spc.$limitW;
  					else $reStr .= $spc.$w;
  					$testok = true;
  					break;
  				}
  			}
  			// On success the next word has been consumed as well.
  			if(!$testok) $reStr .= $spc.$cur;
  			else $i++;
  		}
  		// Rule 3 - both neighbours are short multi-character words.
  		else if($curLen>2 && $curLen<8 && $nextLen>2 && $nextLen<8)
  		{
  			$t21 = substr($next,0,2);
  			$t22 = substr($next,0,4);
  			// Current word plus the first character of the next one is a word.
  			if($this->IsWord($cur.$t21)){
  				if($curLen==6||$nextLen==6){
  					$reStr .= $spc.$cur.$t21.$spc.substr($next,2,$nextLen-2);
  					$i++;
  				}else{
  					$reStr .= $spc.$cur;
  				}
  			}
  			// Next word is 6 bytes: try moving its first 4 bytes across.
  			else if($nextLen==6){
  				if($this->IsWord($cur.$t22)){
  					$reStr .= $spc.$cur.$t22.$spc.$next[4].$next[5];
  					$i++;
  				}else{ $reStr .= $spc.$cur; }
  			}
  			// Next word is 4 bytes: rank-guarded move of its first 2 bytes.
  			else if($nextLen==4){
  				$addw = $cur.$next;
  				$t = $nextLen-2;
  				$testok = false;
  				while($t>0){
  					$w = substr($addw,0,$curLen+$t);
  					if($this->IsWord($w)
  					&& ($this->GetRank($w) > $this->GetRank($next)*2) )
  					{
  						$limitW = substr($next,$t,$nextLen-$t);
  						if($limitW!="") $reStr .= $spc.$w.$spc.$limitW;
  						else $reStr .= $spc.$w;
  						$testok = true;
  						break;
  					}
  					$t = $t-2;
  				}
  				if(!$testok) $reStr .= $spc.$cur;
  				else $i++;
  			}
  			else
  			{ $reStr .= $spc.$cur; }
  		}
  		// Rule 4 - words outside the handled length ranges are left alone.
  		else{
  			$reStr .= $spc.$cur;
  		}
  	}//End For
  	return $reStr;
  }
  //---------------------------------
  //判断词典里是否存在某个词
  //---------------------------------
  // Tells whether $okWord is present in the dictionary.
  //
  // The dictionary ($this->RankDic) is indexed first by the word's byte
  // length and then by the word itself. Words longer than $this->MaxLen
  // bytes are rejected without a lookup.
  //
  // return : true when an entry exists, false otherwise.
  function IsWord($okWord){
  	$byteLen = strlen($okWord);
  	return ($byteLen > $this->MaxLen)
  		? false
  		: isset($this->RankDic[$byteLen][$okWord]);
  }
  //------------------------------
  //整理字符串(对标点符号,中英文混排等初步处理)
  //------------------------------
  // Walks $str byte by byte and rebuilds it, inserting $this->SplitChar
  // wherever the class of the current character differs from the previous
  // one (blank / English / Chinese / symbol, tracked in $prechar).
  //
  // Bytes below 0x81 are handled one at a time; any other byte is combined
  // with the byte that follows it and handled as a two-byte character.
  //
  // $str   : raw text.
  // return : the rebuilt text, or '' when $str is empty.
  function ReviseString($str)
  {
  	$spc = $this->SplitChar;
    $slen = strlen($str);
    if($slen==0) return '';
    $okstr = '';
    $prechar = 0; // class of the previous character: 0-blank 1-English 2-Chinese 3-symbol
    for($i=0;$i<$slen;$i++){
      if(ord($str[$i]) < 0x81)
      {
        // Single-byte whitespace / control bytes (code below 33)
        if(ord($str[$i]) < 33){
          //$str[$i]!="\r"&&$str[$i]!="\n"
          // Emit one separator unless the previous character was already blank;
          // the blank byte itself is not copied.
          if($prechar!=0) $okstr .= $spc;
          $prechar=0;
          continue; 
        }else if(preg_match("/[^0-9a-zA-Z@\.%#:\\/\\&_-]/",$str[$i]))
        {
          // Anything other than digits, letters and @ . % # : / & _ - is a
          // symbol: it is separated from whatever non-blank came before it.
          if($prechar==0)
          {	$okstr .= $str[$i]; $prechar=3;}
          else
          { $okstr .= $spc.$str[$i]; $prechar=3;}
        }else
        {
        	// Digits, letters and the allowed punctuation: start a new token
        	// after Chinese text or a symbol, otherwise extend the current one.
        	if($prechar==2||$prechar==3)
        	{ $okstr .= $spc.$str[$i]; $prechar=1;}
        	else
        	{ 
        	  // NOTE(review): "/@#%:/" matches the literal 4-character sequence
        	  // "@#%:", so it can never match the one-character subject and the
        	  // else branch always runs. Both branches append the same byte and
        	  // differ only in $prechar. Possibly "/[@#%:]/" was intended -
        	  // confirm before changing, since that would alter the output.
        	  if(preg_match("/@#%:/",$str[$i])){ $okstr .= $str[$i]; $prechar=3; }
        	  else { $okstr .= $str[$i]; $prechar=1; }
        	}
        }
      }
      else{
        // Previous character was neither Chinese nor blank: add a separator
        if($prechar!=0 && $prechar!=2) $okstr .= $spc;
        // Two-byte character: needs a following byte. A lone trailing byte
        // (no successor) is not copied to the output.
        if(isset($str[$i+1])){
          $c = $str[$i].$str[$i+1];
          
          // Characters matching the $this->CnNumber pattern are passed through
          // GetAlabNum() and counted as Chinese text for separator purposes.
          if(preg_match("/".$this->CnNumber."/",$c))
          { $okstr .= $this->GetAlabNum($c); $prechar = 2; $i++; continue; }
          
          // Numeric value of the two bytes, used for the range test below.
          $n = hexdec(bin2hex($c));
          // Values strictly between 0xA13F and 0xAA40 get symbol handling
          // (presumably the punctuation area of the double-byte code page - TODO confirm).
          if($n>0xA13F && $n < 0xAA40)
          {
            // NOTE(review): $c is always two bytes, so the two literal
            // comparisons below can only succeed if this file is stored in an
            // encoding where those literals are two bytes each - confirm.
            if($c=="《"){
            	// Opening title mark: preceded by a space, then treated as Chinese
            	// so that the text after it is not split off.
            	if($prechar!=0) $okstr .= $spc." 《";
            	else $okstr .= " 《";
            	$prechar = 2;
            }
            else if($c=="》"){
            	// Closing title mark: followed by a space.
            	$okstr .= "》 ";
            	$prechar = 3;
            }
            else{
            	// Any other character in the range: separate it like a symbol.
            	if($prechar!=0) $okstr .= $spc.$c;
            	else $okstr .= $c;
            	$prechar = 3; 
            }
          }
          else{
            // Ordinary two-byte character: append without a separator.
            $okstr .= $c;
            $prechar = 2;
          }
          // Skip the second byte of the pair just consumed.
          $i++;
        }
      }// two-byte character
    }// end of loop
    return $okstr;
  }
  //-----------------------------------------
	//尝试识别新词,字符串参数为已经分词处理的串
	//----------------------------------------
  // Placeholder for new-word detection on an already segmented string.
  //
  // No detection is performed: $str is handed back unchanged and $maxlen
  // is not consulted.
  function FindNewWord($str,$maxlen=6)
  {
    return $str;
  }
  //----------------------------------------------
  //除去字串中的重复词,生成索引字符串,字符串参数为已经分词处理的串
  //--------------------------------------------------
  // Builds an index string from segmented text: duplicate words are merged,
  // the survivors are ordered by how often they occurred (most frequent
  // first) and joined with single spaces.
  //
  // $okstr : segmented text, words separated by spaces.
  // $ilen  : -1 (default) keeps every word whose GetRank() exceeds 500;
  //          any other value is a byte budget - words are appended while the
  //          result stays shorter than $ilen, and collection stops at the
  //          first word that would not fit.
  // return : the space-joined words, "" when nothing qualifies.
  //
  // Words are dropped when they are shorter than 2 bytes, consist only of
  // digits, ':' and '-' (numbers / dates), or are exactly 2 bytes long with
  // a first byte above 0x80.
  function GetIndexText($okstr,$ilen=-1)
  {
    if($okstr=="") return "";

    // Occurrence counter. This must start as an array: writing a string key
    // into an empty string only worked through the old implicit conversion
    // and fails on current PHP versions.
    $counts = array();
    foreach(explode(" ",$okstr) as $w)
    {
      $w = trim($w);
      // Too short to be useful
      if(strlen($w)<2) continue;
      // Pure numbers or dates
      if(!preg_match("/[^0-9:-]/",$w)) continue;
      // Exactly 2 bytes starting with a high byte
      if(strlen($w)==2&&ord($w[0])>0x80) continue;
      if(isset($counts[$w])) $counts[$w]++;
      else $counts[$w] = 1;
    }

    // Most frequent words first.
    arsort($counts);

    $result = "";
    foreach($counts as $w=>$n)
    {
      // Array keys that look like integers come back as int; normalise.
      $w = (string)$w;
      if($ilen==-1)
      {
        if($this->GetRank($w)>500) $result .= $w." ";
      }
      else
      {
        if((strlen($result)+strlen($w)+1)<$ilen) $result .= $w." ";
        else break;
      }
    }
    return trim($result);
  }
  //---------------------
  //获得词的词频
  //--------------------
  // Looks up the rank (frequency) stored for a word.
  //
  // $this->RankDic is indexed by the word's byte length and then by the
  // word itself.
  //
  // return : the stored value, or 0 when the word has no entry.
  function GetRank($w){
  	$byteLen = strlen($w);
  	return isset($this->RankDic[$byteLen][$w]) ? $this->RankDic[$byteLen][$w] : 0;
  }
  //----------------------------
  //把全角数字或英文单词转为半角
  //---------------------------
  // Maps one character from the $nums table to the character at the same
  // position in $fnums (both hold 66 entries, aligned by index).
  //
  // $fnum  : the character to convert.
  // return : the mapped character, or $fnum unchanged when it is not in the
  //          table.
  //
  // The lookup is an exact string match. A loose comparison would treat
  // numeric-looking input such as "00" or "1.0" as equal to "0" / "1" and
  // rewrite it.
  //
  // NOTE(review): as stored here the table entries are the same half-width
  // characters as their targets (apart from the "s " entry, which carries a
  // trailing space). Presumably they were full-width forms before an
  // encoding conversion - verify against a copy of the file in its original
  // encoding before relying on this mapping.
  function GetAlabNum($fnum)
  {
	  $nums = array("0","1","2","3","4","5","6",
	  "7","8","9","+","-","%",".",
	  "a","b","c","d","e","f","g","h","i","j","k","l","m",
	  "n","o","p","q","r","s ","t","u","v","w","x","y","z",
	  "A","B","C","D","E","F","G","H","I","J","K","L","M",
	  "N","O","P","Q","R","S","T","U","V","W","X","Y","Z");
	  $fnums = "0123456789+-%.abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
	  // Strict search: first table position holding exactly this string.
	  $pos = array_search((string)$fnum,$nums,true);
	  if($pos===false) return $fnum;
	  return $fnums[$pos];
  }
}//End Class

?>

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -