utf8num.php
来自「PHP 知识管理系统(基于树结构的知识管理系统), 英文原版的PHP源码。」· PHP 代码 · 共 170 行
PHP
170 行
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Analysis_Analyzer_Common */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common.php';
/**
 * UTF-8 analyzer that splits text into runs of letters and digits.
 *
 * The input is walked one UTF-8 character at a time. ASCII characters are
 * classified with ctype_alnum(); every multi-byte character is treated as a
 * token character (see _isAlNum()), so non-ASCII punctuation does NOT split
 * tokens. Token offsets are expressed in characters, not bytes.
 *
 * @category Zend
 * @package Zend_Search_Lucene
 * @subpackage Analysis
 * @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
 * @license http://framework.zend.com/license/new-bsd New BSD License
 */
class Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num extends Zend_Search_Lucene_Analysis_Analyzer_Common
{
    /**
     * Current char position in an UTF-8 stream
     * (number of characters consumed so far)
     *
     * @var integer
     */
    private $_position;

    /**
     * Current binary position in an UTF-8 stream
     * (number of bytes consumed so far)
     *
     * @var integer
     */
    private $_bytePosition;

    /**
     * Stream length, in characters
     *
     * @var integer
     */
    private $_streamLength;

    /**
     * Reset token stream
     *
     * Rewinds both position counters, converts the input to UTF-8 when the
     * declared encoding is anything other than 'utf8'/'utf-8', and caches
     * the stream length in characters.
     *
     * $_input and $_encoding are not declared in this class; presumably
     * they are provided by the parent analyzer.
     */
    public function reset()
    {
        $this->_position     = 0;
        $this->_bytePosition = 0;

        // convert input into UTF-8
        if (strcasecmp($this->_encoding, 'utf8' ) != 0  &&
            strcasecmp($this->_encoding, 'utf-8') != 0 ) {
            $converted = iconv($this->_encoding, 'UTF-8', $this->_input);

            // iconv() returns false on failure; fall back to an empty stream
            // instead of keeping a boolean in the input buffer.
            $this->_input    = ($converted === false) ? '' : $converted;
            $this->_encoding = 'UTF-8';
        }

        // Get UTF-8 string length.
        // It also checks if it's a correct utf-8 string: iconv_strlen()
        // returns false for malformed input, which is stored as a zero
        // length so that nextToken() yields no tokens and never walks
        // through broken byte sequences.
        $length = iconv_strlen((string) $this->_input, 'UTF-8');
        $this->_streamLength = ($length === false) ? 0 : $length;
    }

    /**
     * Check, that character is a token character (letter or digit)
     *
     * A multi-byte string is always accepted: any non-ASCII UTF-8 character
     * counts as part of a token. Single-byte characters are accepted when
     * ctype_alnum() says so.
     *
     * @param string $char One UTF-8 character (one or more bytes)
     * @return boolean
     */
    private static function _isAlNum($char)
    {
        if (strlen($char) > 1) {
            // It's an UTF-8 (multi-byte) character
            return true;
        }

        return ctype_alnum($char);
    }

    /**
     * Get next UTF-8 char
     *
     * Reads the lead byte at the current byte position and, if it announces
     * a multi-byte sequence, appends the continuation bytes. Advances the
     * byte position by the number of bytes read and the char position by one.
     *
     * Must only be called while $_position < $_streamLength.
     *
     * @return string The next character (one or more bytes)
     */
    private function _nextChar()
    {
        $char = $this->_input[$this->_bytePosition++];
        $lead = ord($char);

        if (($lead & 0xC0) == 0xC0) {
            // 11xxxxxx: lead byte of a multi-byte sequence.
            // 110xxxxx => 1 extra byte, 1110xxxx => 2, 1111xxxx => 3.
            $addBytes = 1;
            if ($lead & 0x20) {
                $addBytes++;
                if ($lead & 0x10) {
                    $addBytes++;
                }
            }

            $char .= substr($this->_input, $this->_bytePosition, $addBytes);
            $this->_bytePosition += $addBytes;
        }

        $this->_position++;

        return $char;
    }

    /**
     * Tokenization stream API
     * Get next token
     * Returns null at the end of stream
     *
     * Start and end offsets are character offsets into the UTF-8 stream;
     * the end offset is exclusive (one past the last character of the
     * token), including for a token that runs to the very end of the input.
     *
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    public function nextToken()
    {
        if ($this->_input === null) {
            return null;
        }

        while ($this->_position < $this->_streamLength) {
            // skip delimiters until the first token character shows up
            $char = '';
            while ($this->_position < $this->_streamLength) {
                $char = $this->_nextChar();
                if (self::_isAlNum($char)) {
                    break;
                }
                $char = '';
            }

            // Only delimiters were left: end of stream.
            if ($char === '') {
                return null;
            }

            // The first token character has just been consumed, so it sits
            // at _position - 1 and the token currently ends at _position.
            $termStartPosition = $this->_position - 1;
            $termEndPosition   = $this->_position;
            $termText          = $char;

            // read token
            while ($this->_position < $this->_streamLength) {
                $char = $this->_nextChar();
                if (!self::_isAlNum($char)) {
                    // The delimiter is consumed but is not part of the token.
                    break;
                }

                $termText       .= $char;
                // Record the end after every accepted character so the
                // offset is right whether the token is closed by a
                // delimiter or by the end of the stream.
                $termEndPosition = $this->_position;
            }

            $token = new Zend_Search_Lucene_Analysis_Token(
                                      $termText,
                                      $termStartPosition,
                                      $termEndPosition);

            // normalize() is not defined in this class (presumably inherited);
            // a null result means the token was dropped.
            $token = $this->normalize($token);
            if ($token !== null) {
                return $token;
            }
            // Continue if token is skipped
        }

        return null;
    }
}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?