📄 unicode.php
字号:
{
                    // Little Endian
                    $UTF16_val2 = $chval4 * 0x100 + $chval3;
                }

                // Check that this is a low surrogate
                if ( ( $UTF16_val2 >= 0xDC00 ) && ( $UTF16_val2 <= 0xDFFF ) )
                {
                    // Low surrogate found following high surrogate
                    // Add both to the output
                    $output .= chr( $chval1 ) . chr ( $chval2 ) . chr( $chval3 ) . chr ( $chval4 );
                    // Skip over the low surrogate
                    $pos += 2;
                }
                else
                {
                    // Low surrogate not found after high surrogate
                    // Don't add either to the output
                    // Only the High surrogate is skipped and processing continues after it
                }
            }
            else
            {
                // Error - not enough data for low surrogate - end processing
                continue 1;
            }
        }
        else
        {
            // Low surrogate of a surrogate pair
            // This should not happen - it means this is a lone low surrogate
            // Dont add it to the output
        }
    }

    // Return the result
    return $output;
}

/******************************************************************************
* End of Function:     UTF16_fix
******************************************************************************/


/******************************************************************************
*
* Function:     UTF8_to_unicode_array
*
* Description:  Converts a string encoded with Unicode UTF-8, to an array of
*               numbers which represent unicode character numbers.
*
*               Lead bytes for 1 to 6 byte sequences (0x00-0x7F, 0xC0-0xFD)
*               are accepted. Any other byte found in lead position is
*               skipped, as is a sequence that is cut short by the end of
*               the string. Continuation bytes are NOT validated - their
*               low six bits are used whatever their high bits are.
*
* Parameters:   utf8_text - a string containing the UTF-8 data
*
* Returns:      output - the array containing the unicode character numbers
*
******************************************************************************/

function UTF8_to_unicode_array( $utf8_text )
{
    // Create an array to receive the unicode character numbers output
    $output = array( );

    // The string never changes inside the loop, so measure it only once
    $length = strlen( $utf8_text );

    // Cycle through the characters in the UTF-8 string
    for ( $pos = 0; $pos < $length; $pos++ )
    {
        // Retrieve the current numerical character value.
        // Square brackets are used because the curly brace offset syntax
        // was removed from PHP 8
        $chval = ord( $utf8_text[$pos] );

        // The lead byte tells us how many bytes the character covers, and
        // contributes the data bits left after masking off the length marker
        if ( $chval <= 0x7F )
        {
            // 1 Byte UTF-8 Unicode (7-Bit ASCII) Character
            // 7-bit ASCII is unaffected, so the output equals the input
            $bytes = 1;
            $outputval = $chval;
        }
        else if ( ( $chval >= 0xC0 ) && ( $chval <= 0xDF ) )
        {
            // 2 Byte UTF-8 Unicode - remove the leading 110b
            $bytes = 2;
            $outputval = $chval & 0x1F;
        }
        else if ( ( $chval >= 0xE0 ) && ( $chval <= 0xEF ) )
        {
            // 3 Byte UTF-8 Unicode - remove the leading 1110b
            $bytes = 3;
            $outputval = $chval & 0x0F;
        }
        else if ( ( $chval >= 0xF0 ) && ( $chval <= 0xF7 ) )
        {
            // 4 Byte UTF-8 Unicode - remove the leading 11110b
            $bytes = 4;
            $outputval = $chval & 0x07;
        }
        else if ( ( $chval >= 0xF8 ) && ( $chval <= 0xFB ) )
        {
            // 5 Byte UTF-8 Unicode - remove the leading 111110b
            $bytes = 5;
            $outputval = $chval & 0x03;
        }
        else if ( ( $chval >= 0xFC ) && ( $chval <= 0xFD ) )
        {
            // 6 Byte UTF-8 Unicode - remove the leading 1111110b
            $bytes = 6;
            $outputval = $chval & 0x01;
        }
        else
        {
            // Invalid lead byte (0x80-0xBF, 0xFE or 0xFF) - skip it
            continue;
        }

        // Check if there is enough data left in the UTF-8 string to allow
        // the retrieval of the remainder of this unicode character
        if ( $pos + $bytes - 1 >= $length )
        {
            // Sequence is cut short - skip the lead byte and carry on
            continue;
        }

        // Cycle through the remaining bytes of the sequence
        // (the first one has already been done)
        for ( ; $bytes > 1; $bytes-- )
        {
            $pos++;

            // Each remaining byte carries 6 bits of data below a 10b marker.
            // Shift what we have left by 6 bits (multiply by 0x40), then add
            // the low 6 bits of the current byte
            $outputval = $outputval * 0x40 + ( ord( $utf8_text[$pos] ) & 0x3F );
        }

        // Add the calculated Unicode number to the output array
        $output[] = $outputval;
    }

    // Return the resulting array
    return $output;
}

/******************************************************************************
* End of Function:     UTF8_to_unicode_array
******************************************************************************/


/******************************************************************************
*
* Function:     UTF16_to_unicode_array
*
* Description:  Converts a string encoded with Unicode UTF-16, to an array of
*               numbers which represent unicode character numbers.
*
*               A high surrogate followed by a low surrogate is combined into
*               a single character number of 0x10000 or above. A high
*               surrogate that is not followed by a low surrogate, and a lone
*               low surrogate, are left out of the output. Processing stops
*               at an incomplete 16 bit value at the end of the string.
*
* Parameters:   utf16_text - a string containing the UTF-16 data
*               MSB_first - True will cause processing as Big Endian UTF-16 (Motorola, MSB first)
*                           False will cause processing as Little Endian UTF-16 (Intel, LSB first)
*
* Returns:      output - the array containing the unicode character numbers
*
******************************************************************************/

function UTF16_to_unicode_array( $utf16_text, $MSB_first )
{
    // Create an array to receive the unicode character numbers output
    $output = array( );

    // The string never changes inside the loop, so measure it only once
    $length = strlen( $utf16_text );

    // Initialise the current position in the string
    $pos = 0;

    // Cycle through each group of bytes, ensuring the coding is correct
    while ( $pos < $length )
    {
        // Check that both bytes of this UTF-16 value are available
        if ( $pos + 1 >= $length )
        {
            // Error - no second byte to this UTF-16 value - end processing
            break;
        }

        // Retrieve both bytes and skip over them.
        // Square brackets are used because the curly brace offset syntax
        // was removed from PHP 8
        $chval1 = ord( $utf16_text[$pos] );
        $chval2 = ord( $utf16_text[$pos + 1] );
        $pos += 2;

        // Calculate the 16 bit unicode value
        if ( $MSB_first )
        {
            // Big Endian
            $UTF16_val = $chval1 * 0x100 + $chval2;
        }
        else
        {
            // Little Endian
            $UTF16_val = $chval2 * 0x100 + $chval1;
        }

        if ( ( $UTF16_val <= 0xD7FF ) || ( $UTF16_val >= 0xE000 ) )
        {
            // Normal Character (Non Surrogate pair) - add it to the output
            $output[] = $UTF16_val;
        }
        else if ( $UTF16_val <= 0xDBFF )
        {
            // High surrogate of a surrogate pair (0xD800 - 0xDBFF)
            // Now we need to read the low surrogate

            // $pos already points at the first byte of the low surrogate,
            // so only $pos and $pos + 1 have to exist. Asking for more than
            // that would lose a surrogate pair sitting at the very end of
            // the string
            if ( $pos + 1 >= $length )
            {
                // Error - not enough data for low surrogate - end processing
                break;
            }

            $chval3 = ord( $utf16_text[$pos] );
            $chval4 = ord( $utf16_text[$pos + 1] );

            // Calculate the second 16 bit unicode value
            if ( $MSB_first )
            {
                // Big Endian
                $UTF16_val2 = $chval3 * 0x100 + $chval4;
            }
            else
            {
                // Little Endian
                $UTF16_val2 = $chval4 * 0x100 + $chval3;
            }

            // Check that this is a low surrogate
            if ( ( $UTF16_val2 >= 0xDC00 ) && ( $UTF16_val2 <= 0xDFFF ) )
            {
                // Low surrogate found following high surrogate
                // Each surrogate carries 10 bits of the character number,
                // which is offset by 0x10000
                $output[] = 0x10000 + ( ( $UTF16_val - 0xD800 ) * 0x400 ) + ( $UTF16_val2 - 0xDC00 );

                // Skip over the low surrogate
                $pos += 2;
            }

            // Otherwise no low surrogate follows the high surrogate:
            // only the high surrogate is dropped, and the value after it
            // is processed normally on the next pass
        }

        // Anything else is a lone low surrogate (0xDC00 - 0xDFFF)
        // This should not happen - don't add it to the output
    }

    // Return the result
    return $output;
}

/******************************************************************************
* End of Function:     UTF16_to_unicode_array
******************************************************************************/


/******************************************************************************
*
* Function:     unicode_array_to_UTF8
*
* Description:  Converts an array of unicode character numbers to a string
*               encoded by UTF-8
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -