📄 stringencode.cc
字号:
#ifdef WIN32#include <malloc.h>#endif // WIN32#ifdef POSIX#include <alloca.h>#define _alloca alloca#endif // POSIX#include "basictypes.h"#include "common.h"#include "stringencode.h"#include "stringutils.h"namespace utils_base {/////////////////////////////////////////////////////////////////////////////// String Encoding Utilities/////////////////////////////////////////////////////////////////////////////static const char HEX[] = "0123456789abcdef";char hex_encode(unsigned char val) { ASSERT(val < 16); return (val < 16) ? HEX[val] : '!';}unsigned char hex_decode(char ch) { char lower = tolower(ch); ASSERT(((ch >= '0') && (ch <= '9')) || ((lower >= 'a') && (lower <= 'z'))); return (ch <= '9') ? (ch - '0') : ((lower - 'a') + 10);}size_t escape(char * buffer, size_t buflen, const char * source, size_t srclen, const char * illegal, char escape) { ASSERT(NULL != buffer); // TODO: estimate output size if (buflen <= 0) return 0; size_t srcpos = 0, bufpos = 0; while ((srcpos < srclen) && (bufpos + 1 < buflen)) { char ch = source[srcpos++]; if ((ch == escape) || ::strchr(illegal, ch)) { if (bufpos + 2 >= buflen) break; buffer[bufpos++] = escape; } buffer[bufpos++] = ch; } buffer[bufpos] = '\0'; return bufpos;}size_t unescape(char * buffer, size_t buflen, const char * source, size_t srclen, char escape) { ASSERT(NULL != buffer); // TODO: estimate output size if (buflen <= 0) return 0; size_t srcpos = 0, bufpos = 0; while ((srcpos < srclen) && (bufpos + 1 < buflen)) { char ch = source[srcpos++]; if ((ch == escape) && (srcpos < srclen)) { ch = source[srcpos++]; } buffer[bufpos++] = ch; } buffer[bufpos] = '\0'; return bufpos;}size_t encode(char * buffer, size_t buflen, const char * source, size_t srclen, const char * illegal, char escape) { ASSERT(NULL != buffer); // TODO: estimate output size if (buflen <= 0) return 0; size_t srcpos = 0, bufpos = 0; while ((srcpos < srclen) && (bufpos + 1 < buflen)) { char ch = source[srcpos++]; if ((ch != escape) && !::strchr(illegal, ch)) { buffer[bufpos++] = ch; } else if (bufpos + 3 >= buflen) { break; } else { buffer[bufpos+0] = escape; buffer[bufpos+1] = hex_encode((static_cast<unsigned char>(ch) >> 4) & 0xF); buffer[bufpos+2] = hex_encode((static_cast<unsigned char>(ch) ) & 0xF); bufpos += 3; } } buffer[bufpos] = '\0'; return bufpos;}size_t decode(char * buffer, size_t buflen, const char * source, size_t srclen, char escape) { if (buflen <= 0) return 0; size_t srcpos = 0, bufpos = 0; while ((srcpos < srclen) && (bufpos + 1 < buflen)) { char ch = source[srcpos++]; if ((ch == escape) && (srcpos + 1 < srclen)) { buffer[bufpos++] = (hex_decode(source[srcpos]) << 4) | hex_decode(source[srcpos+1]); srcpos += 2; } else { buffer[bufpos++] = ch; } } buffer[bufpos] = '\0'; return bufpos;}const char* unsafe_filename_characters() { // It might be better to have a single specification which is the union of // all operating systems, unless one system is overly restrictive.#ifdef WIN32 return "\\/:*?\"<>|";#else // !WIN32 // TODO#endif // !WIN23}const unsigned char URL_UNSAFE = 0x1; // 0-33 "#$%&+,/:;<=>?@[\]^`{|} 127const unsigned char XML_UNSAFE = 0x2; // "&'<>const unsigned char HTML_UNSAFE = 0x2; // "&'<>// ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 6 5 7 8 9 : ; < = > ?//@ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _//` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ const unsigned char ASCII_CLASS[128] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,0,3,1,1,1,3,2,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,3,1,3,1, 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0, 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,};size_t url_encode(char * buffer, size_t buflen, const char * source, size_t srclen) { if (NULL == buffer) return srclen * 3 + 1; if (buflen <= 0) return 0; size_t srcpos = 0, bufpos = 0; while ((srcpos < srclen) && (bufpos + 1 < buflen)) { unsigned char ch = source[srcpos++]; if ((ch < 128) && (ASCII_CLASS[ch] & URL_UNSAFE)) { if (bufpos + 3 >= buflen) { break; } buffer[bufpos+0] = '%'; buffer[bufpos+1] = hex_encode((ch >> 4) & 0xF); buffer[bufpos+2] = hex_encode((ch ) & 0xF); bufpos += 3; } else { buffer[bufpos++] = ch; } } buffer[bufpos] = '\0'; return bufpos;}size_t url_decode(char * buffer, size_t buflen, const char * source, size_t srclen) { if (NULL == buffer) return srclen + 1; if (buflen <= 0) return 0; size_t srcpos = 0, bufpos = 0; while ((srcpos < srclen) && (bufpos + 1 < buflen)) { unsigned char ch = source[srcpos++]; if (ch == '+') { buffer[bufpos++] = ' '; } else if ((ch == '%') && (srcpos + 1 < srclen)) { buffer[bufpos++] = (hex_decode(source[srcpos]) << 4) | hex_decode(source[srcpos+1]); srcpos += 2; } else { buffer[bufpos++] = ch; } } buffer[bufpos] = '\0'; return bufpos;}size_t utf8_decode(const char* source, size_t srclen, unsigned long* value) { const unsigned char* s = reinterpret_cast<const unsigned char*>(source); if ((s[0] & 0x80) == 0x00) { // Check s[0] == 0xxxxxxx *value = s[0]; return 1; } if ((srclen < 2) || ((s[1] & 0xC0) != 0x80)) { // Check s[1] != 10xxxxxx return 0; } // Accumulate the trailer byte values in value16, and combine it with the // relevant bits from s[0], once we've determined the sequence length. unsigned long value16 = (s[1] & 0x3F); if ((s[0] & 0xE0) == 0xC0) { // Check s[0] == 110xxxxx *value = ((s[0] & 0x1F) << 6) | value16; return 2; } if ((srclen < 3) || ((s[2] & 0xC0) != 0x80)) { // Check s[2] != 10xxxxxx return 0; } value16 = (value16 << 6) | (s[2] & 0x3F); if ((s[0] & 0xF0) == 0xE0) { // Check s[0] == 1110xxxx *value = ((s[0] & 0x0F) << 12) | value16; return 3; } if ((srclen < 4) || ((s[3] & 0xC0) != 0x80)) { // Check s[3] != 10xxxxxx return 0; } value16 = (value16 << 6) | (s[3] & 0x3F); if ((s[0] & 0xF8) == 0xF0) { // Check s[0] == 11110xxx *value = ((s[0] & 0x07) << 18) | value16; return 4; } return 0;}size_t utf8_encode(char* buffer, size_t buflen, unsigned long value) { if ((value <= 0x7F) && (buflen >= 1)) { buffer[0] = static_cast<unsigned char>(value); return 1; } if ((value <= 0x7FF) && (buflen >= 2)) { buffer[0] = 0xC0 | static_cast<unsigned char>(value >> 6); buffer[1] = 0x80 | static_cast<unsigned char>(value & 0x3F); return 2; } if ((value <= 0xFFFF) && (buflen >= 3)) { buffer[0] = 0xE0 | static_cast<unsigned char>(value >> 12); buffer[1] = 0x80 | static_cast<unsigned char>((value >> 6) & 0x3F); buffer[2] = 0x80 | static_cast<unsigned char>(value & 0x3F); return 3; } if ((value <= 0x1FFFFF) && (buflen >= 4)) { buffer[0] = 0xF0 | static_cast<unsigned char>(value >> 18); buffer[1] = 0x80 | static_cast<unsigned char>((value >> 12) & 0x3F); buffer[2] = 0x80 | static_cast<unsigned char>((value >> 6) & 0x3F); buffer[3] = 0x80 | static_cast<unsigned char>(value & 0x3F); return 4; } return 0;}size_t html_encode(char * buffer, size_t buflen, const char * source, size_t srclen) { ASSERT(NULL != buffer); // TODO: estimate output size if (buflen <= 0) return 0; size_t srcpos = 0, bufpos = 0; while ((srcpos < srclen) && (bufpos + 1 < buflen)) { unsigned char ch = source[srcpos]; if (ch < 128) { srcpos += 1; if (ASCII_CLASS[ch] & HTML_UNSAFE) { const char * escseq = 0; size_t esclen = 0; switch (ch) { case '<': escseq = "<"; esclen = 4; break; case '>': escseq = ">"; esclen = 4; break; case '\'': escseq = "'"; esclen = 5; break; case '\"': escseq = """; esclen = 6; break; case '&': escseq = "&"; esclen = 5; break; default: ASSERT(false); }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -