⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 unichar.cpp

📁 一些OCR的相关资料,希望对研究OCR的朋友有所帮助。
💻 CPP
字号:
///////////////////////////////////////////////////////////////////////// File:        unichar.cpp// Description: Unicode character/ligature class.// Author:      Ray Smith// Created:     Wed Jun 28 17:05:01 PDT 2006//// (C) Copyright 2006, Google Inc.// Licensed under the Apache License, Version 2.0 (the "License");// you may not use this file except in compliance with the License.// You may obtain a copy of the License at// http://www.apache.org/licenses/LICENSE-2.0// Unless required by applicable law or agreed to in writing, software// distributed under the License is distributed on an "AS IS" BASIS,// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.// See the License for the specific language governing permissions and// limitations under the License./////////////////////////////////////////////////////////////////////////#include "unichar.h"#define UNI_MAX_LEGAL_UTF32 0x0010FFFF// Construct from a utf8 string. If len<0 then the string is null terminated.// If the string is too long to fit in the UNICHAR then it takes only what// will fit. Checks for illegal input and stops at an illegal sequence.// The resulting UNICHAR may be empty.UNICHAR::UNICHAR(const char* utf8_str, int len) {  int total_len = 0;  int step = 0;  if (len < 0) {    for (len = 0; utf8_str[len] != 0 && len < UNICHAR_LEN; ++len);  }  for (total_len = 0; total_len < len; total_len += step) {    step = utf8_step(utf8_str + total_len);    if (total_len + step > UNICHAR_LEN)      break;  // Too long.    if (step == 0)      break;  // Illegal first byte.    int i;    for (i = 1; i < step; ++i)      if ((utf8_str[total_len + i] & 0xc0) != 0x80)        break;    if (i < step)      break;  // Illegal surrogate  }  memcpy(chars, utf8_str, total_len);  if (total_len < UNICHAR_LEN) {    chars[UNICHAR_LEN - 1] = total_len;    while (total_len < UNICHAR_LEN - 1)      chars[total_len++] = 0;  }}// Construct from a single UCS4 character. 
Illegal values are ignored,// resulting in an empty UNICHAR.UNICHAR::UNICHAR(int unicode) {  const int bytemask = 0xBF;  const int bytemark = 0x80;  if (unicode < 0x80) {    chars[UNICHAR_LEN - 1] = 1;    chars[2] = 0;    chars[1] = 0;    chars[0] = static_cast<char>(unicode);  } else if (unicode < 0x800) {    chars[UNICHAR_LEN - 1] = 2;    chars[2] = 0;    chars[1] = static_cast<char>((unicode | bytemark) & bytemask);    unicode >>= 6;    chars[0] = static_cast<char>(unicode | 0xc0);  } else if (unicode < 0x10000) {    chars[UNICHAR_LEN - 1] = 3;    chars[2] = static_cast<char>((unicode | bytemark) & bytemask);    unicode >>= 6;    chars[1] = static_cast<char>((unicode | bytemark) & bytemask);    unicode >>= 6;    chars[0] = static_cast<char>(unicode | 0xe0);  } else if (unicode <= UNI_MAX_LEGAL_UTF32) {    chars[UNICHAR_LEN - 1] = 4;    chars[3] = static_cast<char>((unicode | bytemark) & bytemask);    unicode >>= 6;    chars[2] = static_cast<char>((unicode | bytemark) & bytemask);    unicode >>= 6;    chars[1] = static_cast<char>((unicode | bytemark) & bytemask);    unicode >>= 6;    chars[0] = static_cast<char>(unicode | 0xf0);  } else {    memset(chars, 0, UNICHAR_LEN);  }}// Get the first character as UCS-4.int UNICHAR::first_uni() const {  static const int utf8_offsets[5] = {    0, 0, 0x3080, 0xE2080, 0x3C82080  };  int uni = 0;  int len = utf8_step(chars);  const char* src = chars;  switch (len) {  default:    break;  case 4:    uni += *src++;    uni <<= 6;  case 3:    uni += *src++;    uni <<= 6;  case 2:    uni += *src++;    uni <<= 6;  case 1:    uni += *src++;  }  uni -= utf8_offsets[len];  return uni;}// Get a terminated UTF8 string: Must delete[] it after use.char* UNICHAR::utf8_str() const {  int len = utf8_len();  char* str = new char[len + 1];  memcpy(str, chars, len);  str[len] = 0;  return str;}// Get the number of bytes in the first character of the given utf8 string.int UNICHAR::utf8_step(const char* utf8_str) {  static const char 
utf8_bytes[256] = {    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0  };  return utf8_bytes[*utf8_str];}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -