⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 unicode.cpp

📁 ncbi源码
💻 CPP
字号:
/*
 * ===========================================================================
 * PRODUCTION $Log: unicode.cpp,v $
 * PRODUCTION Revision 1000.0  2004/06/01 19:43:29  gouriano
 * PRODUCTION PRODUCTION: IMPORTED [GCC34_MSVC7] Dev-tree R1.3
 * PRODUCTION
 * ===========================================================================
 */

/*  $Id: unicode.cpp,v 1000.0 2004/06/01 19:43:29 gouriano Exp $
 * ==========================================================================
 *
 *                            PUBLIC DOMAIN NOTICE
 *               National Center for Biotechnology Information
 *
 *  This software/database is a "United States Government Work" under the
 *  terms of the United States Copyright Act.  It was written as part of
 *  the author's official duties as a United States Government employee and
 *  thus cannot be copyrighted.  This software/database is freely available
 *  to the public for use. The National Library of Medicine and the U.S.
 *  Government have not placed any restriction on its use or reproduction.
 *
 *  Although all reasonable efforts have been taken to ensure the accuracy
 *  and reliability of the software and data, the NLM and the U.S.
 *  Government do not and cannot warrant the performance or results that
 *  may be obtained by using this software or data. The NLM and the U.S.
 *  Government disclaim all warranties, express or implied, including
 *  warranties of performance, merchantability or fitness for any particular
 *  purpose.
 *
 *  Please cite the author in any work or product based on this material.
 *
 * ==========================================================================
 *
 * Author: Aleksey Vinokurov
 *
 * File Description:
 *    Unicode transformation library
 *
 */

#include <ncbi_pch.hpp>
#include <util/unicode.hpp>

BEGIN_NCBI_SCOPE
BEGIN_SCOPE(utf8)

#include "unicode_plans/plan00.inc"
#include "unicode_plans/plan01.inc"
#include "unicode_plans/plan02.inc"
#include "unicode_plans/plan03.inc"
#include "unicode_plans/plan04.inc"
#include "unicode_plans/plan1e.inc"
#include "unicode_plans/plan20.inc"
#include "unicode_plans/plan21.inc"
#include "unicode_plans/plan22.inc"
#include "unicode_plans/plan23.inc"
#include "unicode_plans/plan24.inc"
#include "unicode_plans/plan25.inc"
#include "unicode_plans/plan26.inc"
#include "unicode_plans/plan27.inc"
#include "unicode_plans/plan30.inc"
#include "unicode_plans/plane0.inc"
#include "unicode_plans/plane2.inc"
#include "unicode_plans/plane3.inc"
#include "unicode_plans/plane4.inc"
#include "unicode_plans/plane5.inc"
#include "unicode_plans/plane6.inc"
#include "unicode_plans/plane7.inc"
#include "unicode_plans/plane8.inc"
#include "unicode_plans/planea.inc"
#include "unicode_plans/planeb.inc"
#include "unicode_plans/planfb.inc"
#include "unicode_plans/planfe.inc"

// Default translation table: one slot per value of the high byte of a
// 16-bit code point (256 slots).  A zero slot means that no translation
// plan is available for that range.
static TUnicodeTable g_DefaultUnicodeTable =
{
    &s_Plan_00h, &s_Plan_01h, &s_Plan_02h, &s_Plan_03h, &s_Plan_04h, 0, 0, 0,  // Plan 00 - 07
    0, 0, 0, 0, 0, 0, 0, 0,  // Plan 08 - 0F
    0, 0, 0, 0, 0, 0, 0, 0,  // Plan 10 - 17
    0, 0, 0, 0, 0, 0, &s_Plan_1Eh, 0,  // Plan 18 - 1F
    &s_Plan_20h, &s_Plan_21h, &s_Plan_22h, &s_Plan_23h, &s_Plan_24h, &s_Plan_25h, &s_Plan_26h, &s_Plan_27h,  // Plan 20 - 27
    0, 0, 0, 0, 0, 0, 0, 0,  // Plan 28 - 2F
    &s_Plan_30h, 0, 0, 0, 0, 0, 0, 0,  // Plan 30 - 37
    0, 0, 0, 0, 0, 0, 0, 0,  // Plan 38 - 3F
    0, 0, 0, 0, 0, 0, 0, 0,  // Plan 40 - 47
    0, 0, 0, 0, 0, 0, 0, 0,  // Plan 48 - 4F
    0, 0, 0, 0, 0, 0, 0, 0,  // Plan 50 - 57
    0, 0, 0, 0, 0, 0, 0, 0,  // Plan 58 - 5F
    0, 0, 0, 0, 0, 0, 0, 0,  // Plan 60 - 67
    0, 0, 0, 0, 0, 0, 0, 0,  // Plan 68 - 6F
    0, 0, 0, 0, 0, 0, 0, 0,  // Plan 70 - 77
    0, 0, 0, 0, 0, 0, 0, 0,  // Plan 78 - 7F
    0, 0, 0, 0, 0, 0, 0, 0,  // Plan 80 - 87
    0, 0, 0, 0, 0, 0, 0, 0,  // Plan 88 - 8F
    0, 0, 0, 0, 0, 0, 0, 0,  // Plan 90 - 97
    0, 0, 0, 0, 0, 0, 0, 0,  // Plan 98 - 9F
    0, 0, 0, 0, 0, 0, 0, 0,  // Plan A0 - A7
    0, 0, 0, 0, 0, 0, 0, 0,  // Plan A8 - AF
    0, 0, 0, 0, 0, 0, 0, 0,  // Plan B0 - B7
    0, 0, 0, 0, 0, 0, 0, 0,  // Plan B8 - BF
    0, 0, 0, 0, 0, 0, 0, 0,  // Plan C0 - C7
    0, 0, 0, 0, 0, 0, 0, 0,  // Plan C8 - CF
    0, 0, 0, 0, 0, 0, 0, 0,  // Plan D0 - D7
    0, 0, 0, 0, 0, 0, 0, 0,  // Plan D8 - DF
    &s_Plan_E0h, 0, &s_Plan_E2h, &s_Plan_E3h, &s_Plan_E4h, &s_Plan_E5h, &s_Plan_E6h, &s_Plan_E7h,  // Plan E0 - E7
    &s_Plan_E8h, 0, &s_Plan_EAh, &s_Plan_EBh, 0, 0, 0, 0,  // Plan E8 - EF
    0, 0, 0, 0, 0, 0, 0, 0,  // Plan F0 - F7
    0, 0, 0, &s_Plan_FBh, 0, 0, &s_Plan_FEh, 0   // Plan F8 - FF
};


/// Look up the translation record for a code point.
///
/// @param character
///   Code point to translate.
/// @param table
///   Translation table to use; when null, g_DefaultUnicodeTable is used.
/// @return
///   Pointer to the translation record, or 0 when the table has no plan for
///   the code point's high byte or the code point lies above U+FFFF.
const SUnicodeTranslation*
UnicodeToAscii(TUnicode character, const TUnicodeTable* table)
{
    if (!table) {
        table = &g_DefaultUnicodeTable;
    }

    // The table is indexed by the high byte of a 16-bit value only.  Without
    // this guard a code point such as U+10041 would be looked up as U+0041.
    if ( (character >> 16) != 0 ) {
        return 0;
    }

    unsigned int thePlanNo = (character & 0xFF00) >> 8;
    unsigned int theOffset = character & 0xFF;

    const TUnicodePlan* thePlan = (*table)[thePlanNo];
    if ( !thePlan ) {
        return 0;
    }
    return &((*thePlan)[theOffset]);
}


/// Decode a multi-byte UTF-8 sequence.
///
/// The caller must have verified that the two top bits of the first byte
/// are set.  Every further consecutive 1 bit in the lead byte (starting at
/// bit 6) announces one continuation byte of the form 10xxxxxx.
///
/// @param theUTF
///   Pointer to the lead byte of the sequence.
/// @param theUnicode
///   Receives the decoded value; written only on success.
/// @return
///   Number of bytes consumed, or -1 when a continuation byte is malformed
///   (this includes hitting the terminating NUL in the middle of a sequence).
static int s_DecodeMultiByte(const char* theUTF, TUnicode* theUnicode)
{
    // Work on unsigned bytes so that the result does not depend on whether
    // plain 'char' is signed on the target platform.
    const unsigned char* start =
        reinterpret_cast<const unsigned char*>(theUTF);
    const unsigned char* p = start;
    unsigned int lead = *p++;

    // Count announced continuation bytes and, in parallel, narrow the mask
    // that selects the payload bits of the lead byte so that no length-marker
    // bit leaks into the decoded value.
    unsigned int trailCount  = 0;
    unsigned int payloadMask = 0x7F;
    for (unsigned int bit = 0x40;  (lead & bit) != 0;  bit >>= 1) {
        ++trailCount;
        payloadMask >>= 1;
    }

    TUnicode acc = lead & payloadMask;
    for (unsigned int i = 0;  i < trailCount;  ++i) {
        unsigned int c = *p++;
        if ( (c & 0xC0) != 0x80 ) {
            return -1;  // Broken UTF-8 chain
        }
        acc = (acc << 6) | (c & 0x3F);
    }

    *theUnicode = acc;
    return int(p - start);
}


/// Decode the first character of a UTF-8 string.
///
/// @param theUTF
///   Pointer to the first byte of the character; must not be null.
/// @return
///   The decoded code point.  When the first byte does not have its two top
///   bits set (plain ASCII or a stray continuation byte) the byte value
///   itself is returned.  On a broken continuation chain ~0 is returned.
TUnicode UTF8ToUnicode( const char* theUTF )
{
    if ( ((*theUTF) & 0xC0) != 0xC0 ) {
        return TUnicode((unsigned char)theUTF[0]);
    }

    TUnicode acc = 0;
    if ( s_DecodeMultiByte(theUTF, &acc) < 0 ) {
        return TUnicode(~0);
    }
    return acc;
}


/// Decode the first character of a UTF-8 string and report its length.
///
/// @param theUTF
///   Pointer to the first byte of the character; must not be null.
/// @param theUnicode
///   Receives the decoded code point; left untouched on failure.
/// @return
///   Number of bytes consumed (1 for a byte below 0x80), 0 when the first
///   byte cannot start a sequence, or -1 on a broken continuation chain.
int UTF8ToUnicode( const char* theUTF, TUnicode* theUnicode )
{
    if ( (unsigned char)theUTF[0] < 0x80 ) {
        // This is one character UTF8. I.e. regular character.
        *theUnicode = *theUTF;
        return 1;
    }

    if ( ((*theUTF) & 0xC0) != 0xC0 ) {
        // This is not a unicode
        return 0;
    }

    TUnicode acc = 0;
    int seqLen = s_DecodeMultiByte(theUTF, &acc);
    if ( seqLen < 0 ) {
        return -1;
    }
    *theUnicode = acc;
    return seqLen;
}


/// Encode a code point as a UTF-8 string.
///
/// @param theUnicode
///   Code point to encode.
/// @return
///   The UTF-8 bytes; empty when the buffer-based overload produced nothing
///   (it encodes values below 0x200000 only).
string UnicodeToUTF8( TUnicode theUnicode )
{
    char theBuffer[10];
    int theLength = UnicodeToUTF8( theUnicode, theBuffer,
                                   int(sizeof(theBuffer)) );
    return string( theBuffer, theLength );
}


/// Encode a code point as UTF-8 into a caller-supplied buffer.
///
/// No terminating NUL is written.
///
/// @param theUnicode
///   Code point to encode.
/// @param theBuffer
///   Destination buffer.
/// @param theBufLength
///   Capacity of theBuffer in bytes.
/// @return
///   Number of bytes written (1 to 4), or 0 when the buffer is too small or
///   the value is 0x200000 or greater.
int UnicodeToUTF8( TUnicode theUnicode, char *theBuffer, int theBufLength )
{
    int Length = 0;

    if (theUnicode < 0x80) {
        Length = 1;
        if ( Length > theBufLength ) return 0;
        theBuffer[0] = char(theUnicode);
    }
    else if (theUnicode < 0x800) {
        Length = 2;
        if ( Length > theBufLength ) return 0;
        theBuffer[0] = char(0xC0 | (theUnicode >> 6));
        theBuffer[1] = char(0x80 | (theUnicode & 0x3F));
    }
    else if (theUnicode < 0x10000) {
        Length = 3;
        if ( Length > theBufLength ) return 0;
        theBuffer[0] = char(0xE0 | (theUnicode >> 12));
        theBuffer[1] = char(0x80 | ((theUnicode >> 6) & 0x3F));
        theBuffer[2] = char(0x80 | (theUnicode & 0x3F));
    }
    else if (theUnicode < 0x200000) {
        Length = 4;
        if ( Length > theBufLength ) return 0;
        theBuffer[0] = char(0xF0 | (theUnicode >> 18));
        theBuffer[1] = char(0x80 | ((theUnicode >> 12) & 0x3F));
        theBuffer[2] = char(0x80 | ((theUnicode >> 6) & 0x3F));
        theBuffer[3] = char(0x80 | (theUnicode & 0x3F));
    }
    return Length;
}


/// Translate a UTF-8 string into a caller-supplied buffer using a
/// translation table.
///
/// Bytes that cannot be decoded are skipped one at a time.  Characters
/// without a translation record, of type eSkip, or with a null substitution
/// string produce no output.  Characters of type eAsIs are copied unchanged;
/// all others are replaced by their substitution string.
///
/// The output is not NUL-terminated by this function.
///
/// @param src
///   NUL-terminated UTF-8 input.
/// @param dst
///   Destination buffer.
/// @param dstLen
///   Capacity of dst in bytes.
/// @param table
///   Translation table, passed through to UnicodeToAscii().
/// @return
///   Number of bytes written; 0 when src or dst is null or dstLen is 0;
///   -1 when dst is too small for the next piece of output.
int UTF8ToAscii( const char* src, char* dst,
                 int dstLen, const TUnicodeTable* table)
{
    if ( !src || !dst || dstLen == 0 ) return 0;

    long srcPos = 0;
    long dstPos = 0;
    long srcLen = long(strlen( src ));

    while ( srcPos < srcLen )
    {
        const char* pSrc = src + srcPos;

        TUnicode theUnicode = 0;
        int utfLen = UTF8ToUnicode( pSrc, &theUnicode );
        if ( (utfLen == 0) || (utfLen == -1) ) {
            // Skip the error.
            srcPos++;
            continue;
        }
        srcPos += utfLen;

        // Find the correct substitution.
        const SUnicodeTranslation*
            pSubst = UnicodeToAscii( theUnicode, table );

        // Nothing to emit: no translation, explicit skip, or no
        // substitution string.
        if ( !pSubst ) {
            continue;
        }
        if ( (pSubst->Type == eSkip) || !(pSubst->Subst) ) {
            continue;
        }

        // Select what to emit: the original bytes or the substitution.
        const char* piece;
        long        pieceLen;
        if ( pSubst->Type == eAsIs ) {
            piece    = pSrc;
            pieceLen = utfLen;
        } else {
            piece    = pSubst->Subst;
            pieceLen = long(strlen( pSubst->Subst ));
        }

        // The capacity check applies to both kinds of output; copying the
        // original bytes must not run past the end of dst either.
        if ( (dstPos + pieceLen) > dstLen ) {
            return -1; // Insufficient space
        }
        memcpy( dst + dstPos, piece, pieceLen );
        dstPos += pieceLen;
    }
    return int(dstPos);
}


/// Translate a UTF-8 string into a new string using a translation table.
///
/// Applies the same per-character rules as UTF8ToAscii(): undecodable bytes
/// are skipped one at a time; characters without a translation record, of
/// type eSkip, or with a null substitution string produce no output; eAsIs
/// characters are copied unchanged; all others are replaced by their
/// substitution string.
///
/// @param src
///   NUL-terminated UTF-8 input; a null pointer yields an empty string.
/// @param table
///   Translation table, passed through to UnicodeToAscii().
/// @return
///   The translated string.
string UTF8ToAsciiString( const char* src, const TUnicodeTable* table)
{
    // Returning a literal 0 here would construct a string from a null
    // pointer, which is undefined behaviour.
    if ( !src ) return string();

    string dst;
    long srcPos = 0;
    long srcLen = long(strlen( src ));

    while ( srcPos < srcLen )
    {
        const char* pSrc = src + srcPos;

        TUnicode theUnicode = 0;
        int utfLen = UTF8ToUnicode( pSrc, &theUnicode );
        if ( (utfLen == 0) || (utfLen == -1) ) {
            // Skip the error.
            srcPos++;
            continue;
        }

        // Advance exactly once per decoded character; advancing again in
        // the branches below would drop the input that follows.
        srcPos += utfLen;

        // Find the correct substitution.
        const SUnicodeTranslation*
            pSubst = UnicodeToAscii( theUnicode, table );

        // Nothing to emit: no translation, explicit skip, or no
        // substitution string.
        if ( !pSubst ) {
            continue;
        }
        if ( (pSubst->Type == eSkip) || !(pSubst->Subst) ) {
            continue;
        }

        if ( pSubst->Type == eAsIs ) {
            // Keep the original UTF-8 bytes.
            dst.append( pSrc, utfLen );
            continue;
        }

        // Append the substituting value to the destination string.
        dst += pSubst->Subst;
    }
    return dst;
}


END_SCOPE(utf8)
END_NCBI_SCOPE

/*
 * ==========================================================================
 * $Log: unicode.cpp,v $
 * Revision 1000.0  2004/06/01 19:43:29  gouriano
 * PRODUCTION: IMPORTED [GCC34_MSVC7] Dev-tree R1.3
 *
 * Revision 1.3  2004/05/17 21:06:02  gorelenk
 * Added include of PCH ncbi_pch.hpp
 *
 * Revision 1.2  2004/05/13 21:18:12  ucko
 * Respect constness in UnicodeToAscii.
 *
 * Revision 1.1  2004/05/06 18:15:29  gouriano
 * Imported from pubmed/xmldb
 *
 * ==========================================================================
 */

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -