⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 unicode.xs

📁 source of perl for linux application,
💻 XS
字号:
/* $Id: Unicode.xs,v 2.3 2006/05/03 18:24:10 dankogai Exp $ */#define PERL_NO_GET_CONTEXT#include "EXTERN.h"#include "perl.h"#include "XSUB.h"#define U8 U8#include "../Encode/encode.h"#define FBCHAR			0xFFFd#define BOM_BE			0xFeFF#define BOM16LE			0xFFFe#define BOM32LE			0xFFFe0000#define issurrogate(x)		(0xD800 <= (x)  && (x) <= 0xDFFF )#define isHiSurrogate(x)	(0xD800 <= (x)  && (x) <  0xDC00 )#define isLoSurrogate(x)	(0xDC00 <= (x)  && (x) <= 0xDFFF )#define invalid_ucs2(x)         ( issurrogate(x) || 0xFFFF < (x) )static UVenc_unpack(pTHX_ U8 **sp,U8 *e,STRLEN size,U8 endian){    U8 *s = *sp;    UV v = 0;    if (s+size > e) {    croak("Partial character %c",(char) endian);    }    switch(endian) {    case 'N':    v = *s++;    v = (v << 8) | *s++;    case 'n':    v = (v << 8) | *s++;    v = (v << 8) | *s++;    break;    case 'V':    case 'v':    v |= *s++;    v |= (*s++ << 8);    if (endian == 'v')        break;    v |= (*s++ << 16);    v |= (*s++ << 24);    break;    default:    croak("Unknown endian %c",(char) endian);    break;    }    *sp = s;    return v;}voidenc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value){    U8 *d = (U8 *)SvGROW(result,SvCUR(result)+size+1);    switch(endian) {    case 'v':    case 'V':    d += SvCUR(result);    SvCUR_set(result,SvCUR(result)+size);    while (size--) {        *d++ = (U8)(value & 0xFF);        value >>= 8;    }    break;    case 'n':    case 'N':    SvCUR_set(result,SvCUR(result)+size);    d += SvCUR(result);    while (size--) {        *--d = (U8)(value & 0xFF);        value >>= 8;    }    break;    default:    croak("Unknown endian %c",(char) endian);    break;    }}MODULE = Encode::Unicode PACKAGE = Encode::UnicodePROTOTYPES: DISABLE#define attr(k, l)  (hv_exists((HV *)SvRV(obj),k,l) ? \    *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef)voiddecode_xs(obj, str, check = 0)SV *	objSV *	strIV	checkCODE:{    U8 endian   = *((U8 *)SvPV_nolen(attr("endian", 6)));    int size    =   SvIV(attr("size",   4));    int ucs2    = SvTRUE(attr("ucs2",   4));    int renewed = SvTRUE(attr("renewed",  7));    SV *result  = newSVpvn("",0);    STRLEN ulen;    U8 *s = (U8 *)SvPVbyte(str,ulen);    U8 *e = (U8 *)SvEND(str);    ST(0) = sv_2mortal(result);    SvUTF8_on(result);    if (!endian && s+size <= e) {    UV bom;    endian = (size == 4) ? 'N' : 'n';    bom = enc_unpack(aTHX_ &s,e,size,endian);        if (bom != BOM_BE) {        if (bom == BOM16LE) {        endian = 'v';        }        else if (bom == BOM32LE) {        endian = 'V';        }        else {        croak("%"SVf":Unrecognised BOM %"UVxf,                      *hv_fetch((HV *)SvRV(obj),"Name",4,0),              bom);        }    }#if 1    /* Update endian for next sequence */    if (renewed) {        hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);    }#endif    }    while (s < e && s+size <= e) {    UV ord = enc_unpack(aTHX_ &s,e,size,endian);    U8 *d;    if (issurrogate(ord)) {        if (ucs2 || size == 4) {        if (check) {            croak("%"SVf":no surrogates allowed %"UVxf,              *hv_fetch((HV *)SvRV(obj),"Name",4,0),              ord);        }        if (s+size <= e) {                    /* skip the next one as well */            enc_unpack(aTHX_ &s,e,size,endian);        }        ord = FBCHAR;        }        else {        UV lo;        if (!isHiSurrogate(ord)) {            if (check) {            croak("%"SVf":Malformed HI surrogate %"UVxf,                  *hv_fetch((HV *)SvRV(obj),"Name",4,0),                  ord);            }            else {            ord = FBCHAR;            }        }            else {            if (s+size > e) {            /* Partial character */            s -= size;   /* back up to 1st half */            break;       /* And exit loop */            }            lo = enc_unpack(aTHX_ &s,e,size,endian);            if (!isLoSurrogate(lo)){            if (check) {                croak("%"SVf":Malformed LO surrogate %"UVxf,                  *hv_fetch((HV *)SvRV(obj),"Name",4,0),                  ord);            }            else {                ord = FBCHAR;            }            }            else {            ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);            }        }        }    }    if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) {        if (check) {        croak("%"SVf":Unicode character %"UVxf" is illegal",              *hv_fetch((HV *)SvRV(obj),"Name",4,0),              ord);        } else {        ord = FBCHAR;        }    }    d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1);    d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0);    SvCUR_set(result,d - (U8 *)SvPVX(result));    }    if (s < e) {    /* unlikely to happen because it's fixed-length -- dankogai */    if (check & ENCODE_WARN_ON_ERR){        Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character",            *hv_fetch((HV *)SvRV(obj),"Name",4,0));    }    }    if (check && !(check & ENCODE_LEAVE_SRC)){    if (s < e) {        Move(s,SvPVX(str),e-s,U8);        SvCUR_set(str,(e-s));    }    else {        SvCUR_set(str,0);    }    *SvEND(str) = '\0';    }    XSRETURN(1);}voidencode_xs(obj, utf8, check = 0)SV *	objSV *	utf8IV	checkCODE:{    U8 endian   = *((U8 *)SvPV_nolen(attr("endian", 6)));    int size    =   SvIV(attr("size",   4));    int ucs2    = SvTRUE(attr("ucs2",   4));    int renewed = SvTRUE(attr("renewed",  7));    SV *result  = newSVpvn("",0);    STRLEN ulen;    U8 *s = (U8 *)SvPVutf8(utf8,ulen);    U8 *e = (U8 *)SvEND(utf8);    ST(0) = sv_2mortal(result);    if (!endian) {    endian = (size == 4) ? 'N' : 'n';    enc_pack(aTHX_ result,size,endian,BOM_BE);#if 1    /* Update endian for next sequence */    if (renewed){        hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);    }#endif    }    while (s < e && s+UTF8SKIP(s) <= e) {    STRLEN len;    UV ord = utf8n_to_uvuni(s, e-s, &len, 0);        s += len;    if (size != 4 && invalid_ucs2(ord)) {        if (!issurrogate(ord)){        if (ucs2) {            if (check) {            croak("%"SVf":code point \"\\x{%"UVxf"}\" too high",                  *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord);            }            enc_pack(aTHX_ result,size,endian,FBCHAR);        }else{            UV hi = ((ord - 0x10000) >> 10)   + 0xD800;            UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;            enc_pack(aTHX_ result,size,endian,hi);            enc_pack(aTHX_ result,size,endian,lo);        }        }        else {        /* not supposed to happen */        enc_pack(aTHX_ result,size,endian,FBCHAR);        }    }    else {        enc_pack(aTHX_ result,size,endian,ord);    }    }    if (s < e) {    /* UTF-8 partial char happens often on PerlIO.       Since this is okay and normal, we do not warn.       But this is critical when you choose to LEAVE_SRC       in which case we die */    if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)){        Perl_croak(aTHX_ "%"SVf":partial character is not allowed "               "when CHECK = 0x%" UVuf,               *hv_fetch((HV *)SvRV(obj),"Name",4,0), check);    }        }    if (check && !(check & ENCODE_LEAVE_SRC)){    if (s < e) {        Move(s,SvPVX(utf8),e-s,U8);        SvCUR_set(utf8,(e-s));    }    else {        SvCUR_set(utf8,0);    }    *SvEND(utf8) = '\0';    }     XSRETURN(1);}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -