⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 normalize.xs

📁 source of perl for linux application,
💻 XS
📖 第 1 页 / 共 2 页
字号:
#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"

/* These 5 files are prepared by mkheader */
#include "unfcmb.h"
#include "unfcan.h"
#include "unfcpt.h"
#include "unfcmp.h"
#include "unfexc.h"

/* Perl 5.6.1 ? */
#ifndef uvuni_to_utf8
#define uvuni_to_utf8   uv_to_utf8
#endif /* uvuni_to_utf8 */

/* Perl 5.6.1 ? */
#ifndef utf8n_to_uvuni
#define utf8n_to_uvuni  utf8_to_uv
#endif /* utf8n_to_uvuni */

/* UTF8_ALLOW_BOM is used before Perl 5.8.0 */
#ifdef UTF8_ALLOW_BOM
#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FFFF)
#else
#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
#endif

/* if utf8n_to_uvuni() sets retlen to 0 (?) */
#define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character"

/* utf8_hop() hops back before start. Maybe broken UTF-8 */
#define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start"

/* It should never happen as there is no instance in UTF-8 and UTF-EBCDIC;
   according to Versioning and Stability in UAX#15, no new composition
   should come in future. */
#define ErrLongerThanSrc "panic (Unicode::Normalize %s): longer than source"

/* uvuni_to_utf8 wants UTF8_MAXBYTES free bytes available */
#define ErrTargetNotEnough "panic (Unicode::Normalize %s): target not enough"

/* At present, char > 0x10ffff are unaffected without complaint, right? */
#define VALID_UTF_MAX    (0x10ffff)
#define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))

/* size of array for combining characters */
/* enough as an initial value? */
#define CC_SEQ_SIZE (10)
#define CC_SEQ_STEP  (5)

/* HANGUL begin */
#define Hangul_SBase  0xAC00
#define Hangul_SFinal 0xD7A3
#define Hangul_SCount  11172

#define Hangul_NCount    588

#define Hangul_LBase  0x1100
#define Hangul_LFinal 0x1112
#define Hangul_LCount     19

#define Hangul_VBase  0x1161
#define Hangul_VFinal 0x1175
#define Hangul_VCount     21

#define Hangul_TBase  0x11A7
#define Hangul_TFinal 0x11C2
#define Hangul_TCount     28

#define Hangul_IsS(u)  ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal))
#define Hangul_IsN(u)  (((u) - Hangul_SBase) % Hangul_TCount == 0)
#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
#define Hangul_IsL(u)  ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
#define Hangul_IsV(u)  ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
/* The lower bound is exclusive: tindex 0 (== Hangul_TBase) means
   "no trailing consonant", see pv_cat_decompHangul(). */
#define Hangul_IsT(u)  ((Hangul_TBase  < (u)) && ((u) <= Hangul_TFinal))
/* HANGUL end */

/* this is used for canonical ordering of combining characters (c.c.). */
typedef struct {
    U8 cc;      /* combining class */
    UV uv;      /* codepoint */
    STRLEN pos; /* position */
} UNF_cc;

/*
 * qsort() comparator for UNF_cc records.
 *
 * Orders primarily by combining class and, for equal classes, by the
 * recorded position, so that records of the same class keep their
 * original relative order even though qsort() itself is not stable.
 */
static int compare_cc(const void *a, const void *b)
{
    const UNF_cc *lhs = (const UNF_cc *) a;
    const UNF_cc *rhs = (const UNF_cc *) b;
    int ret_cc = lhs->cc - rhs->cc;

    if (ret_cc)
        return ret_cc;

    /* yields -1, 0 or 1 without subtracting unsigned positions */
    return (lhs->pos > rhs->pos) - (lhs->pos < rhs->pos);
}

/*
 * Looks up uv in the three-level UNF_canon table (plane / row / cell).
 *
 * Returns the table entry, or NULL when uv is above VALID_UTF_MAX or
 * when any level of the table has no entry for it.  Callers in this
 * file treat the result as a NUL-terminated byte string.
 */
static U8* dec_canonical(UV uv)
{
    U8 ***plane, **row;

    if (OVER_UTF_MAX(uv))
        return NULL;
    plane = (U8***)UNF_canon[uv >> 16];
    if (! plane)
        return NULL;
    row = plane[(uv >> 8) & 0xff];
    return row ? row[uv & 0xff] : NULL;
}

/*
 * Same lookup as dec_canonical(), but against the UNF_compat table.
 */
static U8* dec_compat(UV uv)
{
    U8 ***plane, **row;

    if (OVER_UTF_MAX(uv))
        return NULL;
    plane = (U8***)UNF_compat[uv >> 16];
    if (! plane)
        return NULL;
    row = plane[(uv >> 8) & 0xff];
    return row ? row[uv & 0xff] : NULL;
}

/*
 * Returns the code point composed from the pair (uv, uv2), or 0 when
 * the pair has no composition.
 *
 * Hangul L+V and LV+T pairs are composed arithmetically; every other
 * pair is searched in the UNF_compos table, whose cells are arrays of
 * UNF_complist terminated by an entry with nextchar == 0.
 */
static UV composite_uv(UV uv, UV uv2)
{
    UNF_complist ***plane, **row, *cell, *i;

    if (!uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
        return 0;

    if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
        UV lindex = uv  - Hangul_LBase;
        UV vindex = uv2 - Hangul_VBase;
        return(Hangul_SBase + (lindex * Hangul_VCount + vindex) *
               Hangul_TCount);
    }
    if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
        UV tindex = uv2 - Hangul_TBase;
        return(uv + tindex);
    }
    plane = UNF_compos[uv >> 16];
    if (! plane)
        return 0;
    row = plane[(uv >> 8) & 0xff];
    if (! row)
        return 0;
    cell = row[uv & 0xff];
    if (! cell)
        return 0;
    for (i = cell; i->nextchar; i++) {
        if (uv2 == i->nextchar)
            return i->composite;
    }
    return 0;
}

/*
 * Returns the combining class stored for uv in the UNF_combin table,
 * or 0 when uv is above VALID_UTF_MAX or has no table entry.
 */
static U8 getCombinClass(UV uv)
{
    U8 **plane, *row;

    if (OVER_UTF_MAX(uv))
        return 0;
    plane = (U8**)UNF_combin[uv >> 16];
    if (! plane)
        return 0;
    row = plane[(uv >> 8) & 0xff];
    return row ? row[uv & 0xff] : 0;
}

/*
 * Appends the jamo decomposition of the Hangul syllable uv at d.
 *
 * Writes L and V, plus T when the syllable has a trailing consonant,
 * and returns the advanced write pointer.  Returns d unchanged when uv
 * is not a Hangul syllable.  The caller must provide room for up to
 * three encoded characters.
 */
static U8* pv_cat_decompHangul(U8* d, UV uv)
{
    /* For a non-syllable these unsigned values are meaningless, but
       they are never used: the Hangul_IsS() check below returns first. */
    UV sindex =  uv - Hangul_SBase;
    UV lindex =  sindex / Hangul_NCount;
    UV vindex = (sindex % Hangul_NCount) / Hangul_TCount;
    UV tindex =  sindex % Hangul_TCount;

    if (! Hangul_IsS(uv))
        return d;

    d = uvuni_to_utf8(d, (lindex + Hangul_LBase));
    d = uvuni_to_utf8(d, (vindex + Hangul_VBase));
    if (tindex)
        d = uvuni_to_utf8(d, (tindex + Hangul_TBase));
    return d;
}

/*
 * Returns the string buffer of sv as UTF-8 and stores its byte length
 * in *lp when lp is not NULL.
 *
 * When sv is not flagged UTF-8, the bytes are copied into a mortal
 * temporary SV which is upgraded instead, so sv itself is not upgraded.
 * In that case the returned pointer belongs to the temporary.
 */
static char* sv_2pvunicode(SV *sv, STRLEN *lp)
{
    char *s;
    STRLEN len;

    s = SvPV(sv,len);
    if (!SvUTF8(sv)) {
        SV* tmpsv = sv_2mortal(newSVpvn(s, len));
        if (!SvPOK(tmpsv))
            s = SvPV_force(tmpsv,len);
        sv_utf8_upgrade(tmpsv);
        s = SvPV(tmpsv,len);
    }
    if (lp)
        *lp = len;
    return s;
}

/*
 * Decomposes the UTF-8 string s (slen bytes) into the buffer *dp.
 *
 *   dp       in/out: start of the destination buffer; it is grown with
 *            Renew() as needed and *dp is updated to the final start.
 *   dlen     usable size of the destination buffer.  Every Renew() here
 *            requests dlen+1 bytes; presumably the caller allocated the
 *            initial buffer the same way -- TODO confirm at call sites.
 *   iscompat selects dec_compat() when true, dec_canonical() otherwise.
 *
 * Returns a pointer just past the last byte written.  No terminating
 * NUL is written.  Croaks when a character of zero length is decoded.
 */
static
U8* pv_utf8_decompose(U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat)
{
    U8* p = s;
    U8* e = s + slen;
    U8* dstart = *dp;
    U8* d = dstart;

    while (p < e) {
        STRLEN retlen;
        UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero, "decompose");
        p += retlen;

        if (Hangul_IsS(uv)) {
            /* a syllable expands to at most three jamo */
            STRLEN cur = d - dstart;

            if (dlen < cur + UTF8_MAXLEN * 3) {
                dlen += UTF8_MAXLEN * 3;
                Renew(dstart, dlen+1, U8);
                d = dstart + cur; /* the buffer may have moved */
            }
            d = pv_cat_decompHangul(d, uv);
        }
        else {
            U8* r = iscompat ? dec_compat(uv) : dec_canonical(uv);

            if (r) {
                /* table hit: copy the NUL-terminated mapping as is */
                STRLEN len = (STRLEN)strlen((char *)r);
                STRLEN cur = d - dstart;

                if (dlen < cur + len) {
                    dlen += len;
                    Renew(dstart, dlen+1, U8);
                    d = dstart + cur;
                }
                while (len--)
                    *d++ = *r++;
            }
            else {
                /* no mapping: re-encode the character unchanged */
                STRLEN cur = d - dstart;

                if (dlen < cur + UTF8_MAXLEN) {
                    dlen += UTF8_MAXLEN;
                    Renew(dstart, dlen+1, U8);
                    d = dstart + cur;
                }
                d = uvuni_to_utf8(d, uv);
            }
        }
    }
    *dp = dstart;
    return d;
}

/*
 * Copies the UTF-8 string s (slen bytes) to d, sorting every run of
 * characters with a non-zero combining class by that class.
 *
 * d must have room for at least slen + UTF8_MAXLEN bytes (dlen),
 * otherwise the function croaks.  Returns a pointer just past the last
 * byte written.
 *
 * NOTE(review): the croak() calls inside the loop do not release
 * seq_ext; if croak() does not return, that buffer leaks on those
 * paths -- confirm whether this matters for these "panic" cases.
 */
static
U8* pv_utf8_reorder(U8* s, STRLEN slen, U8* d, STRLEN dlen)
{
    U8* p = s;
    U8* e = s + slen;
    U8* dend = d + dlen;

    UNF_cc  seq_ary[CC_SEQ_SIZE];
    UNF_cc* seq_ptr = seq_ary; /* use array at the beginning */
    UNF_cc* seq_ext = NULL; /* extend if need */
    STRLEN seq_max = CC_SEQ_SIZE;
    STRLEN cc_pos = 0;

    if (dlen < slen || dlen < slen + UTF8_MAXLEN)
        croak(ErrTargetNotEnough, "reorder");
    dend -= UTF8_MAXLEN; /* safety */

    while (p < e) {
        U8 curCC;
        STRLEN retlen;
        UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero, "reorder");
        p += retlen;

        curCC = getCombinClass(uv);

        if (curCC != 0) {
            if (seq_max < cc_pos + 1) { /* extend if need */
                seq_max = cc_pos + CC_SEQ_STEP; /* new size */
                if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
                    STRLEN i;
                    New(0, seq_ext, seq_max, UNF_cc);
                    for (i = 0; i < cc_pos; i++)
                        seq_ext[i] = seq_ary[i];
                }
                else {
                    Renew(seq_ext, seq_max, UNF_cc);
                }
                seq_ptr = seq_ext; /* use seq_ext from now */
            }

            seq_ptr[cc_pos].cc  = curCC;
            seq_ptr[cc_pos].uv  = uv;
            seq_ptr[cc_pos].pos = cc_pos;
            ++cc_pos;

            /* keep collecting unless this was the last character */
            if (p < e)
                continue;
        }

        /* a starter or the end of input closes the pending run */
        if (cc_pos) {
            STRLEN i;

            if (cc_pos > 1) /* reordered if there are two c.c.'s */
                qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc);

            for (i = 0; i < cc_pos; i++) {
                d = uvuni_to_utf8(d, seq_ptr[i].uv);
                if (dend < d) /* real end is dend + UTF8_MAXLEN */
                    croak(ErrLongerThanSrc, "reorder");
            }
            cc_pos = 0;
        }

        if (curCC == 0) {
            d = uvuni_to_utf8(d, uv);
            if (dend < d) /* real end is dend + UTF8_MAXLEN */
                croak(ErrLongerThanSrc, "reorder");
        }
    }
    if (seq_ext)
        Safefree(seq_ext);
    return d;
}

/*
 * Composes the UTF-8 string s (slen bytes) into d.
 *
 * The current starter is held in uvS and repeatedly combined with the
 * following characters through composite_uv(); characters that are
 * blocked or do not compose are queued in seq_ptr and written out after
 * the starter.  When iscontig is true, a character is only tried while
 * nothing has been queued since the starter.
 *
 * d must have room for at least slen + UTF8_MAXLEN bytes (dlen),
 * otherwise the function croaks.  Returns a pointer just past the last
 * byte written.
 *
 * NOTE(review): the croak() calls inside the loop do not release
 * seq_ext; if croak() does not return, that buffer leaks on those
 * paths -- confirm whether this matters for these "panic" cases.
 */
static
U8* pv_utf8_compose(U8* s, STRLEN slen, U8* d, STRLEN dlen, bool iscontig)
{
    U8* p = s;
    U8* e = s + slen;
    U8* dend = d + dlen;

    UV uvS = 0; /* code point of the starter */
    bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */
    U8 preCC = 0;

    UV  seq_ary[CC_SEQ_SIZE];
    UV* seq_ptr = seq_ary; /* use array at the beginning */
    UV* seq_ext = NULL; /* extend if need */
    STRLEN seq_max = CC_SEQ_SIZE;
    STRLEN cc_pos = 0;

    if (dlen < slen || dlen < slen + UTF8_MAXLEN)
        croak(ErrTargetNotEnough, "compose");
    dend -= UTF8_MAXLEN; /* safety */

    while (p < e) {
        U8 curCC;
        STRLEN retlen;
        UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero, "compose");
        p += retlen;

        curCC = getCombinClass(uv);

        if (!valid_uvS) {
            if (curCC == 0) {
                uvS = uv; /* the first Starter is found */
                valid_uvS = TRUE;
                if (p < e)
                    continue;
            }
            else {
                /* combining characters before any starter pass through */
                d = uvuni_to_utf8(d, uv);
                if (dend < d) /* real end is dend + UTF8_MAXLEN */
                    croak(ErrLongerThanSrc, "compose");
                continue;
            }
        }
        else {
            bool composed;

            /* blocked */
            if ((iscontig && cc_pos) || /* discontiguous combination */
                (curCC != 0 && preCC == curCC) || /* blocked by same CC */
                preCC > curCC) /* blocked by higher CC: revised D2 */
                composed = FALSE;

            /* not blocked:
                 iscontig && cc_pos == 0      -- contiguous combination
                 curCC == 0 && preCC == 0     -- starter + starter
                 curCC != 0 && preCC < curCC  -- lower CC */
            else {
                /* try composition */
                UV uvComp = composite_uv(uvS, uv);

                if (uvComp && !isExclusion(uvComp))  {
                    uvS = uvComp;
                    composed = TRUE;

                    /* preCC should not be changed to curCC */
                    /* e.g. 1E14 = 0045 0304 0300 where CC(0304) == CC(0300) */
                    if (p < e)
                        continue;
                }
                else
                    composed = FALSE;
            }

            if (!composed) {
                preCC = curCC;
                /* Queue uv when it is a combining character, or when it
                   is the final character of the input (so that it is
                   still written out after the starter below). */
                if (curCC != 0 || !(p < e)) {
                    if (seq_max < cc_pos + 1) { /* extend if need */
                        seq_max = cc_pos + CC_SEQ_STEP; /* new size */
                        if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
                            New(0, seq_ext, seq_max, UV);
                            Copy(seq_ary, seq_ext, cc_pos, UV);
                        }
                        else {
                            Renew(seq_ext, seq_max, UV);
                        }
                        seq_ptr = seq_ext; /* use seq_ext from now */
                    }
                    seq_ptr[cc_pos] = uv;
                    ++cc_pos;
                }
                if (curCC != 0 && p < e)
                    continue;
            }
        }

        /* flush: the starter first, then everything queued behind it */
        d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
        if (dend < d) /* real end is dend + UTF8_MAXLEN */
            croak(ErrLongerThanSrc, "compose");

        if (cc_pos) {
            STRLEN i;

            for (i = 0; i < cc_pos; i++) {
                d = uvuni_to_utf8(d, seq_ptr[i]);
                if (dend < d) /* real end is dend + UTF8_MAXLEN */
                    croak(ErrLongerThanSrc, "compose");
            }
            cc_pos = 0;
        }

        /* the character that ended the run becomes the next starter */
        uvS = uv;
    }
    if (seq_ext)
        Safefree(seq_ext);
    return d;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -