📄 normalize.xs
字号:
MODULE = Unicode::Normalize	PACKAGE = Unicode::Normalize

# XS layout note: each XSUB is its own paragraph -- return type on one line,
# function name starting in column 0 on the next, then indented parameter
# declarations and section keywords.  Lines whose first non-blank character
# is '#' (and which do not look like a C preprocessor directive) are comments.

# decompose(src, compat = &PL_sv_no)
#   Returns a new UTF-8-flagged SV built from the output of
#   pv_utf8_decompose() on the string value of src.  The truth value of
#   compat is forwarded as the helper's last argument (presumably selecting
#   compatibility mappings; the helper is defined outside this section).
#   The scratch buffer is handed over by address, so the result is copied
#   into dst with sv_setpvn() and the scratch buffer is freed afterwards.
SV*
decompose(src, compat = &PL_sv_no)
    SV * src
    SV * compat
  PROTOTYPE: $;$
  PREINIT:
    SV* dst;
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    s = (U8*)sv_2pvunicode(src,&slen);
    dst = newSVpvn("", 0);
    dlen = slen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_decompose(s, slen, &d, dlen, (bool)SvTRUE(compat));
    sv_setpvn(dst, (char *)d, dend - d);
    SvUTF8_on(dst);
    Safefree(d);
    RETVAL = dst;
  OUTPUT:
    RETVAL

# reorder(src)
#   Runs pv_utf8_reorder() over the string value of src, writing straight
#   into the string buffer of a fresh SV grown to slen + UTF8_MAXLEN + 1
#   bytes.  The result is NUL-terminated, its length is set from the end
#   pointer the helper returns, and the SV is flagged as UTF-8.
SV*
reorder(src)
    SV * src
  PROTOTYPE: $
  PREINIT:
    SV* dst;
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    s = (U8*)sv_2pvunicode(src,&slen);
    dst = newSVpvn("", 0);
    dlen = slen + UTF8_MAXLEN;
    d = (U8*)SvGROW(dst,dlen+1);
    SvUTF8_on(dst);
    dend = pv_utf8_reorder(s, slen, d, dlen);
    *dend = '\0';
    SvCUR_set(dst, dend - d);
    RETVAL = dst;
  OUTPUT:
    RETVAL

# compose(src) / composeContiguous(src)
#   Same buffer handling as reorder(), but calls pv_utf8_compose().  The
#   alias index (0 for compose, 1 for composeContiguous) is passed as the
#   helper's final boolean argument.
SV*
compose(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    composeContiguous = 1
  PREINIT:
    SV* dst;
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    s = (U8*)sv_2pvunicode(src,&slen);
    dst = newSVpvn("", 0);
    dlen = slen + UTF8_MAXLEN;
    d = (U8*)SvGROW(dst,dlen+1);
    SvUTF8_on(dst);
    dend = pv_utf8_compose(s, slen, d, dlen, (bool)ix);
    *dend = '\0';
    SvCUR_set(dst, dend - d);
    RETVAL = dst;
  OUTPUT:
    RETVAL

# NFD(src) / NFKD(src)
#   Two passes: pv_utf8_decompose() into a temporary buffer t (alias index
#   forwarded as the boolean flag: 0 for NFD, 1 for NFKD), then
#   pv_utf8_reorder() from t into the returned SV.  t is released before
#   returning.
SV*
NFD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    NFKD = 1
  PREINIT:
    SV *dst;
    U8 *s, *t, *tend, *d, *dend;
    STRLEN slen, tlen, dlen;
  CODE:
    /* decompose */
    s = (U8*)sv_2pvunicode(src,&slen);
    tlen = slen;
    New(0, t, tlen+1, U8);
    tend = pv_utf8_decompose(s, slen, &t, tlen, (bool)ix);
    *tend = '\0';
    tlen = tend - t; /* no longer know real tlen */

    /* reorder */
    dst = newSVpvn("", 0);
    dlen = tlen + UTF8_MAXLEN;
    d = (U8*)SvGROW(dst,dlen+1);
    SvUTF8_on(dst);
    dend = pv_utf8_reorder(t, tlen, d, dlen);
    *dend = '\0';
    SvCUR_set(dst, dend - d);

    /* return */
    Safefree(t);
    RETVAL = dst;
  OUTPUT:
    RETVAL

# NFC(src) / NFKC(src) / FCC(src)
#   Three passes: decompose into t, reorder from t into u, compose from u
#   into the returned SV.  Only NFKC (ix == 1) sets the flag given to
#   pv_utf8_decompose(); only FCC (ix == 2) sets the flag given to
#   pv_utf8_compose().  Both temporaries are released before returning.
SV*
NFC(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    NFKC = 1
    FCC = 2
  PREINIT:
    SV *dst;
    U8 *s, *t, *tend, *u, *uend, *d, *dend;
    STRLEN slen, tlen, ulen, dlen;
  CODE:
    /* decompose */
    s = (U8*)sv_2pvunicode(src,&slen);
    tlen = slen;
    New(0, t, tlen+1, U8);
    tend = pv_utf8_decompose(s, slen, &t, tlen, (bool)(ix==1));
    *tend = '\0';
    tlen = tend - t; /* no longer know real tlen */

    /* reorder */
    ulen = tlen + UTF8_MAXLEN;
    New(0, u, ulen+1, U8);
    uend = pv_utf8_reorder(t, tlen, u, ulen);
    *uend = '\0';
    ulen = uend - u;

    /* compose */
    dst = newSVpvn("", 0);
    dlen = ulen + UTF8_MAXLEN;
    d = (U8*)SvGROW(dst,dlen+1);
    SvUTF8_on(dst);
    dend = pv_utf8_compose(u, ulen, d, dlen, (bool)(ix==2));
    *dend = '\0';
    SvCUR_set(dst, dend - d);

    /* return */
    Safefree(t);
    Safefree(u);
    RETVAL = dst;
  OUTPUT:
    RETVAL

# checkNFD(src) / checkNFKD(src)
#   Scans src one code point at a time and returns a boolean SV.  The answer
#   is false as soon as a nonzero combining class is lower than the class
#   of the preceding code point, or a code point is a Hangul syllable or
#   has a mapping from dec_canonical() (checkNFD) / dec_compat() (checkNFKD).
#   Croaks when the decoder reports a zero-length character.
SV*
checkNFD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKD = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
  CODE:
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    preCC = 0;
    for (p = s; p < e; p += retlen) {
        UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero, "checkNFD or -NFKD");

        curCC = getCombinClass(uv);
        if (preCC > curCC && curCC != 0) { /* canonical ordering violated */
            result = FALSE;
            break;
        }
        if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) {
            result = FALSE;
            break;
        }
        preCC = curCC;
    }
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL

# checkNFC(src) / checkNFKC(src)
#   Three-valued quick check: returns false (NO), undef (MAYBE) or true (YES).
#   NO is decided inside the loop and wins over MAYBE; MAYBE is recorded when
#   isComp2nd() holds for some code point and reported only when the scan
#   finishes without a NO.  For checkNFKC a code point is also NO when it has
#   a compatibility mapping that differs from (or exists without) its
#   canonical mapping.
SV*
checkNFC(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKC = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
    bool isMAYBE = FALSE;
  CODE:
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    preCC = 0;
    for (p = s; p < e; p += retlen) {
        UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero, "checkNFC or -NFKC");

        curCC = getCombinClass(uv);
        if (preCC > curCC && curCC != 0) { /* canonical ordering violated */
            result = FALSE;
            break;
        }

        /* get NFC/NFKC property */
        if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */
            ; /* YES */
        else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
            result = FALSE;
            break;
        }
        else if (isComp2nd(uv))
            isMAYBE = TRUE;
        else if (ix) {
            char *canon, *compat;
            /* NFKC_NO when having compatibility mapping. */
            canon  = (char *) dec_canonical(uv);
            compat = (char *) dec_compat(uv);
            if (compat && !(canon && strEQ(canon, compat))) {
                result = FALSE;
                break;
            }
        } /* end of get NFC/NFKC property */

        preCC = curCC;
    }
    if (isMAYBE && result) /* NO precedes MAYBE */
        XSRETURN_UNDEF;
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL

# checkFCD(src) / checkFCC(src)
#   For every code point, the combining class of the FIRST code point of its
#   canonical mapping (or of the code point itself when dec_canonical()
#   returns NULL) is compared with the class carried over from the previous
#   iteration; that carried class is taken from the LAST code point of the
#   previous mapping.  checkFCC (ix != 0) additionally applies the
#   exclusion / singleton / non-starter-decomposition tests (NO) and the
#   isComp2nd() test (MAYBE, returned as undef).
SV*
checkFCD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkFCC = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
    bool isMAYBE = FALSE;
  CODE:
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;
    preCC = 0;
    for (p = s; p < e; p += retlen) {
        U8 *sCan;
        UV uvLead;
        STRLEN canlen = 0;
        UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero, "checkFCD or -FCC");

        sCan = (U8*) dec_canonical(uv);

        if (sCan) {
            STRLEN canret;
            canlen = (STRLEN)strlen((char *) sCan);
            uvLead = utf8n_to_uvuni(sCan, canlen, &canret, AllowAnyUTF);
            if (!canret)
                croak(ErrRetlenIsZero, "checkFCD or -FCC");
        }
        else {
            uvLead = uv;
        }

        curCC = getCombinClass(uvLead);

        if (curCC != 0 && curCC < preCC) { /* canonical ordering violated */
            result = FALSE;
            break;
        }

        if (ix) {
            if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
                result = FALSE;
                break;
            }
            else if (isComp2nd(uv))
                isMAYBE = TRUE;
        }

        if (sCan) {
            STRLEN canret;
            UV uvTrail;
            U8* eCan = sCan + canlen;
            /* step back one character from the end of the mapping */
            U8* pCan = utf8_hop(eCan, -1);
            if (pCan < sCan)
                croak(ErrHopBeforeStart);
            uvTrail = utf8n_to_uvuni(pCan, eCan - pCan, &canret, AllowAnyUTF);
            if (!canret)
                croak(ErrRetlenIsZero, "checkFCD or -FCC");
            preCC = getCombinClass(uvTrail);
        }
        else {
            preCC = curCC;
        }
    }
    if (isMAYBE && result) /* NO precedes MAYBE */
        XSRETURN_UNDEF;
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL

# The next five XSUBs have no CODE section: xsubpp generates a direct call
# to the C function of the same name and converts its return value.

U8
getCombinClass(uv)
    UV uv
  PROTOTYPE: $

bool
isExclusion(uv)
    UV uv
  PROTOTYPE: $

bool
isSingleton(uv)
    UV uv
  PROTOTYPE: $

bool
isNonStDecomp(uv)
    UV uv
  PROTOTYPE: $

# isNFC_MAYBE and isNFKC_MAYBE are additional Perl-visible names for
# isComp2nd.
bool
isComp2nd(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_MAYBE = 1
    isNFKC_MAYBE = 2

# isNFD_NO(uv) / isNFKD_NO(uv)
#   True when uv is a Hangul syllable or has a mapping from dec_canonical()
#   (isNFD_NO) / dec_compat() (isNFKD_NO); the same per-code-point test that
#   checkNFD / checkNFKD apply.
SV*
isNFD_NO(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFKD_NO = 1
  PREINIT:
    bool result = FALSE;
  CODE:
    if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
        result = TRUE; /* NFD_NO or NFKD_NO */
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL

# isComp_Ex(uv) / isNFC_NO(uv) / isNFKC_NO(uv)
#   True when uv is an exclusion, a singleton or a non-starter decomposition.
#   Under the isNFKC_NO name (ix == 1) it is also true when uv has a
#   compatibility mapping that is absent from, or different from, its
#   canonical mapping.  isNFC_NO shares alias index 0 with isComp_Ex.
SV*
isComp_Ex(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_NO = 0
    isNFKC_NO = 1
  PREINIT:
    bool result = FALSE;
  CODE:
    if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
        result = TRUE; /* NFC_NO or NFKC_NO */
    else if (ix) {
        char *canon, *compat;
        canon  = (char *) dec_canonical(uv);
        compat = (char *) dec_compat(uv);
        if (compat && (!canon || strNE(canon, compat)))
            result = TRUE; /* NFC_NO or NFKC_NO */
    }
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL

# getComposite(uv, uv2)
#   Returns composite_uv(uv, uv2) as an unsigned integer SV, or undef when
#   the helper returns 0.
SV*
getComposite(uv, uv2)
    UV uv
    UV uv2
  PROTOTYPE: $$
  PREINIT:
    UV composite;
  CODE:
    composite = composite_uv(uv, uv2);
    RETVAL = composite ? newSVuv(composite) : &PL_sv_undef;
  OUTPUT:
    RETVAL

# getCanon(uv) / getCompat(uv)
#   Returns the mapping of uv as a UTF-8-flagged string.  Hangul syllables
#   are expanded by pv_cat_decompHangul() into a stack buffer sized for
#   three characters; every other code point is looked up with
#   dec_canonical() (getCanon) or dec_compat() (getCompat), and undef is
#   returned when the lookup yields NULL.
SV*
getCanon(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    getCompat = 1
  CODE:
    if (Hangul_IsS(uv)) {
        U8 tmp[3 * UTF8_MAXLEN + 1];
        U8 *t = tmp;
        U8 *e = pv_cat_decompHangul(t, uv);
        RETVAL = newSVpvn((char *)t, e - t);
    } else {
        U8* rstr = ix ? dec_compat(uv) : dec_canonical(uv);

        if (!rstr)
            XSRETURN_UNDEF;
        RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
    }
    SvUTF8_on(RETVAL);
  OUTPUT:
    RETVAL

# splitOnLastStarter(src)
#   Walks backwards from the end of src one character at a time until a code
#   point with combining class 0 is found (or the start is reached), then
#   pushes two UTF-8-flagged mortal strings onto the Perl stack: the bytes
#   before that position and the bytes from that position to the end.
void
splitOnLastStarter(src)
    SV * src
  PREINIT:
    SV *svp;
    STRLEN srclen;
    U8 *s, *e, *p;
  PPCODE:
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;
    p = e;
    while (s < p) {
        UV uv;
        p = utf8_hop(p, -1);
        if (p < s)
            croak(ErrHopBeforeStart);
        uv = utf8n_to_uvuni(p, e - p, NULL, AllowAnyUTF);
        if (getCombinClass(uv) == 0) /* Last Starter found */
            break;
    }

    svp = sv_2mortal(newSVpvn((char*)s, p - s));
    SvUTF8_on(svp);
    XPUSHs(svp);

    svp = sv_2mortal(newSVpvn((char*)p, e - p));
    SvUTF8_on(svp);
    XPUSHs(svp);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -