⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 utf8decode.t

📁 UNIX下perl实现代码
💻 T
字号:
#!./perl

# utf8decode.t - UTF-8 decoding stress test driven by a data table.
#
# Every data row of the table below is fed through unpack('U*', ...).
# Rows flagged 'y' must decode without emitting any warning; rows
# flagged 'n' must emit at least one warning and, when the row names
# an expected message, the last warning seen must contain that text.
# One "ok"/"not ok" line is printed per data row.

BEGIN {
    chdir 't' if -d 't';
    @INC = '../lib';
}

use strict;
no utf8;

# Deliberately no "use warnings": the __WARN__ handler installed below
# counts *every* warning, so unrelated warnings would skew the results.

print "1..78\n";

my $test = 1;

# This table is based on Markus Kuhn's UTF-8 Decode Stress Tester,
# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt,
# version dated 2000-09-02.
#
# We use the \x notation instead of raw binary bytes for \x00-\x1f\x7f-\xff
# because e.g. many patch programs have issues with binary data.
#
# The heredoc is interpolating, so each \xHH escape becomes the actual
# byte before the table is split into rows.
#
# Row layout (whitespace separated):
#   id  y|n  "bytes"  code-point-hex|-  byte-length  hex:dump  [chars|-  [expected warning text]]
# Section headings ("3.1<TAB>Title") and lines starting with '#' carry no test.

my @MK = split(/\n/, <<__EOMK__);
1	Correct UTF-8
1.1.1 y "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5"	-		11	ce:ba:e1:bd:b9:cf:83:ce:bc:ce:b5	5
2	Boundary conditions
2.1	First possible sequence of certain length
2.1.1 y "\x00"			0		1	00	1
2.1.2 y "\xc2\x80"			80		2	c2:80	1
2.1.3 y "\xe0\xa0\x80"		800		3	e0:a0:80	1
2.1.4 y "\xf0\x90\x80\x80"		10000		4	f0:90:80:80	1
2.1.5 y "\xf8\x88\x80\x80\x80"	200000		5	f8:88:80:80:80	1
2.1.6 y "\xfc\x84\x80\x80\x80\x80"	4000000		6	fc:84:80:80:80:80	1
2.2	Last possible sequence of certain length
2.2.1 y "\x7f"			7f		1	7f	1
2.2.2 y "\xdf\xbf"			7ff		2	df:bf	1
# The ffff is illegal unless UTF8_ALLOW_FFFF
2.2.3 n "\xef\xbf\xbf"			ffff		3	ef:bf:bf	1	character 0xffff
2.2.4 y "\xf7\xbf\xbf\xbf"			1fffff		4	f7:bf:bf:bf	1
2.2.5 y "\xfb\xbf\xbf\xbf\xbf"			3ffffff		5	fb:bf:bf:bf:bf	1
2.2.6 y "\xfd\xbf\xbf\xbf\xbf\xbf"		7fffffff	6	fd:bf:bf:bf:bf:bf	1
2.3	Other boundary conditions
2.3.1 y "\xed\x9f\xbf"		d7ff		3	ed:9f:bf	1
2.3.2 y "\xee\x80\x80"		e000		3	ee:80:80	1
2.3.3 y "\xef\xbf\xbd"			fffd		3	ef:bf:bd	1
2.3.4 y "\xf4\x8f\xbf\xbf"		10ffff		4	f4:8f:bf:bf	1
2.3.5 y "\xf4\x90\x80\x80"		110000		4	f4:90:80:80	1
3	Malformed sequences
3.1	Unexpected continuation bytes
3.1.1 n "\x80"			-		1	80	-	unexpected continuation byte 0x80
3.1.2 n "\xbf"			-		1	bf	-	unexpected continuation byte 0xbf
3.1.3 n "\x80\xbf"			-		2	80:bf	-	unexpected continuation byte 0x80
3.1.4 n "\x80\xbf\x80"		-		3	80:bf:80	-	unexpected continuation byte 0x80
3.1.5 n "\x80\xbf\x80\xbf"		-		4	80:bf:80:bf	-	unexpected continuation byte 0x80
3.1.6 n "\x80\xbf\x80\xbf\x80"	-		5	80:bf:80:bf:80	-	unexpected continuation byte 0x80
3.1.7 n "\x80\xbf\x80\xbf\x80\xbf"	-		6	80:bf:80:bf:80:bf	-	unexpected continuation byte 0x80
3.1.8 n "\x80\xbf\x80\xbf\x80\xbf\x80"	-		7	80:bf:80:bf:80:bf:80	-	unexpected continuation byte 0x80
3.1.9 n "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"				-	64	80:81:82:83:84:85:86:87:88:89:8a:8b:8c:8d:8e:8f:90:91:92:93:94:95:96:97:98:99:9a:9b:9c:9d:9e:9f:a0:a1:a2:a3:a4:a5:a6:a7:a8:a9:aa:ab:ac:ad:ae:af:b0:b1:b2:b3:b4:b5:b6:b7:b8:b9:ba:bb:bc:bd:be:bf	-	unexpected continuation byte 0x80
3.2	Lonely start characters
3.2.1 n "\xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf "	-	64 	c0:20:c1:20:c2:20:c3:20:c4:20:c5:20:c6:20:c7:20:c8:20:c9:20:ca:20:cb:20:cc:20:cd:20:ce:20:cf:20:d0:20:d1:20:d2:20:d3:20:d4:20:d5:20:d6:20:d7:20:d8:20:d9:20:da:20:db:20:dc:20:dd:20:de:20:df:20	-	unexpected non-continuation byte 0x20 after start byte 0xc0
3.2.2 n "\xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef "	-	32	e0:20:e1:20:e2:20:e3:20:e4:20:e5:20:e6:20:e7:20:e8:20:e9:20:ea:20:eb:20:ec:20:ed:20:ee:20:ef:20	-	unexpected non-continuation byte 0x20 after start byte 0xe0
3.2.3 n "\xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 "	-	16	f0:20:f1:20:f2:20:f3:20:f4:20:f5:20:f6:20:f7:20	-	unexpected non-continuation byte 0x20 after start byte 0xf0
3.2.4 n "\xf8 \xf9 \xfa \xfb "		-	8	f8:20:f9:20:fa:20:fb:20	-	unexpected non-continuation byte 0x20 after start byte 0xf8
3.2.5 n "\xfc \xfd "			-	4	fc:20:fd:20	-	unexpected non-continuation byte 0x20 after start byte 0xfc
3.3	Sequences with last continuation byte missing
3.3.1 n "\xc0"			-	1	c0	-	1 byte, need 2
3.3.2 n "\xe0\x80"			-	2	e0:80	-	2 bytes, need 3
3.3.3 n "\xf0\x80\x80"		-	3	f0:80:80	-	3 bytes, need 4
3.3.4 n "\xf8\x80\x80\x80"		-	4	f8:80:80:80	-	4 bytes, need 5
3.3.5 n "\xfc\x80\x80\x80\x80"	-	5	fc:80:80:80:80	-	5 bytes, need 6
3.3.6 n "\xdf"			-	1	df	-	1 byte, need 2
3.3.7 n "\xef\xbf"			-	2	ef:bf	-	2 bytes, need 3
3.3.8 n "\xf7\xbf\xbf"			-	3	f7:bf:bf	-	3 bytes, need 4
3.3.9 n "\xfb\xbf\xbf\xbf"			-	4	fb:bf:bf:bf	-	4 bytes, need 5
3.3.10 n "\xfd\xbf\xbf\xbf\xbf"		-	5	fd:bf:bf:bf:bf	-	5 bytes, need 6
3.4	Concatenation of incomplete sequences
3.4.1 n "\xc0\xe0\x80\xf0\x80\x80\xf8\x80\x80\x80\xfc\x80\x80\x80\x80\xdf\xef\xbf\xf7\xbf\xbf\xfb\xbf\xbf\xbf\xfd\xbf\xbf\xbf\xbf"	-	30	c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf	-	unexpected non-continuation byte 0xe0 after start byte 0xc0
3.5	Impossible bytes
3.5.1 n "\xfe"			-	1	fe	-	byte 0xfe
3.5.2 n "\xff"			-	1	ff	-	byte 0xff
3.5.3 n "\xfe\xfe\xff\xff"			-	4	fe:fe:ff:ff	-	byte 0xfe
4	Overlong sequences
4.1	Examples of an overlong ASCII character
4.1.1 n "\xc0\xaf"			-	2	c0:af	-	2 bytes, need 1
4.1.2 n "\xe0\x80\xaf"		-	3	e0:80:af	-	3 bytes, need 1
4.1.3 n "\xf0\x80\x80\xaf"		-	4	f0:80:80:af	-	4 bytes, need 1
4.1.4 n "\xf8\x80\x80\x80\xaf"	-	5	f8:80:80:80:af	-	5 bytes, need 1
4.1.5 n "\xfc\x80\x80\x80\x80\xaf"	-	6	fc:80:80:80:80:af	-	6 bytes, need 1
4.2	Maximum overlong sequences
4.2.1 n "\xc1\xbf"			-	2	c1:bf	-	2 bytes, need 1
4.2.2 n "\xe0\x9f\xbf"		-	3	e0:9f:bf	-	3 bytes, need 2
4.2.3 n "\xf0\x8f\xbf\xbf"		-	4	f0:8f:bf:bf	-	4 bytes, need 3
4.2.4 n "\xf8\x87\xbf\xbf\xbf"		-	5	f8:87:bf:bf:bf	-	5 bytes, need 4
4.2.5 n "\xfc\x83\xbf\xbf\xbf\xbf"		-	6	fc:83:bf:bf:bf:bf	-	6 bytes, need 5
4.3	Overlong representation of the NUL character
4.3.1 n "\xc0\x80"			-	2	c0:80	-	2 bytes, need 1
4.3.2 n "\xe0\x80\x80"		-	3	e0:80:80	-	3 bytes, need 1
4.3.3 n "\xf0\x80\x80\x80"		-	4	f0:80:80:80	-	4 bytes, need 1
4.3.4 n "\xf8\x80\x80\x80\x80"	-	5	f8:80:80:80:80	-	5 bytes, need 1
4.3.5 n "\xfc\x80\x80\x80\x80\x80"	-	6	fc:80:80:80:80:80	-	6 bytes, need 1
5	Illegal code positions
5.1	Single UTF-16 surrogates
5.1.1 n "\xed\xa0\x80"		-	3	ed:a0:80	-	UTF-16 surrogate 0xd800
5.1.2 n "\xed\xad\xbf"			-	3	ed:ad:bf	-	UTF-16 surrogate 0xdb7f
5.1.3 n "\xed\xae\x80"		-	3	ed:ae:80	-	UTF-16 surrogate 0xdb80
5.1.4 n "\xed\xaf\xbf"			-	3	ed:af:bf	-	UTF-16 surrogate 0xdbff
5.1.5 n "\xed\xb0\x80"		-	3	ed:b0:80	-	UTF-16 surrogate 0xdc00
5.1.6 n "\xed\xbe\x80"		-	3	ed:be:80	-	UTF-16 surrogate 0xdf80
5.1.7 n "\xed\xbf\xbf"			-	3	ed:bf:bf	-	UTF-16 surrogate 0xdfff
5.2	Paired UTF-16 surrogates
5.2.1 n "\xed\xa0\x80\xed\xb0\x80"		-	6	ed:a0:80:ed:b0:80	-	UTF-16 surrogate 0xd800
5.2.2 n "\xed\xa0\x80\xed\xbf\xbf"		-	6	ed:a0:80:ed:bf:bf	-	UTF-16 surrogate 0xd800
5.2.3 n "\xed\xad\xbf\xed\xb0\x80"		-	6	ed:ad:bf:ed:b0:80	-	UTF-16 surrogate 0xdb7f
5.2.4 n "\xed\xad\xbf\xed\xbf\xbf"		-	6	ed:ad:bf:ed:bf:bf	-	UTF-16 surrogate 0xdb7f
5.2.5 n "\xed\xae\x80\xed\xb0\x80"		-	6	ed:ae:80:ed:b0:80	-	UTF-16 surrogate 0xdb80
5.2.6 n "\xed\xae\x80\xed\xbf\xbf"		-	6	ed:ae:80:ed:bf:bf	-	UTF-16 surrogate 0xdb80
5.2.7 n "\xed\xaf\xbf\xed\xb0\x80"		-	6	ed:af:bf:ed:b0:80	-	UTF-16 surrogate 0xdbff
5.2.8 n "\xed\xaf\xbf\xed\xbf\xbf"		-	6	ed:af:bf:ed:bf:bf	-	UTF-16 surrogate 0xdbff
5.3	Other illegal code positions
5.3.1 n "\xef\xbf\xbe"			-	3	ef:bf:be	-	byte order mark 0xfffe
# The ffff is illegal unless UTF8_ALLOW_FFFF
5.3.2 n "\xef\xbf\xbf"			-	3	ef:bf:bf	-	character 0xffff
__EOMK__

# Tests 1..78: one per data row of the table above.
{
    my $WARNCNT;    # warnings seen since the last test_unpack_U() call
    my $WARNMSG;    # text of the most recent warning
    my $id;         # id of the row being processed, used as output prefix

    # Record every warning and echo it as a TAP comment.
    local $SIG{__WARN__} = sub {
        print "# $id: @_";
        $WARNCNT++;
        $WARNMSG = "@_";
    };

    # Print a message prefixed with the id of the current row.
    sub moan {
        print "$id: @_";
    }

    # Reset the warning bookkeeping, then decode the given byte string
    # with unpack('U*'); only the warnings it triggers are of interest.
    sub test_unpack_U {
        $WARNCNT = 0;
        $WARNMSG = "";
        unpack('U*', $_[0]);
    }

    for (@MK) {
        if (/^(?:\d+(?:\.\d+)?)\s/ || /^#/) {
            # Section heading or comment row: nothing to test.
            # print "# $_\n";
        }
        elsif (/^(\d+\.\d+\.\d+[bu]?)\s+([yn])\s+"(.+)"\s+([0-9a-f]{1,8}|-)\s+(\d+)\s+([0-9a-f]{2}(?::[0-9a-f]{2})*)(?:\s+(\d+|-)(?:\s+(.+))?)?$/) {
            $id = $1;
            # $Unicode and $charslen are parsed for completeness but are
            # not checked by this script.
            my ($okay, $bytes, $Unicode, $byteslen, $hex, $charslen, $error) =
                ($2, $3, $4, $5, $6, $7, $8);

            # Sanity-check the row itself: the hex dump and the literal
            # byte string must both agree with the declared byte length.
            my @hex = split(/:/, $hex);
            unless (@hex == $byteslen) {
                my $nhex = @hex;
                moan "amount of hex ($nhex) not equal to byteslen ($byteslen)\n";
            }
            {
                use bytes;
                my $bytesbyteslen = length($bytes);
                unless ($bytesbyteslen == $byteslen) {
                    moan "bytes length() ($bytesbyteslen) not equal to $byteslen\n";
                }
            }

            if ($okay eq 'y') {
                # Well-formed input must decode silently.
                test_unpack_U($bytes);
                if ($WARNCNT) {
                    moan "unpack('U*') false negative\n";
                    print "not ";
                }
            }
            elsif ($okay eq 'n') {
                # Malformed input must warn; if the row names a message,
                # the last warning must contain it (matched literally).
                test_unpack_U($bytes);
                if ($WARNCNT == 0
                    || (defined $error
                        && $error ne ''
                        && $WARNMSG !~ /\Q$error\E/)) {
                    moan "unpack('U*') false positive\n";
                    print "not ";
                }
            }
            print "ok $test\n";
            $test++;
        }
        else {
            moan "unknown format\n";
        }
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -