#!perl -w
plan
tests
=> 59;
$SIG
{__WARN__} =
sub
{
print
"$_[0]"
; };
my
$u
= latin1(
"abcæøå"
);
ok(
$u
->latin1,
"abcæøå"
);
ok(
$u
->
length
, 6);
ok(
$u
->ucs4,
"\0\0\0a\0\0\0b\0\0\0c\0\0\0æ\0\0\0ø\0\0\0å"
);
ok(
$u
->utf32,
"\0\0\0a\0\0\0b\0\0\0c\0\0\0æ\0\0\0ø\0\0\0å"
);
ok(
$u
->utf32be,
"\0\0\0a\0\0\0b\0\0\0c\0\0\0æ\0\0\0ø\0\0\0å"
);
ok(
$u
->utf32le,
"a\0\0\0b\0\0\0c\0\0\0æ\0\0\0ø\0\0\0å\0\0\0"
);
ok(
$u
->utf16,
"\0a\0b\0c\0æ\0ø\0å"
);
ok(
$u
->utf16be,
"\0a\0b\0c\0æ\0ø\0å"
);
ok(
$u
->ucs2,
"\0a\0b\0c\0æ\0ø\0å"
);
ok(
$u
->utf16le,
"a\0b\0c\0æ\0ø\0å\0"
);
ok(
$u
->utf8,
"abcæøå"
);
ok(
$u
->utf7,
"abc+AOYA+ADl-"
);
ok(
$u
->
hex
,
"U+0061 U+0062 U+0063 U+00e6 U+00f8 U+00e5"
);
$u
= latin1(
"abc"
);
$a
=
$u
->latin1(
"def"
);
$b
=
$u
->latin1;
$u
->latin1(
"ghi"
);
ok(
$a
,
"abc"
);
ok(
$b
,
"def"
);
ok(
$u
->latin1,
"ghi"
);
$u
= utf16(
"aa\0bcc\0d"
);
print
"Expect 2 lines of warnings...\n"
;
my
$x
=
$u
->latin1;
ok(
$x
,
"bd"
);
$x
=
"\0\0\0a\0\0bb\0\3cc\0\1\0\2\0\0\0\0"
;
$u
= ucs4(
$x
);
ok(
$u
->
length
, 7);
ok(
$u
->
hex
,
"U+0061 U+6262 U+d898 U+df63 U+d800 U+dc02 U+0000"
);
ok(
$u
->ucs4,
$x
);
$a
=
$u
->ucs4(
""
);
ok(
$a
,
$x
);
ok(
$u
->
length
, 0);
$u
= utf32le(
"a\0\0\0"
.
"bb\0\0"
.
"cc\3\0"
.
"\2\0\1\0"
.
"\0\0\0\0"
);
ok(
$u
->
length
, 7);
ok(
$u
->
hex
,
"U+0061 U+6262 U+d898 U+df63 U+d800 U+dc02 U+0000"
);
ok(
$u
->ucs4,
$x
);
print
"Expect 2 lines of warnings...\n"
;
$u
->ucs4(
" \0\x10\xff\xff\0\x11\0\0\0\0\0\0"
);
ok(
$u
->
hex
,
"U+dbff U+dfff U+0000"
);
$u
= utf8(
""
);
ok(
$u
->
length
, 0);
ok(
$u
->utf8,
""
);
$u
= utf8(
"abc"
);
my
$old
=
$u
->utf8(
"def"
);
ok(
$old
,
"abc"
);
ok(
$u
->latin1,
"def"
);
$u
= utf16(
"\0a\0å\1\0\7a\0aa"
);
$x
=
unpack
(
"H*"
,
$u
->utf8);
ok(
$x
,
"61c3a5c480dda161e68480"
);
my
$u2
= utf8(
$u
->utf8);
ok(
$u
->utf16,
$u2
->utf16);
print
"Surrogates...\n"
;
$u
= ucs4(
"\0\1\0\0\0\x10\xFF\xFF"
);
$x
=
unpack
(
"H*"
,
$u
->utf8);
ok(
$x
,
"f0908080f48fbfbf"
);
$u
->utf8(
pack
(
"H*"
,
$x
));
ok(
$u
->ucs4,
"\0\1\0\0\0\x10\xFF\xFF"
);
print
"Expect a warning with this incomplete surrogate pair...\n"
;
$u
= utf16(
"\xd8\x00"
);
$u2
= utf8(
$u
->utf8);
ok(
$u2
->
hex
,
"U+d800"
);
print
"...and lots of noice from this...\n"
;
$u
= utf8(
"¤¤a\xf7¤¤¤b\xf8¤¤¤¤c\xfc¤¤¤¤¤d\xfd\xfe\xffef"
);
print
$u
->
hex
,
"\n"
;
ok(
$u
->utf8,
"abcdef"
);
$u
= utf7(
"A+ImIDkQ."
);
ok(
$u
->
hex
,
"U+0041 U+2262 U+0391 U+002e"
);
my
$utf7
=
$u
->utf7(
"Hi Mom +Jjo-!"
);
ok(
$utf7
,
qr/^A\+ImIDkQ-?\.$/
);
ok(
$u
->
hex
,
"U+0048 U+0069 U+0020 U+004d U+006f U+006d U+0020 U+263a U+0021"
);
ok(
$u
->utf7 eq
"Hi Mom +Jjo-!"
||
$u
->utf7 eq
"Hi Mom +JjoAIQ-"
);
$u
= utf7(
"+ZeVnLIqe-"
);
ok(
$u
->
hex
,
"U+65e5 U+672c U+8a9e"
);
ok(
$u
->utf7,
"+ZeVnLIqe-"
);
my
$text
=
<<'EOT';
Below is the full Chinese text of the Analects (+itaKng-).
The sources for the text are:
"The sayings of Confucius," James R. Ware, trans. +U/BTFw-:
+ZYeB9FH6ckh5Pg-, 1980. (Chinese text with English translation)
+Vttm+E6UfZM-, +W4tRQ066bOg-, +UxdOrA-: +Ti1XC2b4Xpc-, 1990.
"The Chinese Classics with a Translation, Critical and
Exegetical Notes, Prolegomena, and Copius Indexes," James
Legge, trans., Taipei: Southern Materials Center Publishing,
Inc., 1991. (Chinese text with English translation)
Big Five and GB versions of the text are being made available
separately.
Neither the Big Five nor GB contain all the characters used in
this text. Missing characters have been indicated using their
Unicode/ISO 10646 code points. "U+-" followed by four
hexadecimal digits indicates a Unicode/10646 code (e.g.,
U+-9F08). There is no good solution to the problem of the small
size of the Big Five/GB character sets; this represents the
solution I find personally most satisfactory.
(omitted...)
I have tried to minimize this problem by using variant
characters where they were available and the character
actually in the text was not. Only variants listed as such in
the +XrdxmVtXUXg- were used.
(omitted...)
John H. Jenkins
+TpVPXGBG-
John_Jenkins@taligent.com
5 January 1993
EOT
$u
= utf7(
$text
);
my
$utf
=
$u
->utf7;
unless
(
$utf
eq
$text
) {
print
$u
->
length
,
" $utf\n"
;
open
(F,
">utf7-$$.orig"
);
print
F
$text
;
open
(F,
">utf7-$$.enc"
);
print
F
$utf
;
close
(F);
system
(
"diff -u0 utf7-$$.orig utf7-$$.enc"
);
unlink
(
"utf7-$$.orig"
,
"utf7-$$.enc"
);
}
ok(
$utf
,
$text
);
for
my
$len
(1 .. 6) {
$u
= Unicode::String->new;
$u
->
pack
(
map
{1000 +
$_
} 1 ..
$len
);
$u2
= utf7(
$u
->utf7);
ok(
$u
->utf16,
$u2
->utf16);
}
$Unicode::String::UTF7_OPTIONAL_DIRECT_CHARS
= 0;
$u
= latin1(
"a=4!æøå"
);
$utf
=
$u
->utf7;
ok(
$utf7
!~ /[=!]/);
ok(utf7(
$utf
)->latin1,
"a=4!æøå"
);
$u
= utf16(
"ÿþa\0b\0c\0"
);
ok(
$u
->
hex
,
"U+feff U+0061 U+0062 U+0063"
);
ok(
$u
->latin1,
"abc"
);
$u
= utf16(
"þÿ\0a\0b\0c"
);
ok(
$u
->
hex
,
"U+feff U+0061 U+0062 U+0063"
);
ok(
$u
->latin1,
"abc"
);
$u
= utf16le(
"ÿþa\0b\0c\0"
);
ok(
$u
->
hex
,
"U+feff U+0061 U+0062 U+0063"
);
ok(
$u
->latin1,
"abc"
);
$u
= utf16le(
"þÿ\0a\0b\0c"
);
ok(
$u
->
hex
,
"U+feff U+0061 U+0062 U+0063"
);
ok(
$u
->latin1,
"abc"
);