From Code to Community: Sponsoring The Perl and Raku Conference 2025 Learn more

#!perl -w
use strict;
use Test qw(plan ok);
plan tests => 59;
use Unicode::String qw(latin1 ucs4 utf32le utf16 utf16le utf8 utf7);
#use Devel::Dump;
$SIG{__WARN__} = sub { print "$_[0]"; };
my $u = latin1("abcæøå");
#Dump($u);
#---- Test Latin1 encoding ----
ok($u->latin1, "abcæøå");
ok($u->length, 6);
ok($u->ucs4, "\0\0\0a\0\0\0b\0\0\0c\0\0\0æ\0\0\0ø\0\0\0å");
ok($u->utf32, "\0\0\0a\0\0\0b\0\0\0c\0\0\0æ\0\0\0ø\0\0\0å");
ok($u->utf32be, "\0\0\0a\0\0\0b\0\0\0c\0\0\0æ\0\0\0ø\0\0\0å");
ok($u->utf32le, "a\0\0\0b\0\0\0c\0\0\0æ\0\0\0ø\0\0\0å\0\0\0");
ok($u->utf16, "\0a\0b\0c\0æ\0ø\0å");
ok($u->utf16be, "\0a\0b\0c\0æ\0ø\0å");
ok($u->ucs2, "\0a\0b\0c\0æ\0ø\0å");
ok($u->utf16le, "a\0b\0c\0æ\0ø\0å\0");
ok($u->utf8, "abcæøå");
ok($u->utf7, "abc+AOYA+ADl-");
ok($u->hex, "U+0061 U+0062 U+0063 U+00e6 U+00f8 U+00e5");
$u = latin1("abc");
$a = $u->latin1("def");
$b = $u->latin1;
$u->latin1("ghi");
ok($a, "abc");
ok($b, "def");
ok($u->latin1, "ghi");
$u = utf16("aa\0bcc\0d");
print "Expect 2 lines of warnings...\n";
my $x = $u->latin1;
ok($x, "bd");
#---- Test UCS4 encoding ----
$x = "\0\0\0a\0\0bb\0\3cc\0\1\0\2\0\0\0\0";
$u = ucs4($x);
ok($u->length, 7);
ok($u->hex, "U+0061 U+6262 U+d898 U+df63 U+d800 U+dc02 U+0000");
ok($u->ucs4, $x);
$a = $u->ucs4("");
ok($a, $x);
ok($u->length, 0);
$u = utf32le("a\0\0\0" . "bb\0\0" . "cc\3\0" . "\2\0\1\0" . "\0\0\0\0");
ok($u->length, 7);
ok($u->hex, "U+0061 U+6262 U+d898 U+df63 U+d800 U+dc02 U+0000");
ok($u->ucs4, $x);
print "Expect 2 lines of warnings...\n";
$u->ucs4(" \0\x10\xff\xff\0\x11\0\0\0\0\0\0");
ok($u->hex, "U+dbff U+dfff U+0000");
#--- Test UTF8 encoding ---
$u = utf8("");
ok($u->length, 0);
ok($u->utf8, "");
$u = utf8("abc");
my $old = $u->utf8("def");
ok($old, "abc");
ok($u->latin1, "def");
$u = utf16("\0a\0å\1\0\7a\0aa");
$x = unpack("H*", $u->utf8);
ok($x, "61c3a5c480dda161e68480");
my $u2 = utf8($u->utf8);
ok($u->utf16, $u2->utf16);
# Test surrogates and utf8
print "Surrogates...\n";
$u = ucs4("\0\1\0\0\0\x10\xFF\xFF");
$x = unpack("H*", $u->utf8);
ok($x, "f0908080f48fbfbf");
$u->utf8(pack("H*", $x));
ok($u->ucs4, "\0\1\0\0\0\x10\xFF\xFF");
print "Expect a warning with this incomplete surrogate pair...\n";
$u = utf16("\xd8\x00");
$u2 = utf8($u->utf8);
ok($u2->hex, "U+d800");
print "...and lots of noice from this...\n";
$u = utf8("¤¤a\xf7¤¤¤b\xf8¤¤¤¤c\xfc¤¤¤¤¤d\xfd\xfe\xffef");
print $u->hex, "\n";
ok($u->utf8, "abcdef");
#--- Test UTF7 encoding ---
# Examples from RFC 1642...
#
# Example. The Unicode sequence "A<NOT IDENTICAL TO><ALPHA>."
# (hexadecimal 0041,2262,0391,002E) may be encoded as follows:
#
# A+ImIDkQ.
#
# Example. The Unicode sequence "Hi Mom <WHITE SMILING FACE>!"
# (hexadecimal 0048, 0069, 0020, 004D, 006F, 004D, 0020, 263A, 0021)
# may be encoded as follows:
#
# Hi Mom +Jjo-!
$u = utf7("A+ImIDkQ.");
ok($u->hex, "U+0041 U+2262 U+0391 U+002e");
my $utf7 = $u->utf7("Hi Mom +Jjo-!");
ok($utf7, qr/^A\+ImIDkQ-?\.$/);
ok($u->hex, "U+0048 U+0069 U+0020 U+004d U+006f U+006d U+0020 U+263a U+0021");
ok($u->utf7 eq "Hi Mom +Jjo-!" || $u->utf7 eq "Hi Mom +JjoAIQ-");
# Example. The Unicode sequence representing the Han characters for
# the Japanese word "nihongo" (hexadecimal 65E5,672C,8A9E) may be
# encoded as follows:
$u = utf7("+ZeVnLIqe-");
ok($u->hex, "U+65e5 U+672c U+8a9e");
ok($u->utf7, "+ZeVnLIqe-");
# Appendix A -- Examples
#
# Here is a longer example, taken from a document originally in Big5
# code. It has been condensed for brevity. There are two versions: the
# first uses optional characters from set O (and thus may not pass
# through some mail gateways), and the second uses no optional
# characters.
my $text = <<'EOT';
Below is the full Chinese text of the Analects (+itaKng-).
The sources for the text are:
"The sayings of Confucius," James R. Ware, trans. +U/BTFw-:
+ZYeB9FH6ckh5Pg-, 1980. (Chinese text with English translation)
+Vttm+E6UfZM-, +W4tRQ066bOg-, +UxdOrA-: +Ti1XC2b4Xpc-, 1990.
"The Chinese Classics with a Translation, Critical and
Exegetical Notes, Prolegomena, and Copius Indexes," James
Legge, trans., Taipei: Southern Materials Center Publishing,
Inc., 1991. (Chinese text with English translation)
Big Five and GB versions of the text are being made available
separately.
Neither the Big Five nor GB contain all the characters used in
this text. Missing characters have been indicated using their
Unicode/ISO 10646 code points. "U+-" followed by four
hexadecimal digits indicates a Unicode/10646 code (e.g.,
U+-9F08). There is no good solution to the problem of the small
size of the Big Five/GB character sets; this represents the
solution I find personally most satisfactory.
(omitted...)
I have tried to minimize this problem by using variant
characters where they were available and the character
actually in the text was not. Only variants listed as such in
the +XrdxmVtXUXg- were used.
(omitted...)
John H. Jenkins
+TpVPXGBG-
John_Jenkins@taligent.com
5 January 1993
EOT
$u = utf7($text);
my $utf = $u->utf7;
unless ($utf eq $text) {
print $u->length, " $utf\n";
open(F, ">utf7-$$.orig"); print F $text;
open(F, ">utf7-$$.enc"); print F $utf;
close(F);
system("diff -u0 utf7-$$.orig utf7-$$.enc");
unlink("utf7-$$.orig", "utf7-$$.enc");
}
ok($utf, $text);
# Test encoding of different encoding byte lengths
for my $len (1 .. 6) {
$u = Unicode::String->new;
$u->pack(map {1000 + $_} 1 .. $len);
$u2 = utf7($u->utf7);
ok($u->utf16, $u2->utf16);
}
$Unicode::String::UTF7_OPTIONAL_DIRECT_CHARS = 0;
$u = latin1("a=4!æøå");
$utf = $u->utf7;
ok($utf7 !~ /[=!]/);
ok(utf7($utf)->latin1, "a=4!æøå");
#--- Swapped bytes ---
$u = utf16("ÿþa\0b\0c\0");
ok($u->hex, "U+feff U+0061 U+0062 U+0063");
ok($u->latin1, "abc");
$u = utf16("þÿ\0a\0b\0c");
ok($u->hex, "U+feff U+0061 U+0062 U+0063");
ok($u->latin1, "abc");
$u = utf16le("ÿþa\0b\0c\0");
ok($u->hex, "U+feff U+0061 U+0062 U+0063");
ok($u->latin1, "abc");
$u = utf16le("þÿ\0a\0b\0c");
ok($u->hex, "U+feff U+0061 U+0062 U+0063");
ok($u->latin1, "abc");