t/encoding.t - metacpan.org


            
              1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
              #!perl -w
use strict;
use Test qw(plan ok);
plan tests => 59;
use Unicode::String qw(latin1 ucs4 utf32le utf16 utf16le utf8 utf7);
#use Devel::Dump;
$SIG{__WARN__} = sub { print "$_[0]"; };
my $u = latin1("abcæøå");
#Dump($u);
#---- Test Latin1 encoding ----
ok($u->latin1, "abcæøå");
ok($u->length, 6);
ok($u->ucs4, "\0\0\0a\0\0\0b\0\0\0c\0\0\0æ\0\0\0ø\0\0\0å");
ok($u->utf32, "\0\0\0a\0\0\0b\0\0\0c\0\0\0æ\0\0\0ø\0\0\0å");
ok($u->utf32be, "\0\0\0a\0\0\0b\0\0\0c\0\0\0æ\0\0\0ø\0\0\0å");
ok($u->utf32le, "a\0\0\0b\0\0\0c\0\0\0æ\0\0\0ø\0\0\0å\0\0\0");
ok($u->utf16, "\0a\0b\0c\0æ\0ø\0å");
ok($u->utf16be, "\0a\0b\0c\0æ\0ø\0å");
ok($u->ucs2, "\0a\0b\0c\0æ\0ø\0å");
ok($u->utf16le, "a\0b\0c\0æ\0ø\0å\0");
ok($u->utf8, "abcÃ¦Ã¸Ã¥");
ok($u->utf7, "abc+AOYA+ADl-");
ok($u->hex, "U+0061 U+0062 U+0063 U+00e6 U+00f8 U+00e5");
$u = latin1("abc");
$a = $u->latin1("def");
$b = $u->latin1;
$u->latin1("ghi");
ok($a, "abc");
ok($b, "def");
ok($u->latin1, "ghi");
$u = utf16("aa\0bcc\0d");
print "Expect 2 lines of warnings...\n";
my $x = $u->latin1;
ok($x, "bd");
#---- Test UCS4 encoding ----
$x = "\0\0\0a\0\0bb\0\3cc\0\1\0\2\0\0\0\0";
$u = ucs4($x);
ok($u->length, 7);
ok($u->hex, "U+0061 U+6262 U+d898 U+df63 U+d800 U+dc02 U+0000");
ok($u->ucs4, $x);
$a = $u->ucs4("");
ok($a, $x);
ok($u->length, 0);
$u = utf32le("a\0\0\0" . "bb\0\0" . "cc\3\0" . "\2\0\1\0" . "\0\0\0\0");
ok($u->length, 7);
ok($u->hex, "U+0061 U+6262 U+d898 U+df63 U+d800 U+dc02 U+0000");
ok($u->ucs4, $x);
print "Expect 2 lines of warnings...\n";
$u->ucs4("    \0\x10\xff\xff\0\x11\0\0\0\0\0\0");
ok($u->hex, "U+dbff U+dfff U+0000");
#--- Test UTF8 encoding ---
$u = utf8("");
ok($u->length, 0);
ok($u->utf8, "");
$u = utf8("abc");
my $old = $u->utf8("def");
ok($old, "abc");
ok($u->latin1, "def");
$u = utf16("\0a\0å\1\0\7a\0aa");
$x = unpack("H*", $u->utf8);
ok($x, "61c3a5c480dda161e68480");
my $u2 = utf8($u->utf8);
ok($u->utf16, $u2->utf16);
# Test surrogates and utf8
print "Surrogates...\n";
$u = ucs4("\0\1\0\0\0\x10\xFF\xFF");
$x = unpack("H*", $u->utf8);
ok($x, "f0908080f48fbfbf");
$u->utf8(pack("H*", $x));
ok($u->ucs4, "\0\1\0\0\0\x10\xFF\xFF");
print "Expect a warning with this incomplete surrogate pair...\n";
$u = utf16("\xd8\x00");
$u2 = utf8($u->utf8);
ok($u2->hex, "U+d800");
print "...and lots of noice from this...\n";
$u = utf8("¤¤a\xf7¤¤¤b\xf8¤¤¤¤c\xfc¤¤¤¤¤d\xfd\xfe\xffef");
print $u->hex, "\n";
ok($u->utf8, "abcdef");
#--- Test UTF7 encoding ---
# Examples from RFC 1642...
#
#      Example. The Unicode sequence "A<NOT IDENTICAL TO><ALPHA>."
#      (hexadecimal 0041,2262,0391,002E) may be encoded as follows:
#
#            A+ImIDkQ.
#
#      Example. The Unicode sequence "Hi Mom <WHITE SMILING FACE>!"
#      (hexadecimal 0048, 0069, 0020, 004D, 006F, 004D, 0020, 263A, 0021)
#      may be encoded as follows:
#
#            Hi Mom +Jjo-!
$u = utf7("A+ImIDkQ.");
ok($u->hex, "U+0041 U+2262 U+0391 U+002e");
my $utf7 = $u->utf7("Hi Mom +Jjo-!");
ok($utf7, qr/^A\+ImIDkQ-?\.$/);
ok($u->hex, "U+0048 U+0069 U+0020 U+004d U+006f U+006d U+0020 U+263a U+0021");
ok($u->utf7 eq "Hi Mom +Jjo-!" || $u->utf7 eq "Hi Mom +JjoAIQ-");
#      Example. The Unicode sequence representing the Han characters for
#      the Japanese word "nihongo" (hexadecimal 65E5,672C,8A9E) may be
#      encoded as follows:
$u = utf7("+ZeVnLIqe-");
ok($u->hex, "U+65e5 U+672c U+8a9e");
ok($u->utf7, "+ZeVnLIqe-");
# Appendix A -- Examples
#
#   Here is a longer example, taken from a document originally in Big5
#   code. It has been condensed for brevity. There are two versions: the
#   first uses optional characters from set O (and thus may not pass
#   through some mail gateways), and the second uses no optional
#   characters.
my $text = <<'EOT';
   Below is the full Chinese text of the Analects (+itaKng-).
   The sources for the text are:
   "The sayings of Confucius," James R. Ware, trans.  +U/BTFw-:
   +ZYeB9FH6ckh5Pg-, 1980.  (Chinese text with English translation)
   +Vttm+E6UfZM-, +W4tRQ066bOg-, +UxdOrA-:  +Ti1XC2b4Xpc-, 1990.
   "The Chinese Classics with a Translation, Critical and
   Exegetical Notes, Prolegomena, and Copius Indexes," James
   Legge, trans., Taipei:  Southern Materials Center Publishing,
   Inc., 1991.  (Chinese text with English translation)
   Big Five and GB versions of the text are being made available
   separately.
   Neither the Big Five nor GB contain all the characters used in
   this text.  Missing characters have been indicated using their
   Unicode/ISO 10646 code points.  "U+-" followed by four
   hexadecimal digits indicates a Unicode/10646 code (e.g.,
   U+-9F08).  There is no good solution to the problem of the small
   size of the Big Five/GB character sets; this represents the
   solution I find personally most satisfactory.
   (omitted...)
   I have tried to minimize this problem by using variant
   characters where they were available and the character
   actually in the text was not.  Only variants listed as such in
   the +XrdxmVtXUXg- were used.
   (omitted...)
   John H. Jenkins
   +TpVPXGBG-
   John_Jenkins@taligent.com
   5 January 1993
EOT
$u = utf7($text);
my $utf = $u->utf7;
unless ($utf eq $text) {
   print $u->length, " $utf\n";
   open(F, ">utf7-$$.orig"); print F $text;
   open(F, ">utf7-$$.enc");  print F $utf;
   close(F);
   system("diff -u0 utf7-$$.orig utf7-$$.enc");
   unlink("utf7-$$.orig", "utf7-$$.enc");
}
ok($utf, $text);
# Test encoding of different encoding byte lengths
for my $len (1 .. 6) {
   $u = Unicode::String->new;
   $u->pack(map {1000 + $_} 1 .. $len);
   $u2 = utf7($u->utf7);
   ok($u->utf16, $u2->utf16);
}
$Unicode::String::UTF7_OPTIONAL_DIRECT_CHARS = 0;
$u = latin1("a=4!æøå");
$utf = $u->utf7;
ok($utf7 !~ /[=!]/);
ok(utf7($utf)->latin1, "a=4!æøå");
#--- Swapped bytes ---
$u = utf16("ÿþa\0b\0c\0");
ok($u->hex, "U+feff U+0061 U+0062 U+0063");
ok($u->latin1, "abc");
$u = utf16("þÿ\0a\0b\0c");
ok($u->hex, "U+feff U+0061 U+0062 U+0063");
ok($u->latin1, "abc");
$u = utf16le("ÿþa\0b\0c\0");
ok($u->hex, "U+feff U+0061 U+0062 U+0063");
ok($u->latin1, "abc");
$u = utf16le("þÿ\0a\0b\0c");
ok($u->hex, "U+feff U+0061 U+0062 U+0063");
ok($u->latin1, "abc");
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)