From Code to Community: Sponsoring The Perl and Raku Conference 2025 Learn more

#! /usr/local/bin/perl -w
# vim: syntax=perl
# vim: tabstop=4
use strict;
use Test;
BEGIN {
plan tests => 5;
}
sub int2utf8;
my $codes = {};
foreach (0 .. 0xcfff
# 0 .. 0x11_000,
# 0x10_000 .. 0x11_000,
# 0x200_000 .. 0x201_000, # :-( # Not supported by Perl 5.6
# 0x4_000_000 .. 0x4_001_000, # :-( # Not supported by Perl 5.6
) {
$codes->{$_} = int2utf8 $_;
}
my $cd_int = Locale::Recode->new (from => 'UTF-8',
to => 'INTERNAL');
ok !$cd_int->getError;
my $cd_rev = Locale::Recode->new (from => 'INTERNAL',
to => 'UTF-8');
ok !$cd_rev->getError;
# Convert into internal representation.
my $result_int = 1;
while (my ($ucs4, $outbuf) = each %$codes) {
my $result = $cd_int->recode ($outbuf);
unless ($result && $outbuf->[0] == $ucs4) {
$result_int = 0;
last;
}
}
ok $result_int;
# Convert from internal representation.
my $result_rev = 1;
if (1) {
# FIXME: This test only succeeds with use bytes in Perl >= 5.8.0.
# However, this will fail with Perl <= Perl 5.6.0. :-(
# FIXME: Is it really fixed now?
while (my ($ucs4, $code) = each %$codes) {
my $outbuf = [ $ucs4 ];
my $result = $cd_rev->recode ($outbuf);
unless ($result && $code eq $outbuf) {
$result_rev = 0;
last;
}
}
}
ok $result_rev;
# Check handling of unknown characters. This assumes that the
# character set is a subset of US-ASCII.
my $test_string1 = "\xffSupergirl\xff";
$cd_rev = Locale::Recode->new (from => 'ASCII',
to => 'UTF-8',
);
$result_rev = $cd_rev->recode ($test_string1);
ok $result_rev && $test_string1 eq "�Supergirl�";
sub int2utf8
{
my $ucs4 = shift;
if ($ucs4 <= 0x7f) {
return chr $ucs4;
} elsif ($ucs4 <= 0x7ff) {
return pack ("C2",
(0xc0 | (($ucs4 >> 6) & 0x1f)),
(0x80 | ($ucs4 & 0x3f)));
} elsif ($ucs4 <= 0xffff) {
return pack ("C3",
(0xe0 | (($ucs4 >> 12) & 0xf)),
(0x80 | (($ucs4 >> 6) & 0x3f)),
(0x80 | ($ucs4 & 0x3f)));
} elsif ($ucs4 <= 0x1fffff) {
return pack ("C4",
(0xf0 | (($ucs4 >> 18) & 0x7)),
(0x80 | (($ucs4 >> 12) & 0x3f)),
(0x80 | (($ucs4 >> 6) & 0x3f)),
(0x80 | ($ucs4 & 0x3f)));
} elsif ($ucs4 <= 0x3ffffff) {
return pack ("C5",
(0xf0 | (($ucs4 >> 24) & 0x3)),
(0x80 | (($ucs4 >> 18) & 0x3f)),
(0x80 | (($ucs4 >> 12) & 0x3f)),
(0x80 | (($ucs4 >> 6) & 0x3f)),
(0x80 | ($ucs4 & 0x3f)));
} else {
return pack ("C6",
(0xf0 | (($ucs4 >> 30) & 0x3)),
(0x80 | (($ucs4 >> 24) & 0x1)),
(0x80 | (($ucs4 >> 18) & 0x3f)),
(0x80 | (($ucs4 >> 12) & 0x3f)),
(0x80 | (($ucs4 >> 6) & 0x3f)),
(0x80 | ($ucs4 & 0x3f)));
}
}
# Local Variables:
# mode: perl
# perl-indent-level: 4
# perl-continued-statement-offset: 4
# perl-continued-brace-offset: 0
# perl-brace-offset: -4
# perl-brace-imaginary-offset: 0
# perl-label-offset: -4
# cperl-indent-level: 4
# cperl-continued-statement-offset: 2
# tab-width: 4
# End: