—#! /bin/false
# vim: set autoindent shiftwidth=4 tabstop=4:
# Conversion routines for UTF-8 (perl < 5.8.0).
# Copyright (C) 2002-2017 Guido Flohr <guido.flohr@cantanea.com>,
# all rights reserved.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
package
Locale::RecodeData::UTF_8;
use
strict;
require
Locale::RecodeData;
sub
_recode
{
if
(
$_
[0]->{_from} eq
'INTERNAL'
) {
return
$_
[0]->_fromInternal (
$_
[1]);
}
else
{
return
$_
[0]->_toInternal (
$_
[1]);
}
}
# This routine assumes that the internal representation is always sane
# and contains valid codes only.
sub
_fromInternal
{
$_
[1] =
join
''
,
map
{
if
(
$_
<= 0x7f) {
chr
$_
;
}
elsif
(
$_
<= 0x7ff) {
pack
(
"C2"
,
(0xc0 | ((
$_
>> 6) & 0x1f)),
(0x80 | (
$_
& 0x3f)));
}
elsif
(
$_
<= 0xffff) {
pack
(
"C3"
,
(0xe0 | ((
$_
>> 12) & 0xf)),
(0x80 | ((
$_
>> 6) & 0x3f)),
(0x80 | (
$_
& 0x3f)));
}
elsif
(
$_
<= 0x1fffff) {
pack
(
"C4"
,
(0xf0 | ((
$_
>> 18) & 0x7)),
(0x80 | ((
$_
>> 12) & 0x3f)),
(0x80 | ((
$_
>> 6) & 0x3f)),
(0x80 | (
$_
& 0x3f)));
}
elsif
(
$_
<= 0x3ffffff) {
pack
(
"C5"
,
(0xf0 | ((
$_
>> 24) & 0x3)),
(0x80 | ((
$_
>> 18) & 0x3f)),
(0x80 | ((
$_
>> 12) & 0x3f)),
(0x80 | ((
$_
>> 6) & 0x3f)),
(0x80 | (
$_
& 0x3f)));
}
else
{
pack
(
"C6"
,
(0xf0 | ((
$_
>> 30) & 0x3)),
(0x80 | ((
$_
>> 24) & 0x1)),
(0x80 | ((
$_
>> 18) & 0x3f)),
(0x80 | ((
$_
>> 12) & 0x3f)),
(0x80 | ((
$_
>> 6) & 0x3f)),
(0x80 | (
$_
& 0x3f)));
}
} @{
$_
[1]};
return
1;
}
# Decode UTF-8 into integers. We do not bother to care about possibly
# configured replacement characters here and simply fall back to 0xfffd.
# Rationale: the internal format is never output directly and the other
# encoders will handle the replacement character correctly.
sub
_toInternal
{
if
($] >= 5.006) {
$_
[1] = [
unpack
"U*"
,
$_
[1] ];
return
1;
}
# Sigh, we have to decode ourselves. FIXME: Should be optimized.
# The routine is awfully slow.
# It also does not necessarily detect illegal multi-byte sequences.
my
@chars
= ();
my
@bytes
=
unpack
"C*"
,
$_
[1];
BYTE:
while
(
@bytes
) {
my
$byte
=
shift
@bytes
;
if
(
$byte
< 0x80) {
push
@chars
,
$byte
;
}
elsif
(
$byte
< 0xc0 ||
$byte
> 0xfd) {
push
@chars
, 0xfffd;
}
else
{
my
$num_bytes
;
my
$char
;
if
(
$byte
< 0xe0) {
$char
=
$byte
& 0x1f;
$num_bytes
= 1;
}
elsif
(
$byte
< 0xf0) {
$char
=
$byte
& 0xf;
$num_bytes
= 2;
}
elsif
(
$byte
< 0xf8) {
$char
=
$byte
& 0x7;
$num_bytes
= 3;
}
elsif
(
$byte
< 0xfc) {
$char
=
$byte
& 0x3;
$num_bytes
= 4;
}
else
{
$char
=
$byte
& 0x1;
$num_bytes
= 5;
}
for
(
my
$i
= 0;
$i
<
$num_bytes
; ++
$i
) {
my
$next
=
shift
@bytes
;
if
(!
defined
$next
||
$next
< 0x80 ||
$next
> 0xbf) {
push
@chars
, 0xfffd;
next
BYTE;
}
else
{
$char
<<= 6;
$char
|=
$next
& 0x3f;
}
}
push
@chars
,
$char
;
}
}
$_
[1] = \
@chars
;
return
1;
}
1;
__END__
=head1 NAME
Locale::RecodeData::UTF_8 - Conversion routines for UTF-8
=head1 SYNOPSIS
This module is internal to libintl. Do not use directly!
=head1 DESCRIPTION
This modules contains the conversion tables for UTF-8. It is capable of
converting from UTF-8 to the internal format of libintl-perl and vice
versa. It is only suitable for Perl versions E<lt>= 5.8.0. However,
you do not have to bother about version checking, Locale::Recode(3)
will do that for you.
=head1 CHARACTER TABLE
=head1 AUTHOR
Copyright (C) 2002-2017 L<Guido Flohr|http://www.guido-flohr.net/>
(L<mailto:guido.flohr@cantanea.com>), all rights reserved. See the source
code for details!code for details!
=head1 SEE ALSO
Locale::RecodeData(3), Locale::Recode(3), perl(1)
=cut
Local Variables:
mode: perl
perl-indent-level: 4
perl-continued-statement-offset: 4
perl-continued-brace-offset: 0
perl-brace-offset: -4
perl-brace-imaginary-offset: 0
perl-label-offset: -4
cperl-indent-level: 4
cperl-continued-statement-offset: 2
tab-width: 4
End: