NAME
Encode::Bootstring - Encode and decode utf8 into a set of basic code points
VERSION
VERSION 0.01
SYNOPSIS
$BS = new Encode::Bootstring(
BASIC => ["a".."z", "A".."Z", "0".."9"],
TMAX => 53,
SKEW => 78,
INITIAL_BIAS => 32,
TMIN => 38,
DAMP => 40,
DELIMITER => '_',
);
$bootstring = $BS->encode($utf8);
$utf8 = $BS->encode($bootstring);
All parameters are optional. Refer to RFC3492 for details of each parameter. The above parameters are suitable for encoding a variety of alphabets to ascii letters and numbers.
# Constructor # sub new { my $invocant = shift; my $class = ref($invocant) || $invocant; my $self = { @_ }; bless $self, $class; $self->_initialize(); return $self; }
# Initializer # # This load the basic code points table and set constants for encoding # and decoding. # Note: Are these constants reasonable? # sub _initialize { my $self = shift;
# Read parameters from new();
%{$self} = ( %{$self}, @_ );
# BASE is number of basic code points
$self->{BASE} = scalar @{$self->{BASIC}};
# Defaults
$self->{DELIMITER} ||= '-';
$self->{TMIN} ||= 1;
$self->{TMAX} ||= $self->{BASE} - 1;
$self->{INITIAL_N} = $self->{BASE} + 1;
$self->{INITIAL_BIAS} ||= 72;
$self->{SKEW} ||= 38;
$self->{DAMP} ||= 700;
# Render a modification of ascii table
$self->newtable();
}
# Handle errors # sub _croak { require Carp; Carp::croak(@_); }
# Create a variation of the ascii table (or part of it or beyond) # where all basic code points are first. # sub newtable { my $self = shift;
my $n = 0;
# Put basic code points in beginning of table
for ( @{$self->{BASIC}} ) {
$self->{ord}{$_} = $n;
$n++;
$self->{maxord} = ord if not exists $self->{maxord} or $self->{maxord} < ord;
}
# Put skipped chars after basic code points
for ( 0..$self->{maxord} ) {
my $c = chr $_;
unless ( exists $self->{ord}{$c} ) {
$self->{ord}{$c} = $n;
$n++;
} else {
}
}
# Create a reverse map
%{$self->{chr}} = reverse %{$self->{ord}};
}
# Input int output char using modified table # sub nchr { my($self,$c) = @_;
#return $_[0] > $self->{maxord} ? chr($_[0]) : $self->{chr}{$_[0]} ;
return $c > $self->{maxord} ? chr($c) : $self->{chr}{$c} ;
}
# Input char output char using modified table # sub nord { my($self,$c) = @_;
return exists $self->{ord}{$c} ? $self->{ord}{$c} : ord($c) ;
}
# Hex code of ascii/utf8 char # sub hex4 { return sprintf('%04x', ord(shift)); }
# Dump modified table, for testing # sub dumptable { my $self = shift;
for (0..$self->{maxord}) {
printf "%d = %s\n", $_, $self->nchr($_);
}
}
# The bootstring adaption algorithm # sub adapt { my($self,$delta, $numpoints, $firsttime) = @_;
$delta = $firsttime
? $delta / $self->{DAMP}
: $delta / 2;
$delta += $delta / $numpoints;
my $k = 0;
while ( $delta > (($self->{BASE}-$self->{TMIN})*$self->{TMAX})/2 ) {
$delta /= $self->{BASE} - $self->{TMIN};
$k += $self->{BASE};
}
return $k + ( (($self->{BASE}-$self->{TMIN}+1) * $delta)
/ ($delta+$self->{SKEW}) );
}
# Encoding routine # sub encode { my $self = shift; my $input = shift;
if ( exists $self->{DEBUG} ) {
$self->{trace} = "Encoding trace of $input:\n\n";
}
#my @input = split //, $input; # doesn't work in 5.6.x!
my @input = map substr($input, $_, 1), 0..length($input)-1;
my $n = $self->{INITIAL_N};
my $delta = 0;
my $bias = $self->{INITIAL_BIAS};
unless ( exists $self->{BasicRE} ) {
my $BasicRE = join'',@{$self->{BASIC}};
$self->{BasicRE} = qr/[$BasicRE]/;
}
# Trace output
if ( exists $self->{DEBUG} ) {
$self->{trace} .= "bias is $bias\n"
. "input is:\n"
. join(' ', map hex4($_), @input) . "\n";
}
my @output;
my @tmpout;
#my @basic = grep /$BasicRE/, @input;
my @basic = grep /$self->{BasicRE}/, @input;
my $h = my $b = @basic;
push @output, @basic, $self->{DELIMITER} if $b > 0;
if ( exists $self->{DEBUG} ) {
if ( @basic ) {
$self->{trace} .= 'basic code points ('
. join(', ', map hex4($_), @basic)
. ') are copied to literal portion: "'
. join('', @output)
. '"' . "\n";
} else {
$self->{trace} .= "there are no basic code points, so no literal portion\n";
}
}
my @ninput = map $self->nord($_), @input;
while ($h < @input) {
my $m = min(grep { $_ >= $n } @ninput);
if ( exists $self->{DEBUG} ) {
$self->{trace} .= sprintf "next code point to insert is %04x\n", $m;
}
$delta += ($m - $n) * ($h + 1);
$n = $m;
for my $c (@ninput) {
#my $c = $i;
$delta++ if $c < $n;
if ($c == $n) {
my $q = $delta;
LOOP:
for (my $k = $self->{BASE}; 1; $k += $self->{BASE}) {
my $t = ($k <= $bias) ? $self->{TMIN} :
($k >= $bias + $self->{TMAX}) ? $self->{TMAX} : $k - $bias;
last LOOP if $q < $t;
my $cp = $self->nchr($t + (($q - $t) % ($self->{BASE} - $t)));
push @tmpout, $cp;
$q = ($q - $t) / ($self->{BASE} - $t);
}
push @tmpout, $self->nchr($q);
$bias = $self->adapt($delta, $h + 1, $h == $b);
$delta = 0;
$h++;
}
}
if ( exists $self->{DEBUG} ) {
$self->{trace} .= "needed delta is $delta, encodes as " . '"'
. join('',@tmpout) . '"' . "\n"
. "bias becomes $bias\n";
}
push @output, @tmpout;
@tmpout = ();
$delta++;
$n++;
}
if ( exists $self->{DEBUG} ) {
$self->{trace} .= 'output is "' . join('', @output) . '"' . "\n";
}
return join '', @output;
}
# Find minimum value in list # sub min { my $min = shift; for (@_) { $min = $_ if $_ <= $min } return $min; }
# Bootstring decoding routing # sub decode{ my $self = shift; my $code = shift;
if ( exists $self->{DEBUG} ) {
$self->{trace} = "Decoding trace of $code:\n\n";
}
my $n = $self->{INITIAL_N};
my $i = 0;
my $bias = $self->{INITIAL_BIAS};
#my $BasicRE = join'',@{$self->{BASIC}};
#$BasicRE = qr/[$BasicRE]/;
#$BasicRE = qr/[join'',@{$self->{BASIC}}]/;
my @output;
if ( exists $self->{DEBUG} ) {
$self->{trace} .= "n is $n, i is $i, bias = $bias\n"
. 'input is "' . $code . '"' . "\n";
}
if ($code =~ s/(.*)$self->{DELIMITER}//o) {
push @output, map $self->nord($_), split //, $1;
if ( exists $self->{DEBUG} ) {
$self->{trace} .= 'literal portion is "' . $1 . $self->{DELIMITER}
. '", so extended string starts as:' . "\n"
. join(' ', map hex4($self->nchr($_)), @output) . "\n";
}
my $bas = join('',@{$self->{BASIC}});
for ( split //, $1 ) {
return _croak('non-basic code point' ) unless $bas =~ /$_/o;
}
} else {
if ( exists $self->{DEBUG} ) {
$self->{trace} .=
"there is no delimiter, so extended string starts empty\n";
}
}
while ($code) {
my $oldi = $i;
my $w = 1;
if ( exists $self->{DEBUG} ) {
$self->{trace} .= 'delta "';
}
LOOP:
for (my $k = $self->{BASE}; 1; $k += $self->{BASE}) {
my $cp = substr($code, 0, 1, '');
my $digit = $self->nord($cp);
if ( exists $self->{DEBUG} ) {
$self->{trace} .= $cp;
}
defined $digit or return _croak("invalid punycode input");
$i += $digit * $w;
my $t = ($k <= $bias)
? $self->{TMIN}
: ($k >= $bias + $self->{TMAX})
? $self->{TMAX}
: $k - $bias;
last LOOP if $digit < $t;
$w *= ($self->{BASE} - $t);
}
if ( exists $self->{DEBUG} ) {
$self->{trace} .= '" decodes to ' . "$i\n";
}
$bias = $self->adapt($i - $oldi, @output + 1, $oldi == 0);
if ( exists $self->{DEBUG} ) {
$self->{trace} .= "bias becomes $bias\n";
}
$n += $i / (@output + 1);
$i = $i % (@output + 1);
splice(@output, $i, 0, $n);
if ( exists $self->{DEBUG} ) {
$self->{trace} .= join(' ', map hex4($self->nchr($_)), @output) . "\n";
}
$i++;
}
my $res = pack("C*", map ord $self->nchr($_), @output);
return $res;
}
AUTHOR
Soren Dossing, <netcom at sauber.net>
BUGS Please report any bugs or feature requests to bug-encode-bootstring at rt.cpan.org
, or through the web interface at "/rt.cpan.org/NoAuth/ReportBug.html?Queue=Encode-Boo tstring" in http:. I will be notified, and then you'll automatically be notified of progress on your bug as I make changes.
SUPPORT
You can find documentation for this module with the perldoc command.
perldoc Encode::Bootstring
You can also look for information at:
RT: CPAN's request tracker
AnnoCPAN: Annotated CPAN documentation
CPAN Ratings
Search CPAN
ACKNOWLEDGEMENTS
Adam M. Costello for punycode reference implementation, and for advice and review of this more generic module.
COPYRIGHT & LICENSE
Copyright 2009 Soren Dossing.
This program is free software; you can redistribute it and/or modify it under the terms of either: the GNU General Public License as published by the Free Software Foundation; or the Artistic License.
See http://dev.perl.org/licenses/ for more information.