NAME

Encode::Bootstring - Encode and decode utf8 into a set of basic code points

VERSION

VERSION 0.01

SYNOPSIS

$BS = new Encode::Bootstring(
    BASIC => ["a".."z", "A".."Z", "0".."9"],
    TMAX => 53,
    SKEW => 78,
    INITIAL_BIAS => 32,
    TMIN => 38,
    DAMP => 40,
    DELIMITER => '_',
);

$bootstring = $BS->encode($utf8);
$utf8 = $BS->encode($bootstring);

All parameters are optional. Refer to RFC3492 for details of each parameter. The above parameters are suitable for encoding a variety of alphabets to ascii letters and numbers.

# Constructor # sub new { my $invocant = shift; my $class = ref($invocant) || $invocant; my $self = { @_ }; bless $self, $class; $self->_initialize(); return $self; }

# Initializer # # This load the basic code points table and set constants for encoding # and decoding. # Note: Are these constants reasonable? # sub _initialize { my $self = shift;

# Read parameters from new();
%{$self} = ( %{$self}, @_ );

# BASE is number of basic code points
$self->{BASE} = scalar @{$self->{BASIC}};

# Defaults
$self->{DELIMITER} ||= '-';
$self->{TMIN} ||= 1;
$self->{TMAX} ||= $self->{BASE} - 1;
$self->{INITIAL_N} = $self->{BASE} + 1;
$self->{INITIAL_BIAS} ||= 72;
$self->{SKEW} ||= 38;
$self->{DAMP} ||= 700;

# Render a modification of ascii table
$self->newtable();
}

# Handle errors # sub _croak { require Carp; Carp::croak(@_); }

# Create a variation of the ascii table (or part of it or beyond) # where all basic code points are first. # sub newtable { my $self = shift;

my $n = 0;

# Put basic code points in beginning of table
for ( @{$self->{BASIC}} ) {
  $self->{ord}{$_} = $n;
  $n++;
  $self->{maxord} = ord if not exists $self->{maxord} or $self->{maxord} < ord;
}

# Put skipped chars after basic code points
for ( 0..$self->{maxord} ) {
  my $c = chr $_;
  unless ( exists $self->{ord}{$c} ) {
    $self->{ord}{$c} = $n;
    $n++;
  } else {
  }
}

# Create a reverse map
%{$self->{chr}} = reverse %{$self->{ord}};
}

# Input int output char using modified table # sub nchr { my($self,$c) = @_;

 #return $_[0] > $self->{maxord} ? chr($_[0]) : $self->{chr}{$_[0]} ;
 return $c > $self->{maxord} ? chr($c) : $self->{chr}{$c} ;
}

# Input char output char using modified table # sub nord { my($self,$c) = @_;

return exists $self->{ord}{$c} ? $self->{ord}{$c} : ord($c) ;
}

# Hex code of ascii/utf8 char # sub hex4 { return sprintf('%04x', ord(shift)); }

# Dump modified table, for testing # sub dumptable { my $self = shift;

for (0..$self->{maxord}) {
  printf "%d = %s\n", $_, $self->nchr($_);
}
}

# The bootstring adaption algorithm # sub adapt { my($self,$delta, $numpoints, $firsttime) = @_;

$delta = $firsttime
       ? $delta / $self->{DAMP}
       : $delta / 2;
$delta += $delta / $numpoints;
my $k = 0;
while ( $delta > (($self->{BASE}-$self->{TMIN})*$self->{TMAX})/2 ) {
  $delta /= $self->{BASE} - $self->{TMIN};
  $k += $self->{BASE};
}
return $k + ( (($self->{BASE}-$self->{TMIN}+1) * $delta)
              / ($delta+$self->{SKEW}) );
}

# Encoding routine # sub encode { my $self = shift; my $input = shift;

if ( exists $self->{DEBUG} ) {
  $self->{trace} = "Encoding trace of $input:\n\n";
}

#my @input = split //, $input; # doesn't work in 5.6.x!
my @input = map substr($input, $_, 1), 0..length($input)-1;

my $n     = $self->{INITIAL_N};
my $delta = 0;
my $bias  = $self->{INITIAL_BIAS};
unless ( exists $self->{BasicRE} ) {
  my $BasicRE = join'',@{$self->{BASIC}};
  $self->{BasicRE} = qr/[$BasicRE]/;
}

# Trace output
if ( exists $self->{DEBUG} ) {
  $self->{trace} .= "bias is $bias\n"
                 .  "input is:\n"
                 .  join(' ', map hex4($_), @input) . "\n";
}

my @output;
my @tmpout;
#my @basic = grep /$BasicRE/, @input;
my @basic = grep /$self->{BasicRE}/, @input;
my $h = my $b = @basic;
push @output, @basic, $self->{DELIMITER} if $b > 0;

if ( exists $self->{DEBUG} ) {
  if ( @basic ) {
    $self->{trace} .= 'basic code points ('
                   .  join(', ', map hex4($_), @basic)
                   .  ') are copied to literal portion: "'
                   .  join('', @output)
                   .  '"' . "\n";
  } else {
    $self->{trace} .= "there are no basic code points, so no literal portion\n";
  }
}

my @ninput = map $self->nord($_), @input;
while ($h < @input) {
  my $m = min(grep { $_ >= $n } @ninput);
  if ( exists $self->{DEBUG} ) {
    $self->{trace} .= sprintf "next code point to insert is %04x\n", $m;
  }
  $delta += ($m - $n) * ($h + 1);
  $n = $m;
  for my $c (@ninput) {
    #my $c = $i;
    $delta++ if $c < $n;
    if ($c == $n) {
      my $q = $delta;
    LOOP:
      for (my $k = $self->{BASE}; 1; $k += $self->{BASE}) {
        my $t = ($k <= $bias) ? $self->{TMIN} :
          ($k >= $bias + $self->{TMAX}) ? $self->{TMAX} : $k - $bias;
        last LOOP if $q < $t;
        my $cp = $self->nchr($t + (($q - $t) % ($self->{BASE} - $t)));
        push @tmpout, $cp;
        $q = ($q - $t) / ($self->{BASE} - $t);
      }
      push @tmpout, $self->nchr($q);
      $bias = $self->adapt($delta, $h + 1, $h == $b);
      $delta = 0;
      $h++;
    }
  }
  if ( exists $self->{DEBUG} ) {
    $self->{trace} .= "needed delta is $delta, encodes as " . '"'
                   .  join('',@tmpout) . '"' . "\n"
                   .  "bias becomes $bias\n";
  }
  push @output, @tmpout;
  @tmpout = ();
  $delta++;
  $n++;
}
if ( exists $self->{DEBUG} ) {
  $self->{trace} .= 'output is "' . join('', @output) . '"' . "\n";
}
return join '', @output;
}

# Find minimum value in list # sub min { my $min = shift; for (@_) { $min = $_ if $_ <= $min } return $min; }

# Bootstring decoding routing # sub decode{ my $self = shift; my $code = shift;

if ( exists $self->{DEBUG} ) {
  $self->{trace} = "Decoding trace of $code:\n\n";
}

my $n      = $self->{INITIAL_N};
my $i      = 0;
my $bias   = $self->{INITIAL_BIAS};
#my $BasicRE = join'',@{$self->{BASIC}};
#$BasicRE = qr/[$BasicRE]/;
#$BasicRE = qr/[join'',@{$self->{BASIC}}]/;

my @output;

if ( exists $self->{DEBUG} ) {
  $self->{trace} .= "n is $n, i is $i, bias = $bias\n"
                 .  'input is "' . $code . '"' . "\n";
}

if ($code =~ s/(.*)$self->{DELIMITER}//o) {
  push @output, map $self->nord($_), split //, $1;
  if ( exists $self->{DEBUG} ) {
    $self->{trace} .= 'literal portion is "' . $1 . $self->{DELIMITER}
                   .  '", so extended string starts as:' . "\n"
                   .  join(' ', map hex4($self->nchr($_)), @output) . "\n";
  }
  my $bas = join('',@{$self->{BASIC}});
  for ( split //, $1 ) {
    return _croak('non-basic code point' ) unless $bas =~ /$_/o;
  }
} else {
  if ( exists $self->{DEBUG} ) {
    $self->{trace} .=
         "there is no delimiter, so extended string starts empty\n";
  }
}

while ($code) {
  my $oldi = $i;
  my $w    = 1;
  if ( exists $self->{DEBUG} ) {
    $self->{trace} .= 'delta "';
  }
LOOP:
  for (my $k = $self->{BASE}; 1; $k += $self->{BASE}) {
    my $cp = substr($code, 0, 1, '');
    my $digit = $self->nord($cp);
    if ( exists $self->{DEBUG} ) {
      $self->{trace} .= $cp;
    }
    defined $digit or return _croak("invalid punycode input");
    $i += $digit * $w;
      my $t = ($k <= $bias)
              ? $self->{TMIN}
              : ($k >= $bias + $self->{TMAX})
                ? $self->{TMAX}
                : $k - $bias;
      last LOOP if $digit < $t;
      $w *= ($self->{BASE} - $t);
  }
  if ( exists $self->{DEBUG} ) {
    $self->{trace} .= '" decodes to ' . "$i\n";
  }
  $bias = $self->adapt($i - $oldi, @output + 1, $oldi == 0);
  if ( exists $self->{DEBUG} ) {
    $self->{trace} .= "bias becomes $bias\n";
  }
  $n += $i / (@output + 1);
  $i = $i % (@output + 1);
  splice(@output, $i, 0, $n);
  if ( exists $self->{DEBUG} ) {
    $self->{trace} .= join(' ', map hex4($self->nchr($_)), @output) . "\n";
  }
  $i++;
}
my $res = pack("C*", map ord $self->nchr($_), @output);
return $res;
}

AUTHOR

Soren Dossing, <netcom at sauber.net>

BUGS Please report any bugs or feature requests to bug-encode-bootstring at rt.cpan.org, or through the web interface at "/rt.cpan.org/NoAuth/ReportBug.html?Queue=Encode-Boo tstring" in http:. I will be notified, and then you'll automatically be notified of progress on your bug as I make changes.

SUPPORT

You can find documentation for this module with the perldoc command.

perldoc Encode::Bootstring

You can also look for information at:

ACKNOWLEDGEMENTS

Adam M. Costello for punycode reference implementation, and for advice and review of this more generic module.

COPYRIGHT & LICENSE

Copyright 2009 Soren Dossing.

This program is free software; you can redistribute it and/or modify it under the terms of either: the GNU General Public License as published by the Free Software Foundation; or the Artistic License.

See http://dev.perl.org/licenses/ for more information.