The Perl Toolchain Summit 2025 Needs You: You can help 🙏 Learn more

#!perl
#
# This auxiliary script makes five header files
# used for building XSUB of Unicode::Collate.
#
# Usage:
# <do 'mkheader'> in perl, or <perl mkheader> in command line
#
# Input file:
# Collate/allkeys.txt
#
# Output file:
# ucatbl.h
#
use 5.006;
use strict;
use Carp;
BEGIN {
unless ("A" eq pack('U', 0x41)) {
die "Unicode::Collate cannot stringify a Unicode code point\n";
}
}
use constant TRUE => 1;
use constant FALSE => "";
use constant VCE_TEMPLATE => 'Cn4';
sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
our $PACKAGE = 'Unicode::Collate, mkheader';
our $prefix = "UCA_";
our %SimpleEntries; # $codepoint => $keys
our @Rest;
{
my($f, $fh);
foreach my $d (File::Spec->curdir()) {
$f = File::Spec->catfile($d, "Collate", "allkeys.txt");
last if open($fh, $f);
$f = undef;
}
croak "$PACKAGE: Collate/allkeys.txt is not found" if !defined $f;
while (my $line = <$fh>) {
next if $line =~ /^\s*#/;
if ($line =~ /^\s*\@/) {
push @Rest, $line;
next;
}
next if $line !~ /^\s*[0-9A-Fa-f]/;
$line =~ s/[#%]\s*(.*)//; # removing comment (not getting the name)
# gets element
my($e, $k) = split /;/, $line;
croak "Wrong Entry: <charList> must be separated by ';' ".
"from <collElement>" if ! $k;
my @uv = _getHexArray($e);
next if !@uv;
if (@uv != 1) {
push @Rest, $line;
next;
}
my $is_L3_ignorable = TRUE;
my @key;
foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
my @wt = _getHexArray($arr);
push @key, pack(VCE_TEMPLATE, $var, @wt);
$is_L3_ignorable = FALSE
if $wt[0] || $wt[1] || $wt[2];
# Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
# is completely ignorable.
# For expansion, an entry $is_L3_ignorable
# if and only if "all" CEs are [.0000.0000.0000].
}
my $mapping = $is_L3_ignorable ? [] : \@key;
my $num = @$mapping;
my $str = chr($num).join('', @$mapping);
$SimpleEntries{$uv[0]} = stringify($str);
}
}
sub stringify {
my $str = shift;
return sprintf '"%s"', join '',
map sprintf("\\x%02x", ord $_), split //, $str;
}
########## writing header files ##########
my $init = '';
{
my $type = "char*";
my $head = $prefix."rest";
$init .= "static $type $head [] = {\n";
for my $line (@Rest) {
$line =~ s/\s*\z//;
next if $line eq '';
$init .= "/*$line*/\n" if $line =~ /^[A-Za-z0-9_.:;@\ \[\]]+\z/;
$init .= "($type)".stringify($line).",\n";
}
$init .= "NULL\n"; # sentinel
$init .= "};\n\n";
}
my @tripletable = (
{
file => "ucatbl",
name => "simple",
type => "char*",
hash => \%SimpleEntries,
null => "NULL",
init => $init,
},
);
foreach my $tbl (@tripletable) {
my $file = "$tbl->{file}.h";
my $head = "${prefix}$tbl->{name}";
my $type = $tbl->{type};
my $hash = $tbl->{hash};
my $null = $tbl->{null};
my $init = $tbl->{init};
open FH, ">$file" or croak "$PACKAGE: $file can't be made";
binmode FH; select FH;
my %val;
print FH << 'EOF';
/*
* This file is auto-generated by mkheader.
* Any changes here will be lost!
*/
EOF
print $init if defined $init;
foreach my $uv (keys %$hash) {
croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv)
unless $uv <= 0x10FFFF;
my @c = unpack 'CCCC', pack 'N', $uv;
$val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv};
}
foreach my $p (sort { $a <=> $b } keys %val) {
next if ! $val{ $p };
for (my $r = 0; $r < 256; $r++) {
next if ! $val{ $p }{ $r };
printf "static $type ${head}_%02x_%02x [256] = {\n", $p, $r;
for (my $c = 0; $c < 256; $c++) {
print "\t", defined $val{$p}{$r}{$c}
? "($type)".$val{$p}{$r}{$c}
: $null;
print ',' if $c != 255;
print "\n" if $c % 8 == 7;
}
print "};\n\n";
}
}
foreach my $p (sort { $a <=> $b } keys %val) {
next if ! $val{ $p };
printf "static $type* ${head}_%02x [256] = {\n", $p;
for (my $r = 0; $r < 256; $r++) {
print $val{ $p }{ $r }
? sprintf("${head}_%02x_%02x", $p, $r)
: "NULL";
print ',' if $r != 255;
print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0;
}
print "};\n\n";
}
print "static $type** $head [] = {\n";
for (my $p = 0; $p <= 0x10; $p++) {
print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL";
print ',' if $p != 0x10;
print "\n";
}
print "};\n\n";
close FH;
}
1;
__END__