From Code to Community: Sponsoring The Perl and Raku Conference 2025 Learn more

#!perl
#
# This auxiliary script makes five header files
# used for building XSUB of Unicode::Normalize.
#
# Usage:
# <do 'mkheader'> in perl, or <perl mkheader> in command line
#
# Input files:
# unicore/CombiningClass.pl (or unicode/CombiningClass.pl)
# unicore/Decomposition.pl (or unicode/Decomposition.pl)
# ./Normalize/CompExcl.pl
#
# Output files:
# unfcan.h
# unfcpt.h
# unfcmb.h
# unfcmp.h
# unfexc.h
#
use 5.006;
use strict;
use Carp;
BEGIN {
unless ("A" eq pack('U', 0x41)) {
die "Unicode::Normalize cannot stringify a Unicode code point\n";
}
}
our $PACKAGE = 'Unicode::Normalize, mkheader';
our $prefix = "UNF_";
our $structname = "${prefix}complist";
sub pack_U {
return pack('U*', @_);
}
# %Canon and %Compat will be ($codepoint => $hexstring) after _U_stringify()
our %Comp1st; # $codepoint => $listname : may be composed with a next char.
our %CompList; # $listname,$2nd => $codepoint : composite
my $File_CompExcl = File::Spec->catfile(
File::Spec->curdir(), 'Normalize', 'CompExcl.pl'
);
##### The below part is common to mkheader and PP #####
our %Combin; # $codepoint => $number : combination class
our %Canon; # $codepoint => \@codepoints : canonical decomp.
our %Compat; # $codepoint => \@codepoints : compat. decomp.
our %Compos; # $1st,$2nd => $codepoint : composite
our %Exclus; # $codepoint => 1 : composition exclusions
our %Single; # $codepoint => 1 : singletons
our %NonStD; # $codepoint => 1 : non-starter decompositions
our %Comp2nd; # $codepoint => 1 : may be composed with a prev char.
# from core Unicode database
our $Combin = do "unicore/CombiningClass.pl"
|| do "unicode/CombiningClass.pl"
|| croak "$PACKAGE: CombiningClass.pl not found";
our $Decomp = do "unicore/Decomposition.pl"
|| do "unicode/Decomposition.pl"
|| croak "$PACKAGE: Decomposition.pl not found";
# from self Unicode database
our $CompEx = do $File_CompExcl
|| croak "$PACKAGE: CompExcl.pl not found";
# definition of Hangul constants
use constant SBase => 0xAC00;
use constant SFinal => 0xD7A3; # SBase -1 + SCount
use constant SCount => 11172; # LCount * NCount
use constant NCount => 588; # VCount * TCount
use constant LBase => 0x1100;
use constant LFinal => 0x1112;
use constant LCount => 19;
use constant VBase => 0x1161;
use constant VFinal => 0x1175;
use constant VCount => 21;
use constant TBase => 0x11A7;
use constant TFinal => 0x11C2;
use constant TCount => 28;
sub decomposeHangul {
my $sindex = $_[0] - SBase;
my $lindex = int( $sindex / NCount);
my $vindex = int(($sindex % NCount) / TCount);
my $tindex = $sindex % TCount;
my @ret = (
LBase + $lindex,
VBase + $vindex,
$tindex ? (TBase + $tindex) : (),
);
return wantarray ? @ret : pack_U(@ret);
}
########## getting full decomposition ##########
## converts string "hhhh hhhh hhhh" to a numeric list
## (hex digits separated by spaces)
sub _getHexArray { map hex, $_[0] =~ /\G *([0-9A-Fa-f]+)/g }
while ($Combin =~ /(.+)/g) {
my @tab = split /\t/, $1;
my $ini = hex $tab[0];
if ($tab[1] eq '') {
$Combin{$ini} = $tab[2];
} else {
$Combin{$_} = $tab[2] foreach $ini .. hex($tab[1]);
}
}
while ($Decomp =~ /(.+)/g) {
my @tab = split /\t/, $1;
my $compat = $tab[2] =~ s/<[^>]+>//;
my $dec = [ _getHexArray($tab[2]) ]; # decomposition
my $ini = hex($tab[0]); # initial decomposable character
my $end = $tab[1] eq '' ? $ini : hex($tab[1]);
# ($ini .. $end) is the range of decomposable characters.
foreach my $u ($ini .. $end) {
$Compat{$u} = $dec;
$Canon{$u} = $dec if ! $compat;
}
}
while ($CompEx =~ /(.+)/g) {
my $s = $1;
next if $s =~ /^#/;
$s =~ s/#.*//;
foreach my $u (_getHexArray($s)) {
next if !$Canon{$u}; # not assigned
next if $u == 0xFB1D && !$Canon{0x1D15E}; # 3.0.1 before Corrigendum #2
$Exclus{$u} = 1;
}
}
foreach my $u (keys %Canon) {
my $dec = $Canon{$u};
if (@$dec == 2) {
if ($Combin{ $dec->[0] }) {
$NonStD{$u} = 1;
} else {
$Compos{ $dec->[0] }{ $dec->[1] } = $u;
$Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u};
}
} elsif (@$dec == 1) {
$Single{$u} = 1;
} else {
my $h = sprintf '%04X', $u;
croak("Weird Canonical Decomposition of U+$h");
}
}
# modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo
foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) {
$Comp2nd{$j} = 1;
}
sub getCanonList {
my @src = @_;
my @dec = map {
(SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
: $Canon{$_} ? @{ $Canon{$_} } : $_
} @src;
return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
# condition @src == @dec is not ok.
}
sub getCompatList {
my @src = @_;
my @dec = map {
(SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
: $Compat{$_} ? @{ $Compat{$_} } : $_
} @src;
return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
# condition @src == @dec is not ok.
}
# exhaustive decomposition
foreach my $key (keys %Canon) {
$Canon{$key} = [ getCanonList($key) ];
}
# exhaustive decomposition
foreach my $key (keys %Compat) {
$Compat{$key} = [ getCompatList($key) ];
}
##### The above part is common to mkheader and PP #####
foreach my $comp1st (keys %Compos) {
my $listname = sprintf("${structname}_%06x", $comp1st);
# %04x is bad since it'd place _3046 after _1d157.
$Comp1st{$comp1st} = $listname;
my $rh1st = $Compos{$comp1st};
foreach my $comp2nd (keys %$rh1st) {
my $uc = $rh1st->{$comp2nd};
$CompList{$listname}{$comp2nd} = $uc;
}
}
sub split_into_char {
use bytes;
my $uni = shift;
my $len = length($uni);
my @ary;
for(my $i = 0; $i < $len; ++$i) {
push @ary, ord(substr($uni,$i,1));
}
return @ary;
}
sub _U_stringify {
sprintf '"%s"', join '',
map sprintf("\\x%02x", $_), split_into_char(pack_U(@_));
}
foreach my $hash (\%Canon, \%Compat) {
foreach my $key (keys %$hash) {
$hash->{$key} = _U_stringify( @{ $hash->{$key} } );
}
}
########## writing header files ##########
my @boolfunc = (
{
name => "Exclusion",
type => "bool",
hash => \%Exclus,
},
{
name => "Singleton",
type => "bool",
hash => \%Single,
},
{
name => "NonStDecomp",
type => "bool",
hash => \%NonStD,
},
{
name => "Comp2nd",
type => "bool",
hash => \%Comp2nd,
},
);
my $file = "unfexc.h";
open FH, ">$file" or croak "$PACKAGE: $file can't be made";
binmode FH; select FH;
print << 'EOF';
/*
* This file is auto-generated by mkheader.
* Any changes here will be lost!
*/
EOF
foreach my $tbl (@boolfunc) {
my @temp = sort {$a <=> $b} keys %{$tbl->{hash}};
my $type = $tbl->{type};
my $name = $tbl->{name};
print "$type is$name (UV uv)\n{\nreturn\n\t";
while (@temp) {
my $cur = shift @temp;
if (@temp && $cur + 1 == $temp[0]) {
print "($cur <= uv && uv <= ";
while (@temp && $cur + 1 == $temp[0]) {
$cur = shift @temp;
}
print "$cur)";
print "\n\t|| " if @temp;
} else {
print "uv == $cur";
print "\n\t|| " if @temp;
}
}
print "\n\t? TRUE : FALSE;\n}\n\n";
}
close FH;
####################################
my $compinit =
"typedef struct { UV nextchar; UV composite; } $structname;\n\n";
foreach my $i (sort keys %CompList) {
$compinit .= "$structname $i [] = {\n";
$compinit .= join ",\n",
map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}),
sort {$a <=> $b } keys %{ $CompList{$i} };
$compinit .= ",\n{0,0}\n};\n\n"; # with sentinel
}
my @tripletable = (
{
file => "unfcmb",
name => "combin",
type => "STDCHAR",
hash => \%Combin,
null => 0,
},
{
file => "unfcan",
name => "canon",
type => "char*",
hash => \%Canon,
null => "NULL",
},
{
file => "unfcpt",
name => "compat",
type => "char*",
hash => \%Compat,
null => "NULL",
},
{
file => "unfcmp",
name => "compos",
type => "$structname *",
hash => \%Comp1st,
null => "NULL",
init => $compinit,
},
);
foreach my $tbl (@tripletable) {
my $file = "$tbl->{file}.h";
my $head = "${prefix}$tbl->{name}";
my $type = $tbl->{type};
my $hash = $tbl->{hash};
my $null = $tbl->{null};
my $init = $tbl->{init};
open FH, ">$file" or croak "$PACKAGE: $file can't be made";
binmode FH; select FH;
my %val;
print FH << 'EOF';
/*
* This file is auto-generated by mkheader.
* Any changes here will be lost!
*/
EOF
print $init if defined $init;
foreach my $uv (keys %$hash) {
croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv)
unless $uv <= 0x10FFFF;
my @c = unpack 'CCCC', pack 'N', $uv;
$val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv};
}
foreach my $p (sort { $a <=> $b } keys %val) {
next if ! $val{ $p };
for (my $r = 0; $r < 256; $r++) {
next if ! $val{ $p }{ $r };
printf "static $type ${head}_%02x_%02x [256] = {\n", $p, $r;
for (my $c = 0; $c < 256; $c++) {
print "\t", defined $val{$p}{$r}{$c}
? "($type)".$val{$p}{$r}{$c}
: $null;
print ',' if $c != 255;
print "\n" if $c % 8 == 7;
}
print "};\n\n";
}
}
foreach my $p (sort { $a <=> $b } keys %val) {
next if ! $val{ $p };
printf "static $type* ${head}_%02x [256] = {\n", $p;
for (my $r = 0; $r < 256; $r++) {
print $val{ $p }{ $r }
? sprintf("${head}_%02x_%02x", $p, $r)
: "NULL";
print ',' if $r != 255;
print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0;
}
print "};\n\n";
}
print "static $type** $head [] = {\n";
for (my $p = 0; $p <= 0x10; $p++) {
print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL";
print ',' if $p != 0x10;
print "\n";
}
print "};\n\n";
close FH;
}
1;
__END__