Legal \\p{}
and \\P{}
constructs that match no characters
Unicode has some property-value pairs that currently don't match anything. This happens generally either because they are obsolete, or they exist for symmetry with other forms, but no language has yet been encoded that uses them. In this version of Unicode, the following match zero code points:
$zero_matches
END }
# Generate list of properties that we don't accept, grouped by the reasons
# why. This is so only put out the 'why' once, and then list all the
# properties that have that reason under it.
my %why_list; # The keys are the reasons; the values are lists of
# properties that have the key as their reason
# For each property, add it to the list that are suppressed for its reason
# The sort will cause the alphabetically first properties to be added to
# each list first, so each list will be sorted.
foreach my $property (sort keys %why_suppressed) {
next unless $why_suppressed{$property};
push @{$why_list{$why_suppressed{$property}}}, $property;
}
# For each reason (sorted by the first property that has that reason)...
my @bad_re_properties;
foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
keys %why_list)
{
# Add to the output, all the properties that have that reason.
my $has_item = 0; # Flag if actually output anything.
foreach my $name (@{$why_list{$why}}) {
# Split compound names into $property and $table components
my $property = $name;
my $table;
if ($property =~ / (.*) = (.*) /x) {
$property = $1;
$table = $2;
}
# This release of Unicode may not have a property that is
# suppressed, so don't reference a non-existent one.
$property = property_ref($property);
next if ! defined $property;
# And since this list is only for match tables, don't list the
# ones that don't have match tables.
next if ! $property->to_create_match_tables;
# Find any abbreviation, and turn it into a compound name if this
# is a property=value pair.
my $short_name = $property->name;
$short_name .= '=' . $property->table($table)->name if $table;
# Start with an empty line.
push @bad_re_properties, "\n\n" unless $has_item;
# And add the property as an item for the reason.
push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
$has_item = 1;
}
# And add the reason under the list of properties, if such a list
# actually got generated. Note that the header got added
# unconditionally before. But pod ignores extra blank lines, so no
# harm.
push @bad_re_properties, "\n$why\n" if $has_item;
} # End of looping through each reason.
if (! @bad_re_properties) {
push @bad_re_properties,
"*** This installation accepts ALL non-Unihan properties ***";
}
else {
# Add =over only if non-empty to avoid an empty =over/=back section,
# which is considered bad form.
unshift @bad_re_properties, "\n=over 4\n";
push @bad_re_properties, "\n=back\n";
}
# Similarly, generate a list of files that we don't use, grouped by the
# reasons why (Don't output if the reason is empty). First, create a hash
# whose keys are the reasons, and whose values are anonymous arrays of all
# the files that share that reason.
my %grouped_by_reason;
foreach my $file (keys %skipped_files) {
next unless $skipped_files{$file};
push @{$grouped_by_reason{$skipped_files{$file}}}, $file;
}
# Then, sort each group.
foreach my $group (keys %grouped_by_reason) {
@{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b }
@{$grouped_by_reason{$group}} ;
}
# Finally, create the output text. For each reason (sorted by the
# alphabetically first file that has that reason)...
my @unused_files;
foreach my $reason (sort { lc $grouped_by_reason{$a}->[0]
cmp lc $grouped_by_reason{$b}->[0]
}
keys %grouped_by_reason)
{
# Add all the files that have that reason to the output. Start
# with an empty line.
push @unused_files, "\n\n";
push @unused_files, map { "\n=item F<$_> \n" }
@{$grouped_by_reason{$reason}};
# And add the reason under the list of files
push @unused_files, "\n$reason\n";
}
# Similarly, create the output text for the UCD section of the pod
my @ucd_pod;
foreach my $key (keys %ucd_pod) {
next unless $ucd_pod{$key}->{'output_this'};
push @ucd_pod, format_pod_line($indent_info_column,
$ucd_pod{$key}->{'name'},
$ucd_pod{$key}->{'info'},
$ucd_pod{$key}->{'status'},
);
}
# Sort alphabetically, and fold for output
@ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod;
my $ucd_pod = simple_fold(\@ucd_pod,
' ',
$indent_info_column,
$automatic_pod_indent);
$ucd_pod = format_pod_line($indent_info_column, 'NAME', ' INFO')
. "\n"
. $ucd_pod;
my $space_hex = sprintf("%02x", ord " ");
local $" = "";
# Everything is ready to assemble.
my @OUT = << "END";
=begin comment
$HEADER
To change this file, edit $0 instead.
NAME
$pod_file - Index of Unicode Version $unicode_version character properties in Perl
DESCRIPTION
This document provides information about the portion of the Unicode database that deals with character properties, that is the portion that is defined on single code points. ("Other information in the Unicode data base" below briefly mentions other data that Unicode provides.)
Perl can provide access to all non-provisional Unicode character properties, though not all are enabled by default. The omitted ones are the Unihan properties and certain deprecated or Unicode-internal properties. (An installation may choose to recompile Perl's tables to change this. See "Unicode character properties that are NOT accepted by Perl".)
For most purposes, access to Unicode properties from the Perl core is through regular expression matches, as described in the next section. For some special purposes, and to access the properties that are not suitable for regular expression matching, all the Unicode character properties that Perl handles are accessible via the standard Unicode::UCD module, as described in the section "Properties accessible through Unicode::UCD".
Perl also provides some additional extensions and short-cut synonyms for Unicode properties.
This document merely lists all available properties and does not attempt to explain what each property really means. There is a brief description of each Perl extension; see "Other Properties" in perlunicode for more information on these. There is some detail about Blocks, Scripts, General_Category, and Bidi_Class in perlunicode, but to find out about the intricacies of the official Unicode properties, refer to the Unicode standard. A good starting place is $unicode_reference_url.
Note that you can define your own properties; see "User-Defined Character Properties" in perlunicode.
Properties accessible through \\p{}
and \\P{}
The Perl regular expression \\p{}
and \\P{}
constructs give access to most of the Unicode character properties. The table below shows all these constructs, both single and compound forms.
Compound forms consist of two components, separated by an equals sign or a colon. The first component is the property name, and the second component is the particular value of the property to match against, for example, \\p{Script_Extensions: Greek}
and \\p{Script_Extensions=Greek}
both mean to match characters whose Script_Extensions property value is Greek. (Script_Extensions
is an improved version of the Script
property.)
Single forms, like \\p{Greek}
, are mostly Perl-defined shortcuts for their equivalent compound forms. The table shows these equivalences. (In our example, \\p{Greek}
is a just a shortcut for \\p{Script_Extensions=Greek}
). There are also a few Perl-defined single forms that are not shortcuts for a compound form. One such is \\p{Word}
. These are also listed in the table.
In parsing these constructs, Perl always ignores Upper/lower case differences everywhere within the {braces}. Thus \\p{Greek}
means the same thing as \\p{greek}
. But note that changing the case of the "p"
or "P"
before the left brace completely changes the meaning of the construct, from "match" (for \\p{}
) to "doesn't match" (for \\P{}
). Casing in this document is for improved legibility.
Also, white space, hyphens, and underscores are normally ignored everywhere between the {braces}, and hence can be freely added or removed even if the /x
modifier hasn't been specified on the regular expression. But in the table below $a_bold_stricter at the beginning of an entry means that tighter (stricter) rules are used for that entry:
- Single form (
\\p{name}
) tighter rules: -
White space, hyphens, and underscores ARE significant except for:
white space adjacent to a non-word character
underscores separating digits in numbers
That means, for example, that you can freely add or remove white space adjacent to (but within) the braces without affecting the meaning.
- Compound form (
\\p{name=value}
or\\p{name:value}
) tighter rules: -
The tighter rules given above for the single form apply to everything to the right of the colon or equals; the looser rules still apply to everything to the left.
That means, for example, that you can freely add or remove white space adjacent to (but within) the braces and the colon or equal sign.
Some properties are considered obsolete by Unicode, but still available. There are several varieties of obsolescence:
- Stabilized
-
A property may be stabilized. Such a determination does not indicate that the property should or should not be used; instead it is a declaration that the property will not be maintained nor extended for newly encoded characters. Such properties are marked with $a_bold_stabilized in the table.
- Deprecated
-
A property may be deprecated, perhaps because its original intent has been replaced by another property, or because its specification was somehow defective. This means that its use is strongly discouraged, so much so that a warning will be issued if used, unless the regular expression is in the scope of a
no warnings 'deprecated'
statement. $A_bold_deprecated flags each such entry in the table, and the entry there for the longest, most descriptive version of the property will give the reason it is deprecated, and perhaps advice. Perl may issue such a warning, even for properties that aren't officially deprecated by Unicode, when there used to be characters or code points that were matched by them, but no longer. This is to warn you that your program may not work like it did on earlier Unicode releases.A deprecated property may be made unavailable in a future Perl version, so it is best to move away from them.
A deprecated property may also be stabilized, but this fact is not shown.
- Obsolete
-
Properties marked with $a_bold_obsolete in the table are considered (plain) obsolete. Generally this designation is given to properties that Unicode once used for internal purposes (but not any longer).
- Discouraged
-
This is not actually a Unicode-specified obsolescence, but applies to certain Perl extensions that are present for backwards compatibility, but are discouraged from being used. These are not obsolete, but their meanings are not stable. Future Unicode versions could force any of these extensions to be removed without warning, replaced by another property with the same name that means something different. $A_bold_discouraged flags each such entry in the table. Use the equivalent shown instead.
@block_warning
The table below has two columns. The left column contains the \\p{}
constructs to look up, possibly preceded by the flags mentioned above; and the right column contains information about them, like a description, or synonyms. The table shows both the single and compound forms for each property that has them. If the left column is a short name for a property, the right column will give its longer, more descriptive name; and if the left column is the longest name, the right column will show any equivalent shortest name, in both single and compound forms if applicable.
If braces are not needed to specify a property (e.g., \\pL
), the left column contains both forms, with and without braces.
The right column will also caution you if a property means something different than what might normally be expected.
All single forms are Perl extensions; a few compound forms are as well, and are noted as such.
Numbers in (parentheses) indicate the total number of Unicode code points matched by the property. For the entries that give the longest, most descriptive version of the property, the count is followed by a list of some of the code points matched by it. The list includes all the matched characters in the 0-255 range, enclosed in the familiar [brackets] the same as a regular expression bracketed character class. Following that, the next few higher matching ranges are also given. To avoid visual ambiguity, the SPACE character is represented as \\x$space_hex
.
For emphasis, those properties that match no code points at all are listed as well in a separate section following the table.
Most properties match the same code points regardless of whether "/i"
case-insensitive matching is specified or not. But a few properties are affected. These are shown with the notation (/i= other_property)
in the second column. Under case-insensitive matching they match the same code pode points as the property other_property.
There is no description given for most non-Perl defined properties (See $unicode_reference_url for that).
For compactness, '*' is used as a wildcard instead of showing all possible combinations. For example, entries like:
\\p{Gc: *} \\p{General_Category: *}
mean that 'Gc' is a synonym for 'General_Category', and anything that is valid for the latter is also valid for the former. Similarly,
\\p{Is_*} \\p{*}
means that if and only if, for example, \\p{Foo}
exists, then \\p{Is_Foo}
and \\p{IsFoo}
are also valid and all mean the same thing. And similarly, \\p{Foo=Bar}
means the same as \\p{Is_Foo=Bar}
and \\p{IsFoo=Bar}
. "*" here is restricted to something not beginning with an underscore.
Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'. And 'No', 'F', and 'False' are all synonyms for 'N'. The table shows 'Y*' and 'N*' to indicate this, and doesn't have separate entries for the other possibilities. Note that not all properties which have values 'Yes' and 'No' are binary, and they have all their values spelled out without using this wild card, and a NOT
clause in their description that highlights their not being binary. These also require the compound form to match them, whereas true binary properties have both single and compound forms available.
Note that all non-essential underscores are removed in the display of the short names below.
Legend summary:
- * is a wild-card
- (\\d+) in the info column gives the number of Unicode code points matched by this property.
- $DEPRECATED means this is deprecated.
- $OBSOLETE means this is obsolete.
- $STABILIZED means this is stabilized.
- $STRICTER means tighter (stricter) name matching applies.
- $DISCOURAGED means use of this form is discouraged, and may not be stable.
$formatted_properties
$zero_matches
Properties accessible through Unicode::UCD
The value of any Unicode (not including Perl extensions) character property mentioned above for any single code point is available through "charprop()" in Unicode::UCD. "charprops_all()" in Unicode::UCD returns the values of all the Unicode properties for a given code point.
Besides these, all the Unicode character properties mentioned above (except for those marked as for internal use by Perl) are also accessible by "prop_invlist()" in Unicode::UCD.
Due to their nature, not all Unicode character properties are suitable for regular expression matches, nor prop_invlist()
. The remaining non-provisional, non-internal ones are accessible via "prop_invmap()" in Unicode::UCD (except for those that this Perl installation hasn't included; see below for which those are).
For compatibility with other parts of Perl, all the single forms given in the table in the section above are recognized. BUT, there are some ambiguities between some Perl extensions and the Unicode properties, all of which are silently resolved in favor of the official Unicode property. To avoid surprises, you should only use prop_invmap()
for forms listed in the table below, which omits the non-recommended ones. The affected forms are the Perl single form equivalents of Unicode properties, such as \\p{sc}
being a single-form equivalent of \\p{gc=sc}
, which is treated by prop_invmap()
as the Script
property, whose short name is sc
. The table indicates the current ambiguities in the INFO column, beginning with the word "NOT"
.
The standard Unicode properties listed below are documented in $unicode_reference_url; Perl_Decimal_Digit is documented in "prop_invmap()" in Unicode::UCD. The other Perl extensions are in "Other Properties" in perlunicode;
The first column in the table is a name for the property; the second column is an alternative name, if any, plus possibly some annotations. The alternative name is the property's full name, unless that would simply repeat the first column, in which case the second column indicates the property's short name (if different). The annotations are given only in the entry for the full name. The annotations for binary properties include a list of the first few ranges that the property matches. To avoid any ambiguity, the SPACE character is represented as \\x$space_hex
.
If a property is obsolete, etc, the entry will be flagged with the same characters used in the table in the section above, like $DEPRECATED or $STABILIZED.
$ucd_pod
Properties accessible through other means
Certain properties are accessible also via core function calls. These are:
Lowercase_Mapping lc() and lcfirst()
Titlecase_Mapping ucfirst()
Uppercase_Mapping uc()
Also, Case_Folding is accessible through the /i
modifier in regular expressions, the \\F
transliteration escape, and the fc
operator.
Besides being able to say \\p{Name=...}
, the Name and Name_Aliases properties are accessible through the \\N{}
interpolation in double-quoted strings and regular expressions; and functions charnames::viacode()
, charnames::vianame()
, and charnames::string_vianame()
(which require a use charnames ();
to be specified.
Finally, most properties related to decomposition are accessible via Unicode::Normalize.
Unicode character properties that are NOT accepted by Perl
Perl will generate an error for a few character properties in Unicode when used in a regular expression. The non-Unihan ones are listed below, with the reasons they are not accepted, perhaps with work-arounds. The short names for the properties are listed enclosed in (parentheses). As described after the list, an installation can change the defaults and choose to accept any of these. The list is machine generated based on the choices made for the installation that generated this document.
@bad_re_properties
An installation can choose to allow any of these to be matched by downloading the Unicode database from http://www.unicode.org/Public/ to \$Config{privlib}
/unicore/ in the Perl source tree, changing the controlling lists contained in the program \$Config{privlib}
/unicore/mktables and then re-compiling and installing. (\%Config
is available from the Config module).
Also, perl can be recompiled to operate on an earlier version of the Unicode standard. Further information is at \$Config{privlib}
/unicore/README.perl.
Other information in the Unicode data base
The Unicode data base is delivered in two different formats. The XML version is valid for more modern Unicode releases. The other version is a collection of files. The two are intended to give equivalent information. Perl uses the older form; this allows you to recompile Perl to use early Unicode releases.
The only non-character property that Perl currently supports is Named Sequences, in which a sequence of code points is given a name and generally treated as a single entity. (Perl supports these via the \\N{...}
double-quotish construct, "charnames::string_vianame(name)" in charnames, and "namedseq()" in Unicode::UCD.
Below is a list of the files in the Unicode data base that Perl doesn't currently use, along with very brief descriptions of their purposes. Some of the names of the files have been shortened from those that Unicode uses, in order to allow them to be distinguishable from similarly named files on file systems for which only the first 8 characters of a name are significant.
@unused_files
SEE ALSO
END
# And write it. The 0 means no utf8.
main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
return;
}
sub make_Name_pm () { # Create and write Name.pm, which contains subroutines and data to use in # conjunction with Name.pl
# Maybe there's nothing to do.
return unless $has_hangul_syllables || @code_points_ending_in_code_point;
my @name = <<END;
$HEADER
$INTERNAL_ONLY_HEADER
END
# Convert these structures to output format.
my $code_points_ending_in_code_point =
main::simple_dumper(\@code_points_ending_in_code_point,
' ' x 8);
my $names = main::simple_dumper(\%names_ending_in_code_point,
' ' x 8);
my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point,
' ' x 8);
# Do the same with the Hangul names,
my $jamo;
my $jamo_l;
my $jamo_v;
my $jamo_t;
my $jamo_re;
if ($has_hangul_syllables) {
# Construct a regular expression of all the possible
# combinations of the Hangul syllables.
my @L_re; # Leading consonants
for my $i ($LBase .. $LBase + $LCount - 1) {
push @L_re, $Jamo{$i}
}
my @V_re; # Middle vowels
for my $i ($VBase .. $VBase + $VCount - 1) {
push @V_re, $Jamo{$i}
}
my @T_re; # Trailing consonants
for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
push @T_re, $Jamo{$i}
}
# The whole re is made up of the L V T combination.
$jamo_re = '('
. join ('|', sort @L_re)
. ')('
. join ('|', sort @V_re)
. ')('
. join ('|', sort @T_re)
. ')?';
# These hashes needed by the algorithm were generated
# during reading of the Jamo.txt file
$jamo = main::simple_dumper(\%Jamo, ' ' x 8);
$jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
$jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
$jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
}
push @name, <<END;
package charnames;
# This module contains machine-generated tables and code for the # algorithmically-determinable Unicode character names. The following # routines can be used to translate between name and code point and vice versa
{ # Closure
# Matches legal code point. 4-6 hex numbers, If there are 6, the first
# two must be 10; if there are 5, the first must not be a 0. Written this
# way to decrease backtracking. The first regex allows the code point to
# be at the end of a word, but to work properly, the word shouldn't end
# with a valid hex character. The second one won't match a code point at
# the end of a word, and doesn't have the run-on issue
my \$run_on_code_point_re = qr/$run_on_code_point_re/;
my \$code_point_re = qr/$code_point_re/;
# In the following hash, the keys are the bases of names which include
# the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01. The value
# of each key is another hash which is used to get the low and high ends
# for each range of code points that apply to the name.
my %names_ending_in_code_point = (
$names
);
# The following hash is a copy of the previous one, except is for loose
# matching, so each name has blanks and dashes squeezed out
my %loose_names_ending_in_code_point = (
$loose_names
);
# And the following array gives the inverse mapping from code points to
# names. Lowest code points are first
\@code_points_ending_in_code_point = (
$code_points_ending_in_code_point
);
# Is exportable, make read-only
Internals::SvREADONLY(\@code_points_ending_in_code_point, 1);
END
# Earlier releases didn't have Jamos. No sense outputting
# them unless will be used.
if ($has_hangul_syllables) {
push @name, <<END;
# Convert from code point to Jamo short name for use in composing Hangul
# syllable names
my %Jamo = (
$jamo
);
# Leading consonant (can be null)
my %Jamo_L = (
$jamo_l
);
# Vowel
my %Jamo_V = (
$jamo_v
);
# Optional trailing consonant
my %Jamo_T = (
$jamo_t
);
# Computed re that splits up a Hangul name into LVT or LV syllables
my \$syllable_re = qr/$jamo_re/;
my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
# These constants names and values were taken from the Unicode standard,
# version 5.1, section 3.12. They are used in conjunction with Hangul
# syllables
my \$SBase = $SBase_string;
my \$LBase = $LBase_string;
my \$VBase = $VBase_string;
my \$TBase = $TBase_string;
my \$SCount = $SCount;
my \$LCount = $LCount;
my \$VCount = $VCount;
my \$TCount = $TCount;
my \$NCount = \$VCount * \$TCount;
END
} # End of has Jamos
push @name, << 'END';
sub name_to_code_point_special {
my ($name, $loose) = @_;
# Returns undef if not one of the specially handled names; otherwise
# returns the code point equivalent to the input name
# $loose is non-zero if to use loose matching, 'name' in that case
# must be input as upper case with all blanks and dashes squeezed out.
END
if ($has_hangul_syllables) {
push @name, << 'END';
if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
|| ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
{
return if $name !~ qr/^$syllable_re$/;
my $L = $Jamo_L{$1};
my $V = $Jamo_V{$2};
my $T = (defined $3) ? $Jamo_T{$3} : 0;
return ($L * $VCount + $V) * $TCount + $T + $SBase;
}
END
}
push @name, << 'END';
# Name must end in 'code_point' for this to handle.
return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
|| (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
my $base = $1;
my $code_point = CORE::hex $2;
my $names_ref;
if ($loose) {
$names_ref = \%loose_names_ending_in_code_point;
}
else {
return if $base !~ s/-$//;
$names_ref = \%names_ending_in_code_point;
}
# Name must be one of the ones which has the code point in it.
return if ! $names_ref->{$base};
# Look through the list of ranges that apply to this name to see if
# the code point is in one of them.
for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
return if $names_ref->{$base}{'low'}->[$i] > $code_point;
next if $names_ref->{$base}{'high'}->[$i] < $code_point;
# Here, the code point is in the range.
return $code_point;
}
# Here, looked like the name had a code point number in it, but
# did not match one of the valid ones.
return;
}
sub code_point_to_name_special {
my $code_point = shift;
# Returns the name of a code point if algorithmically determinable;
# undef if not
END
if ($has_hangul_syllables) {
push @name, << 'END';
# If in the Hangul range, calculate the name based on Unicode's
# algorithm
if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
use integer;
my $SIndex = $code_point - $SBase;
my $L = $LBase + $SIndex / $NCount;
my $V = $VBase + ($SIndex % $NCount) / $TCount;
my $T = $TBase + $SIndex % $TCount;
$name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
$name .= $Jamo{$T} if $T != $TBase;
return $name;
}
END
}
push @name, << 'END';
# Look through list of these code points for one in range.
foreach my $hash (@code_points_ending_in_code_point) {
return if $code_point < $hash->{'low'};
if ($code_point <= $hash->{'high'}) {
return sprintf("%s-%04X", $hash->{'name'}, $code_point);
}
}
return; # None found
}
} # End closure
1; END
main::write("Name.pm", 0, \@name); # The 0 means no utf8.
return;
}
sub make_UCD () { # Create and write UCD.pl, which passes info about the tables to # Unicode::UCD
# Stringify structures for output
my $loose_property_name_of
= simple_dumper(\%loose_property_name_of, ' ' x 4);
chomp $loose_property_name_of;
my $strict_property_name_of
= simple_dumper(\%strict_property_name_of, ' ' x 4);
chomp $strict_property_name_of;
my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
chomp $stricter_to_file_of;
my $inline_definitions = simple_dumper(\@inline_definitions, " " x 4);
chomp $inline_definitions;
my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4);
chomp $loose_to_file_of;
my $nv_floating_to_rational
= simple_dumper(\%nv_floating_to_rational, ' ' x 4);
chomp $nv_floating_to_rational;
my $why_deprecated = simple_dumper(\%Unicode::UCD::why_deprecated, ' ' x 4);
chomp $why_deprecated;
# We set the key to the file when we associated files with tables, but we
# couldn't do the same for the value then, as we might not have the file
# for the alternate table figured out at that time.
foreach my $cased (keys %caseless_equivalent_to) {
my @path = $caseless_equivalent_to{$cased}->file_path;
my $path;
if ($path[0] eq "#") { # Pseudo-directory '#'
$path = join '/', @path;
}
else { # Gets rid of lib/
$path = join '/', @path[1, -1];
}
$caseless_equivalent_to{$cased} = $path;
}
my $caseless_equivalent_to
= simple_dumper(\%caseless_equivalent_to, ' ' x 4);
chomp $caseless_equivalent_to;
my $loose_property_to_file_of
= simple_dumper(\%loose_property_to_file_of, ' ' x 4);
chomp $loose_property_to_file_of;
my $strict_property_to_file_of
= simple_dumper(\%strict_property_to_file_of, ' ' x 4);
chomp $strict_property_to_file_of;
my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4);
chomp $file_to_swash_name;
# Create a mapping from each alias of Perl single-form extensions to all
# its equivalent aliases, for quick look-up.
my %perlprop_to_aliases;
foreach my $table ($perl->tables) {
# First create the list of the aliases of each extension
my @aliases_list; # List of legal aliases for this extension
my $table_name = $table->name;
my $standard_table_name = standardize($table_name);
my $table_full_name = $table->full_name;
my $standard_table_full_name = standardize($table_full_name);
# Make sure that the list has both the short and full names
push @aliases_list, $table_name, $table_full_name;
my $found_ucd = 0; # ? Did we actually get an alias that should be
# output for this table
# Go through all the aliases (including the two just added), and add
# any new unique ones to the list
foreach my $alias ($table->aliases) {
# Skip non-legal names
next unless $alias->ok_as_filename;
next unless $alias->ucd;
$found_ucd = 1; # have at least one legal name
my $name = $alias->name;
my $standard = standardize($name);
# Don't repeat a name that is equivalent to one already on the
# list
next if $standard eq $standard_table_name;
next if $standard eq $standard_table_full_name;
push @aliases_list, $name;
}
# If there were no legal names, don't output anything.
next unless $found_ucd;
# To conserve memory in the program reading these in, omit full names
# that are identical to the short name, when those are the only two
# aliases for the property.
if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) {
pop @aliases_list;
}
# Here, @aliases_list is the list of all the aliases that this
# extension legally has. Now can create a map to it from each legal
# standardized alias
foreach my $alias ($table->aliases) {
next unless $alias->ucd;
next unless $alias->ok_as_filename;
push @{$perlprop_to_aliases{standardize($alias->name)}},
uniques @aliases_list;
}
}
# Make a list of all combinations of properties/values that are suppressed.
my @suppressed;
if (! $debug_skip) { # This tends to fail in this debug mode
foreach my $property_name (keys %why_suppressed) {
# Just the value
my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
# The hash may contain properties not in this release of Unicode
next unless defined (my $property = property_ref($property_name));
# Find all combinations
foreach my $prop_alias ($property->aliases) {
my $prop_alias_name = standardize($prop_alias->name);
# If no =value, there's just one combination possible for this
if (! $value_name) {
# The property may be suppressed, but there may be a proxy
# for it, so it shouldn't be listed as suppressed
next if $prop_alias->ucd;
push @suppressed, $prop_alias_name;
}
else { # Otherwise
foreach my $value_alias
($property->table($value_name)->aliases)
{
next if $value_alias->ucd;
push @suppressed, "$prop_alias_name="
. standardize($value_alias->name);
}
}
}
}
}
@suppressed = sort @suppressed; # So doesn't change between runs of this
# program
# Convert the structure below (designed for Name.pm) to a form that UCD
# wants, so it doesn't have to modify it at all; i.e. so that it includes
# an element for the Hangul syllables in the appropriate place, and
# otherwise changes the name to include the "-<code point>" suffix.
my @algorithm_names;
my $done_hangul = $v_version lt v2.0.0; # Hanguls as we know them came
# along in this version
# Copy it linearly.
for my $i (0 .. @code_points_ending_in_code_point - 1) {
# Insert the hanguls in the correct place.
if (! $done_hangul
&& $code_points_ending_in_code_point[$i]->{'low'} > $SBase)
{
$done_hangul = 1;
push @algorithm_names, { low => $SBase,
high => $SBase + $SCount - 1,
name => '<hangul syllable>',
};
}
# Copy the current entry, modified.
push @algorithm_names, {
low => $code_points_ending_in_code_point[$i]->{'low'},
high => $code_points_ending_in_code_point[$i]->{'high'},
name =>
"$code_points_ending_in_code_point[$i]->{'name'}-<code point>",
};
}
# Serialize these structures for output.
my $loose_to_standard_value
= simple_dumper(\%loose_to_standard_value, ' ' x 4);
chomp $loose_to_standard_value;
my $string_property_loose_to_name
= simple_dumper(\%string_property_loose_to_name, ' ' x 4);
chomp $string_property_loose_to_name;
my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4);
chomp $perlprop_to_aliases;
my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4);
chomp $prop_aliases;
my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4);
chomp $prop_value_aliases;
my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : "";
chomp $suppressed;
my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4);
chomp $algorithm_names;
my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4);
chomp $ambiguous_names;
my $combination_property = simple_dumper(\%combination_property, ' ' x 4);
chomp $combination_property;
my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4);
chomp $loose_defaults;
my @ucd = <<END;
$HEADER
$INTERNAL_ONLY_HEADER
# This file is for the use of Unicode::UCD
# Highest legal Unicode code point \$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING;
# Hangul syllables \$Unicode::UCD::HANGUL_BEGIN = $SBase_string; \$Unicode::UCD::HANGUL_COUNT = $SCount;
# Maps Unicode (not Perl single-form extensions) property names in loose # standard form to their corresponding standard names \%Unicode::UCD::loose_property_name_of = ( $loose_property_name_of );
# Same, but strict names \%Unicode::UCD::strict_property_name_of = ( $strict_property_name_of );
# Gives the definitions (in the form of inversion lists) for those properties # whose definitions aren't kept in files \@Unicode::UCD::inline_definitions = ( $inline_definitions );
# Maps property, table to file for those using stricter matching. For paths # whose directory is '#', the file is in the form of a numeric index into # \@inline_definitions \%Unicode::UCD::stricter_to_file_of = ( $stricter_to_file_of );
# Maps property, table to file for those using loose matching. For paths # whose directory is '#', the file is in the form of a numeric index into # \@inline_definitions \%Unicode::UCD::loose_to_file_of = ( $loose_to_file_of );
# Maps floating point to fractional form \%Unicode::UCD::nv_floating_to_rational = ( $nv_floating_to_rational );
# If a %e floating point number doesn't have this number of digits in it after # the decimal point to get this close to a fraction, it isn't considered to be # that fraction even if all the digits it does have match. \$Unicode::UCD::e_precision = $E_FLOAT_PRECISION;
# Deprecated tables to generate a warning for. The key is the file containing # the table, so as to avoid duplication, as many property names can map to the # file, but we only need one entry for all of them. \%Unicode::UCD::why_deprecated = ( $why_deprecated );
# A few properties have different behavior under /i matching. This maps # those to substitute files to use under /i. \%Unicode::UCD::caseless_equivalent = ( $caseless_equivalent_to );
# Property names to mapping files \%Unicode::UCD::loose_property_to_file_of = ( $loose_property_to_file_of );
# Property names to mapping files \%Unicode::UCD::strict_property_to_file_of = ( $strict_property_to_file_of );
# Files to the swash names within them. \%Unicode::UCD::file_to_swash_name = ( $file_to_swash_name );
# Keys are all the possible "prop=value" combinations, in loose form; values # are the standard loose name for the 'value' part of the key \%Unicode::UCD::loose_to_standard_value = ( $loose_to_standard_value );
# String property loose names to standard loose name \%Unicode::UCD::string_property_loose_to_name = ( $string_property_loose_to_name );
# Keys are Perl extensions in loose form; values are each one's list of # aliases \%Unicode::UCD::loose_perlprop_to_name = ( $perlprop_to_aliases );
# Keys are standard property name; values are each one's aliases \%Unicode::UCD::prop_aliases = ( $prop_aliases );
# Keys of top level are standard property name; values are keys to another # hash, Each one is one of the property's values, in standard form. The # values are that prop-val's aliases. If only one specified, the short and # long alias are identical. \%Unicode::UCD::prop_value_aliases = ( $prop_value_aliases );
# Ordered (by code point ordinal) list of the ranges of code points whose # names are algorithmically determined. Each range entry is an anonymous hash # of the start and end points and a template for the names within it. \@Unicode::UCD::algorithmic_named_code_points = ( $algorithm_names );
# The properties that as-is have two meanings, and which must be disambiguated \%Unicode::UCD::ambiguous_names = ( $ambiguous_names );
# Keys are the prop-val combinations which are the default values for the # given property, expressed in standard loose form \%Unicode::UCD::loose_defaults = ( $loose_defaults );
# The properties that are combinations, in that they have both a map table and # a match table. This is actually for UCD.t, so it knows how to test for # these. \%Unicode::UCD::combination_property = ( $combination_property );
# All combinations of names that are suppressed. # This is actually for UCD.t, so it knows which properties shouldn't have # entries. If it got any bigger, would probably want to put it in its own # file to use memory only when it was needed, in testing. \@Unicode::UCD::suppressed_properties = ( $suppressed );
1; END
main::write("UCD.pl", 0, \@ucd); # The 0 means no utf8.
return;
}
sub write_all_tables() { # Write out all the tables generated by this program to files, as well as # the supporting data structures, pod file, and .t file.
my @writables; # List of tables that actually get written
my %match_tables_to_write; # Used to collapse identical match tables
# into one file. Each key is a hash function
# result to partition tables into buckets.
# Each value is an array of the tables that
# fit in the bucket.
# For each property ...
# (sort so that if there is an immutable file name, it has precedence, so
# some other property can't come in and take over its file name. (We
# don't care if both defined, as they had better be different anyway.)
# The property named 'Perl' needs to be first (it doesn't have any
# immutable file name) because empty properties are defined in terms of
# its table named 'All' under the -annotate option.) We also sort by
# the property's name. This is just for repeatability of the outputs
# between runs of this program, but does not affect correctness.
PROPERTY:
foreach my $property ($perl,
sort { return -1 if defined $a->file;
return 1 if defined $b->file;
return $a->name cmp $b->name;
} grep { $_ != $perl } property_ref('*'))
{
my $type = $property->type;
# And for each table for that property, starting with the mapping
# table for it ...
TABLE:
foreach my $table($property,
# and all the match tables for it (if any), sorted so
# the ones with the shortest associated file name come
# first. The length sorting prevents problems of a
# longer file taking a name that might have to be used
# by a shorter one. The alphabetic sorting prevents
# differences between releases
sort { my $ext_a = $a->external_name;
return 1 if ! defined $ext_a;
my $ext_b = $b->external_name;
return -1 if ! defined $ext_b;
# But return the non-complement table before
# the complement one, as the latter is defined
# in terms of the former, and needs to have
# the information for the former available.
return 1 if $a->complement != 0;
return -1 if $b->complement != 0;
# Similarly, return a subservient table after
# a leader
return 1 if $a->leader != $a;
return -1 if $b->leader != $b;
my $cmp = length $ext_a <=> length $ext_b;
# Return result if lengths not equal
return $cmp if $cmp;
# Alphabetic if lengths equal
return $ext_a cmp $ext_b
} $property->tables
)
{
# Here we have a table associated with a property. It could be
# the map table (done first for each property), or one of the
# other tables. Determine which type.
my $is_property = $table->isa('Property');
my $name = $table->name;
my $complete_name = $table->complete_name;
# See if should suppress the table if is empty, but warn if it
# contains something.
my $suppress_if_empty_warn_if_not
= $why_suppress_if_empty_warn_if_not{$complete_name} || 0;
# Calculate if this table should have any code points associated
# with it or not.
my $expected_empty =
# $perl should be empty
($is_property && ($table == $perl))
# Match tables in properties we skipped populating should be
# empty
|| (! $is_property && ! $property->to_create_match_tables)
# Tables and properties that are expected to have no code
# points should be empty
|| $suppress_if_empty_warn_if_not
;
# Set a boolean if this table is the complement of an empty binary
# table
my $is_complement_of_empty_binary =
$type == $BINARY &&
(($table == $property->table('Y')
&& $property->table('N')->is_empty)
|| ($table == $property->table('N')
&& $property->table('Y')->is_empty));
if ($table->is_empty) {
if ($suppress_if_empty_warn_if_not) {
$table->set_fate($SUPPRESSED,
$suppress_if_empty_warn_if_not);
}
# Suppress (by skipping them) expected empty tables.
next TABLE if $expected_empty;
# And setup to later output a warning for those that aren't
# known to be allowed to be empty. Don't do the warning if
# this table is a child of another one to avoid duplicating
# the warning that should come from the parent one.
if (($table == $property || $table->parent == $table)
&& $table->fate != $SUPPRESSED
&& $table->fate != $MAP_PROXIED
&& ! grep { $complete_name =~ /^$_$/ }
@tables_that_may_be_empty)
{
push @unhandled_properties, "$table";
}
# The old way of expressing an empty match list was to
# complement the list that matches everything. The new way is
# to create an empty inversion list, but this doesn't work for
# annotating, so use the old way then.
$table->set_complement($All) if $annotate
&& $table != $property;
}
elsif ($expected_empty) {
my $because = "";
if ($suppress_if_empty_warn_if_not) {
$because = " because $suppress_if_empty_warn_if_not";
}
Carp::my_carp("Not expecting property $table$because. Generating file for it anyway.");
}
# Some tables should match everything
my $expected_full =
($table->fate == $SUPPRESSED)
? 0
: ($is_property)
? # All these types of map tables will be full because
# they will have been populated with defaults
($type == $ENUM)
: # A match table should match everything if its method
# shows it should
($table->matches_all
# The complement of an empty binary table will match
# everything
|| $is_complement_of_empty_binary
)
;
my $count = $table->count;
if ($expected_full) {
if ($count != $MAX_WORKING_CODEPOINTS) {
Carp::my_carp("$table matches only "
. clarify_number($count)
. " Unicode code points but should match "
. clarify_number($MAX_WORKING_CODEPOINTS)
. " (off by "
. clarify_number(abs($MAX_WORKING_CODEPOINTS - $count))
. "). Proceeding anyway.");
}
# Here is expected to be full. If it is because it is the
# complement of an (empty) binary table that is to be
# suppressed, then suppress this one as well.
if ($is_complement_of_empty_binary) {
my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
my $opposing = $property->table($opposing_name);
my $opposing_status = $opposing->status;
if ($opposing_status) {
$table->set_status($opposing_status,
$opposing->status_info);
}
}
}
elsif ($count == $MAX_UNICODE_CODEPOINTS
&& $name ne "Any"
&& ($table == $property || $table->leader == $table)
&& $table->property->status ne $NORMAL)
{
Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway.");
}
if ($table->fate >= $SUPPRESSED) {
if (! $is_property) {
my @children = $table->children;
foreach my $child (@children) {
if ($child->fate < $SUPPRESSED) {
Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
}
}
}
next TABLE;
}
if (! $is_property) {
make_ucd_table_pod_entries($table) if $table->property == $perl;
# Several things need to be done just once for each related
# group of match tables. Do them on the parent.
if ($table->parent == $table) {
# Add an entry in the pod file for the table; it also does
# the children.
make_re_pod_entries($table) if defined $pod_directory;
# See if the table matches identical code points with
# something that has already been processed and is ready
# for output. In that case, no need to have two files
# with the same code points in them. We use the table's
# hash() method to store these in buckets, so that it is
# quite likely that if two tables are in the same bucket
# they will be identical, so don't have to compare tables
# frequently. The tables have to have the same status to
# share a file, so add this to the bucket hash. (The
# reason for this latter is that UCD.pm associates a
# status with a file.) We don't check tables that are
# inverses of others, as it would lead to some coding
# complications, and checking all the regular ones should
# find everything.
if ($table->complement == 0) {
my $hash = $table->hash . ';' . $table->status;
# Look at each table that is in the same bucket as
# this one would be.
foreach my $comparison
(@{$match_tables_to_write{$hash}})
{
# If the table doesn't point back to this one, we
# see if it matches identically
if ( $comparison->leader != $table
&& $table->matches_identically_to($comparison))
{
$table->set_equivalent_to($comparison,
Related => 0);
next TABLE;
}
}
# Here, not equivalent, add this table to the bucket.
push @{$match_tables_to_write{$hash}}, $table;
}
}
}
else {
# Here is the property itself.
# Don't write out or make references to the $perl property
next if $table == $perl;
make_ucd_table_pod_entries($table);
# There is a mapping stored of the various synonyms to the
# standardized name of the property for Unicode::UCD.
# Also, the pod file contains entries of the form:
# \p{alias: *} \p{full: *}
# rather than show every possible combination of things.
my @property_aliases = $property->aliases;
my $full_property_name = $property->full_name;
my $property_name = $property->name;
my $standard_property_name = standardize($property_name);
my $standard_property_full_name
= standardize($full_property_name);
# We also create for Unicode::UCD a list of aliases for
# the property. The list starts with the property name;
# then its full name.
my @property_list;
my @standard_list;
if ( $property->fate <= $MAP_PROXIED) {
@property_list = ($property_name, $full_property_name);
@standard_list = ($standard_property_name,
$standard_property_full_name);
}
# For each synonym ...
for my $i (0 .. @property_aliases - 1) {
my $alias = $property_aliases[$i];
my $alias_name = $alias->name;
my $alias_standard = standardize($alias_name);
# Add other aliases to the list of property aliases
if ($property->fate <= $MAP_PROXIED
&& ! grep { $alias_standard eq $_ } @standard_list)
{
push @property_list, $alias_name;
push @standard_list, $alias_standard;
}
# For Unicode::UCD, set the mapping of the alias to the
# property
if ($type == $STRING) {
if ($property->fate <= $MAP_PROXIED) {
$string_property_loose_to_name{$alias_standard}
= $standard_property_name;
}
}
else {
my $hash_ref = ($alias_standard =~ /^_/)
? \%strict_property_name_of
: \%loose_property_name_of;
if (exists $hash_ref->{$alias_standard}) {
Carp::my_carp("There already is a property with the same standard name as $alias_name: $hash_ref->{$alias_standard}. Old name is retained");
}
else {
$hash_ref->{$alias_standard}
= $standard_property_name;
}
# Now for the re pod entry for this alias. Skip if not
# outputting a pod; skip the first one, which is the
# full name so won't have an entry like: '\p{full: *}
# \p{full: *}', and skip if don't want an entry for
# this one.
next if $i == 0
|| ! defined $pod_directory
|| ! $alias->make_re_pod_entry;
my $rhs = "\\p{$full_property_name: *}";
if ($property != $perl && $table->perl_extension) {
$rhs .= ' (Perl extension)';
}
push @match_properties,
format_pod_line($indent_info_column,
'\p{' . $alias->name . ': *}',
$rhs,
$alias->status);
}
}
# The list of all possible names is attached to each alias, so
# lookup is easy
if (@property_list) {
push @{$prop_aliases{$standard_list[0]}}, @property_list;
}
if ($property->fate <= $MAP_PROXIED) {
# Similarly, we create for Unicode::UCD a list of
# property-value aliases.
# Look at each table in the property...
foreach my $table ($property->tables) {
my @values_list;
my $table_full_name = $table->full_name;
my $standard_table_full_name
= standardize($table_full_name);
my $table_name = $table->name;
my $standard_table_name = standardize($table_name);
# The list starts with the table name and its full
# name.
push @values_list, $table_name, $table_full_name;
# We add to the table each unique alias that isn't
# discouraged from use.
foreach my $alias ($table->aliases) {
next if $alias->status
&& $alias->status eq $DISCOURAGED;
my $name = $alias->name;
my $standard = standardize($name);
next if $standard eq $standard_table_name;
next if $standard eq $standard_table_full_name;
push @values_list, $name;
}
# Here @values_list is a list of all the aliases for
# the table. That is, all the property-values given
# by this table. By agreement with Unicode::UCD,
# if the name and full name are identical, and there
# are no other names, drop the duplicate entry to save
# memory.
if (@values_list == 2
&& $values_list[0] eq $values_list[1])
{
pop @values_list
}
# To save memory, unlike the similar list for property
# aliases above, only the standard forms have the list.
# This forces an extra step of converting from input
# name to standard name, but the savings are
# considerable. (There is only marginal savings if we
# did this with the property aliases.)
push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list;
}
}
# Don't write out a mapping file if not desired.
next if ! $property->to_output_map;
}
# Here, we know we want to write out the table, but don't do it
# yet because there may be other tables that come along and will
# want to share the file, and the file's comments will change to
# mention them. So save for later.
push @writables, $table;
} # End of looping through the property and all its tables.
} # End of looping through all properties.
# Now have all the tables that will have files written for them. Do it.
foreach my $table (@writables) {
my @directory;
my $filename;
my $property = $table->property;
my $is_property = ($table == $property);
# For very short tables, instead of writing them out to actual files,
# we in-line their inversion list definitions into UCD.pm. The
# definition replaces the file name, and the special pseudo-directory
# '#' is used to signal this. This significantly cuts down the number
# of files written at little extra cost to the hashes in UCD.pm.
# And it means, no run-time files to read to get the definitions.
if (! $is_property
&& ! $annotate # For annotation, we want to explicitly show
# everything, so keep in files
&& $table->ranges <= 3)
{
my @ranges = $table->ranges;
my $count = @ranges;
if ($count == 0) { # 0th index reserved for 0-length lists
$filename = 0;
}
elsif ($table->leader != $table) {
# Here, is a table that is equivalent to another; code
# in register_file_for_name() causes its leader's definition
# to be used
next;
}
else { # No equivalent table so far.
# Build up its definition range-by-range.
my $definition = "";
while (defined (my $range = shift @ranges)) {
my $end = $range->end;
if ($end < $MAX_WORKING_CODEPOINT) {
$count++;
$end = "\n" . ($end + 1);
}
else { # Extends to infinity, hence no 'end'
$end = "";
}
$definition .= "\n" . $range->start . $end;
}
$definition = "V$count" . $definition;
$filename = @inline_definitions;
push @inline_definitions, $definition;
}
@directory = "#";
register_file_for_name($table, \@directory, $filename);
next;
}
if (! $is_property) {
# Match tables for the property go in lib/$subdirectory, which is
# the property's name. Don't use the standard file name for this,
# as may get an unfamiliar alias
@directory = ($matches_directory, ($property->match_subdir)
? $property->match_subdir
: $property->external_name);
}
else {
@directory = $table->directory;
$filename = $table->file;
}
# Use specified filename if available, or default to property's
# shortest name. We need an 8.3 safe filename (which means "an 8
# safe" filename, since after the dot is only 'pl', which is < 3)
# The 2nd parameter is if the filename shouldn't be changed, and
# it shouldn't iff there is a hard-coded name for this table.
$filename = construct_filename(
$filename || $table->external_name,
! $filename, # mutable if no filename
\@directory);
register_file_for_name($table, \@directory, $filename);
# Only need to write one file when shared by more than one
# property
next if ! $is_property
&& ($table->leader != $table || $table->complement != 0);
# Construct a nice comment to add to the file
$table->set_final_comment;
$table->write;
}
# Write out the pod file
make_pod;
# And Name.pm, UCD.pl
make_Name_pm;
make_UCD;
make_property_test_script() if $make_test_script;
make_normalization_test_script() if $make_norm_test_script;
return;
}
my @white_space_separators = ( # This used only for making the test script. "", ' ', "\t", ' ' );
sub generate_separator($lhs) { # This used only for making the test script. It generates the colon or # equal separator between the property and property value, with random # white space surrounding the separator
return "" if $lhs eq ""; # No separator if there's only one (the r) side
# Choose space before and after randomly
my $spaces_before =$white_space_separators[rand(@white_space_separators)];
my $spaces_after = $white_space_separators[rand(@white_space_separators)];
# And return the whole complex, half the time using a colon, half the
# equals
return $spaces_before
. (rand() < 0.5) ? '=' : ':'
. $spaces_after;
}
sub generate_tests($lhs, $rhs, $valid_code, $invalid_code, $warning) { # This used only for making the test script. It generates test cases that # are expected to compile successfully in perl. Note that the LHS and # RHS are assumed to already be as randomized as the caller wants.
# $lhs # The property: what's to the left of the colon
# or equals separator
# $rhs # The property value; what's to the right
# $valid_code # A code point that's known to be in the
# table given by LHS=RHS; undef if table is
# empty
# $invalid_code # A code point known to not be in the table;
# undef if the table is all code points
# $warning
# Get the colon or equal
my $separator = generate_separator($lhs);
# The whole 'property=value'
my $name = "$lhs$separator$rhs";
my @output;
# Create a complete set of tests, with complements.
if (defined $valid_code) {
push @output, <<"EOC"
Expect(1, $valid_code, '\\p{$name}', $warning);
Expect(0, $valid_code, '\\p{^$name}', $warning);
Expect(0, $valid_code, '\\P{$name}', $warning);
Expect(1, $valid_code, '\\P{^$name}', $warning);
EOC
}
if (defined $invalid_code) {
push @output, <<"EOC"
Expect(0, $invalid_code, '\\p{$name}', $warning);
Expect(1, $invalid_code, '\\p{^$name}', $warning);
Expect(1, $invalid_code, '\\P{$name}', $warning);
Expect(0, $invalid_code, '\\P{^$name}', $warning);
EOC
}
return @output;
}
sub generate_wildcard_tests($lhs, $rhs, $valid_code, $invalid_code, $warning) { # This used only for making the test script. It generates wildcardl # matching test cases that are expected to compile successfully in perl.
# $lhs # The property: what's to the left of the
# or equals separator
# $rhs # The property value; what's to the right
# $valid_code # A code point that's known to be in the
# table given by LHS=RHS; undef if table is
# empty
# $invalid_code # A code point known to not be in the table;
# undef if the table is all code points
# $warning
return if $lhs eq "";
return if $lhs =~ / ^ Is_ /x; # These are not currently supported
# Generate a standardized pattern, with colon being the delimitter
my $wildcard = "$lhs=:\\A$rhs\\z:";
my @output;
push @output, "Expect(1, $valid_code, '\\p{$wildcard}', $warning);"
if defined $valid_code;
push @output, "Expect(0, $invalid_code, '\\p{$wildcard}', $warning);"
if defined $invalid_code;
return @output;
}
sub generate_error($lhs, $rhs, $already_in_error=0) { # This used only for making the test script. It generates test cases that # are expected to not only not match, but to be syntax or similar errors
# $lhs # The property: what's to the left of the
# colon or equals separator
# $rhs # The property value; what's to the right
# $already_in_error # Boolean; if true it's known that the
# unmodified LHS and RHS will cause an error.
# This routine should not force another one
# Get the colon or equal
my $separator = generate_separator($lhs);
# Since this is an error only, don't bother to randomly decide whether to
# put the error on the left or right side; and assume that the RHS is
# loosely matched, again for convenience rather than rigor.
$rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
my $property = $lhs . $separator . $rhs;
return <<"EOC";
Error('\\p{$property}');
Error('\\P{$property}');
EOC
}
# These are used only for making the test script # XXX Maybe should also have a bad strict seps, which includes underscore.
my @good_loose_seps = ( " ", "-", "\t", "", "_", ); my @bad_loose_seps = ( "/a/", ':=', );
sub randomize_stricter_name($name) { # This used only for making the test script. Take the input name and # return a randomized, but valid version of it under the stricter matching # rules.
# If the name looks like a number (integer, floating, or rational), do
# some extra work
if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
my $sign = $1;
my $number = $2;
my $separator = $3;
# If there isn't a sign, part of the time add a plus
# Note: Not testing having any denominator having a minus sign
if (! $sign) {
$sign = '+' if rand() <= .3;
}
# And add 0 or more leading zeros.
$name = $sign . ('0' x int rand(10)) . $number;
if (defined $separator) {
my $extra_zeros = '0' x int rand(10);
if ($separator eq '.') {
# Similarly, add 0 or more trailing zeros after a decimal
# point
$name .= $extra_zeros;
}
else {
# Or, leading zeros before the denominator
$name =~ s,/,/$extra_zeros,;
}
}
}
# For legibility of the test, only change the case of whole sections at a
# time. To do this, first split into sections. The split returns the
# delimiters
my @sections;
for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
trace $section if main::DEBUG && $to_trace;
if (length $section > 1 && $section !~ /\D/) {
# If the section is a sequence of digits, about half the time
# randomly add underscores between some of them.
if (rand() > .5) {
# Figure out how many underscores to add. max is 1 less than
# the number of digits. (But add 1 at the end to make sure
# result isn't 0, and compensate earlier by subtracting 2
# instead of 1)
my $num_underscores = int rand(length($section) - 2) + 1;
# And add them evenly throughout, for convenience, not rigor
use integer;
my $spacing = (length($section) - 1)/ $num_underscores;
my $temp = $section;
$section = "";
for my $i (1 .. $num_underscores) {
$section .= substr($temp, 0, $spacing, "") . '_';
}
$section .= $temp;
}
push @sections, $section;
}
else {
# Here not a sequence of digits. Change the case of the section
# randomly
my $switch = int rand(4);
if ($switch == 0) {
push @sections, uc $section;
}
elsif ($switch == 1) {
push @sections, lc $section;
}
elsif ($switch == 2) {
push @sections, ucfirst $section;
}
else {
push @sections, $section;
}
}
}
trace "returning", join "", @sections if main::DEBUG && $to_trace;
return join "", @sections;
}
sub randomize_loose_name($name, $want_error=0) { # This used only for making the test script
$name = randomize_stricter_name($name);
my @parts;
push @parts, $good_loose_seps[rand(@good_loose_seps)];
# Preserve trailing ones for the sake of not stripping the underscore from
# 'L_'
for my $part (split /[-\s_]+ (?= . )/, $name) {
if (@parts) {
if ($want_error and rand() < 0.3) {
push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
$want_error = 0;
}
else {
push @parts, $good_loose_seps[rand(@good_loose_seps)];
}
}
push @parts, $part;
}
my $new = join("", @parts);
trace "$name => $new" if main::DEBUG && $to_trace;
if ($want_error) {
if (rand() >= 0.5) {
$new .= $bad_loose_seps[rand(@bad_loose_seps)];
}
else {
$new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
}
}
return $new;
}
# Used to make sure don't generate duplicate test cases. my %test_generated;
sub make_property_test_script() { # This used only for making the test script # this written directly -- it's huge.
print "Making test script\n" if $verbosity >= $PROGRESS;
# This uses randomness to test different possibilities without testing all
# possibilities. To ensure repeatability, set the seed to 0. But if
# tests are added, it will perturb all later ones in the .t file
srand 0;
$t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
# Create a list of what the %f representation is for each rational number.
# This will be used below.
my @valid_base_floats = '0.0';
foreach my $e_representation (keys %nv_floating_to_rational) {
push @valid_base_floats,
eval $nv_floating_to_rational{$e_representation};
}
# It doesn't matter whether the elements of this array contain single lines
# or multiple lines. main::write doesn't count the lines.
my @output;
push @output, <<'EOF_CODE';
Error('\p{Script=InGreek}'); # Bug #69018
Test_GCB("1100 $nobreak 1161"); # Bug #70940
Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726
Error('\p{InKana}'); # 'Kana' is not a block so InKana shouldn't compile
Expect(1, 0xB6, '\p{In=V1_1}', ""); # Didn't use to work
Expect(1, 0x3A2,'\p{In=NA}', ""); # Didn't use to work
# Make sure this gets tested; it was not part of the official test suite at # the time this was added. Note that this is as it would appear in the # official suite, and gets modified to check for the perl tailoring by # Test_WB() Test_WB("$breakable 0020 $breakable 0020 $breakable 0308 $breakable"); Test_LB("$nobreak 200B $nobreak 0020 $nobreak 0020 $breakable 2060 $breakable"); Expect(1, ord(" "), '\p{gc=:(?aa)s:}', ""); # /aa is valid Expect(1, ord(" "), '\p{gc=:(?-s)s:}', ""); # /-s is valid EOF_CODE
# Sort these so get results in same order on different runs of this
# program
foreach my $property (sort { $a->has_dependency <=> $b->has_dependency
or
lc $a->name cmp lc $b->name
} property_ref('*'))
{
# Non-binary properties should not match \p{}; Test all for that.
if ($property->type != $BINARY && $property->type != $FORCED_BINARY) {
my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS }
$property->aliases;
foreach my $property_alias ($property->aliases) {
my $name = standardize($property_alias->name);
# But some names are ambiguous, meaning a binary property with
# the same name when used in \p{}, and a different
# (non-binary) property in other contexts.
next if grep { $name eq $_ } keys %ambiguous_names;
push @output, <<"EOF_CODE";
Error('\\p{$name}');
Error('\\P{$name}');
EOF_CODE
}
}
foreach my $table (sort { $a->has_dependency <=> $b->has_dependency
or
lc $a->name cmp lc $b->name
} $property->tables)
{
# Find code points that match, and don't match this table.
my $valid = $table->get_valid_code_point;
my $invalid = $table->get_invalid_code_point;
my $warning = ($table->status eq $DEPRECATED)
? "'deprecated'"
: '""';
# Test each possible combination of the property's aliases with
# the table's. If this gets to be too many, could do what is done
# in the set_final_comment() for Tables
my @table_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->aliases;
next unless @table_aliases;
my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->property->aliases;
next unless @property_aliases;
# Every property can be optionally be prefixed by 'Is_', so test
# that those work, by creating such a new alias for each
# pre-existing one.
push @property_aliases, map { Alias->new("Is_" . $_->name,
$_->loose_match,
$_->make_re_pod_entry,
$_->ok_as_filename,
$_->status,
$_->ucd,
)
} @property_aliases;
my $max = max(scalar @table_aliases, scalar @property_aliases);
for my $j (0 .. $max - 1) {
# The current alias for property is the next one on the list,
# or if beyond the end, start over. Similarly for table
my $property_name
= $property_aliases[$j % @property_aliases]->name;
$property_name = "" if $table->property == $perl;
my $table_alias = $table_aliases[$j % @table_aliases];
my $table_name = $table_alias->name;
my $loose_match = $table_alias->loose_match;
# If the table doesn't have a file, any test for it is
# already guaranteed to be in error
my $already_error = ! $table->file_path;
# A table that begins with these could actually be a
# user-defined property, so won't be compile time errors, as
# the definitions of those can be deferred until runtime
next if $already_error && $table_name =~ / ^ I[ns] /x;
# Generate error cases for this alias.
push @output, generate_error($property_name,
$table_name,
$already_error);
# If the table is guaranteed to always generate an error,
# quit now without generating success cases.
next if $already_error;
# Now for the success cases. First, wildcard matching, as it
# shouldn't have any randomization.
if ($table_alias->status eq $NORMAL) {
push @output, generate_wildcard_tests($property_name,
$table_name,
$valid,
$invalid,
$warning,
);
}
my $random;
if ($loose_match) {
# For loose matching, create an extra test case for the
# standard name.
my $standard = standardize($table_name);
# $test_name should be a unique combination for each test
# case; used just to avoid duplicate tests
my $test_name = "$property_name=$standard";
# Don't output duplicate test cases.
if (! exists $test_generated{$test_name}) {
$test_generated{$test_name} = 1;
push @output, generate_tests($property_name,
$standard,
$valid,
$invalid,
$warning,
);
if ($table_alias->status eq $NORMAL) {
push @output, generate_wildcard_tests(
$property_name,
$standard,
$valid,
$invalid,
$warning,
);
}
}
$random = randomize_loose_name($table_name)
}
else { # Stricter match
$random = randomize_stricter_name($table_name);
}
# Now for the main test case for this alias.
my $test_name = "$property_name=$random";
if (! exists $test_generated{$test_name}) {
$test_generated{$test_name} = 1;
push @output, generate_tests($property_name,
$random,
$valid,
$invalid,
$warning,
);
if ($property->name eq 'nv') {
if ($table_name !~ qr{/}) {
push @output, generate_tests($property_name,
sprintf("%.15e", $table_name),
$valid,
$invalid,
$warning,
);
}
else {
# If the name is a rational number, add tests for a
# non-reduced form, and for a floating point equivalent.
# 60 is a number divisible by a bunch of things
my ($numerator, $denominator) = $table_name
=~ m! (.+) / (.+) !x;
$numerator *= 60;
$denominator *= 60;
push @output, generate_tests($property_name,
"$numerator/$denominator",
$valid,
$invalid,
$warning,
);
# Calculate the float, and the %e representation
my $float = eval $table_name;
my $e_representation = sprintf("%.*e",
$E_FLOAT_PRECISION, $float);
# Parse that
my ($non_zeros, $zeros, $exponent_sign, $exponent)
= $e_representation
=~ / -? [1-9] \. (\d*?) (0*) e ([+-]) (\d+) /x;
my $min_e_precision;
my $min_f_precision;
if ($exponent_sign eq '+' && $exponent != 0) {
Carp::my_carp_bug("Not yet equipped to handle"
. " positive exponents");
return;
}
else {
# We're trying to find the minimum precision that
# is needed to indicate this particular rational
# for the given $E_FLOAT_PRECISION. For %e, any
# trailing zeros, like 1.500e-02 aren't needed, so
# the correct value is how many non-trailing zeros
# there are after the decimal point.
$min_e_precision = length $non_zeros;
# For %f, like .01500, we want at least
# $E_FLOAT_PRECISION digits, but any trailing
# zeros aren't needed, so we can subtract the
# length of those. But we also need to include
# the zeros after the decimal point, but before
# the first significant digit.
$min_f_precision = $E_FLOAT_PRECISION
+ $exponent
- length $zeros;
}
# Make tests for each possible precision from 1 to
# just past the worst case.
my $upper_limit = ($min_e_precision > $min_f_precision)
? $min_e_precision
: $min_f_precision;
for my $i (1 .. $upper_limit + 1) {
for my $format ("e", "f") {
my $this_table
= sprintf("%.*$format", $i, $float);
# If we don't have enough precision digits,
# make a fail test; otherwise a pass test.
my $pass = ($format eq "e")
? $i >= $min_e_precision
: $i >= $min_f_precision;
if ($pass) {
push @output, generate_tests($property_name,
$this_table,
$valid,
$invalid,
$warning,
);
}
elsif ( $format eq "e"
# Here we would fail, but in the %f
# case, the representation at this
# precision could actually be a
# valid one for some other rational
|| ! grep { $this_table
=~ / ^ $_ 0* $ /x }
@valid_base_floats)
{
push @output,
generate_error($property_name,
$this_table,
1 # 1 => already an
# error
);
}
}
}
}
}
}
}
$table->DESTROY();
}
$property->DESTROY();
}
# Make any test of the boundary (break) properties TODO if the code
# doesn't match the version being compiled
my $TODO_FAILING_BREAKS = ($version_of_mk_invlist_bounds ne $v_version)
? "\nsub TODO_FAILING_BREAKS { 1 }\n"
: "\nsub TODO_FAILING_BREAKS { 0 }\n";
@output= map {
map s/^/ /mgr,
map "$_;\n",
split /;\n/, $_
} @output;
# Cause there to be 'if' statements to only execute a portion of this
# long-running test each time, so that we can have a bunch of .t's running
# in parallel
my $chunks = 10 # Number of test files
- 1 # For GCB & SB
- 1 # For WB
- 4; # LB split into this many files
my @output_chunked;
my $chunk_count=0;
my $chunk_size= int(@output / $chunks) + 1;
while (@output) {
$chunk_count++;
my @chunk= splice @output, 0, $chunk_size;
push @output_chunked,
"if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
@chunk,
"}\n";
}
$chunk_count++;
push @output_chunked,
"if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
(map {" Test_GCB('$_');\n"} @backslash_X_tests),
(map {" Test_SB('$_');\n"} @SB_tests),
"}\n";
$chunk_size= int(@LB_tests / 4) + 1;
@LB_tests = map {" Test_LB('$_');\n"} @LB_tests;
while (@LB_tests) {
$chunk_count++;
my @chunk= splice @LB_tests, 0, $chunk_size;
push @output_chunked,
"if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
@chunk,
"}\n";
}
$chunk_count++;
push @output_chunked,
"if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
(map {" Test_WB('$_');\n"} @WB_tests),
"}\n";
&write($t_path,
0, # Not utf8;
[$HEADER,
$TODO_FAILING_BREAKS,
<DATA>,
@output_chunked,
"Finished();\n",
]);
return;
}
sub make_normalization_test_script() { print "Making normalization test script\n" if $verbosity >= $PROGRESS;
my $n_path = 'TestNorm.pl';
unshift @normalization_tests, <<'END';
use utf8;
use Test::More;
sub ord_string { # Convert packed ords to printable string use charnames (); return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' } unpack "U*", shift) . "'"; #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) . "'"; }
sub Test_N { my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_; my $display_source = ord_string($source); my $display_nfc = ord_string($nfc); my $display_nfd = ord_string($nfd); my $display_nfkc = ord_string($nfkc); my $display_nfkd = ord_string($nfkd);
use Unicode::Normalize;
# NFC
# nfc == toNFC(source) == toNFC(nfc) == toNFC(nfd)
# nfkc == toNFC(nfkc) == toNFC(nfkd)
#
# NFD
# nfd == toNFD(source) == toNFD(nfc) == toNFD(nfd)
# nfkd == toNFD(nfkc) == toNFD(nfkd)
#
# NFKC
# nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) ==
# toNFKC(nfkc) == toNFKC(nfkd)
#
# NFKD
# nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) ==
# toNFKD(nfkc) == toNFKD(nfkd)
is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc");
is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc");
is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc");
is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc");
is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc");
is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd");
is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd");
is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd");
is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd");
is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd");
is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc");
is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc");
is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc");
is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc");
is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc");
is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd");
is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd");
is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd");
is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd");
is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd");
}
END
&write($n_path,
1, # Is utf8;
[
@normalization_tests,
'done_testing();'
]);
return;
}
# Skip reasons, so will be exact same text and hence the files with each # reason will get grouped together in perluniprops. my $Documentation = "Documentation"; my $Indic_Skip = "Provisional; for the analysis and processing of Indic scripts"; my $Validation = "Validation Tests"; my $Validation_Documentation = "Documentation of validation Tests"; my $Unused_Skip = "Currently unused by Perl";
# This is a list of the input files and how to handle them. The files are # processed in their order in this list. Some reordering is possible if # desired, but the PropertyAliases and PropValueAliases files should be first, # and the extracted before the others (as data in an extracted file can be # over-ridden by the non-extracted. Some other files depend on data derived # from an earlier file, like UnicodeData requires data from Jamo, and the case # changing and folding requires data from Unicode. Mostly, it is safest to # order by first version releases in (except the Jamo). # # The version strings allow the program to know whether to expect a file or # not, but if a file exists in the directory, it will be processed, even if it # is in a version earlier than expected, so you can copy files from a later # release into an earlier release's directory. my @input_file_objects = ( Input_file->new('PropertyAliases.txt', v3.2, Handler => \&process_PropertyAliases, Early => [ \&substitute_PropertyAliases ], Required_Even_in_Debug_Skip => 1, ), Input_file->new(undef, v0, # No file associated with this Progress_Message => 'Finishing property setup', Handler => \&finish_property_setup, ), Input_file->new('PropValueAliases.txt', v3.2, Handler => \&process_PropValueAliases, Early => [ \&substitute_PropValueAliases ], Has_Missings_Defaults => $NOT_IGNORED, Required_Even_in_Debug_Skip => 1, ), Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0, Property => 'General_Category', ), Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0, Property => 'Canonical_Combining_Class', Has_Missings_Defaults => $NOT_IGNORED, ), Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0, Property => 'Numeric_Type', Has_Missings_Defaults => $NOT_IGNORED, ), Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0, Property => 'East_Asian_Width', Has_Missings_Defaults => $NOT_IGNORED, ), Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0, Property => 'Line_Break', Has_Missings_Defaults => $NOT_IGNORED, ), Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1, Property => 'Bidi_Class', Has_Missings_Defaults => $NOT_IGNORED, ), Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0, Property => 'Decomposition_Type', Has_Missings_Defaults => $NOT_IGNORED, ), Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0), Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0, Property => 'Numeric_Value', Each_Line_Handler => \&filter_numeric_value_line, Has_Missings_Defaults => $NOT_IGNORED, ), Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0, Property => 'Joining_Group', Has_Missings_Defaults => $NOT_IGNORED, ),
Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
Property => 'Joining_Type',
Has_Missings_Defaults => $NOT_IGNORED,
),
Input_file->new("${EXTRACTED}DName.txt", v10.0.0,
Skip => 'This file adds no new information not already'
. ' present in other files',
# And it's unnecessary programmer work to handle this new
# format. Previous Derived files actually had bug fixes
# in them that were useful, but that should not be the
# case here.
),
Input_file->new('Jamo.txt', v2.0.0,
Property => 'Jamo_Short_Name',
Each_Line_Handler => \&filter_jamo_line,
),
Input_file->new('UnicodeData.txt', v1.1.5,
Pre_Handler => \&setup_UnicodeData,
# We clean up this file for some early versions.
Each_Line_Handler => [ (($v_version lt v2.0.0 )
? \&filter_v1_ucd
: ($v_version eq v2.1.5)
? \&filter_v2_1_5_ucd
# And for 5.14 Perls with 6.0,
# have to also make changes
: ($v_version ge v6.0.0
&& $^V lt v5.17.0)
? \&filter_v6_ucd
: undef),
# Early versions did not have the
# proper Unicode_1 names for the
# controls
(($v_version lt v3.0.0)
? \&filter_early_U1_names
: undef),
# Early versions did not correctly
# use the later method for giving
# decimal digit values
(($v_version le v3.2.0)
? \&filter_bad_Nd_ucd
: undef),
# And the main filter
\&filter_UnicodeData_line,
],
EOF_Handler => \&EOF_UnicodeData,
),
Input_file->new('CJKXREF.TXT', v1.1.5,
Withdrawn => v2.0.0,
Skip => 'Gives the mapping of CJK code points '
. 'between Unicode and various other standards',
),
Input_file->new('ArabicShaping.txt', v2.0.0,
Each_Line_Handler =>
($v_version lt 4.1.0)
? \&filter_old_style_arabic_shaping
: undef,
# The first field after the range is a "schematic name"
# not used by Perl
Properties => [ '<ignored>', 'Joining_Type', 'Joining_Group' ],
Has_Missings_Defaults => $NOT_IGNORED,
),
Input_file->new('Blocks.txt', v2.0.0,
Property => 'Block',
Has_Missings_Defaults => $NOT_IGNORED,
Each_Line_Handler => \&filter_blocks_lines
),
Input_file->new('Index.txt', v2.0.0,
Skip => 'Alphabetical index of Unicode characters',
),
Input_file->new('NamesList.txt', v2.0.0,
Skip => 'Annotated list of characters',
),
Input_file->new('PropList.txt', v2.0.0,
Each_Line_Handler => (($v_version lt v3.1.0)
? \&filter_old_style_proplist
: undef),
),
Input_file->new('Props.txt', v2.0.0,
Withdrawn => v3.0.0,
Skip => 'A subset of F<PropList.txt> (which is used instead)',
),
Input_file->new('ReadMe.txt', v2.0.0,
Skip => $Documentation,
),
Input_file->new('Unihan.txt', v2.0.0,
Withdrawn => v5.2.0,
Construction_Time_Handler => \&construct_unihan,
Pre_Handler => \&setup_unihan,
Optional => [ "",
'Unicode_Radical_Stroke'
],
Each_Line_Handler => \&filter_unihan_line,
),
Input_file->new('SpecialCasing.txt', v2.1.8,
Each_Line_Handler => ($v_version eq 2.1.8)
? \&filter_2_1_8_special_casing_line
: \&filter_special_casing_line,
Pre_Handler => \&setup_special_casing,
Has_Missings_Defaults => $IGNORED,
),
Input_file->new(
'LineBreak.txt', v3.0.0,
Has_Missings_Defaults => $NOT_IGNORED,
Property => 'Line_Break',
# Early versions had problematic syntax
Each_Line_Handler => ($v_version ge v3.1.0)
? undef
: ($v_version lt v3.0.0)
? \&filter_substitute_lb
: \&filter_early_ea_lb,
# Must use long names for property values see comments at
# sub filter_substitute_lb
Early => [ "LBsubst.txt", '_Perl_LB', 'Alphabetic',
'Alphabetic', # default to this because XX ->
# AL
# Don't use _Perl_LB as a synonym for
# Line_Break in later perls, as it is tailored
# and isn't the same as Line_Break
'ONLY_EARLY' ],
),
Input_file->new('EastAsianWidth.txt', v3.0.0,
Property => 'East_Asian_Width',
Has_Missings_Defaults => $NOT_IGNORED,
# Early versions had problematic syntax
Each_Line_Handler => (($v_version lt v3.1.0)
? \&filter_early_ea_lb
: undef),
),
Input_file->new('CompositionExclusions.txt', v3.0.0,
Property => 'Composition_Exclusion',
),
Input_file->new('UnicodeData.html', v3.0.0,
Withdrawn => v4.0.1,
Skip => $Documentation,
),
Input_file->new('BidiMirroring.txt', v3.0.1,
Property => 'Bidi_Mirroring_Glyph',
Has_Missings_Defaults => ($v_version lt v6.2.0)
? $NO_DEFAULTS
# Is <none> which doesn't mean
# anything to us, we will use the
# null string
: $IGNORED,
),
Input_file->new('NamesList.html', v3.0.0,
Skip => 'Describes the format and contents of '
. 'F<NamesList.txt>',
),
Input_file->new('UnicodeCharacterDatabase.html', v3.0.0,
Withdrawn => v5.1,
Skip => $Documentation,
),
Input_file->new('CaseFolding.txt', v3.0.1,
Pre_Handler => \&setup_case_folding,
Each_Line_Handler =>
[ ($v_version lt v3.1.0)
? \&filter_old_style_case_folding
: undef,
\&filter_case_folding_line
],
Has_Missings_Defaults => $IGNORED,
),
Input_file->new("NormTest.txt", v3.0.1,
Handler => \&process_NormalizationsTest,
Skip => ($make_norm_test_script) ? 0 : $Validation,
),
Input_file->new('DCoreProperties.txt', v3.1.0,
# 5.2 changed this file
Has_Missings_Defaults => (($v_version ge v5.2.0)
? $NOT_IGNORED
: $NO_DEFAULTS),
),
Input_file->new('DProperties.html', v3.1.0,
Withdrawn => v3.2.0,
Skip => $Documentation,
),
Input_file->new('PropList.html', v3.1.0,
Withdrawn => v5.1,
Skip => $Documentation,
),
Input_file->new('Scripts.txt', v3.1.0,
Property => 'Script',
Each_Line_Handler => (($v_version le v4.0.0)
? \&filter_all_caps_script_names
: undef),
Has_Missings_Defaults => $NOT_IGNORED,
),
Input_file->new('DNormalizationProps.txt', v3.1.0,
Has_Missings_Defaults => $NOT_IGNORED,
Each_Line_Handler => (($v_version lt v4.0.1)
? \&filter_old_style_normalization_lines
: undef),
),
Input_file->new('DerivedProperties.html', v3.1.1,
Withdrawn => v5.1,
Skip => $Documentation,
),
Input_file->new('DAge.txt', v3.2.0,
Has_Missings_Defaults => $NOT_IGNORED,
Property => 'Age'
),
Input_file->new('HangulSyllableType.txt', v4.0,
Has_Missings_Defaults => $NOT_IGNORED,
Early => [ \&generate_hst, 'Hangul_Syllable_Type' ],
Property => 'Hangul_Syllable_Type'
),
Input_file->new('NormalizationCorrections.txt', v3.2.0,
# This documents the cumulative fixes to erroneous
# normalizations in earlier Unicode versions. Its main
# purpose is so that someone running on an earlier
# version can use this file to override what got
# published in that earlier release. It would be easy
# for mktables to handle this file. But all the
# corrections in it should already be in the other files
# for the release it is. To get it to actually mean
# something useful, someone would have to be using an
# earlier Unicode release, and copy it into the directory
# for that release and recompile. So far there has been
# no demand to do that, so this hasn't been implemented.
Skip => 'Documentation of corrections already '
. 'incorporated into the Unicode data base',
),
Input_file->new('StandardizedVariants.html', v3.2.0,
Skip => 'Obsoleted as of Unicode 9.0, but previously '
. 'provided a visual display of the standard '
. 'variant sequences derived from '
. 'F<StandardizedVariants.txt>.',
# I don't know why the html came earlier than the
# .txt, but both are skipped anyway, so it doesn't
# matter.
),
Input_file->new('StandardizedVariants.txt', v4.0.0,
Skip => 'Certain glyph variations for character display '
. 'are standardized. This lists the non-Unihan '
. 'ones; the Unihan ones are also not used by '
. 'Perl, and are in a separate Unicode data base '
. 'L<http://www.unicode.org/ivd>',
),
Input_file->new('UCD.html', v4.0.0,
Withdrawn => v5.2,
Skip => $Documentation,
),
Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter' ],
Property => 'Word_Break',
Has_Missings_Defaults => $NOT_IGNORED,
),
Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1,
Early => [ \&generate_GCB, '_Perl_GCB' ],
Property => 'Grapheme_Cluster_Break',
Has_Missings_Defaults => $NOT_IGNORED,
),
Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
Handler => \&process_GCB_test,
retain_trailing_comments => 1,
),
Input_file->new("$AUXILIARY/GraphemeBreakTest.html", v4.1.0,
Skip => $Validation_Documentation,
),
Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
Handler => \&process_SB_test,
retain_trailing_comments => 1,
),
Input_file->new("$AUXILIARY/SentenceBreakTest.html", v4.1.0,
Skip => $Validation_Documentation,
),
Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
Handler => \&process_WB_test,
retain_trailing_comments => 1,
),
Input_file->new("$AUXILIARY/WordBreakTest.html", v4.1.0,
Skip => $Validation_Documentation,
),
Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
Property => 'Sentence_Break',
Early => [ "SBsubst.txt", '_Perl_SB', 'OLetter' ],
Has_Missings_Defaults => $NOT_IGNORED,
),
Input_file->new('NamedSequences.txt', v4.1.0,
Handler => \&process_NamedSequences
),
Input_file->new('Unihan.html', v4.1.0,
Withdrawn => v5.2,
Skip => $Documentation,
),
Input_file->new('NameAliases.txt', v5.0,
Property => 'Name_Alias',
Each_Line_Handler => ($v_version le v6.0.0)
? \&filter_early_version_name_alias_line
: \&filter_later_version_name_alias_line,
),
# NameAliases.txt came along in v5.0. The above constructor handles
# this. But until 6.1, it was lacking some information needed by core
# perl. The constructor below handles that. It is either a kludge or
# clever, depending on your point of view. The 'Withdrawn' parameter
# indicates not to use it at all starting in 6.1 (so the above
# constructor applies), and the 'v6.1' parameter indicates to use the
# Early parameter before 6.1. Therefore 'Early" is always used,
# yielding the internal-only property '_Perl_Name_Alias', which it
# gets from a NameAliases.txt from 6.1 or later stored in
# N_Asubst.txt. In combination with the above constructor,
# 'Name_Alias' is publicly accessible starting with v5.0, and the
# better 6.1 version is accessible to perl core in all releases.
Input_file->new("NameAliases.txt", v6.1,
Withdrawn => v6.1,
Early => [ "N_Asubst.txt", '_Perl_Name_Alias', "" ],
Property => 'Name_Alias',
EOF_Handler => \&fixup_early_perl_name_alias,
Each_Line_Handler =>
\&filter_later_version_name_alias_line,
),
Input_file->new('NamedSqProv.txt', v5.0.0,
Skip => 'Named sequences proposed for inclusion in a '
. 'later version of the Unicode Standard; if you '
. 'need them now, you can append this file to '
. 'F<NamedSequences.txt> and recompile perl',
),
Input_file->new("$AUXILIARY/LBTest.txt", v5.1.0,
Handler => \&process_LB_test,
retain_trailing_comments => 1,
),
Input_file->new("$AUXILIARY/LineBreakTest.html", v5.1.0,
Skip => $Validation_Documentation,
),
Input_file->new("BidiTest.txt", v5.2.0,
Skip => $Validation,
),
Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
Optional => "",
Each_Line_Handler => \&filter_unihan_line,
),
Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
Optional => "",
Each_Line_Handler => \&filter_unihan_line,
),
Input_file->new('UnihanIRGSources.txt', v5.2.0,
Optional => [ "",
'kCompatibilityVariant',
'kIICore',
'kIRG_GSource',
'kIRG_HSource',
'kIRG_JSource',
'kIRG_KPSource',
'kIRG_MSource',
'kIRG_KSource',
'kIRG_SSource',
'kIRG_TSource',
'kIRG_USource',
'kIRG_UKSource',
'kIRG_VSource',
],
Pre_Handler => \&setup_unihan,
Each_Line_Handler => \&filter_unihan_line,
),
Input_file->new('UnihanNumericValues.txt', v5.2.0,
Optional => [ "",
'kAccountingNumeric',
'kOtherNumeric',
'kPrimaryNumeric',
],
Each_Line_Handler => \&filter_unihan_line,
),
Input_file->new('UnihanOtherMappings.txt', v5.2.0,
Optional => "",
Each_Line_Handler => \&filter_unihan_line,
),
Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
Optional => [ "",
'Unicode_Radical_Stroke'
],
Each_Line_Handler => \&filter_unihan_line,
),
Input_file->new('UnihanReadings.txt', v5.2.0,
Optional => "",
Each_Line_Handler => \&filter_unihan_line,
),
Input_file->new('UnihanVariants.txt', v5.2.0,
Optional => "",
Each_Line_Handler => \&filter_unihan_line,
),
Input_file->new('CJKRadicals.txt', v5.2.0,
Skip => 'Maps the kRSUnicode property values to '
. 'corresponding code points',
),
Input_file->new('EmojiSources.txt', v6.0.0,
Skip => 'Maps certain Unicode code points to their '
. 'legacy Japanese cell-phone values',
),
# This file is actually not usable as-is until 6.1.0, because the property
# is provisional, so its name is missing from PropertyAliases.txt until
# that release, so that further work would have to be done to get it to
# work properly
Input_file->new('ScriptExtensions.txt', v6.0.0,
Property => 'Script_Extensions',
Early => [ sub {} ], # Doesn't do anything but ensures
# that this isn't skipped for early
# versions
Pre_Handler => \&setup_script_extensions,
Each_Line_Handler => \&filter_script_extensions_line,
Has_Missings_Defaults => (($v_version le v6.0.0)
? $NO_DEFAULTS
: $IGNORED),
),
# These two Indic files are actually not usable as-is until 6.1.0,
# because they are provisional, so their property values are missing from
# PropValueAliases.txt until that release, so that further work would have
# to be done to get them to work properly.
Input_file->new('IndicMatraCategory.txt', v6.0.0,
Withdrawn => v8.0.0,
Property => 'Indic_Matra_Category',
Has_Missings_Defaults => $NOT_IGNORED,
Skip => $Indic_Skip,
),
Input_file->new('IndicSyllabicCategory.txt', v6.0.0,
Property => 'Indic_Syllabic_Category',
Has_Missings_Defaults => $NOT_IGNORED,
Skip => (($v_version lt v8.0.0)
? $Indic_Skip
: 0),
),
Input_file->new('USourceData.txt', v6.2.0,
Skip => 'Documentation of status and cross reference of '
. 'proposals for encoding by Unicode of Unihan '
. 'characters',
),
Input_file->new('USourceGlyphs.pdf', v6.2.0,
Skip => 'Pictures of the characters in F<USourceData.txt>',
),
Input_file->new('BidiBrackets.txt', v6.3.0,
Properties => [ 'Bidi_Paired_Bracket',
'Bidi_Paired_Bracket_Type'
],
Has_Missings_Defaults => $NO_DEFAULTS,
),
Input_file->new("BidiCharacterTest.txt", v6.3.0,
Skip => $Validation,
),
Input_file->new('IndicPositionalCategory.txt', v8.0.0,
Property => 'Indic_Positional_Category',
Has_Missings_Defaults => $NOT_IGNORED,
),
Input_file->new('TangutSources.txt', v9.0.0,
Skip => 'Specifies source mappings for Tangut ideographs'
. ' and components. This data file also includes'
. ' informative radical-stroke values that are used'
. ' internally by Unicode',
),
Input_file->new('VerticalOrientation.txt', v10.0.0,
Property => 'Vertical_Orientation',
Has_Missings_Defaults => $NOT_IGNORED,
),
Input_file->new('NushuSources.txt', v10.0.0,
Skip => 'Specifies source material for Nushu characters',
),
Input_file->new('EquivalentUnifiedIdeograph.txt', v11.0.0,
Property => 'Equivalent_Unified_Ideograph',
Has_Missings_Defaults => $NOT_IGNORED,
),
Input_file->new('EmojiData.txt', v11.0.0,
# Is in UAX #51 and not the UCD, so must be updated
# separately, and the first line edited to indicate the
# UCD release we're pretending it to be in. The UTC says
# this is a transitional state, and in fact was moved to
# the UCD in 13.0
Withdrawn => v13.0.0,
Pre_Handler => \&setup_emojidata,
Has_Missings_Defaults => $NOT_IGNORED,
Each_Line_Handler => \&filter_emojidata_line,
UCD => 0,
),
Input_file->new("$EMOJI/emoji.txt", v13.0.0,
Has_Missings_Defaults => $NOT_IGNORED,
UCD => 0,
),
Input_file->new("$EMOJI/ReadMe.txt", v13.0.0,
Skip => $Documentation,
UCD => 0,
),
Input_file->new('IdStatus.txt', v13.0.0,
Pre_Handler => \&setup_IdStatus,
Property => 'Identifier_Status',
UCD => 0,
),
Input_file->new('IdType.txt', v13.0.0,
Pre_Handler => \&setup_IdType,
Each_Line_Handler => \&filter_IdType_line,
Property => 'Identifier_Type',
UCD => 0,
),
Input_file->new('confusables.txt', v15.0.0,
Skip => $Unused_Skip,
UCD => 0,
),
Input_file->new('confusablesSummary.txt', v15.0.0,
Skip => $Unused_Skip,
UCD => 0,
),
Input_file->new('intentional.txt', v15.0.0,
Skip => $Unused_Skip,
UCD => 0,
),
);
# End of all the preliminaries. # Do it...
if (@missing_early_files) { print simple_fold(join_lines(<<END
The compilation cannot be completed because one or more required input files, listed below, are missing. This is because you are compiling Unicode version $unicode_version, which predates the existence of these file(s). To fully function, perl needs the data that these files would have contained if they had been in this release. To work around this, create copies of later versions of the missing files in the directory containing '$0'. (Perl will make the necessary adjustments to the data to compensate for it not being the same version as is being compiled.) The files are available from unicode.org, via either ftp or http. If using http, they will be under www.unicode.org/versions/. Below are listed the source file name of each missing file, the Unicode version to copy it from, and the name to store it as. (Note that the listed source file name may not be exactly the one that Unicode calls it. If you don't find it, you can look it up in 'README.perl' to get the correct name.) END )); print simple_fold(join_lines("\n$_")) for @missing_early_files; exit 2; }
if ($compare_versions) { Carp::my_carp(<<END Warning. \$compare_versions is set. Output is not suitable for production END ); }
# Put into %potential_files a list of all the files in the directory structure # that could be inputs to this program File::Find::find({ wanted=>sub { return unless / \. ( txt | htm l? ) $ /xi; # Some platforms change the # name's case my $full = lc(File::Spec->rel2abs($_)); $potential_files{$full} = 1; return; } }, File::Spec->curdir());
my @mktables_list_output_files; my $old_start_time = 0; my $old_options = "";
if (! -e $file_list) { print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE; $write_unchanged_files = 1; } elsif ($write_unchanged_files) { print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE; } else { print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE; my $file_handle; if (! open $file_handle, "<", $file_list) { Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!"); $glob_list = 1; } else { my @input;
# Read and parse mktables.lst, placing the results from the first part
# into @input, and the second part into @mktables_list_output_files
for my $list ( \@input, \@mktables_list_output_files ) {
while (<$file_handle>) {
s/^ \s+ | \s+ $//xg;
if (/^ \s* \# \s* Autogenerated\ starting\ on\ (\d+)/x) {
$old_start_time = $1;
next;
}
if (/^ \s* \# \s* From\ options\ (.+) /x) {
$old_options = $1;
next;
}
next if /^ \s* (?: \# .* )? $/x;
last if /^ =+ $/x;
my ( $file ) = split /\t/;
push @$list, $file;
}
@$list = uniques(@$list);
next;
}
# Look through all the input files
foreach my $input (@input) {
next if $input eq 'version'; # Already have checked this.
# Ignore if doesn't exist. The checking about whether we care or
# not is done via the Input_file object.
next if ! file_exists($input);
# The paths are stored with relative names, and with '/' as the
# delimiter; convert to absolute on this machine
my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
$potential_files{lc $full} = 1;
}
}
close $file_handle;
}
if ($glob_list) {
# Here wants to process all .txt files in the directory structure.
# Convert them to full path names. They are stored in the platform's
# relative style
my @known_files;
foreach my $object (@input_file_objects) {
my $file = $object->file;
next unless defined $file;
push @known_files, File::Spec->rel2abs($file);
}
my @unknown_input_files;
foreach my $file (keys %potential_files) { # The keys are stored in lc
next if grep { $file eq lc($_) } @known_files;
# Here, the file is unknown to us. Get relative path name
$file = File::Spec->abs2rel($file);
push @unknown_input_files, $file;
# What will happen is we create a data structure for it, and add it to
# the list of input files to process. First get the subdirectories
# into an array
my (undef, $directories, undef) = File::Spec->splitpath($file);
$directories =~ s;/$;;; # Can have extraneous trailing '/'
my @directories = File::Spec->splitdir($directories);
# If the file isn't extracted (meaning none of the directories is the
# extracted one), just add it to the end of the list of inputs.
if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
push @input_file_objects, Input_file->new($file, v0);
}
else {
# Here, the file is extracted. It needs to go ahead of most other
# processing. Search for the first input file that isn't a
# special required property (that is, find one whose first_release
# is non-0), and isn't extracted. Also, the Age property file is
# processed before the extracted ones, just in case
# $compare_versions is set.
for (my $i = 0; $i < @input_file_objects; $i++) {
if ($input_file_objects[$i]->first_released ne v0
&& lc($input_file_objects[$i]->file) ne 'dage.txt'
&& $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
{
splice @input_file_objects, $i, 0,
Input_file->new($file, v0);
last;
}
}
}
}
if (@unknown_input_files) {
print STDERR simple_fold(join_lines(<<END
The following files are unknown as to how to handle. Assuming they are typical property files. You'll know by later error messages if it worked or not: END ) . " " . join(", ", @unknown_input_files) . "\n\n"); } } # End of looking through directory structure for more .txt files.
# Create the list of input files from the objects we have defined, plus # version my @input_files = qw(version Makefile); foreach my $object (@input_file_objects) { my $file = $object->file; next if ! defined $file; # Not all objects have files next if defined $object->skip;; push @input_files, $file; }
if ( $verbosity >= $VERBOSE ) { print "Expecting ".scalar( @input_files )." input files. ", "Checking ".scalar( @mktables_list_output_files )." output files.\n"; }
# We set $most_recent to be the most recently changed input file, including # this program itself (done much earlier in this file) foreach my $in (@input_files) { next unless -e $in; # Keep going even if missing a file my $mod_time = (stat $in)[9]; $most_recent = $mod_time if $mod_time > $most_recent;
# See that the input files have distinct names, to warn someone if they
# are adding a new one
if ($make_list) {
my ($volume, $directories, $file ) = File::Spec->splitpath($in);
$directories =~ s;/$;;; # Can have extraneous trailing '/'
my @directories = File::Spec->splitdir($directories);
construct_filename($file, 'mutable', \@directories);
}
}
# We use 'Makefile' just to see if it has changed since the last time we # rebuilt. Now discard it. @input_files = grep { $_ ne 'Makefile' } @input_files;
my $rebuild = $write_unchanged_files # Rebuild: if unconditional rebuild || ! scalar @mktables_list_output_files # or if no outputs known || $old_start_time < $most_recent # or out-of-date || $old_options ne $command_line_arguments; # or with different # options
# Now we check to see if any output files are older than youngest, if # they are, we need to continue on, otherwise we can presumably bail. if (! $rebuild) { foreach my $out (@mktables_list_output_files) { if ( ! file_exists($out)) { print "'$out' is missing.\n" if $verbosity >= $VERBOSE; $rebuild = 1; last; } #local $to_trace = 1 if main::DEBUG; trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace; if ( (stat $out)[9] <= $most_recent ) { #trace "$out: most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace; print "'$out' is too old.\n" if $verbosity >= $VERBOSE; $rebuild = 1; last; } } } if (! $rebuild) { print "$0: Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n"; exit(0); } print "$0: Must rebuild tables.\n" if $verbosity >= $VERBOSE;
# Ready to do the major processing. First create the perl pseudo-property. $perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
# Process each input file foreach my $file (@input_file_objects) { $file->run; }
# Finish the table generation.
print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS; finish_Unicode();
# For the very specialized case of comparing two Unicode versions... if (DEBUG && $compare_versions) { handle_compare_versions(); }
print "Compiling Perl properties\n" if $verbosity >= $PROGRESS; compile_perl();
print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS; add_perl_synonyms();
print "Writing tables\n" if $verbosity >= $PROGRESS; write_all_tables();
# Write mktables.lst if ( $file_list and $make_list ) {
print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
foreach my $file (@input_files, @files_actually_output) {
my (undef, $directories, $basefile) = File::Spec->splitpath($file);
my @directories = grep length, File::Spec->splitdir($directories);
$file = join '/', @directories, $basefile;
}
my $ofh;
if (! open $ofh,">",$file_list) {
Carp::my_carp("Can't write to '$file_list'. Skipping: $!");
return
}
else {
my $localtime = localtime $start_time;
print $ofh <<"END";
#
# $file_list -- File list for $0.
#
# Autogenerated starting on $start_time ($localtime)
# From options $command_line_arguments
#
# - First section is input files
# ($0 itself is not listed but is automatically considered an input)
# - Section separator is /^=+\$/
# - Second section is a list of output files.
# - Lines matching /^\\s*#/ are treated as comments
# which along with blank lines are ignored.
#
# Input files:
END print $ofh "$_\n" for sort(@input_files); print $ofh "\n=================================\n# Output files:\n\n"; print $ofh "$_\n" for sort @files_actually_output; print $ofh "\n# ",scalar(@input_files)," input files\n", "# ",scalar(@files_actually_output)+1," output files\n\n", "# End list\n"; close $ofh or Carp::my_carp("Failed to close $ofh: $!");
print "Filelist has ",scalar(@input_files)," input files and ",
scalar(@files_actually_output)+1," output files\n"
if $verbosity >= $VERBOSE;
}
}
# Output these warnings unless -q explicitly specified. if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) { if (@unhandled_properties) { print "\nProperties and tables that unexpectedly have no code points\n"; foreach my $property (sort @unhandled_properties) { print $property, "\n"; } }
if (%potential_files) {
print "\nInput files that are not considered:\n";
foreach my $file (sort keys %potential_files) {
print File::Spec->abs2rel($file), "\n";
}
}
print "\nAll done\n" if $verbosity >= $VERBOSE;
}
if ($version_of_mk_invlist_bounds lt $v_version) { Carp::my_carp("WARNING: \\b{} algorithms (regen/mk_invlist.pl) need" . " to be checked and possibly updated to Unicode" . " $string_version. Failing tests will be marked TODO"); }
exit(0);
# TRAILING CODE IS USED BY make_property_test_script() __DATA__
use strict; use warnings;
use feature 'signatures';
no warnings 'experimental::uniprop_wildcards';
# Test qr/\X/ and the \p{} regular expression constructs. This file is # constructed by mktables from the tables it generates, so if mktables is # buggy, this won't necessarily catch those bugs. Tests are generated for all # feasible properties; a few aren't currently feasible; see # is_code_point_usable() in mktables for details.
# Standard test packages are not used because this manipulates SIG_WARN. It # exits 0 if every non-skipped test succeeded; -1 if any failed.
my $Tests = 0; my $Fails = 0;
# loc_tools.pl requires this function to be defined sub ok($pass, @msg) { print "not " unless $pass; print "ok "; print ++$Tests; print " - ", join "", @msg if @msg; print "\n"; }
sub Expect($expected, $ord, $regex, $warning_type='') { my $line = (caller)[2];
# Convert the code point to hex form
my $string = sprintf "\"\\x{%04X}\"", $ord;
my @tests = "";
# The first time through, use all warnings. If the input should generate
# a warning, add another time through with them turned off
push @tests, "no warnings '$warning_type';" if $warning_type;
foreach my $no_warnings (@tests) {
# Store any warning messages instead of outputting them
local $SIG{__WARN__} = $SIG{__WARN__};
my $warning_message;
$SIG{__WARN__} = sub { $warning_message = $_[0] };
$Tests++;
# A string eval is needed because of the 'no warnings'.
# Assumes no parentheses in the regular expression
my $result = eval "$no_warnings
my \$RegObj = qr($regex);
$string =~ \$RegObj ? 1 : 0";
if (not defined $result) {
print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
$Fails++;
}
elsif ($result ^ $expected) {
print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
$Fails++;
}
elsif ($warning_message) {
if (! $warning_type || ($warning_type && $no_warnings)) {
print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
$Fails++;
}
else {
print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
}
}
elsif ($warning_type && ! $no_warnings) {
print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
$Fails++;
}
else {
print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
}
}
return;
}
sub Error($regex) { $Tests++; if (eval { 'x' =~ qr/$regex/; 1 }) { $Fails++; my $line = (caller)[2]; print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n"; } else { my $line = (caller)[2]; print "ok $Tests - got and expected error for qr/$regex/; line $line\n"; } return; }
# Break test files (e.g. GCBTest.txt) character that break allowed here my $breakable_utf8 = my $breakable = chr(utf8::unicode_to_native(0xF7)); utf8::upgrade($breakable_utf8);
# Break test files (e.g. GCBTest.txt) character that indicates can't break # here my $nobreak_utf8 = my $nobreak = chr(utf8::unicode_to_native(0xD7)); utf8::upgrade($nobreak_utf8);
my $are_ctype_locales_available; my $utf8_locale; chdir 't' if -d 't'; eval { require "./loc_tools.pl" }; if (defined &locales_enabled) { $are_ctype_locales_available = locales_enabled('LC_CTYPE'); if ($are_ctype_locales_available) { $utf8_locale = &find_utf8_ctype_locale; } }
# Eval'd so can run on versions earlier than the property is available in my $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}\p{WB=ZWJ}]/'; if (! defined $WB_Extend_or_Format_re) { $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}]/'; }
sub _test_break($template, $break_type) { # Test various break property matches. The 2nd parameter gives the # property name. The input is a line from auxiliary/*Test.txt for the # given property. Each such line is a sequence of Unicode (not native) # code points given by their hex numbers, separated by the two characters # defined just before this subroutine that indicate that either there can # or cannot be a break between the adjacent code points. All these are # tested. # # For the gcb property extra tests are made. if there isn't a break, that # means the sequence forms an extended grapheme cluster, which means that # \X should match the whole thing. If there is a break, \X should stop # there. This is all converted by this routine into a match: $string =~ # /(\X)/, Each \X should match the next cluster; and that is what is # checked.
my $line = (caller 1)[2]; # Line number
my $comment = "";
if ($template =~ / ( .*? ) \s* \# (.*) /x) {
$template = $1;
$comment = $2;
# Replace leading spaces with a single one.
$comment =~ s/ ^ \s* / # /x;
}
# The line contains characters above the ASCII range, but in Latin1. It
# may or may not be in utf8, and if it is, it may or may not know it. So,
# convert these characters to 8 bits. If knows is in utf8, simply
# downgrade.
if (utf8::is_utf8($template)) {
utf8::downgrade($template);
} else {
# Otherwise, if it is in utf8, but doesn't know it, the next lines
# convert the two problematic characters to their 8-bit equivalents.
# If it isn't in utf8, they don't harm anything.
use bytes;
$template =~ s/$nobreak_utf8/$nobreak/g;
$template =~ s/$breakable_utf8/$breakable/g;
}
# Perl customizes wb. So change the official tests accordingly
if ($break_type eq 'wb' && $WB_Extend_or_Format_re) {
# Split into elements that alternate between code point and
# break/no-break
my @line = split / +/, $template;
# Look at each code point and its following one
for (my $i = 1; $i < @line - 1 - 1; $i+=2) {
# The customization only involves changing some breaks to
# non-breaks.
next if $line[$i+1] =~ /$nobreak/;
my $lhs = chr utf8::unicode_to_native(hex $line[$i]);
my $rhs = chr utf8::unicode_to_native(hex $line[$i+2]);
# And it only affects adjacent space characters.
next if $lhs !~ /\s/u;
# But, we want to make sure to test spaces followed by a Extend
# or Format.
next if $rhs !~ /\s|$WB_Extend_or_Format_re/;
# To test the customization, add some white-space before this to
# create a span. The $lhs white space may or may not be bound to
# that span, and also with the $rhs. If the $rhs is a binding
# character, the $lhs is bound to it and not to the span, unless
# $lhs is vertical space. In all other cases, the $lhs is bound
# to the span. If the $rhs is white space, it is bound to the
# $lhs
my $bound;
my $span;
if ($rhs =~ /$WB_Extend_or_Format_re/) {
if ($lhs =~ /\v/) {
$bound = $breakable;
$span = $nobreak;
}
else {
$bound = $nobreak;
$span = $breakable;
}
}
else {
$span = $nobreak;
$bound = $nobreak;
}
splice @line, $i, 0, ( '0020', $nobreak, '0020', $span);
$i += 4;
$line[$i+1] = $bound;
}
$template = join " ", @line;
}
# The input is just the break/no-break symbols and sequences of Unicode
# code points as hex digits separated by spaces for legibility. e.g.:
# ÷ 0020 × 0308 ÷ 0020 ÷
# Convert to native \x format
$template =~ s/ \s* ( [[:xdigit:]]+ ) \s* /sprintf("\\x{%02X}", utf8::unicode_to_native(hex $1))/gex;
$template =~ s/ \s* //gx; # Probably the line above removed all spaces;
# but be sure
# Make a copy of the input with the symbols replaced by \b{} and \B{} as
# appropriate
my $break_pattern = $template =~ s/ $breakable /\\b{$break_type}/grx;
$break_pattern =~ s/ $nobreak /\\B{$break_type}/gx;
my $display_string = $template =~ s/[$breakable$nobreak]//gr;
my $string = eval "\"$display_string\"";
# The remaining massaging of the input is for the \X tests. Get rid of
# the leading and trailing breakables
$template =~ s/^ \s* $breakable \s* //x;
$template =~ s/ \s* $breakable \s* $ //x;
# Delete no-breaks
$template =~ s/ \s* $nobreak \s* //xg;
# Split the input into segments that are breakable between them.
my @should_display = split /\s*$breakable\s*/, $template;
my @should_match = map { eval "\"$_\"" } @should_display;
# If a string can be represented in both non-ut8 and utf8, test both cases
my $display_upgrade = "";
UPGRADE:
for my $to_upgrade (0 .. 1) {
if ($to_upgrade) {
# If already in utf8, would just be a repeat
next UPGRADE if utf8::is_utf8($string);
utf8::upgrade($string);
$display_upgrade = " (utf8-upgraded)";
}
my @modifiers = qw(a aa d u i);
if ($are_ctype_locales_available) {
push @modifiers, "l$utf8_locale" if defined $utf8_locale;
# The /l modifier has C after it to indicate the locale to try
push @modifiers, "lC";
}
# Test for each of the regex modifiers.
for my $modifier (@modifiers) {
my $display_locale = "";
# For /l, set the locale to what it says to.
if ($modifier =~ / ^ l (.*) /x) {
my $locale = $1;
$display_locale = "(locale = $locale)";
POSIX::setlocale(POSIX::LC_CTYPE(), $locale);
$modifier = 'l';
}
no warnings qw(locale regexp surrogate);
my $pattern = "(?$modifier:$break_pattern)";
# Actually do the test
my $matched_text;
my $matched = $string =~ qr/$pattern/;
if ($matched) {
$matched_text = "matched";
}
else {
$matched_text = "failed to match";
print "not ";
if (TODO_FAILING_BREAKS) {
$comment = " # $comment" unless $comment =~ / ^ \s* \# /x;
$comment =~ s/#/# TODO/;
}
}
print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$pattern/$display_upgrade; line $line $display_locale$comment\n";
# Only print the comment on the first use of this line
$comment = "";
# Repeat with the first \B{} in the pattern. This makes sure the
# code in regexec.c:find_byclass() for \B gets executed
if ($pattern =~ / ( .*? : ) .* ( \\B\{ .* ) /x) {
my $B_pattern = "$1$2";
$matched = $string =~ qr/$B_pattern/;
print "not " unless $matched;
$matched_text = ($matched) ? "matched" : "failed to match";
print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$B_pattern/$display_upgrade; line $line $display_locale";
print " # TODO" if TODO_FAILING_BREAKS && ! $matched;
print "\n";
}
}
next if $break_type ne 'gcb';
# Finally, do the \X match.
my @matches = $string =~ /(\X)/g;
# Look through each matched cluster to verify that it matches what we
# expect.
my $min = (@matches < @should_match) ? @matches : @should_match;
for my $i (0 .. $min - 1) {
$Tests++;
if ($matches[$i] eq $should_match[$i]) {
print "ok $Tests - ";
if ($i == 0) {
print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
} else {
print "And \\X #", $i + 1,
}
print " correctly matched $should_display[$i]; line $line\n";
} else {
$matches[$i] = join("", map { sprintf "\\x{%04X}", ord $_ }
split "", $matches[$i]);
print "not ok $Tests -";
print " # TODO" if TODO_FAILING_BREAKS;
print " In \"$display_string\" =~ /(\\X)/g, \\X #",
$i + 1,
" should have matched $should_display[$i]",
" but instead matched $matches[$i]",
". Abandoning rest of line $line\n";
next UPGRADE;
}
}
# And the number of matches should equal the number of expected matches.
$Tests++;
if (@matches == @should_match) {
print "ok $Tests - Nothing was left over; line $line\n";
} else {
print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line";
print " # TODO" if TODO_FAILING_BREAKS;
print "\n";
}
}
return;
}
sub Test_GCB($t) { _test_break($t, 'gcb'); }
sub Test_LB($t) { _test_break($t, 'lb'); }
sub Test_SB($t) { _test_break($t, 'sb'); }
sub Test_WB($t) { _test_break($t, 'wb'); }
sub Finished() { print "1..$Tests\n"; exit($Fails ? -1 : 0); }
2 POD Errors
The following errors were encountered while parsing the POD:
- Around line 16961:
=end comment without matching =begin. (Stack: [empty])
- Around line 20368:
Non-ASCII character seen before =encoding in '÷'. Assuming UTF-8