#!/usr/bin/env perl ##---------------------------------------------------------------------------- ## Unicode Locale Identifier - ~/scripts/create_database.pl ## Version v0.2.0 ## Copyright(c) 2024 DEGUEST Pte. Ltd. ## Author: Jacques Deguest ## Created 2024/06/15 ## Modified 2025/01/01 ## All rights reserved ## ## ## This program is free software; you can redistribute it and/or modify it ## under the same terms as Perl itself. ##---------------------------------------------------------------------------- # If you want to know more about this script, and how to use it, do: perldoc create_database.pl # or, if you prefer, ./create_database.pl --man or ./create_database.pl --help for a short help use strict; use warnings; use open ':std' => ':utf8'; use utf8; use vars qw( $VERSION $DEBUG $VERBOSE $LOG_LEVEL $PROG_NAME $MAINTAINER $opt $opts $out $err @argv ); use Clone (); use Data::Pretty qw( dump ); use DateTime; use DateTime::Format::Strptime; use DBD::SQLite; use DBI qw( :sql_types ); use Getopt::Class v0.102.6; use HTML::Entities qw( decode_entities ); use JSON; use List::Util qw( uniq ); use Locale::Unicode; use Module::Generic::File qw( file stdout stderr tempfile ); use Module::Generic::Array; use Pod::Usage; use Scalar::Util qw( looks_like_number ); use XML::LibXML; our $VERSION = 'v0.2.0'; our $DEBUG = 0; our $VERBOSE = 0; our $LOG_LEVEL = 0; our $MAINTAINER = 'Jacques Deguest'; our $PROG_NAME = file( $0 )->basename( '.pl' ); our $out = stdout( binmode => ':utf8', autoflush => 1 ); our $err = stderr( binmode => ':utf8', autoflush => 1 ); my $logfile = file( $0 )->extension( 'log' ); my $tmpfile = tempfile( suffix => 'sqlite3', cleanup => 0 ); my $lib_dir = file( $0 )->parent->parent->child( 'lib' ); my $live_db_file; my $credit = < Copyright(c) 2024-2025 DEGUEST Pte. Ltd.. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. EOT my $dict = { apply_patch => { type => 'boolean', default => 1 }, cldr_version => { type => 'string' }, created => { type => 'datetime' }, db_file => { type => 'file', default => $tmpfile }, extend => { type => 'boolean', default => 0, action => 1 }, extended_timezones_cities => { type => 'file' }, log_file => { type => 'file', default => $logfile }, maintainer => { type => 'string', default => \$MAINTAINER }, patch_only => { type => 'boolean', default => 0, action => 1 }, replace => { type => 'boolean', default => 0 }, use_log => { type => 'boolean', default => 0 }, # Generic options quiet => { type => 'boolean', default => 0 }, debug => { type => 'integer', alias => [qw(d)], default => \$DEBUG }, verbose => { type => 'integer', default => \$VERBOSE }, v => { type => 'code', code => sub{ printf( STDOUT "2f\n", $VERSION ); } }, # help => { type => 'code', alias => [qw(?)], code => sub{ pod2usage(1); } }, help => { type => 'code', alias => [qw(?)], code => sub{ pod2usage( -exitstatus => 1, -verbose => 99, -sections => [qw( NAME SYNOPSIS DESCRIPTION COMMANDS OPTIONS AUTHOR COPYRIGHT )] ); } }, man => { type => 'code', code => sub{ pod2usage( -exitstatus => 0, -verbose => 2 ); } }, }; # Create backup of arguments our @argv = @ARGV; our $opt = Getopt::Class->new({ dictionary => $dict }) || die( "Error instantiating Getopt::Class object: ", Getopt::Class->error, "\n" ); $opt->usage( sub{ pod2usage(2) } ); our $opts = $opt->exec || die( "An error occurred executing Getopt::Class: ", $opt->error, "\n" ); my @errors = (); my $opt_errors = $opt->configure_errors; push( @errors, @$opt_errors ) if( $opt_errors->length ); if( $opts->{quiet} ) { $DEBUG = $VERBOSE = 0; } # Unless the log level has been set directly with a command line option unless( $LOG_LEVEL ) { $LOG_LEVEL = 1 if( $VERBOSE ); $LOG_LEVEL = ( 1 + $DEBUG ) if( $DEBUG ); } $err->print( "options check " ) if( $LOG_LEVEL ); if( length( $opts->{created} ) ) { $opts->{created_time} = $opts->{created}->epoch; $err->print( "Creating unix time from string $opts->{created} => $opts->{created_time}\n" ) if( $LOG_LEVEL ); } else { $opts->{created_time} = time(); } if( $opts->{replace} ) { $live_db_file = $lib_dir->child( 'Locale/Unicode/unicode_cldr.sqlite3' ); if( !$lib_dir->exists ) { push( @errors, "Unable to find the lib directory ${lib_dir}" ); } elsif( !$live_db_file->parent->exists ) { push( @errors, "Parent directory for live SQLite database file ${live_db_file} does not exist!" ); } } $err->print( @errors ? " not ok\n" : " ok\n" ) if( $LOG_LEVEL ); if( @errors ) { my $error = join( "\n", map{ "\t* $_" } @errors ); substr( $error, 0, 0, "\n\tThe following arguments are mandatory and missing.\n" ); if( !$opts->{quiet} ) { $err->print( <{use_log} = 1 if( $DEBUG ); my $script_dir = file( $0 )->parent; my $log_fh; if( $opts->{use_log} ) { $logfile = $opts->{log_file}; $log_fh = $logfile->open( '>', { binmode => ':utf8', autoflush => 1 }) || die( $logfile->error ); } local $SIG{__DIE__} = sub { $err->print( @_ ); $err->print( "Temporary SQLite database file not cleaned up upon exception: $tmpfile\n" ); exit(1); }; local $SIG{INT} = $SIG{TERM} = sub { my $sig = shift( @_ ); $err->print( "Caught a ${sig} signal.\n" ); $err->print( "Temporary SQLite database file not cleaned up upon exception: $tmpfile\n" ); exit(1); }; my @files; our $json = JSON->new->relaxed->allow_nonref->allow_blessed->convert_blessed; $tmpfile = $opts->{db_file}; &log( "Using database file ${tmpfile}" ); &log( "Making SQL connection to ${tmpfile}" ); my $dbh = DBI->connect( "dbi:SQLite:dbname=${tmpfile}", '', '' ) || die( "Unable to make connection to SQLite database file ${tmpfile}: ", $DBI::errstr ); # Enable the use of foreign keys $dbh->do("PRAGMA foreign_keys = ON"); # $dbh->{sqlite_string_mode} = DBD::SQLite::Constants::DBD_SQLITE_STRING_MODE_UNICODE_FALLBACK; $dbh->{sqlite_string_mode} = DBD::SQLite::Constants::DBD_SQLITE_STRING_MODE_UNICODE_STRICT; $dbh->{sqlite_see_if_its_a_number} = 1; $out->print( "Connection established to temporary SQLite database file ${tmpfile}\n" ) if( $DEBUG || !$opts->{replace} ); # NOTE: key variables declaration my( $xml, $doc, $sth, $ref ); my $lang_vars = {}; # NOTE: Find out what action to take my $action_found = ''; my @actions = grep{ exists( $dict->{ $_ }->{action} ) } keys( %$opts ); foreach my $action ( @actions ) { $action =~ tr/-/_/; next if( ref( $opts->{ $action } ) eq 'CODE' ); if( $opts->{ $action } && $action_found && $action_found ne $action ) { push( @errors, "You have opted for \"$action\", but \"$action_found\" is already selected." ); } elsif( $opts->{ $action } && !length( $action_found ) ) { $action_found = $action; die( "Unable to find a subroutne for '$action'" ) if( !main->can( $action ) ); } } if( !$action_found ) { $action_found = 'process'; } my $coderef = ( exists( $dict->{ $action_found }->{code} ) && ref( $dict->{ $action_found }->{code} ) eq 'CODE' ) ? $dict->{ $action_found }->{code} : main->can( $action_found ); if( !defined( $coderef ) ) { die( "There is no sub for action \"$action_found\"\n" ); } # exit( $coderef->() ? 0 : 1 ); &_cleanup_and_exit( $coderef->() ? 0 : 1 ); sub process { if( !scalar( @ARGV ) ) { die( "$0 /some/where/cldr-common-45.0" ); } my $topdir = file( shift( @ARGV ) ); if( !$topdir->exists ) { die( "CLDR top directory provided ${topdir} does not exist." ); } elsif( !$topdir->is_dir ) { die( "CLDR top directory provided ${topdir} is not a directory." ); } my $basedir = $topdir->child( 'common' ); if( !$basedir->exists ) { die( "CLDR JSON base directory ${basedir} does not exist." ); } my $iana_timezone_file = $script_dir->child( 'zone1970.tab' ); my $iana_alias_file = $script_dir->child( 'backward' ); my $cache_tz_corrections_file = $script_dir->child( 'tz_corrections.json' ); if( !$iana_timezone_file->exists ) { die( "The IANA Olson time zone database file 'zone1970.tab' does not exist. Please download it from ftp://ftp.iana.org/tz/tzdata-latest.tar.gz and place it in the 'scripts' folder." ); } my $anno_dir = $basedir->child( 'annotations' ); my $bcp47_dir = $basedir->child( 'bcp47' ); my $casings_dir = $basedir->child( 'casing' ); my $collation_dir = $basedir->child( 'collation' ); my $main_dir = $basedir->child( 'main' ); my $rbnf_dir = $basedir->child( 'rbnf' ); my $subdivisions_l10n_dir = $basedir->child( 'subdivisions' ); for( $anno_dir, $bcp47_dir, $casings_dir, $collation_dir, $main_dir, $rbnf_dir, $subdivisions_l10n_dir ) { die( "No diectory ${_} found." ) if( !$_->exists ); } my $n = 0; local $@; &log( "Creating SQL schema." ); my $tables = load_schema( file( $0 )->parent->child( 'cldr-schema.sql' ) ); &log( "Loaded ", scalar( @$tables ), " tables schema." ); my $tables_to_query_check = {}; @$tables_to_query_check{ @$tables } = (1) x scalar( @$tables ); my $boolean_map = { 'true' => 1, 'false' => 0, }; # NOTE: Preparing all SQL queries &log( "Preparing all SQL queries." ); my $queries = [ aliases => "INSERT INTO aliases (alias, replacement, reason, type, comment) VALUES(?, ?, ?, ?, ?)", annotations => "INSERT INTO annotations (locale, annotation, defaults, tts) VALUES(?, ?, ?, ?)", bcp47_currencies => "INSERT INTO bcp47_currencies (currid, code, description, is_obsolete) VALUES(?, ?, ?, ?)", bcp47_extensions => "INSERT INTO bcp47_extensions (category, extension, alias, value_type, description, deprecated) VALUES(?, ?, ?, ?, ?, ?)", bcp47_timezones => "INSERT INTO bcp47_timezones (tzid, alias, preferred, description, deprecated) VALUES(?, ?, ?, ?, ?)", bcp47_values => "INSERT INTO bcp47_values (category, extension, value, description) VALUES(?, ?, ?, ?)", calendar_append_formats => "INSERT INTO calendar_append_formats (locale, calendar, format_id, format_pattern) VALUES(?, ?, ?, ?)", calendar_available_formats => "INSERT INTO calendar_available_formats (locale, calendar, format_id, format_pattern, count, alt) VALUES(?, ?, ?, ?, ?, ?)", calendar_cyclics_l10n => "INSERT INTO calendar_cyclics_l10n (locale, calendar, format_set, format_type, format_length, format_id, format_pattern) VALUES(?, ?, ?, ?, ?, ?, ?)", calendar_datetime_formats => "INSERT INTO calendar_datetime_formats (locale, calendar, format_length, format_type, format_pattern) VALUES(?, ?, ?, ?, ?)", calendar_eras => "INSERT INTO calendar_eras (calendar, sequence, code, aliases, start, until) VALUES(?, ?, ?, ?, ?, ?)", calendar_eras_l10n => "INSERT INTO calendar_eras_l10n (locale, calendar, era_width, era_id, alt, locale_name) VALUES(?, ?, ?, ?, ?, ?)", calendar_formats_l10n => "INSERT INTO calendar_formats_l10n (locale, calendar, format_type, format_length, alt, format_id, format_pattern) VALUES(?, ?, ?, ?, ?, ?, ?)", calendar_interval_formats => "INSERT INTO calendar_interval_formats (locale, calendar, format_id, greatest_diff_id, format_pattern, alt, part1, separator, part2, repeating_field) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", calendar_terms => "INSERT INTO calendar_terms (locale, calendar, term_type, term_context, term_width, alt, yeartype, term_name, term_value) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?)", calendars => "INSERT INTO calendars (calendar, system, inherits, description) VALUES(?, ?, ?, ?)", calendars_l10n => "INSERT INTO calendars_l10n (locale, calendar, locale_name) VALUES(?, ?, ?)", casings => "INSERT INTO casings (locale, token, value) VALUES(?, ?, ?)", collations_l10n => "INSERT INTO collations_l10n (locale, collation, locale_name) VALUES(?, ?, ?)", code_mappings => "INSERT INTO code_mappings (code, alpha3, numeric, fips10, type) VALUES(?, ?, ?, ?, ?)", currencies => "INSERT INTO currencies (currency, digits, rounding, cash_digits, cash_rounding, is_obsolete, status) VALUES(?, ?, ?, ?, ?, ?, ?)", currencies_info => "INSERT INTO currencies_info (territory, currency, start, until, is_tender, hist_sequence, is_obsolete) VALUES(?, ?, ?, ?, ?, ?, ?)", currencies_l10n => "INSERT INTO currencies_l10n (locale, currency, count, locale_name, symbol) VALUES(?, ?, ?, ?, ?)", date_fields_l10n => "INSERT INTO date_fields_l10n (locale, field_type, field_length, relative, locale_name) VALUES(?, ?, ?, ?, ?)", date_terms => "INSERT INTO date_terms (locale, term_type, term_length, display_name) VALUES(?, ?, ?, ?)", day_periods => "INSERT INTO day_periods (locale, day_period, start, until) VALUES(?, ?, ?, ?)", language_population => "INSERT INTO language_population (territory, locale, population_percent, literacy_percent, writing_percent, official_status) VALUES(?, ?, ?, ?, ?, ?)", languages => "INSERT OR IGNORE INTO languages (language, scripts, territories, parent, alt, status) VALUES(?, ?, ?, ?, ?, ?)", languages_match => "INSERT INTO languages_match (desired, supported, distance, is_symetric, is_regexp, sequence) VALUES(?, ?, ?, ?, ?, ?)", likely_subtags => "INSERT INTO likely_subtags (locale, target) VALUES(?, ?)", locales => "INSERT INTO locales (locale, parent, collations, status) VALUES(?, ?, ?, ?)", locales_info => "INSERT INTO locales_info (locale, property, value) VALUES(?, ?, ?)", locales_l10n => "INSERT INTO locales_l10n (locale, locale_id, locale_name, alt) VALUES(?, ?, ?, ?)", locale_number_systems => "INSERT INTO locale_number_systems (locale, number_system, native, traditional, finance) VALUES(?, ?, ?, ?, ?)", metainfos => "INSERT INTO metainfos (property, value) VALUES(?, ?)", metazones => "INSERT INTO metazones (metazone, territories, timezones) VALUES(?, ?, ?)", metazones_names => "INSERT INTO metazones_names (locale, metazone, width, generic, standard, daylight) VALUES(?, ?, ?, ?, ?, ?)", number_formats_l10n => "INSERT INTO number_formats_l10n (locale, number_system, number_type, format_length, format_type, format_id, format_pattern, alt, count) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?)", number_symbols_l10n => "INSERT INTO number_symbols_l10n (locale, number_system, property, value, alt) VALUES(?, ?, ?, ?, ?)", number_systems => "INSERT INTO number_systems (number_system, digits, type) VALUES(?, ?, ?)", number_systems_l10n => "INSERT INTO number_systems_l10n (locale, number_system, locale_name, alt) VALUES(?, ?, ?, ?)", person_name_defaults => "INSERT INTO person_name_defaults (locale, value) VALUES(?, ?)", plural_ranges => "INSERT INTO plural_ranges (locale, aliases, start, stop, result) VALUES(?, ?, ?, ?, ?)", plural_rules => "INSERT INTO plural_rules (locale, aliases, count, rule) VALUES(?, ?, ?, ?)", rbnf => "INSERT INTO rbnf (locale, grouping, ruleset, rule_id, rule_value) VALUES(?, ?, ?, ?, ?)", refs => "INSERT INTO refs (code, uri, description) VALUES(?, ?, ?)", regions => "INSERT OR IGNORE INTO territories (territory, contains, status) VALUES(?, ?, ?)", scripts => "INSERT INTO scripts (script, rank, sample_char, id_usage, rtl, lb_letters, has_case, shaping_req, ime, density, origin_country, likely_language, status) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", scripts_l10n => "INSERT INTO scripts_l10n (locale, script, locale_name, alt) VALUES(?, ?, ?, ?)", subdivisions => "INSERT INTO subdivisions (territory, subdivision, parent, is_top_level, status) VALUES(?, ?, ?, ?, ?)", subdivisions_l10n => "INSERT INTO subdivisions_l10n (locale, subdivision, locale_name) VALUES(?, ?, ?)", territories => "INSERT INTO territories (territory, parent, gdp, literacy_percent, population, languages, contains, currency, calendars, min_days, first_day, weekend, status) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", territories_l10n => "INSERT INTO territories_l10n (locale, territory, locale_name, alt) VALUES(?, ?, ?, ?)", time_formats => "INSERT INTO time_formats (region, territory, locale, time_format, time_allowed) VALUES(?, ?, ?, ?, ?)", time_relative_l10n => "INSERT INTO time_relative_l10n (locale, field_type, field_length, relative, format_pattern, count) VALUES(?, ?, ?, ?, ?, ?)", timezones => "INSERT INTO timezones (timezone, territory, region, tzid, metazone, tz_bcpid, is_golden, is_primary, is_preferred, is_canonical, alias) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", timezones_cities => "INSERT INTO timezones_cities (locale, timezone, city, alt) VALUES(?, ?, ?, ?)", # This is defined in the extend_timezones_cities() function timezones_cities_supplemental => undef, timezones_formats => "INSERT INTO timezones_formats (locale, type, subtype, format_pattern) VALUES(?, ?, ?, ?)", timezones_info => "INSERT INTO timezones_info (timezone, metazone, start, until) VALUES(?, ?, ?, ?)", timezones_names => "INSERT INTO timezones_names (locale, timezone, width, generic, standard, daylight) VALUES(?, ?, ?, ?, ?, ?)", unit_aliases => "INSERT INTO unit_aliases (alias, target, reason) VALUES(?, ?, ?)", unit_constants => "INSERT INTO unit_constants (constant, expression, value, description, status) VALUES(?, ?, ?, ?, ?)", unit_conversions => "INSERT INTO unit_conversions (source, base_unit, expression, factor, systems, category) VALUES(?, ?, ?, ?, ?, ?)", units_l10n => "INSERT INTO units_l10n (locale, format_length, unit_type, unit_id, unit_pattern, pattern_type, locale_name, count, gender, gram_case) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", unit_prefixes => "INSERT INTO unit_prefixes (unit_id, symbol, power, factor) VALUES(?, ?, ?, ?)", unit_prefs => "INSERT INTO unit_prefs (unit_id, territory, category, usage, geq, skeleton) VALUES(?, ?, ?, ?, ?, ?)", unit_quantities => "INSERT INTO unit_quantities (base_unit, quantity, status, comment) VALUES(?, ?, ?, ?)", variants => "INSERT INTO variants (variant, status) VALUES(?, ?)", variants_l10n => "INSERT INTO variants_l10n (locale, variant, locale_name, alt) VALUES(?, ?, ?, ?)", week_preferences => "INSERT INTO week_preferences (locale, ordering) VALUES(?, ?)", ]; my $sths = {}; for( my $i = 0; $i < scalar( @$queries ); $i += 2 ) { my $id = $queries->[$i]; $out->print( "[${id}] " ) if( $DEBUG ); my $sql = $queries->[$i + 1]; # It is listed, but we skip it to make the 'tables_to_query_check' happy if( !defined( $sql ) ) { delete( $tables_to_query_check->{ $id } ); next; } elsif( exists( $sths->{ $id } ) ) { die( "There is already a statement object for ID '${id}' with SQL: ", $sths->{ $id }->{Statement} ); } my $sth = $dbh->prepare( $sql ) || die( "Error preparing query '", $sql, "': ", $dbh->errstr ); $sths->{ $id } = $sth; $out->print( "ok\n" ) if( $DEBUG ); if( exists( $tables_to_query_check->{ $id } ) ) { delete( $tables_to_query_check->{ $id } ); } else { warn( "No table '$id' found in our a tables-to-query map check." ); } } if( scalar( keys( %$tables_to_query_check ) ) ) { die( sprintf( "There are %d tables with no statement defined: %s", scalar( keys( %$tables_to_query_check ) ), join( ', ', sort( keys( %$tables_to_query_check ) ) ) ) ); } else { &log( "All tables have a statement defined." ); } # NOTE: Add meta information &log( "Add meta information." ); my $today = DateTime->from_epoch( epoch => $opts->{created_time} ); my $cldr_version; # cldr-common-45.0 if( $opts->{cldr_version} ) { $cldr_version = $opts->{cldr_version}; } elsif( $topdir =~ /\-(\d+(?:\.\d+)*)$/ ) { $cldr_version = $1; } else { die( "Unable to figure out the CLDR version from the directory of its data: ${topdir}" ); } $sth = $sths->{metainfos} || die( "Unable to get a statement object for table metainfos" ); my $meta = [ { property => 'cldr_version', value => $cldr_version }, { property => 'built_on', value => $today->iso8601 }, { property => 'maintainer', value => $opts->{maintainer} }, ]; foreach my $def ( @$meta ) { eval { $sth->execute( @$def{ qw( property value ) } ); } || die( "Error adding meta information property '$def->{property}' with value '$def->{value}': ", ( $@ || $sth->errstr ), "\nwith SQL query: ", $sth->{Statement}, "\n", dump( $def ) ); } # NOTE: Pre-loading all known currencies, languages, regions, scripts, subdivisions, variants &log( "Pre-loading all known currencies, languages, regions, scripts, subdivisions, variants." ); my $known = {}; my $known_data = { currencies => { file => $basedir->child( "validity/currency.xml" ), type => 'currency', }, languages => { file => $basedir->child( "validity/language.xml" ), type => 'language', }, territories => { file => $basedir->child( "validity/region.xml" ), type => 'region', }, scripts => { file => $basedir->child( "validity/script.xml" ), type => 'script', }, subdivisions => { file => $basedir->child( "validity/subdivision.xml" ), type => 'subdivision', }, variants => { file => $basedir->child( "validity/variant.xml" ), type => 'variant', }, }; # Sorting does not really matter, but it is just so I get the same order of output foreach my $prop ( sort( keys( %$known_data ) ) ) { $out->print( "[${prop}] " ) if( $DEBUG ); $n = 0; my $def = $known_data->{ $prop }; my $validDom = load_xml( $def->{file} ); my $validRes = $validDom->findnodes( "/supplementalData/idValidity/id[\@type=\"$def->{type}\"]" ) || die( "Unable to find any data of type '$def->{type}' in file $def->{file}" ); my $hash = {}; while( my $el = $validRes->shift ) { my $status = $el->getAttribute( 'idStatus' ); my $data = trim( $el->textContent ); my $ids = [split( /[[:blank:]\h\v]+/, $data )]; foreach my $id ( @$ids ) { if( index( $id, '~' ) != -1 ) { $id =~ s{ (?[a-zA-Z0-9]+)(?[a-zA-Z0-9])\~(?[a-zA-Z0-9]+) } { my $re = {%+}; foreach my $c ( $re->{start}..$re->{end} ) { my $r = $re->{prefix} . $c; # $out->print( "Adding '${r}'\n" ); $hash->{ $r } = { status => $status, }; $n++; } ''; }exs; } else { $hash->{ $id } = { status => $status, }; $n++; } } } $known->{ $prop } = $hash; $out->print( "ok. ${n} ${prop} added.\n" ) if( $DEBUG ); } my $supplemental_data_file = $basedir->child( 'supplemental/supplementalData.xml' ); my $suppDoc = load_xml( $supplemental_data_file ); # NOTE: Pre-loading currencies data (BCP47 and locale data) &log( "Pre-loading currencies data (BCP47 and locale data)" ); my $bcp_currency_file = $basedir->child( 'bcp47/currency.xml' ); my $eng_locale_data_file = $basedir->child( 'main/en.xml' ); my $bcpCurrDoc = load_xml( $bcp_currency_file ); my $engLocaleDoc = load_xml( $eng_locale_data_file ); my $bcpCurrRes = $bcpCurrDoc->findnodes( '/ldmlBCP47/keyword/key[@name="cu"]/type[@name]' ); # $out->print( $bcpCurrRes->size, " currency BCP47 IDs found.\n" ); my $bcpCurrIds = {}; my $bcpCurrDesc2id = {}; while( my $el = $bcpCurrRes->shift ) { my $id = $el->getAttribute( 'name' ); my $desc = $el->getAttribute( 'description' ); $desc =~ s/[[:blank:]\h\v]/ /gs; $desc =~ s/\((\d{4})\D(\d{4})\)$/\($1-$2\)/; $bcpCurrIds->{ $id } = $desc; if( exists( $bcpCurrDesc2id->{ $desc } ) ) { die( "There already exist the currency '", $bcpCurrDesc2id->{ $desc }, "' for the description '${desc}' that this currency '${id}' also has." ); } else { $bcpCurrDesc2id->{ lc( $desc ) } = $id; } } my $engCurrRes = $engLocaleDoc->findnodes( '/ldml/numbers/currencies/currency' ); # $out->print( $engCurrRes->size, " locale currencies found.\n" ); my $currMap = {}; my $currBCPMap = {}; my $currUnknown = {}; while( my $el = $engCurrRes->shift ) { my $code = $el->getAttribute( 'type' ); # my $disp = $el->getChildrenByTagName( 'displayName' ); my $disp = $el->findnodes( './displayName[not(@count)]' ); # $out->print( "Found ", $disp->size, " name(s) for this currency ${code}\n" ) if( $DEBUG ); my $desc = $disp->shift->textContent; $desc = decode_entities( $desc ) if( index( $desc, '&' ) != -1 ); # Switch commercial and (&) to regular and $desc =~ s/[[:blank:]\h]\&[[:blank:]\h]/ and /g if( index( $desc, '&' ) != -1 ); my $is_obsolete = 0; if( $desc =~ s/\((\d{4})\D(\d{4})\)$/\($1-$2\)/ ) { $is_obsolete++; } $desc =~ s/[[:blank:]\h\v]/ /gs; # Afghan Afghani (1927–2002) -> Afghan Afghani (1927–2002) $desc =~ s/\–/\–/g; my $test = lc( $desc ); # $out->print( "Checking currency code '${code}' with description '${desc}'\n" ) if( $DEBUG ); if( exists( $bcpCurrDesc2id->{ $test } ) ) { # I prefer the spelling of the BCP47 which keeps the casing proper, i.e. first letter upper case for each word $currMap->{ $code } = { id => $bcpCurrDesc2id->{ $test }, description => $bcpCurrIds->{ $bcpCurrDesc2id->{ $test } }, is_obsolete => $is_obsolete, }; $currBCPMap->{ $bcpCurrDesc2id->{ $test } } = { code => $code, description => $bcpCurrIds->{ $bcpCurrDesc2id->{ $test } }, is_obsolete => $is_obsolete, }; delete( $bcpCurrIds->{ $bcpCurrDesc2id->{ $test } } ); } else { if( $desc =~ /\((\d{4})\D(\d{4})\)$/ ) { $out->print( "\tThis is an old currency in use from $1 to $2\n" ) if( $DEBUG ); } $currUnknown->{ $code }++; } } $out->print( "Could map out ", scalar( keys( %$currMap ) ), " currencies while ", scalar( keys( %$currUnknown ) ), " were left unknown.\n" ) if( $DEBUG ); # $out->print( dump( $currMap ), "\n" ) if( $DEBUG >= 4 ); if( scalar( keys( %$currUnknown ) ) ) { $out->print( "Unknowns: ", join( ', ', sort( keys( %$currUnknown ) ) ), "\n" ) if( $DEBUG ); exit(1); } if( scalar( keys( %$bcpCurrIds ) ) ) { $out->print( "Unmapped BCP47 IDs: ", scalar( keys( %$bcpCurrIds ) ), ":\n" ) if( $DEBUG ); foreach my $id ( sort( keys( %$bcpCurrIds ) ) ) { $out->print( "${id}: ", $bcpCurrIds->{ $id }, "\n" ) if( $DEBUG ); } exit(1); } # NOTE: Loading currencies &log( "Loading currencies." ); $n = 0; my $currRes = $suppDoc->findnodes( '/supplementalData/currencyData/fractions/info[not(@iso4217="DEFAULT")]' ) || die( "Unable to get the currencies information from ${supplemental_data_file}" ); if( !$currRes->size ) { die( "No currencies information was found in ${supplemental_data_file}" ); } $sth = $sths->{currencies} || die( "No SQL statement object for currencies" ); # We load up the initial set of data in the currencies dictionary my $currenciesData = {}; while( my $el = $currRes->shift ) { my $def = { currency => ( $el->getAttribute( 'iso4217' ) || die( "No attribute 'iso4217' found for this currency element: ", $el->toString() ) ), digits => $el->getAttribute( 'digits' ), rounding => $el->getAttribute( 'rounding' ), cash_digits => $el->getAttribute( 'cashDigits' ), cash_rounding => $el->getAttribute( 'cashRounding' ), }; foreach my $prop ( qw( digits rounding ) ) { if( !defined( $def->{ $prop } ) || !length( $def->{ $prop } ) ) { die( "No attribute '${prop}' could be found for this currency '", $def->{currency}, "': ", $el->toString() ); } } if( exists( $currMap->{ $def->{currency} } ) ) { $def->{is_obsolete} = $currMap->{ $def->{currency} }->{is_obsolete}; } $currenciesData->{ $def->{currency} } = $def; } # Now, we merge our main currencies data to ensure we have a complete set, although many will not have the rounding or digits information provided. foreach my $code ( keys( %$currMap ) ) { if( !exists( $currenciesData->{ $code } ) ) { $currenciesData->{ $code } = { currency => $code, is_obsolete => $currMap->{ $code }->{is_obsolete}, } } } # Add missing currencies and set the status using the known currencies we loaded at the beginning. foreach my $code ( keys( %{$known->{currencies}} ) ) { if( !exists( $currenciesData->{ $code } ) ) { $currenciesData->{ $code } = { currency => $code }; } $currenciesData->{ $code }->{status} = $known->{currencies}->{ $code }->{status}; } foreach my $code ( sort( keys( %$currenciesData ) ) ) { my $def = $currenciesData->{ $code }; $out->print( "[", $def->{currency}, "] " ) if( $DEBUG ); eval { $sth->execute( @$def{qw( currency digits rounding cash_digits cash_rounding is_obsolete status )} ); } || die( "Error adding currency '", $def->{currency}, "' information to table currencies: ", ( $@ || $sth->errstr ), "\n", dump( $def ) ); $out->print( "ok\n" ) if( $DEBUG ); $n++; } &log( "${n} currencies added." ); # NOTE: Loading BCP47 currencies &log( "Loading BCP47 currencies." ); $n = 0; $sth = $sths->{bcp47_currencies} || die( "No SQL statement object for bcp47_currencies" ); foreach my $id ( sort( keys( %$currBCPMap ) ) ) { my $def = $currBCPMap->{ $id }; $out->print( "[${id}] " ) if( $DEBUG ); eval { $sth->execute( $id, @$def{qw( code description is_obsolete )} ); } || die( "Error adding BCP47 currency '${id}' information and ISO 4217 currency code '$def->{code}' to table bcp47_currencies: ", ( $@ || $sth->errstr ), "\n", dump( $def ) ); $out->print( "ok\n" ) if( $DEBUG ); $n++; } &log( "${n} BCP47 currencies added." ); # NOTE: Pre-loading core regional territories &log( "Pre-loading core regional territories" ); my $containersRes = $suppDoc->findnodes( '//territoryContainment/group' ) || die( "Unable to get territories containers information in ${supplemental_data_file}" ); if( !$containersRes->size ) { die( "No territories containers information could be found in ${supplemental_data_file}" ); } $n = 0; # We need to ensure the locale we use in territoryInfo exists since we need to satisfy the foreign key. # So we add the locale to the hash reference $known_locales so it can be added my $known_locales = {}; my $territoryInfo = {}; while( my $el = $containersRes->shift ) { my $code = $el->getAttribute( 'type' ); # Example: 030 (Eastern Asia) contains CN HK JP KP KR MN MO TW my $contains = $el->getAttribute( 'contains' ) || die( "Unable to get the value of contained territories in attribute 'contains' for this element: ", $el->toString() ); $contains = [split( /[[:blank:]\h\v]+/, $contains )]; # grouping, deprecated my $status = $el->getAttribute( 'status' ); $out->print( "[${code}] " ) if( $DEBUG ); # We need to ensure territory code such as 001 is treated as a string and not end up as 1 $code = sprintf( '%03d', $code ) if( $code =~ /^\d{1,2}$/ ); if( !exists( $territoryInfo->{ $code } ) ) { $territoryInfo->{ $code } = { contains => $contains, status => $status, }; $out->print( "ok\n" ) if( $DEBUG ); &log( "Pre-loaded core regional territory ${code}" ); $n++; } # There is another entry, but with attribute 'grouping' or 'status', so we add the contained territories to the stack elsif( $el->hasAttribute( 'grouping' ) || $el->hasAttribute( 'status' ) ) { $out->print( "adding territories: ", join( ', ', @$contains ), "\n" ) if( $DEBUG ); push( @{$territoryInfo->{ $code }->{contains}}, @$contains ); } else { $out->print( "ignored (duplicate?)\n" ) if( $DEBUG ); } } &log( "${n} core regional territories pre-loaded." ); # NOTE: Collecting territories data &log( "Collecting territories data." ); $n = 0; my $terrRes = $suppDoc->findnodes( '/supplementalData/territoryInfo/territory' ) || die( "Unable to get the territories information in ${supplemental_data_file}" ); if( !$terrRes->size ) { die( "No territories information found in ${supplemental_data_file}" ); } while( my $el = $terrRes->shift ) { my $code = $el->getAttribute( 'type' ); if( exists( $territoryInfo->{ $code } ) ) { die( "This territory code '${code}' seems to already exists and thus have already been defined as a container region during our previous pass." ); } $territoryInfo->{ $code } = { gdp => $el->getAttribute( 'gdp' ), literacy_percent => $el->getAttribute( 'literacyPercent' ), population => $el->getAttribute( 'population' ), }; my $langs = {}; my $langs_order = []; my @langPop = $el->getElementsByTagName( 'languagePopulation' ); foreach my $lel ( @langPop ) { my $lang = $lel->getAttribute( 'type' ); # Some languages used here use underscore. Not sure why, but we need to harmonize and standardize this with the rest of the data. $lang =~ tr/_/-/; # Make sure to add that locale to satisfy the foreign key requirement $known_locales->{ $lang } = { locale => $lang } if( !exists( $known_locales->{ $lang } ) ); $langs->{ $lang } = { population_percent => $lel->getAttribute( 'populationPercent' ), literacy_percent => $lel->getAttribute( 'literacyPercent' ), writing_percent => $lel->getAttribute( 'writingPercent' ), official_status => $lel->getAttribute( 'officialStatus' ), }; push( @$langs_order, $lang ); } $territoryInfo->{ $code }->{language_population} = $langs; $territoryInfo->{ $code }->{_langs} = $langs_order; } # NOTE: Adding missing territory codes &log( "Adding missing territory codes." ); my $missingButRequiredTerritoryCodes = { # Used in supplemental/supplementalData.xml//weekData/minDays AN => { status => 'deprecated' }, }; foreach my $code ( keys( %$missingButRequiredTerritoryCodes ) ) { if( !exists( $territoryInfo->{ $code } ) ) { $out->print( "Adding ${code}\n" ) if( $DEBUG ); $territoryInfo->{ $code } = $missingButRequiredTerritoryCodes->{ $code }; } } # NOTE: Processing territories currencies historical data to derive current currency code for each territory &log( "Pre-loading territories currency historical data." ); $n = 0; my $currencyRegionsRes = $suppDoc->findnodes( '//currencyData/region[@iso3166]' ) || die( "Unable to get the currency region nodes in file ${supplemental_data_file}" ); if( !$currencyRegionsRes->size ) { die( "No currency region nodes found in file ${supplemental_data_file}" ); } my $o = 0; my $currenciesInfo = {}; my $dtParser = DateTime::Format::Strptime->new( pattern => '%Y-%m-%d', locale => 'en_GB', time_zone => 'GMT', ); # See my $deprecatedTerritories = { # "Burma (BU) was renamed to Myanmar (MM)" BU => { status => 'deprecated' }, # Czechoslovakia CS => { status => 'deprecated' }, # "East Germany merged with West Germany, and no longer exists as "DD". DDM was an ISO-4217 code." DD => { status => 'deprecated' }, # Soviet Union. "It split into RU and several other regions." SU => { status => 'deprecated' }, # East Timor. Renamed to TL # TP => { status => 'deprecated' }, # Yemen, Democratic # YD => { status => 'deprecated' }, # Yugoslavia # YU => { status => 'deprecated' }, # "Zaire (ZR) was renamed to Democratic Republic of Congo (CD)." ZR => { status => 'deprecated' }, }; my $currencyException = { # Unknown or Invalid Territory # Currency code for transations where no currency is involved ZZ => 'XXX', }; while( my $el = $currencyRegionsRes->shift ) { my $code = $el->getAttribute( 'iso3166' ) || die( "No attribute 'iso3166' found for this element: ", $el->toString() ); $out->print( "[${code}] " ) if( $DEBUG ); my $infoRes = $el->findnodes( './currency[@iso4217]' ) || die( "Unable to get the 'currency' nodes for the region code '${code}': ", $el->toString() ); my $totalCurrInfos = $infoRes->size; if( !exists( $territoryInfo->{ $code } ) ) { if( exists( $deprecatedTerritories->{ $code } ) ) { $territoryInfo->{ $code } = $deprecatedTerritories->{ $code }; $out->print( "deprecated, and has ${totalCurrInfos} historical currency information found.\n" ) if( $DEBUG ); # This is an old territory code with no historical currency data next if( !$totalCurrInfos ); } else { die( "Territory code '${code}' used in historical currency information is unknown, not in the list of defined territories. You may want to add it to the list of exceptions in the \$deprecatedTerritories hash reference here." ); } } elsif( exists( $territoryInfo->{ $code }->{contains} ) ) { $out->print( "is a container territory: " ) if( $DEBUG ); } $currenciesInfo->{ $code } = []; # The CLDR list those historical records with the latest on top, but we do the opposite, especially since in SQL it does not matter that much. # And we use the field hist_sequence to keep track of the sequence my $seq = 0; my $activeCurrency; my $lastEndDate; while( my $el_curr = $infoRes->shift ) { my $curr = $el_curr->getAttribute( 'iso4217' ) || die( "No attribute 'iso4217' found for this territory code '${code}': ", $el_curr->toString() ); my $from = $el_curr->getAttribute( 'from' ); if( !length( $from // '' ) ) { warn( "Warning: no attribute 'from' found for this currency code '${curr}' for the territory code '${code}': " . $el_curr->toString() ); } my $to; if( $el_curr->hasAttribute( 'to' ) ) { $to = $el_curr->getAttribute( 'to' ) || die( "Attribute 'to' is defined, but is empty for this currency '${curr}' historical record for the territory '${code}': ", $el_curr->toString() ); } foreach( $from, $to ) { if( defined( $_ ) ) { if( /^(?\d{4})\D(?\d{1,2})$/ ) { warn( "Missing 'day' for '$_' for territory '${code}' and currency '${curr}', defaulting to 1: $_" ); $_ = sprintf( '%04d-%02d-%02d', $+{year}, $+{month}, 1 ); } elsif( /^(?\d{4})$/ ) { warn( "Missing 'month' and 'day' for '$_' for territory '${code}' and currency '${curr}', defaulting to 1: $_" ); $_ = sprintf( '%04d-%02d-%02d', $+{year}, 1, 1 ); } elsif( !length( $_ // '' ) ) { $_ = undef; } } } # If there is no end date for this currency, this means it is still active. if( defined( $from ) && !defined( $to ) ) { if( defined( $activeCurrency ) ) { if( defined( $lastEndDate ) ) { my $fromDt = $dtParser->parse_datetime( $from ); my $prevDt = $dtParser->parse_datetime( $lastEndDate ); if( $fromDt >= $prevDt ) { warn( "Warning: Found previously set active currency '${activeCurrency}' for this territory code '${code}', but this currency '${curr}' has a start date '${from}' higher or equal to the previous one end date '${lastEndDate}'. Using '${curr}' instead." ); $activeCurrency = $curr; } else { warn( "Warning: Found previously set active currency '${activeCurrency}' for this territory code '${code}', but this currency '${curr}' has a start date '${from}' lower than the previous one end or start date '${lastEndDate}'. Not using '${curr}' as the territory default currency." ); } } else { die( "Found currency '${curr}' with start date '", ( $from // 'undef' ), "' and no end date for this territory code '${code}', but another one is already defined: '${activeCurrency}'" ); } } else { $activeCurrency = $curr; $lastEndDate = $from; } } elsif( defined( $to ) ) { $lastEndDate = $to; # For historical currency that are singleton and are part of an deprecated territory like BU if( exists( $deprecatedTerritories->{ $code } ) && !defined( $activeCurrency ) ) { $activeCurrency = $curr; } } elsif( defined( $from ) ) { $lastEndDate = $from; } # For territories with no active currency, such as AQ elsif( !defined( $from ) && !defined( $to ) && $totalCurrInfos == 1 ) { $activeCurrency = $curr; } my $is_tender = 0; if( $el_curr->hasAttribute( 'tender' ) ) { my $this = $el_curr->getAttribute( 'tender' ) || die( "Attribute 'tender' is defined for this currency '${curr}', but its value is empty: ", $el_curr->toString() ); $this = lc( $this ); if( exists( $boolean_map->{ $this } ) ) { $is_tender = $boolean_map->{ $this }; } else { die( "Value for attribute 'tender' (${this}) for this currency '${curr}' is unsupported. I was expecting either 'true' or 'false': ", $el_curr->toString() ); } } push( @{$currenciesInfo->{ $code }}, { code => $code, currency => $curr, start => $from, until => $to, is_tender => $is_tender, hist_sequence => ++$seq, is_obsolete => ( defined( $to ) ? 1 : 0 ), }); $o++; } if( defined( $activeCurrency ) ) { $territoryInfo->{ $code }->{currency} = $activeCurrency; } elsif( exists( $currencyException->{ $code } ) ) { $territoryInfo->{ $code }->{currency} = $currencyException->{ $code }; } else { die( "No active currency found for this territory '${code}'" ); } $out->print( "ok\n" ) if( $DEBUG ); $n++; } &log( "${n} territory currencies pre-loaded for ${o} historical records" ); # NOTE: Pre-loading BCP47 calendar names &log( "Pre-loading BCP47 calendar names." ); $n = 0; my $bcp_calendar_file = $basedir->child( 'bcp47/calendar.xml' ); my $calDom = load_xml( $bcp_calendar_file ); my $calRes = $calDom->findnodes( '//keyword/key[@name="ca"]/type' ) || die( "Failed to get nodes for BCP calendars" ); my $cals_names = {}; while( my $el = $calRes->shift ) { # my $id = $el->getAttribute( 'name' ) || die( "Unable to get the attribute 'name' for this BCP47 calendar: ", $el->toString() ); my $desc = $el->getAttribute( 'description' ) || die( "Unable to get the attribute 'description' for this BCP47 calendar: ", $el->toString() ); $cals_names->{ $id } = $desc; $out->print( "[${id}] ${desc}\n" ) if( $DEBUG ); } # NOTE: Loading calendars &log( "Loading calendars." ); $n = 0; my $era_n = 0; my $calendarRes = $suppDoc->findnodes( '//calendarData/calendar' ) || die( "Unable to get the calendar information in ${supplemental_data_file}" ); if( !$calendarRes->size ) { die( "No calendar information was found in ${supplemental_data_file}" ); } $sth = $sths->{calendars} || die( "No SQL statement object for calendars" ); my $sth_era = $sths->{calendar_eras} || die( "No SQL statement object for calendar_eras" ); # Used to check the calendar associated with a territory actually exists my $calendars = {}; while( my $el = $calendarRes->shift ) { my $calendar = $el->getAttribute( 'type' ) || die( "No attribute value for 'type' in this calendar element: ", $el->toString() ); $calendars->{ $calendar }++; $out->print( "[${calendar}] " ) if( $DEBUG ); my $def = { calendar => $calendar, }; # has no child node if( $el->hasChildNodes ) { if( my $calSys = $el->findnodes( './calendarSystem/@type' )->shift ) { $def->{system} = $calSys->getValue(); } # Example: Japanese calendar -> if( my $inheritRes = $el->findnodes( './inheritEras[@calendar]' )->shift ) { $def->{inherits} = $inheritRes->getAttribute( 'calendar' ); } } if( exists( $cals_names->{ $def->{calendar} } ) ) { $def->{description} = $cals_names->{ $def->{calendar} }; } eval { $sth->execute( @$def{qw( calendar system inherits description )} ); } || die( "Error adding data for calendar code '${calendar}': ", ( $@ || $sth->errstr ), "\n", dump( $def ) ); my $erasRes = $el->findnodes( './eras/era[@type]' ); if( $erasRes->size ) { my $data = []; while( my $el_era = $erasRes->shift ) { my $def_era = { calendar => $calendar, sequence => $el_era->getAttribute( 'type' ), code => $el_era->getAttribute( 'code' ), aliases => $el_era->getAttribute( 'aliases' ), start => $el_era->getAttribute( 'start' ), until => $el_era->getAttribute( 'end' ), }; if( !defined( $def_era->{sequence} ) || !length( $def_era->{sequence} ) ) { die( "No sequence is defined for this era of the calendar '", ( $def_era->{calendar} // 'undef' ), "': ", $el_era->toString() ); } $def_era->{aliases} = [split( /[[:blank:]\h\v]+/, $def_era->{aliases} )] if( defined( $def_era->{aliases} ) ); eval { $sth_era->execute( @$def_era{qw( calendar sequence code )}, to_array( $def_era->{aliases} ), @$def_era{qw( start until )} ); } || die( "Error adding data for calendar era sequence '", ( $def_era->{sequence} // 'undef' ), "': ", ( $@ || $sth_era->errstr ), "\n", $el_era->toString(), "\n", dump( $def_era ) ); $era_n++; } } $out->print( "ok\n" ) if( $DEBUG ); } &log( "${n} calendars and ${era_n} eras added." ); $sth_era->finish; # NOTE: Adding some more calendars from main/en.xml &log( "Adding some more calendars from main/en.xml" ); $n = 0; my $en_file = $main_dir->child( 'en.xml' ) || die( "Unable to get the file object for $main_dir/en.xml" ); my $enDom = load_xml( $en_file ); my $enCalendarsRes = $enDom->findnodes( '/ldml/localeDisplayNames/types/type[@key="calendar"]' ); &log( sprintf( "\t%d calendars found in main/en.xml", $enCalendarsRes->size ) ); while( my $el = $enCalendarsRes->shift ) { my $calendar = $el->getAttribute( 'type' ) || die( "Unable to get the calendar ID for locale en in file ${en_file} for this element: ", $el->toString() ); next if( exists( $calendars->{ $calendar } ) ); $out->print( "\tAdding missing calendar ID ${calendar}: " ) if( $DEBUG ); my $def = { calendar => $calendar, description => trim( $el->textContent ), }; eval { $sth->execute( @$def{qw( calendar system inherits description )} ); } || die( "Error adding data for calendar code '${calendar}': ", ( $@ || $sth->errstr ), "\n", dump( $def ) ); $out->print( "ok\n" ) if( $DEBUG ); $calendars->{ $calendar }++; $n++; } &log( "${n} additional calendar IDs added." ); # NOTE: Loading territories calendars preferences &log( "Loading territories calendars preferences." ); $n = 0; my $calPrefsRes = $suppDoc->findnodes( '//calendarPreferenceData/calendarPreference' ) || die( "Unable to get calendar preferences information in ${supplemental_data_file}" ); if( !$calPrefsRes->size ) { die( "No calendar preferences information found in ${supplemental_data_file}" ); } while( my $el = $calPrefsRes->shift ) { my $codes = $el->getAttribute( 'territories' ) || die( "No attribute 'territories' found for this calendar preferences element: ", $el->toString() ); my $cals = $el->getAttribute( 'ordering' ) || die( "No attribute 'ordering' found for this calendar preferences element: ", $el->toString() ); foreach( $codes, $cals ) { $_ = [split( /[[:blank:]\h\v]+/, $_ )]; } # Check calendars actually exist foreach my $cal ( @$cals ) { if( !exists( $calendars->{ $cal } ) ) { die( "Calendar '${cal}' is unknown. Used in element: ", $el->toString() ); } } foreach my $code ( @$codes ) { # 'World' is used to define a default value. That default value is set in the SQL schema # if( $code eq '001' ) # { # next; # } # elsif( !exists( $territoryInfo->{ $code } ) ) if( !exists( $territoryInfo->{ $code } ) ) { die( "Calendar preference for territory '${code}', but this territory is not defined in CLDR." ); } $territoryInfo->{ $code }->{calendars} = $cals; $n++; } } &log( "${n} territories were associated calendar preferences. The rest defaults to 'gregorian'" ); # NOTE: Loading week data &log( "Loading week data." ); my $week_map = [ { xpath => '//weekData/minDays', attribute => 'count', property => 'min_days' }, { xpath => '//weekData/firstDay', attribute => 'day', property => 'first_day' }, { xpath => '//weekData/weekendStart', attribute => 'day', property => 'weekend', is_array => 1 }, { xpath => '//weekData/weekendEnd', attribute => 'day', property => 'weekend', is_array => 1 }, ]; my $day_map = { mon => 1, tue => 2, wed => 3, thu => 4, fri => 5, sat => 6, sun => 7, }; foreach my $def ( @$week_map ) { my $data = $def->{data}; my $weekRes = $suppDoc->findnodes( $def->{xpath} ) || die( "Unable to get week data for xpath $def->{xpath} in ${supplemental_data_file}" ); if( !$weekRes->size ) { die( "No week data found for xpath $def->{xpath} in ${supplemental_data_file}" ); } while( my $el = $weekRes->shift ) { # Ignore an exception made just for one case.... # if( $el->hasAttribute( 'alt' ) ) { next; } my $val = $el->getAttribute( $def->{attribute} ) || die( "No attribute value '$def->{attribute}' for this element: ", $el->toString() ); if( exists( $day_map->{ lc( $val ) } ) ) { $val = $day_map->{ lc( $val ) }; } my $codes = $el->getAttribute( 'territories' ) || die( "No attribute value 'territories' for this element: ", $el->toString() ); $codes = trim( $codes ); $codes = [split( /[[:blank:]\h\v]+/, $codes )]; foreach my $code ( @$codes ) { # This is used by CLDR to define the default value, and we define the default value in the SQL schema # next if( $code eq '001' ); if( !exists( $territoryInfo->{ $code } ) ) { die( "Unknown territory code '${code}' for this element: ", $el->toString() ); } elsif( exists( $def->{property} ) ) { if( $def->{is_array} ) { unless( exists( $territoryInfo->{ $code }->{ $def->{property} } ) && ref( $territoryInfo->{ $code }->{ $def->{property} } ) eq 'ARRAY' ) { $territoryInfo->{ $code }->{ $def->{property} } = []; } push( @{$territoryInfo->{ $code }->{ $def->{property} }}, $val ); } else { $territoryInfo->{ $code }->{ $def->{property} } = $val; } } else { die( "No property value set for this week data in our internal map!" ); } } } } # NOTE: Loadding missing territories from the known territories dictionary &log( "Loadding missing territories from the known territories dictionary." ); $n = 0; foreach my $code ( sort( keys( %{$known->{territories}} ) ) ) { $out->print( "[${code}] " ) if( $DEBUG ); if( exists( $territoryInfo->{ $code } ) ) { if( $territoryInfo->{ $code }->{status} ) { if( $territoryInfo->{ $code }->{status} ne $known->{territories}->{ $code }->{status} ) { die( "A status with value '", $territoryInfo->{ $code }->{status}, " is already set for territory '${code}', but it does not match with that from the known territories dictionary: '", $known->{territories}->{ $code }->{status}, "'" ); } } else { $territoryInfo->{ $code }->{status} = $known->{territories}->{ $code }->{status}; $out->print( "ok, status set.\n" ) if( $DEBUG ); } } else { $territoryInfo->{ $code } = $known->{territories}->{ $code }; $territoryInfo->{ $code }->{territory} = $code; $out->print( "ok, added missing\n" ) if( $DEBUG ); $n++; } } &log( sprintf( "%d territories missing (%.2f%%) added out of %d\n", $n, ( ( $n / scalar( keys( %{$known->{territories}} ) ) ) * 100 ), scalar( keys( %{$known->{territories}} ) ) ) ); # NOTE: Loading possible additional territories from the locale data files &log( "Loading possible additional territories from the locale data files." ); $n = 0; my $territoriesFromLocaleDataRes = $engLocaleDoc->findnodes( '//localeDisplayNames/territories/territory' ) || die( "Unable to get territories data from locale data file ${eng_locale_data_file}" ); &log( sprintf( "Processing %d territories from ${eng_locale_data_file}", $territoriesFromLocaleDataRes->size ) ); while( my $el = $territoriesFromLocaleDataRes->shift ) { # Example: Japan my $code = $el->getAttribute( 'type' ) || die( "Unable to get the territory code from the 'type' attribute in this element: ", $el->toString() ); if( !exists( $territoryInfo->{ $code } ) ) { $territoryInfo->{ $code } = { territory => $code }; $n++; } } &log( "${n} additional territories added to known territories." ); my $territory_parent_lookup = sub { my $code = shift( @_ ); # We sort so we get the 3-digits regions first foreach my $region ( sort( keys( %$territoryInfo ) ) ) { my $def = $territoryInfo->{ $region }; if( exists( $def->{contains} ) && defined( $def->{contains} ) && ref( $def->{contains} ) eq 'ARRAY' ) { if( scalar( grep( $_ eq $code, @{$def->{contains}} ) ) ) { return( $region ); } } } return; }; # NOTE: Loading territories &log( "Loading territories." ); $n = 0; $sth = $sths->{territories} || die( "No SQL statement object for territories" ); foreach my $code ( sort( keys( %$territoryInfo ) ) ) { my $def = $territoryInfo->{ $code }; # This territory languages sorted by usage popularity # Too unreliable # my $langs = [sort{ $def->{language_population}->{ $b }->{population_percent} <=> $def->{language_population}->{ $a }->{population_percent} } keys( %{$def->{language_population}} )]; # A private property in which we stored the language in the order it was in the XML file my $langs = $def->{_langs}; $out->print( "[${code}] " ) if( $DEBUG ); if( !defined( $def->{parent} ) || !length( $def->{parent} // '' ) ) { my $region = $territory_parent_lookup->( $code ); if( defined( $region ) ) { $def->{parent} = $region; $out->print( "parent set to ${region} " ) if( $DEBUG ); } else { $out->print( "no parent region found " ) if( $DEBUG ); } } eval { $code = sprintf( '%03d', $code ) if( $code =~ /^\d{1,2}$/ ); $def->{parent} = sprintf( '%03d', $def->{parent} ) if( defined( $def->{parent} ) && $def->{parent} =~ /^\d{1,2}$/ ); $sth->bind_param( 1, "$code", SQL_VARCHAR ); $sth->bind_param( 2, $def->{parent}, SQL_VARCHAR ); $sth->bind_param( 3, $def->{gdp}, SQL_INTEGER ); $sth->bind_param( 4, $def->{literacy_percent}, SQL_FLOAT ); $sth->bind_param( 5, $def->{population}, SQL_INTEGER ); $sth->bind_param( 6, to_array( $langs ), SQL_VARCHAR ); $sth->bind_param( 7, to_array( $def->{contains} ), SQL_VARCHAR ); $sth->bind_param( 8, $def->{currency}, SQL_VARCHAR ); $sth->bind_param( 9, to_array( $def->{calendars} ), SQL_VARCHAR ); $sth->bind_param( 10, $def->{min_days}, SQL_INTEGER ); $sth->bind_param( 11, $def->{first_day}, SQL_INTEGER ); $sth->bind_param( 12, to_array( $def->{weekend} ), SQL_VARCHAR ); $sth->bind_param( 13, $def->{status}, SQL_VARCHAR ); $sth->execute; } || die( "Error adding data for country code '${code}': ", ( $@ || $sth->errstr ), "\n", dump( $def ) ); $out->print( "ok\n" ) if( $DEBUG ); $n++; } &log( "${n} territories added." ); # NOTE: Loading territories currency historical data &log( "Loading territories currency historical data." ); $n = 0; $o = 0; $sth = $sths->{currencies_info} || die( "No SQL statement object for currencies_info" ); foreach my $code ( sort( keys( %$currenciesInfo ) ) ) { $n++; for( my $i = 0; $i < scalar( @{$currenciesInfo->{ $code }} ); $i++ ) { my $def = $currenciesInfo->{ $code }->[$i]; eval { $sth->execute( @$def{qw( code currency start until is_tender hist_sequence is_obsolete )} ); } || die( "Error adding currency historical data for currency code '$def->{currency}' and territory code '${code}': ", ( $@ || $sth->errstr ), "\n", dump( $def ) ); $o++; } } &log( "${n} territory currencies added for ${o} historical records" ); # NOTE: Pre-loading language information &log( "Pre-loading language information." ); my $languageDataRes = $suppDoc->findnodes( '//languageData/language' ) || die( "Unable to get language information in ${supplemental_data_file}" ); if( !$languageDataRes->size ) { die( "No language information found in ${supplemental_data_file}" ); } my $known_langs = {}; while( my $el = $languageDataRes->shift ) { my $lang = $el->getAttribute( 'type' ) || die( "No type attribute set for this language tag" ); my $def = { language => $lang, territories => [split( /[[:blank:]\h\v]+/, ( $el->getAttribute( 'territories' ) || '' ) )], scripts => [split( /[[:blank:]\h\v]+/, ( $el->getAttribute( 'scripts' ) || '' ) )], alt => $el->getAttribute( 'alt' ), }; if( exists( $known_langs->{ $lang } ) ) { if( !$def->{alt} && !$known_langs->{ $lang }->{alt} ) { die( "Redefining language '${lang}', but neither the previous entry nor this one has an 'alt' attribute set, which would normally make them distinctive. Previous entry: ", dump( $known_langs->{ $lang } ), "\nCurrent entry: ", dump( $def ) ); } $known_langs->{ $lang } = [$known_langs->{ $lang }]; push( @{$known_langs->{ $lang }}, $def ); } else { $known_langs->{ $lang } = $def; } my $hasScripts = scalar( @{$def->{scripts}} ); if( scalar( @{$def->{territories}} ) ) { foreach my $territory ( @{$def->{territories}} ) { if( $hasScripts ) { foreach my $sc ( @{$def->{scripts}} ) { $known_locales->{ "${lang}-${sc}-${territory}" } = { locale => "${lang}-${sc}-${territory}" }; } } else { $known_locales->{ "${lang}-${territory}" } = { locale => "${lang}-${territory}" }; } } } elsif( $hasScripts ) { foreach my $sc ( @{$def->{scripts}} ) { $known_locales->{ "${lang}-${sc}" } = { locale => "${lang}-${sc}" }; } } else { $known_locales->{ $lang } = { locale => $lang }; } } # NOTE: Adding missing languages &log( "Adding missing languages." ); $n = 0; foreach my $code ( sort( keys( %{$known->{languages}} ) ) ) { $out->print( "[${code}] " ) if( $DEBUG ); if( !exists( $known_langs->{ $code } ) ) { $known_langs->{ $code } = { language => $code }; $n++; $out->print( "added." ) if( $DEBUG ); } if( ref( $known_langs->{ $code } ) eq 'ARRAY' ) { foreach my $this ( @{$known_langs->{ $code }} ) { $this->{status} = $known->{languages}->{ $code }->{status}; } } else { $known_langs->{ $code }->{status} = $known->{languages}->{ $code }->{status}; } $out->print( "\n" ) if( $DEBUG ); } &log( sprintf( "%d languages added (%.2f%%) out of %d", $n, ( ( $n / scalar( keys( %{$known->{languages}} ) ) ) * 100 ), scalar( keys( %{$known->{languages}} ) ) ) ); # NOTE: Processing language groups to derive parent (iso-639-5) &log( "Processing language groups to derive parent (iso-639-5)." ); # See $n = 0; my $lang_group_file = $basedir->child( 'supplemental/languageGroup.xml' ); my $langGroupDoc = load_xml( $lang_group_file ); my $langGroupRes = $langGroupDoc->findnodes( '/supplementalData/languageGroups/languageGroup[@parent]' ) || die( "Unable to get language groups from ${lang_group_file}" ); &log( $langGroupRes->size, " language groups found." ); while( my $el = $langGroupRes->shift ) { my $parent = $el->getAttribute( 'parent' ); my $data = trim( $el->textContent ); my $langs = [split( /[[:blank:]\h\v]+/, $data )]; foreach my $lang ( @$langs ) { if( exists( $known_langs->{ $lang } ) ) { if( ref( $known_langs->{ $lang } ) eq 'ARRAY' ) { foreach my $this ( @{$known_langs->{ $lang }} ) { $this->{parent} = $parent; } } else { $known_langs->{ $lang }->{parent} = $parent; } $n++; } else { die( "Unknown language found \"${lang}\" with parent \"${parent}\". This means it did not exist in ", $known_data->{languages}->{file} ); } } } &log( "${n} languages allocated a parent." ); # NOTE: Loading language information &log( "Loading language information." ); $n = 0; $sth = $sths->{languages} || die( "No SQL statement object for languages" ); foreach my $lang ( sort( keys( %$known_langs ) ) ) { $out->print( "[${lang}] " ) if( $DEBUG ); # my $def = $known_langs->{ $lang }; my $defs = ref( $known_langs->{ $lang } ) eq 'ARRAY' ? $known_langs->{ $lang } : [$known_langs->{ $lang }]; foreach my $def ( @$defs ) { eval { $sth->execute( $lang, to_array( $def->{scripts} ), to_array( $def->{territories} ), @$def{qw( parent alt status )} ); } || die( "Error adding data for language '${lang}': ", ( $@ || $sth->errstr ), "\n", dump( $def ) ); } $out->print( "ok\n" ) if( $DEBUG ); $n++; } &log( "${n} languages added." ); my $possibly_missing = { 'zh-Hant-TW' => { language => 'zh', scripts => ['Hant'], territories => ['TW'] }, 'zh-TW' => { language => 'zh', territories => ['TW'] }, 'zh-HK' => { language => 'zh', territories => ['HK'] }, 'zh-MO' => { language => 'zh', territories => ['MO'] }, 'zh-SG' => { language => 'zh', territories => ['SG'] }, }; # NOTE: Adding possibly missing languages &log( "Adding possibly missing languages." ); $n = 0; foreach my $lang ( sort( keys( %$possibly_missing ) ) ) { my $def = $possibly_missing->{ $lang }; $out->print( "[${lang}] " ) if( $DEBUG ); eval { $sth->execute( $def->{language}, to_array( $def->{scripts} ), to_array( $def->{territories} ), @$def{qw( parent alt status )} ); } || die( "Error adding additional language '${lang}': ", ( $@ || $sth->errstr ), "\n", dump( $def ) ); $out->print( "ok\n" ) if( $DEBUG ); $n++ if( $sth->rows ); $known_locales->{ $lang } = { locale => $lang }; } &log( "${n} additional languages added." ); # NOTE: Loading known locales &log( "Loading known locales" ); my $supp_meta_file = $basedir->child( 'supplemental/supplementalMetadata.xml' ); my $metaDoc = load_xml( $supp_meta_file ); my $metaDefaultLangsRes = $metaDoc->findnodes( '/supplementalData/metadata/defaultContent/@locales' )->shift || die( "Unable to get the known locales in ${supp_meta_file}" ); my $default_locales = trim( $metaDefaultLangsRes->getValue() ); my @defaultLangs = split( /[[:blank:]\h\v]+/, $default_locales ); scalar( @defaultLangs ) || die( "No default locales found in file $supp_meta_file" ); foreach my $locale ( @defaultLangs ) { $locale =~ tr/_/-/; # Should not be needed but better safe than sorry if( index( $locale, 'root' ) != -1 ) { if( length( $locale ) > 4 ) { my $loc = Locale::Unicode->new( $locale ); $loc->language( 'und' ); $locale = $loc->as_string; } else { $locale = 'und'; } } $known_locales->{ $locale } = { locale => $locale }; } # NOTE: Adding more locales from the known languages &log( "Adding more locales from the known languages." ); $n = 0; foreach my $code ( sort( keys( %{$known->{languages}} ) ) ) { if( !exists( $known_locales->{ $code } ) ) { $known_locales->{ $code } = { locale => $code }; $n++; } $known_locales->{ $code }->{status} = $known->{languages}->{ $code }->{status}; } &log( sprintf( "%d languages were added as locales (%.2f%%) out of %d", $n, ( ( $n / scalar( keys( %{$known->{languages}} ) ) ) * 100 ), scalar( keys( %{$known->{languages}} ) ) ) ); # NOTE: Adding even more locales from the locale data files &log( "Adding even more locales from the locale data files." ); $n = 0; my $localesFromLocaleDataRes = $engLocaleDoc->findnodes( '//localeDisplayNames/languages/language' ) || die( "Unable to get locales data from locale data file ${eng_locale_data_file}" ); &log( sprintf( "Processing %d locales from ${eng_locale_data_file}", $localesFromLocaleDataRes->size ) ); while( my $el = $localesFromLocaleDataRes->shift ) { # Example: Japanese my $code = $el->getAttribute( 'type' ) || die( "Unable to get the locale from the 'type' attribute in this element: ", $el->toString() ); $code =~ tr/_/-/; if( !exists( $known_locales->{ $code } ) ) { $known_locales->{ $code } = { locale => $code }; $n++; } } &log( "${n} additional locales added to known locales." ); # NOTE: Adding possibly missing locales from the the main identities &log( "Adding possibly missing locales from the the main identities." ); $n = 0; $main_dir->open || die( $main_dir->error ); # while( my $f = $main_dir->read( as_object => 1, exclude_invisible => 1 ) ) @files = $main_dir->read( as_object => 1, exclude_invisible => 1, 'sort' => 1 ); foreach my $f ( @files ) { next unless( $f->extension eq 'xml' ); my $basename = $f->basename; my $mainDoc = load_xml( $f ); my $locale = identity_to_locale( $mainDoc ); ( my $locale2 = $f->basename( '.xml' ) ) =~ tr/_/-/; if( lc( $locale ) ne lc( $locale2 ) && $locale2 ne 'root' ) { warn( "XML identity says the locale is '${locale}', but the file basename says it should be '${locale2}', and I think the file basename is correct for file $f" ); $locale = $locale2; } if( index( $locale, 'root' ) != -1 ) { if( length( $locale ) > 4 ) { my $loc = Locale::Unicode->new( $locale ); $loc->language( 'und' ); $locale = $loc->as_string; } else { $locale = 'und'; } } $out->print( "[${basename}] -> ${locale} " ) if( $DEBUG ); if( !exists( $known_locales->{ $locale } ) ) { $known_locales->{ $locale } = { locale => $locale }; $out->print( "added." ) if( $DEBUG ); $n++; } $out->print( "\n" ) if( $DEBUG ); } &log( "${n} additional locales added to known locales." ); $main_dir->close; # NOTE: Adding parent information to locales &log( "Adding parent information to locales." ); my $localesParentsRes = $suppDoc->findnodes( '/supplementalData/parentLocales/parentLocale' ); if( !$localesParentsRes->size ) { die( "No locale parent information found in supplemental data file ${supplemental_data_file} in xpath /supplementalData/parentLocales/parentLocale" ); } $n = 0; while( my $el = $localesParentsRes->shift ) { my $parent = $el->getAttribute( 'parent' ) || die( "No 'parent' value found in attribute 'parent' for this locale parent in element: ", $el->toString ); # Standardise the locale as per the standard $parent =~ tr/_/-/; if( index( $parent, 'root' ) != -1 ) { if( length( $parent ) > 4 ) { my $loc = Locale::Unicode->new( $parent ); $loc->language( 'und' ); $parent = $loc->as_string; } else { $parent = 'und'; } } my $locales = $el->getAttribute( 'locales' ) || die( "No list of locales associated with parent ${parent} found in attribute 'locales' for this locale parent in element: ", $el->toString ); $locales = trim( $locales ); $locales = [split( /[[:blank:]\h]+/, $locales )]; $out->printf( "[${parent}] for %d child locales ", scalar( @$locales ) ) if( $DEBUG ); foreach my $locale ( @$locales ) { # Standardise the locale as per the standard $locale =~ tr/_/-/; if( index( $locale, 'root' ) != -1 ) { if( length( $locale ) > 4 ) { my $loc = Locale::Unicode->new( $locale ); $loc->language( 'und' ); $locale = $loc->as_string; } else { $locale = 'und'; } } if( !exists( $known_locales->{ $locale } ) ) { warn( "Warning only: unknown locale '${locale}' (adding it now) to set its parent locale in supplemental data file ${supplemental_data_file} for this element: ", $el->toString ); $known_locales->{ $locale } = { locale => $locale }; } $known_locales->{ $locale }->{parent} = $parent; } $out->print( "ok\n" ) if( $DEBUG ); } # NOTE: Adding collations information to locales &log( "Adding collations information to locales." ); $n = 0; $collation_dir->open || die( $collation_dir->error ); # while( my $f = $main_dir->read( as_object => 1, exclude_invisible => 1 ) ) @files = $collation_dir->read( as_object => 1, exclude_invisible => 1, 'sort' => 1 ); foreach my $f ( @files ) { next unless( $f->extension eq 'xml' ); my $basename = $f->basename; my $collationDoc = load_xml( $f ); my $locale = identity_to_locale( $collationDoc ); ( my $locale2 = $f->basename( '.xml' ) ) =~ tr/_/-/; if( lc( $locale ) ne lc( $locale2 ) && $locale2 ne 'root' ) { warn( "XML identity says the locale is '${locale}', but the file basename says it should be '${locale2}', and I think the file basename is correct for file $f" ); $locale = $locale2; } if( index( $locale, 'root' ) != -1 ) { if( length( $locale ) > 4 ) { my $loc = Locale::Unicode->new( $locale ); $loc->language( 'und' ); $locale = $loc->as_string; } else { $locale = 'und'; } } $out->print( "[${basename}] -> ${locale} " ) if( $DEBUG ); my $collationTypesRes = $collationDoc->findnodes( '/ldml/collations/collation[@type]' ); if( !$collationTypesRes->size ) { $out->print( "\tnothing found. This locale inherits collation from root (und)\n" ) if( $DEBUG ); next; } my @collations = (); while( my $el = $collationTypesRes->shift ) { my $name = $el->getAttribute( 'type' ); if( !length( $name // '' ) ) { warn( "The locale ${locale} is missing the 'type' attribute for collation in file ${f}." ); next; } push( @collations, $name ); } if( !exists( $known_locales->{ $locale } ) ) { $known_locales->{ $locale } = { locale => $locale }; } if( scalar( @collations ) ) { @collations = uniq( @collations ); $known_locales->{ $locale }->{collations} = \@collations; $out->print( "ok" ); $n++; } else { $out->print( "nothing found" ); } $out->print( "\n" ) if( $DEBUG ); } &log( "${n} locales were added collation information." ); $collation_dir->close; $n = 0; $sth = $sths->{locales} || die( "No SQL statement object for locales" ); foreach my $locale ( sort( keys( %$known_locales ) ) ) { my $def = { locale => $locale, }; if( ref( $known_locales->{ $locale } ) eq 'HASH' ) { $def->{parent} = $known_locales->{ $locale }->{parent}; $def->{status} = $known_locales->{ $locale }->{status}; } if( exists( $known_locales->{ $locale }->{collations} ) && defined( $known_locales->{ $locale }->{collations} ) && ref( $known_locales->{ $locale }->{collations} ) eq 'ARRAY' ) { $def->{collations} = to_array( $known_locales->{ $locale }->{collations} ); } $out->print( "[${locale}] " ) if( $DEBUG ); eval { $sth->execute( @$def{qw( locale parent collations status )} ); } || die( "Error adding locale '${locale}' to table 'locales': ", ( $@ || $sth->errstr ), "\nfor SQL query $sth->{Statement}", "\n", dump( $def ) ); $out->print( "ok\n" ) if( $DEBUG ); $n++; } &log( "${n} locales added." ); # NOTE: Loading likely subtags &log( "Loading likely subtags." ); my $likely_file = $basedir->child( 'supplemental/likelySubtags.xml' ); my $likelyDoc = load_xml( $likely_file ); my $rulesRes = $likelyDoc->findnodes( '//likelySubtags/likelySubtag' ) || die( "Unable to get the likely subtags in ${likely_file}" ); if( !$rulesRes->size ) { die( "No likely subtags found in ${likely_file}" ); } $sth = $sths->{likely_subtags} || die( "No SQL statement object for likely_subtags" ); # This is going to take some memory... my $likely = {}; $n = 0; while( my $el = $rulesRes->shift ) { my $locale = $el->getAttribute( 'from' ) || die( "No 'from' attribute found!" ); my $target = $el->getAttribute( 'to' ) || die( "No 'to' attribute found!" ); my $this = $el->nextNonBlankSibling; my $comment; # Example: if( $this && $this->isa( 'XML::LibXML::Comment' ) ) { my $data = $this->data; if( $data =~ /\{[[:blank:]\h]*(?[^\}]+)\}[[:blank:]\h]*\=\>[[:blank:]\h]*\{[[:blank:]\h]*(?[^\}]+)\}/ ) { $comment = "from $+{from} to $+{to}"; } } $locale =~ tr/_/-/; $target =~ tr/_/-/; $out->print( "[${locale} -> ${target} ", ( defined( $comment ) ? "($comment) " : '' ) ) if( $DEBUG ); eval { $sth->execute( $locale, $target ); } || die( "Error adding likely subtags rule for locale '${locale}' and target '${target}': ", ( $@ || $sth->errstr ) ); $out->print( "ok\n" ) if( $DEBUG ); $n++; $likely->{ $locale } = $target; } # Ref: # 1 - Web Rank: # The approximate rank of this script from a large sample of the web, # in terms of the number of characters found in that script. # Below 32 the ranking is not statistically significant. # 2 - Sample Character: # A sample character for use in "Last Resort" style fonts. # For printing the combining mark for Zinh in a chart, U+25CC can be prepended. # See http://unicode.org/policies/lastresortfont_eula.html # 3 - Origin country: # The approximate area where the script originated, expressed as a BCP47 region code. # 4 - Density: # The approximate information density of characters in this script, based on comparison of bilingual texts. # 5 - ID Usage: # The usage for IDs (tables 4-7) according to UAX #31. # For a description of values, see # http://unicode.org/reports/tr31/#Table_Candidate_Characters_for_Exclusion_from_Identifiers # 6 - RTL: # YES if the script is RTL # Derived from whether the script contains RTL letters according to the Bidi_Class property # 7 - LB letters: # YES if the major languages using the script allow linebreaks between letters (excluding hyphenation). # Derived from LB property. # 8 - Shaping Required: # YES if shaping is required for the major languages using that script for NFC text. # This includes not only ligation (and Indic conjuncts), Indic vowel splitting/reordering, and # Arabic-style contextual shaping, but also cases where NSM placement is required, like Thai. # MIN if NSM placement is sufficient, not the more complex shaping. # The NSM placement may only be necessary for some major languages using the script. # 9 - Input Method Engine Required: # YES if the major languages using the script require IMEs. # In particular, users (of languages for that script) would be accustomed to using IMEs (such as Japanese) # and typical commercial products for those languages would need IME support in order to be competitive. # 10- Cased # YES if in modern (or most recent) usage case distinctions are customary. # NOTE: Pre-loading scripts &log( "Pre-loading scripts." ); $n = 0; my $scripts_file = $basedir->child( 'properties/scriptMetadata.txt' ); my $known_scripts = {}; my $script_fh = $scripts_file->open( '<', { binmode => ':utf8' }) || die( "Unable to open $scripts_file in read mode: ", $scripts_file->error ); my $bool_map = { 'NO' => 0, 'YES' => 1, 'UNKNOWN' => undef, 'MIN' => 'MIN', }; my @bool_fields = qw( rtl lb_letters has_case shaping_req ime ); my @script_fields = qw( script rank sample_char origin_country density id_usage rtl lb_letters shaping_req ime has_case ); my $lineno = 0; while( defined( my $l = $script_fh->getline ) ) { ++$lineno; next if( $l =~ /^[[:blank:]\h]*(?:\Z|\#)/ ); chomp( $l ); # Remove any possible trailing comment $l =~ s/[[:blank:]\h]+\#(?:.*?)$// if( index( $l, '#' ) != -1 ); my @values = split( /[[:blank:]\h]*\;[[:blank:]\h]*/, $l, -1 ); # likelyLanguage my $def = {}; @$def{ @script_fields } = @values; # Ensure standard formatting $def->{script} = ucfirst( lc( $def->{script} ) ); $out->print( "[$def->{script}] " ) if( $DEBUG ); if( scalar( @values ) != scalar( @script_fields ) ) { die( "Incorrect number of columns retrieved (", scalar( @values ), ") where ", scalar( @script_fields ), " were expected at line $lineno in file $scripts_file" ); } foreach my $bool_field ( @bool_fields ) { if( !defined( $def->{ $bool_field } ) || !length( $def->{ $bool_field } ) ) { $def->{ $bool_field } = undef; } elsif( exists( $bool_map->{ $def->{ $bool_field } } ) ) { $def->{ $bool_field } = $bool_map->{ $def->{ $bool_field } }; } else { die( "Unknown value '", $def->{ $bool_field }, "' for boolean field '${bool_field}' for script '$def->{script}' in file ${scripts_file}" ); } } # Find out the likely language my $likely_lang = 'und-' . $def->{script}; if( exists( $likely->{ $likely_lang } ) ) { $def->{likely_language} = [split( /-/, $likely->{ $likely_lang } )]->[0]; } $known_scripts->{ $def->{script} } = $def; $out->print( "ok\n" ) if( $DEBUG ); $n++; } &log( "${n} scripts pre-loaded." ); # NOTE: Adding possible missing scripts &log( "Adding possible missing scripts." ); $n = 0; foreach my $code ( sort( keys( %{$known->{scripts}} ) ) ) { if( !exists( $known_scripts->{ $code } ) ) { $known_scripts->{ $code } = { script => $code }; $n++; } $known_scripts->{ $code }->{status} = $known->{scripts}->{ $code }->{status}; } &log( sprintf( "%d missing scripts added (%.2f%%) out of %d", $n, ( ( $n / scalar( keys( %{$known->{scripts}} ) ) ) * 100 ), scalar( keys( %{$known->{scripts}} ) ) ) ); # NOTE: Adding even more scripts from the locale data files &log( "Adding even more scripts from the locale data files." ); $n = 0; my $scriptsFromLocaleDataRes = $engLocaleDoc->findnodes( '//localeDisplayNames/scripts/script' ) || die( "Unable to get scripts data from locale data file ${eng_locale_data_file}" ); &log( sprintf( "Processing %d scripts from ${eng_locale_data_file}", $scriptsFromLocaleDataRes->size ) ); while( my $el = $scriptsFromLocaleDataRes->shift ) { # Example: my $code = $el->getAttribute( 'type' ) || die( "Unable to get the script from the 'type' attribute in this element: ", $el->toString() ); $code = ucfirst( lc( $code ) ); if( !exists( $known_scripts->{ $code } ) ) { $known_scripts->{ $code } = { script => $code }; $n++; } } &log( "${n} additional scripts added to known scripts." ); # NOTE: Loading scripts &log( "Loading scripts." ); $sth = $sths->{scripts} || die( "No SQL statement object for scripts" ); $n = 0; foreach my $code ( sort( keys( %$known_scripts ) ) ) { my $def = $known_scripts->{ $code }; $out->print( "[$def->{script}] " ) if( $DEBUG ); eval { $sth->execute( @$def{qw( script rank sample_char id_usage rtl lb_letters has_case shaping_req ime density origin_country likely_language status )} ); } || die( "Error at line ${lineno} adding script '$def->{script}' information to table 'scripts': ", ( $@ || $sth->errstr ), "\n", dump( $def ) ); $out->print( "ok\n" ) if( $DEBUG ); $n++; } &log( "${n} scripts added." ); %$known_scripts = (); # NOTE: Pre-loading variants &log( "Pre-loading variants." ); $n = 0; my $known_variants = {}; my $variantsFromLocaleDataRes = $engLocaleDoc->findnodes( '//localeDisplayNames/variants/variant' ) || die( "Unable to get variants data from locale data file ${eng_locale_data_file}" ); if( !$variantsFromLocaleDataRes->size ) { die( "No variant nodes found in file ${eng_locale_data_file}" ); } &log( sprintf( "Processing %d variants from ${eng_locale_data_file}", $variantsFromLocaleDataRes->size ) ); while( my $el = $variantsFromLocaleDataRes->shift ) { # Example: Valencian my $code = $el->getAttribute( 'type' ) || die( "Unable to get the variant from the 'type' attribute in this element: ", $el->toString() ); $code = lc( $code ); $known_variants->{ $code } = { variant => $code }; $n++; } &log( "${n} variants pre-loaded." ); # NOTE: Adding even more variants from the locale data files &log( "Adding even more variants from the locale data files." ); $n = 0; &log( "${n} additional variants added to known variants." ); # NOTE: Adding possibly missing variants &log( "Adding possibly missing variants." ); $n = 0; foreach my $code ( sort( keys( %{$known->{variants}} ) ) ) { if( !exists( $known_variants->{ $code } ) ) { $known_variants->{ $code } = { variant => $code }; $n++; } $known_variants->{ $code }->{status} = $known->{variants}->{ $code }->{status}; } &log( sprintf( "%d missing variants added (%.2f%%) out of %d", $n, ( ( $n / scalar( keys( %{$known->{variants}} ) ) ) * 100 ), scalar( keys( %{$known->{variants}} ) ) ) ); # NOTE: Loading variants &log( "Loading variants." ); $n = 0; $sth = $sths->{variants} || die( "No SQL statement object for variants" ); foreach my $code ( sort( keys( %$known_variants ) ) ) { my $def = $known_variants->{ $code }; my $status = $def->{status}; $out->print( "[${code}] " ) if( $DEBUG ); eval { $sth->execute( $code, $status ); } || die( "Error adding variant information for variant '${code}': ", ( $@ || $sth->errstr ) ); $out->print( "ok\n" ) if( $DEBUG ); $n++; } &log( "${n} variants added." ); %$known_variants = (); # NOTE: Loading language population &log( "Loading language population" ); $n = 0; $sth = $sths->{language_population} || die( "No SQL statement object for language_population" ); foreach my $code ( sort( keys( %$territoryInfo ) ) ) { my $def = $territoryInfo->{ $code }; if( !exists( $def->{language_population} ) ) { next; } elsif( ref( $def->{language_population} ) ne 'HASH' ) { die( "Property 'language_population' value for territory code '${code}' exists, but is not an hash reference in $supplemental_data_file" ); } $out->print( "[${code}] " ) if( $DEBUG ); my $langpop = $def->{language_population}; foreach my $locale ( sort{ $langpop->{ $b }->{population_percent} <=> $langpop->{ $a }->{population_percent} } keys( %$langpop ) ) { my $this = $langpop->{ $locale }; eval { $sth->execute( $code, $locale, @$this{qw( population_percent literacy_percent writing_percent official_status )} ); } || die( "Error adding language population information for territory '${code}' and locale '${locale}': ", ( $@ || $sth->errstr ), "\n", dump( $this ) ); $out->print( "${locale} " ) if( $DEBUG ); $n++; } $out->print( "\n" ) if( $DEBUG ); } &log( "${n} language populations added." ); # NOTE: Loading aliases &log( "Processing aliases." ); $sth = $sths->{aliases} || die( "No SQL statement object for aliases" ); my $alias_map = [ { xpath => '//alias/languageAlias', type => 'language' }, { xpath => '//alias/scriptAlias', type => 'script' }, { xpath => '//alias/territoryAlias', type => 'territory' }, { xpath => '//alias/subdivisionAlias', type => 'subdivision' }, { xpath => '//alias/variantAlias', type => 'variant' }, { xpath => '//alias/zoneAlias', type => 'zone' }, ]; foreach my $def ( @$alias_map ) { my $type = $def->{type}; &log( "Loading ${type} aliases." ); $n = 0; my $aliasRes = $metaDoc->findnodes( $def->{xpath} ) || die( "Unable to get ${type} aliases in ${supp_meta_file}" ); if( !$aliasRes->size ) { die( "No alias ${type} found in ${supp_meta_file}" ); } while( my $el = $aliasRes->shift ) { my $alias = $el->getAttribute( 'type' ) || die( "No 'type' attribute found for this ${type} alias element: ", $el->toString() ); my $replacement = $el->getAttribute( 'replacement' ) || die( "No 'replacement' attribute found for this ${type} alias element: ", $el->toString() ); # Normalise $replacement =~ tr/_/-/; $replacement = [split( /[[:blank:]\h\v]+/, $replacement )]; my $reason = $el->getAttribute( 'reason' ); my $comment; if( my $this = $el->nextNonBlankSibling ) { if( $this->isa( 'XML::LibXML::Comment' ) ) { $comment = $this->data; $comment = trim( $comment ) if( defined( $comment ) ); $comment = undef if( $comment eq 'null' ); } } # Normalise the alias $alias =~ tr/_/-/; $out->print( "[${type} / ${alias} -> ", join( ', ', @$replacement ), "] ", ( defined( $comment ) ? "(${comment}) " : '' ) ) if( $DEBUG ); eval { $sth->execute( $alias, to_array( $replacement ), $reason, $type, $comment ); } || die( "Error adding alias information for ${type} '${alias}' and replacement '", join( ', ', @$replacement ), "': ", ( $@ || $sth->errstr ) ); $out->print( "ok\n" ) if( $DEBUG ); $n++; } &log( "${n} ${type} aliases added." ); } my $zone_file = $basedir->child( 'supplemental/metaZones.xml' ); my $zoneDoc = load_xml( $zone_file ); # NOTE: Loading metazones &log( "Loading metazones." ); my $metazonesRes = $zoneDoc->findnodes( '/supplementalData/metaZones/mapTimezones/mapZone[@type]' ); if( !$metazonesRes->size ) { die( "No metazone data found in file ${zone_file}" ); } $sth = $sths->{metazones} || die( "Unable to get the statement object for table \"metazones\"." ); my $metazones = {}; # while( my $el = $metazonesRes->shift ) { my $id = $el->getAttribute( 'other' ) || die( "Unable to get the metazone from the attribute 'other' for this element: ", $el->toString() ); my $territory = $el->getAttribute( 'territory' ); my $timezone = $el->getAttribute( 'type' ); $metazones->{ $id } ||= { metazone => $id, territories => [], timezones => [], }; push( @{$metazones->{ $id }->{territories}}, $territory ); push( @{$metazones->{ $id }->{timezones}}, $timezone ); } foreach my $zone ( sort( keys( %$metazones ) ) ) { my $def = $metazones->{ $zone }; eval { $sth->execute( $def->{metazone}, to_array( $def->{territories} ), to_array( $def->{timezones} ) ); } || die( "Error adding metazone information for metazone '$def->{metazone}': ", ( $@ || $sth->errstr ), "\n", dump( $def ) ); } # NOTE: Loading the IANA Olson time zone database &log( "Loading the IANA Olson time zone database." ); # Those time zones have the territory set to 001, because the CLDR misuses that property to flag them as "golden" time zones. We instead set a 'is_golden' field in the timezones table my $tz_corrections = {}; if( $cache_tz_corrections_file->exists ) { &log( "Loading time zones corrections from cache file ${cache_tz_corrections_file}" ); $tz_corrections = $cache_tz_corrections_file->load_json || die( $cache_tz_corrections_file->error ); } else { &log( "Computing time zones corrections." ); my $iana_tzdb_alias = {}; my $iana_tzdb_map = {}; my $fh = $iana_alias_file->open( '<', { binmode => ':utf8' }) || die( $iana_alias_file->error ); while( defined( $_ = $fh->getline ) ) { chomp; next if( /^[[:blank:]\h]*(?:\Z|\#)/ ); my( $dummy, $from, $to, $comment ) = split( /[[:blank:]\h]+/, $_ ); $out->print( "Aliasing ${from} -> ${to}\n" ) if( $DEBUG ); $iana_tzdb_alias->{ $to } = $from; } $fh = $iana_timezone_file->open( '<', { binmode => ':utf8' }) || die( $iana_timezone_file->error ); local $" = ', '; while( defined( $_ = $fh->getline ) ) { chomp; next if( /^[[:blank:]\h]*(?:\Z|\#)/ ); my( $codes, $coordinates, $tz, $comment ) = split( /[[:blank:]\h]+/, $_ ); $codes = [split( /\,/, $codes )]; $out->print( "[${tz}] -> @$codes\n" ) if( $DEBUG ); foreach my $code ( @$codes ) { $iana_tzdb_map->{ $tz } = $code; } } $fh->close; my $worldTerritoriesRes = $zoneDoc->findnodes( '/supplementalData/metaZones/mapTimezones/mapZone[@territory="001"]' ); $out->printf( "Found %d time zones with incorrect territories.\n", $worldTerritoriesRes->size ) if( $DEBUG ); while( my $el = $worldTerritoriesRes->shift ) { # Example: my $tz = $el->getAttribute( 'type' ) || die( "Unable to get the 'type' attribute for this mapZone element: ", $el->toString() ); if( !exists( $iana_tzdb_map->{ $tz } ) ) { if( exists( $iana_tzdb_alias->{ $tz } ) && exists( $iana_tzdb_map->{ $iana_tzdb_alias->{ $tz } } ) ) { warn( "No value found for time zone ${tz}, but found in alias: ${tz} -> ", $iana_tzdb_alias->{ $tz } ); $tz_corrections->{ $tz } = $iana_tzdb_map->{ $iana_tzdb_alias->{ $tz } }; next; } else { warn( "No value found for time zone ${tz}" ); } } $tz_corrections->{ $tz } = $iana_tzdb_map->{ $tz }; } $cache_tz_corrections_file->unload_json( $tz_corrections => { pretty => 1, canonical => 1, }) || die( $cache_tz_corrections_file->error ); } &log( sprintf( "%d time zone corrections loaded.", scalar( keys( %$tz_corrections ) ) ) ); # NOTE: Building map of territory name to territory code &log( "Building map of territory name to territory code." ); my $engTerritoriesRes = $engLocaleDoc->findnodes( '//localeDisplayNames/territories/territory' ) || die( "Unable to get the nodes of English territories names from ${eng_locale_data_file}" ); &log( sprintf( "%d English locale territories names found.", $engTerritoriesRes->size ) ); my $eng_territories_names_to_code = {}; while( my $el = $engTerritoriesRes->shift ) { # Example: Antarctica my $code = $el->getAttribute( 'type' ) || die( "Unable to get the territory code from attribute 'type' for this element: ", $el->toString() ); my $name = $el->textContent; if( !defined( $name ) || !length( $name ) ) { die( "Territory name for '${code}' is empty: ", $el->toString() ); } if( index( $name, '&' ) != -1 ) { $name = decode_entities( $name ); } if( exists( $eng_territories_names_to_code->{ $name } ) ) { # This is a variation, which we ignore. if( $el->hasAttribute( 'alt' ) ) { next; } else { die( "Found another territory (", $eng_territories_names_to_code->{ $name }, ") with the same name '${name}' for our code '${code}'" ); } } $eng_territories_names_to_code->{ $name } = $code; } # NOTE: Pre-loading time zones &log( "Pre-loading time zones." ); $n = 0; my $tzMapRes = $zoneDoc->findnodes( '//metaZones/mapTimezones/mapZone' ) || die( "Unable to get the map timezones in ${zone_file}" ); if( !$tzMapRes->size ) { die( "No map timezones found in ${zone_file}" ); } my $metazoneIdsRes = $zoneDoc->findnodes( '//metaZones/metazoneIds/metazoneId' ) || die( "Unable to get the timezone IDs in ${zone_file}" ); if( !$metazoneIdsRes->size ) { die( "No timezone IDs found in ${zone_file}" ); } my $primaryZonesRes = $zoneDoc->findnodes( '//primaryZones/primaryZone' ) || die( "Unable to get the primary timezones in ${zone_file}" ); if( !$primaryZonesRes->size ) { die( "No primary timezones found in ${zone_file}" ); } my $tzs = { # Default value used in localised data 'Etc/Unknown' => { timezone => 'Etc/Unknown', territory => 'ZZ', region => 'Etc', is_golden => 0, } }; # NOTE: Pre-loading time zone information &log( "Pre-loading time zone information." ); my $timezonesRes = $zoneDoc->findnodes( '//metaZones/metazoneInfo/timezone' ) || die( "Unable to get the meta timezones information in ${zone_file}" ); if( !$timezonesRes->size ) { die( "No meta timezones information found in ${zone_file}" ); } my $tzRe = qr/(?\d{4})\D(?\d{1,2})\D(?\d{1,2})[[:blank:]\h]+(?\d{1,2})\D(?\d{1,2})/; $n = 0; my $tz_infos = {}; # NOTE: Collecting timezones from metaZones.xml//metaZones/metazoneInfo/timezone &log( "Collecting timezones from metaZones.xml//metaZones/metazoneInfo/timezone" ); while( my $el = $timezonesRes->shift ) { my $tz = $el->getAttribute( 'type' ) || die( "Unable to find the attribute 'type' on this timezone tag: ", $el->toString() ); $tz_infos->{ $tz } = []; if( !exists( $tzs->{ $tz } ) ) { $tzs->{ $tz } = { timezone => $tz, is_golden => 0 }; $tzs->{ $tz }->{region} = [split( /\//, $tz )]->[0] if( index( $tz, '/' ) != -1 ); $out->print( "Collected ${tz}\n" ) if( $DEBUG ); } # We set the value of metazone to the most recent one, and for that we use DateTime to compare DateTime object. my( $metazone, $metazone_from, $metazone_to ); my @metaZones = $el->getChildrenByTagName( 'usesMetazone' ); # Example: foreach my $el_meta ( @metaZones ) { my $def = { timezone => $tz, metazone => ( $el_meta->getAttribute( 'mzone' ) || die( "No attribute 'mzone' found on this 'usesMetazone' tag: ", $el_meta->toString() ) ), ( $el_meta->hasAttribute( 'from' ) ? ( start => $el_meta->getAttribute( 'from' ) ) : () ), ( $el_meta->hasAttribute( 'to' ) ? ( 'until' => $el_meta->getAttribute( 'to' ) ) : () ), }; # The first 'usesMetazone' is the most recent, so if it is already set, we ignore # This turned out to be too simple. # $tzs->{ $tz }->{metazone} = $def->{metazone} unless( $tzs->{ $tz }->{metazone} ); my $metadt = {}; foreach my $prop ( qw( start until ) ) { if( exists( $def->{ $prop } ) && defined( $def->{ $prop } ) ) { if( $def->{ $prop } =~ /^$tzRe$/ ) { my $re = {%+}; $def->{ $prop } = sprintf( '%4d-%02d-%02dT%02d:%02d:00', @$re{qw( year month day hour minute )} ); $metadt->{ $prop } = DateTime->new( %$re, time_zone => 'floating' ); } else { die( "Property '${prop}' for time zone '${tz}' seems to have an invalid datetime format '", $def->{ $prop }, "'" ); } } } push( @{$tz_infos->{ $tz }}, $def ); if( exists( $metadt->{start} ) || exists( $metadt->{until} ) ) { if( !defined( $metazone_from ) && !defined( $metazone_to ) ) { $metazone_from = $metadt->{start}; $metazone_to = $metadt->{until}; $metazone = $def->{metazone}; } elsif( defined( $metadt->{start} ) ) { if( defined( $metazone_to ) && $metadt->{start} >= $metazone_to ) { $metazone = $def->{metazone}; } else { warn( "Warning only: weirdly enough, this meta zone '$def->{metazone}' for the zone ID '$def->{timezone}' is not historically the first one, and yet its start datetime ($def->{from}) is not higher than the previous metazone end datetime (", $metazone_to->iso8601, "): ", dump( $def ) ); } } else { die( "Time zone ID ${tz} has metazone $def->{metazone}, which is not historically the first one, and yet I could not get a start datetime: ", dump( $def ) ); } } else { $metazone = $def->{metazone}; } ++$n; } # End checking each metazone for this timezone $tzs->{ $tz }->{metazone} = $metazone; } &log( "${n} time zone information pre-loaded." ); # Now check the primary zones that also are golden zones. # See the specs: # NOTE: Collecting primary (golden) timezones from metaZones.xml//primaryZones/primaryZone &log( "Collecting primary (golden) timezones from metaZones.xml//primaryZones/primaryZone" ); while( my $el = $primaryZonesRes->shift ) { my $tz = $el->textContent || die( "No text content could be found for this primary time zone: ", $el->toString() ); unless( defined( $tz ) && length( $tz ) && index( $tz, '/' ) != -1 ) { die( "Textual content for this prime zone element is either empty or malformed: ", $el->toString() ); } if( !exists( $tzs->{ $tz } ) ) { die( "Unable to find the primary time zone '${tz}' in our previously built dictionary." ); } my $territory = $el->getAttribute( 'iso3166' ) || die( "No territory code defined for this primary zone '${tz}': ", $el->toString() ); $tzs->{ $tz }->{is_primary} = 1; $tzs->{ $tz }->{territory} = $territory unless( $tzs->{ $tz }->{territory} ); } # We use it so we can add the CLDR 4-characters time zone id in a separate iteration # NOTE: Collecting metazones from metaZones.xml//metaZones/mapTimezones/mapZone &log( "Collecting metazones from metaZones.xml//metaZones/mapTimezones/mapZone" ); my $metazone_to_dict = {}; while( my $el = $tzMapRes->shift ) { my $tz = $el->getAttribute( 'type' ) || die( "Unable to get the attribute 'type' for this element: ", $el->toString() ); my $def = { timezone => $tz, territory => ( $el->getAttribute( 'territory' ) || die( "No attribute 'territory' for this time zone '${tz}': ", $el->toString() ) ), metazone => ( $el->getAttribute( 'other' ) || die( "No attribute 'other' for this time zone '${tz}': ", $el->toString() ) ), # By default is_golden => 0, is_primary => 0, }; if( index( $tz, '/' ) != -1 ) { $def->{region} = [split( '/', $tz )]->[0]; } elsif( index( $def->{metazone}, '_' ) != -1 ) { $def->{region} = [split( '_', $def->{region} )]->[0]; } else { die( "Neither the time zone (${tz}) nor the metazone ($def->{metazone}) have any region information." ); } # Perl converted it to an integer, and removed any leading zeros. if( $def->{territory} =~ /^\d{1,3}$/ ) { $def->{territory} = sprintf( '%03d', int( $def->{territory} ) ); # "The golden zones are those in mapZone supplemental data under the territory "001"." # $def->{is_golden} = 1; } else { # This is the preferred timezone for this territory # $def->{is_preferred} = 1; } foreach my $prop ( keys( %$def ) ) { if( $prop eq 'is_golden' || !exists( $tzs->{ $tz }->{ $prop } ) || !length( $tzs->{ $tz }->{ $prop } // '' ) ) { $tzs->{ $tz }->{ $prop } = $def->{ $prop }; } } $metazone_to_dict->{ $def->{metazone} } ||= []; push( @{$metazone_to_dict->{ $def->{metazone} }}, $def ); } # Associating metazone information to timezone that somehow were left out from the metazone to timezone mapping &log( "Setting timezones missing 'territory' and 'is_golden' data" ); foreach my $tz ( keys( %$tzs ) ) { my $this = $tzs->{ $tz }; if( $this->{metazone} && !$this->{territory} && exists( $metazone_to_dict->{ $this->{metazone} } ) && ref( $metazone_to_dict->{ $this->{metazone} } ) && scalar( @{$metazone_to_dict->{ $this->{metazone} }} ) == 1 ) { $out->print( "\tTimezone '${tz}' does not have a territory set, using the entry for metazone '", $this->{metazone}, "' to get the information.\n" ) if( $DEBUG ); $this->{territory} = $metazone_to_dict->{ $this->{metazone} }->[0]->{territory}; # $this->{is_golden} = $metazone_to_dict->{ $this->{metazone} }->[0]->{is_golden}; } } # NOTE: Collecting timezone IDs from metaZones.xml//metaZones/metazoneIds/metazoneId &log( "Collecting timezone IDs from metaZones.xml//metaZones/metazoneIds/metazoneId" ); while( my $el = $metazoneIdsRes->shift ) { my $metazone = $el->getAttribute( 'longId' ) || die( "No attribute 'longId' for this meta zone element: ", $el->toString() ); my $id = $el->getAttribute( 'shortId' ) || die( "No attribute 'shortId' for this meta zone '${metazone}': ", $el->toString() ); if( !exists( $metazone_to_dict->{ $metazone } ) ) { die( "Unable to find the meta zone '${metazone}' in our previously built map." ); } foreach my $tz_ref ( @{$metazone_to_dict->{ $metazone }} ) { my $tz = $tz_ref->{timezone} || die( "Error: no timezone is set: ", dump( $tz_ref ) ); $tzs->{ $tz }->{tzid} = $id; $out->print( "Set ID '${id}' to metazone '${metazone}' (", $tzs->{ $tz }->{timezone}, "). Time zone now has this tzid (", $tzs->{ $tz }->{tzid}, ")\n" ) if( $DEBUG ); } } # NOTE: Loading additional information from windowsZones &log( "Loading additional information from windowsZones." ); my $windows_zones_file = $basedir->child( 'supplemental/windowsZones.xml' ); my $windZonesDom = load_xml( $windows_zones_file ); my $windZonesRes = $windZonesDom->findnodes( '/supplementalData/windowsZones/mapTimezones/mapZone' ) || die( "Unable to get windows zones from ${windows_zones_file}" ); $n = 0; while( my $el = $windZonesRes->shift ) { # Example: # my $territory = $el->getAttribute( 'territory' ) || die( "Unable to get the 'territory' attribute value for this windows zone: ", $el->toString() ); my $zones = $el->getAttribute( 'type' ) || die( "Unable to get the 'type' attribute value for this windows zone: ", $el->toString() ); $zones = [split( /[[:blank:]\h]+/, $zones )]; foreach my $zone ( @$zones ) { $out->print( "[${zone}] " ) if( $DEBUG ); if( exists( $tzs->{ $zone } ) ) { if( !exists( $tzs->{ $zone }->{territory} ) || !length( $tzs->{ $zone }->{territory} // '' ) || ( # territory for a time zone may have been set to 001 (World), or some region code, # but next iteration could allocate a more accurate territory, such as an ISO3166 code length( $tzs->{ $zone }->{territory} // '' ) && $tzs->{ $zone }->{territory} =~ /^\d{1,3}$/ ) ) { $tzs->{ $zone }->{territory} = $territory; $tzs->{ $zone }->{region} = [split( /\//, $zone )]->[0] if( index( $zone, '/' ) != -1 ); $n++; $out->print( "added territory ${territory}\n" ) if( $DEBUG ); } else { $out->print( "already have territory '", ( $tzs->{ $zone }->{territory} // 'undef' ), "'\n" ) if( $DEBUG ); } } else { $tzs->{ $zone } = { timezone => $zone, territory => $territory, region => ( index( $zone, '/' ) != -1 ? [split( /\//, $zone )]->[0] : undef ), }; $out->print( "missing time zone added\n" ) if( $DEBUG ); } } } &log( "${n} additional time zone information added." ); # NOTE: Loading BCP47 timezones &log( "Loading BCP47 timezones." ); $n = 0; my $bcp47_tz_file = $basedir->child( 'bcp47/timezone.xml' ); my $bcp47_tzDoc = load_xml( $bcp47_tz_file ); my $tzKeysRes = $bcp47_tzDoc->findnodes( '//keyword/key/type' ) || die( "Unable to get BCP47 timezones in ${bcp47_tz_file}" ); if( !$tzKeysRes->size ) { die( "No BCP47 timezones found in ${bcp47_tz_file}" ); } $sth = $sths->{bcp47_timezones} || die( "No SQL statement object for bcp47_timezones" ); my $tz_bool_map = { 'true' => 1, 'false' => 0, }; while( my $el = $tzKeysRes->shift ) { my $def = { tzid => ( $el->getAttribute( 'name' ) || die( "Unable to get the attribute 'name' for this timezone element: ", $el->toString() ) ), alias => $el->getAttribute( 'alias' ), preferred => $el->getAttribute( 'preferred' ), description => $el->getAttribute( 'description' ), }; $out->print( "[$def->{tzid}] " ) if( $DEBUG ); if( $el->hasAttribute( 'deprecated' ) ) { my $bool = $el->getAttribute( 'deprecated' ); if( exists( $tz_bool_map->{ $bool } ) ) { $def->{deprecated} = $tz_bool_map->{ $bool }; } else { die( "Unknown boolean value for deprecated: '", ( $bool // 'undef' ), "'" ); } } if( defined( $def->{alias} ) && length( $def->{alias} ) ) { $def->{alias} = [split( /[[:blank:]\h\v]+/, $def->{alias} )]; # We check each of the IANA timezone ID, and if we find it in the list of timezones previously built, we add our BCP47 timezone ID to it dictionary definition. my $main_tz; foreach my $tz ( @{$def->{alias}} ) { if( exists( $tzs->{ $tz } ) ) { $tzs->{ $tz }->{tz_bcpid} = $def->{tzid}; $tzs->{ $tz }->{alias} = [grep( $_ ne $tz, @{$def->{alias}} )]; $tzs->{ $tz }->{is_primary} = 0 unless( defined( $tzs->{ $tz }->{is_primary} ) ); $tzs->{ $tz }->{is_preferred} = 0 unless( defined( $tzs->{ $tz }->{is_preferred} ) ); $tzs->{ $tz }->{is_canonical} = 0 unless( defined( $tzs->{ $tz }->{is_canonical} ) ); $out->print( "alias added to time zone '${tz}'. " ) if( $DEBUG ); $main_tz = $tz if( !defined( $main_tz ) ); } else { $out->print( "Unknown time zone '${tz}' found in BCP47 time zones for BCP47 tz ID '$def->{tzid}', adding it to our list of known timezones.\n" ) if( $DEBUG ); unless( defined( $main_tz ) ) { foreach my $tz ( @{$def->{alias}} ) { if( exists( $tzs->{ $tz } ) ) { $main_tz = $tz; last; } } } if( !defined( $main_tz ) ) { die( "This timezone ID '${tz}' is not known to us yet, and none of its aliases are either: '", join( "', '", @{$def->{alias}} ), "'" ); } my $tz_info = Clone::clone( $tzs->{ $main_tz } ); $tz_info->{timezone} = $tz; # $tz_info->{region} = [split( '/', $tz )]->[0]; $tz_info->{region} = [split( '/', $tz )]->[0] if( index( $tz, '/' ) != -1 ); $tz_info->{tz_bcpid} = $def->{tzid}; $tz_info->{alias} = [grep( $_ ne $tz, @{$def->{alias}} )]; $tz_info->{is_primary} = 0; $tz_info->{is_preferred} = 0; $tz_info->{is_canonical} = 0; $tzs->{ $tz } = $tz_info; } } # The first one is the canonical timezone as per the LDML specifications $tzs->{ $def->{alias}->[0] }->{is_canonical} = 1; } eval { $sth->execute( $def->{tzid}, to_array( $def->{alias} ), @$def{qw( preferred description deprecated )} ); } || die( "Error adding BCP47 timezone information for TZ ID '$def->{tzid}': ", ( $@ || $sth->errstr ), "\n", dump( $def ) ); $out->print( "ok\n" ) if( $DEBUG ); $n++; } &log( "${n} BCP47 timezones added." ); # NOTE: Dumping timezones to JSON my $tz_debug_file = $script_dir->child( 'timezones.json' ); $tz_debug_file->unload_json( $tzs => { pretty => 1, canonical => 1 } ) || die( $tz_debug_file->error ); # NOTE: Loading time zones &log( "Loading time zones." ); $n = 0; $sth = $sths->{timezones} || die( "No SQL statement object for timezones" ); foreach my $tz ( sort( keys( %$tzs ) ) ) { my $def = $tzs->{ $tz }; $out->print( "[${tz}] " ) if( $DEBUG ); $def->{timezone} = $tz if( !exists( $def->{timezone} ) || !length( $def->{timezone} // '' ) ); if( !exists( $def->{territory} ) || !defined( $def->{territory} ) || !length( $def->{territory} ) ) { # For example, Antarctica/Troll -> Antarctica -> AQ if( $def->{region} && exists( $eng_territories_names_to_code->{ $def->{region} } ) ) { $def->{territory} = $eng_territories_names_to_code->{ $def->{region} }; } elsif( index( $tz, '/' ) == -1 ) { $def->{territory} = '001'; $def->{region} = 'World'; } elsif( lc( [split( '/', $tz )]->[0] ) eq 'etc' ) { $def->{territory} = '001'; $def->{region} = 'World'; } else { die( "Missing 'territory' property for time zone '${tz}': ", dump( $def ) ); } } elsif( substr( $tz, 0, 3 ) eq 'GMT' || substr( $tz, 0, 3 ) eq 'UTC' || substr( $tz, 0, 7 ) eq 'Etc/GMT' || substr( $tz, 0, 7 ) eq 'Etc/UTC' || substr( $tz, 0, 7 ) eq 'Etc/UCT' || $tz eq 'Etc/Universal' ) { $def->{region} = 'World'; } # No region has been set so far, and this is because this is a zone that belongs to the World, such as CST6CDT or Greenwich elsif( !length( $def->{region} // '' ) ) { $def->{region} = 'World'; } $def->{is_primary} //= 0; eval { $sth->bind_param( 1, $def->{timezone}, SQL_VARCHAR ); $sth->bind_param( 2, $def->{territory}, SQL_VARCHAR ); $sth->bind_param( 3, $def->{region}, SQL_VARCHAR ); $sth->bind_param( 4, $def->{tzid}, SQL_VARCHAR ); $sth->bind_param( 5, $def->{metazone}, SQL_VARCHAR ); $sth->bind_param( 6, $def->{tz_bcpid}, SQL_VARCHAR ); $sth->bind_param( 7, $def->{is_golden}, SQL_BOOLEAN ); $sth->bind_param( 8, $def->{is_primary}, SQL_BOOLEAN ); $sth->bind_param( 9, $def->{is_preferred}, SQL_BOOLEAN ); $sth->bind_param( 10, $def->{is_canonical}, SQL_BOOLEAN ); $sth->bind_param( 11, to_array( $def->{alias} ), SQL_VARCHAR ); $sth->execute; } || die( "Error adding time zone information for time zone '${tz}': ", ( $@ || $sth->errstr ), "\n", dump( $def ) ); $out->print( "ok\n" ) if( $DEBUG ); $n++; } &log( "${n} timezones added." ); # NOTE: Loading time zone information &log( "Loading time zone historical records." ); $sth = $sths->{timezones_info} || die( "No SQL statement object for timezones_info" ); $n = 0; foreach my $tz ( sort( keys( %$tz_infos ) ) ) { foreach my $def ( @{$tz_infos->{ $tz }} ) { $out->print( "[${tz} -> ", ( $def->{metazone} // 'no metazone' ), ' ', ( $def->{start} ? "(from $def->{start}" : '' ), ( $def->{until} ? " -> $def->{until}) " : ') ' ) ) if( $DEBUG ); eval { $sth->execute( @$def{ qw( timezone metazone start until ) } ); } || die( "Error adding time zone historical record for time zone '${tz}': ", ( $@ || $sth->errstr ), "\n", dump( $def ) ); $out->print( "ok\n" ) if( $DEBUG ); ++$n; } } &log( "${n} historical records added." ); # NOTE: Pre-loading subdivisions &log( "Pre-loading subdivisions." ); $n = 0; my $subdiv_file = $basedir->child( 'supplemental/subdivisions.xml' ); my $subdivDoc = load_xml( $subdiv_file ); my $subgroupsRes = $subdivDoc->findnodes( '//subdivisionContainment/subgroup' ) || die( "Unable to get subgroup from file ${subdiv_file}" ); if( !$subgroupsRes->size ) { die( "No subdivisions found in file ${subdiv_file}" ); } my $known_subdivisions = {}; my $territories_having_subdivisions = {}; while( my $el = $subgroupsRes->shift ) { my $parent = $el->getAttribute( 'type' ) || die( "No 'type' property found for this subdivision: ", $el->toString() ); my $kids = $el->getAttribute( 'contains' ) || die( "No 'contains' property found for this subdivision: ", $el->toString() ); $kids = [split( /[[:blank:]\h\v]+/, $kids )]; my $is_top = 0; my $territory; if( $parent =~ /^[A-Z]{2}$/ ) { $is_top = 1; if( !exists( $territoryInfo->{ $parent } ) ) { die( "Parent territory code '${parent}' does not exist in table territories." ); } $territories_having_subdivisions->{ $parent }++; $territory = $parent; } foreach my $kid ( @$kids ) { $known_subdivisions->{ $kid } = { subdivision => $kid, parent => $parent, is_top_level => $is_top, ( defined( $territory ) ? ( territory => $territory ) : () ), }; } $n += scalar( @$kids ); } &log( "${n} subdivisions pre-loaded." ); my $leftout = []; foreach my $code ( sort( keys( %$territoryInfo ) ) ) { push( @$leftout, $code ) if( !exists( $territories_having_subdivisions->{ $code } ) ); } &log( scalar( @$leftout ), " territories did not have subdivision: ", join( ', ', @$leftout ) ); # NOTE: Adding possibly missing subdivisions &log( "Adding possibly missing subdivisions." ); $n = 0; foreach my $code ( sort( keys( %{$known->{subdivisions}} ) ) ) { if( !exists( $known_subdivisions->{ $code } ) ) { $known_subdivisions->{ $code } = { subdivision => $code }; $n++; } $known_subdivisions->{ $code }->{status} = $known->{subdivisions}->{ $code }->{status}; } &log( sprintf( "%d missing subdivisions added (%.2f%%) out of %d", $n, ( ( $n / scalar( keys( %{$known->{subdivisions}} ) ) ) * 100 ), scalar( keys( %{$known->{subdivisions}} ) ) ) ); # NOTE: Associating a territory code for each subdivision by looking up its associated territory &log( "Associating a territory code for each subdivision by looking up its associated territory." ); $n = 0; my $subdiv_lookup; $subdiv_lookup = sub { my $code = shift( @_ ); if( exists( $known_subdivisions->{ $code } ) ) { if( exists( $known_subdivisions->{ $code }->{territory} ) && defined( $known_subdivisions->{ $code }->{territory} ) && length( $known_subdivisions->{ $code }->{territory} ) ) { return( $known_subdivisions->{ $code }->{territory} ); } elsif( exists( $known_subdivisions->{ $code }->{parent} ) && length( $known_subdivisions->{ $code }->{parent} // '' ) ) { return( $subdiv_lookup->( $known_subdivisions->{ $code }->{parent} ) ); } else { return; } } else { die( "Subdivision code '${code}' is unknown." ); } }; foreach my $sub ( sort( keys( %$known_subdivisions ) ) ) { if( !exists( $known_subdivisions->{ $sub }->{territory} ) || !length( $known_subdivisions->{ $sub }->{territory} // '' ) ) { my $code; if( $known_subdivisions->{ $sub }->{status} eq 'unknown' ) { $code = 'ZZ'; } else { $code = $subdiv_lookup->( $known_subdivisions->{ $sub }->{parent} || $sub ); if( !$code && $known_subdivisions->{ $sub }->{status} ne 'deprecated' && $known_subdivisions->{ $sub }->{parent} ne 'unknown' ) { die( "Unable to find an associated territory for the subdivision '${sub}' with status '", $known_subdivisions->{ $sub }->{status}, "' and parent '", $known_subdivisions->{ $sub }->{parent}, "'" ); } } $known_subdivisions->{ $sub }->{territory} = $code if( defined( $code ) && length( $code // '' ) ); } } &log( "${n} territory code associated for subdivisions." ); my @missing_subdivision_territory = (); foreach my $sub ( sort( keys( %$known_subdivisions ) ) ) { if( !exists( $known_subdivisions->{ $sub }->{territory} ) || !length( $known_subdivisions->{ $sub }->{territory} // '' ) ) { push( @missing_subdivision_territory, $sub ); } } if( scalar( @missing_subdivision_territory ) ) { warn( scalar( @missing_subdivision_territory ), " deprecated or unknown subdivisions found without an asociated territory: ", join( ', ', @missing_subdivision_territory ) ); } else { $out->print( "All ", scalar( keys( %$known_subdivisions ) ), " subdivisions have an associated territory.\n" ) if( $DEBUG ); } # NOTE: Loading subdivisions &log( "Loading subdivisions." ); $n = 0; $sth = $sths->{subdivisions} || die( "No SQL statement object for subdivisions" ); foreach my $code ( sort( keys( %$known_subdivisions ) ) ) { $out->print( "[${code}] " ) if( $DEBUG ); my $def = $known_subdivisions->{ $code }; eval { $sth->execute( @$def{qw( territory subdivision parent is_top_level status )} ); } || die( "Error adding subdivision information for subdivision '$def->{subdivision}', territory '$def->{territory}' and parent '", ( $def->{parent} // 'undef' ), "': ", ( $@ || $sth->errstr ), "\n", dump( $def ) ); $out->print( "ok\n" ) if( $DEBUG ); $n++; } &log( "${n} subdivisions added." ); %$known_subdivisions = (); # NOTE: Loading localised subdivisions &log( "Loading localised subdivisions." ); $n = 0; my $x = 0; my $total_localised = 0; $subdivisions_l10n_dir->open || die( "Unable to open directory $subdivisions_l10n_dir: ", $subdivisions_l10n_dir->error ); $sth = $sths->{subdivisions_l10n} || die( "No SQL statement object for subdivisions_l10n" ); # while( my $f = $subdivisions_l10n_dir->read( as_object => 1, exclude_invisible => 1 ) ) @files = $subdivisions_l10n_dir->read( as_object => 1, exclude_invisible => 1, 'sort' => 1 ); foreach my $f ( @files ) { next unless( $f->extension eq 'xml' ); my $basename = $f->basename; next if( $basename eq 'root.xml' ); $out->print( "[${basename}] " ) if( $DEBUG ); my $locDoc = load_xml( $f ); # Returned an XML::LibXML::Attr # my $langAttr = $locDoc->findnodes( '//identity/language/@type' ) || # die( "No language node with attribute could be found in the XML file $f" ); # my $locale = $langAttr->getValue(); my $locale = identity_to_locale( $locDoc ); ( my $locale2 = $f->basename( '.xml' ) ) =~ tr/_/-/; if( lc( $locale ) ne lc( $locale2 ) && $locale2 ne 'root' ) { warn( "XML identity says the locale is '${locale}', but the file basename says it should be '${locale2}', and I think the file basename is correct for file $f" ); $locale = $locale2; } if( index( $locale, 'root' ) != -1 ) { if( length( $locale ) > 4 ) { my $loc = Locale::Unicode->new( $locale ); $loc->language( 'und' ); $locale = $loc->as_string; } else { $locale = 'und'; } } my $namesRes = $locDoc->findnodes( '//localeDisplayNames/subdivisions/subdivision' ); unless( $namesRes->size ) { warn( "Warning only: No localised subdivision names found for language ${locale} in file ${f}" ); next; } while( my $el = $namesRes->shift ) { my $id = $el->getAttribute( 'type' ) || die( "No subdivision ID found with attribute 'type' in file ${f} for this element: ", $el->toString() ); # Somehow, the XML file also contains some territory codes like AS, AW, AX # We need to filter them out if( length( $id ) == 2 && $id =~ /^[A-Z]{2}$/ && exists( $territoryInfo->{ $id } ) ) { next; } my $name = $el->textContent; if( index( $name, '&' ) != -1 && index( $name, ';' ) != -1 ) { $name = decode_entities( $name ); } eval { $sth->execute( $locale, $id, $name ); } || die( "Error adding localised subdivision name information in file ${f} with id '${id}' and locale '${locale}': ", ( $@ || $sth->errstr ) ); $x++; } $out->print( "ok (${x})\n" ) if( $DEBUG ); $total_localised += $x; $x = 0; $n++; } &log( "${n} locales processed adding a total of ${total_localised} localised subdivisions." ); # NOTE: Loading core numbering systems rules &log( "Loading core numbering systems rules." ); $n = 0; my $root_num_sys_file = $basedir->child( 'rbnf/root.xml' ); my $rootSysNumDoc = load_xml( $root_num_sys_file ); my $sysNumRulesRes = $rootSysNumDoc->findnodes( '/ldml/rbnf/rulesetGrouping[@type="NumberingSystemRules"]/ruleset' ) || die( "Unable to get the core numbering systems in file ${root_num_sys_file}" ); &log( sprintf( "%d rules found.", $sysNumRulesRes->size ) ); my $numbering_systems = {}; my $fetch_numbers; $fetch_numbers = sub { my $el = shift( @_ ); my $args = shift( @_ ) // {}; my $start = $args->{start} // 0; my $lang = $args->{locale} // 'und'; my $rbnfDoc = $args->{doc} // $rootSysNumDoc; my $rbnf_file = $args->{file} // $root_num_sys_file; my $group = $args->{group} // 'NumberingSystemRules'; my $id = $el->getAttribute( 'id' ); my $numRulesRes = $el->findnodes( "./rbnfrule[\@value >= \"${start}\" and \@value < \"10\"]" ); my @numbers = (); if( !$numRulesRes->size ) { die( "No number found in the RBNF file ${rbnf_file} for the numbering system id '${id}' with rulesetGrouping 'NumberingSystemRules' and ruleset type '${id}'" ); } # All is good elsif( $numRulesRes->size == 10 ) { @numbers = map( [split( ';', $_->textContent )]->[0], $numRulesRes->get_nodelist ); } # =%%cyrillic-lower-1-10=҃; else { foreach my $node ( $numRulesRes->get_nodelist ) { # my $node = $nodes[$i]; my $val = $node->textContent; $out->print( "\tFound value '${val}'\n" ) if( $DEBUG ); if( index( $val, '%' ) != -1 ) { # =%%cyrillic-lower-1-10=; # >҃=%%cyrillic-lower-1-10=; if( $val =~ /^(?.*)\=\%{1,2}(?[^\=]+)\=/ ) { my $target = $+{target}; # Actually, we ignore it, because I am not that sure this is really a prefix. my $prefix = $+{prefix}; $out->print( "\tFound alias pointing to ${target} with prefix '", ( $prefix // 'undef' ), "'\n" ) if( $DEBUG ); if( exists( $numbering_systems->{ $target } ) ) { $out->printf( "\tFound cached data with %d elements for number system ${target}\n", scalar( @{$numbering_systems->{ $target }} ) ) if( $DEBUG ); # if( defined( $prefix ) ) # { # push( @numbers, map( $prefix . $_, @{$numbering_systems->{ $target }} ) ); # } # else # { push( @numbers, @{$numbering_systems->{ $target }} ); # } } else { my $resolverRes = $rbnfDoc->findnodes( '/ldml/rbnf/rulesetGrouping[@type="' . $group . '"]/ruleset[@type="' . $target . '"]' ); if( !$resolverRes->size ) { die( "RBNF alias points to ${target}, but I was unable to find it in file ${rbnf_file}" ); } my $el_target = $resolverRes->shift; $args->{start} = $start; my $nums = $fetch_numbers->( $el_target, $args ); $out->print( "\tResolved for element with ID '", $el->getAttribute( 'type' ), "' pointing to ${target} returned ", scalar( @$nums ), ": ", join( ', ', @$nums ), "\n" ) if( $DEBUG ); push( @numbers, @$nums ); } } else { die( "Unknown RBFN alias found in string '${val}' in file ${rbnf_file}" ); } } else { my $rule_id = $node->getAttribute( 'value' ); if( $rule_id =~ /^\d$/ ) { push( @numbers, [split( ';', $val )]->[0] ); } else { die( "I was expecting a number, but instead found '${id}'" ); } } $start++; } } $out->print( "\treturning: ", join( ', ', @numbers ), "\n" ) if( $DEBUG ); return( \@numbers ); }; while( my $el = $sysNumRulesRes->shift ) { # Example: my $id = $el->getAttribute( 'type' ) || die( "No ruleset name value found with attribute 'type': ", $el->toString() ); $out->print( "[${id}] \n" ) if( $DEBUG ); my $nums = $fetch_numbers->( $el ); $numbering_systems->{ $id } = $nums; $n++; $out->print( "\t@$nums\n" ) if( $DEBUG ); } &log( "${n} numbering systems loaded." ); # NOTE: Loading numbering systems &log( "Loading numbering systems." ); $n = 0; my $num_sys_file = $basedir->child( 'supplemental/numberingSystems.xml' ); my $numsysDoc = load_xml( $num_sys_file ); my $nsysRes = $numsysDoc->findnodes( '//numberingSystems/numberingSystem' ) || die( "Unable to get the numbering system nodes from file $num_sys_file" ); $sth = $sths->{number_systems} || die( "No SQL statement object for number_systems" ); # We use this hash to check if a number system is known to us when we add its localised version in number_systems_l10n my $number_systems = {}; while( my $el = $nsysRes->shift ) { my $id = $el->getAttribute( 'id' ) || die( "Unable to get the attribute 'id' for this numbering system element: ", $el->toString() ); $number_systems->{ $id }++; $out->print( "[${id}] " ) if( $DEBUG ); my $type = $el->getAttribute( 'type' ) || die( "Unable to get the attribute 'type' for this numbering system element: ", $el->toString() ); my @numbers; if( $el->hasAttribute( 'digits' ) ) { my $str = $el->getAttribute( 'digits' ) || die( "Unable to get the attribute 'digits' for this numbering system element: ", $el->toString() ); if( index( $str, '&' ) != -1 ) { @numbers = map( decode_entities( $_ ), split( //, $str ) ); } else { @numbers = split( //, $str ); } } # Example: # # elsif( $el->hasAttribute( 'rules' ) ) { my $rules = $el->getAttribute( 'rules' ) || die( "Unable to get the attribute 'rules' for this numbering system element: ", $el->toString() ); if( index( $rules, '/' ) != -1 ) { my( $locale, $rbnfType, $ruleType ) = split( '/', $rules, 3 ); for( $locale, $rbnfType, $ruleType ) { if( !defined( $_ ) ) { die( "Missing key RBNF XML path information to retrieve the digits for the numbering system '${id}': ", $el->toString() ); } } my $rbnfFile = $basedir->child( "rbnf/${locale}.xml" ); if( !$rbnfFile->exists ) { die( "RBNF file ${rbnfFile} for locale ${locale} does not exist." ); } my $rbnfDoc = load_xml( $rbnfFile ); my $numRulesRes = $rbnfDoc->findnodes( "/ldml/rbnf/rulesetGrouping[\@type=\"${rbnfType}\"]/ruleset[\@type=\"${ruleType}\"]" ); if( !$numRulesRes->size ) { die( "No number found in the RBNF file ${rbnfFile} for the numbering system id '${id}' with rulesetGrouping '${rbnfType}' and ruleset type '${ruleType}'" ); } my $el_rule = $numRulesRes->shift; my $nums = $fetch_numbers->( $el_rule, { locale => $locale, file => $rbnfFile, doc => $rbnfDoc, group => $rbnfType, }); # @numbers = map( [split( ';', $_->textContent )]->[0], $numRulesRes->get_nodelist ); @numbers = @$nums; } elsif( $rules =~ /^[a-z][a-zA-Z\-]+$/ ) { if( exists( $numbering_systems->{ $rules } ) ) { @numbers = @{$numbering_systems->{ $rules }}; } else { die( "Unknown rule value '${rules}'. Was not found in the core numbering systems file ${root_num_sys_file}" ); } } else { die( "Unsupported numbering systems rule value '${rules}': ", $el->toString() ); } } else { die( "This numbering system has no 'digits' nor 'rules' attribute defined: ", $el->toString() ); } my $digits = \@numbers; eval { $sth->execute( $id, to_array( $digits ), $type ); } || die( "Error adding number system information for id '${id}': ", ( $@ || $sth->errstr ) ); $out->print( "ok\n" ) if( $DEBUG ); $n++; } # NOTE: Loading time formats &log( "Loading time formats." ); $n = 0; my $timeHoursRes = $suppDoc->findnodes( '//timeData/hours' ) || die( "Unable to get the time hours preferred formats in ${supplemental_data_file}" ); if( !$timeHoursRes->size ) { die( "No time hours preferred formats found in file ${supplemental_data_file}" ); } $sth = $sths->{time_formats} || die( "No SQL statement object for time_formats" ); while( my $el = $timeHoursRes->shift ) { my $pref = $el->getAttribute( 'preferred' ) || die( "Unable to get the attribute 'preferred' in preferred time element: ", $el->toString() ); my $allowed = $el->getAttribute( 'allowed' ) || die( "Unable to get the attribute 'allowed' in preferred time element: ", $el->toString() ); my $codes = $el->getAttribute( 'regions' ) || die( "Unable to get the attribute 'regions' in preferred time element: ", $el->toString() ); $codes = [split( /[[:blank:]\h\v]+/, $codes )]; $allowed = [split( /[[:blank:]\h\v]+/, $allowed )] if( defined( $allowed ) ); foreach my $code ( @$codes ) { $out->print( "[{${code}] " ) if( $DEBUG ); # This is messed up. The CLDR XML file for time formatting has a property 'region', which, sometimes, contains locales. They should have used a different property name, such as 'locale' my( $territory, $locale ); if( index( $code, '_' ) != -1 ) { $code =~ tr/_/-/; } if( index( $code, '-' ) != -1 ) { ( $locale, $territory ) = split( '-', $code, 2 ); } else { $territory = $code; } # A 3-digits code like 001 that got truncated, because it turned into an integer if( $code =~ /^\d{1,2}$/ ) { $code = sprintf( '%03d', $code ); } # The CLDR uses 001 (World) to signify the default value. # We set the default value in the SQL schema, so we do not need this. # if( $code eq '001' ) # { # next; # } # elsif( !exists( $territoryInfo->{ $territory } ) ) if( !exists( $territoryInfo->{ $territory } ) ) { die( "Unknown territory territory code '${territory}' for property 'region' with value '${code}'. Not previous defined in CLDR as a territory." ); } eval { $sth->bind_param( 1, "$code", SQL_VARCHAR ); $sth->bind_param( 2, $territory, SQL_VARCHAR ); $sth->bind_param( 3, $locale, SQL_VARCHAR ); $sth->bind_param( 4, $pref, SQL_VARCHAR ); $sth->bind_param( 5, to_array( $allowed ), SQL_VARCHAR ); $sth->execute; } || die( "Error adding time formatting information for region '${code}': ", ( $@ || $sth->errstr ) ); $n++; $out->print( "ok\n" ) if( $DEBUG ); } } &log( "Time formatting added to ${n} territories." ); # NOTE: Loading week of preference &log( "Loading week of preference." ); $n = 0; my $weekPrefsRest = $suppDoc->findnodes( '//weekData/weekOfPreference' ) || die( "Unable to get week of preferences information from ${supplemental_data_file}" ); if( !$weekPrefsRest->size ) { die( "No week of preferences information found in ${supplemental_data_file}" ); } $sth = $sths->{week_preferences} || die( "No SQL statement object for week_preferences" ); # Example: while( my $el = $weekPrefsRest->shift ) { my $locales = $el->getAttribute( 'locales' ) || die( "No attribute 'locales' for this element: ", $el->toString() ); # Example: $locales =~ tr/_/-/; $locales = [split( /[[:blank:]\h\v]+/, $locales )]; my $prefs = $el->getAttribute( 'ordering' ) || die( "No attribute 'ordering' for this element: ", $el->toString() ); $prefs = [split( /[[:blank:]\h\v]+/, $prefs )]; foreach my $locale ( @$locales ) { # Should not be needed, but better safe than sorry if( index( $locale, 'root' ) != -1 ) { if( length( $locale ) > 4 ) { my $loc = Locale::Unicode->new( $locale ); $loc->language( 'und' ); $locale = $loc->as_string; } else { $locale = 'und'; } } $out->print( "[${locale}] " ) if( $DEBUG ); eval { $sth->execute( $locale, to_array( $prefs ) ); } || die( "Error adding week of preference information for locale '${locale}': ", ( $@ || $sth->errstr ) ); $out->print( "ok\n" ) if( $DEBUG ); $n++; } } &log( "${n} week of preference information added." ); # NOTE: Loading code mappings &log( "Loading code mappings." ); $n = 0; $sth = $sths->{code_mappings} || die( "No SQL statement object for code_mappings" ); my $code_mappings = [ { type => 'territory', xpath => '//codeMappings/territoryCodes' }, { type => 'currency', xpath => '//codeMappings/currencyCodes' }, ]; foreach my $this ( @$code_mappings ) { my $mapRes = $suppDoc->findnodes( $this->{xpath} ) || die( "Unable to get the $this->{type} information in file ${supplemental_data_file}" ); if( !$mapRes->size ) { die( "No $this->{type} information found in file ${supplemental_data_file}" ); } while( my $el = $mapRes->shift ) { my $def = { code => ( $el->getAttribute( 'type' ) || die( "Unable to get attribute 'type' for this code mapping element: ", $el->toString() ) ), alpha3 => $el->getAttribute( 'alpha3' ), numeric => $el->getAttribute( 'numeric' ), fips10 => $el->getAttribute( 'fips10' ), type => $this->{type}, }; $out->print( "$def->{type} / [$def->{code}] " ) if( $DEBUG ); eval { $sth->execute( @$def{qw( code alpha3 numeric fips10 type )} ); } || die( "Error adding code mapping information for code '$def->{code}' of type $this->{type}: ", ( $@ || $sth->errstr ), "\n", dump( $def ) ); $out->print( "ok\n" ) if( $DEBUG ); $n++; } } &log( "${n} code mappings added." ); # NOTE: Loading person name defaults &log( "Loading person name defaults." ); $n = 0; my $nameOrderRes = $suppDoc->findnodes( '//personNamesDefaults/nameOrderLocalesDefault' ) || die( "Unable to get the name order locale information from file ${supplemental_data_file}" ); if( !$nameOrderRes->size ) { die( "No name order locale information found in file ${supplemental_data_file}" ); } $sth = $sths->{person_name_defaults} || die( "No SQL statement object for person_name_defaults" ); # Example: hu ja km ko mn si ta te vi yue zh while( my $el = $nameOrderRes->shift ) { my $value = $el->getAttribute( 'order' ) || die( "No attribute 'order' found for this person name defaults element: ", $el->toString() ); $out->print( "[${value}] " ) if( $DEBUG ); my $locales = $el->textContent || die( "No text content found for this person name defaults element: ", $el->toString() ); $locales = [split( /[[:blank:]\h\v]+/, $locales )]; foreach my $locale ( @$locales ) { # There should be no need, but that might change in future release of CLDR $locale =~ tr/_/-/; # Should not be needed, but better safe than sorry if( index( $locale, 'root' ) != -1 ) { if( length( $locale ) > 4 ) { my $loc = Locale::Unicode->new( $locale ); $loc->language( 'und' ); $locale = $loc->as_string; } else { $locale = 'und'; } } eval { $sth->execute( $locale, $value ); } || die( "Error adding person name defaults information for value '${value}' and locale '${locale}': ", ( $@ || $sth->errstr ) ); $n++; } $out->print( "ok\n" ) if( $DEBUG ); } &log( "${n} person name defaults added." ); # NOTE: Loading Rule-Based Number Formats &log( "Loading Rule-Based Number Formats." ); $n = 0; $rbnf_dir->open || die( $rbnf_dir->error ); $sth = $sths->{rbnf} || die( "No SQL statement object for rbnf" ); # while( my $f = $rbnf_dir->read( as_object => 1, exclude_invisible => 1 ) ) @files = $rbnf_dir->read( as_object => 1, exclude_invisible => 1, 'sort' => 1 ); foreach my $f ( @files ) { next unless( $f->extension eq 'xml' ); my $rbnfDoc = load_xml( $f ); my $locale = identity_to_locale( $rbnfDoc ); ( my $locale2 = $f->basename( '.xml' ) ) =~ tr/_/-/; if( lc( $locale ) ne lc( $locale2 ) && $locale2 ne 'root' ) { warn( "XML identity says the locale is '${locale}', but the file basename says it should be '${locale2}', and I think the file basename is correct for file $f" ); $locale = $locale2; } if( index( $locale, 'root' ) != -1 ) { if( length( $locale ) > 4 ) { my $loc = Locale::Unicode->new( $locale ); $loc->language( 'und' ); $locale = $loc->as_string; } else { $locale = 'und'; } } $out->print( "[${locale}] " ) if( $DEBUG ); my $rbnfRes = $rbnfDoc->findnodes( '//rbnf/rulesetGrouping' ); if( !$rbnfRes->size ) { warn( "Warning only: no RBNF grouping found for locale '${locale}' in file $f" ); $out->print( "ignored\n" ) if( $DEBUG ); next; } while( my $el = $rbnfRes->shift ) { my $grouping = $el->getAttribute( 'type' ) || die( "Unable to get the attribute 'type' for this grouping element: ", $el->toString() ); my @sets = $el->getChildrenByTagName( 'ruleset' ); foreach my $set ( @sets ) { my $ruleset = $set->getAttribute( 'type' ) || die( "Unable to get the attribute 'type' for this ruleset: ", $set->toString() ); my @rules = $set->getChildrenByTagName( 'rbnfrule' ); foreach my $rule ( @rules ) { my $id; if( !length( ( $id = $rule->getAttribute( 'value' ) ) // '' ) ) { die( "Unable to get the attribute 'value' for this rule: ", $rule->toString() ); } my $value = $rule->textContent; if( !defined( $value ) || !length( $value ) ) { die( "Unable to get the rule value for the rule id '${id}' in grouping '${grouping}': ", $rule->toString() ); } eval { $sth->execute( $locale, $grouping, $ruleset, $id, $value ); } || die( "Error adding RBNF information for groupind '${grouping}, locale '${locale}', rule set '${ruleset}', and id '${id}': ", ( $@ || $sth->errstr ) ); $n++; } } } $out->print( "ok\n" ) if( $DEBUG ); } &log( "${n} RBNF rules added." ); # NOTE: Loading references &log( "Loading references." ); $n = 0; my $refsRes = $suppDoc->findnodes( '//references/reference' ) || die( "Unable to get the 'reference' nodes in ${supplemental_data_file}" ); if( !$refsRes->size ) { die( "No 'reference' node could be found in ${supplemental_data_file}" ); } $sth = $sths->{refs} || die( "No SQL statement object for refs" ); while( my $el = $refsRes->shift ) { my $def = { code => ( $el->getAttribute( 'type' ) || die( "No attribute 'type' found for this reference element: ", $el->toString() ) ), uri => $el->getAttribute( 'uri' ), description => $el->textContent, }; $out->print( "[$def->{code}] " ) if( $DEBUG ); $def->{description} = undef if( $def->{description} eq '[missing]' ); # Decode HTML entities if there is a description and the character '&' is contained if( defined( $def->{description} ) && index( $def->{description}, '&' ) != -1 && index( $def->{description}, ';' ) != -1 ) { $def->{description} = decode_entities( $def->{description} ); } if( defined( $def->{description} ) && length( $def->{description} ) ) { $def->{description} = trim( $def->{description} ); } $def->{description} = undef unless( length( $def->{description} // '' ) ); eval { $sth->execute( @$def{qw( code uri description )} ); } || die( "Error adding reference information for code '$def->{code}': ", ( $@ || $sth->errstr ), "\n", dump( $def ) ); $out->print( "ok\n" ) if( $DEBUG ); $n++; } &log( "${n} references added." ); # NOTE: Loading BCP47 extensions &log( "Loading BCP47 extensions and values." ); $n = 0; $bcp47_dir->open || die( "Unable to open BCP47 directory: ", $bcp47_dir->error ); $sth = $sths->{bcp47_extensions} || die( "No SQL statement object for bcp47_extensions" ); my $sth_bcp_sth = $sths->{bcp47_values} || die( "No SQL statement object for bcp47_values" ); # while( my $f = $bcp47_dir->read( as_object => 1, exclude_invisible => 1 ) ) @files = $bcp47_dir->read( as_object => 1, exclude_invisible => 1, 'sort' => 1 ); foreach my $f ( @files ) { next unless( $f->extension eq 'xml' ); my $cat = $f->basename( '.xml' ); $cat =~ tr/-/_/; $out->print( "[${cat}] " ) if( $DEBUG ); my $extDom = load_xml( $f ); my $classesRes = $extDom->findnodes( '//keyword/key' ) || die( "Error getting the BCP47 classes from file $f" ); die( "No information found for category '${cat}' in file $f" ) if( !$classesRes->size ); # # while( my $el = $classesRes->shift ) { my $def = { category => $cat, extension => ( $el->getAttribute( 'name' ) || die( "Unable to get the attribute 'name' from file ${f} on this element: ", $el->toString() ) ), alias => $el->getAttribute( 'alias' ), value_type => $el->getAttribute( 'valueType' ), description => $el->getAttribute( 'description' ), deprecated => $el->getAttribute( 'deprecated' ), }; if( defined( $def->{deprecated} ) && length( $def->{deprecated} // '' ) ) { if( exists( $boolean_map->{ $def->{deprecated} } ) ) { $def->{deprecated} = $boolean_map->{ $def->{deprecated} }; } else { die( "The BCP47 extension $def->{extension} has a deprecated status ($def->{deprecated}), but its value is unknown (neither true or false): ", dump( $def ) ); } } eval { $sth->execute( @$def{qw( category extension alias value_type description deprecated )} ); } || die( "Error adding BCP47 extension information for category '$def->{category}' and extension '$def->{extension}' from file ${f}: ", ( $@ || $sth->errstr ), "\n", dump( $def ) ); $n++; my @values = $el->getChildrenByTagName( 'type' ); scalar( @values ) || die( "No data found for BCP47 extension '$def->{extension}' in category '${cat}' in file ${f}" ); # foreach my $el_val ( @values ) { my $val = $el_val->getAttribute( 'name' ) || die( "Unable to get an extension value attribute for category '${cat}' and extension '$def->{extension}' from file ${f} for element: ", $el_val->toString() ); my $desc = $el_val->getAttribute( 'description' ) || die( "Unable to get an extension description attribute for category '${cat}' and extension '$def->{extension}' from file ${f} for element: ", $el_val->toString() ); eval { $sth_bcp_sth->execute( $cat, $def->{extension}, $val, $desc ); } || die( "Error adding BCP47 extension value information for value '${val}' and category '$def->{category}' and extension '$def->{extension}' from file ${f}: ", ( $@ || $sth_bcp_sth->errstr ) ); } } $out->print( "ok\n" ) if( $DEBUG ); $n++; } &log( "${n} extensions data added." ); # NOTE: Loading casings &log( "Loading casings." ); $casings_dir->open || die( "Unable to open directory $casings_dir: ", $casings_dir->error ); $n = 0; $sth = $sths->{casings} || die( "No SQL statements object for casings table." ); # while( my $f = $casings_dir->read( as_object => 1, exclude_invisible => 1 ) ) @files = $casings_dir->read( as_object => 1, exclude_invisible => 1, 'sort' => 1 ); foreach my $f ( @files ) { next unless( $f->extension eq 'xml' ); my $basename = $f->basename( '.xml' ); my $locDoc = load_xml( $f ); my $locale = identity_to_locale( $locDoc ); ( my $locale2 = $basename ) =~ tr/_/-/; if( lc( $locale ) ne lc( $locale2 ) && $locale2 ne 'root' ) { warn( "XML identity says the locale is '${locale}', but the file basename says it should be '${locale2}', and I think the file basename is correct for file $f" ); $locale = $locale2; } if( index( $locale, 'root' ) != -1 ) { if( length( $locale ) > 4 ) { my $loc = Locale::Unicode->new( $locale ); $loc->language( 'und' ); $locale = $loc->as_string; } else { $locale = 'und'; } } $out->print( "[${locale}] " ) if( $DEBUG ); my $elemsRes = $locDoc->findnodes( '//casingData/casingItem' ); if( !$elemsRes->size ) { warn( "Warning only: no casing items found for locale '${locale}' in file $f" ); $out->print( "ignored. No data\n" ) if( $DEBUG ); next; } # lowercase # There should be 22 to 24 of those casing tokens my $cnt = 0; while( my $el = $elemsRes->shift ) { my $token = $el->getAttribute( 'type' ) || die( "No attribute 'type' found for this casing element: ", $el->toString() ); my $value = $el->textContent || die( "No value found for this casing element '${token}': ", $el->toString() ); eval { $sth->execute( $locale, $token, $value ); } || die( "Error adding casing information for locale '${locale}' and token '${token}' and value '${value}' for file ${f}: ", ( $@ || $sth->errstr ) ); $cnt++; } $out->print( "ok -> ${cnt} rows\n" ) if( $DEBUG ); $n++; } &log( "${n} locale casing information added." ); # NOTE: Loading day periods &log( "Loading day periods." ); $n = 0; my $total_locales = 0; my $day_periods_file = $basedir->child( 'supplemental/dayPeriods.xml' ); my $dayPeriodsDoc = load_xml( $day_periods_file ); $sth = $sths->{day_periods} || die( "No SQL statement object for day_periods" ); my $rules = $dayPeriodsDoc->findnodes( '//dayPeriodRuleSet[not(@type)]/dayPeriodRules' ) || die( "Unable to find day periods ruleset in file $day_periods_file" ); if( !$rules->size ) { die( "No rules found in day period XML file $day_periods_file" ); } # Example: while( my $el = $rules->shift ) { my $locales = $el->getAttribute( 'locales' ) || die( "No attribute 'locales' found for this day period element: ", $el->toString() ); $locales = [split( /[[:blank:]\h\v]+/, $locales )]; # Example: # my $dpRules = $el->findnodes( './dayPeriodRule' ); if( !$dpRules->size ) { warn( "Warning only: unable to find child elements 'dayPeriodRule' for locale '", join( ', ', @$locales ), "' for this day period rule set in file ${day_periods_file}: ", $el->toString() ); } while( my $el_rule = $dpRules->shift ) { my $token = $el_rule->getAttribute( 'type' ) || die( "No attribute 'type' for this day period rule element: ", $el_rule->toString() ); my( $from, $before ); if( $el_rule->hasAttribute( 'at' ) ) { $from = $before = $el_rule->getAttribute( 'at' ) || die( "Unable to get attribute 'at' for this day period element: ", $el_rule->toString() ); } else { $from = $el_rule->getAttribute( 'from' ) || die( "Unable to get attribute 'from' for this day period element: ", $el_rule->toString() ); $before = $el_rule->getAttribute( 'before' ) || die( "Unable to get attribute 'before' for this day period element: ", $el_rule->toString() ); } foreach my $locale ( @$locales ) { # Should not be needed, but better safe than sorry $locale =~ tr/_/-/; if( index( $locale, 'root' ) != -1 ) { if( length( $locale ) > 4 ) { my $loc = Locale::Unicode->new( $locale ); $loc->language( 'und' ); $locale = $loc->as_string; } else { $locale = 'und'; } } eval { $sth->execute( $locale, $token, $from, $before ); } || die( "Error adding day period information for locale '${locale}' and token '${token}': ", ( $@ || $sth->errstr ) ); $n++; } $total_locales += scalar( @$locales ); } } &log( "${n} day periods added for ${total_locales} locales." ); # NOTE: Loading localised data &log( "Loading localised data." ); $n = 0; $main_dir->open || die( $main_dir->error ); my $sth_locale = $sths->{locales_l10n} || die( "No SQL statement object for locales_l10n" ); my $sth_script = $sths->{scripts_l10n} || die( "No SQL statement object for scripts_l10n" ); my $sth_territory = $sths->{territories_l10n} || die( "No SQL statement object for territories_l10n" ); my $sth_variant = $sths->{variants_l10n} || die( "No SQL statement object for variants_l10n" ); my $sth_currency = $sths->{currencies_l10n} || die( "No SQL statement object for currencies_l10n" ); my $sth_cal_term = $sths->{calendar_terms} || die( "No SQL statement object for calendar_terms" ); my $sth_cal_era = $sths->{calendar_eras_l10n} || die( "No SQL statement object for calendar_eras_l10n" ); my $sth_dt_fmt = $sths->{calendar_formats_l10n} || die( "No SQL statement object for calendar_formats_l10n" ); my $sth_dt_pat_fmt = $sths->{calendar_datetime_formats} || die( "No SQL statement object for calendar_datetime_formats" ); my $sth_avail_fmt = $sths->{calendar_available_formats} || die( "No SQL statement object for calendar_available_formats" ); my $sth_append_fmt = $sths->{calendar_append_formats} || die( "No SQL statement object for calendar_append_formats" ); my $sth_inter_fmt = $sths->{calendar_interval_formats} || die( "No SQL statement object for calendar_interval_formats" ); my $sth_cyclic = $sths->{calendar_cyclics_l10n} || die( "No SQL statement object for calendar_cyclics_l10n" ); my $sth_field = $sths->{date_fields_l10n} || die( "No SQL statement object for date_fields_l10n" ); my $sth_time_rel = $sths->{time_relative_l10n} || die( "No SQL statement object for time_relative_l10n" ); my $sth_date_term = $sths->{date_terms} || die( "No SQL statement object for date_terms" ); my $sth_locale_info = $sths->{locales_info} || die( "No SQL statement object for locales_info" ); my $sth_locale_num_sys = $sths->{locale_number_systems} || die( "No SQL statement object for locale_number_systems" ); my $sth_num_sys_l10n = $sths->{number_systems_l10n} || die( "No SQL statement object for number_systems_l10n" ); my $sth_cals_l10n = $sths->{calendars_l10n} || die( "No SQL statement object for calendars_l10n" ); my $sth_collation_l10n = $sths->{collations_l10n} || die( "No SQL statement object for collations_l10n" ); my $sth_timezone_city = $sths->{timezones_cities} || die( "No SQL statement object for timezones_cities" ); my $sth_tz_formats = $sths->{timezones_formats} || die( "No SQL statement object for timezones_formats" ); my $sth_tz_names = $sths->{timezones_names} || die( "No SQL statement object for timezones_names" ); my $sth_metatz_names = $sths->{metazones_names} || die( "No SQL statement object for metazones_names" ); my $patch = { '45.0' => { calendar_interval_formats => { 'an' => { 'yMEd' => { # See CLDR bug report No 17800 # 'M' => 'E, d/M/y – E, d/M/y', }, }, 'brx' => { 'Md' => { # See CLDR bug report No 17808 # 'M' => 'd/M – d/M', }, }, 'hi' => { 'GyM' => { # See CLDR bug report No 17809 # 'M' => 'GGGGG M/y – M/y', }, }, }, }, }; my $localesRes; #while( my $f = $main_dir->read( as_object => 1, exclude_invisible => 1, 'sort' => 1 ) ) @files = $main_dir->read( as_object => 1, exclude_invisible => 1, 'sort' => 1 ); # We need to process root.xml first, as it hold some core default values such as for date/time format skeletons that are not always present in other locales my $root_file; for( my $i = 0; $i < scalar( @files ); $i++ ) { my $f = $files[$i]; if( $f->basename eq 'root.xml' ) { $root_file = $f; splice( @files, $i, 1 ); last; } } if( !defined( $root_file ) ) { die( "I was unable to find the root.xml file in ${main_dir}" ); } unshift( @files, $root_file ); # calendar_id -> date|time -> full|long|medium|short = datetimeSkeleton my $calendars_date_time_skeletons = {}; $out->printf( "Processing %d localised data files.\n", scalar( @files ) ) if( $DEBUG ); foreach my $f ( @files ) { next unless( $f->extension eq 'xml' ); my $mainDoc = load_xml( $f ); my $locale = identity_to_locale( $mainDoc ); ( my $locale2 = $f->basename( '.xml' ) ) =~ tr/_/-/; if( lc( $locale ) ne lc( $locale2 ) && $locale2 ne 'root' ) { warn( "XML identity says the locale is '${locale}', but the file basename says it should be '${locale2}', and I think the file basename is correct for file $f" ); $locale = $locale2; } if( index( $locale, 'root' ) != -1 ) { if( length( $locale ) > 4 ) { my $loc = Locale::Unicode->new( $locale ); $loc->language( 'und' ); $locale = $loc->as_string; } else { $locale = 'und'; } } $out->print( "Processing ${locale} data from file ${f}\n" ) if( $DEBUG ); my $added = {}; # Check whether there is any data at all. Some XML file, such as the one for ja_JP.xml contains only the 'identity' tag my $hasData = $mainDoc->findnodes( '//localeDisplayNames' ); if( $hasData->size ) { # NOTE: Loading locales L10N &log( "[${locale}] Loading Locales L10N for locale ${locale}." ); $localesRes = $mainDoc->findnodes( '//localeDisplayNames/languages/language[@type]' ); if( !$localesRes->size ) { warn( "Warning only: no locales localised names found for locale '${locale}' in file $f" ); } # Example: japonais while( my $el = $localesRes->shift ) { my $id = $el->getAttribute( 'type' ) || die( "Unable to get the attribute 'type' value for this element: ", $el->toString() ); my $val = $el->textContent; if( index( $val, '&' ) != -1 && index( $val, ';' ) != -1 ) { $val = decode_entities( $val ); } # Unfortunately, it seems that they are not 'languages', but 'locales', so this is a misnomer # And, it is formatted with underscore when the canonical version is with a dash ('-') $id =~ tr/_/-/; if( index( $id, 'root' ) != -1 ) { if( length( $id ) > 4 ) { my $loc = Locale::Unicode->new( $id ); $loc->language( 'und' ); $id = $loc->as_string; } else { $id = 'und'; } my $hasUndLocaleRes = $mainDoc->findnodes( '/ldml/localeDisplayNames/languages/language[@type="und"]' ); if( $id eq 'und' && $hasUndLocaleRes->size ) { warn( "Found locale ID 'root', but there is already a locale ID 'und' that is also defined for locale ${locale} in file ${f}, skipping." ); next; } } my $def = { locale => $locale, locale_id => $id, locale_name => $val, }; if( $el->hasAttribute( 'alt' ) ) { $def->{alt} = $el->getAttribute( 'alt' ); } eval { $sth_locale->execute( @$def{qw( locale locale_id locale_name alt )} ); } || die( "Error adding localised information from file ${f} for locale ${locale} and locale ID '${id}': ", ( $@ || $sth_locale->errstr ), "\nwith query: ", $sth_locale->{Statement}, "\n", dump( $def ) ); $added->{languages}++; } # NOTE: Loading script L10N &log( "\tLoading script L10N." ); $localesRes = $mainDoc->findnodes( '//localeDisplayNames/scripts/script[@type]' ); if( !$localesRes->size ) { warn( "Warning only: no scripts localised names found for locale '${locale}' in file $f" ); } # Example: while( my $el = $localesRes->shift ) { my $id = $el->getAttribute( 'type' ) || die( "Unable to get the attribute 'type' value for this element: ", $el->toString() ); my $val = $el->textContent; if( index( $val, '&' ) != -1 && index( $val, ';' ) != -1 ) { $val = decode_entities( $val ); } my $alt; if( $el->hasAttribute( 'alt' ) ) { $alt = $el->getAttribute( 'alt' ); } eval { $sth_script->execute( $locale, $id, $val, $alt ); } || die( "Error adding localised information from file ${f} for locale ${locale} and for script ${id}: ", ( $@ || $sth_script->errstr ), "\nwith query: ", $sth_script->{Statement} ); $added->{scripts}++; } # NOTE: Loading territories L10N &log( "\tLoading territories L10N." ); $localesRes = $mainDoc->findnodes( '//localeDisplayNames/territories/territory[@type]' ); if( !$localesRes->size ) { warn( "Warning only: no territories localised names found for locale '${locale}' in file $f" ); } # Example: Japon while( my $el = $localesRes->shift ) { my $id = $el->getAttribute( 'type' ) || die( "Unable to get the attribute 'type' value for this element: ", $el->toString() ); my $val = $el->textContent; if( index( $val, '&' ) != -1 && index( $val, ';' ) != -1 ) { $val = decode_entities( $val ); } my $alt; if( $el->hasAttribute( 'alt' ) ) { $alt = $el->getAttribute( 'alt' ); } eval { $sth_territory->bind_param( 1, $locale, SQL_VARCHAR ); $sth_territory->bind_param( 2, "$id", SQL_VARCHAR ); $sth_territory->bind_param( 3, $val, SQL_VARCHAR ); $sth_territory->bind_param( 4, $alt, SQL_VARCHAR ); $sth_territory->execute; } || die( "Error adding localised information from file ${f} for locale ${locale} and for territory ${id}: ", ( $@ || $sth_territory->errstr ), "\nwith query: ", $sth_territory->{Statement} ); $added->{territories}++; } # NOTE: Loading variants L10N &log( "\tLoading variants L10N." ); $localesRes = $mainDoc->findnodes( '//localeDisplayNames/variants/variant[@type]' ); if( !$localesRes->size ) { warn( "Warning only: no variants localised names found for locale '${locale}' in file $f" ); } # Example: valencien while( my $el = $localesRes->shift ) { my $id = $el->getAttribute( 'type' ) || die( "Unable to get the attribute 'type' value for this element: ", $el->toString() ); my $val = $el->textContent; if( index( $val, '&' ) != -1 && index( $val, ';' ) != -1 ) { $val = decode_entities( $val ); } $id = lc( $id ); my $alt; if( $el->hasAttribute( 'alt' ) ) { $alt = $el->getAttribute( 'alt' ); } eval { $sth_variant->execute( $locale, $id, $val, $alt ); } || die( "Error adding localised information from file ${f} for locale ${locale} and for variant ${id}: ", ( $@ || $sth_variant->errstr ), "\nwith query: ", $sth_variant->{Statement} ); $added->{variants}++; } # NOTE: Loading currencies L10N &log( "\tLoading currencies L10N." ); $localesRes = $mainDoc->findnodes( '//numbers/currencies/currency[@type]' ); if( !$localesRes->size ) { warn( "Warning only: unable to get the localised names for locale '${locale}' for currencies in file $f" ); } # Example: # # Japanese Yen # Japanese yen # Japanese yen # ¥ # while( my $el = $localesRes->shift ) { my $id = $el->getAttribute( 'type' ) || die( "Unable to get the attribute 'type' value for this element: ", $el->toString() ); my $def = { locale => $locale, currency => $id, }; my $symbolRes = $el->findnodes( './symbol' ); if( $symbolRes->size ) { my $el_symbol = $symbolRes->shift; $def->{symbol} = trim( $el_symbol->textContent ); } my $namesRes = $el->findnodes( './displayName' ); if( !$namesRes->size ) { warn( "Warning only: currency '${id}' exists for locale ${locale}, but no localised names is defined in file $f for this element: ", $el->toString() ) unless( $locale eq 'und' ); } while( my $el_name = $namesRes->shift ) { if( $el_name->hasAttribute( 'count' ) ) { $def->{count} = $el_name->getAttribute( 'count' ) || die( "No value provided for 'count' for this currency '${id}' locale name value: ", $el_name->toString() ); } else { $def->{count} = undef; } my $val = $el_name->textContent; if( index( $val, '&' ) != -1 && index( $val, ';' ) != -1 ) { $val = decode_entities( $val ); } $def->{locale_name} = $val; eval { $sth_currency->execute( @$def{qw( locale currency count locale_name symbol )} ); } || die( "Error adding localised information from file ${f} for locale ${locale} and for currency ${id}: ", ( $@ || $sth_currency->errstr ), "\nwith query: ", $sth_currency->{Statement}, "\n", dump( $def ) ); $added->{currencies}++; } } } else { $out->print( "no data, skipping.\n" ) if( $DEBUG ); # next; } # NOTE: Load calendar terms, locale eras, formats, timezones and more &log( "\tLoad calendar terms, locale eras, formats and more." ); my $calLocalesDatesRes = $mainDoc->findnodes( '/ldml/dates' ); if( $calLocalesDatesRes->size ) { my $el_dates = $calLocalesDatesRes->shift; my $calLocalesCalendarsRes = $el_dates->findnodes( './calendars/calendar' ); if( !$calLocalesCalendarsRes->size ) { warn( "Warning only: unable to get the localised terms for locale '${locale}' for calendars in file $f" ); } # while( my $el = $calLocalesCalendarsRes->shift ) { my $cal_id = $el->getAttribute( 'type' ) || die( "Unable to get the calendar ID value from attribute 'type' for this element: ", $el->toString() ); # NOTE: Check for calendar terms my $cal_term_types = { month => { xpath_container => './months', xpath_context => './monthContext', xpath_width => './monthWidth', xpath_terms => './month', }, day => { xpath_container => './days', xpath_context => './dayContext', xpath_width => './dayWidth', xpath_terms => './day', }, quarter => { xpath_container => './quarters', xpath_context => './quarterContext', xpath_width => './quarterWidth', xpath_terms => './quarter', }, day_period => { xpath_container => './dayPeriods', xpath_context => './dayPeriodContext', xpath_width => './dayPeriodWidth', xpath_terms => './dayPeriod', }, }; foreach my $type ( sort( keys( %$cal_term_types ) ) ) { my $this = $cal_term_types->{ $type }; my $calTermContainerRes = $el->findnodes( $this->{xpath_container} ); if( !$calTermContainerRes->size ) { $out->print( "\tNo terms container of type ${type} found for calendar ${cal_id} for locale ${locale} in file ${f}\n" ) if( $DEBUG ); next; } my $el_container = $calTermContainerRes->shift; # Example: # # # # # # # # # my $calTermContainerHasAliasRes = $el_container->findnodes( './alias[@path]' ); if( $calTermContainerHasAliasRes->size ) { $out->print( "\tCalendar ${cal_id} terms container of type ${type} is aliased. Resolving it... " ) if( $DEBUG ); $el_container = resolve_alias( $calTermContainerHasAliasRes ) || die( "Calendar ${cal_id} terms containers of type ${type} is aliased, but the resolved element contains nothing for locale ${locale} in file ${f}" ); $out->print( "ok\n" ) if( $DEBUG ); } my $calTermContextRes = $el_container->findnodes( $this->{xpath_context} ); # while( my $el_context = $calTermContextRes->shift ) { my $context = $el_context->getAttribute( 'type' ) || die( "This calendar ${cal_id} ${type} context has no attribute 'type' value: ", $el_context->toString() ); my $calTermContextHasAliasRes = $el_context->findnodes( './alias[@path]' ); if( $calTermContextHasAliasRes->size ) { $out->print( "\tCalendar ${cal_id} terms context of type ${type} is aliased. Resolving it... " ) if( $DEBUG ); $el_context = resolve_alias( $calTermContextHasAliasRes ) || die( "Calendar ${cal_id} terms context of type ${type} is aliased, but the resolved element contains nothing for locale ${locale} in file ${f}" ); $out->print( "ok\n" ) if( $DEBUG ); } my $calTermWidthRes = $el_context->findnodes( $this->{xpath_width} ); # while( my $el_term_width = $calTermWidthRes->shift ) { my $width = $el_term_width->getAttribute( 'type' ); my $calTermWidthHasAliasRes = $el_term_width->findnodes( './alias[@path]' ); if( $calTermWidthHasAliasRes->size ) { $out->print( "\tCalendar ${cal_id} terms width of type ${type} is aliased. Resolving it... " ) if( $DEBUG ); $el_term_width = resolve_alias( $calTermWidthHasAliasRes ) || die( "Calendar ${cal_id} terms width of type ${type} for context ${context} is aliased, but the resolved element contains nothing for locale ${locale} in file ${f}" ); $out->print( "ok\n" ) if( $DEBUG ); } my $calTermsRes = $el_term_width->findnodes( $this->{xpath_terms} ); # Jan while( my $el_term = $calTermsRes->shift ) { my $def = { locale => $locale, calendar => $cal_id, term_type => $type, term_context => $context, term_width => $width, term_name => $el_term->getAttribute( 'type' ), term_value => trim( $el_term->textContent ), }; foreach my $att ( qw( alt yeartype ) ) { if( $el_term->hasAttribute( $att ) ) { $def->{ $att } = $el_term->getAttribute( $att ); } } eval { $sth_cal_term->execute( @$def{qw( locale calendar term_type term_context term_width alt yeartype term_name term_value )} ); } || die( "Error executing query to add calendar ${cal_id} term of type '${type}' for locale '${locale}' and for calendar '${cal_id}' from file ${f}: ", ( $@ || $sth_cal_term->errstr ), "\nwith query: ", $sth_cal_term->{Statement}, "\n", dump( $def ) ); $added->{cal_terms}++; } } } } # NOTE: Check for calendar eras &log( "\tCheck for calendar eras." ); my $calErasRes = $el->findnodes( './eras' ); if( $calErasRes->size ) { my $el_eras = $calErasRes->shift; my $cal_eras_map = { wide => './eraNames', abbreviated => './eraAbbr', narrow => './eraNarrow', }; foreach my $width ( sort( keys( %$cal_eras_map ) ) ) { my $xpath = $cal_eras_map->{ $width }; my $calErasWidthRes = $el_eras->findnodes( $xpath ); if( !$calErasWidthRes->size ) { $out->print( "\tno era width ${width} found, skipping.\n" ) if( $DEBUG ); next; } my $el_eras_width = $calErasWidthRes->shift; my $calErasWidthHasAliasRes = $el_eras_width->findnodes( './alias[@path]' ); if( $calErasWidthHasAliasRes->size ) { $el_eras_width = resolve_alias( $calErasWidthHasAliasRes ) || die( "Unable to resolve alias for calendar ${cal_id} of width ${width} for locale ${locale} in file ${f} for this element: ", $el_eras->toString() ); } my $calErasDataRes = $el_eras_width->findnodes( './era' ); # Before Christ while( my $el_cal_era = $calErasDataRes->shift ) { my $def = { locale => $locale, calendar => $cal_id, era_width => $width, era_id => $el_cal_era->getAttribute( 'type' ), locale_name => $el_cal_era->textContent, }; if( $el_cal_era->hasAttribute( 'alt' ) ) { $def->{alt} = $el_cal_era->getAttribute( 'alt' ); } eval { $sth_cal_era->execute( @$def{qw( locale calendar era_width era_id alt locale_name )} ); } || die( "Error executing query to add calendar era of width '${width}' for locale '${locale}' and for calendar '${cal_id}' from file ${f}: ", ( $@ || $sth_cal_era->errstr ), "\nwith query: ", $sth_cal_era->{Statement}, "\n", dump( $def ) ); $added->{cal_era}++; } } } else { $out->print( "\tno era found for calendar ${cal_id} in locale ${locale}\n" ) if( $DEBUG ); } # NOTE: Check for calendar date/time formats &log( "\tCheck for calendar date/time formats." ); my $cal_date_time_map = { date => { xpath_container => './dateFormats', xpath_len => './dateFormatLength', xpath_fmt => './dateFormat', xpath_pat => './pattern', xpath_skel => './datetimeSkeleton', }, 'time' => { xpath_container => './timeFormats', xpath_len => './timeFormatLength', xpath_fmt => './timeFormat', xpath_pat => './pattern', xpath_skel => './datetimeSkeleton', }, }; # # foreach my $type ( sort( keys( %$cal_date_time_map ) ) ) { $out->print( "\t\tChecking for formats for ${type}\n" ) if( $DEBUG ); my $this = $cal_date_time_map->{ $type }; # A cache of pattern value to their ID (skeleton) so we can lookup a missing skeleton for an identical pattern my $cache_values = {}; my $calDateOrTimeContainerRes = $el->findnodes( $this->{xpath_container} ); if( !$calDateOrTimeContainerRes->size ) { $out->print( "\t\tno format of type ${type} found for locale ${locale} for calendar ${cal_id}\n" ) if( $DEBUG ); next; } my $el_container = $calDateOrTimeContainerRes->shift; my $calDtContainerHasAliasRes = $el_container->findnodes( './alias[@path]' ); if( $calDtContainerHasAliasRes->size ) { $el_container = resolve_alias( $calDtContainerHasAliasRes ) || die( "The calendar formats container for ${type} is aliased, but could not get the resolved path for calendar ${cal_id} for locale ${locale} for this element: ", $el->toString() ); } my $calDateOrTimeLengthRes = $el_container->findnodes( $this->{xpath_len} ); if( !$calDateOrTimeLengthRes->size ) { die( "No calendar ${cal_id} format length tag found for locale ${locale} in file ${f} for this element: ", $el->toString() ); } while( my $el_len = $calDateOrTimeLengthRes->shift ) { my $len = $el_len->getAttribute( 'type' ) || die( "Unable to get the ${type} length type for locale ${locale} in file ${f} for this element: ", $el_len->toString() ); my $calDtLengthHasAliasRes = $el_len->findnodes( './alias[@path]' ); if( $calDtLengthHasAliasRes->size ) { $el_len = resolve_alias( $calDtLengthHasAliasRes ) || die( "The calendar ${cal_id} format length ${len} is aliased, but I am unable to get the resolved path for locale ${locale} in file ${f} for this element: ", $el_container->toString() ); } my $calDateOrTimeFormatRes = $el_len->findnodes( $this->{xpath_fmt} ); if( !$calDateOrTimeFormatRes->size ) { $out->print( "\t\t\tno calendar ${cal_id} formats found for length ${len} for locale ${locale}\n" ) if( $DEBUG ); next; } my $el_fmt = $calDateOrTimeFormatRes->shift; my $calDtFormatHasAliasRes = $el_fmt->findnodes( './alias[@path]' ); if( $calDtFormatHasAliasRes->size ) { $el_fmt = resolve_alias( $calDtFormatHasAliasRes ) || die( "The Date or time format of type ${type} for length ${len} in calendar ${cal_id} is aliased, but I cannot resolve its path for locale ${locale} in file ${f} for this element: ", $el_len->toString() ); } # yMMMd my $calFormatIdRes = $el_fmt->findnodes( $this->{xpath_skel} ); # MMM d, y my $calFormatValueRes = $el_fmt->findnodes( $this->{xpath_pat} ); my $pattern_id; # if( !$calFormatIdRes->size ) # { # warn( "Warning only: no ID (skeleton) for this ${type} format for locale ${locale} in file ${f} for this element. Skipping: ", $el_len->toString() ); # next; # } # elsif( $calFormatIdRes->size > 1 ) if( $calFormatIdRes->size > 1 ) { die( "More than one ID (skeleton) found (", $calFormatValueRes->size, ") for this ${type} format for locale ${locale} in file ${f} for this element: ", $el_len->toString() ); } elsif( $calFormatIdRes->size ) { my $el_cal_fmt_id = $calFormatIdRes->shift || die( "No ${type} format ID (skeleton) element could be retrieved for locale ${locale} in file ${f} for this element: ", $el_len->toString() ); $pattern_id = $el_cal_fmt_id->textContent; # NOTE: Save the pattern ID for this calendar date/time length for other locales missing it # If we are processing the root locale, we keep a record of the pattern ID for this length and calendar if( $locale eq 'und' ) { $out->print( "\t\t\t[root] Saving pattern ID '${pattern_id}' for length '${len}' for type '${type}', and calendar ID '${cal_id}'\n" ) if( $DEBUG ); $calendars_date_time_skeletons->{ $cal_id } ||= {}; $calendars_date_time_skeletons->{ $cal_id }->{ $type } ||= {}; $calendars_date_time_skeletons->{ $cal_id }->{ $type }->{ $len } = $pattern_id; } } if( !$calFormatValueRes->size ) { die( "No value (pattern) for this ${type} format for locale ${locale} in file ${f} for this element: ", $el_len->toString() ); } # my $pattern_id = $el_cal_fmt_id->textContent; while( my $el_dt_val = $calFormatValueRes->shift ) { my $pat_val = $el_dt_val->textContent; if( !defined( $pattern_id ) ) { if( exists( $calendars_date_time_skeletons->{ $cal_id }->{ $type }->{ $len } ) ) { $pattern_id = $calendars_date_time_skeletons->{ $cal_id }->{ $type }->{ $len }; warn( "Warning only: ID (skeleton) for this type ${type} and length '${len}' and calendar '${cal_id}' and locale '${locale}' was missing, but could get it from the root cache." ); } elsif( exists( $cache_values->{ $pat_val } ) ) { warn( "Warning only: no ID (skeleton) for this ${type} format for locale ${locale} in file ${f} for this element, but found a cache value (", $cache_values->{ $pat_val } , ") for pattern value '${pat_val}': ", $el_len->toString() ); $pattern_id = $cache_values->{ $pat_val }; } else { warn( "Warning only: no ID (skeleton) for this ${type} format for locale ${locale} in file ${f} for this element and no cache value found either. Skipping: ", $el_len->toString() ); next; } } my $def = { locale => $locale, calendar => $cal_id, format_type => $type, format_length => $len, format_id => $pattern_id, format_pattern => $pat_val, }; $cache_values->{ $pat_val } = $pattern_id; if( $el_dt_val->hasAttribute( 'alt' ) ) { $def->{alt} = $el_dt_val->getAttribute( 'alt' ); } eval { $sth_dt_fmt->execute( @$def{qw( locale calendar format_type format_length alt format_id format_pattern )} ); } || die( "Error executing query to add calendar ${type} format for ID '$def->{format_id}' for locale '${locale}' and for calendar '${cal_id}' from file ${f}: ", ( $@ || $sth_dt_fmt->errstr ), "\nwith query: ", $sth_dt_fmt->{Statement}, "\n", dump( $def ) ); $added->{cal_date_or_time_format}++; } } } # NOTE: Checking datetime formats &log( "\tChecking datetime formats." ); my $calDateTimeContainerRes = $el->findnodes( './dateTimeFormats' ); if( $calDateTimeContainerRes->size ) { my $el_container = $calDateTimeContainerRes->shift; my $calDateTimeHasAliasRes = $el_container->findnodes( './alias[@path]' ); if( $calDateTimeHasAliasRes->size ) { $out->print( "\t\tDateTime container for calendar ${cal_id} is aliased for locale ${locale}, resolving it.\n" ) if( $DEBUG ); $el_container = resolve_alias( $calDateTimeHasAliasRes ) || die( "DateTime container for calendar ${cal_id} is aliased, but could not resolve its path for locale ${locale} in file ${f}" ); } my $calDateTimeLengthRes = $el_container->findnodes( './dateTimeFormatLength' ); # # full, long, medium, short while( my $el_len = $calDateTimeLengthRes->shift ) { my $len = $el_len->getAttribute( 'type' ); my $calDtLengthHasAliasRes = $el_len->findnodes( './alias[@path]' ); if( $calDtLengthHasAliasRes->size ) { $out->print( "\t\t\tthe DateTime format length ${len} tag is aliased, resolving it.\n" ) if( $DEBUG ); $el_len = resolve_alias( $calDtLengthHasAliasRes ) || die( "The DateTime format length ${len} for calendar ${cal_id} is aliased, but I am unable to resolve it for locale ${locale} in file ${f} for this element: ", $el_len->toString() ); } my $calFmtRes = $el_len->findnodes( './dateTimeFormat' ); while( my $el_fmt = $calFmtRes->shift ) { # my $type = $el_fmt->hasAttribute( 'type' ) ? $el_fmt->getAttribute( 'type' ) : 'standard'; # Compensate for a bug (reported) where a 'type' attribute is missing on 'dateTimeFormat' tag, which would prevent any alias from resolving # For example in file main/root.xml, would fail, because there is no dateTimeFormat with 'type' attribute with value 'standard' if( !$el_fmt->hasAttribute( 'type' ) ) { $el_fmt->setAttribute( type => 'standard' ); } $type = $el_fmt->getAttribute( 'type' ); my $calDtFormatHasAliasRes = $el_fmt->findnodes( './alias[@path]' ); if( $calDtFormatHasAliasRes->size ) { $out->print( "\t\t\t\tThe calendar ${cal_id} DateTime format length ${len} format is aliased, resolving it.\n" ) if( $DEBUG ); $el_fmt = resolve_alias( $calDtFormatHasAliasRes ) || die( "The calendar ${cal_id} DateTime format length ${len} format is aliased, but I am unable to resolve it for locale ${locale} in file ${f} for this element: ", $el_len->toString() ); } my $calPatternsRes = $el_fmt->findnodes( './pattern' ); # {1}, {0} while( my $el_pat = $calPatternsRes->shift ) { my $def = { locale => $locale, calendar => $cal_id, format_length => $len, format_type => $type, format_pattern => $el_pat->textContent, }; eval { $sth_dt_pat_fmt->execute( @$def{qw( locale calendar format_length format_type format_pattern )} ); } || die( "Error executing query to add calendar ${type} format pattern '", ( $def->{format_pattern} // 'undef' ), "' for locale '${locale}' and for calendar '${cal_id}' from file ${f}: ", ( $@ || $sth_dt_pat_fmt->errstr ), "\nwith query: ", $sth_dt_pat_fmt->{Statement}, "\n", dump( $def ) ); $added->{cal_datetime_format}++; } } } # NOTE: Checking available datetime formats &log( "\tChecking available datetime formats." ); my $calAvailableFormatsRes = $el_container->findnodes( './availableFormats' ); if( $calAvailableFormatsRes->size ) { my $el_available = $calAvailableFormatsRes->shift; my $calAvailableHasAliasRes = $el_available->findnodes( './alias[@path]' ); if( $calAvailableHasAliasRes->size ) { $el_available = resolve_alias( $calAvailableHasAliasRes ) || die( "Calendard ${cal_id} available formats is aliased, but I could not resolve it for locale ${locale} in file ${f} for this element: ", $el_container->toString() ); } my $calAvailableFormatsItemsRes = $el_available->findnodes( './dateFormatItem' ); # h:mm:ss B while( my $el_item = $calAvailableFormatsItemsRes->shift ) { my $def = { locale => $locale, calendar => $cal_id, format_id => ( $el_item->getAttribute( 'id' ) || die( "Unable to get the available format ID from the attribute 'id' in this element: ", $el_item->toString() ) ), format_pattern => $el_item->textContent, }; if( !defined( $def->{format_pattern} ) || !length( $def->{format_pattern} // '' ) ) { die( "No pattern found for this available format with id '$def->{format_id}' for calendar '${cal_id}' and locale '${locale}': ", $el_item->toString() ); } if( $el_item->hasAttribute( 'count' ) ) { $def->{count} = $el_item->getAttribute( 'count' ); } if( $el_item->hasAttribute( 'alt' ) ) { $def->{alt} = $el_item->getAttribute( 'alt' ); } eval { $sth_avail_fmt->execute( @$def{qw( locale calendar format_id format_pattern count alt )} ); } || die( "Error executing query to add calendar available format '", ( $def->{format_pattern} // 'undef' ), "' with id '$def->{format_id}' for locale '${locale}' and for calendar '${cal_id}' from file ${f}: ", ( $@ || $sth_avail_fmt->errstr ), "\nwith query: ", $sth_avail_fmt->{Statement}, "\n", dump( $def ) ); $added->{cal_available_format}++; } } # NOTE: Checking calendar append items &log( "\tChecking calendar append items." ); my $calDateTimeAppendRes = $el_container->findnodes( './appendItems' ); if( $calDateTimeAppendRes->size ) { my $el_append = $calDateTimeAppendRes->shift; my $calDateTimeAppendHasAliasRes = $el_append->findnodes( './alias[@path]' ); if( $calDateTimeAppendHasAliasRes->size ) { $out->print( "\t\tCalendar ${cal_id} append formats is aliased, resolving it.\n" ) if( $DEBUG ); $el_append = resolve_alias( $calDateTimeAppendHasAliasRes ) || die( "Calendar ${cal_id} append formats is aliased, but I cannot resolve it for locale ${locale} in file ${f} for this element: ", $el_container->toString() ); } my $calAppendItemsRes = $el_append->findnodes( './appendItem' ); # {0} {1} while( my $el_append = $calAppendItemsRes->shift ) { my $def = { locale => $locale, calendar => $cal_id, format_id => ( $el_append->getAttribute( 'request' ) || die( "Unable to get the append format pattern from the attribute 'request' in this element: ", $el_append->toString() ) ), format_pattern => $el_append->textContent, }; if( !defined( $def->{format_pattern} ) || !length( $def->{format_pattern} // '' ) ) { die( "No pattern found for this append item format with id '$def->{format_id}' for calendar '${cal_id}' and locale '${locale}': ", $el_append->toString() ); } eval { $sth_append_fmt->execute( @$def{qw( locale calendar format_id format_pattern )} ); } || die( "Error executing query to add calendar append item format '", ( $def->{format_pattern} // 'undef' ), "' with id '$def->{format_id}' for locale '${locale}' and for calendar '${cal_id}' from file ${f}: ", ( $@ || $sth_append_fmt->errstr ), "\nwith query: ", $sth_append_fmt->{Statement}, "\n", dump( $def ) ); $added->{cal_append_format}++; } } # NOTE: Checking calendar interval formats &log( "\tChecking calendar interval formats." ); my $calIntervalFormatRes = $el_container->findnodes( './intervalFormats' ); if( $calIntervalFormatRes->size ) { my $el_int = $calIntervalFormatRes->shift; my $calIntervalFormatHasAliasRes = $el_int->findnodes( './alias[@path]' ); if( $calIntervalFormatHasAliasRes->size ) { $el_int = resolve_alias( $calIntervalFormatHasAliasRes ) || die( "Calendar ${cal_id} interval format is aliased, but I cannot resolve it for locale ${locale} in file ${f} for this element: ", $el_container->toString() ); } my $calIntervalFormatItemsRes = $el_int->findnodes( './intervalFormatItem' ); # while( my $el_item = $calIntervalFormatItemsRes->shift ) { my $int_id = $el_item->getAttribute( 'id' ) || die( "Unable to get the interval ID value from the attribute 'id' in this element: ", $el_item->toString() ); # h B – h B my $calDiffFormatRes = $el_item->findnodes( './greatestDifference' ); while( my $el_diff = $calDiffFormatRes->shift ) { my $def = { locale => $locale, calendar => $cal_id, format_id => $int_id, greatest_diff_id => $el_diff->getAttribute( 'id' ), format_pattern => $el_diff->textContent, }; if( $opts->{apply_patch} && exists( $patch->{ $cldr_version } ) && ref( $patch->{ $cldr_version } ) eq 'HASH' && exists( $patch->{ $cldr_version }->{calendar_interval_formats} ) && ref( $patch->{ $cldr_version }->{calendar_interval_formats} ) eq 'HASH' && exists( $patch->{ $cldr_version }->{calendar_interval_formats}->{ $locale } ) && ref( $patch->{ $cldr_version }->{calendar_interval_formats}->{ $locale } ) eq 'HASH' && exists( $patch->{ $cldr_version }->{calendar_interval_formats}->{ $locale }->{ $def->{format_id} } ) && ref( $patch->{ $cldr_version }->{calendar_interval_formats}->{ $locale }->{ $def->{format_id} } ) eq 'HASH' && exists( $patch->{ $cldr_version }->{calendar_interval_formats}->{ $locale }->{ $def->{format_id} }->{ $def->{greatest_diff_id} } ) ) { warn( "Warning only: Datetime interval with format ID '$def->{format_id}' and greatest difference ID '$def->{greatest_diff_id}' has a patch (", $patch->{ $cldr_version }->{calendar_interval_formats}->{ $locale }->{ $def->{format_id} }->{ $def->{greatest_diff_id} }, "), applying it instead of the default pattern (", $def->{format_pattern}, ")" ); $def->{format_pattern} = $patch->{ $cldr_version }->{calendar_interval_formats}->{ $locale }->{ $def->{format_id} }->{ $def->{greatest_diff_id} }; } foreach my $prop ( qw( greatest_diff_id format_pattern ) ) { if( !defined( $def->{ $prop } ) || !length( $def->{ $prop } // '' ) ) { die( "No pattern found for this append item format with id '$def->{format_id}' for calendar '${cal_id}' and locale '${locale}': ", $el_diff->toString() ); } } if( $el_diff->hasAttribute( 'alt' ) ) { $def->{alt} = $el_diff->getAttribute( 'alt' ); } my( $p1, $sep, $p2, $repeating_field ) = find_interval_repeating_field({ pattern => $def->{format_pattern}, greatest_diff => $def->{greatest_diff_id}, }); unless( defined( $p1 ) ) { warn( "Warning only: failed to find the repeating field for pattern '$def->{format_pattern}', with locale '${locale}' and format ID '$def->{format_id}' from file ${f}: ", dump( $ref ) ); next; } if( "${p1}${sep}${p2}" ne $def->{format_pattern} ) { die( "Reconstructed string '${p1}${sep}${p2}' does not match original string '$def->{format_pattern}' for locale '${locale}' in file ${f} for element: ", $el_diff->toString() ); } @$def{qw( part1 separator part2 repeating_field )} = ( $p1, $sep, $p2, $repeating_field ); eval { $sth_inter_fmt->execute( @$def{qw( locale calendar format_id greatest_diff_id format_pattern alt part1 separator part2 repeating_field )} ); } || die( "Error executing query to add calendar interval format '", ( $def->{format_pattern} // 'undef' ), "' with id '$def->{format_id}' and greatest difference ID '", ( $def->{greatest_diff_id} // 'undef' ), "' for locale '${locale}' and for calendar '${cal_id}' from file ${f}: ", ( $@ || $sth_inter_fmt->errstr ), "\nwith query: ", $sth_inter_fmt->{Statement}, "\n", dump( $def ) ); $added->{cal_interval_format}++; } } my $defIntervalFormatRes = $el_int->findnodes( './intervalFormatFallback' ); # {0} – {1} if( $defIntervalFormatRes->size ) { my $el_interval_default_fmt = $defIntervalFormatRes->shift || die( "No default interval format element found for calendar '${cal_id}' and locale '${locale}' in file ${f}" ); my $def = { locale => $locale, calendar => $cal_id, format_id => 'default', greatest_diff_id => 'default', format_pattern => $el_interval_default_fmt->textContent, }; if( $el_interval_default_fmt->hasAttribute( 'alt' ) ) { $def->{alt} = $el_interval_default_fmt->getAttribute( 'alt' ); } my( $p1, $sep, $p2, $repeating_field ) = find_interval_repeating_field({ pattern => $def->{format_pattern}, greatest_diff => $def->{greatest_diff_id}, }); unless( defined( $p1 ) ) { warn( "Warning only: failed to find the repeating field for pattern '$def->{format_pattern}', with locale '${locale}' and format ID '$def->{format_id}' from file ${f}: ", dump( $ref ) ); next; } if( "${p1}${sep}${p2}" ne $def->{format_pattern} ) { die( "Reconstructed string '${p1}${sep}${p2}' does not match original string '$def->{format_pattern}' for locale '${locale}' in file ${f} for element: ", $el_interval_default_fmt->toString() ); } @$def{qw( part1 separator part2 repeating_field )} = ( $p1, $sep, $p2, $repeating_field ); eval { $sth_inter_fmt->execute( @$def{qw( locale calendar format_id greatest_diff_id format_pattern alt part1 separator part2 repeating_field )} ); } || die( "Error executing query to add calendar default interval format '", ( $def->{format_pattern} // 'undef' ), "' with id '$def->{format_id}' and greatest difference ID '", ( $def->{greatest_diff_id} // 'undef' ), "' for locale '${locale}' and for calendar '${cal_id}' from file ${f}: ", ( $@ || $sth_inter_fmt->errstr ), "\nwith query: ", $sth_inter_fmt->{Statement}, "\n", dump( $def ) ); $added->{cal_interval_format}++; } } } # Done with DateTime formats # NOTE: Checking calendar cyclic name sets &log( "\tChecking calendar cyclic name sets." ); my $calCyclicContainerRes = $el->findnodes( './cyclicNameSets' ); if( $calCyclicContainerRes->size ) { my $el_container = $calCyclicContainerRes->shift; my $calCyclicContainerHasAliasRes = $el_container->findnodes( './alias[@path]' ); if( $calCyclicContainerHasAliasRes->size ) { $el_container = resolve_alias( $calCyclicContainerHasAliasRes ) || die( "The calendar ${cal_id} cyclic container is aliased, but I could not resolve it for locale ${locale} in file ${f}" ); } my $calCyclicNameSetRes = $el_container->findnodes( './cyclicNameSet' ); while( my $el_cyclic = $calCyclicNameSetRes->shift ) { my $set = $el_cyclic->getAttribute( 'type' ) || die( "Unable to get the calendar cyclic set type value from the attribute 'type' in this element: ", $el_cyclic->toString() ); my $calCyclicNameSetHasAliasRes = $el_cyclic->findnodes( './alias[@path]' ); if( $calCyclicNameSetHasAliasRes->size ) { $el_cyclic = resolve_alias( $calCyclicNameSetHasAliasRes ) || die( "Calendar ${cal_id} cyclic name set is aliased, but I could not resolve it for locale ${locale} in file ${f} for this element: ", $el_container->toString() ); } # my $calCyclicContextRes = $el_cyclic->findnodes( './cyclicNameContext' ); while( my $el_ctx = $calCyclicContextRes->shift ) { my $context = $el_ctx->getAttribute( 'type' ) || die( "Unable to get the calendar cyclic set context value from the attribute 'type' in this element: ", $el_ctx->toString() ); my $calCyclicContextHasAliasRes = $el_ctx->findnodes( './alias[@path]' ); if( $calCyclicContextHasAliasRes->size ) { $el_ctx = resolve_alias( $calCyclicContextHasAliasRes ) || die( "Calendar ${cal_id} cyclic context with set ${set} and context ${context} is aliased, but I could not resolve it for locale ${locale} in file ${f} for this element: ", $el_cyclic->toString() ); } my $calCyclicLengthRes = $el_ctx->findnodes( './cyclicNameWidth' ); # while( my $el_len = $calCyclicLengthRes->shift ) { my $len = $el_len->getAttribute( 'type' ) || die( "Unable to get the calendar cyclic set length value from the attribute 'type' in this element: ", $el_len->toString() ); my $calCyclicWidthHasAliasRes = $el_len->findnodes( './alias[@path]' ); if( $calCyclicWidthHasAliasRes->size ) { $el_len = resolve_alias( $calCyclicWidthHasAliasRes ) || die( "Calendar ${cal_id} cyclic length ${len} with set ${set} and context ${context} is aliased, but I could not resolve it for locale ${locale} in file ${f} for this element: ", $el_cyclic->toString() ); } my $calCyclicNamesRes = $el_len->findnodes( './cyclicName' ); # Tiger while( my $el_name = $calCyclicNamesRes->shift ) { my $def = { locale => $locale, calendar => $cal_id, format_set => $set, format_type => $context, format_length => $len, format_id => $el_name->getAttribute( 'type' ), format_pattern => $el_name->textContent, }; eval { $sth_cyclic->execute( @$def{qw( locale calendar format_set format_type format_length format_id format_pattern )} ); } || die( "Error executing query to add calendar cyclick set '${set}' with type '${context}', length '${len}', id '", ( $def->{format_id} // 'undef' ), "' and pattern '", ( $def->{format_pattern} // 'undef' ), "' for locale '${locale}' and for calendar '${cal_id}' from file ${f}: ", ( $@ || $sth_cyclic->errstr ), "\nwith query: ", $sth_cyclic->{Statement}, "\n", dump( $def ) ); $added->{cal_cyclic}++; } } } } } else { $out->print( "\tno cyclic data for this calendar ${cal_id} for locale ${locale}\n" ) if( $DEBUG ); } } # End looping through all calendar systems # NOTE: Checking locale date fields &log( "\tChecking locale date fields." ); my $calDateFieldsRes = $el_dates->findnodes( './fields' ); if( $calDateFieldsRes->size ) { &log( sprintf( "\t%d locale date fields found.", $calDateFieldsRes->size ) ); my $el_fields = $calDateFieldsRes->shift; my $calDateFieldRes = $el_fields->findnodes( './field[@type]' ); while( my $el_field = $calDateFieldRes->shift ) { my $type = $el_field->getAttribute( 'type' ) || die( "Unable to get the field type from attribute 'type' for locale ${locale} in file ${f} for this element: ", $el_fields->toString() ); my( $field_type, $field_length ) = split( /[^a-zA-Z0-9]/, $type, 2 ); $field_length //= 'standard'; my $calDateFieldHasAliasRes = $el_field->findnodes( './alias[@path]' ); if( $calDateFieldHasAliasRes->size ) { $el_field = resolve_alias( $calDateFieldHasAliasRes ) || die( "This date field of type ${type} is aliased, but I could not resolve it for locale ${locale} in file ${f}." ); } my $displayNameRes = $el_field->findnodes( './displayName' ); my $display_name; if( !$displayNameRes->size ) { warn( "Warning only: missing display name for this field of type '${type}' for locale '${locale}' in file ${f}" ); } else { $display_name = trim( $displayNameRes->shift->textContent ); my $def = { locale => $locale, term_type => $field_type, term_length => $field_length, display_name => $display_name, }; eval { $sth_date_term->execute( @$def{qw( locale term_type term_length display_name )} ); } || die( "Error executing query to add date term for locale ${locale} and term type $def->{term_type} and term length $def->{term_length} from file ${f}: ", ( $@ || $sth_date_term->errstr ), "\nwith query: ", $sth_date_term->{Statement}, "\n", dump( $def ) ); $added->{date_term}++; } my $calDateFieldItemsRes = $el_field->findnodes( './relative[@type]' ); while( my $el_item = $calDateFieldItemsRes->shift ) { my $def = { locale => $locale, field_type => $field_type, field_length => $field_length, relative => $el_item->getAttribute( 'type' ), locale_name => $el_item->textContent, }; eval { $sth_field->execute( @$def{qw( locale field_type field_length relative locale_name )} ); } || die( "Error executing query to add date field for locale ${locale} and field type ${field_type} and field length ${field_length} from file ${f}: ", ( $@ || $sth_field->errstr ), "\nwith query: ", $sth_field->{Statement}, "\n", dump( $def ) ); $added->{cal_field}++; } # And now, we process the 'relativeTime' elements, such as: # # in {0} year # in {0} years # # # {0} year ago # {0} years ago # my $calDateTimeRelItemsRes = $el_field->findnodes( './relativeTime[@type]' ); while( my $el_time_rel = $calDateTimeRelItemsRes->shift ) { # This can either be 'future' or 'past', which we translate to 1 or -1 my $time_relative_position = $el_time_rel->getAttribute( 'type' ) // ''; my $calDateTimeRelHasAliasRes = $el_time_rel->findnodes( './alias[@path]' ); if( $calDateTimeRelHasAliasRes->size ) { $el_time_rel = resolve_alias( $calDateTimeRelHasAliasRes ) || die( "This date time relative of type ${type} is aliased, but I could not resolve it for locale ${locale} in file ${f}." ); } my $calDateTimeRelPatternItemsRes = $el_time_rel->findnodes( './relativeTimePattern' ); while( my $el_time_rel_pat = $calDateTimeRelPatternItemsRes->shift ) { my $def = { locale => $locale, field_type => $field_type, field_length => $field_length, relative => ( $time_relative_position eq 'future' ? 1 : $time_relative_position eq 'past' ? -1 : undef ), pattern => $el_time_rel_pat->textContent, }; if( $el_time_rel_pat->hasAttribute( 'count' ) ) { $def->{count} = $el_time_rel_pat->getAttribute( 'count' ); } eval { $sth_time_rel->execute( @$def{qw( locale field_type field_length relative pattern count )} ); } || die( "Error executing query to add date field for locale ${locale} and field type ${field_type} and field length ${field_length} from file ${f}: ", ( $@ || $sth_time_rel->errstr ), "\nwith query: ", $sth_time_rel->{Statement}, "\n", dump( $def ) ); $added->{cal_time_rel}++; } } } } else { $out->print( "\tno localised date fields for locale ${locale} in file ${f}\n" ) if( $DEBUG ); } my $tzNamesContainerRes = $el_dates->findnodes( './timeZoneNames' ); if( $tzNamesContainerRes->size ) { my $el = $tzNamesContainerRes->shift; my $tzNamesAliasRes = $el->findnodes( './alias[@path]' ); # Example: if( $tzNamesAliasRes->size ) { # XXX Remove this die( "I found an alias for the locale timezone/metazones for locale '${locale}' in file ${f}" ); $el = resolve_alias( $tzNamesAliasRes ) || die( "This timezones and metazones names is aliased, but I could not resolve it for locale ${locale} in file ${f} for this element." ); } # NOTE: Checking locale time zone formats &log( "\tChecking locale time zone formats." ); # # {0} Daylight Time my $tzFormatsRes = $el->findnodes( './*[local-name()="hourFormat" or local-name()="gmtFormat" or local-name()="gmtZeroFormat" or local-name()="regionFormat" or local-name()="fallbackFormat"]' ); if( $tzFormatsRes->size ) { $out->printf( "\t\tProcessing %d time zone formats for locale '${locale}'\n", $tzFormatsRes->size ) if( $DEBUG ); my $tz_fmt_map = { hourFormat => 'hour', gmtFormat => 'gmt', gmtZeroFormat => 'gmt_zero', regionFormat => 'region', fallbackFormat => 'fallback', }; my $c = 0; while( my $el_tz_fmt = $tzFormatsRes->shift ) { my $tag = $el_tz_fmt->nodeName; $out->print( "\t\tAdding time zone format of type '${tag}': " ) if( $DEBUG ); if( !exists( $tz_fmt_map->{ $tag } ) ) { die( "Tag \"${tag}\" is not in our internal type map." ); } my $def = { locale => $locale, type => $tz_fmt_map->{ $tag }, format_pattern => trim( $el_tz_fmt->textContent ), }; if( $el_tz_fmt->hasAttribute( 'type' ) ) { $def->{subtype} = $el_tz_fmt->getAttribute( 'type' ); } eval { $sth_tz_formats->execute( @$def{qw( locale type subtype format_pattern )} ); } || die( "Error executing query to add timezone format of type '$def->{type}' for locale '${locale}' from file ${f}: ", ( $@ || $sth_tz_formats->errstr ), "\nwith query: ", $sth_tz_formats->{Statement}, "\n", dump( $def ) ); $c++; $out->print( "ok\n" ); } $out->print( "\t\t${c} time zone format(s) added.\n" ) if( $DEBUG ); } else { $out->print( "\t\tthe locale ${locale} has no time zone formats set.\n" ) if( $DEBUG ); } # NOTE: Checking locale time zone sample cities &log( "\tChecking locale time zone sample cities for locale ${locale}." ); my $TimeZonesRes = $el->findnodes( './zone[@type]' ); my $tz_tags_map = { exemplarCity => 'city', long => 'long', short => 'short', }; if( $TimeZonesRes->size ) { &log( sprintf( "\t\t%d locale time zone sample cities found for locale ${locale}.", $TimeZonesRes->size ) ); while( my $el_tz = $TimeZonesRes->shift ) { my $timezone = $el_tz->getAttribute( 'type' ) || die( "No timezone ID value defined for this element: ", $el->toString() ); $out->print( "\t\t\t[${timezone}]\n" ) if( $DEBUG ); my @kids = $el_tz->nonBlankChildNodes; $out->printf( "\t\t\t\t%d children nodes found: %s\n", scalar( @kids ), join( ', ', map{ $_->nodeName } @kids ) ) if( $DEBUG ); foreach my $el_kid ( @kids ) { my $tag = $el_kid->nodeName; if( !exists( $tz_tags_map->{ $tag } ) ) { die( "Found tag ${tag} as child of this time zones list, but it is unknown to us for time zone '${timezone}' for locale ${locale} in file ${f} for this element: ", $el_kid->toString() ); } my $prop = $tz_tags_map->{ $tag } || die( "Unable to find an equivalence in our timezone map for the tag ${tag}: ", $el->toString() ); if( $prop eq 'city' ) { my $def = { locale => $locale, timezone => $timezone, city => trim( $el_kid->textContent ), }; if( $el_kid->hasAttribute( 'alt' ) ) { $def->{alt} = $el_kid->getAttribute( 'alt' ); } $out->print( "\t\t\t\tFound sample city '$def->{city}' for timezone '${timezone}' for locale ${locale}\n" ) if( $DEBUG ); eval { $sth_timezone_city->execute( @$def{qw( locale timezone city alt )} ); } || die( "Error executing query to add timezone $def->{timezone} sample city '$def->{city}' for locale '${locale}' from file ${f}: ", ( $@ || $sth_timezone_city->errstr ), "\nwith query: ", $sth_timezone_city->{Statement}, "\n", dump( $def ) ); $added->{timezones_cities}++; } elsif( $prop eq 'short' || $prop eq 'long' ) { my $def = { locale => $locale, timezone => $timezone, width => $prop, }; my @tz_name_kids = $el_kid->nonBlankChildNodes; $out->printf( "\t\t\t\tFound ${prop} timezone name definition for timezone '${timezone}' for locale ${locale} with %d children\n", scalar( @tz_name_kids ) ) if( $DEBUG ); if( !scalar( @tz_name_kids ) ) { die( "Locale '${locale}' has the time zone '${timezone}' set with time zone name of width '$def->{width}', but no data could be found in file ${f} for this element: ", $el_tz->toString() ); } foreach my $el_tz_kid ( @tz_name_kids ) { my $name_type = $el_tz_kid->nodeName; my $name_value = trim( $el_tz_kid->textContent ); $def->{ $name_type } = $name_value; } eval { $sth_tz_names->execute( @$def{qw( locale timezone width generic standard daylight )} ); } || die( "Error executing query to add timezone $def->{timezone} locale names for locale '${locale}' from file ${f}: ", ( $@ || $sth_tz_names->errstr ), "\nwith query: ", $sth_tz_names->{Statement}, "\n", dump( $def ) ); } } } } else { $out->print( "\tno localised time zone sample cities for locale ${locale} in file ${f}\n" ) if( $DEBUG ); } # NOTE: Checking for locale metazones &log( "\tChecking for locale metazones." ); my $MetazonesRes = $el_dates->findnodes( './timeZoneNames/metazone[@type]' ); if( $MetazonesRes->size ) { &log( sprintf( "\t%d locale metazone found.", $MetazonesRes->size ) ); while( my $el_metatz = $MetazonesRes->shift ) { my $metazone = $el_metatz->getAttribute( 'type' ) || die( "No value found for metazone attribute 'type' for this element: ", $el->toString() ); my $MetaTzNamesRes = $el_metatz->findnodes( './*[local-name()="long" or local-name()="short"]' ); $out->printf( "\t\tfound %d metazone long/short localised name(s) for metazone '${metazone}'\n", $MetaTzNamesRes->size ) if( $DEBUG ); while( my $el_tz_width = $MetaTzNamesRes->shift ) { # 'long' or 'short' my $tz_name_width = $el_tz_width->nodeName; my $def = { locale => $locale, metazone => $metazone, width => $tz_name_width, }; my $tzNamesTypesRes = $el_tz_width->findnodes( './*[local-name()="generic" or local-name()="standard" or local-name()="daylight"]' ); if( !$tzNamesTypesRes->size ) { die( "Locale '${locale}' has the metazone '${metazone}' set with metazone name of width '${tz_name_width}', but no data could be found in file ${f} for this element: ", $el->toString() ); } while( my $el_tz_name = $tzNamesTypesRes->shift ) { my $name_type = $el_tz_name->nodeName; my $name_value = trim( $el_tz_name->textContent ); $def->{ $name_type } = $name_value; } eval { $sth_metatz_names->execute( @$def{qw( locale metazone width generic standard daylight )} ); } || die( "Error executing query to add timezone $def->{timezone} locale names for locale '${locale}' from file ${f}: ", ( $@ || $sth_metatz_names->errstr ), "\nwith query: ", $sth_metatz_names->{Statement}, "\n", dump( $def ) ); } } } else { $out->print( "\t\tno localised metazone found for locale ${locale} in file ${f}\n" ) if( $DEBUG ); } } else { $out->print( "\t\tno localised timezones and metazones for locale ${locale} in file ${f}\n" ) if( $DEBUG ); } } $out->printf( "\tok, added %d locales, %d scripts, %d territories, %d variants, %d currencies, %d calendar terms, %d eras, %d date or time formats, %d datetime formats, %d available formats, %d appended formats, %d interval formats, %d cyclic, %d fields\n", @$added{qw( languages scripts territories variants currencies cal_terms cal_era cal_date_or_time_format cal_datetime_format cal_available_format cal_append_format cal_interval_format cal_cyclic cal_field )} ) if( $DEBUG ); # NOTE: Checking for layout orientation (left-to-right) &log( "\tChecking for layout orientation (left-to-right)." ); my $layoutLTRRes = $mainDoc->findnodes( '//layout/orientation/characterOrder' ); if( $layoutLTRRes->size ) { # right-to-left my $ltr = trim( $layoutLTRRes->shift->textContent ); if( !defined( $ltr ) || !length( $ltr // '' ) ) { die( "Unable to get the value for the layout orientation for the locale ${locale} in file ${f} with xpath //layout/orientation/characterOrder" ); } eval { $sth_locale_info->execute( $locale, 'char_orientation', $ltr ); } || die( "Error executing query to add locale information for layout orientation (char_orientation) and value '${ltr}': ", ( $@ || $sth_locale_info->errstr ), "\nwith query: ", $sth_locale_info->{Statement} ); $out->printf( "\t%d element added.\n", $sth_locale_info->rows ) if( $DEBUG ); } else { $out->print( "\tNo layout orientation found.\n" ) if( $DEBUG ); } # NOTE: Checking for quotation marks &log( "\tChecking for quotation marks." ); # Example: # # # # # # my $quotationsRes = $mainDoc->findnodes( '//delimiters/*[local-name()="quotationStart" or local-name()="quotationEnd" or local-name()="alternateQuotationStart" or local-name()="alternateQuotationEnd"]' ); my $quotation_map = { quotationStart => 'quotation_start', quotationEnd => 'quotation_end', alternateQuotationStart => 'quotation2_start', alternateQuotationEnd => 'quotation2_end', }; my $j = 0; while( my $el = $quotationsRes->shift ) { my $tag = $el->nodeName; if( !exists( $quotation_map->{ $tag } ) ) { die( "Quotation tag found (${tag}) for locale '${locale}' does not exist in our internal property map in file ${f} for this element: ", $el->toString() ); } my $val = $el->textContent; eval { $sth_locale_info->execute( $locale, $quotation_map->{ $tag }, $val ); } || die( "Error executing query to add locale information for quotation mark (${tag} -> ", $quotation_map->{ $tag }, ") for locale ${locale} in file ${f}: ", ( $@ || $sth_locale_info->errstr ), "\nwith query: ", $sth_locale_info->{Statement} ); $j++; } $out->printf( "\t%d quotation mark information added.\n", $j ) if( $DEBUG ); # NOTE: Checking for POSIX yes/no string &log( "\tChecking for POSIX yes/no string." ); # Example: # # # はい:y # いいえ:n # # my $yesNoRes = $mainDoc->findnodes( '//posix/messages/*[local-name()="yesstr" or local-name()="nostr"]' ); my $yes_no_map = { yesstr => 'yes', nostr => 'no', }; $j = 0; while( my $el = $yesNoRes->shift ) { my $tag = $el->nodeName; if( !exists( $yes_no_map->{ $tag } ) ) { die( "Yes/No string tag found (${tag}) for locale '${locale}' does not exist in our internal property map in file ${f} for this element: ", $el->toString() ); } my $val = $el->textContent; if( !defined( $val ) || !length( $val // '' ) ) { die( "Found a yes/no string value, but its content is empty for locale '${locale}' in file ${f} for this element: ", $el->toString() ); } elsif( index( $val, ':' ) == -1 ) { warn( "Warning only: found a yes/no string value, but its content is malformed. I could not find a ':' separator for locale '${locale}' in file ${f} for this element: ", $el->toString() ); } $val = [split( ':', $val )]->[0]; if( !length( $val // '' ) ) { die( "Found a yes/no string value, but its content after spliting it is empty for locale '${locale}' in file ${f} for this element: ", $el->toString() ); } eval { $sth_locale_info->execute( $locale, $yes_no_map->{ $tag }, $val ); } || die( "Error executing query to add locale information for yes/no string (${tag} -> ", $yes_no_map->{ $tag }, "): ", ( $@ || $sth_locale_info->errstr ), "\nwith query: ", $sth_locale_info->{Statement} ); $j++; } $out->printf( "\t%d yes/no string information added.\n", $j ) if( $DEBUG ); # NOTE: Adding locale number system properties (punctuation, percent, group, etc) &log( "\tAdding locale number system symbols (punctuation, percent, group, etc)." ); my $localeNumberSymbolRes = $mainDoc->findnodes( '/ldml/numbers/symbols[@numberSystem]' ); $j = 0; if( $localeNumberSymbolRes->size ) { $sth = $sths->{number_symbols_l10n} || die( "No statement object set for table 'number_symbols_l10n'." ); # Example: # # . # , # ; # % # + # - # ~ # E # × # # # NaN # : # my $symbols_map = { approximatelySign => 'approximately', currencyDecimal => 'currency_decimal', currencyGroup => 'currency_group', decimal => 'decimal', exponential => 'exponential', group => 'group', infinity => 'infinity', list => 'list', minusSign => 'minus', nan => 'nan', nativeZeroDigit => 'native_zero_digit', patternDigit => 'pattern_digit', percentSign => 'percent', perMille => 'per_mille', plusSign => 'plus', special => 'special', superscriptingExponent => 'superscript', timeSeparator => 'time_separator', }; my $symbols_data = {}; while( my $el = $localeNumberSymbolRes->shift ) { my $sys_id = $el->getAttribute( 'numberSystem' ) || die( "Unable to get the number system ID for this symbol in attribute 'numberSystem' for locale ${locale} in file ${f} for this element: ", $el->toString() ); my $numSymbolAliasRes = $el->findnodes( './alias[@path]' ); # Example: if( $numSymbolAliasRes->size ) { $el = resolve_alias( $numSymbolAliasRes ) || die( "This number symbol with number system '${sys_id}' is aliased, but I could not resolve it for locale ${locale} in file ${f} for this element." ); } if( !exists( $symbols_data->{ $sys_id } ) ) { $symbols_data->{ $sys_id } = {} } else { die( "Symbols for number system '${sys_id}' is being redefined for number system ${sys_id} and for locale ${locale} in file ${f} for this element: ", $el->toString() ); } my @kids = $el->nonBlankChildNodes; foreach my $el_kid ( @kids ) { my $tag = $el_kid->nodeName; if( !exists( $symbols_map->{ $tag } ) ) { die( "Found tag ${tag} as child of this symbols list, but it is unknown to us for number system ${sys_id} and for locale ${locale} in file ${f} for this element: ", $el->toString() ); } my $prop = $symbols_map->{ $tag }; my $val = trim( $el_kid->textContent ); my $def = { locale => $locale, number_system => $sys_id, property => $prop, value => $val, }; if( $el_kid->hasAttribute( 'alt' ) ) { $def->{alt} = $el_kid->getAttribute( 'alt' ); } my $prop_key = join( ';', map( $_ // '', @$def{qw( property alt )} ) ); if( exists( $symbols_data->{ $sys_id }->{ $prop_key } ) ) { die( "Symbol property ${tag} ('${prop}') is being redefined for number system ${sys_id} and locale ${locale} in file ${f} for this element: ", $el->toString() ); } $symbols_data->{ $sys_id }->{ $prop_key } = $val; eval { $sth->execute( @$def{qw( locale number_system property value alt )} ); } || die( "Error executing query to add locale numbering system symbol ${prop} for locale ${locale} in file ${f}: ", ( $@ || $sth->errstr ), "\nwith query: ", $sth->{Statement}, "\nwith data: ", dump( $def ) ); $j++; } } $out->printf( "\t%d locale symbols added.\n", $j ) if( $DEBUG ); } else { &log( "\tNo numbering system symbols for locale ${locale} in file ${f}" ); } # NOTE: Adding formats for decimal, scientific, percent, currency and miscellaneous &log( "\tAdding formats for decimal, scientific, percent, currency and miscellaneous." ); $j = 0; my $number_format_map = { currencyFormats => { xpath_container => './currencyFormats', xpath_len => './currencyFormatLength', xpath_fmt => './currencyFormat', xpath_pat => './pattern', type => 'currency', regexp => qr/currencyFormats\[\@numberSystem=["']([a-zA-Z0-9\_\-]+)["']\]/, }, decimalFormats => { xpath_container => './decimalFormats', xpath_len => './decimalFormatLength', xpath_fmt => './decimalFormat', xpath_pat => './pattern', type => 'decimal', regexp => qr/decimalFormats\[\@numberSystem=["']([a-zA-Z0-9\_\-]+)["']\]/, }, miscPatterns => { xpath_container => './miscPatterns', xpath_pat => './pattern', type => 'misc', regexp => qr/miscPatterns\[\@numberSystem=["']([a-zA-Z0-9\_\-]+)["']\]/, }, percentFormats => { xpath_container => './percentFormats', xpath_len => './percentFormatLength', xpath_fmt => './percentFormat', xpath_pat => './pattern', type => 'percent', regexp => qr/percentFormats\[\@numberSystem=["']([a-zA-Z0-9\_\-]+)["']\]/, }, scientificFormats => { xpath_container => './scientificFormats', xpath_len => './scientificFormatLength', xpath_fmt => './scientificFormat', xpath_pat => './pattern', type => 'scientific', regexp => qr/scientificFormats\[\@numberSystem=["']([a-zA-Z0-9\_\-]+)["']\]/, }, }; $sth = $sths->{number_formats_l10n} || die( "No statement object set for table 'number_formats_l10n'." ); my $numbersRes = $mainDoc->findnodes( '/ldml/numbers' ); if( $numbersRes->size ) { my $el = $numbersRes->shift; my( $default_num_system, $other_num_system ); # NOTE: Checking for locale default and other numbering systems &log( "Checking for locale default and other numbering systems." ); # More than one may be defined, but we use only the first one my $defNumberingSysRes = $el->findnodes( './defaultNumberingSystem' ); if( $defNumberingSysRes->size ) { my $el_def_num_sys = $defNumberingSysRes->shift; $default_num_system = trim( $el_def_num_sys->textContent ); if( !defined( $default_num_system ) || !length( $default_num_system // '' ) ) { die( "A default numbering system ID has been declared with tag 'defaultNumberingSystem', but is actually empty for locale ${locale} in file ${f}" ); } } my $otherNumberingSysRes = $el->findnodes( './otherNumberingSystems' ); if( $otherNumberingSysRes->size ) { my $el_other_num_sys = $otherNumberingSysRes->shift; my $otherNumberingSysHasAliasRes = $el_other_num_sys->findnodes( './alias[@path]' ); if( $otherNumberingSysHasAliasRes->size ) { $el_other_num_sys = resolve_alias( $otherNumberingSysHasAliasRes ) || die( "Unable to resolve alias for locale other number system for locale ${locale} in file ${f}" ); } my @other_num_sys = $el_other_num_sys->nonBlankChildNodes; # # jpan # jpanfin # foreach my $el_other_num_sys ( @other_num_sys ) { my $num_sys_name = $el_other_num_sys->nodeName; if( $num_sys_name ne 'native' && $num_sys_name ne 'traditional' && $num_sys_name ne 'finance' ) { die( "Unknown other numbering system '${num_sys_name}' declared in locale '${locale}' in file ${f} for this element: ", $el_other_num_sys->toString() ); } $other_num_system //= {}; $other_num_system->{ $num_sys_name } = trim( $el_other_num_sys->textContent ); if( !length( $other_num_system->{ $num_sys_name } // '' ) ) { die( "Other numbering system ID '${num_sys_name}' has been declared, but is actually empty for locale ${locale} in file ${f} for this element: ", $el_other_num_sys->toString() ); } } if( defined( $default_num_system ) || defined( $other_num_system ) ) { my $def = { locale => $locale, number_system => $default_num_system, native => $other_num_system->{native}, traditional => $other_num_system->{traditional}, finance => $other_num_system->{finance}, }; eval { $sth_locale_num_sys->execute( @$def{qw( locale number_system native traditional finance )} ); } || die( "Error executing SQL query to add locale's numbering systems used for locale ${locale} in file ${f}: ", ( $@ || $sth_locale_num_sys->errstr ), "\nSQL Query: ", $sth_locale_num_sys->{Statement}, "\n", dump( $def ) ); } } elsif( defined( $default_num_system ) ) { my $def = { locale => $locale, number_system => $default_num_system, }; eval { $sth_locale_num_sys->execute( @$def{qw( locale number_system native traditional finance )} ); } || die( "Error executing SQL query to add locale's numbering systems used for locale ${locale} in file ${f}: ", ( $@ || $sth_locale_num_sys->errstr ), "\nSQL Query: ", $sth_locale_num_sys->{Statement}, "\n", dump( $def ) ); } foreach my $n_type ( sort( keys( %$number_format_map ) ) ) { $j = 0; my $this = $number_format_map->{ $n_type }; my $numberContainerRes = $el->findnodes( $this->{xpath_container} ); my $type = $this->{type}; while( my $el_container = $numberContainerRes->shift ) { # my $sys_id; if( $el_container->hasAttribute( 'numberSystem' ) ) { $sys_id = $el_container->getAttribute( 'numberSystem' ) || die( "Unable to get the numbering system value from the attribute 'numberSystem' for number format of type ${n_type} (${type}) for locale ${locale} in file ${f} for this element: ", $el_container->toString() ); } elsif( defined( $default_num_system ) ) { my $isDeuplicateRes = $el->findnodes( $this->{xpath_container} . '[@numberSystem="' . $default_num_system . '"]' ); if( $isDeuplicateRes->size ) { &log( "The number format of type ${type} has no 'numberSystem' attribute set, and the default 'numberSystem' value '${default_num_system}' exists already, so we skip it to avoid creating a duplicate in the database for locale ${locale} in file ${f}" ); next; } else { $sys_id = $default_num_system; } } elsif( defined( $other_num_system ) && $other_num_system->{native} ) { my $isDeuplicateRes = $el->findnodes( $this->{xpath_container} . '[@numberSystem="' . $other_num_system->{native} . '"]' ); if( $isDeuplicateRes->size ) { &log( "The number format of type ${type} has no 'numberSystem' attribute set, and the default 'numberSystem' value '$other_num_system->{native}' exists already, so we skip it to avoid creating a duplicate in the database for locale ${locale} in file ${f}" ); next; } else { $sys_id = $other_num_system->{native}; } } else { warn( "Warning only: no attribute 'numberSystem' found for this number format of type ${n_type} (${type}), and no default (defaultNumberingSystem) or other (otherNumberingSystems) number system declared for locale ${locale} in file ${f} for this element'. Skipping: ", $el_container->toString() ); next; } my $numFormatAliasRes = $el_container->findnodes( './alias[@path]' ); # Example: if( $numFormatAliasRes->size ) { $el_container = resolve_alias( $numFormatAliasRes ) || die( "This number format of type ${n_type} (${type}) is aliased, but could not resolve it for number system ${sys_id} and for locale ${locale} in file ${f} for this element: ", $el->toString() ); } my $format_data = []; # # # # #,##0.### # # # # # # # # 0K # 00K # 000K # 0M # 00M # 000M # 0G # 00G # 000G # 0T # 00T # 000T # # # # # # # # # # #,##0.00 ¤ # #,##0.00 # # # # # # my $process_pattern = sub { my( $el_pat, $def ) = @_; $def->{format_id} = 'default'; if( $el_pat->hasAttribute( 'type' ) ) { $def->{format_id} = $el_pat->getAttribute( 'type' ); if( !defined( $def->{format_id} ) || !length( $def->{format_id} // '' ) ) { die( "Unable to get the number format ID for number format of type ${n_type} (${type}) for this numbering system ID ${sys_id} for this locale ${locale} in file ${f} for this element: ", $el_pat->toString() ); } } if( $el_pat->hasAttribute( 'alt' ) ) { $def->{alt} = $el_pat->getAttribute( 'alt' ) || die( "Unable to get the number format pattern alt value from attribute 'alt' for this number format of type ${n_type} (${type}) for numbering system ID ${sys_id} for locale ${locale} in file ${f} for this element: ", $el_pat->toString() ); } else { $def->{alt} = undef; } if( $el_pat->hasAttribute( 'count' ) ) { $def->{count} = $el_pat->getAttribute( 'count' ) || die( "Unable to get the number format pattern count value from attribute 'count' for this number format of type ${n_type} (${type}) for numbering system ID ${sys_id} for locale ${locale} in file ${f} for this element: ", $el_pat->toString() ); } else { $def->{count} = undef; } $def->{format_pattern} = $el_pat->textContent; push( @$format_data, $def ); }; if( exists( $this->{xpath_len} ) && length( $this->{xpath_len} // '' ) ) { my $numFormatLengthRes = $el_container->findnodes( $this->{xpath_len} ); while( my $el_len = $numFormatLengthRes->shift ) { my $len = 'default'; if( $el_len->hasAttribute( 'type' ) ) { $len = $el_len->getAttribute( 'type' ) || die( "Unable to get the number format length type from attribute 'type' for this number format of type ${n_type} (${type}) for this locale ${locale} in file ${f} for this element: ", $el_len->toString() ); } my $numFormatLengthHasAliasRes = $el_len->findnodes( './alias[@path]' ); if( $numFormatLengthHasAliasRes->size ) { $el_len = resolve_alias( $numFormatLengthHasAliasRes ) || die( "This number format of type ${n_type} (${type}) for length ${len} is aliased, but I could not resolve it for this element: ", $el_container->toString() ); } my $numFormatRes = $el_len->findnodes( $this->{xpath_fmt} ) || die( "Unable to get any number format tag for number format of type ${n_type} (${type}) for this numbering system ID ${sys_id} for locale ${locale} in file ${f} for this element: ", $el_len->toString() ); while( my $el_fmt_actual = $numFormatRes->shift ) { my $fmt_type = 'default'; if( $el_fmt_actual->hasAttribute( 'type' ) ) { $fmt_type = $el_fmt_actual->getAttribute( 'type' ) || die( "Unable to get the number formatting type from attribute 'type' for this number format of type ${n_type} (${type}) for numbering system ID ${sys_id} for locale ${locale} in file ${f} for this element: ", $el_fmt_actual->toString() ); } my $numFormatPatternsRes = $el_fmt_actual->findnodes( $this->{xpath_pat} ); while( my $el_pat = $numFormatPatternsRes->shift ) { my $def = { locale => $locale, number_system => $sys_id, number_type => $type, format_length => $len, format_type => $fmt_type, }; $process_pattern->( $el_pat, $def ); } } } } # The number format patterns are directly defined under the numbering system ID, such as with miscellaneous else { my $numFormatPatternsRes = $el_container->findnodes( $this->{xpath_pat} ); while( my $el_pat = $numFormatPatternsRes->shift ) { my $def = { locale => $locale, number_system => $sys_id, number_type => $type, format_length => 'long', format_type => 'default', }; $process_pattern->( $el_pat, $def ); } } my $total = scalar( @$format_data ); &log( "\tLoading ${total} ${type} number format patterns for number system ${sys_id}." ); my $k = 0; foreach my $def ( @$format_data ) { eval { # We need to force DBD::SQLite to treat the format_id as a text and not as an integer, otherwise, the check constraint on format_id would fail on ID such as 10000000000000000000 for locale 'ja' my @keys = qw( locale number_system number_type format_length format_type format_id format_pattern alt count ); for( my $i = 0; $i < scalar( @keys ); $i++ ) { $sth->bind_param( $i + 1, $def->{ $keys[$i] }, SQL_VARCHAR ); } # $sth->execute( @$def{qw( locale number_system number_type format_length format_type format_id format_pattern alt count )} ); $sth->execute; } || die( "Error executing query to add locale numbering system pattern for locale ${locale} in file ${f}: ", ( $@ || $sth->errstr ), "\nwith query: ", $sth->{Statement}, "\n", dump( $def ) ); $j++; $k++; $out->print( "${k}/${total}\r" ) if( $DEBUG > 1 ); } $out->print( "\n" ) if( $DEBUG > 1 ); } $out->printf( "\t%d ${type} number format patterns added.\n", $j ) if( $DEBUG ); } } else { $out->print( "\tno number formats defined for this locale ${locale} in file ${f}\n" ) if( $DEBUG ); } # NOTE: Adding locale units &log( "\tAdding locale units." ); $j = 0; my $unit_locale_map = { compoundUnit => { type => 'compound', xpath_unit => './compoundUnit', xpath_pat => './*[local-name()="unitPrefixPattern" or local-name()="compoundUnitPattern" or local-name()="compoundUnitPattern1"]', }, unit => { type => 'regular', xpath_unit => './unit', xpath_pat => './*[local-name()="unitPattern" or local-name()="perUnitPattern"]', }, }; my $pattern_type_map = { compoundUnitPattern => 'regular', compoundUnitPattern1 => 'regular', unitPrefixPattern => 'prefix', unitPattern => 'regular', perUnitPattern => 'per-unit', }; $sth = $sths->{units_l10n} || die( "Unable to get a statement object for table 'units_l10n'." ); my $process_unit = sub { my( $def, $kids ) = @_; my $patterns = []; foreach my $el_kid ( @$kids ) { my $tag = $el_kid->nodeName; if( $tag eq 'displayName' ) { $def->{locale_name} = $el_kid->textContent; } elsif( exists( $pattern_type_map->{ $tag } ) ) { $def->{pattern_type} = $pattern_type_map->{ $tag }; $def->{unit_pattern} = trim( $el_kid->textContent ); if( $el_kid->hasAttribute( 'count' ) ) { $def->{count} = $el_kid->getAttribute( 'count' ); } if( $el_kid->hasAttribute( 'gender' ) ) { $def->{gender} = $el_kid->getAttribute( 'gender' ); } push( @$patterns, { %$def } ); } else { die( "Unknown element tag for this unit ID $def->{unit_id} for unit length $def->{format_length} for locale ${locale} in file ${f} for this element: ", $el_kid->toString() ); } } return( $patterns ); }; my $unitsRes = $mainDoc->findnodes( '/ldml/units/unitLength' ); &log( sprintf( "\t%d locale unit information found.", $unitsRes->size ) ); while( my $el = $unitsRes->shift ) { my $len = $el->getAttribute( 'type' ) || die( "Unable to get this unit length from the attribute 'type' for locale ${locale} in file ${f}" ); my $unitsAliasRes = $el->findnodes( './alias[@path]' ); # Example: # if( $unitsAliasRes->size ) { $el = resolve_alias( $unitsAliasRes ) || die( "Unit length is aliased, but I could not resolve it for locale ${locale} in file ${f}" ); } foreach my $u_type ( sort( keys( %$unit_locale_map ) ) ) { my $this = $unit_locale_map->{ $u_type }; my $type = $this->{type}; my $unitsRes = $el->findnodes( $this->{xpath_unit} ); if( !$unitsRes->size ) { warn( "Warning only: no unit definition found for units of type ${type} with length ${len} for locale ${locale} in file ${f}" ); next; } # Example: # while( my $el_unit = $unitsRes->shift ) { my $id = $el_unit->getAttribute( 'type' ) || die( "Unable to get the unit ID from the attribute 'type' for this locale ${locale} in file ${f} for this element: ", $el_unit->toString() ); my $unitAliasRes = $el_unit->findnodes( './alias[@path]' ); # If this unit is aliased if( $unitAliasRes->size ) { $el_unit = resolve_alias( $unitAliasRes ) || die( "This unit length with ID ${id} is aliased, but could not resolve it for this locale ${locale} in file ${f} for this element: ", $el->toString() ); } my @kids = $el_unit->nonBlankChildNodes; if( !scalar( @kids ) ) { die( "Unable to get any definition elements for this unit ID ${id} for locale ${locale} in file ${f} for this element: ", $el_unit->toString() ); } my $patterns = []; my( $locale_name, $gender ); foreach my $el_kid ( @kids ) { my $tag = $el_kid->nodeName; if( $tag eq 'displayName' ) { $locale_name = $el_kid->textContent; } elsif( $tag eq 'gender' ) { $gender = trim( $el_kid->textContent ); if( $gender ne 'feminine' && $gender ne 'masculine' && $gender ne 'neuter' && $gender ne 'inanimate' && $gender ne 'common' ) { die( "The gender for this unit ID ${id} with length ${len} is '${gender}', but I expected either 'masculine', 'feminine', 'neuter' or 'inanimate' for locale ${locale} in file ${f} for this element: ", $el_unit->toString() ); } } elsif( exists( $pattern_type_map->{ $tag } ) ) { my $def = { locale => $locale, format_length => $len, unit_type => $type, unit_id => $id, locale_name => $locale_name, }; $def->{pattern_type} = $pattern_type_map->{ $tag }; $def->{unit_pattern} = trim( $el_kid->textContent ); if( $el_kid->hasAttribute( 'count' ) ) { $def->{count} = $el_kid->getAttribute( 'count' ); } if( $el_kid->hasAttribute( 'gender' ) ) { $def->{gender} = $el_kid->getAttribute( 'gender' ); } elsif( defined( $gender ) ) { $def->{gender} = $gender; } if( $el_kid->hasAttribute( 'case' ) ) { $def->{gram_case} = $el_kid->getAttribute( 'case' ); } push( @$patterns, $def ); } else { die( "Unknown element tag '${tag}' for this unit ID ${id} for unit length ${len} for locale ${locale} in file ${f} for this element: ", $el_kid->toString() ); } } foreach my $def ( @$patterns ) { eval { $sth->execute( @$def{qw( locale format_length unit_type unit_id unit_pattern pattern_type locale_name count gender gram_case )} ); } || die( "Error executing SQL query to add unit information with id ${id} with length ${len} for locale ${locale} in file ${f}: ", ( $@ || $sth->errstr ), "\nSQL Query: ", $sth->{Statement}, "\n", dump( $def ) ); $j++; } } # Done checking all the unit definitions } # Done checking each known unit types (compound and regular) } # Done checking all unit length definitions $out->printf( "\t%d locale unit information added.\n", $j ) if( $DEBUG ); # NOTE: Checking localised names for calendar IDs &log( "Checking localised names for calendar IDs." ); my $calendarNamesRes = $mainDoc->findnodes( '/ldml/localeDisplayNames/types/type[@key="calendar"]' ); $j = 0; while( my $el = $calendarNamesRes->shift ) { my $def = { locale => $locale, calendar => ( $el->getAttribute( 'type' ) || die( "Localised calendar is missing its ID for locale ${locale} in file ${f} for this element: ", $el->toString() ) ), locale_name => trim( $el->textContent ), }; eval { $sth_cals_l10n->execute( @$def{qw( locale calendar locale_name )} ); } || die( "Error executing SQL query to add localised calendar ID information with calendar id $def->{calendar} for locale ${locale} in file ${f}: ", ( $@ || $sth_cals_l10n->errstr ), "\nSQL Query: ", $sth_cals_l10n->{Statement}, "\n", dump( $def ) ); $j++; } $out->printf( "\t%d locale calendar ID information added.\n", $j ) if( $DEBUG ); # NOTE: Checking localised names for number system IDs &log( "Checking localised names for number system IDs." ); my $numberSystemNamesRes = $mainDoc->findnodes( '/ldml/localeDisplayNames/types/type[@key="numbers"]' ); $j = 0; while( my $el = $numberSystemNamesRes->shift ) { my $def = { locale => $locale, number_system => ( $el->getAttribute( 'type' ) || die( "Localised number system is missing its ID for locale ${locale} in file ${f} for this element: ", $el->toString() ) ), locale_name => trim( $el->textContent ), }; if( $el->hasAttribute( 'alt' ) ) { $def->{alt} = $el->getAttribute( 'alt' ); } $out->print( "\t[$def->{number_system}] " ) if( $DEBUG ); # finance, native and traditional are part of other possible numbering systems, but undefined in the CLDR # if( !exists( $number_systems->{ $def->{number_system} } ) ) { warn( "Warning only: the number system '$def->{number_system}' used in localised data for locale '${locale}' is unknown to us in file ${f} for element: ", $el->toString() ) unless( $def->{number_system} eq 'native' || $def->{number_system} eq 'traditional' or $def->{number_system} eq 'finance' ); $out->print( "unknown, skipping.\n" ) if( $DEBUG ); next; } eval { $sth_num_sys_l10n->execute( @$def{qw( locale number_system locale_name alt )} ); } || die( "Error executing SQL query to add localised number system ID information with number system id $def->{number_system} for locale ${locale} in file ${f}: ", ( $@ || $sth_num_sys_l10n->errstr ), "\nSQL Query: ", $sth_num_sys_l10n->{Statement}, "\n", dump( $def ) ); $j++; $out->print( "ok\n" ) if( $DEBUG ); } $out->printf( "\t%d locale number system ID information added.\n", $j ) if( $DEBUG ); # NOTE: Checking localised names for collation IDs &log( "Checking localised names for collation IDs." ); my $collationNamesRes = $mainDoc->findnodes( '/ldml/localeDisplayNames/types/type[@key="collation"]' ); $j = 0; while( my $el = $collationNamesRes->shift ) { my $def = { locale => $locale, collation => ( $el->getAttribute( 'type' ) || die( "Localised collation is missing its ID for locale ${locale} in file ${f} for this element: ", $el->toString() ) ), locale_name => trim( $el->textContent ), }; eval { $sth_collation_l10n->execute( @$def{qw( locale collation locale_name )} ); } || die( "Error executing SQL query to add localised collation ID information with collation id $def->{collation} for locale ${locale} in file ${f}: ", ( $@ || $sth_collation_l10n->errstr ), "\nSQL Query: ", $sth_collation_l10n->{Statement}, "\n", dump( $def ) ); $j++; } $out->printf( "\t%d locale collation ID information added.\n", $j ) if( $DEBUG ); $n++; } &log( "${n} locales information processed." ); # NOTE: Loading annotations &log( "Loading annotations." ); $n = 0; my $l = 0; $anno_dir->open || die( "Unable to open annotation directory ${anno_dir}: ", $anno_dir->error ); # add_missing_to_dir( $anno_dir ); $sth = $sths->{annotations} || die( "No SQL statement object for annotations" ); # while( my $f = $anno_dir->read( as_object => 1, exclude_invisible => 1 ) ) @files = $anno_dir->read( as_object => 1, exclude_invisible => 1, 'sort' => 1 ); foreach my $f ( @files ) { next if( $f->extension ne 'xml' ); my $annoDoc = load_xml( $f ); my $locale = identity_to_locale( $annoDoc ); ( my $locale2 = $f->basename( '.xml' ) ) =~ tr/_/-/; if( lc( $locale ) ne lc( $locale2 ) && $locale2 ne 'root' ) { warn( "XML identity says the locale is '${locale}', but the file basename says it should be '${locale2}', and I think the file basename is correct for file $f" ); $locale = $locale2; } if( index( $locale, 'root' ) != -1 ) { if( length( $locale ) > 4 ) { my $loc = Locale::Unicode->new( $locale ); $loc->language( 'und' ); $locale = $loc->as_string; } else { $locale = 'und'; } } $out->print( "[${locale}] " ) if( $DEBUG ); $l++; my $annoRes = $annoDoc->findnodes( '//annotations/annotation' ); if( !$annoRes->size ) { warn( "Warning only: unable to get the annotation data for locale '${locale}' in file $f" ); } my $i = 0; while( my $el = $annoRes->get_node(++$i) ) { my $id = $el->getAttribute( 'cp' ); if( !defined( $id ) || !length( $id ) ) { die( "No ID set for this annotation element: ", $el->toString() ); } # Example: < or & if( index( $id, '&' ) != -1 && index( $id, ';' ) != -1 ) { $id = decode_entities( $id ); } my $val = $el->textContent; if( index( $val, '&' ) != -1 && index( $val, ';' ) != -1 ) { $val = decode_entities( $val ); } my $defaults = [split( /[[:blank:]\h]*\|[[:blank:]\h]*/, $val )]; my $tts; my $sibling = $annoRes->get_node( $i + 1 ); if( $sibling && $sibling->getAttribute( 'cp' ) eq $id && $sibling->hasAttribute( 'type' ) && ( $sibling->getAttribute( 'type' ) || '' ) eq 'tts' ) { $tts = $sibling->textContent; $i++; if( !defined( $tts ) || !length( $tts ) ) { die( "TTS definition exists for this annotation '${id}' at position ${i}, but the TTS value is empty." ); } elsif( index( $tts, '|' ) != -1 ) { die( "It seems this TTS value is designed to contain multiple values. This is unexpected, and would require a change in the database schema to reflect that." ); } } eval { $sth->execute( $locale, $id, to_array( $defaults ), $tts ); } || die( "Error adding localised information for annotation No ${i} (${id}): ", ( $@ || $sth->errstr ) ); $n += ( defined( $tts ) ? 2 : 1 ); } $out->print( "ok ${i} annotations added.\n" ) if( $DEBUG ); } &log( "${n} annotations added for ${l} locales." ); # NOTE: Loading languages match rules &log( "Loading languages match rules." ); $n = 0; $sth = $sths->{languages_match} || die( "No SQL statement object for languages_match" ); my $lang_match_file = $basedir->child( 'supplemental/languageInfo.xml' ); my $langMatchDoc = load_xml( $lang_match_file ); my $langMatchVar = $langMatchDoc->findnodes( '/supplementalData/languageMatching/languageMatches/matchVariable' ) || die( "Unable to get the language match variables in ${lang_match_file}" ); $out->print( $langMatchVar->size, " language match variables found.\n" ) if( $DEBUG ); my $langMatchRes = $langMatchDoc->findnodes( '/supplementalData/languageMatching/languageMatches/languageMatch' ) || die( "Unable to get the language matches in ${lang_match_file}" ); # Transform separator from '_' to '-' my $normalise_sep = 1; my $seq = 0; my $lang_match_bool_map = { true => 1, false => 0, }; # By default, desired and supported are symmetric: # while( my $el = $langMatchVar->shift ) { # my $var = $el->getAttribute( 'id' ) || die( "No variable name set in attribute 'id' for this element: ", $el->toString() ); my $data = $el->getAttribute( 'value' ) || die( "No variable value set in attribute 'value' for this element: ", $el->toString() ); # The algorithm is actually more versatile with '+' adding to the set and '-' removing from set # Luckily, the latter is not used, so we can just simply add all to the set # Might need to improve on that in the future through, as this might become a liability my $val = [split( /\+/, $data )]; $var =~ s/^\$//; $out->print( "Found variable '${var}' with values: ", join( ', ', @$val ), "\n" ) if( $DEBUG ); $lang_vars->{ $var } = $val; } while( my $el = $langMatchRes->shift ) { # my $def = {}; foreach my $prop ( qw( desired supported distance ) ) { my $val = $el->getAttribute( $prop ); if( !defined( $val ) || !length( $val ) ) { die( "No variable value set in attribute '${prop}' for this element: ", $el->toString() ); } $val =~ s/_/-/gs if( ( $prop eq 'desired' or $prop eq 'supported' ) && $normalise_sep ); $def->{ $prop } = $val; } if( $el->hasAttribute( 'oneway' ) ) { my $bool = $el->getAttribute( 'oneway' ) || die( "No boolean value set in attribute 'oneway' for this element: ", $el->toString() ); if( !exists( $lang_match_bool_map->{ $bool } ) ) { die( "No match found in boolean map for value '${bool}'" ); } # We reverse the value, since the XML specifies whether this entry is asymmetric $def->{is_symetric} = ( $lang_match_bool_map->{ $bool } ? 0 : 1 ); } $out->print( "[$def->{desired} -> $def->{supported}] " ) if( $DEBUG ); # # # There is a match variable embedded if( index( $def->{desired}, '*' ) != -1 || index( $def->{desired}, '$' ) != -1 ) { $def->{sequence} = ++$seq; # $def->{was} = { desired => $def->{desired} }; # # # # # # # $def->{desired} =~ s{ ^ (?: (?\*|[a-zA-Z0-9]+) | (?: (?\*|[a-zA-Z0-9]+) (?[^\*a-zA-Z0-9]+) (?: (?: (?