use 5.010; use strict; use warnings; package HTML::HTML5::Sanity; BEGIN { $HTML::HTML5::Sanity::AUTHORITY = 'cpan:TOBYINK'; $HTML::HTML5::Sanity::VERSION = '0.105'; } require Exporter; our @ISA = qw(Exporter); our %EXPORT_TAGS = ( 'all' => [ qw(fix_document) ], 'standard' => [ qw(fix_document) ], ); our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); our @EXPORT = ( @{ $EXPORT_TAGS{'standard'} } ); our $FIX_LANG_ATTRIBUTES = 1; use Locale::Country qw(country_code2code LOCALE_CODE_ALPHA_2 LOCALE_CODE_NUMERIC); use XML::LibXML qw(:ns :libxml); our $lang_3to2 = { 'aar' => 'aa' , 'abk' => 'ab' , 'ave' => 'ae' , 'afr' => 'af' , 'aka' => 'ak' , 'amh' => 'am' , 'arg' => 'an' , 'ara' => 'ar' , 'asm' => 'as' , 'ava' => 'av' , 'aym' => 'ay' , 'aze' => 'az' , 'bak' => 'ba' , 'bel' => 'be' , 'bul' => 'bg' , 'bih' => 'bh' , 'bis' => 'bi' , 'bam' => 'bm' , 'ben' => 'bn' , 'tib' => 'bo' , 'bod' => 'bo' , 'bre' => 'br' , 'bos' => 'bs' , 'cat' => 'ca' , 'che' => 'ce' , 'cha' => 'ch' , 'cos' => 'co' , 'cre' => 'cr' , 'cze' => 'cs' , 'ces' => 'cs' , 'chu' => 'cu' , 'chv' => 'cv' , 'wel' => 'cy' , 'cym' => 'cy' , 'dan' => 'da' , 'ger' => 'de' , 'deu' => 'de' , 'div' => 'dv' , 'dzo' => 'dz' , 'ewe' => 'ee' , 'gre' => 'el' , 'ell' => 'el' , 'eng' => 'en' , 'epo' => 'eo' , 'spa' => 'es' , 'est' => 'et' , 'baq' => 'eu' , 'eus' => 'eu' , 'per' => 'fa' , 'fas' => 'fa' , 'ful' => 'ff' , 'fin' => 'fi' , 'fij' => 'fj' , 'fao' => 'fo' , 'fre' => 'fr' , 'fra' => 'fr' , 'fry' => 'fy' , 'gle' => 'ga' , 'gla' => 'gd' , 'glg' => 'gl' , 'grn' => 'gn' , 'guj' => 'gu' , 'glv' => 'gv' , 'hau' => 'ha' , 'heb' => 'he' , 'hin' => 'hi' , 'hmo' => 'ho' , 'hrv' => 'hr' , 'hat' => 'ht' , 'hat' => 'ht' , 'hun' => 'hu' , 'arm' => 'hy' , 'hye' => 'hy' , 'her' => 'hz' , 'ina' => 'ia' , 'ind' => 'id' , 'ile' => 'ie' , 'ibo' => 'ig' , 'iii' => 'ii' , 'ipk' => 'ik' , 'ido' => 'io' , 'ice' => 'is' , 'isl' => 'is' , 'ita' => 'it' , 'iku' => 'iu' , 'jpn' => 'ja' , 'jav' => 'jv' , 'geo' => 'ka' , 'kat' => 'ka' , 'kon' => 'kg' , 'kik' => 'ki' , 'kik' => 'ki' , 'kua' => 'kj' , 'kaz' => 'kk' , 'kal' => 'kl' , 'khm' => 'km' , 'kan' => 'kn' , 'kor' => 'ko' , 'kau' => 'kr' , 'kas' => 'ks' , 'kur' => 'ku' , 'kom' => 'kv' , 'cor' => 'kw' , 'kir' => 'ky' , 'lat' => 'la' , 'ltz' => 'lb' , 'ltz' => 'lb' , 'lug' => 'lg' , 'lim' => 'li' , 'lin' => 'ln' , 'lao' => 'lo' , 'lit' => 'lt' , 'lub' => 'lu' , 'lav' => 'lv' , 'mlg' => 'mg' , 'mah' => 'mh' , 'mao' => 'mi' , 'mri' => 'mi' , 'mac' => 'mk' , 'mkd' => 'mk' , 'mal' => 'ml' , 'mon' => 'mn' , 'mar' => 'mr' , 'may' => 'ms' , 'msa' => 'ms' , 'mlt' => 'mt' , 'bur' => 'my' , 'mya' => 'my' , 'nau' => 'na' , 'nob' => 'nb' , 'nde' => 'nd' , 'nep' => 'ne' , 'ndo' => 'ng' , 'dut' => 'nl' , 'nld' => 'nl' , 'nno' => 'nn' , 'nor' => 'no' , 'nbl' => 'nr' , 'nav' => 'nv' , 'nya' => 'ny' , 'oci' => 'oc' , 'oji' => 'oj' , 'orm' => 'om' , 'ori' => 'or' , 'oss' => 'os' , 'pan' => 'pa' , 'pli' => 'pi' , 'pol' => 'pl' , 'pus' => 'ps' , 'por' => 'pt' , 'que' => 'qu' , 'roh' => 'rm' , 'run' => 'rn' , 'rum' => 'ro' , 'ron' => 'ro' , 'rus' => 'ru' , 'kin' => 'rw' , 'san' => 'sa' , 'srd' => 'sc' , 'snd' => 'sd' , 'sme' => 'se' , 'sag' => 'sg' , 'sin' => 'si' , 'slo' => 'sk' , 'slk' => 'sk' , 'slv' => 'sl' , 'smo' => 'sm' , 'sna' => 'sn' , 'som' => 'so' , 'alb' => 'sq' , 'sqi' => 'sq' , 'srp' => 'sr' , 'ssw' => 'ss' , 'sot' => 'st' , 'sun' => 'su' , 'swe' => 'sv' , 'swa' => 'sw' , 'tam' => 'ta' , 'tel' => 'te' , 'tgk' => 'tg' , 'tha' => 'th' , 'tir' => 'ti' , 'tuk' => 'tk' , 'tgl' => 'tl' , 'tsn' => 'tn' , 'ton' => 'to' , 'tur' => 'tr' , 'tso' => 'ts' , 'tat' => 'tt' , 'twi' => 'tw' , 'tah' => 'ty' , 'uig' => 'ug' , 'ukr' => 'uk' , 'urd' => 'ur' , 'uzb' => 'uz' , 'ven' => 've' , 'vie' => 'vi' , 'vol' => 'vo' , 'wln' => 'wa' , 'wol' => 'wo' , 'xho' => 'xh' , 'yid' => 'yi' , 'yor' => 'yo' , 'zha' => 'za' , 'chi' => 'zh' , 'zho' => 'zh' , 'zul' => 'zu' , }; our $lang_grandfather = { 'art-lojban' => 'jbo', 'i-ami' => 'ami', 'i-bnn' => 'bnn', 'i-hak' => 'hak', 'i-klingon' => 'tlh', 'i-lux' => 'lb', 'i-navajo' => 'nv', 'i-pwn' => 'pwn', #haha 'i-tao' => 'tao', 'i-tay' => 'tay', 'i-tsu' => 'tsu', 'no-bok' => 'nb', 'no-nyn' => 'nn', 'sgn-be-fr' => 'sfb', 'sgn-be-nl' => 'vgt', 'sgn-ch-de' => 'sgg', 'zh-guoyu' => 'cmn', 'zh-hakka' => 'hak', 'zh-min-nan' => 'nan', 'zh-xiang' => 'hsn', }; our $obsolete_iso3166 = { 'UK' => 'GB', # Exceptionally reserved 'FX' => 'FR', # Exceptionally reserved 'ZR' => 'CD', # Zaire => Congo 'HV' => 'BF', # Upper Volta => Burkina Faso 'DY' => 'BJ', # Dahomey => Benin 'BU' => 'MM', # Burma => Myanmar 'TP' => 'TL', # East Timor => Timor-Leste 'NH' => 'VU', # New Hebrides => Vanuatu 'RH' => 'ZW', # Rhodesia => Zimbabwe }; our $canon_lang = {}; sub fix_document { my $old_document = shift; my $attribute_behaviour = shift || 0; my $new_document = XML::LibXML::Document->new; my $new_root = fix_element( $old_document->documentElement, $new_document, { ':' => 'http://www.w3.org/1999/xhtml', 'xml' => XML_XML_NS } ); $new_document->setDocumentElement($new_root); return $new_document; } sub fix_element { my $old_element = shift; my $new_document = shift; my $parent_declarations = shift; my $declared_namespaces = {}; foreach my $k (keys %{$parent_declarations}) { $declared_namespaces->{$k} = $parent_declarations->{$k}; } # Process namespace declarations on this element. foreach my $attr ($old_element->attributes) { next if $attr->nodeType == XML_NAMESPACE_DECL; if ($attr->nodeName =~ /^xmlns:(.*)$/) { my $prefix = $1; if ($prefix eq 'xml' && $attr->getData eq XML_XML_NS) { # that's OK. } elsif ($prefix eq 'xml' || $attr->getData eq XML_XML_NS) { next; } elsif ($prefix eq 'xmlns' || $attr->getData eq XML_XMLNS_NS) { next; } $declared_namespaces->{ $prefix } = $attr->getData; } } # Process any default XML Namespace my $hasExplicit = 0; if ($old_element->hasAttributeNS(undef, 'xmlns')) { $hasExplicit = 1; $declared_namespaces->{ ':' } = $old_element->getAttributeNS(undef, 'xmlns'); } # Create a new element. my $new_element; if ($hasExplicit) { $new_element = $new_document->createElementNS( $declared_namespaces->{ ':' }, $old_element->nodeName, ); } else { my $tag = $old_element->nodeName; if ($tag =~ /^([^:]+)\:([^:]+)$/) { my $ns_prefix = $1; my $localname = $2; if (defined $declared_namespaces->{$ns_prefix}) { $new_element = $new_document->createElementNS( $declared_namespaces->{$ns_prefix}, $tag); } } unless ($new_element) { $new_element = $new_document->createElementNS( $declared_namespaces->{ ':' }, $tag); } } # Add attributes to new element. foreach my $old_attr ($old_element->attributes) { next if $old_attr->nodeType == XML_NAMESPACE_DECL; # next if $old_attr->nodeName =~ /^xmlns(:.*)?$/; fix_attribute($old_attr, $new_element, $declared_namespaces); } # Process child nodes. foreach my $old_kid ($old_element->childNodes) { if ($old_kid->nodeType == XML_TEXT_NODE || $old_kid->nodeType == XML_CDATA_SECTION_NODE) { $new_element->appendTextNode($old_kid->nodeValue); } elsif ($old_kid->nodeType == XML_COMMENT_NODE) { $new_element->appendChild( $new_document->createComment($old_kid->nodeValue) ); } elsif ($old_kid->nodeType == XML_ELEMENT_NODE) { $new_element->appendChild( fix_element($old_kid, $new_document, $declared_namespaces) ); } } return $new_element; } sub fix_attribute { my $old_attribute = shift; my $new_element = shift; my $declared_namespaces = shift; my $name = $old_attribute->nodeName; my @new_attribute; if ($name =~ /^([^:]+)\:([^:]+)$/) { my $ns_prefix = $1; my $localname = $2; if (defined $declared_namespaces->{$ns_prefix}) { @new_attribute = ( $declared_namespaces->{$ns_prefix}, sprintf("%s:%s", $ns_prefix, $localname), ); } } my $node_value = $old_attribute->nodeValue; if ($FIX_LANG_ATTRIBUTES && $name =~ /^(xml:)?lang$/i) { return undef unless _valid_lang($node_value); if ($FIX_LANG_ATTRIBUTES == 2) { $node_value = _canon_lang($node_value); } } if (@new_attribute) { $new_element->setAttributeNS(@new_attribute, $node_value); } else { $new_element->setAttribute($name, $node_value); } return undef; } sub _valid_lang { my $value_to_test = shift; return 1 if (defined $value_to_test) && ($value_to_test eq ''); return 0 unless defined $value_to_test; # Regex for recognizing RFC 4646 well-formed tags # http://www.rfc-editor.org/rfc/rfc4646.txt # http://tools.ietf.org/html/draft-ietf-ltru-4646bis-21 # The structure requires no forward references, so it reverses the order. # It uses Java/Perl syntax instead of the old ABNF # The uppercase comments are fragments copied from RFC 4646 # Note: the tool requires that any real "=" or "#" or ";" in the regex be escaped. my $alpha = '[a-z]'; # ALPHA my $digit = '[0-9]'; # DIGIT my $alphanum = '[a-z0-9]'; # ALPHA / DIGIT my $x = 'x'; # private use singleton my $singleton = '[a-wyz]'; # other singleton my $s = '[_-]'; # separator -- lenient parsers will use [_-] -- strict will use [-] # Now do the components. The structure is slightly different to allow for capturing the right components. # The notation (?:....) is a non-capturing version of (...): so the "?:" can be deleted if someone doesn't care about capturing. my $language = '([a-z]{2,8}) | ([a-z]{2,3} [_-] [a-z]{3})'; # ABNF (2*3ALPHA) / 4ALPHA / 5*8ALPHA --- note: because of how | works in regex, don't use $alpha{2,3} | $alpha{4,8} # We don't have to have the general case of extlang, because there can be only one extlang (except for zh-min-nan). # Note: extlang invalid in Unicode language tags my $script = '[a-z]{4}' ; # 4ALPHA my $region = '(?: [a-z]{2}|[0-9]{3})' ; # 2ALPHA / 3DIGIT my $variant = '(?: [a-z0-9]{5,8} | [0-9] [a-z0-9]{3} )' ; # 5*8alphanum / (DIGIT 3alphanum) my $extension = '(?: [a-wyz] (?: [_-] [a-z0-9]{2,8} )+ )' ; # singleton 1*("-" (2*8alphanum)) my $privateUse = '(?: x (?: [_-] [a-z0-9]{1,8} )+ )' ; # "x" 1*("-" (1*8alphanum)) # Define certain grandfathered codes, since otherwise the regex is pretty useless. # Since these are limited, this is safe even later changes to the registry -- # the only oddity is that it might change the type of the tag, and thus # the results from the capturing groups. # http://www.iana.org/assignments/language-subtag-registry # Note that these have to be compared case insensitively, requiring (?i) below. my $grandfathered = '(?: (en [_-] GB [_-] oed) | (i [_-] (?: ami | bnn | default | enochian | hak | klingon | lux | mingo | navajo | pwn | tao | tay | tsu )) | (no [_-] (?: bok | nyn )) | (sgn [_-] (?: BE [_-] (?: fr | nl) | CH [_-] de )) | (zh [_-] min [_-] nan) )'; # old: | zh $s (?: cmn (?: $s Hans | $s Hant )? | gan | min (?: $s nan)? | wuu | yue ); # For well-formedness, we don't need the ones that would otherwise pass. # For validity, they need to be checked. # $grandfatheredWellFormed = (?: # art $s lojban # | cel $s gaulish # | zh $s (?: guoyu | hakka | xiang ) # ); # Unicode locales: but we are shifting to a compatible form # $keyvalue = (?: $alphanum+ \= $alphanum+); # $keywords = ($keyvalue (?: \; $keyvalue)*); # We separate items that we want to capture as a single group my $variantList = $variant . '(?:' . $s . $variant . ')*' ; # special for multiples my $extensionList = $extension . '(?:' . $s . $extension . ')*' ; # special for multiples my $langtag = " ($language) ($s ( $script ) )? ($s ( $region ) )? ($s ( $variantList ) )? ($s ( $extensionList ) )? ($s ( $privateUse ) )? "; # Here is the final breakdown, with capturing groups for each of these components # The variants, extensions, grandfathered, and private-use may have interior '-' my $r = ($value_to_test =~ /^( ($langtag) | ($privateUse) | ($grandfathered) )$/xi); return $r; } # If people use a non-canon lang once, they're likely to do it twice, so a little caching. sub _canon_lang { my $lang = shift; unless (defined $canon_lang->{$lang}) { $canon_lang->{$lang} = __canon_lang($lang); } return $canon_lang->{$lang}; } sub __canon_lang { # Use lower case only my $lang = lc shift; # If there's a 2 letter code where a three letter code has been used, replace it. if ($lang =~ /^([a-z]{3})/) { substr($lang, 0, 3) = $lang_3to2->{$1} if defined $lang_3to2->{$1}; } return $lang_grandfather->{$lang} if defined $lang_grandfather->{$lang}; # Simple case. return $lang if length $lang < 4; # Replace '_' with '-'. $lang =~ s/_/-/g; # upper case the country component of lang-country pairs. return sprintf('%s-%s', $1, _canon_country($2)) if $lang =~ /^([a-z]{2,3})-([a-z]{2}|\d{3})$/; # title case the script component of lang-script pairs. return sprintf('%s-%s', $1, _canon_script($2)) if $lang =~ /^([a-z]{2,3})-([a-z]{4})$/; # title case the script component, upper case country componet of lang-script-country triplets. return sprintf('%s-%s-%s', $1, _canon_script($2), _canon_country($3)) if $lang =~ /^([a-z]{2,3})-([a-z]{4})-([a-z]{2}|\d{3})$/; # Too complicated - give up and return lower case. return $lang; } sub _canon_country { my $c = uc shift; if ($c =~ /^\d\d\d$/) { my $c1 = country_code2code($c, LOCALE_CODE_NUMERIC, LOCALE_CODE_ALPHA_2); $c = uc $c1 if defined $c1 && length $c1; } return $obsolete_iso3166->{$c} if defined $obsolete_iso3166->{$c}; return $c; } sub _canon_script { my $s = ucfirst lc shift; return $s; } 1; __END__ =head1 NAME HTML::HTML5::Sanity - make HTML5 DOM trees less insane =head1 SYNOPSIS use HTML::HTML5::Parser; use HTML::HTML5::Sanity; my $parser = HTML::HTML5::Parser->new; my $html5_dom = $parser->parse_file('http://example.com/'); my $sane_dom = fix_document($html5_dom); =head1 DESCRIPTION The Document Object Model (DOM) generated by HTML::HTML5::Parser meets the requirements of the HTML5 spec, but will probably catch a lot of people by surprise. The main oddity is that elements and attributes which appear to be namespaced are not really. For example, the following element:
...
Looks like it should be parsed so that it has an attribute "lang" in the XML namespace. Not so. It will really be parsed as having the attribute "xml:lang" in the null namespace. =over 4 =item C<< fix_document($document) >> $sane_dom = fix_document($html5_dom); Returns a modified copy of the DOM and leaving the original DOM unmodified. =item C<< fix_element($element_node, $new_document_node, \%namespaces) >> Don't use this. Not exported. =item C<< fix_attribute($attribute_node, $new_element_node, \%namespaces) >> Don't use this. Not exported. =item C<$HTML::HTML5::Sanity::FIX_LANG_ATTRIBUTES> $HTML::HTML5::Sanity::FIX_LANG_ATTRIBUTES = 2; $sane_dom = fix_document($html5_dom); If set to 1 (the default), the package will detect invalid values in @lang and @xml:lang, and remove the attribute if it is invalid. If set to 2, it will also attempt to canonicalise the value (e.g. 'EN_GB' will be converted to to 'en-GB'). If set to 0, then the value of language attributes is not checked. =back =head1 BUGS Please report any bugs to L. =head1 SEE ALSO L, L, L. =head1 AUTHOR Toby Inkster Etobyink@cpan.orgE. =head1 COPYRIGHT AND LICENSE Copyright (C) 2009-2014 by Toby Inkster This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 DISCLAIMER OF WARRANTIES THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. =cut