use
5.010;
BEGIN {
$HTML::HTML5::Sanity::AUTHORITY
=
'cpan:TOBYINK'
;
$HTML::HTML5::Sanity::VERSION
=
'0.105'
;
}
our
@ISA
=
qw(Exporter)
;
our
%EXPORT_TAGS
= (
'all'
=> [
qw(fix_document)
],
'standard'
=> [
qw(fix_document)
],
);
our
@EXPORT_OK
= ( @{
$EXPORT_TAGS
{
'all'
} } );
our
@EXPORT
= ( @{
$EXPORT_TAGS
{
'standard'
} } );
our
$FIX_LANG_ATTRIBUTES
= 1;
use
Locale::Country
qw(country_code2code LOCALE_CODE_ALPHA_2 LOCALE_CODE_NUMERIC)
;
our
$lang_3to2
= {
'aar'
=>
'aa'
,
'abk'
=>
'ab'
,
'ave'
=>
'ae'
,
'afr'
=>
'af'
,
'aka'
=>
'ak'
,
'amh'
=>
'am'
,
'arg'
=>
'an'
,
'ara'
=>
'ar'
,
'asm'
=>
'as'
,
'ava'
=>
'av'
,
'aym'
=>
'ay'
,
'aze'
=>
'az'
,
'bak'
=>
'ba'
,
'bel'
=>
'be'
,
'bul'
=>
'bg'
,
'bih'
=>
'bh'
,
'bis'
=>
'bi'
,
'bam'
=>
'bm'
,
'ben'
=>
'bn'
,
'tib'
=>
'bo'
,
'bod'
=>
'bo'
,
'bre'
=>
'br'
,
'bos'
=>
'bs'
,
'cat'
=>
'ca'
,
'che'
=>
'ce'
,
'cha'
=>
'ch'
,
'cos'
=>
'co'
,
'cre'
=>
'cr'
,
'cze'
=>
'cs'
,
'ces'
=>
'cs'
,
'chu'
=>
'cu'
,
'chv'
=>
'cv'
,
'wel'
=>
'cy'
,
'cym'
=>
'cy'
,
'dan'
=>
'da'
,
'ger'
=>
'de'
,
'deu'
=>
'de'
,
'div'
=>
'dv'
,
'dzo'
=>
'dz'
,
'ewe'
=>
'ee'
,
'gre'
=>
'el'
,
'ell'
=>
'el'
,
'eng'
=>
'en'
,
'epo'
=>
'eo'
,
'spa'
=>
'es'
,
'est'
=>
'et'
,
'baq'
=>
'eu'
,
'eus'
=>
'eu'
,
'per'
=>
'fa'
,
'fas'
=>
'fa'
,
'ful'
=>
'ff'
,
'fin'
=>
'fi'
,
'fij'
=>
'fj'
,
'fao'
=>
'fo'
,
'fre'
=>
'fr'
,
'fra'
=>
'fr'
,
'fry'
=>
'fy'
,
'gle'
=>
'ga'
,
'gla'
=>
'gd'
,
'glg'
=>
'gl'
,
'grn'
=>
'gn'
,
'guj'
=>
'gu'
,
'glv'
=>
'gv'
,
'hau'
=>
'ha'
,
'heb'
=>
'he'
,
'hin'
=>
'hi'
,
'hmo'
=>
'ho'
,
'hrv'
=>
'hr'
,
'hat'
=>
'ht'
,
'hat'
=>
'ht'
,
'hun'
=>
'hu'
,
'arm'
=>
'hy'
,
'hye'
=>
'hy'
,
'her'
=>
'hz'
,
'ina'
=>
'ia'
,
'ind'
=>
'id'
,
'ile'
=>
'ie'
,
'ibo'
=>
'ig'
,
'iii'
=>
'ii'
,
'ipk'
=>
'ik'
,
'ido'
=>
'io'
,
'ice'
=>
'is'
,
'isl'
=>
'is'
,
'ita'
=>
'it'
,
'iku'
=>
'iu'
,
'jpn'
=>
'ja'
,
'jav'
=>
'jv'
,
'geo'
=>
'ka'
,
'kat'
=>
'ka'
,
'kon'
=>
'kg'
,
'kik'
=>
'ki'
,
'kik'
=>
'ki'
,
'kua'
=>
'kj'
,
'kaz'
=>
'kk'
,
'kal'
=>
'kl'
,
'khm'
=>
'km'
,
'kan'
=>
'kn'
,
'kor'
=>
'ko'
,
'kau'
=>
'kr'
,
'kas'
=>
'ks'
,
'kur'
=>
'ku'
,
'kom'
=>
'kv'
,
'cor'
=>
'kw'
,
'kir'
=>
'ky'
,
'lat'
=>
'la'
,
'ltz'
=>
'lb'
,
'ltz'
=>
'lb'
,
'lug'
=>
'lg'
,
'lim'
=>
'li'
,
'lin'
=>
'ln'
,
'lao'
=>
'lo'
,
'lit'
=>
'lt'
,
'lub'
=>
'lu'
,
'lav'
=>
'lv'
,
'mlg'
=>
'mg'
,
'mah'
=>
'mh'
,
'mao'
=>
'mi'
,
'mri'
=>
'mi'
,
'mac'
=>
'mk'
,
'mkd'
=>
'mk'
,
'mal'
=>
'ml'
,
'mon'
=>
'mn'
,
'mar'
=>
'mr'
,
'may'
=>
'ms'
,
'msa'
=>
'ms'
,
'mlt'
=>
'mt'
,
'bur'
=>
'my'
,
'mya'
=>
'my'
,
'nau'
=>
'na'
,
'nob'
=>
'nb'
,
'nde'
=>
'nd'
,
'nep'
=>
'ne'
,
'ndo'
=>
'ng'
,
'dut'
=>
'nl'
,
'nld'
=>
'nl'
,
'nno'
=>
'nn'
,
'nor'
=>
'no'
,
'nbl'
=>
'nr'
,
'nav'
=>
'nv'
,
'nya'
=>
'ny'
,
'oci'
=>
'oc'
,
'oji'
=>
'oj'
,
'orm'
=>
'om'
,
'ori'
=>
'or'
,
'oss'
=>
'os'
,
'pan'
=>
'pa'
,
'pli'
=>
'pi'
,
'pol'
=>
'pl'
,
'pus'
=>
'ps'
,
'por'
=>
'pt'
,
'que'
=>
'qu'
,
'roh'
=>
'rm'
,
'run'
=>
'rn'
,
'rum'
=>
'ro'
,
'ron'
=>
'ro'
,
'rus'
=>
'ru'
,
'kin'
=>
'rw'
,
'san'
=>
'sa'
,
'srd'
=>
'sc'
,
'snd'
=>
'sd'
,
'sme'
=>
'se'
,
'sag'
=>
'sg'
,
'sin'
=>
'si'
,
'slo'
=>
'sk'
,
'slk'
=>
'sk'
,
'slv'
=>
'sl'
,
'smo'
=>
'sm'
,
'sna'
=>
'sn'
,
'som'
=>
'so'
,
'alb'
=>
'sq'
,
'sqi'
=>
'sq'
,
'srp'
=>
'sr'
,
'ssw'
=>
'ss'
,
'sot'
=>
'st'
,
'sun'
=>
'su'
,
'swe'
=>
'sv'
,
'swa'
=>
'sw'
,
'tam'
=>
'ta'
,
'tel'
=>
'te'
,
'tgk'
=>
'tg'
,
'tha'
=>
'th'
,
'tir'
=>
'ti'
,
'tuk'
=>
'tk'
,
'tgl'
=>
'tl'
,
'tsn'
=>
'tn'
,
'ton'
=>
'to'
,
'tur'
=>
'tr'
,
'tso'
=>
'ts'
,
'tat'
=>
'tt'
,
'twi'
=>
'tw'
,
'tah'
=>
'ty'
,
'uig'
=>
'ug'
,
'ukr'
=>
'uk'
,
'urd'
=>
'ur'
,
'uzb'
=>
'uz'
,
'ven'
=>
've'
,
'vie'
=>
'vi'
,
'vol'
=>
'vo'
,
'wln'
=>
'wa'
,
'wol'
=>
'wo'
,
'xho'
=>
'xh'
,
'yid'
=>
'yi'
,
'yor'
=>
'yo'
,
'zha'
=>
'za'
,
'chi'
=>
'zh'
,
'zho'
=>
'zh'
,
'zul'
=>
'zu'
,
};
our
$lang_grandfather
= {
'art-lojban'
=>
'jbo'
,
'i-ami'
=>
'ami'
,
'i-bnn'
=>
'bnn'
,
'i-hak'
=>
'hak'
,
'i-klingon'
=>
'tlh'
,
'i-lux'
=>
'lb'
,
'i-navajo'
=>
'nv'
,
'i-pwn'
=>
'pwn'
,
'i-tao'
=>
'tao'
,
'i-tay'
=>
'tay'
,
'i-tsu'
=>
'tsu'
,
'no-bok'
=>
'nb'
,
'no-nyn'
=>
'nn'
,
'sgn-be-fr'
=>
'sfb'
,
'sgn-be-nl'
=>
'vgt'
,
'sgn-ch-de'
=>
'sgg'
,
'zh-guoyu'
=>
'cmn'
,
'zh-hakka'
=>
'hak'
,
'zh-min-nan'
=>
'nan'
,
'zh-xiang'
=>
'hsn'
,
};
our
$obsolete_iso3166
= {
'UK'
=>
'GB'
,
'FX'
=>
'FR'
,
'ZR'
=>
'CD'
,
'HV'
=>
'BF'
,
'DY'
=>
'BJ'
,
'BU'
=>
'MM'
,
'TP'
=>
'TL'
,
'NH'
=>
'VU'
,
'RH'
=>
'ZW'
,
};
our
$canon_lang
= {};
sub
fix_document
{
my
$old_document
=
shift
;
my
$attribute_behaviour
=
shift
|| 0;
my
$new_document
= XML::LibXML::Document->new;
my
$new_root
= fix_element(
$old_document
->documentElement,
$new_document
,
);
$new_document
->setDocumentElement(
$new_root
);
return
$new_document
;
}
sub
fix_element
{
my
$old_element
=
shift
;
my
$new_document
=
shift
;
my
$parent_declarations
=
shift
;
my
$declared_namespaces
= {};
foreach
my
$k
(
keys
%{
$parent_declarations
})
{
$declared_namespaces
->{
$k
} =
$parent_declarations
->{
$k
};
}
foreach
my
$attr
(
$old_element
->attributes)
{
next
if
$attr
->nodeType == XML_NAMESPACE_DECL;
if
(
$attr
->nodeName =~ /^xmlns:(.*)$/)
{
my
$prefix
= $1;
if
(
$prefix
eq
'xml'
&&
$attr
->getData eq XML_XML_NS)
{
}
elsif
(
$prefix
eq
'xml'
||
$attr
->getData eq XML_XML_NS)
{
next
;
}
elsif
(
$prefix
eq
'xmlns'
||
$attr
->getData eq XML_XMLNS_NS)
{
next
;
}
$declared_namespaces
->{
$prefix
} =
$attr
->getData;
}
}
my
$hasExplicit
= 0;
if
(
$old_element
->hasAttributeNS(
undef
,
'xmlns'
))
{
$hasExplicit
= 1;
$declared_namespaces
->{
':'
} =
$old_element
->getAttributeNS(
undef
,
'xmlns'
);
}
my
$new_element
;
if
(
$hasExplicit
)
{
$new_element
=
$new_document
->createElementNS(
$declared_namespaces
->{
':'
},
$old_element
->nodeName,
);
}
else
{
my
$tag
=
$old_element
->nodeName;
if
(
$tag
=~ /^([^:]+)\:([^:]+)$/)
{
my
$ns_prefix
= $1;
my
$localname
= $2;
if
(
defined
$declared_namespaces
->{
$ns_prefix
})
{
$new_element
=
$new_document
->createElementNS(
$declared_namespaces
->{
$ns_prefix
},
$tag
);
}
}
unless
(
$new_element
)
{
$new_element
=
$new_document
->createElementNS(
$declared_namespaces
->{
':'
},
$tag
);
}
}
foreach
my
$old_attr
(
$old_element
->attributes)
{
next
if
$old_attr
->nodeType == XML_NAMESPACE_DECL;
fix_attribute(
$old_attr
,
$new_element
,
$declared_namespaces
);
}
foreach
my
$old_kid
(
$old_element
->childNodes)
{
if
(
$old_kid
->nodeType == XML_TEXT_NODE
||
$old_kid
->nodeType == XML_CDATA_SECTION_NODE)
{
$new_element
->appendTextNode(
$old_kid
->nodeValue);
}
elsif
(
$old_kid
->nodeType == XML_COMMENT_NODE)
{
$new_element
->appendChild(
$new_document
->createComment(
$old_kid
->nodeValue)
);
}
elsif
(
$old_kid
->nodeType == XML_ELEMENT_NODE)
{
$new_element
->appendChild(
fix_element(
$old_kid
,
$new_document
,
$declared_namespaces
)
);
}
}
return
$new_element
;
}
sub
fix_attribute
{
my
$old_attribute
=
shift
;
my
$new_element
=
shift
;
my
$declared_namespaces
=
shift
;
my
$name
=
$old_attribute
->nodeName;
my
@new_attribute
;
if
(
$name
=~ /^([^:]+)\:([^:]+)$/)
{
my
$ns_prefix
= $1;
my
$localname
= $2;
if
(
defined
$declared_namespaces
->{
$ns_prefix
})
{
@new_attribute
= (
$declared_namespaces
->{
$ns_prefix
},
sprintf
(
"%s:%s"
,
$ns_prefix
,
$localname
),
);
}
}
my
$node_value
=
$old_attribute
->nodeValue;
if
(
$FIX_LANG_ATTRIBUTES
&&
$name
=~ /^(xml:)?lang$/i)
{
return
undef
unless
_valid_lang(
$node_value
);
if
(
$FIX_LANG_ATTRIBUTES
== 2)
{
$node_value
= _canon_lang(
$node_value
);
}
}
if
(
@new_attribute
)
{
$new_element
->setAttributeNS(
@new_attribute
,
$node_value
);
}
else
{
$new_element
->setAttribute(
$name
,
$node_value
);
}
return
undef
;
}
sub
_valid_lang
{
my
$value_to_test
=
shift
;
return
1
if
(
defined
$value_to_test
) && (
$value_to_test
eq
''
);
return
0
unless
defined
$value_to_test
;
my
$alpha
=
'[a-z]'
;
my
$digit
=
'[0-9]'
;
my
$alphanum
=
'[a-z0-9]'
;
my
$x
=
'x'
;
my
$singleton
=
'[a-wyz]'
;
my
$s
=
'[_-]'
;
my
$language
=
'([a-z]{2,8}) | ([a-z]{2,3} [_-] [a-z]{3})'
;
my
$script
=
'[a-z]{4}'
;
my
$region
=
'(?: [a-z]{2}|[0-9]{3})'
;
my
$variant
=
'(?: [a-z0-9]{5,8} | [0-9] [a-z0-9]{3} )'
;
my
$extension
=
'(?: [a-wyz] (?: [_-] [a-z0-9]{2,8} )+ )'
;
my
$privateUse
=
'(?: x (?: [_-] [a-z0-9]{1,8} )+ )'
;
my
$grandfathered
= '(?:
(en [_-] GB [_-] oed)
| (i [_-] (?: ami | bnn |
default
| enochian | hak | klingon | lux | mingo | navajo | pwn | tao | tay | tsu ))
| (
no
[_-] (?: bok | nyn ))
| (sgn [_-] (?: BE [_-] (?: fr | nl) | CH [_-] de ))
| (zh [_-] min [_-] nan)
)';
my
$variantList
=
$variant
.
'(?:'
.
$s
.
$variant
.
')*'
;
my
$extensionList
=
$extension
.
'(?:'
.
$s
.
$extension
.
')*'
;
my
$langtag
= "
(
$language
)
(
$s
(
$script
) )?
(
$s
(
$region
) )?
(
$s
(
$variantList
) )?
(
$s
(
$extensionList
) )?
(
$s
(
$privateUse
) )?
";
my
$r
= (
$value_to_test
=~
/^(
(
$langtag
)
| (
$privateUse
)
| (
$grandfathered
)
)$/xi);
return
$r
;
}
sub
_canon_lang
{
my
$lang
=
shift
;
unless
(
defined
$canon_lang
->{
$lang
})
{
$canon_lang
->{
$lang
} = __canon_lang(
$lang
);
}
return
$canon_lang
->{
$lang
};
}
sub
__canon_lang
{
my
$lang
=
lc
shift
;
if
(
$lang
=~ /^([a-z]{3})/)
{
substr
(
$lang
, 0, 3) =
$lang_3to2
->{$1}
if
defined
$lang_3to2
->{$1};
}
return
$lang_grandfather
->{
$lang
}
if
defined
$lang_grandfather
->{
$lang
};
return
$lang
if
length
$lang
< 4;
$lang
=~ s/_/-/g;
return
sprintf
(
'%s-%s'
, $1, _canon_country($2))
if
$lang
=~ /^([a-z]{2,3})-([a-z]{2}|\d{3})$/;
return
sprintf
(
'%s-%s'
, $1, _canon_script($2))
if
$lang
=~ /^([a-z]{2,3})-([a-z]{4})$/;
return
sprintf
(
'%s-%s-%s'
, $1, _canon_script($2), _canon_country($3))
if
$lang
=~ /^([a-z]{2,3})-([a-z]{4})-([a-z]{2}|\d{3})$/;
return
$lang
;
}
sub
_canon_country
{
my
$c
=
uc
shift
;
if
(
$c
=~ /^\d\d\d$/)
{
my
$c1
= country_code2code(
$c
, LOCALE_CODE_NUMERIC, LOCALE_CODE_ALPHA_2);
$c
=
uc
$c1
if
defined
$c1
&&
length
$c1
;
}
return
$obsolete_iso3166
->{
$c
}
if
defined
$obsolete_iso3166
->{
$c
};
return
$c
;
}
sub
_canon_script
{
my
$s
=
ucfirst
lc
shift
;
return
$s
;
}
1;