use
5.010;
DOCTYPE_NIL
=>
''
,
DOCTYPE_HTML32
=>
'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">'
,
DOCTYPE_HTML5
=>
'<!DOCTYPE html>'
,
DOCTYPE_LEGACY
=>
'<!DOCTYPE html SYSTEM "about:legacy-compat">'
,
DOCTYPE_HTML2
=>
'<!DOCTYPE html PUBLIC "-//IETF//DTD HTML 2.0//EN">'
,
};
our
$VERSION
=
'0.201'
;
our
%EXPORT_TAGS
= (
doctype
=> [
qw(DOCTYPE_NIL DOCTYPE_HTML32 DOCTYPE_HTML4 DOCTYPE_HTML5
DOCTYPE_LEGACY DOCTYPE_XHTML1 DOCTYPE_XHTML11 DOCTYPE_XHTML_BASIC
DOCTYPE_XHTML_RDFA DOCTYPE_HTML2 DOCTYPE_HTML40 DOCTYPE_HTML40_STRICT
DOCTYPE_HTML40_LOOSE DOCTYPE_HTML40_FRAMESET DOCTYPE_HTML401
DOCTYPE_HTML401_STRICT DOCTYPE_HTML401_LOOSE DOCTYPE_HTML401_FRAMESET
DOCTYPE_XHTML1_STRICT DOCTYPE_XHTML1_LOOSE DOCTYPE_XHTML1_FRAMESET
DOCTYPE_XHTML_MATHML_SVG DOCTYPE_XHTML_BASIC_10 DOCTYPE_XHTML_BASIC_11
DOCTYPE_HTML4_RDFA DOCTYPE_HTML401_RDFA11 DOCTYPE_HTML401_RDFA10
DOCTYPE_XHTML_RDFA10 DOCTYPE_XHTML_RDFA11)
]
);
our
@EXPORT_OK
= @{
$EXPORT_TAGS
{doctype} };
our
@VoidElements
=
qw(area base br col command embed hr
img input keygen link meta param source track wbr)
;
our
@BooleanAttributes
=
qw(
hidden
audio@autoplay audio@preload audio@controls audio@loop
button@autofocus button@disabled button@formnovalidate
command@checked command@disabled
details@open
dl@compact
fieldset@disabled
form@novalidate
hr@noshade
iframe@seamless
img@ismap
input@autofocus input@checked input@disabled input@formnovalidate
input@multiple input@readonly input@required
keygen@autofocus keygen@disabled
ol@reversed
optgroup@disabled
option@disabled option@selected
script@async script@defer
select@autofocus select@disabled select@multiple select@readonly
select@required
style@scoped
textarea@autofocus textarea@disabled textarea@required
time@pubdate
track@default
video@autoplay video@preload video@controls video@loop
)
;
our
@OptionalStart
=
qw(html head body tbody)
;
our
@OptionalEnd
=
qw(html head body tbody dt dd li optgroup
option p rp rt td th tfoot thead tr)
;
sub
new
{
my
(
$class
,
%opts
) =
@_
;
my
$self
=
bless
\
%opts
=>
$class
;
$self
->{
'markup'
} //=
'html'
;
$self
->{
'charset'
} //=
'utf8'
;
$self
->{
'refs'
} //=
'hex'
;
$self
->{
'doctype'
} //= (
$self
->is_xhtml? DOCTYPE_LEGACY : DOCTYPE_HTML5);
$self
->{
'polyglot'
} //= !!
$self
->is_xhtml;
return
$self
;
}
sub
is_xhtml
{
my
(
$self
) =
@_
;
return
(
$self
->{
'markup'
} =~ m
'^(xml|xhtml|application/xml|text/xml|application/xhtml\+xml)$'
i);
}
sub
is_polyglot
{
my
(
$self
) =
@_
;
return
$self
->{
'polyglot'
};
}
sub
should_quote_attributes
{
my
(
$self
) =
@_
;
return
$self
->{
'quote_attributes'
}
if
exists
$self
->{
'quote_attributes'
};
return
$self
->is_xhtml ||
$self
->is_polyglot;
}
sub
should_slash_voids
{
my
(
$self
) =
@_
;
return
$self
->{
'voids'
}
if
exists
$self
->{
'voids'
};
return
$self
->is_xhtml ||
$self
->is_polyglot;
}
sub
should_force_end_tags
{
my
(
$self
) =
@_
;
return
$self
->{
'end_tags'
}
if
exists
$self
->{
'end_tags'
};
return
$self
->is_xhtml ||
$self
->is_polyglot;
}
sub
should_force_start_tags
{
my
(
$self
) =
@_
;
return
$self
->{
'start_tags'
}
if
exists
$self
->{
'start_tags'
};
return
$self
->is_xhtml ||
$self
->is_polyglot;
}
sub
document
{
my
(
$self
,
$document
) =
@_
;
my
@childNodes
=
$document
->childNodes;
return
$self
->doctype
.
join
''
, (
map
{
$self
->_element_etc(
$_
); }
@childNodes
);
}
sub
doctype
{
my
(
$self
) =
@_
;
return
$self
->{
'doctype'
};
}
sub
_element_etc
{
my
(
$self
,
$etc
) =
@_
;
if
(
$etc
->nodeName eq
'#text'
)
{
return
$self
->text(
$etc
); }
elsif
(
$etc
->nodeName eq
'#comment'
)
{
return
$self
->comment(
$etc
); }
elsif
(
$etc
->nodeName eq
'#cdata-section'
)
{
return
$self
->cdata(
$etc
); }
elsif
(
$etc
->isa(
'XML::LibXML::PI'
))
{
return
$self
->pi(
$etc
); }
else
{
return
$self
->element(
$etc
); }
}
sub
element
{
my
(
$self
,
$element
) =
@_
;
return
$element
->toString
my
$rv
=
''
;
my
$tagname
=
$element
->nodeName;
my
%attrs
=
map
{
$_
->
nodeName
=>
$_
}
$element
->attributes;
my
@kids
=
$element
->childNodes;
if
(
$tagname
eq
'html'
&& !
$self
->is_xhtml && !
$self
->is_polyglot)
{
delete
$attrs
{
'xmlns'
};
}
my
$omitstart
= 0;
if
(!
%attrs
and !
$self
->should_force_start_tags and
grep
{
$tagname
eq
$_
}
@OptionalStart
)
{
$omitstart
+=
eval
"return \$self->_check_omit_start_${tagname}(\$element);"
;
}
my
$omitend
= 0;
if
(!
$self
->should_force_end_tags and
grep
{
$tagname
eq
$_
}
@OptionalEnd
)
{
$omitend
+=
eval
"return \$self->_check_omit_end_${tagname}(\$element);"
;
}
unless
(
$omitstart
)
{
$rv
.=
'<'
.
$tagname
;
foreach
my
$a
(
sort
keys
%attrs
)
{
$rv
.=
' '
.
$self
->attribute(
$attrs
{
$a
},
$element
);
}
}
if
(!
@kids
and
grep
{
$tagname
eq
$_
}
@VoidElements
and !
$omitstart
)
{
$rv
.=
$self
->should_slash_voids ?
' />'
:
'>'
;
return
$rv
;
}
$rv
.=
'>'
unless
$omitstart
;
foreach
my
$kid
(
@kids
)
{
$rv
.=
$self
->_element_etc(
$kid
);
}
unless
(
$omitend
)
{
$rv
.=
'</'
.
$tagname
.
'>'
;
}
return
$rv
;
}
sub
attribute
{
my
(
$self
,
$attr
,
$element
) =
@_
;
my
$minimize
= 0;
my
$quote
= 1;
my
$quotechar
=
'"'
;
my
$attrname
=
$attr
->nodeName;
my
$elemname
=
$element
?
$element
->nodeName :
'*'
;
unless
(
$self
->should_quote_attributes)
{
if
((
$attr
->value eq
$attrname
or
$attr
->value eq
''
)
and
grep
{
$_
eq
$attrname
or
$_
eq
sprintf
(
'%s@%s'
,
$elemname
,
$attrname
) }
@BooleanAttributes
)
{
return
$attrname
;
}
if
(
$attr
->value =~ /^[A-Za-z0-9\._:-]+$/)
{
return
sprintf
(
'%s=%s'
,
$attrname
,
$attr
->value);
}
}
my
$encoded_value
;
if
(
$attr
->value !~ /\"/)
{
$quotechar
=
'"'
;
$encoded_value
=
$self
->encode_entities(
$attr
->value);
}
elsif
(
$attr
->value !~ /\'/)
{
$quotechar
=
"'"
;
$encoded_value
=
$self
->encode_entities(
$attr
->value);
}
else
{
$quotechar
=
'"'
;
$encoded_value
=
$self
->encode_entities(
$attr
->value,
characters
=>
"\""
);
}
return
sprintf
(
'%s=%s%s%s'
,
$attrname
,
$quotechar
,
$encoded_value
,
$quotechar
);
}
sub
comment
{
my
(
$self
,
$text
) =
@_
;
return
'<!--'
.
$self
->encode_entities(
$text
->nodeValue) .
'-->'
;
}
sub
pi
{
my
(
$self
,
$pi
) =
@_
;
if
(
$pi
->nodeName eq
'decode'
)
{
return
HTML::HTML5::Entities::decode(
$pi
->textContent);
}
return
$pi
->toString;
}
sub
cdata
{
my
(
$self
,
$text
) =
@_
;
if
(
$self
->is_polyglot &&
$text
->parentNode->nodeName =~ /^(script|style)$/i)
{
return
'/* <![CDATA[ */'
.
$text
->nodeValue .
'/* ]]> */'
;
}
elsif
(!
$self
->is_xhtml &&
$text
->parentNode->nodeName =~ /^(script|style)$/i)
{
return
$text
->nodeValue;
}
elsif
(!
$self
->is_xhtml)
{
return
$self
->text(
$text
);
}
else
{
return
'<![CDATA['
.
$text
->nodeValue .
']]>'
;
}
}
sub
text
{
my
(
$self
,
$text
) =
@_
;
if
(
$self
->is_polyglot &&
$text
->parentNode->nodeName =~ /^(script|style)$/i)
{
return
'/* <![CDATA[ */'
.
$text
->nodeValue .
'/* ]]> */'
;
}
elsif
(!
$self
->is_xhtml &&
$text
->parentNode->nodeName =~ /^(script|style)$/i)
{
return
$text
->nodeValue;
}
elsif
(
$text
->parentNode->nodeName =~ /^(script|style)$/i)
{
return
'<![CDATA['
.
$text
->nodeValue .
']]>'
;
}
return
$self
->encode_entities(
$text
->nodeValue,
characters
=>
"<>"
);
}
sub
encode_entities
{
my
(
$self
,
$string
,
%options
) =
@_
;
my
$characters
=
$options
{
'characters'
};
$characters
.=
'&'
;
$characters
.=
'\x{0}-\x{8}\x{B}\x{C}\x{E}-\x{1F}\x{26}\x{7F}'
;
$characters
.=
'\x{80}-\x{FFFFFF}'
unless
$self
->{
'charset'
} =~ /^utf[_-]?8$/i;
my
$regexp
=
qr/[$characters]/
;
local
$HTML::HTML5::Entities::hex
= (
$self
->{
'refs'
} !~ /dec/i);
return
HTML::HTML5::Entities::encode_entities(
$string
,
$regexp
);
}
sub
encode_entity
{
my
(
$self
,
$char
) =
@_
;
local
$HTML::HTML5::Entities::hex
= (
$self
->{
'refs'
} !~ /dec/i);
return
HTML::HTML5::Entities::encode_entities(
$char
,
qr/./
);
}
sub
_check_omit_end_body
{
my
(
$self
,
$element
) =
@_
;
my
$next
=
$element
->nextSibling;
unless
(
defined
$next
&&
$next
->nodeName eq
'#comment'
)
{
return
1
if
$element
->childNodes || !
$self
->_check_omit_start_body(
$element
);
}
}
sub
_check_omit_end_head
{
my
(
$self
,
$element
) =
@_
;
my
$next
=
$element
->nextSibling;
return
0
unless
defined
$next
;
return
0
if
$next
->nodeName eq
'#comment'
;
return
0
if
$next
->nodeName eq
'#text'
&&
$next
->nodeValue =~ /^\s/;
return
1;
}
sub
_check_omit_end_html
{
my
(
$self
,
$element
) =
@_
;
my
@bodies
=
$element
->getChildrenByTagName(
'body'
);
if
(
$bodies
[-1]->childNodes ||
$bodies
[-1]->attributes)
{
return
!
defined
$element
->nextSibling;
}
}
sub
_check_omit_end_dd
{
my
(
$self
,
$element
) =
@_
;
return
1
unless
defined
$element
->nextSibling;
return
1
if
$element
->nextSibling->nodeName
=~ /^( dd | dt )$/x;
}
*_check_omit_end_dt
= \
&_check_omit_end_dd
;
sub
_check_omit_end_li
{
my
(
$self
,
$element
) =
@_
;
return
1
unless
defined
$element
->nextSibling;
return
1
if
$element
->nextSibling->nodeName
=~ /^( li )$/x;
}
sub
_check_omit_end_optgroup
{
my
(
$self
,
$element
) =
@_
;
return
1
unless
defined
$element
->nextSibling;
return
1
if
$element
->nextSibling->nodeName
=~ /^( optgroup )$/x;
}
sub
_check_omit_end_option
{
my
(
$self
,
$element
) =
@_
;
return
1
unless
defined
$element
->nextSibling;
return
1
if
$element
->nextSibling->nodeName
=~ /^( option | optgroup )$/x;
}
sub
_check_omit_end_p
{
my
(
$self
,
$element
) =
@_
;
return
1
unless
defined
$element
->nextSibling;
return
1
if
$element
->nextSibling->nodeName
=~ /^( address | article | aside | blockquote | dir
| div | dl | fieldset | footer | form | h[1-6]
| header | hr | menu | nav | ol | p | pre | section
| table | ul )$/x;
}
sub
_check_omit_end_rp
{
my
(
$self
,
$element
) =
@_
;
return
1
unless
defined
$element
->nextSibling;
return
1
if
$element
->nextSibling->nodeName
=~ /^( rp | rt )$/x;
}
*_check_omit_end_rt
= \
&_check_omit_end_rp
;
sub
_check_omit_end_td
{
my
(
$self
,
$element
) =
@_
;
return
1
unless
defined
$element
->nextSibling;
return
1
if
$element
->nextSibling->nodeName
=~ /^( td | th )$/x;
}
*_check_omit_end_th
= \
&_check_omit_end_td
;
sub
_check_omit_end_tbody
{
my
(
$self
,
$element
) =
@_
;
return
1
unless
defined
$element
->nextSibling;
return
1
if
$element
->nextSibling->nodeName
=~ /^( tbody | tfoot )$/x;
}
sub
_check_omit_end_tfoot
{
my
(
$self
,
$element
) =
@_
;
return
1
unless
defined
$element
->nextSibling;
return
1
if
$element
->nextSibling->nodeName
=~ /^( tbody )$/x;
}
sub
_check_omit_end_thead
{
my
(
$self
,
$element
) =
@_
;
return
0
unless
defined
$element
->nextSibling;
return
1
if
$element
->nextSibling->nodeName
=~ /^( tbody | tfoot )$/x;
}
sub
_check_omit_end_tr
{
my
(
$self
,
$element
) =
@_
;
return
1
unless
defined
$element
->nextSibling;
return
1
if
$element
->nextSibling->nodeName
=~ /^(
tr
)$/x;
}
sub
_check_omit_start_body
{
my
(
$self
,
$element
) =
@_
;
my
@kids
=
$element
->childNodes;
my
$next
=
$kids
[0];
return
0
unless
defined
$next
;
return
0
if
$next
->nodeName eq
'#comment'
;
return
0
if
$next
->nodeName eq
'#text'
&&
$next
->nodeValue =~ /^\s/;
return
0
if
$next
->nodeName eq
'style'
;
return
0
if
$next
->nodeName eq
'script'
;
return
1;
}
sub
_check_omit_start_head
{
my
(
$self
,
$element
) =
@_
;
my
@kids
=
$element
->childNodes;
return
(
@kids
and
$kids
[0]->nodeType==XML_ELEMENT_NODE);
}
sub
_check_omit_start_html
{
my
(
$self
,
$element
) =
@_
;
my
@kids
=
$element
->childNodes;
return
(
@kids
and
$kids
[0]->nodeName ne
'#comment'
);
}
sub
_check_omit_start_tbody
{
my
(
$self
,
$element
) =
@_
;
my
@kids
=
$element
->childNodes;
return
0
unless
@kids
;
return
0
unless
$kids
[0]->nodeName eq
'tr'
;
return
1
unless
defined
$element
->previousSibling;
return
1
if
$element
->previousSibling->nodeName eq
'tbody'
&&
$self
->_check_omit_end_tbody(
$element
->previousSibling);
return
1
if
$element
->previousSibling->nodeName eq
'thead'
&&
$self
->_check_omit_end_thead(
$element
->previousSibling);
return
1
if
$element
->previousSibling->nodeName eq
'tfoot'
&&
$self
->_check_omit_end_tfoot(
$element
->previousSibling);
}
1;