use
Mojo::Util
qw(html_attr_unescape html_unescape xml_escape)
;
has
tree
=>
sub
{ [
'root'
] };
has
'xml'
;
my
$ATTR_RE
=
qr/
([^<>=\s\/
]+|\/)
(?:
\s*=\s*
(?s:(["'])(.*?)\g{-2}|([^>\s]*))
)?
\s*
/x;
my
$TOKEN_RE
=
qr/
([^<]+)? # Text
(?:
<(?:
!(?:
DOCTYPE(
\s+\w+ # Doctype
(?:(?:\s+\w+)?(?:\s+(?:"[^"]*"|'[^']*'))+)? # External ID
(?:\s+\[.+?\])? # Int Subset
\s*)
|
--(.*?)--\s* # Comment
|
\[CDATA\[(.*?)\]\] # CDATA
)
|
\?(.*?)\? # Processing Instruction
|
\s*([^<>\s]+\s*(?:(?:$ATTR_RE){0,32766})*+) # Tag
)>
|
(<) # Runaway "<"
)??
/
xis;
my
%RAW
=
map
{
$_
=> 1 }
qw(script style)
;
my
%RCDATA
=
map
{
$_
=> 1 }
qw(title textarea)
;
my
%END
= (
body
=>
'head'
,
optgroup
=>
'optgroup'
,
option
=>
'option'
);
map
{
$END
{
$_
} =
'p'
} (
qw(address article aside blockquote details div dl fieldset figcaption)
,
qw(figure footer form h1 h2 h3 h4 h5 h6 header hgroup hr main menu nav ol p)
,
qw(pre section table ul)
);
my
%TABLE
=
map
{
$_
=> 1 }
qw(colgroup tbody td tfoot th thead tr)
;
my
%CLOSE
= (
li
=> [{
li
=> 1}, {
ul
=> 1,
ol
=> 1}],
tr
=> [{
tr
=> 1}, {
table
=> 1}]);
$CLOSE
{
$_
} = [\
%TABLE
, {
table
=> 1}]
for
qw(colgroup tbody tfoot thead)
;
$CLOSE
{
$_
} = [{
dd
=> 1,
dt
=> 1}, {
dl
=> 1}]
for
qw(dd dt)
;
$CLOSE
{
$_
} = [{
rp
=> 1,
rt
=> 1}, {
ruby
=> 1}]
for
qw(rp rt)
;
$CLOSE
{
$_
} = [{
th
=> 1,
td
=> 1}, {
table
=> 1}]
for
qw(td th)
;
my
%EMPTY
=
map
{
$_
=> 1 } (
qw(area base br col embed hr img input keygen link menuitem meta param)
,
qw(source track wbr)
);
my
@PHRASING
= (
qw(a abbr area audio b bdi bdo br button canvas cite code data datalist)
,
qw(del dfn em embed i iframe img input ins kbd keygen label link map mark)
,
qw(math meta meter noscript object output picture progress q ruby s samp)
,
qw(script select slot small span strong sub sup svg template textarea time u)
,
qw(var video wbr)
);
my
@OBSOLETE
=
qw(acronym applet basefont big font strike tt)
;
my
%PHRASING
=
map
{
$_
=> 1 }
@OBSOLETE
,
@PHRASING
;
my
%BLOCK
=
map
{
$_
=> 1 } (
qw(a address applet article aside b big blockquote body button caption)
,
qw(center code col colgroup dd details dialog dir div dl dt em fieldset)
,
qw(figcaption figure font footer form frameset h1 h2 h3 h4 h5 h6 head)
,
qw(header hgroup html i iframe li listing main marquee menu nav nobr)
,
qw(noembed noframes noscript object ol optgroup option p plaintext pre rp)
,
qw(rt s script section select small strike strong style summary table)
,
qw(tbody td template textarea tfoot th thead title tr tt u ul xmp)
);
sub
parse {
my
(
$self
,
$html
) = (
shift
,
"$_[0]"
);
my
$xml
=
$self
->xml;
my
$current
=
my
$tree
= [
'root'
];
while
(
$html
=~ /\G
$TOKEN_RE
/gcso) {
my
(
$text
,
$doctype
,
$comment
,
$cdata
,
$pi
,
$tag
,
$runaway
)
= ($1, $2, $3, $4, $5, $6, $11);
$text
.=
'<'
if
defined
$runaway
;
_node(
$current
,
'text'
, html_unescape
$text
)
if
defined
$text
;
if
(
defined
$tag
) {
if
(
$tag
=~ /^\/\s*(\S+)/) { _end(
$xml
? $1 :
lc
$1,
$xml
, \
$current
) }
elsif
(
$tag
=~ m!^([^\s/]+)([\s\S]*)!) {
my
(
$start
,
$attr
) = (
$xml
? $1 :
lc
$1, $2);
my
(
%attrs
,
$closing
);
while
(
$attr
=~ /
$ATTR_RE
/go) {
my
(
$key
,
$value
) = (
$xml
? $1 :
lc
$1, $3 // $4);
++
$closing
and
next
if
$key
eq
'/'
;
$attrs
{
$key
} =
defined
$value
? html_attr_unescape
$value
:
$value
;
}
$start
=
'img'
if
!
$xml
&&
$start
eq
'image'
;
_start(
$start
, \
%attrs
,
$xml
, \
$current
);
_end(
$start
,
$xml
, \
$current
)
if
!
$xml
&&
$EMPTY
{
$start
} || (
$xml
|| !
$BLOCK
{
$start
}) &&
$closing
;
next
if
$xml
|| !
$RAW
{
$start
} && !
$RCDATA
{
$start
};
next
unless
$html
=~ m!\G(.*?)<\s*/\s*\Q
$start
\E\s*>!gcsi;
_node(
$current
,
'raw'
,
$RCDATA
{
$start
} ? html_unescape $1 : $1);
_end(
$start
, 0, \
$current
);
}
}
elsif
(
defined
$doctype
) { _node(
$current
,
'doctype'
,
$doctype
) }
elsif
(
defined
$comment
) { _node(
$current
,
'comment'
,
$comment
) }
elsif
(
defined
$cdata
) { _node(
$current
,
'cdata'
,
$cdata
) }
elsif
(
defined
$pi
) {
$self
->xml(
$xml
= 1)
if
!
exists
$self
->{xml} &&
$pi
=~ /xml/i;
_node(
$current
,
'pi'
,
$pi
);
}
}
return
$self
->tree(
$tree
);
}
sub
render { _render(
$_
[0]->tree,
$_
[0]->xml) }
sub
_end {
my
(
$end
,
$xml
,
$current
) =
@_
;
my
$next
=
$$current
;
do
{
return
if
$next
->[0] eq
'root'
;
return
$$current
=
$next
->[3]
if
$next
->[1] eq
$end
;
return
if
!
$xml
&&
$PHRASING
{
$end
} && !
$PHRASING
{
$next
->[1]};
}
while
$next
=
$next
->[3];
}
sub
_node {
my
(
$current
,
$type
,
$content
) =
@_
;
push
@$current
,
my
$new
= [
$type
,
$content
,
$current
];
weaken
$new
->[2];
}
sub
_render {
my
(
$tree
,
$xml
) =
@_
;
my
$type
=
$tree
->[0];
return
xml_escape
$tree
->[1]
if
$type
eq
'text'
;
return
$tree
->[1]
if
$type
eq
'raw'
;
return
'<!DOCTYPE'
.
$tree
->[1] .
'>'
if
$type
eq
'doctype'
;
return
'<!--'
.
$tree
->[1] .
'-->'
if
$type
eq
'comment'
;
return
'<![CDATA['
.
$tree
->[1] .
']]>'
if
$type
eq
'cdata'
;
return
'<?'
.
$tree
->[1] .
'?>'
if
$type
eq
'pi'
;
return
join
''
,
map
{ _render(
$_
,
$xml
) }
@$tree
[1 ..
$#$tree
]
if
$type
eq
'root'
;
my
$tag
=
$tree
->[1];
my
$result
=
"<$tag"
;
for
my
$key
(
sort
keys
%{
$tree
->[2]}) {
my
$value
=
$tree
->[2]{
$key
};
$result
.=
$xml
?
qq{ $key="$key"}
:
" $key"
and
next
unless
defined
$value
;
$result
.=
qq{ $key="}
. xml_escape(
$value
) .
'"'
;
}
return
$xml
?
"$result />"
:
$EMPTY
{
$tag
} ?
"$result>"
:
"$result></$tag>"
unless
$tree
->[4];
no
warnings
'recursion'
;
$result
.=
'>'
.
join
''
,
map
{ _render(
$_
,
$xml
) }
@$tree
[4 ..
$#$tree
];
return
"$result</$tag>"
;
}
sub
_start {
my
(
$start
,
$attrs
,
$xml
,
$current
) =
@_
;
if
(!
$xml
&&
$$current
->[0] ne
'root'
) {
if
(
my
$end
=
$END
{
$start
}) { _end(
$end
, 0,
$current
) }
elsif
(
my
$close
=
$CLOSE
{
$start
}) {
my
(
$allowed
,
$scope
) =
@$close
;
my
$parent
=
$$current
;
while
(
$parent
->[0] ne
'root'
&& !
$scope
->{
$parent
->[1]}) {
_end(
$parent
->[1], 0,
$current
)
if
$allowed
->{
$parent
->[1]};
$parent
=
$parent
->[3];
}
}
}
push
@
$$current
,
my
$new
= [
'tag'
,
$start
,
$attrs
,
$$current
];
weaken
$new
->[3];
$$current
=
$new
;
}
1;