use
Mojo::Util
qw(decode encode html_unescape xml_escape)
;
has
[
qw(charset xml)
];
has
tree
=>
sub
{ [
'root'
] };
my
$ATTR_RE
=
qr/
\s*
([^=\s>]+) # Key
(?:
\s*=\s*
(?:
"([^"]*?)" # Quotation marks
|
'([^']*?)' # Apostrophes
|
([^>\s]*) # Unquoted
)
)?
\s*
/
x;
my
$END_RE
=
qr!^\s*/\s*(.+)\s*!
;
my
$TOKEN_RE
=
qr/
([^<]*) # Text
(?:
<\?(.*?)\?> # Processing Instruction
|
<!--(.*?)--\s*> # Comment
|
<!\[CDATA\[(.*?)\]\]> # CDATA
|
<!DOCTYPE(
\s+\w+
(?:(?:\s+\w+)?(?:\s+(?:"[^"]*"|'[^']*'))+)? # External ID
(?:\s+\[.+?\])? # Int Subset
\s*
)>
|
<(
\s*
[^>\s]+ # Tag
(?:$ATTR_RE)* # Attributes
)>
)??
/
xis;
my
%OPTIONAL
=
map
{
$_
=> 1 }
qw(body colgroup dd head li optgroup option p rt rp tbody td tfoot th)
;
my
%PARAGRAPH
=
map
{
$_
=> 1 } (
qw(address article aside blockquote dir div dl fieldset footer form h1 h2)
,
qw(h3 h4 h5 h6 header hgroup hr menu nav ol p pre section table ul)
);
my
%TABLE
=
map
{
$_
=> 1 }
qw(col colgroup tbody td th thead tr)
;
my
%VOID
=
map
{
$_
=> 1 } (
qw(area base br col command embed hr img input keygen link meta param)
,
qw(source track wbr)
);
my
%INLINE
=
map
{
$_
=> 1 } (
qw(a abbr acronym applet b basefont bdo big br button cite code del dfn em)
,
qw(font i iframe img ins input kbd label map object q s samp script select)
,
qw(small span strike strong sub sup textarea tt u var)
);
sub
parse {
my
(
$self
,
$html
) =
@_
;
my
$charset
=
$self
->charset;
$html
= decode(
$charset
,
$html
) //
return
$self
->charset(
undef
)
if
$charset
;
my
$tree
= [
'root'
];
my
$current
=
$tree
;
while
(
$html
=~ m/\G
$TOKEN_RE
/gcs) {
my
(
$text
,
$pi
,
$comment
,
$cdata
,
$doctype
,
$tag
)
= ($1, $2, $3, $4, $5, $6);
if
(
length
$text
) {
$text
= html_unescape
$text
if
(
index
$text
,
'&'
) >= 0;
$self
->_text(
$text
, \
$current
);
}
if
(
$doctype
) {
$self
->_doctype(
$doctype
, \
$current
) }
elsif
(
$comment
) {
$self
->_comment(
$comment
, \
$current
) }
elsif
(
$cdata
) {
$self
->_cdata(
$cdata
, \
$current
) }
elsif
(
$pi
) {
$self
->_pi(
$pi
, \
$current
) }
next
unless
$tag
;
my
$cs
=
$self
->xml;
if
(
$tag
=~
$END_RE
) {
$self
->_end(
$cs
? $1 :
lc
($1), \
$current
) }
elsif
(
$tag
=~ m!([^\s/]+)([\s\S]*)!) {
my
(
$start
,
$attr
) = (
$cs
? $1 :
lc
($1), $2);
my
%attrs
;
while
(
$attr
=~ /
$ATTR_RE
/g) {
my
$key
=
$cs
? $1 :
lc
($1);
my
$value
= $2 // $3 // $4;
next
if
$key
eq
'/'
;
$value
= html_unescape
$value
if
$value
&& (
index
$value
,
'&'
) >= 0;
$attrs
{
$key
} =
$value
;
}
$self
->_start(
$start
, \
%attrs
, \
$current
);
$self
->_end(
$start
, \
$current
)
if
(!
$self
->xml &&
$VOID
{
$start
}) ||
$attr
=~ m!/\s*$!;
if
(
grep
{
$_
eq
$start
}
qw(script style)
) {
if
(
$html
=~ m!\G(.*?)<\s*/\s
*$start
\s*>!gcsi) {
$self
->_raw($1, \
$current
);
$self
->_end(
$start
, \
$current
);
}
}
}
}
return
$self
->tree(
$tree
);
}
sub
render {
my
$self
=
shift
;
my
$content
=
$self
->_render(
$self
->tree);
my
$charset
=
$self
->charset;
return
$charset
? encode(
$charset
,
$content
) :
$content
;
}
sub
_cdata {
my
(
$self
,
$cdata
,
$current
) =
@_
;
push
@
$$current
, [
'cdata'
,
$cdata
];
}
sub
_close {
my
(
$self
,
$current
,
$tags
,
$stop
) =
@_
;
$tags
||= \
%TABLE
;
$stop
||=
'table'
;
my
$parent
=
$$current
;
while
(
$parent
) {
last
if
$parent
->[0] eq
'root'
||
$parent
->[1] eq
$stop
;
$tags
->{
$parent
->[1]} and
$self
->_end(
$parent
->[1],
$current
);
$parent
=
$parent
->[3];
}
}
sub
_comment {
my
(
$self
,
$comment
,
$current
) =
@_
;
push
@
$$current
, [
'comment'
,
$comment
];
}
sub
_doctype {
my
(
$self
,
$doctype
,
$current
) =
@_
;
push
@
$$current
, [
'doctype'
,
$doctype
];
}
sub
_end {
my
(
$self
,
$end
,
$current
) =
@_
;
return
if
$$current
->[0] eq
'root'
;
my
$found
= 0;
my
$next
=
$$current
;
while
(
$next
) {
last
if
$next
->[0] eq
'root'
;
++
$found
and
last
if
$next
->[1] eq
$end
;
return
if
!
$self
->xml &&
$INLINE
{
$end
} && !
$INLINE
{
$next
->[1]};
$next
=
$next
->[3];
}
return
unless
$found
;
$next
=
$$current
;
while
(
$$current
=
$next
) {
last
if
$$current
->[0] eq
'root'
;
$next
=
$$current
->[3];
if
(
$end
eq
$$current
->[1]) {
return
$$current
=
$$current
->[3] }
elsif
(
$OPTIONAL
{
$$current
->[1]}) {
$self
->_end(
$$current
->[1],
$current
);
}
elsif
(
$end
eq
'table'
) {
$self
->_close(
$current
) }
$self
->_end(
$$current
->[1],
$current
);
}
}
sub
_pi {
my
(
$self
,
$pi
,
$current
) =
@_
;
$self
->xml(1)
if
!
defined
$self
->xml &&
$pi
=~ /xml/i;
push
@
$$current
, [
'pi'
,
$pi
];
}
sub
_raw {
my
(
$self
,
$raw
,
$current
) =
@_
;
push
@
$$current
, [
'raw'
,
$raw
];
}
sub
_render {
my
(
$self
,
$tree
) =
@_
;
my
$e
=
$tree
->[0];
return
xml_escape
$tree
->[1]
if
$e
eq
'text'
;
return
$tree
->[1]
if
$e
eq
'raw'
;
return
"<!DOCTYPE"
.
$tree
->[1] .
">"
if
$e
eq
'doctype'
;
return
"<!--"
.
$tree
->[1] .
"-->"
if
$e
eq
'comment'
;
return
"<![CDATA["
.
$tree
->[1] .
"]]>"
if
$e
eq
'cdata'
;
return
"<?"
.
$tree
->[1] .
"?>"
if
$e
eq
'pi'
;
my
$start
=
$e
eq
'root'
? 1 : 2;
my
$content
=
''
;
if
(
$e
eq
'tag'
) {
$start
= 4;
my
$tag
=
$tree
->[1];
$content
.=
"<$tag"
;
my
@attrs
;
for
my
$key
(
sort
keys
%{
$tree
->[2]}) {
my
$value
=
$tree
->[2]{
$key
};
push
@attrs
,
$key
and
next
unless
defined
$value
;
push
@attrs
,
qq{$key="}
. xml_escape(
$value
) .
'"'
;
}
my
$attrs
=
join
' '
,
@attrs
;
$content
.=
" $attrs"
if
$attrs
;
return
$self
->xml ||
$VOID
{
$tag
} ?
"$content />"
:
"$content></$tag>"
unless
$tree
->[4];
$content
.=
'>'
;
}
$content
.=
$self
->_render(
$tree
->[
$_
])
for
$start
..
$#$tree
;
$content
.=
'</'
.
$tree
->[1] .
'>'
if
$e
eq
'tag'
;
return
$content
;
}
sub
_start {
my
(
$self
,
$start
,
$attrs
,
$current
) =
@_
;
if
(!
$self
->xml &&
$$current
->[0] ne
'root'
) {
if
(
$start
eq
'li'
) {
$self
->_close(
$current
, {
li
=> 1},
'ul'
) }
elsif
(
$PARAGRAPH
{
$start
}) {
$self
->_end(
'p'
,
$current
) }
elsif
(
$start
eq
'body'
) {
$self
->_end(
'head'
,
$current
) }
elsif
(
$start
eq
'optgroup'
) {
$self
->_end(
'optgroup'
,
$current
) }
elsif
(
$start
eq
'option'
) {
$self
->_end(
'option'
,
$current
) }
elsif
(
grep
{
$_
eq
$start
}
qw(colgroup thead tbody tfoot)
) {
$self
->_close(
$current
);
}
elsif
(
$start
eq
'tr'
) {
$self
->_close(
$current
, {
tr
=> 1}) }
elsif
(
grep
{
$_
eq
$start
}
qw(th td)
) {
$self
->_close(
$current
, {
th
=> 1});
$self
->_close(
$current
, {
td
=> 1});
}
elsif
(
grep
{
$_
eq
$start
}
qw(dt dd)
) {
$self
->_end(
'dt'
,
$current
);
$self
->_end(
'dd'
,
$current
);
}
elsif
(
grep
{
$_
eq
$start
}
qw(rt rp)
) {
$self
->_end(
'rt'
,
$current
);
$self
->_end(
'rp'
,
$current
);
}
}
my
$new
= [
'tag'
,
$start
,
$attrs
,
$$current
];
weaken
$new
->[3];
push
@
$$current
,
$new
;
$$current
=
$new
;
}
sub
_text {
my
(
$self
,
$text
,
$current
) =
@_
;
push
@
$$current
, [
'text'
,
$text
];
}
1;