$VERSION
=
'0.14'
;
my
%CHAR2DEFAULT_ENT
= (
'&'
=>
'&'
,
'<'
=>
'<'
,
'>'
=>
'>'
,
'"'
=>
'"'
);
my
%NUM2DEFAULT_ENT
= (
'38'
=>
'amp'
,
'60'
=>
'lt'
,
'62'
=>
'gt'
,
'"'
=>
'"'
);
use
base(
'HTML::TreeBuilder'
);
sub
isElementNode { 0 }
sub
isAttributeNode { 0 }
sub
isNamespaceNode { 0 }
sub
isTextNode { 0 }
sub
isProcessingInstructionNode { 0 }
sub
isPINode { 0 }
sub
isCommentNode { 0 }
sub
getChildNodes {
return
wantarray
? () : []; }
sub
getFirstChild {
return
undef
; }
sub
getLastChild {
return
undef
; }
sub
getElementById
{
my
(
$self
,
$id
) =
@_
;
return
scalar
$self
->look_down(
id
=>
$id
);
}
sub
to_number {
return
XML::XPathEngine::Number->new(
shift
->getValue); }
sub
cmp
{
my
(
$a
,
$b
)=
@_
;
if
(
$b
->isa(
'HTML::TreeBuilder::XPath::Root'
) ) {
return
-1; }
return
0
if
(
$a
==
$b
);
return
1
if
(
$a
->is_inside(
$b
));
return
-1
if
(
$b
->is_inside(
$a
));
my
@a_pile
= (
$a
,
$a
->lineage);
my
@b_pile
= (
$b
,
$b
->lineage);
unless
(
$a_pile
[-1] ==
$b_pile
[-1])
{
warn
"2 nodes not in the same pile: "
,
ref
(
$a
),
" - "
,
ref
(
$b
),
"\n"
;
print
"a: "
,
$a
->string_value,
"\nb: "
,
$b
->string_value,
"\n"
;
return
undef
;
}
my
$a_anc
=
pop
@a_pile
;
my
$b_anc
=
pop
@b_pile
;
while
(
$a_anc
==
$b_anc
)
{
$a_anc
=
pop
@a_pile
;
$b_anc
=
pop
@b_pile
;
}
if
(
defined
(
$a_anc
->{_rank}) &&
defined
(
$b_anc
->{_rank}))
{
return
$a_anc
->{_rank} <=>
$b_anc
->{_rank}; }
else
{
my
(
$a_prev
,
$a_next
,
$b_prev
,
$b_next
)= (
$a_anc
,
$a_anc
,
$b_anc
,
$b_anc
);
while
()
{
$a_prev
=
$a_prev
->getPreviousSibling ||
return
-1;
return
1
if
(
$a_prev
==
$b_anc
);
$a_next
=
$a_next
->getNextSibling ||
return
1;
return
-1
if
(
$a_next
==
$b_anc
);
$b_prev
=
$b_prev
->getPreviousSibling ||
return
1;
return
-1
if
(
$b_prev
==
$a_next
);
$b_next
=
$b_next
->getNextSibling ||
return
-1;
return
1
if
(
$b_next
==
$a_prev
);
}
}
}
push
@ISA
,
'HTML::TreeBuilder::XPath::Node'
;
{
my
$xp
;
sub
xp
{
$xp
||=XML::XPathEngine->new();
return
$xp
;
}
}
sub
findnodes {
my
(
$elt
,
$path
)=
@_
;
return
xp->findnodes(
$path
,
$elt
); }
sub
findnodes_as_string {
my
(
$elt
,
$path
)=
@_
;
return
xp->findnodes_as_string(
$path
,
$elt
); }
sub
findnodes_as_strings {
my
(
$elt
,
$path
)=
@_
;
return
xp->findnodes_as_strings(
$path
,
$elt
); }
sub
findvalue {
my
(
$elt
,
$path
)=
@_
;
return
xp->findvalue(
$path
,
$elt
); }
sub
findvalues {
my
(
$elt
,
$path
)=
@_
;
return
xp->findvalues(
$path
,
$elt
); }
sub
exists
{
my
(
$elt
,
$path
)=
@_
;
return
xp->
exists
(
$path
,
$elt
); }
sub
find_xpath {
my
(
$elt
,
$path
)=
@_
;
return
xp->find(
$path
,
$elt
); }
sub
matches {
my
(
$elt
,
$path
)=
@_
;
return
xp->matches(
$elt
,
$path
,
$elt
); }
sub
set_namespace {
my
$elt
=
shift
; xp->new->set_namespace(
@_
); }
sub
getRootNode
{
my
$elt
=
shift
;
return
$elt
->root->getParentNode;
}
sub
getParentNode
{
my
$elt
=
shift
;
return
$elt
->{_parent} ||
bless
{
_root
=>
$elt
},
'HTML::TreeBuilder::XPath::Root'
;
}
sub
getName {
return
shift
->tag; }
sub
getLocalName { (
my
$name
=
$_
[0]->tag) =~ s{^.*:}{};
$name
; }
sub
getNextSibling {
my
(
$elt
)=
@_
;
my
$parent
=
$elt
->{_parent} ||
return
undef
;
return
$parent
->_child_as_object(
scalar
$elt
->right, (
$elt
->{_rank} || 0) + 1);
}
sub
getPreviousSibling {
my
(
$elt
)=
@_
;
my
$parent
=
$elt
->{_parent} ||
return
undef
;
return
undef
unless
$elt
->{_rank};
return
$parent
->_child_as_object(
scalar
$elt
->left,
$elt
->{_rank} - 1);
}
sub
isElementNode {
return
ref
$_
[0] && (
$_
[0]->{_tag}!~ m{^~}) ? 1 : 0; }
sub
isCommentNode {
return
ref
$_
[0] && (
$_
[0]->{_tag} eq
'~comment'
) ? 1 : 0; }
sub
isProcessingInstructionNode {
return
ref
$_
[0] && (
$_
[0]->{_tag} eq
'~pi'
) ? 1 : 0; }
sub
isTextNode {
return
ref
$_
[0] ? 0 : 1; }
sub
getValue
{
my
$elt
=
shift
;
if
(
$elt
->isCommentNode) {
return
$elt
->{text}; }
return
$elt
->as_text;
}
sub
getChildNodes
{
my
$parent
=
shift
;
my
$rank
=0;
my
@children
=
map
{
$parent
->_child_as_object(
$_
,
$rank
++) }
$parent
->content_list;
return
wantarray
?
@children
: \
@children
;
}
sub
getFirstChild
{
my
$parent
=
shift
;
my
@content
=
$parent
->content_list;
if
(
@content
)
{
return
$parent
->_child_as_object(
$content
[0], 0); }
else
{
return
undef
; }
}
sub
getLastChild
{
my
$parent
=
shift
;
my
@content
=
$parent
->content_list;
if
(
@content
)
{
return
$parent
->_child_as_object(
$content
[-1],
$#content
); }
else
{
return
undef
; }
}
sub
getAttributes
{
my
$elt
=
shift
;
my
%atts
=
$elt
->all_external_attr;
my
$rank
=0;
my
@atts
=
map
{
bless
( {
_name
=>
$_
,
_value
=>
$atts
{
$_
},
_elt
=>
$elt
,
_rank
=>
$rank
++,
},
'HTML::TreeBuilder::XPath::Attribute'
)
}
sort
keys
%atts
;
return
wantarray
?
@atts
: \
@atts
;
}
sub
to_number {
return
XML::XPathEngine::Number->new(
$_
[0]->as_text); }
sub
string_value
{
my
$elt
=
shift
;
if
(
$elt
->isCommentNode) {
return
$elt
->{text}; }
return
$elt
->as_text;
};
sub
_child_as_object
{
my
(
$elt
,
$elt_or_text
,
$rank
)=
@_
;
return
undef
unless
(
defined
$elt_or_text
);
if
( !
ref
$elt_or_text
)
{
$elt_or_text
=
bless
{
_content
=>
$elt_or_text
,
_parent
=>
$elt
, },
'HTML::TreeBuilder::XPath::TextNode'
;
Scalar::Util::weaken(
$elt_or_text
->{_parent});
}
if
(
ref
$rank
) {
warn
"rank is a "
,
ref
(
$rank
),
" elt_or_text is a "
,
ref
(
$elt_or_text
); }
$elt_or_text
->{_rank}=
$rank
;
return
$elt_or_text
;
}
sub
toString {
return
shift
->as_XML(
@_
); }
{
no
warnings
'redefine'
;
sub
as_XML_compact
{
my
(
$node
,
$opt
)=
@_
;
my
$name
=
$node
->{
'_tag'
};
if
(
$name
eq
'~literal'
) {
return
_xml_escape_text(
$node
->{text}); }
if
(
$name
eq
'~declaration'
) {
return
'<!'
. _xml_escape_text(
$node
->{text}) .
">"
; }
if
(
$name
eq
'~pi'
) {
return
'<?'
. _xml_escape_text(
$node
->{text}) .
'?>'
; }
if
(
$name
eq
'~comment'
) {
return
'<!--'
. _xml_escape_comment(
$node
->{text}) .
'-->'
; }
my
$lc_name
=
lc
$name
;
my
$xml
=
$node
->_start_tag;
if
(
$HTML::Tagset::isCDATA_Parent
{
$lc_name
})
{
my
$content
=
$node
->{_content} ||
''
;
if
(
ref
$content
eq
'ARRAY'
||
$content
->isa(
'ARRAY'
))
{
$xml
.= _xml_escape_cdata(
join
(
''
,
@$content
),
$opt
); }
else
{
$xml
.=
$content
; }
}
else
{
foreach
my
$child
(
$node
->content_list)
{
if
(
ref
$child
) {
$xml
.=
$child
->as_XML_compact(); }
else
{
$xml
.= _xml_escape_text(
$child
); }
}
}
$xml
.=
"</$name>"
unless
$HTML::Tagset::emptyElement
{
$lc_name
};
return
$xml
;
}
}
{
my
%phrase_name
;
my
%extra_newline
;
my
$default_indent
;
BEGIN
{
%phrase_name
=
%HTML::Tagset::isPhraseMarkup
;
$phrase_name
{
'~literal'
}= 1;
$default_indent
=
' '
;
%extra_newline
=
map
{
$_
=> 1 }
qw(html head body script div table tbody thead tfoot tr form dl ol ul)
;
}
sub
as_XML_indented
{
my
(
$node
,
$opt
)=
@_
;
my
$name
=
$node
->{
'_tag'
};
my
$lc_name
=
lc
$name
;
if
(
$name
eq
'~literal'
) {
return
_xml_escape_text(
$node
->{text}); }
if
(
$name
eq
'~declaration'
) {
return
'<!'
. _xml_escape_text(
$node
->{text}) .
">\n"
; }
if
(
$name
eq
'~pi'
) {
return
'<?'
. _xml_escape_text(
$node
->{text}) .
'?>'
; }
if
(
$name
eq
'~comment'
) {
return
'<!--'
. _xml_escape_comment(
$node
->{text}) .
'-->'
; }
my
$xml
;
my
$pre_tag_indent
=
''
;
if
(!
$phrase_name
{
$lc_name
}) {
$pre_tag_indent
=
"\n"
. (
$opt
->{indent} ||
$default_indent
) x (
$opt
->{indent_level}||0); }
if
(
$opt
->{indent_level}) {
$xml
.=
$pre_tag_indent
; }
$xml
.=
$node
->_start_tag();
my
$content
=
''
;
if
(
$HTML::Tagset::isCDATA_Parent
{
$lc_name
})
{
my
$content
=
$node
->{_content} ||
''
;
if
(
ref
$content
&& (
ref
$content
eq
'ARRAY'
||
$content
->isa(
'ARRAY'
) ))
{
$content
= _xml_escape_cdata(
join
(
''
,
@$content
),
$opt
); }
}
else
{
my
%child_opt
=
%$opt
;
$child_opt
{indent_level}++;
foreach
my
$child
(
$node
->content_list)
{
if
(
ref
$child
) {
$content
.=
$child
->as_XML_indented( \
%child_opt
); }
else
{
$content
.= _xml_escape_text(
$child
); }
}
}
$xml
.=
$content
;
if
(
$extra_newline
{
$lc_name
} &&
$content
ne
''
) {
$xml
.=
$pre_tag_indent
; }
$xml
.=
"</$name>"
unless
$HTML::Tagset::emptyElement
{
$lc_name
};
$xml
.=
"\n"
if
( !
$opt
->{indent_level});
return
$xml
;
}
}
sub
_start_tag
{
my
(
$node
)=
@_
;
my
$name
=
$node
->{
'_tag'
};
my
$start_tag
.=
"<$name"
;
foreach
my
$att_name
(
sort
keys
%$node
)
{
next
if
( (!
length
$att_name
) || (
$att_name
=~ m{^_}) || (
$att_name
eq
'/'
) );
my
$well_formed_att_name
= well_formed_name(
$att_name
);
$start_tag
.=
qq{ $well_formed_att_name="}
. _xml_escape_attribute_value(
$node
->{
$att_name
}) .
qq{"}
;
}
$start_tag
.=
$HTML::Tagset::emptyElement
{
lc
$name
} ?
" />"
:
">"
;
return
$start_tag
;
}
sub
well_formed_name
{
my
(
$name
)=
@_
;
$name
=~ s{[^\w:_-]+}{_}g;
if
(
$name
=~ m{^\d}) {
$name
=
"a$name"
; }
return
$name
;
}
sub
_indent_level
{
my
(
$node
)=
@_
;
my
$level
=
scalar
grep
{ !
$HTML::Tagset::isPhraseMarkup
{
lc
$_
->{_tag}} }
$node
->lineage;
return
$level
;
}
{
my
(
$indent
,
%extra_newline
,
$nl
);
BEGIN
{
$indent
=
' '
;
$nl
=
"\n"
;
%extra_newline
=
map
{
$_
=> 1 }
qw(html head body script div table tr form ol ul)
;
}
sub
indents
{
my
(
$opt
,
$name
)=
@_
;
my
$indents
= {
pre_start_tag
=>
''
,
post_start_tag
=>
''
,
pre_end_tag
=>
''
,
post_end_tag
=>
''
};
if
(
$opt
->{indented})
{
my
$indent_level
=
$opt
->{indent_level};
my
$wrapping_nl
=
$nl
;
if
( !
defined
(
$indent_level
)) {
$indent_level
= 0;
$wrapping_nl
=
''
; }
if
(
$HTML::Tagset::isKnown
{
lc
$name
} && !
$HTML::Tagset::isPhraseMarkup
{
lc
$name
} &&
$indent_level
> 0)
{
$indents
->{pre_start_tag}=
$wrapping_nl
. (
$indent
x
$indent_level
); }
if
(
$extra_newline
{
lc
$name
})
{
$indents
->{post_start_tag}=
$nl
;
$indents
->{pre_end_tag}=
$nl
. (
$indent
x
$indent_level
);
}
if
(
$indent_level
== 0)
{
$indents
->{post_end_tag} =
$wrapping_nl
; }
}
return
$indents
;
}
}
sub
_xml_escape_attribute_value
{
my
(
$text
)=
@_
;
$text
=~ s{([&<>"])}{
$CHAR2DEFAULT_ENT
{$1}}g;
return
$text
;
}
sub
_xml_escape_text
{
my
(
$text
)=
@_
;
$text
=~ s{([&<>])}{
$CHAR2DEFAULT_ENT
{$1}}g;
return
$text
;
}
sub
_xml_escape_comment
{
my
(
$text
)=
@_
;
$text
=~ s{([&<>])}{
$CHAR2DEFAULT_ENT
{$1}}g;
$text
=~ s{--}{-&
return
$text
;
}
sub
_xml_escape_cdata
{
my
(
$text
,
$opt
)=
@_
;
if
(
$opt
->{force_escape_cdata} ||
$text
=~ m{[<&]})
{
$text
=~ s{^\s*\Q<![CDATA[}{}s;
$text
=~ s{\Q]]>\E\s*$}{}s;
$text
=~ s{]]>}{]]&
$text
=
"<![CDATA[$text]]>"
;
}
return
$text
;
}
sub
getParentNode {
return
shift
->{_parent}; }
sub
getValue {
return
shift
->{_content}; }
sub
isTextNode {
return
1; }
sub
getAttributes {
return
wantarray
? () : []; }
sub
as_XML
{
my
(
$node
,
$entities
)=
@_
;
my
$content
=
$node
->{_content};
if
(
$node
->{_parent} &&
$node
->{_parent}->{_tag} eq
'script'
)
{
$content
=~ s{(&\w+;)}{HTML::Entities::decode($1)}eg; }
else
{
$content
= HTML::Element::_xml_escape_text(
$content
); }
return
$content
;
}
*as_XML_compact
=
*as_XML
;
*as_XML_indented
=
*as_XML
;
sub
getPreviousSibling
{
my
$self
=
shift
;
my
$rank
=
$self
->{_rank};
my
$parent
=
$self
->{_parent};
return
$rank
?
$parent
->_child_as_object(
$parent
->{_content}->[
$rank
-1],
$rank
-1) :
undef
;
}
sub
getNextSibling
{
my
$self
=
shift
;
my
$rank
=
$self
->{_rank};
my
$parent
=
$self
->{_parent};
my
$next_sibling
=
$parent
->{_content}->[
$rank
+1];
return
defined
(
$next_sibling
) ?
$parent
->_child_as_object(
$next_sibling
,
$rank
+1) :
undef
;
}
sub
getRootNode
{
return
shift
->{_parent}->getRootNode; }
sub
string_value {
return
shift
->{_content}; }
sub
lineage
{
my
(
$node
)=
@_
;
my
$parent
=
$node
->{_parent};
return
(
$parent
,
$parent
->lineage);
}
sub
is_inside
{
my
(
$text
,
$node
)=
@_
;
return
$text
->{_parent}->is_inside(
$node
);
}
1;
sub
getParentNode {
return
$_
[0]->{_elt}; }
sub
getValue {
return
$_
[0]->{_value}; }
sub
getName {
return
$_
[0]->{_name} ; }
sub
getLocalName { (
my
$name
=
$_
[0]->{_name}) =~ s{^.*:}{};
$name
; }
sub
string_value {
return
$_
[0]->{_value}; }
sub
to_number {
return
XML::XPathEngine::Number->new(
$_
[0]->{_value}); }
sub
isAttributeNode { 1 }
sub
toString {
return
qq{ $_[0]->{_name}
=
"$_[0]->{_value}"
}; }
sub
getAttributes {
return
$_
[0]->{_elt}->getAttributes; }
sub
getPreviousSibling
{
my
$self
=
shift
;
my
$rank
=
$self
->{_rank};
return
undef
unless
$rank
;
my
%atts
=
$self
->{_elt}->all_external_attr;
my
$previous_att_name
= (
sort
keys
%atts
)[
$rank
-1];
return
bless
( {
_name
=>
$previous_att_name
,
_value
=>
$atts
{
$previous_att_name
},
_elt
=>
$self
->{_elt},
_rank
=>
$rank
-1,
},
'HTML::TreeBuilder::XPath::Attribute'
);
}
sub
getNextSibling
{
my
$self
=
shift
;
my
$rank
=
$self
->{_rank};
my
%atts
=
$self
->{_elt}->all_external_attr;
my
$next_att_name
= (
sort
keys
%atts
)[
$rank
+1] ||
return
undef
;
return
bless
( {
_name
=>
$next_att_name
,
_value
=>
$atts
{
$next_att_name
},
_elt
=>
$self
->{_elt},
_rank
=>
$rank
+1,
},
'HTML::TreeBuilder::XPath::Attribute'
);
}
sub
lineage
{
my
(
$att
)=
@_
;
my
$elt
=
$att
->{_elt};
return
(
$elt
,
$elt
->lineage);
}
sub
is_inside
{
my
(
$att
,
$node
)=
@_
;
return
(
$att
->{_elt} ==
$node
) ||
$att
->{_elt}->is_inside(
$node
);
}
1;
sub
getParentNode {
return
(); }
sub
getChildNodes {
my
@content
= (
$_
[0]->{_root});
return
wantarray
?
@content
: \
@content
; }
sub
getAttributes {
return
[] }
sub
isDocumentNode {
return
1 }
sub
getRootNode {
return
$_
[0] }
sub
getName {
return
}
sub
getNextSibling {
return
}
sub
getPreviousSibling {
return
}
sub
lineage {
return
(
$_
[0]); }
sub
is_inside {
return
0; }
sub
cmp {
return
$_
[1]->isa(
' HTML::TreeBuilder::XPath::Root'
) ? 0 : 1; }
1;