The Perl and Raku Conference 2025: Greenville, South Carolina - June 27-29 Learn more

#!/usr/bin/perl -T
# This script tests HTML::DOM features that are not part of the DOM inter-
# faces.
# See css.t for css_url_fetcher.
# ~~~ I need a test that makes sure HTML::TreeBuilder doesn’t spit out
# warnings because of hash deref overloading.
use strict; use warnings; use utf8; use lib 't';
use Test::More tests => reverse 54;
# -------------------------#
# Test 1: load the module
BEGIN { use_ok 'HTML::DOM'; }
# -------------------------#
# Tests 2-3: constructor
my $doc = new HTML::DOM;
isa_ok $doc, 'HTML::DOM';
# weaken_response
{
require HTTP::Response;
my $res = new HTTP::Response;
my $doc = new HTML::DOM response => $res, weaken_response => 1;
require Scalar'Util;
Scalar'Util'weaken $res;
is $res, undef, 'weaken_response';
}
# -------------------------#
# Tests 4-24: elem_handler, parse, eof and write
# It is important that this
# 18 May, 2010: I’ve just discovered the previous line, which I apparently
# wrote five months ago; but I have no idea what it was going to say.
$doc->elem_handler(script => sub {
eval($_[1]->firstChild->data);
$@ and die;
});
$doc->write(<<'-----');
<body><p>Para 1
<p>Para 2
<script type='application/x-perl'>
$doc->write('<p>Para ' . ($doc->body->childNodes->length+1))
</script>
<p>Para 4
<p>Para 5
<script type='application/x-perl'>
$doc->write('<p>Para ' . ($doc->body->childNodes->length+1))
</script>
</body>
-----
$doc->close;
{
no warnings 'deprecated';
local $[ = 1;
use warnings 'deprecated';
my @p_tags = $doc->body->childNodes;
for(1..6){
is $p_tags[$_]->tagName, 'P',
"body\'s child node no. $_ is a P elem";
isa_ok $p_tags[$_]->firstChild, 'HTML::DOM::Text',
"first child of para $_";
like $p_tags[$_]->firstChild->data, qr/Para $_\b/,
"contents of para $_";
}
}
{
my $script = $doc->createElement('script');
$script->appendChild($doc->createTextNode('$doc->title("scred")'));
$doc->body->appendChild($script);
is $doc->title, 'scred', "elem_handlers are triggered on node insertion";
}
{
# Test that elements are accessible as soon as they are written (i.e.,
# that write is not actually buffered, even though we call it that). This
# was fixed in 0.040.
$doc->write(<<' -----');
<script>
# This is based on a horrid piece of code at
$doc->write("<img id='img1' height='1' width='1'>");
$doc->getElementById("img1")->src(
);
</script>
-----
$doc->close;
is $doc->find('img')->src,
'so-called buffered write is not actually buffered'
}
{
# Test that we don’t get errors when the document’s root element is
# detached inside an elem handler just before a write. Fixed in 0.051
my$ h = new HTML::DOM;
$h->elem_handler(script => sub {
$h->removeChild($h->firstChild); $h->write("foo") }
);
ok eval { $h->write("<script></script>"); 1 },
'no error from writing inside an elem handler when there is no doc root';
}
# -------------------------#
# Tests 25-37: parse_file & charset
use File::Spec::Functions 'catfile';
is $doc->charset, undef, 'undefined charset';
ok +($doc = new HTML::DOM) # clobber the existing one
->parse_file(catfile(dirname ($0),'test.html')),
'parse_file returns true';
is $doc->charset, 'utf-8', 'charset';
sub traverse($) {
my $thing = shift;
[ map {
nodeName $_,
{
defined(attributes $_)
? do {
my $attrs = attributes $_;
map +($attrs->item($_)->name,
$attrs->item($_)->value),
0..$attrs->length-1;
} :() ,
$_->isa('HTML::DOM::CharacterData') ?
(data => data $_) :() ,
hasChildNodes $_ ? (children => &traverse($_)) :()
}
} childNodes $thing ]
}
is_deeply traverse $doc, [
HTML => {
children => [
HEAD => {
children => [
META => {
'http-equiv' => 'Content-Type',
content => 'text/html; charset=utf-8',
}
],
},
BODY => {
children => [
P => {
children => [
'#text' => {
data => 'Para 1',
},
],
},
P => {
id => 'aoeu',
children => [
'#text' => {
data => 'Para ',
},
B => {
children => [
'#text' => {
data => '2',
},
],
},
],
},
P => {
children => [
'#text' => {
data => 'Para ',
},
I => {
children => [
'#text' => {
data => '3',
},
],
},
],
},
P => {
children => [
'#text' => {
data => 'Para ',
},
SPAN => {
class => 'ssalc',
children => [
'#text' => {
data => '4',
},
],
},
'#text' => {
data => '‼',
},
'#text' => {
data => "\n", # the line break after </html>
},
],
}
],
},
],
},
], 'parse_file';
ok !(new HTML::DOM)
->parse_file(catfile(dirname ($0),'I know this file does not exist.')),
'parse_file can return false';
($doc = new HTML::DOM charset => 'x-mac-roman') # clobber the existing one
->parse_file(catfile(dirname ($0),'test.html'));
like $doc->getElementsByTagName('p')->[-1]->as_text, qr/‼/,
'parse_file respects existing charset';
# Pathological cases (fixed in 0.036):
{
my $filename = catfile(dirname ($0),'test.html');
$doc->open; $doc->close;
ok eval { $doc->parse_file($filename); 1 },
'parse_file does not die when close has been called' or diag $@;
like $doc->innerHTML, qr/ssalc/, 'parse_file works after close';
$doc->open;
$doc->removeChild($doc->documentElement);
ok eval { $doc->parse_file($filename); 1 },
'parse_file does not die when the root element has been removed';
like $doc->innerHTML, qr/ssalc/,
'parse_file works after the removal of the root';
}
$doc = new HTML::DOM charset => 'iso-8859-1';
is $doc->charset, 'iso-8859-1', 'charset in constructor';
is $doc->charset('utf-16be'), 'iso-8859-1', 'charset get/set';
is $doc->charset, 'utf-16be', 'get charset after set';
# -------------------------#
# Test 38: another elem_handler test with nested <script> elems
# This was causing infinite recursion before version 0.004.
{
my $counter; my $doc = new HTML::DOM;
$doc->elem_handler(script => sub {
++$counter == 3 and die; # avoid infinite recursion
(my $data = $_[1]->firstChild->data) =~ s/\\\//\//g;
$doc->write($data);
$@ and die;
});
eval {
$doc->write(<<' -----');
<script>
<script>stuff<\/script>
</script>
-----
$doc->close;
};
is $counter,2, 'nested <script> elems';
}
# -------------------------#
# Test 39: Yet another elem_handler test, this time with '*' for the tag.
# I broke this in 0.009 and fixed it in 0.010.
{
my $counter; my $doc = new HTML::DOM;
$doc->elem_handler('*' => sub {
++$counter;
});
$doc->write('<p><b><i></i></b></p>');
is $counter,3, 'elem_handler(*)';
}
# -------------------------#
# Tests 40-43: event_parent
{
my $doc = new HTML::DOM;
my $thing = bless[];
is $doc->event_parent, undef, 'event_parent is initially undef';
is $doc->event_parent($thing), undef,
'event parent returns undef when setting the first time';;
is $doc->event_parent, $thing,, 'and setting it actually worked';
require Scalar'Util;
Scalar'Util'weaken($thing);
is $thing, undef, 'event_parent holds a weak reference';
}
# -------------------------#
# Tests 44-51: base
{
my $doc = new HTML::DOM url => 'file:///';
$doc->open, $doc->close;
is $doc->base, 'file:///', '->base with no <base>';
$doc->find('head')->innerHTML('<base href="file:///Volumes/">');
is $doc->base, 'file:///Volumes/', '->base from <base>';
$doc->find('base')->getAttributeNode('href'); # autoviv the attr node
ok !ref $doc->base, 'base returns a plain scalar';
require HTTP'Response;
$doc = new HTML::DOM response => new HTTP'Response 200, OK => [
content_type => 'text/html',
], '';
'base from response object';
$doc->innerHTML("<base target=_blank><base href='http://rext/'>");
is $doc->base, "http://rext/",
'retval of base when <base target> comes before <base href>';
"z" =~ /z/; # (test for weird bug introduced in 0.033 & fixed in 0.034)
is $doc->base, "http://rext/",
'base after regexp match that does not match the base';
# base with data URLs
$doc = new HTML::DOM url => "data:text/html,blahblahblah";
is $doc->base, "data:text/html,blahblahblah", 'base with data URL';
# This one was fixed in 0.057:
my $r = new HTTP'Response 200, OK => [content_type=>'text/html'], '';
require HTTP'Request;
request $r new HTTP'Request GET => 'data:text/html,blooblah';
$doc = new HTML::DOM response => $r;
is $doc->base, 'data:text/html,blooblah';
}
# -------------------------#
# Test 52-4: Yet another elem_handler test, for when elem_handlers orphan
# the <html> element. This also makes sure elem_handers are
# called even without closing tags. (Both were fixed 0.036.)
{
my $doc = new HTML::DOM;
my $accum = '';
my $doc_elem;
$doc->elem_handler('p' => sub {
$accum .= 'p';
my $doc = shift;
if(my $de = $doc->documentElement) {
$doc_elem = $de;
$doc->removeChild($de)
}
});
ok eval { $doc->write('<p><p>'); $doc->close; 1; },
'orphaning the doc elem does not stop parsing';
is $accum,'pp', 'implicit closing tags trigger elem_handler';
is $doc_elem->innerHTML,'<head></head><body><p></p><p></p></body>',
'parsing continues in the orphaned element';
}