NAME
Mojo::DOM - Minimalistic XML/HTML5 DOM Parser With CSS3 Selectors
SYNOPSIS
use Mojo::DOM;
# Parse
my $dom = Mojo::DOM->new;
$dom->parse('<div><div id="a">A</div><div id="b">B</div></div>');
# Find
my $b = $dom->at('#b');
print $b->text;
# Iterate
$dom->find('div[id]')->each(sub { print shift->text });
# Loop
for my $e ($dom->find('div[id]')->each) {
print $e->text;
}
# Get the first 10 links
$dom->find('a[href]')
->while(sub { print shift->attrs->{href} && pop() < 10 });
# Search for a link about a specific topic
$dom->find('a[href]')
->until(sub { $_->text =~ m/kraih/ && print $_->attrs->{href} });
DESCRIPTION
Mojo::DOM is a minimalistic and very relaxed XML/HTML5 DOM parser with support for CSS3 selectors. Note that this module is EXPERIMENTAL and might change without warning!
Selectors
All CSS3 selectors that make sense for a standalone parser are supported.
*
-
Any element.
E
-
my $title = $dom->at('title');
An element of type E.
E[foo]
-
my $links = $dom->find('a[href]');
An E element with a foo attribute.
E[foo="bar"]
-
my $fields = $dom->find('input[name="foo"]');
An E element whose foo attribute value is exactly equal to bar.
E[foo~="bar"]
-
my $fields = $dom->find('input[name~="foo"]');
An E element whose foo attribute value is a list of whitespace-separated values, one of which is exactly equal to bar.
E[foo^="bar"]
-
my $fields = $dom->find('input[name^="f"]');
An E element whose foo attribute value begins exactly with the string bar.
E[foo$="bar"]
-
my $fields = $dom->find('input[name$="o"]');
An E element whose foo attribute value ends exactly with the string bar.
E[foo*="bar"]
-
my $fields = $dom->find('input[name*="fo"]');
An E element whose foo attribute value contains the substring bar.
E:root
-
my $root = $dom->at(':root');
An E element, root of the document.
E:checked
-
my $input = $dom->at(':checked');
A user interface element E which is checked (for instance a radio-button or checkbox).
E:empty
-
my $empty = $dom->find(':empty');
An E element that has no children (including text nodes).
E:nth-child(n)
-
my $third = $dom->at('div:nth-child(3)'); my $odd = $dom->find('div:nth-child(odd)'); my $even = $dom->find('div:nth-child(even)'); my $top3 = $dom->find('div:nth-child(-n+3)');
An E element, the n-th child of its parent.
E:nth-last-child(n)
-
my $third = $dom->at('div:nth-last-child(3)'); my $odd = $dom->find('div:nth-last-child(odd)'); my $even = $dom->find('div:nth-last-child(even)'); my $bottom3 = $dom->find('div:nth-last-child(-n+3)');
An E element, the n-th child of its parent, counting from the last one.
E:nth-of-type(n)
-
my $third = $dom->at('div:nth-of-type(3)'); my $odd = $dom->find('div:nth-of-type(odd)'); my $even = $dom->find('div:nth-of-type(even)'); my $top3 = $dom->find('div:nth-of-type(-n+3)');
An E element, the n-th sibling of its type.
E:nth-last-of-type(n)
-
my $third = $dom->at('div:nth-last-of-type(3)'); my $odd = $dom->find('div:nth-last-of-type(odd)'); my $even = $dom->find('div:nth-last-of-type(even)'); my $bottom3 = $dom->find('div:nth-last-of-type(-n+3)');
An E element, the n-th sibling of its type, counting from the last one.
E:first-child
-
my $first = $dom->at('div p:first-child');
An E element, first child of its parent.
E:last-child
-
my $last = $dom->at('div p:last-child');
An E element, last child of its parent.
E:first-of-type
-
my $first = $dom->at('div p:first-of-type');
An E element, first sibling of its type.
E:last-of-type
-
my $last = $dom->at('div p:last-of-type');
An E element, last sibling of its type.
E:only-child
-
my $lonely = $dom->at('div p:only-child');
An E element, only child of its parent.
E:only-of-type
-
my $lonely = $dom->at('div p:only-of-type');
An E element, only sibling of its type.
E:not(s)
-
my $others = $dom->at('div p:not(:first-child)');
An E element that does not match simple selector s.
E F
-
my $headlines = $dom->find('div h1');
An F element descendant of an E element.
E > F
-
my $headlines = $dom->find('html > body > div > h1');
An F element child of an E element.
E + F
-
my $second = $dom->find('h1 + h2');
An F element immediately preceded by an E element.
E ~ F
-
my $second = $dom->find('h1 ~ h2');
An F element preceded by an E element.
E, F, G
-
my $headlines = $dom->find('h1, h2, h3');
Elements of type E, F and G.
E[foo=bar][bar=baz]
-
my $links = $dom->find('a[foo^="b"][foo$="ar"]');
An E element whose attributes match all following attribute selectors.
ATTRIBUTES
Mojo::DOM implements the following attributes.
charset
my $charset = $dom->charset;
$dom = $dom->charset('UTF-8');
Charset used for decoding and encoding XML.
tree
my $array = $dom->tree;
$dom = $dom->tree(['root', ['text', 'lalala']]);
Document Object Model.
METHODS
Mojo::DOM inherits all methods from Mojo::Base and implements the following new ones.
after
$dom = $dom->after('<p>Hi!</p>');
Add after element.
$dom->parse('<div><h1>A</h1></div>')->at('h1')->after('<h2>B</h2>');
all_text
my $text = $dom->all_text;
Extract all text content from DOM structure.
at
my $result = $dom->at('html title');
Find a single element with CSS3 selectors.
attrs
my $attrs = $dom->attrs;
Element attributes.
before
$dom = $dom->before('<p>Hi!</p>');
Add before element.
$dom->parse('<div><h2>A</h2></div>')->at('h2')->before('<h1>B</h1>');
children
my $children = $dom->children;
Children of element.
find
my $results = $dom->find('html title');
Find elements with CSS3 selectors.
print $dom->find('div')->[23]->text;
$dom->find('div')->each(sub { print shift->text });
$dom->find('div')->while(sub { print $_->text && $_->text =~ /foo/ });
$dom->find('div')->until(sub { $_->text =~ /foo/ && print $_->text });
inner_xml
my $xml = $dom->inner_xml;
Render content of this element to XML.
namespace
my $namespace = $dom->namespace;
Element namespace.
parent
my $parent = $dom->parent;
Parent of element.
parse
$dom = $dom->parse('<foo bar="baz">test</foo>');
Parse XML document.
replace
$dom = $dom->replace('<div>test</div>');
Replace elements.
$dom->parse('<div><h1>A</h1></div>')->at('h1')->replace('<h2>B</h2>');
replace_inner
$dom = $dom->replace_inner('test');
Replace element content.
$dom->parse('<div><h1>A</h1></div>')->at('h1')->replace_inner('B');
root
my $root = $dom->root;
Find root element.
text
my $text = $dom->text;
Extract text content from element only, not including child elements.
to_xml
my $xml = $dom->to_xml;
Render DOM to XML.
type
my $type = $dom->type;
$dom = $dom->type('html');
Element type.