package Mojo::DOM; use Mojo::Base -base; use overload '%{}' => sub { shift->attrs }, 'bool' => sub {1}, '""' => sub { shift->to_xml }, fallback => 1; # "Fry: This snow is beautiful. I'm glad global warming never happened. # Leela: Actually, it did. But thank God nuclear winter canceled it out." use Carp 'croak'; use Mojo::Collection; use Mojo::DOM::CSS; use Mojo::DOM::HTML; use Mojo::Util 'squish'; use Scalar::Util qw(blessed weaken); sub AUTOLOAD { my $self = shift; # Method my ($package, $method) = our $AUTOLOAD =~ /^([\w:]+)::(\w+)$/; croak "Undefined subroutine &${package}::$method called" unless blessed $self && $self->isa(__PACKAGE__); # Search children my $children = $self->children($method); return @$children > 1 ? $children : $children->[0] if @$children; croak qq{Can't locate object method "$method" via package "$package"}; } sub DESTROY { } sub new { my $class = shift; my $self = bless [Mojo::DOM::HTML->new], ref $class || $class; return @_ ? $self->parse(@_) : $self; } sub all_text { my ($self, $trim) = @_; my $tree = $self->tree; return _text(_elements($tree), 1, _trim($tree, $trim)); } sub append { shift->_add(1, @_) } sub append_content { my ($self, $new) = @_; my $tree = $self->tree; push @$tree, @{_parent($self->_parse("$new"), $tree)}; return $self; } sub at { shift->find(@_)->[0] } sub attrs { my $self = shift; # Hash my $tree = $self->tree; my $attrs = $tree->[0] eq 'root' ? {} : $tree->[2]; return $attrs unless @_; # Get return $attrs->{$_[0]} // '' unless @_ > 1 || ref $_[0]; # Set %$attrs = (%$attrs, %{ref $_[0] ? $_[0] : {@_}}); return $self; } sub charset { shift->_html(charset => @_) } sub children { my ($self, $type) = @_; # Walk tree my @children; my $charset = $self->charset; my $xml = $self->xml; my $tree = $self->tree; for my $e (@$tree[($tree->[0] eq 'root' ? 1 : 4) .. $#$tree]) { # Make sure child is the right type next unless $e->[0] eq 'tag'; next if defined $type && $e->[1] ne $type; push @children, $self->new->charset($charset)->tree($e)->xml($xml); } return Mojo::Collection->new(@children); } sub content_xml { my $self = shift; # Render children my $tree = $self->tree; my $charset = $self->charset; my $xml = $self->xml; return join '', map { Mojo::DOM::HTML->new(charset => $charset, tree => $_, xml => $xml)->render } @$tree[($tree->[0] eq 'root' ? 1 : 4) .. $#$tree]; } sub find { my ($self, $selector) = @_; # Match selector against tree and upgrade results my $charset = $self->charset; my $xml = $self->xml; return Mojo::Collection->new( map { $self->new->charset($charset)->tree($_)->xml($xml) } @{Mojo::DOM::CSS->new(tree => $self->tree)->select($selector)}); } sub namespace { my $self = shift; # Namespace prefix return '' if (my $current = $self->tree)->[0] eq 'root'; my $ns = $current->[1] =~ /^(.*?):/ ? "xmlns:$1" : undef; # Walk tree while ($current) { last if $current->[0] eq 'root'; # Namespace for prefix my $attrs = $current->[2]; if ($ns) { /^\Q$ns\E$/ and return $attrs->{$_} for keys %$attrs } # Namespace attribute elsif (defined $attrs->{xmlns}) { return $attrs->{xmlns} } # Parent $current = $current->[3]; } return ''; } sub next { shift->_sibling(1) } sub parent { my $self = shift; return undef if (my $tree = $self->tree)->[0] eq 'root'; return $self->new->charset($self->charset)->tree($tree->[3]) ->xml($self->xml); } sub parse { my $self = shift; $self->[0]->parse(@_); return $self; } sub prepend { shift->_add(0, @_) } sub prepend_content { my ($self, $new) = @_; my $tree = $self->tree; splice @$tree, $tree->[0] eq 'root' ? 1 : 4, 0, @{_parent($self->_parse("$new"), $tree)}; return $self; } sub previous { shift->_sibling(0) } sub remove { shift->replace('') } sub replace { my ($self, $new) = @_; # Parse my $tree = $self->tree; if ($tree->[0] eq 'root') { return $self->xml(undef)->parse($new) } else { $new = $self->_parse("$new") } # Find and replace my $parent = $tree->[3]; my $i = $parent->[0] eq 'root' ? 1 : 4; for my $e (@$parent[$i .. $#$parent]) { last if $e == $tree; $i++; } splice @$parent, $i, 1, @{_parent($new, $parent)}; return $self; } sub replace_content { my ($self, $new) = @_; my $tree = $self->tree; splice @$tree, $tree->[0] eq 'root' ? 1 : 4, $#$tree, @{_parent($self->_parse("$new"), $tree)}; return $self; } sub root { my $self = shift; # Find root my $root = $self->tree; while ($root->[0] eq 'tag') { last unless my $parent = $root->[3]; $root = $parent; } return $self->new->charset($self->charset)->tree($root)->xml($self->xml); } sub text { my ($self, $trim) = @_; my $tree = $self->tree; return _text(_elements($tree), 0, _trim($tree, $trim)); } sub text_after { my ($self, $trim) = @_; # Find following text elements return '' if (my $tree = $self->tree)->[0] eq 'root'; my (@elements, $started); for my $e (@{_elements($tree->[3])}) { ++$started and next if $e eq $tree; next unless $started; last if $e->[0] eq 'tag'; push @elements, $e; } return _text(\@elements, 0, _trim($tree->[3], $trim)); } sub text_before { my ($self, $trim) = @_; # Find preceding text elements return '' if (my $tree = $self->tree)->[0] eq 'root'; my @elements; for my $e (@{_elements($tree->[3])}) { last if $e eq $tree; push @elements, $e; @elements = () if $e->[0] eq 'tag'; } return _text(\@elements, 0, _trim($tree->[3], $trim)); } sub to_xml { shift->[0]->render } sub tree { shift->_html(tree => @_) } sub type { my ($self, $type) = @_; # Get return '' if (my $tree = $self->tree)->[0] eq 'root'; return $tree->[1] unless $type; # Set $tree->[1] = $type; return $self; } sub xml { shift->_html(xml => @_) } sub _add { my ($self, $offset, $new) = @_; # Not a tag return $self if (my $tree = $self->tree)->[0] eq 'root'; # Find parent my $parent = $tree->[3]; my $i = $parent->[0] eq 'root' ? 1 : 4; for my $e (@$parent[$i .. $#$parent]) { last if $e == $tree; $i++; } # Add children splice @$parent, $i + $offset, 0, @{_parent($self->_parse("$new"), $parent)}; return $self; } sub _elements { return [] unless my $e = shift; return [@$e[($e->[0] eq 'root' ? 1 : 4) .. $#$e]]; } sub _html { my ($self, $method) = (shift, shift); return $self->[0]->$method unless @_; $self->[0]->$method(@_); return $self; } sub _parent { my ($children, $parent) = @_; # Link parent to children my @new; for my $e (@$children[1 .. $#$children]) { if ($e->[0] eq 'tag') { $e->[3] = $parent; weaken $e->[3]; } push @new, $e; } return \@new; } sub _parse { my $self = shift; Mojo::DOM::HTML->new(charset => $self->charset, xml => $self->xml) ->parse(shift)->tree; } sub _sibling { my ($self, $next) = @_; # Make sure we have a parent return undef unless my $parent = $self->parent; # Find previous or next sibling my ($previous, $current); for my $child ($parent->children->each) { ++$current and next if $child->tree eq $self->tree; return $next ? $child : $previous if $current; $previous = $child; } # No siblings return undef; } sub _text { my ($elements, $recurse, $trim) = @_; # Walk tree my $text = ''; for my $e (@$elements) { my $type = $e->[0]; # Nested tag my $content = ''; if ($type eq 'tag' && $recurse) { $content = _text(_elements($e), 1, _trim($e, $trim)); } # Text elsif ($type eq 'text') { $content = $trim ? squish($e->[1]) : $e->[1] } # CDATA or raw text elsif ($type eq 'cdata' || $type eq 'raw') { $content = $e->[1] } # Add leading whitespace if punctuation allows it $content = " $content" if $text =~ /\S\z/ && $content =~ /^[^.!?,;:\s]+/; # Trim whitespace blocks $text .= $content if $content =~ /\S+/ || !$trim; } return $text; } sub _trim { my ($e, $trim) = @_; # Disabled return 0 unless $e && ($trim = defined $trim ? $trim : 1); # Detect "pre" tag while ($e->[0] eq 'tag') { return 0 if $e->[1] eq 'pre'; last unless $e = $e->[3]; } return 1; } 1; =head1 NAME Mojo::DOM - Minimalistic HTML/XML DOM parser with CSS selectors =head1 SYNOPSIS use Mojo::DOM; # Parse my $dom = Mojo::DOM->new('<div><p id="a">A</p><p id="b">B</p></div>'); # Find say $dom->at('#b')->text; say $dom->find('p')->pluck('text'); # Walk say $dom->div->p->[0]->text; say $dom->div->children('p')->first->{id}; # Iterate $dom->find('p[id]')->each(sub { say shift->{id} }); # Loop for my $e ($dom->find('p[id]')->each) { say $e->text; } # Modify $dom->div->p->[1]->append('<p id="c">C</p>'); # Render say $dom; =head1 DESCRIPTION L<Mojo::DOM> is a minimalistic and relaxed HTML/XML DOM parser with CSS selector support. It will even try to interpret broken XML, so you should not use it for validation. =head1 CASE SENSITIVITY L<Mojo::DOM> defaults to HTML semantics, that means all tags and attributes are lowercased and selectors need to be lower case as well. my $dom = Mojo::DOM->new('<P ID="greeting">Hi!</P>'); say $dom->at('p')->text; say $dom->p->{id}; If XML processing instructions are found, the parser will automatically switch into XML mode and everything becomes case sensitive. my $dom = Mojo::DOM->new('<?xml version="1.0"?><P ID="greeting">Hi!</P>'); say $dom->at('P')->text; say $dom->P->{ID}; XML detection can also be disabled with the C<xml> method. # Force XML semantics $dom->xml(1); # Force HTML semantics $dom->xml(0); =head1 METHODS L<Mojo::DOM> inherits all methods from L<Mojo::Base> and implements the following new ones. =head2 new my $dom = Mojo::DOM->new; my $dom = Mojo::DOM->new('<foo bar="baz">test</foo>'); Construct a new array-based L<Mojo::DOM> object. =head2 all_text my $trimmed = $dom->all_text; my $untrimmed = $dom->all_text(0); Extract all text content from DOM structure, smart whitespace trimming is enabled by default. # "foo bar baz" $dom->parse("<div>foo\n<p>bar</p>baz\n</div>")->div->all_text; # "foo\nbarbaz\n" $dom->parse("<div>foo\n<p>bar</p>baz\n</div>")->div->all_text(0); =head2 append $dom = $dom->append('<p>Hi!</p>'); Append to element. # "<div><h1>A</h1><h2>B</h2></div>" $dom->parse('<div><h1>A</h1></div>')->at('h1')->append('<h2>B</h2>')->root; =head2 append_content $dom = $dom->append_content('<p>Hi!</p>'); Append to element content. # "<div><h1>AB</h1></div>" $dom->parse('<div><h1>A</h1></div>')->at('h1')->append_content('B')->root; =head2 at my $result = $dom->at('html title'); Find a single element with CSS selectors. All selectors from L<Mojo::DOM::CSS> are supported. # Find first element with "svg" namespace definition my $namespace = $dom->at('[xmlns\:svg]')->{'xmlns:svg'}; =head2 attrs my $attrs = $dom->attrs; my $foo = $dom->attrs('foo'); $dom = $dom->attrs({foo => 'bar'}); $dom = $dom->attrs(foo => 'bar'); Element attributes. =head2 charset my $charset = $dom->charset; $dom = $dom->charset('UTF-8'); Charset used for decoding and encoding HTML/XML. =head2 children my $collection = $dom->children; my $collection = $dom->children('div'); Return a L<Mojo::Collection> object containing the children of this element, similar to C<find>. # Show type of random child element say $dom->children->shuffle->first->type; =head2 content_xml my $xml = $dom->content_xml; Render content of this element to XML. Note that the XML will be encoded if a C<charset> has been defined. # "<b>test</b>" $dom->parse('<div><b>test</b></div>')->div->content_xml; =head2 find my $collection = $dom->find('html title'); Find elements with CSS selectors and return a L<Mojo::Collection> object. All selectors from L<Mojo::DOM::CSS> are supported. # Find a specific element and extract information my $id = $dom->find('div')->[23]{id}; # Extract information from multiple elements my @headers = $dom->find('h1, h2, h3')->pluck('text')->each; =head2 namespace my $namespace = $dom->namespace; Find element namespace. # Find namespace for an element with namespace prefix my $namespace = $dom->at('svg > svg\:circle')->namespace; # Find namespace for an element that may or may not have a namespace prefix my $namespace = $dom->at('svg > circle')->namespace; =head2 next my $sibling = $dom->next; Next sibling of element. # "<h2>B</h2>" $dom->parse('<div><h1>A</h1><h2>B</h2></div>')->at('h1')->next; =head2 parent my $parent = $dom->parent; Parent of element. =head2 parse $dom = $dom->parse('<foo bar="baz">test</foo>'); Parse HTML/XML document with L<Mojo::DOM::HTML>. # Parse UTF-8 encoded XML my $dom = Mojo::DOM->new->charset('UTF-8')->xml(1)->parse($xml); =head2 prepend $dom = $dom->prepend('<p>Hi!</p>'); Prepend to element. # "<div><h1>A</h1><h2>B</h2></div>" $dom->parse('<div><h2>B</h2></div>')->at('h2')->prepend('<h1>A</h1>')->root; =head2 prepend_content $dom = $dom->prepend_content('<p>Hi!</p>'); Prepend to element content. # "<div><h2>AB</h2></div>" $dom->parse('<div><h2>B</h2></div>')->at('h2')->prepend_content('A')->root; =head2 previous my $sibling = $dom->previous; Previous sibling of element. # "<h1>A</h1>" $dom->parse('<div><h1>A</h1><h2>B</h2></div>')->at('h2')->previous; =head2 remove my $old = $dom->remove; Remove element. # "<div></div>" $dom->parse('<div><h1>A</h1></div>')->at('h1')->remove->root; =head2 replace my $old = $dom->replace('<div>test</div>'); Replace element. # "<div><h2>B</h2></div>" $dom->parse('<div><h1>A</h1></div>')->at('h1')->replace('<h2>B</h2>')->root; # "<div></div>" $dom->parse('<div><h1>A</h1></div>')->at('h1')->replace('')->root; =head2 replace_content $dom = $dom->replace_content('test'); Replace element content. # "<div><h1>B</h1></div>" $dom->parse('<div><h1>A</h1></div>')->at('h1')->replace_content('B')->root; # "<div><h1></h1></div>" $dom->parse('<div><h1>A</h1></div>')->at('h1')->replace_content('')->root; =head2 root my $root = $dom->root; Find root node. =head2 text my $trimmed = $dom->text; my $untrimmed = $dom->text(0); Extract text content from element only (not including child elements), smart whitespace trimming is enabled by default. # "foo baz" $dom->parse("<div>foo\n<p>bar</p>baz\n</div>")->div->text; # "foo\nbaz\n" $dom->parse("<div>foo\n<p>bar</p>baz\n</div>")->div->text(0); =head2 text_after my $trimmed = $dom->text_after; my $untrimmed = $dom->text_after(0); Extract text content immediately following element, smart whitespace trimming is enabled by default. # "baz" $dom->parse("<div>foo\n<p>bar</p>baz\n</div>")->div->p->text_after; # "baz\n" $dom->parse("<div>foo\n<p>bar</p>baz\n</div>")->div->p->text_after(0); =head2 text_before my $trimmed = $dom->text_before; my $untrimmed = $dom->text_before(0); Extract text content immediately preceding element, smart whitespace trimming is enabled by default. # "foo" $dom->parse("<div>foo\n<p>bar</p>baz\n</div>")->div->p->text_before; # "foo\n" $dom->parse("<div>foo\n<p>bar</p>baz\n</div>")->div->p->text_before(0); =head2 to_xml my $xml = $dom->to_xml; my $xml = "$dom"; Render this element and its content to XML. Note that the XML will be encoded if a C<charset> has been defined. # "<b>test</b>" $dom->parse('<div><b>test</b></div>')->div->b->to_xml; =head2 tree my $tree = $dom->tree; $dom = $dom->tree(['root', [qw(text lalala)]]); Document Object Model. =head2 type my $type = $dom->type; $dom = $dom->type('div'); Element type. # List types of child elements say $dom->children->pluck('type'); =head2 xml my $xml = $dom->xml; $dom = $dom->xml(1); Disable HTML semantics in parser and activate case sensitivity, defaults to auto detection based on processing instructions. =head1 CHILD ELEMENTS In addition to the methods above, many child elements are also automatically available as object methods, which return a L<Mojo::DOM> or L<Mojo::Collection> object, depending on number of children. say $dom->p->text; say $dom->div->[23]->text; say $dom->div->pluck('text'); =head1 ELEMENT ATTRIBUTES Direct hash reference access to element attributes is also possible. say $dom->{foo}; say $dom->div->{id}; =head1 SEE ALSO L<Mojolicious>, L<Mojolicious::Guides>, L<http://mojolicio.us>. =cut