# $Id: LibXML.pm 12 2005-06-21 09:43:03Z daisuke $ # # Daisuke Maki <dmaki@cpan.org> # All rights reserved. package XML::RSS::LibXML; use strict; our $VERSION = '0.03'; use XML::LibXML; use XML::LibXML::XPathContext; my %namespaces = ( rdf => "http://www.w3.org/1999/02/22-rdf-syntax-ns#", dc => "http://purl.org/dc/elements/1.1/", sy => "http://purl.org/rss/1.0/modules/syndication/", admin => "http://webns.net/mvcb/", content => "http://purl.org/rss/1.0/modules/content/", cc => "http://web.resource.org/cc/", taxo => "http://purl.org/rss/1.0/modules/taxonomy/", rss20 => "http://backend.userland.com/rss2", # really a dummy rss10 => "http://purl.org/rss/1.0/", rss09 => "http://my.netscape.com/rdf/simple/0.9/", ); my %VersionPrefix = ( '2.0' => 'rss20', '1.0' => 'rss10', '0.9' => 'rss09', ); sub new { my $class = shift; my %args = @_; my $p = XML::LibXML->new(); $p->recover(1); my $c = XML::LibXML::XPathContext->new(); my $self = bless { _parser => $p, _context => $c, _namespaces => {} }, $class; $self->_init(); return $self; } sub _init { my $self = shift; # Register namespaces. while (my($prefix, $uri) = each %namespaces) { $self->add_module(prefix => $prefix, uri => $uri); } } sub add_module { my $self = shift; my %args = @_; $self->{_context}->registerNs($args{prefix}, $args{uri}); $self->{_namespaces}->{$args{prefix}} = $args{uri}; } sub parse { my $self = shift; my $p = $self->{_parser}; my $dom = $p->parse_string(@_); $self->{_dom} = $dom; $self->_parse_dom; return $self; } sub parsefile { my $self = shift; my $p = $self->{_parser}; my $dom = $p->parse_file(@_); $self->{_dom} = $dom; $self->_parse_dom; } sub as_string { my $self = shift; return $self->{_dom} ? $self->{_dom}->toString(1) : undef; } sub _elem { $_[0]->{$_[1]} } sub channel { my $self = shift; return $_[0] ? $self->_elem('channel')->{$_[0]} : $self->_elem('channel'); } sub items { shift->_elem('items') } sub _parse_dom { my $self = shift; $self->{_internal}->{version} = $self->_guess_version(); $self->{channel} = $self->_parse_channel; $self->{items} = $self->_parse_items; } sub _xpath_context { my $self = shift; my $xc = XML::LibXML::XPathContext->new(); while (my($prefix, $namespace) = each %{$self->{_namespaces}}) { $xc->registerNs($prefix, $namespace); } return $xc; } sub _guess_version { my $self = shift; $self->{_dom} or die; my $dom = $self->{_dom}; my $xc = $self->_xpath_context(); # Test starting from the most likely candidate if ($xc->findnodes('/rdf:RDF', $dom)) { # 1.0 or 0.9 if ($xc->findnodes('/rdf:RDF/rss10:channel', $dom)) { return '1.0'; } else { return '0.9'; } } elsif ($xc->findnodes('/rss', $dom)) { # 0.91 or 2.0 -ish return $xc->findvalue('/rss/@version', $dom); } return 'UNKNOWN'; } sub grab_data { my($self, $node, $xc, $candidates) = @_; return unless $candidates; my $xpath; my $version = $self->{_internal}->{version}; foreach my $xp (@$candidates) { if ($xp !~ /^[^:]+:/ && $VersionPrefix{$version}) { $xpath = $VersionPrefix{$version} . ":$xp"; } else { $xpath = $xp; } if (my($v) = eval { $xc->findnodes($xpath, $node) }) { my %data; if (my $prefix = $v->prefix) { $data{prefix} = $prefix; $data{name} = $v->localname; } $data{data} = $v->textContent(); return \%data; } } return; } my %ChannelRoot = ( '1.0' => '/rdf:RDF/rss10:channel', '0.9' => '/rdf:RDF/rss09:channel', '2.0' => '/rss/channel' ); sub _parse_channel { my $self = shift; my %args = @_; my $dom = $self->{_dom} or die "channel called before parse!"; my $version = $self->{_internal}->{version}; my $xc = $self->_xpath_context; my $root_xpath = $ChannelRoot{$version} || $ChannelRoot{other}; if( my ($channel) = $xc->findnodes($root_xpath, $dom)) { return $self->_parse_children($channel); } return undef; } my %ItemRoot = ( '1.0' => '/rdf:RDF/rss10:item', '0.9' => '/rdf:RDF/rss09:item', '2.0' => '/rss/channel/item' ); sub _parse_items { my $self = shift; my $dom = $self->{_dom} or die "channel called before parse!"; my @items; my $version = $self->{_internal}->{version}; my $xc = $self->_xpath_context; my $root_xpath = $ItemRoot{$version} || $ItemRoot{other}; # grab everything by namespace foreach my $item ($xc->findnodes($root_xpath, $dom)) { push @items, $self->_parse_children($item); } return wantarray ? @items : \@items; } sub _parse_children { my $self = shift; my $node = shift; my $version = $self->{_internal}->{version}; my $root_xpath = $ItemRoot{$version} || $ItemRoot{other}; my $xc = $self->_xpath_context; my $vprefix = $VersionPrefix{$version}; my %item; foreach my $prefix (keys %{$self->{_namespaces}}) { next if $prefix =~ /^rss/ && $prefix ne $vprefix; my %sub; my $xpath = $prefix eq $vprefix ? "./*" : "./*[starts-with(name(), '$prefix:')]"; foreach my $node ($xc->findnodes($xpath, $node)) { $sub{$node->localname} = $node->textContent(); } if (keys %sub) { if ($vprefix eq $prefix) { while (my ($key, $value) = each %sub) { $item{$key} = $value; } } else { $item{$prefix} = \%sub; } } } return \%item; } 1; __END__ =head1 NAME XML::RSS::LibXML - XML::RSS with XML::LibXML (parse-only) =head1 SYNOPSIS use XML::RSS::LibXML; my $rss = XML::RSS::LibXML->new; $rss->parsefile($file); print "channel: $rss->{channel}->{title}\n"; foreach my $item (@{ $rss->{items} }) { print " item: $item->{title} ($item->{link})\n"; } # Add custom modules $rss->add_module(uri => $uri, prefix => $prefix); =head1 DESCRIPTION XML::RSS::LibXML uses XML::LibXML (libxml2) for parsing RSS instead of XML::RSS' XML::Parser (expat), while trying to keep interface compatibility with XML::RSS. XML::RSS is an extremely handy tool, but it is unfortunately not exactly the most lean or efficient RSS parser, especially in a long-running process. So for a long time I had been using my own version of RSS parser to get the maximum speed and efficiency - this is the re-packaged version of that module, such that it adheres to the XML::RSS interface. XML::RSS::LibXML is B<NOT> 100% compatible with XML::RSS. For example, XML::RSS::LibXML is not capable of outputting RSS in various formats. Use this module when you have severe performance requirements in parsing RSS files. =head1 PARSED FIELDS =head1 METHODS =head2 new Creates a new instance of XML::RSS::LibXML =head2 parse($string) Parse a string containing RSS. =head2 parse_file($filename) Parse an RSS file specified by $filename =head2 as_string() Return the string representation of the parsed RSS. =head2 add_module(uri =E<gt> $uri, prefix =E<gt> $prefix) Adds a new module. You should do this before parsing the RSS. XML::RSS::LibXML understands a few modules by default: rdf => "http://www.w3.org/1999/02/22-rdf-syntax-ns#", dc => "http://purl.org/dc/elements/1.1/", sy => "http://purl.org/rss/1.0/modules/syndication/", admin => "http://webns.net/mvcb/", content => "http://purl.org/rss/1.0/modules/content/", cc => "http://web.resource.org/cc/", taxo => "http://purl.org/rss/1.0/modules/taxonomy/", So you do not need to add these explicitly. =head1 PERFORMANCE Here's a simple benchmark using benchmark.pl in this distribution: daisuke@localhost XML-RSS-LibXML$ perl -Mlib=lib benchmark.pl index.rdf Rate rss rss_libxml rss 8.00/s -- -97% rss_libxml 262/s 3172% -- =head1 CAVEATS No support whatsover for writing RSS. No plans to support it either. Only first level data under E<lt>channelE<gt> and E<lt>itemE<gt> tags are examined. So if you have complex data, this module will not pick it up. For most of the cases, this will suffice, though. =head1 TODO Tests. Currently tests are simply stolen from XML::RSS. It would be nice to have tests that do more extensive testing for correctness =head1 SEE ALSO L<XML::RSS|XML::RSS>, L<XML::LibXML|XML::LibXML>, L<XML::LibXML::XPathContext> =head1 AUTHORS Copyright 2005 Daisuke Maki E<lt>dmaki@cpan.orgE<gt>, Tatsuhiko Miyagawa E<lt>miyagawa@bulknews.netE<gt>. All rights reserved. Development partially funded by Brazil, Ltd. E<lt>http://b.razil.jpE<gt> =cut