package Mojo::Feed::Reader; use Mojo::Base -base; use Mojo::UserAgent; use Mojo::Feed; use Mojo::File 'path'; use Mojo::Util 'decode', 'trim'; use Carp qw(carp croak); use Scalar::Util qw(blessed); # feed mime-types: our @feed_types = ( 'application/x.atom+xml', 'application/atom+xml', 'application/xml', 'text/xml', 'application/rss+xml', 'application/rdf+xml' ); our %is_feed = map { $_ => 1 } @feed_types; has ua => sub { Mojo::UserAgent->new }; has charset => 'UTF-8'; sub parse { my ($self, $xml, $charset) = @_; my ($body, $source, $url, $file); return unless ($xml); if (!ref $xml) { if ($xml =~ /^\</) { # looks like XML string... $body = $xml; } elsif ($xml =~ /^https?\:/) { $url = Mojo::URL->new($xml); } elsif (-r $xml) { # a readable file path $file = path($xml); } else { die "unknown argument $xml"; } } else { # $xml is a reference if (blessed $xml && $xml->can('slurp')) { $file = $xml; } elsif (blessed $xml && $xml->isa('Mojo::URL')) { $url = $xml->clone(); } elsif (blessed $xml && $xml->isa('Mojo::DOM')) { $body = $xml->to_string; # we don't need your dom, we make our own } elsif (ref $xml eq 'SCALAR') { $body = $$xml; } else { die "unknown argument $xml"; } } if ($url) { ($body, $charset) = $self->load($url); } if ($file) { $body = $file->slurp; } $charset ||= $self->charset; $body = $charset ? decode($charset, $body) // $body : $body; $source = $url || $file; my $feed = Mojo::Feed->new(body => $body, source => $source); return ($feed->is_valid) ? $feed : undef; } sub load { my ($self, $url) = @_; my $tx = $self->ua->get($url); if (!$tx->success) { croak "Error getting feed from url ", $url, ": ", (($tx->error) ? $tx->error->{message} : ''); } return ($tx->res->body, $tx->res->content->charset); } # discover - get RSS/Atom feed URL from argument. # Code adapted to use Mojolicious from Feed::Find by Benjamin Trott # Any stupid mistakes are my own sub discover { my ($self, $url) = @_; # $self->ua->max_redirects(5)->connect_timeout(30); return $self->ua->get_p($url) ->catch(sub { my ($err) = shift; die "Connection Error: $err" })->then(sub { my ($tx) = @_; if ($tx->success && $tx->res->code == 200) { return $self->_find_feed_links($tx->req->url, $tx->res); } return; }); } sub _find_feed_links { my ($self, $url, $res) = @_; state $feed_ext = qr/\.(?:rss|xml|rdf)$/; my @feeds; # use split to remove charset attribute from content_type my ($content_type) = split(/[; ]+/, $res->headers->content_type); if ($is_feed{$content_type}) { push @feeds, Mojo::URL->new($url)->to_abs; } else { # we are in a web page. PHEAR. my $base = Mojo::URL->new( $res->dom->find('head base')->map('attr', 'href')->join('') || $url) ->to_abs($url); my $title = $res->dom->find('head > title')->map('text')->join('') || $url; $res->dom->find('head link')->each(sub { my $attrs = $_->attr(); return unless ($attrs->{'rel'}); my %rel = map { $_ => 1 } split /\s+/, lc($attrs->{'rel'}); my $type = ($attrs->{'type'}) ? lc trim $attrs->{'type'} : ''; if ($is_feed{$type} && ($rel{'alternate'} || $rel{'service.feed'})) { push @feeds, Mojo::URL->new($attrs->{'href'})->to_abs($base); } }); $res->dom->find('a')->grep(sub { $_->attr('href') && Mojo::URL->new($_->attr('href'))->path =~ /$feed_ext/io; })->each(sub { push @feeds, Mojo::URL->new($_->attr('href'))->to_abs($base); }); # call me crazy, but maybe this is just a feed served as HTML? unless (@feeds) { if ($self->parse($res->body, $res->content->charset)) { push @feeds, Mojo::URL->new($url)->to_abs; } } } return @feeds; } sub parse_opml { my ($self, $opml_file) = @_; my $opml_str = decode $self->charset, (ref $opml_file) ? $opml_file->slurp : Mojo::File->new($opml_file)->slurp; my $d = Mojo::DOM->new->parse($opml_str); my (%subscriptions, %categories); for my $item ($d->find(q{outline})->each) { my $node = $item->attr; if (!defined $node->{xmlUrl}) { my $cat = $node->{title} || $node->{text}; $categories{$cat} = $item->children('[xmlUrl]')->map('attr', 'xmlUrl'); } else { # file by RSS URL: $subscriptions{$node->{xmlUrl}} = $node; } } # assign categories for my $cat (keys %categories) { for my $rss ($categories{$cat}->each) { next unless ($subscriptions{$rss}) ; # don't auto-vivify for empty "categories" $subscriptions{$rss}{'categories'} ||= []; push @{$subscriptions{$rss}{'categories'}}, $cat; } } return (values %subscriptions); } 1; __END__ =encoding utf-8 =for stopwords tagline pubDate dc:date =head1 NAME Mojo::Feed::Reader - Fetch feeds =head1 SYNOPSIS use Mojo::Feed::Reader; my $feedr = Mojo::Feed::Reader->new( ua => $ua ); my $feed = $feedr->parse("atom.xml"); print $feed->title, "\n", $feed->items->map('title')->join("\n"); # Feed discovery (returns a Promise): $feedr->discover("search.cpan.org")->then(sub { my (@feeds) = @_; if (@feeds) { print $_ for (@feeds); } })->catch(sub { die "Error: ", @_; }); # =head1 DESCRIPTION L<Mojo::Feed::Reader> is an Object Oriented module for identifying, fetching and parsing RSS and Atom Feeds. It relies on L<Mojo::DOM> for XML/HTML parsing and L<Mojo::UserAgent> for fetching feeds and checking URLs. =head1 ATTRIBUTES L<Mojo::Feed::Reader> implements the following attributes. =head2 ua $feed->ua(Mojo::UserAgent->new()); $feed->ua->get("http://example.com"); L<Mojo::UserAgent> object used to fetch feeds from the web. =head1 METHODS L<Mojo::Feed::Reader> inherits all methods from L<Mojo::Base> and implements the following new ones. =head2 new Construct a new L<Mojo::Feed::Reader> object. =head2 discover my @feeds; Mojo::Feed::Reader->new->discover('search.cpan.org') ->then(sub { @feeds = @_; }) ->wait(); for my $feed in (@feeds) { print $feed . "\n"; } # @feeds is a list of Mojo::URL objects A Mojo port of L<Feed::Find> by Benjamin Trott. This method implements feed auto-discovery for finding syndication feeds, given a URL. Returns a Mojo::Promise, which is fulfilled with a list of feeds (Mojo::URL objects) =head2 parse my $feedr = Mojo::Feed::Reader->new; # parse an RSS/Atom feed my $url = Mojo::URL->new('http://rss.slashdot.org/Slashdot/slashdot'); my $feed = $feedr->parse($url); # parse a file $feed2 = $feedr->new->parse('/downloads/foo.rss'); # parse a string my $str = Mojo::File->new('atom.xml')->slurp; $feed3 = $feedr->parse($str); A minimalist liberal RSS/Atom parser, using Mojo::DOM queries. If the parsed object is not a feed (for example, the parser was given an HTML page), the method will return undef. =head2 parse_opml my @subscriptions = Mojo::Feed->parse_opml( 'mysubs.opml' ); foreach my $sub (@subscriptions) { say 'RSS URL is: ', $sub->{xmlUrl}; say 'Website URL is: ', $sub->{htmlUrl}; say 'categories: ', join ',', @{$sub->{categories}}; } Parse an OPML subscriptions file and return the list of feeds as an array of hashrefs. Each hashref will contain an array ref in the key 'categories' listing the folders (parent nodes) in the OPML tree the subscription item appears in. =head1 CREDITS Dotan Dimet Mario Domgoergen Some tests adapted from L<Feed::Find> and L<XML:Feed>, Feed auto-discovery adapted from L<Feed::Find>. =head1 LICENSE Copyright (C) Dotan Dimet. This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 AUTHOR Dotan Dimet E<lt>dotan@corky.netE<gt> Mario Domgoergen =cut 1;