—#!/usr/bin/perl
package
XML::RSSLite;
use
strict;
$VERSION
= 0.17;
sub
import
{
no
strict
'refs'
;
shift
;
my
$pkg
=
scalar
caller
();
*{
"${pkg}::parseRSS"
} = \
&parseRSS
;
*{
"${pkg}::parseXML"
} = \
&parseXML
if
grep
(
$_
eq
'parseXML'
,
@_
);
}
sub
parseRSS {
my
(
$rr
,
$cref
,
$strip
) =
@_
;
die
"$rr is not a hash reference"
unless
ref
(
$rr
) eq
'HASH'
;
die
"$cref is not a scalar reference"
unless
ref
(
$cref
) eq
'SCALAR'
;
# Gotta have some content to parse
return
unless
$$cref
;
preprocess(
$cref
,
$strip
);
{
_parseRSS(
$rr
,
$cref
),
last
if
index
(${
$cref
},
'<rss'
)+1;
_parseRDF(
$rr
,
$cref
),
last
if
index
(${
$cref
},
'<rdf:RDF'
)+1;
_parseSN(
$rr
,
$cref
),
last
if
index
(${
$cref
},
'<scriptingnews'
)+1;
_parseWL(
$rr
,
$cref
),
last
if
index
(${
$cref
},
'<weblog'
)+1;
die
"Content must be RSS|RDF|ScriptingNews|Weblog|reasonably close"
;
}
postprocess(
$rr
);
}
sub
preprocess {
my
(
$cref
,
$strip
) =
@_
;
$$cref
=~ y/\r\n/\n/s;
if
( !
defined
(
$strip
) ){
$$cref
=~ y{\n\t ~0-9\-+!@
#$%^&*()_=a-zA-Z[]\\;':",./<>?}{ }cs;
}
elsif
(
$strip
eq
'1'
){
$$cref
=~ s/[[:^
:]]/ /g;
}
#XXX $$cref =~ s/&(?!0[a-zA-Z0-9]+|#\d+);/amp/gs;
#XXX Do we wish to (re)allow escaped HTML?!
$$cref
=~ s{(?:<|
<
;)/?(?:b|i|h[1-6]|p|center|quote|strong)(?:>|
>
;)}{}gsi;
}
sub
_parseRSS {
parseXML(
$_
[0],
$_
[1],
'channel'
, 0);
$_
[0]->{
'items'
} =
$_
[0]->{
'item'
};
}
sub
_parseRDF {
my
(
$rr
,
$cref
) =
@_
;
$rr
->{
'items'
} = [];
my
$item
;
parseXML(
$_
[0],
$_
[1],
'rdf:RDF'
, 0);
# Alias RDF to RSS
if
(
exists
(
$rr
->{
'item'
}) ){
$rr
->{
'items'
} =
$rr
->{
'item'
};
}
else
{
my
$li
=
$_
[0]->{
'rdf:li'
} ||
$_
[0]->{
'rdf:Seq'
}->{
'rdf:li'
};
foreach
$item
( @{
$li
} ){
my
%ia
;
if
(
exists
$item
->{
'dc:description'
}) {
$ia
{
'description'
} =
$item
->{
'dc:description'
};
}
if
(
exists
$item
->{
'dc:title'
}) {
$ia
{
'title'
} =
$item
->{
'dc:title'
};
}
if
(
exists
$item
->{
'dc:identifier'
}) {
$ia
{
'link'
} =
delete
(
$item
->{
'dc:identifier'
});
}
push
(@{
$rr
->{
'items'
}}, \
%ia
);
}
}
}
sub
_parseSN {
my
(
$rr
,
$cref
) =
@_
;
$rr
->{
'items'
} = ();
my
$item
;
parseXML(
$rr
,
$cref
,
'channel'
, 0);
# Alias SN to RSS terms
foreach
$item
( @{
$_
[0]->{
'rdf:li'
}} ){
my
%ia
;
if
(
exists
$item
->{
'text'
}) {
$ia
{
'description'
} =
$item
->{
'text'
};
}
if
(
exists
$item
->{
'linetext'
}) {
$ia
{
'title'
} =
$item
->{
'linetext'
};
}
if
(
exists
$item
->{
'url'
}) {
$ia
{
'link'
} =
$item
->{
'url'
};
}
push
(@{
$rr
->{
'items'
}}, \
%ia
);
}
}
sub
_parseWL {
my
(
$rr
,
$cref
) =
@_
;
$rr
->{
'items'
} = ();
my
$item
;
#XXX is this the right tag to parse for?
parseXML(
$rr
,
$cref
,
'channel'
, 0);
# Alias WL to RSS
foreach
$item
( @{
$_
[0]->{
'rdf:li'
}} ){
my
%ia
;
if
(
exists
$item
->{
'url'
}) {
$ia
{
'link'
} =
delete
(
$item
->{
'url'
});
}
push
(@{
$rr
->{
'items'
}}, \
%ia
);
}
}
sub
postprocess {
my
$rr
=
shift
;
#XXX Not much to do, what about un-munging URL's in source, etc.?!
return
unless
defined
(
$rr
->{
'items'
});
$rr
->{
'items'
} = [
$rr
->{
'items'
}]
unless
ref
(
$rr
->{
'items'
}) eq
'ARRAY'
;
foreach
my
$i
(@{
$rr
->{
'items'
}}) {
$i
->{description} =
$i
->{description}->{
'<>'
}
if
ref
(
$i
->{description});
# Put stuff into the right name if necessary
if
( not
$i
->{
'link'
} ){
if
(
defined
(
$i
->{
'url'
}) ){
$i
->{
'link'
} =
delete
(
$i
->{
'url'
}); }
# See if you can use misplaced url in title for empty links
elsif
(
exists
(
$i
->{
'title'
}) ){
# The next case would trap this, but try to short-circuit the gathering
if
(
$i
->{
'title'
} =~ /^(?:https?|ftp):/) {
$i
->{
'link'
} =
$i
->{
'title'
};
}
elsif
(
$i
->{
'title'
} =~ /
"((?:https?|ftp).*?)"
/) {
$i
->{
'link'
} = $1;
$i
->{
'title'
} =~ s/<.*?>//;
}
else
{
next
;
}
}
}
# Trim whitespace
$i
->{
'link'
} =~ s/\s+$//;
$i
->{
'link'
} =~ s/^\s+//;
# Make sure you've got an http/ftp link
if
(
exists
(
$i
->{
'link'
}) &&
$i
->{
'link'
} !~ m{^(https?|ftp)://}i) {
## Rip link out of anchor tag
if
(
ref
(
$i
->{
'link'
}) &&
$i
->{
'link'
}->{a}->{href} ){
$i
->{
'link'
} =
$i
->{
'link'
}->{a}->{href} }
## Smells like a relative url
elsif
(
$i
->{
'link'
} =~ m{^[
#/]} and $rr->{'link'} =~ m{^https?://} ){
if
(
substr
(
$i
->{
'link'
}, 0, 1) ne
'/'
) {
$i
->{
'link'
} =
'/'
.
$i
->{
'link'
};
}
$i
->{
'link'
} =
$rr
->{
'link'
} .
$i
->{
'link'
};
}
else
{
next
;
}
}
#If we don't have a title, use the link
unless
(
defined
(
$i
->{
'title'
}) ){
$i
->{
'title'
} =
$i
->{
'link'
};
}
if
(
exists
(
$i
->{
'link'
}) ){
#XXX # Fix pre-process munging
# $i->{'link'} =~ s/&/&/gi;
$i
->{
'link'
} =~ s/ /%20/g;
}
}
}
sub
parseXML{
my
(
$hash
,
$xml
,
$tag
,
$comments
) =
@_
;
my
(
$begin
,
$end
,
@comments
);
local
$_
;
#Kill comments
while
( (
$begin
=
index
(${
$xml
},
'<!--'
)) > -1 &&
${
$xml
} =~ m%<!--.*?--(>)
%sg
){
my
$str
=
substr
(${
$xml
},
$begin
,
pos
(${
$xml
})-
$begin
,
''
);
#Save them if requested
do
{
unshift
@comments
, [
$begin
,
substr
(
$str
, 4,
length
(
$str
)-7)] }
if
$comments
;
}
_parseXML(
$hash
,
$xml
,
$tag
);
# #XXX Context of comment is lost!
# #Expose comments if requested
# do{ push(@$comments, $_->[1]) for @comments } if ref($comments) eq 'ARRAY';
if
(
$comments
){
#Restore comments if requested
substr
(${
$xml
},
$_
->[0], 0,
'<!--'
.
$_
->[1].
'-->'
)
for
@comments
;
#Expose comments if requested
do
{
push
(
@$comments
,
$_
->[1])
for
@comments
}
if
ref
(
$comments
) eq
'ARRAY'
;
}
}
sub
_parseXML{
my
(
$hash
,
$xml
,
$tag
,
$index
) =
@_
;
my
(
$begin
,
$end
);
#Find topTag and set pos to start matching from there
${
$xml
} =~ /<
$tag
(?:>|\s)/g;
(
$begin
,
$end
) = (0,
pos
(${
$xml
})||0);
#Match either <foo></foo> or <bar />, optional attributes, stash tag name
while
( ${
$xml
} =~ m%<([^\s>]+)(?:\s+[^>]*?)?(?:/|>.*?</\1)>
%sg
){
#Save the tag name, we'll need it
$tag
= $1 || $2;
#Save the new beginning and end
(
$begin
,
$end
) = (
$end
,
pos
(${
$xml
}));
#Get the bit we just matched.
my
$str
=
substr
(${
$xml
},
$begin
,
$end
-
$begin
);
#Extract the actual attributes and contents of the tag
$str
=~ m%<\Q
$tag
\E\s*([^>]*?)?>(.*?)</\Q
$tag
\E>
%s
||
#XXX pointed out by hv
# $str =~ s%^.*?<$tag\s*([^>]*?)?>(.*?)</$tag>%<$tag>$2</$tag>%s ||
$str
=~ m%<\Q
$tag
\E\s*([^>]*?)?\s*/>%;
my
(
$attr
,
$content
) = ($1, $2);
#Did we get attributes? clean them up and chuck them in a hash.
if
(
$attr
){
(
$_
,
$attr
) = (
$attr
, {});
$attr
->{$1} = $3
while
m/([^\s=]+)\s*=\s*(['"]?)([^\2>]*?)(?:\2|$)/g;
}
my
$inhash
;
#Recurse if contents has more tags, replace contents with reference we get
if
(
$content
&&
index
(
$content
,
'<'
) > -1 ){
_parseXML(
$inhash
={}, \
$str
,
$tag
);
#Was there any data in the contents? We should extract that...
if
(
$str
=~ />[^><]+</ ){
#The odd RE above shortcircuits unnecessary entry
#Clean whitespace between tags
#$str =~ s%(?<=>)?\s*(?=<)%%g; #XXX ~same speed, wacko warning
#$str =~ s%(>?)\s*<%$1<%g;
#XXX #$str =~ s%(?:^|(?<=>))\s*(?:(?=<)|\z)%%g
my
$qr
=
qr{@{[join('|', map { quotemeta }
keys
%{
$inhash
})]}};
$content
=~ s%<(
$qr
)\s*(?:[^>]*?)?(?:/|>.*?</\1)>%
%sg
;
$inhash
->{
'<>'
} =
$content
if
$content
=~ /\S/;
}
}
if
(
ref
(
$inhash
) ){
#We have attributes? Then we should merge them.
if
(
ref
(
$attr
) ){
for
(
keys
%{
$attr
} ){
$inhash
->{
$_
} =
exists
(
$inhash
->{
$_
}) ?
(
ref
(
$inhash
->{
$_
}) eq
'ARRAY'
?
[@{
$inhash
->{
$_
}},
$attr
->{
$_
}] :
[
$inhash
->{
$_
},
$attr
->{
$_
}] ) :
$attr
->{
$_
};
}
}
}
elsif
(
ref
(
$attr
) ){
$inhash
=
$attr
;
}
else
{
#Otherwise save our content
$inhash
=
$content
;
}
$hash
->{
$tag
} =
exists
(
$hash
->{
$tag
}) ?
(
ref
(
$hash
->{
$tag
}) eq
'ARRAY'
?
[@{
$hash
->{
$tag
}},
$inhash
] :
[
$hash
->{
$tag
},
$inhash
] ) :
$inhash
;
}
}
1;
__END__
=pod
=head1 NAME
XML::RSSLite - lightweight, "relaxed" RSS (and XML-ish) parser
=head1 SYNOPSIS
use XML::RSSLite;
parseRSS(\%result, \$content);
print "=== Channel ===\n",
"Title: $result{'title'}\n",
"Desc: $result{'description'}\n",
"Link: $result{'link'}\n\n";
foreach $item (@{$result{'items'}}) {
print " --- Item ---\n",
" Title: $item->{'title'}\n",
" Desc: $item->{'description'}\n",
" Link: $item->{'link'}\n\n";
}
=head1 DESCRIPTION
This module attempts to extract the maximum amount of content from
available documents, and is less concerned with XML compliance than
alternatives. Rather than rely on XML::Parser, it uses heuristics and good
old-fashioned Perl regular expressions. It stores the data in a simple
hash structure, and "aliases" certain tags so that when done, you can
count on having the minimal data necessary for re-constructing a valid
RSS file. This means you get the basic title, description, and link for a
channel and its items.
This module extracts more usable links by parsing "scriptingNews" and
"weblog" formats in addition to RDF & RSS. It also "sanitizes" the
output for best results. The munging includes:
=over
=item Remove html tags to leave plain text
=item Remove leading whitespace from URIs
=item By defaul strips characters except 0-9~!@#$%^&*()-+=a-zA-Z[];',.:"<>?\s
=item Use <url> tags when <link> is empty
=item Use misplaced urls in <title> when <link> is empty
=item Exract links from <a href=...> if required
=item Limit links to ftp and http(s)
=item Join relative item urls (beginning with / or #) to the site base
=back
=head2 EXPORT
=over
=item parseRSS($outHashRef, $inScalarRef, [$strip])
=over
=item inScalarRef - required
Reference to a scalar containing the document to be parsed. NOTE: The
contents will effectively be destroyed. Make a deep copy first if you care.
=item outHashRef - required
Reference to the hash within which to store the parsed content.
=item strip - optional
An expression indicating the level of winnowing to be performed on the
characters permitted in the results.
=over
=item 1 strip non-printable characters
=item 0 no characters are removed
=item undefined (Default) strip everything but:
0-9~!@#$%^&*()-+= a-zA-Z[];',.:"<>?\t\n
=back
=back
=back
=head2 EXPORTABLE
=over
=item parseXML(\%parsedTree, \$parseThis, 'topTag', $comments);
=over
=item parsedTree - required
Reference to hash to store the parsed document within.
=item parseThis - required
Reference to scalar containing the document to parse.
=item topTag - optional
Tag to consider the root node, leaving this undefined is not recommended.
=item comments - optional
=over
=item false will remove contents from parseThis
=item true will not remove comments from parseThis
=item array reference is true, comments are stored here
=back
=back
=back
=head2 CAVEATS
This is not a conforming parser. It does not handle the following
=over
=item
<foo bar=">">
=item
<foo><bar> <bar></bar> <bar></bar> </bar></foo>
=item
<![CDATA[ ]]>
=item
PI
=back
It's non-validating, without a DTD the following cannot be properly addressed
=over
=item entities
=item namespaces
This may or may not be arriving in some future release.
=back
=back
=head1 SEE ALSO
perl(1), C<XML::RSS>, C<XML::SAX::PurePerl>,
C<XML::Parser::Lite>, <XML::Parser>
=head1 AUTHOR
Jerrad Pierce <jpierce@cpan.org>.
Scott Thomason <scott@thomasons.org>
=head1 LICENSE
Portions Copyright (c) 2002,2003,2009 Jerrad Pierce, (c) 2000 Scott Thomason.
All rights reserved. This program is free software; you can redistribute it
and/or modify it under the same terms as Perl itself.
=cut