package HTML::HTML5::Parser::UA;
use 5.008001;
use strict;
use warnings;
BEGIN {
$HTML::HTML5::Parser::UA::AUTHORITY = 'cpan:TOBYINK';
$HTML::HTML5::Parser::UA::VERSION = '0.992';
}
use Encode qw(decode);
use HTTP::Tiny;
use URI::file;
our $NO_LWP = '0';
sub get
{
my ($class, $uri, $ua) = @_;
if (ref $ua and $ua->isa('HTTP::Tiny') and $uri =~ /^https?:/i)
{ goto \&_get_tiny }
if (ref $ua and $ua->isa('LWP::UserAgent'))
{ goto \&_get_lwp }
if (UNIVERSAL::can('LWP::UserAgent', 'can') and not $NO_LWP)
{ goto \&_get_lwp }
if ($uri =~ /^file:/i)
{ goto \&_get_fs }
goto \&_get_tiny;
}
sub _get_lwp
{
eval "require LWP::UserAgent; 1"
or do {
require Carp;
Carp::croak("could not load LWP::UserAgent");
};
my ($class, $uri, $ua) = @_;
$ua ||= LWP::UserAgent->new(
agent => sprintf(
"%s/%s ",
'HTML::HTML5::Parser',
HTML::HTML5::Parser->VERSION,
),
default_headers => HTTP::Headers->new(
'Accept' => join q(, ) => qw(
text/html
application/xhtml+xml;q=0.9
application/xml;q=0.1
text/xml;q=0.1
)
),
parse_head => 0,
);
my $response = $ua->get($uri);
my $h = $response->headers;
my %header_hash =
map { lc($_) => $h->header($_); }
$h->header_field_names;
return +{
success => $response->is_success,
status => $response->code,
reason => $response->message,
headers => \%header_hash,
content => $response->content,
decoded_content => $response->decoded_content,
};
}
sub _get_tiny
{
my ($class, $uri, $ua) = @_;
$ua ||= HTTP::Tiny->new(
agent => sprintf("%s/%s", 'HTML::HTML5::Parser', HTML::HTML5::Parser->VERSION),
default_headers => +{
'Accept' => join(q(, ) => qw(
text/html
application/xhtml+xml;q=0.9
application/xml;q=0.1
text/xml;q=0.1
)),
},
);
my $response = $ua->get($uri);
if ($response->{headers}{'content-type'} =~ /charset=(\S+)/)
{
(my $encoding = $1) =~ s/["']//g;
$response->{decoded_content} = eval {
decode($encoding, $response->{content})
};
}
$response->{decoded_content} = $response->{content}
unless defined $response->{decoded_content};
return $response;
}
sub _get_fs
{
my $class = shift;
my ($uri) = map { ref() ? $_ : URI->new($_) } @_;
my $file = $uri->file;
my ($status, $reason, $content, $content_type) = do {
if (not -e $file)
{ (404 => 'Not Found', 'File not found.', 'text/plain') }
elsif (not -r $file)
{ (403 => 'Forbidden', 'File not readable by effective guid.', 'text/plain') }
else
{ (200 => 'OK') }
};
$content ||= do {
if (open my $fh, '<', $file)
{ local $/ = <$fh> }
else
{ $status = 418; $reason = "I'm a teapot"; $content_type = 'text/plain'; $! }
};
$content_type ||= 'text/xml' if $file =~ /\.xml$/i;
$content_type ||= 'application/xhtml+xml' if $file =~ /\.xht(ml)?$/i;
$content_type ||= 'text/html' if $file =~ /\.html?$/i;
$content_type ||= 'application/octet-stream';
return +{
success => ($status == 200),
status => $status,
reason => $reason,
headers => +{
'content-type' => $content_type,
'content-length' => length($content),
},
content => $content,
decoded_content => $content,
};
}
1;
=head1 NAME
HTML::HTML5::Parser::UA - simple web user agent class
=head1 SYNOPSIS
use aliased 'HTML::HTML5::Parser::UA';
my $response = UA->get($url);
die unless $response->{success};
print $response->{decoded_content};
=head1 DESCRIPTION
This is a simple wrapper around HTTP::Tiny and LWP::UserAgent to smooth out
the API differences between them. It only supports bog standard
C<< get($url) >> requests.
If LWP::UserAgent is already in memory, this module will use that.
If LWP::UserAgent is not in memory, then this module will use HTTP::Tiny (or
direct filesystem access for "file://" URLs).
If LWP::UserAgent is not in memory, and you attempt to request a URL that
HTTP::Tiny cannot handle (e.g. an "ftp://" URL), then this module will load
LWP::UserAgent and die if it cannot be loaded (e.g. is not installed).
HTML::HTML5::Parser::UA is used by the C method of
HTML::HTML5::Parser.
=head2 Class Method
=over
=item C<< get($url, $ua) >>
Gets the URL and returns a hashref similar to HTTP::Tiny's hashrefs, but
with an additional C key, which contains the response
body, decoded into a Perl character string (not a byte string).
If $ua is given (it's optional), then this user agent will be used to
perform the actual request. Must be undef or an LWP::UserAgent object
(or a subclass) or an HTTP::Tiny object (or a subclass).
=back
=head2 Package Variable
=over
=item C<< $HTML::HTML5::Parser::NO_LWP >>
If true, avoids using LWP::UserAgent.
=back
=head1 MOTIVATION
L is a good piece of software but it has a dependency on
L. L is only used to provide one fairly
esoteric feature, which this package doesn't make use of. (It's the
C option.)
Because of that, I don't especially want HTML::HTML5::Parser to have a
dependency on LWP::UserAgent. Hence this module.
=head1 SEE ALSO
L.
=head1 AUTHOR
Toby Inkster, Etobyink@cpan.orgE
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2012 by Toby Inkster
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
=head1 DISCLAIMER OF WARRANTIES
THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.