NAME
LWP::Parallel - Extension for LWP to allow parallel HTTP and FTP access
SYNOPSIS
# Load the module and report which version is installed.
use LWP::Parallel;
print "This is LWP::Parallel_$LWP::Parallel::VERSION\n";
DESCRIPTION
Introduction
ParallelUserAgent is an extension to the existing libwww module. It allows you to take a list of URLs (currently supports only HTTP and FTP protocol) and connect to all of them _in parallel_, then wait for the results to come in.
See LWP::Parallel::UserAgent for how to create an LWP user agent that accesses multiple Web resources in parallel. The LWP::Parallel::RobotUA module additionally offers proper handling of robots.txt files, the de facto exclusion protocol for Web robots.
Examples
The following examples might help to get you started:
# Basic usage: register a list of requests with a parallel user agent,
# wait until all of them have completed, then summarize every response.
use strict;
use warnings;

# Load the classes used below.
require LWP::Parallel::UserAgent;
use HTTP::Request;

# display tons of debugging messages. See 'perldoc LWP::Debug'
#use LWP::Debug qw(+);

# shortcut for demo URLs
# (placeholder value -- point this at a server you are allowed to query)
my $url = "http://localhost/";

my $reqs = [
    HTTP::Request->new('GET', $url),
    HTTP::Request->new('GET', $url . "homes/marclang/"),
];

my $pua = LWP::Parallel::UserAgent->new();
$pua->in_order(1);      # handle requests in order of registration
$pua->duplicates(0);    # ignore duplicates
$pua->timeout(2);       # in seconds
$pua->redirect(1);      # follow redirects

foreach my $req (@$reqs) {
    print "Registering '" . $req->url . "'\n";

    # A true return value from register() is treated as an error
    # response and reported on STDERR.
    if (my $res = $pua->register($req)) {
        print STDERR $res->error_as_HTML;
    }
}

# Block until the registered requests are done; the result is a hash
# reference holding one entry per request.
my $entries = $pua->wait();

foreach (keys %$entries) {
    my $res = $entries->{$_}->response;

    print "Answer for '", $res->request->url, "' was \t",
        $res->code, ": ", $res->message, "\n";
}
LWP::Parallel::UserAgent (as well as LWP::Parallel::RobotUA) offers three default methods that are called at certain points during the connection: on_connect, on_return and on_failure.
#
# provide subclassed UserAgent to override on_connect, on_failure and
# on_return methods
#
package myUA;

use Exporter();

# The parent class must be loaded before we can inherit from it or look
# at its export list; the :CALLBACK tag imports the callback return
# constants (C_ENDCON, C_LASTCON, C_ENDALL).
use LWP::Parallel::UserAgent qw(:CALLBACK);

# Inherit from the parallel user agent and re-export everything it
# offers for export, so users of myUA get the same symbols.
our @ISA    = qw(LWP::Parallel::UserAgent Exporter);
our @EXPORT = @LWP::Parallel::UserAgent::EXPORT_OK;
# redefine methods: on_connect gets called whenever we're about to
# make a connection
#
# Arguments: ($self, $request, $response, $entry); only $request is
# used here, to announce the URL that is about to be contacted.
sub on_connect {
    my ($self, $request, $response, $entry) = @_;

    print "Connecting to ", $request->url, "\n";
}
# on_failure gets called whenever a connection fails right away
# (either we timed out, or failed to connect to this address before,
# or it's a duplicate). Please note that non-connection based
# errors, for example requests for nonexistent pages, will NOT call
# on_failure since the response from the server will be a well
# formed HTTP response!
#
# Arguments: ($self, $request, $response, $entry). The failure is only
# reported when a response object was passed in, because the message
# is built from its code and message.
sub on_failure {
    my ($self, $request, $response, $entry) = @_;

    print "Failed to connect to ", $request->url, "\n\t",
        $response->code, ", ", $response->message, "\n"
        if $response;
}
# on_return gets called whenever a connection (or its callback)
# returns EOF (or any other terminating status code available for
# callback functions). Please note that on_return gets called for
# any successfully terminated HTTP connection! This does not imply
# that the response sent from the server is a success!
#
# Arguments: ($self, $request, $response, $entry). Prints a status
# line for the request; for successful responses the content is
# printed as well. Returns nothing (bare return).
sub on_return {
    my ($self, $request, $response, $entry) = @_;

    if ($response->is_success) {
        print "\n\nWoa! Request to ", $request->url,
            " returned code ", $response->code,
            ": ", $response->message, "\n";
        print $response->content;
    }
    else {
        print "\n\nBummer! Request to ", $request->url,
            " returned code ", $response->code,
            ": ", $response->message, "\n";
        # print $response->error_as_HTML;
    }
    return;
}
package main;

use strict;
use warnings;
use HTTP::Request;

# shortcut for demo URLs
# (placeholder value -- point this at a server you are allowed to query)
my $url = "http://localhost/";

my $reqs = [
    HTTP::Request->new('GET', $url),
    HTTP::Request->new('GET', $url . "homes/marclang/"),
];

# Use the subclass so that its on_connect / on_failure / on_return
# overrides report progress while the requests are processed.
my $pua = myUA->new();

foreach my $req (@$reqs) {
    print "Registering '" . $req->url . "'\n";
    $pua->register($req);
}

my $entries = $pua->wait();    # responses will be caught by on_return, etc
The final example will demonstrate a simple Web Robot that keeps a cache of the "robots.txt" permission files it has encountered so far. This example also uses callbacks to handle the response as it comes in.
# Simple Web robot: registers several requests together with a callback,
# waits for them to finish, then prints a summary (including any
# redirection history) for each one.
use strict;
use warnings;

# Load the classes used below.
require LWP::Parallel::RobotUA;
use HTTP::Request;

# persistent robot rules support. See 'perldoc WWW::RobotRules::AnyDBM_File'
require WWW::RobotRules::AnyDBM_File;

# shortcut for demo URLs
# (placeholder value -- point this at a server you are allowed to query)
my $url = "http://localhost/";

my $reqs = [
    HTTP::Request->new('GET', $url),

    # these are all redirects. depending on how you set
    # 'redirect_ok' they either just return the status code for
    # redirect (like 302 moved), or continue to follow redirection.
    HTTP::Request->new('GET', $url . "research/ahoy/"),
    HTTP::Request->new('GET', $url . "research/ahoy/doc/paper.html"),

    # these are all non-existent server. the first one should take
    # some time, but the following ones should be rejected right
    # away
    # NOTE(review): no such requests are listed here; to try this, add
    # a few requests that all point at the same unreachable host.

    # although server exists, file doesn't
    HTTP::Request->new('GET', $url . "foobar/bar/baz.html"),
];

my ($req, $res);

# establish persistent robot rules cache. See WWW::RobotRules for
# non-permanent version. you should probably adjust the agentname
# and cache filename.
my $rules = WWW::RobotRules::AnyDBM_File->new('ParallelUA', 'cache');

# create new UserAgent (actually, a Robot)
my $pua = LWP::Parallel::RobotUA->new("ParallelUA", 'yourname@your.site.com', $rules);

$pua->timeout(2);       # in seconds
$pua->delay(5);         # in seconds
$pua->max_req(2);       # max parallel requests per server
$pua->max_hosts(10);    # max parallel servers accessed

# for our own print statements that follow below:
local ($\) = "";        # ensure standard $OUTPUT_RECORD_SEPARATOR

# register requests
foreach $req (@$reqs) {
    print "Registering '" . $req->url . "'\n";
    $pua->register($req, \&handle_answer);
    # Each request, even if it failed to register properly, will
    # show up in the final list of requests returned by $pua->wait,
    # so you can examine it later.
}

# $pua->wait returns a pointer to an associative array, containing
# an '$entry' for each request made, sorted by its url. (as returned
# by $request->url->as_string)
# (the original note suggests giving a timeout here, e.g. 25 seconds;
# see the LWP::Parallel::UserAgent documentation for wait)
my $entries = $pua->wait();

# let's see what we got back (see also callback function!!)
foreach (keys %$entries) {
    $res = $entries->{$_}->response;

    # examine response to find cascaded requests (redirects, etc) and
    # set current response to point to the very first response of this
    # sequence. (not very exciting if you set '$pua->redirect(0)')
    my $r = $res;
    my @redirects;
    while ($r) {
        $res = $r;
        $r   = $r->previous;

        # every response that still has a predecessor is part of the
        # redirection history
        push(@redirects, $res) if $r;
    }

    # summarize response. see "perldoc HTTP::Response"
    print "Answer for '", $res->request->url, "' was \t",
        $res->code, ": ", $res->message, "\n";

    # print redirection history, in case we got redirected
    foreach (@redirects) {
        print "\t", $_->request->url, "\t", $_->code, ": ", $_->message, "\n";
    }
}
# our callback function gets called whenever some data comes in
# (same parameter format as standard LWP::UserAgent callbacks!)
#
# Arguments: ($content, $response, $protocol, $entry). Logs how much
# data arrived and appends any non-empty chunk to the response.
# Always returns undef, which tells the user agent to keep reading.
sub handle_answer {
    my ($content, $response, $protocol, $entry) = @_;

    print "Handling answer from '", $response->request->url, "': ",
        length($content), " bytes, Code ",
        $response->code, ", ", $response->message, "\n";

    if (length($content)) {
        # just store content if it comes in
        $response->add_content($content);
    }
    else {
        # Having no content doesn't mean the connection is closed!
        # Sometimes the server might return zero bytes, so unless
        # you already got the information you need, you should continue
        # processing here (see below)

        # Otherwise you can return a special exit code that will
        # determine how ParallelUA will continue with this connection.

        # Note: We have to import those constants via "qw(:CALLBACK)"!

        # return C_ENDCON;  # will end only this connection
        #                   # (silly, we already have EOF)
        # return C_LASTCON; # wait for remaining open connections,
        #                   # but don't issue any new ones!!
        # return C_ENDALL;  # will immediately end all connections
        #                   # and return from $pua->wait
    }

    # ATTENTION!! If you want to keep reading from your connection,
    # you should have a final 'return undef' statement here. Even if
    # you think that all data has arrived, it does not hurt to return
    # undef here. The Parallel UserAgent will figure out by itself
    # when to close the connection!

    return undef;    # just keep on connecting/reading/waiting
                     # until the server closes the connection.
}
AUTHOR
Marc Langheinrich, marclang@cs.washington.edu
SEE ALSO
See LWP for an overview on Web communication using Perl. See LWP::Parallel::UserAgent and LWP::Parallel::RobotUA for details on how to use this library.
COPYRIGHT
Copyright 1997-2000 Marc Langheinrich.
This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself.