#!/usr/bin/perl -w
use Getopt::Long    qw(GetOptions);
use URI::URL        qw(url);
use LWP::MediaTypes qw(media_suffix);
use HTML::Entities  ();
require LWP::UserAgent;

use vars qw($VERSION);
use vars qw($MAX_DEPTH $MAX_DOCS $PREFIX $REFERER $VERBOSE $QUIET $SLEEP $HIER $AUTH $IIS $TOLOWER $NOSPACE %KEEPEXT);
# Derive the program's display name from $0: basename only, extension dropped.
my $progname = $0;
$progname =~ s|.*/||;      # keep only the last path component
$progname =~ s/\.\w*$//;   # strip a trailing ".ext", if any

$VERSION = "5.818";

# Traversal limits (overridable via --depth / --limit).
$MAX_DEPTH = 5;
$MAX_DOCS  = 50;
# Parse the command line; any unknown option falls through to usage().
GetOptions(
    'version'   => \&print_version,
    'help'      => \&usage,
    'depth=i'   => \$MAX_DEPTH,
    'limit=i'   => \$MAX_DOCS,
    'verbose!'  => \$VERBOSE,
    'quiet!'    => \$QUIET,
    'sleep=i'   => \$SLEEP,
    'prefix:s'  => \$PREFIX,
    'referer:s' => \$REFERER,
    'hier'      => \$HIER,
    'auth=s'    => \$AUTH,
    'iis'       => \$IIS,
    'tolower'   => \$TOLOWER,
    'nospace'   => \$NOSPACE,
    'keepext=s' => \$KEEPEXT{'OPT'},
) || usage();
# Print the version banner (including the libwww-perl distribution
# version) and exit successfully.  Installed as the --version callback.
sub print_version {
    my $DISTNAME = 'libwww-perl-' . LWP::Version();
    print <<"EOT";
This is lwp-rget version $VERSION ($DISTNAME)
Copyright 1996-1998, Gisle Aas.
This program is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
EOT
    exit 0;
}
# Exactly one start URL is required on the command line.
my $start_url = shift || usage();
usage() if @ARGV;

my $ua = LWP::UserAgent->new;     # direct method call, not indirect "new LWP::UserAgent"
$ua->agent("$progname/$VERSION ");
$ua->env_proxy;

# Unless --prefix was given, limit the crawl to URLs that live in the
# same "directory" as the start URL (start URL minus its last component,
# params and query).
unless (defined $PREFIX) {
    $PREFIX = url($start_url);
    eval {
        # eparams/equery may not exist for every scheme; best effort only
        $PREFIX->eparams(undef);
        $PREFIX->equery(undef);
    };

    $_ = $PREFIX->epath;
    s|[^/]+$||;                   # drop the document name, keep the directory
    $PREFIX->epath($_);
    $PREFIX = $PREFIX->as_string;
}

# Turn the comma-separated --keepext list into a lookup hash of
# lowercased MIME types.  (This reassignment deliberately discards the
# temporary 'OPT' key used by GetOptions.)
%KEEPEXT = map { lc($_) => 1 } split(/\s*,\s*/, ($KEEPEXT{'OPT'}||0));

my $SUPPRESS_REFERER;
$SUPPRESS_REFERER++ if ($REFERER || "") eq "NONE";

print <<"" if $VERBOSE;
START     = $start_url
MAX_DEPTH = $MAX_DEPTH
MAX_DOCS  = $MAX_DOCS
PREFIX    = $PREFIX

my $no_docs = 0;
my %seen = ();     # mapping <URL> => <local file>

my $filename = fetch($start_url, undef, $REFERER);
print "$filename\n" unless $QUIET;
# Fetch $url (a string or URI::URL object), save it under a unique local
# file name and — for HTML documents within $PREFIX and depth/doc limits —
# recursively fetch and rewrite the links found in it.
#
#   $type    - tag the link came from ('a', 'img', ...); defaults to 'a'
#   $referer - value for the Referer request header
#   $depth   - current recursion depth (defaults to 0)
#
# Returns the local file name the document was saved under, or the URL
# itself (as a string) when the document was skipped.
sub fetch
{
    my($url, $type, $referer, $depth) = @_;

    # Fix http://sitename.com/../blah/blah.html to
    #     http://sitename.com/blah/blah.html
    $url = $url->as_string if (ref($url));
    while ($url =~ s{(https?://[^/]+/)\.\./}{$1}) {}

    # Fix backslashes (\) in URL if $IIS defined
    $url = fix_backslashes($url) if (defined $IIS);

    $url = url($url);
    $type ||= 'a';
    # A 'body' or 'td' link is really a background image
    $type = 'img' if ($type eq 'body' || $type eq 'td');
    $depth ||= 0;

    # Print the URL before we start checking...
    my $out = (" " x $depth) . $url . " ";
    $out .= "." x (60 - length($out));
    print STDERR $out . " " if $VERBOSE;

    # Can't get mailto things
    if ($url->scheme eq 'mailto') {
        print STDERR "*skipping mailto*\n" if $VERBOSE;
        return $url->as_string;
    }

    # The $plain_url is a URL without the fragment part
    my $plain_url = $url->clone;
    $plain_url->frag(undef);

    # Check PREFIX, but not for <IMG ...> links
    if ($type ne 'img' and $url->as_string !~ /^\Q$PREFIX/o) {
        print STDERR "*outsider*\n" if $VERBOSE;
        return $url->as_string;
    }

    # Translate URL to lowercase if $TOLOWER defined
    $plain_url = to_lower($plain_url) if (defined $TOLOWER);

    # If we already have it, then there is nothing to be done
    my $seen = $seen{$plain_url->as_string};
    if ($seen) {
        my $frag = $url->frag;
        $seen .= "#$frag" if defined($frag);
        $seen = protect_frag_spaces($seen);
        print STDERR "$seen (again)\n" if $VERBOSE;
        return $seen;
    }

    # Too much or too deep
    if ($depth > $MAX_DEPTH and $type ne 'img') {
        print STDERR "*too deep*\n" if $VERBOSE;
        return $url;
    }
    if ($no_docs > $MAX_DOCS) {
        print STDERR "*too many*\n" if $VERBOSE;
        return $url;
    }

    # Fetch the document
    $no_docs++;
    sleep($SLEEP) if $SLEEP;
    my $req = HTTP::Request->new(GET => $url);
    # Accept: */* works around an IIS 2.0 bug serving GIF/JPG
    $req->header('Accept', '*/*') if (defined $IIS);
    $req->authorization_basic(split(/:/, $AUTH)) if (defined $AUTH);
    if ($referer && !$SUPPRESS_REFERER) {
        if ($req->url->scheme eq 'http') {
            # Do not send a referer that would leak an https URL to an
            # http server (RFC 2616, section 15.1.3)
            $referer = url($referer) unless ref($referer);
            undef $referer if ($referer->scheme || '') eq 'https';
        }
        $req->referer($referer) if $referer;
    }
    my $res = $ua->request($req);

    # Check outcome
    if ($res->is_success) {
        my $doc = $res->content;
        my $ct = $res->content_type;
        my $name = find_name($res->request->url, $ct);
        print STDERR "$name\n" unless $QUIET;
        $seen{$plain_url->as_string} = $name;

        # If the file is HTML, then we look for internal links
        if ($ct eq "text/html") {
            # Save an unprocessed version of the HTML document.  This
            # both reserves the name used, and it also ensures that we
            # don't lose everything if this program is killed before
            # we finish.
            save($name, $doc);
            my $base = $res->base;

            # Follow and substitute links...
            $doc =~
s/
  (
    <(img|a|body|area|frame|td)\b   # some interesting tag
    [^>]+                           # still inside tag (not strictly correct)
    \b(?:src|href|background)       # some link attribute
    \s*=\s*                         # =
  )
  (?:                               # scan between quotes
    (")([^"]*)"                     # double-quoted link
      |
    (')([^']*)'                     # single-quoted link
      |
    ([^\s>]+)                       # unquoted link
  )
/
 new_link($1, lc($2), $3||$5, HTML::Entities::decode($4||$6||$7),
          $base, $name, "$url", $depth+1)
/giex;
        }
        save($name, $doc);
        return $name;
    }
    else {
        print STDERR $res->code . " " . $res->message . "\n" if $VERBOSE;
        $seen{$plain_url->as_string} = $url->as_string;
        return $url->as_string;
    }
}
# Callback for the link-rewriting substitution in fetch(): download the
# linked document and return the original attribute text ($pre + quote)
# with the URL replaced by a (possibly relative) local file name.
sub new_link
{
    my ($pre, $type, $quote, $url, $base, $localbase, $referer, $depth) = @_;

    my $link = protect_frag_spaces($url);

    # Fetch the absolute form of the link; fetch() returns the local name
    # (or the original URL when the document was skipped).
    $link = fetch(url($link, $base)->abs, $type, $referer, $depth);

    # Relativize the result against the saving document's location unless
    # it is already an absolute scheme:URL.
    unless ($link =~ /^[.+\-\w]+:/) {
        $link = url("file:$link", "file:$localbase")->rel;
    }

    $link = unprotect_frag_spaces($link);

    return join('', $pre, $quote, $link, $quote);
}
# Percent-encode the spaces in the fragment part of a URL (string or
# URI::URL object) so the fragment survives later URL manipulation.
# The part before the '#' is left untouched.  Returns a string.
sub protect_frag_spaces
{
    my ($url) = @_;

    $url = $url->as_string if (ref($url));

    if ($url =~ m/^([^#]*)(#.*)$/)
    {
      my ($base, $frag) = ($1, $2);
      $frag =~ s/ /%20/g;
      $url = $base . $frag;
    }

    return $url;
}
# Inverse of protect_frag_spaces(): turn %20 back into spaces, but only
# in the fragment part of the URL (string or URI::URL object).
# Returns a string.
sub unprotect_frag_spaces
{
    my ($url) = @_;

    $url = $url->as_string if (ref($url));

    if ($url =~ m/^([^#]*)(#.*)$/)
    {
      my ($base, $frag) = ($1, $2);
      $frag =~ s/%20/ /g;
      $url = $base . $frag;
    }

    return $url;
}
# IIS workaround (--iis): translate backslashes and %5C escapes to
# forward slashes in the non-fragment part of a URL (string or URI::URL
# object).  The fragment is preserved unchanged.  Returns a string.
sub fix_backslashes
{
    my ($url) = @_;
    my ($base, $frag);

    $url = $url->as_string if (ref($url));

    if ($url =~ m/([^#]+)(#.*)/)
    {
      ($base, $frag) = ($1, $2);
    }
    else
    {
      $base = $url;
      $frag = "";
    }

    $base =~ tr/\\/\//;          # literal backslashes -> slashes
    $base =~ s/%5[cC]/\//g;      # percent-encoded backslashes too

    return $base . $frag;
}
# Lowercase a URL (--tolower, for case-insensitive IIS servers), leaving
# any #fragment untouched.  Accepts a string or a URI::URL object and
# returns the same kind it was given.
sub to_lower
{
    my ($url) = @_;
    my $was_object = 0;

    if (ref($url))
    {
      $url = $url->as_string;
      $was_object = 1;
    }

    if ($url =~ m/([^#]+)(#.*)/)
    {
      # Lowercase only the part before the fragment
      $url = lc($1) . $2;
    }
    else
    {
      $url = lc($url);
    }

    if ($was_object == 1)
    {
      return url($url);
    }
    else
    {
      return $url;
    }
}
# --nospace support: in the non-fragment part of a URL (string or
# URI::URL object), strip leading/trailing spaces and turn every
# remaining space (literal or %20) into an underscore.  The fragment is
# preserved unchanged.  Returns a string.
sub translate_spaces
{
    my ($url) = @_;
    my ($base, $frag);

    $url = $url->as_string if (ref($url));

    if ($url =~ m/([^#]+)(#.*)/)
    {
      ($base, $frag) = ($1, $2);
    }
    else
    {
      $base = $url;
      $frag = "";
    }

    $base =~ s/^ *//;    # Remove initial spaces from base
    $base =~ s/ *$//;    # Remove trailing spaces from base

    $base =~ tr/ /_/;
    $base =~ s/%20/_/g;

    return $base . $frag;
}
# mkdir -p equivalent: create $directory and any missing parents with
# $mode.  Returns true only if every mkdir that was attempted succeeded
# (components that already exist are skipped).
sub mkdirp
{
    my ($directory, $mode) = @_;

    my @segments = split(/\//, $directory);
    my $so_far   = shift(@segments);
    my $ok       = 1;

    $ok &&= mkdir($so_far, $mode) if !-d $so_far;

    for my $segment (@segments) {
        $so_far .= "/$segment";
        $ok &&= mkdir($so_far, $mode) if !-d $so_far;
    }

    return $ok;
}
# Map a URL to a unique local file name.  Applies the --nospace and
# --tolower translations, honors the --hier directory layout and the
# %KEEPEXT suffix preservation, and appends a "001", "002", ... counter
# when the natural name is already taken on disk.
sub find_name
{
    my ($url, $type) = @_;

    $url = translate_spaces($url) if (defined $NOSPACE);
    $url = to_lower($url)         if (defined $TOLOWER);
    $url = url($url) unless ref($url);

    my $path = $url->path;

    # Split the directory part off the path
    $path =~ s|(.*/)||;
    my $dirname = ".$1";
    if (!$HIER) {
        # flat layout: everything goes into the current directory
        $dirname = "";
    }
    elsif (!-d $dirname) {
        mkdirp($dirname, 0775);
    }

    my $extra = "";   # uniquifying counter, empty on the first try
    my $suffix;
    if ($KEEPEXT{lc($type)}) {
        # keep whatever extension the URL already had
        $suffix = ($path =~ m/\.(.*)/) ? $1 : "";
    }
    else {
        $suffix = media_suffix($type);
    }

    $path =~ s|\..*||;                 # trim old suffix
    $path = "index" unless length $path;

    while (1) {
        # Construct a candidate file name
        my $candidate = $dirname . $path . $extra;
        $candidate .= ".$suffix" if $suffix;

        # Use it if no file of that name exists yet
        return $candidate unless -f $candidate;

        # Otherwise start or (string-)increment the counter
        unless ($extra) {
            $extra = "001";
            next;
        }
        $extra++;
    }
}
# Write the raw document content ($_[1] after the shift) to the file
# $name.  Dies if the file cannot be opened or if buffered data cannot
# be flushed at close (write errors often only surface at close time).
sub save
{
    my $name = shift;
    # Three-argument open with a lexical handle (the old two-argument
    # bareword form was vulnerable to mode injection via $name).
    open(my $fh, '>', $name) || die "Can't save $name: $!";
    binmode $fh;
    print {$fh} $_[0];
    close($fh) || die "Can't save $name: $!";
}
# Print the usage summary and terminate with a non-zero exit status.
# The "exit 1" runs after the here-doc has been printed; the here-doc
# (<<"") is terminated by the empty line just before the closing brace.
sub usage
{
    print <<""; exit 1;
Usage: $progname [options] <URL>
Allowed options are:
  --auth=USER:PASS  Set authentication credentials for web site
  --depth=N         Maximum depth to traverse (default is $MAX_DEPTH)
  --hier            Download into hierarchy (not all files into cwd)
  --referer=URI     Set initial referer header (or "NONE")
  --iis             Workaround IIS 2.0 bug by sending "Accept: */*" MIME
                    header; translates backslashes (\\) to forward slashes (/)
  --keepext=type    Keep file extension for MIME types (comma-separated list)
  --limit=N         A limit on the number documents to get (default is $MAX_DOCS)
  --nospace         Translate spaces URLs (not #fragments) to underscores (_)
  --version         Print version number and quit
  --verbose         More output
  --quiet           No output
  --sleep=SECS      Sleep between gets, ie. go slowly
  --prefix=PREFIX   Limit URLs to follow to those which begin with PREFIX
  --tolower         Translate all URLs to lowercase (useful with IIS servers)

}