
#!perl
use strict;
use warnings;
use autodie;
use feature qw(say);
use File::Find::Rule;
use File::Slurp;
use File::Spec;
use IO::File;
use List::Util qw(sum);
use LWP::UserAgent;
use Net::FTP;
use Parallel::Fork::BossWorkerAsync;
use Term::ProgressBar::Simple;
use URI;
use URI::Find::Simple;
$| = 1;
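
# Build a lookup of URIs to skip, taken from the __DATA__ section at
# the bottom of this file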
my %ignore;
while ( my $line = <main::DATA> ) {
    chomp $line;
    next if $line =~ /^#/;
    next unless $line;
    $ignore{$line} = 1;
}
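
# Shared HTTP client for all checks; hostname verification is skipped
# because we only care whether the link resolves, and env_proxy honours
# any proxy settings in the environment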
my $ua = LWP::UserAgent->new(ssl_opts => { verify_hostname => 0 });
$ua->timeout(58);
$ua->env_proxy;
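
# Check the files named on the command line, or every file in the tree
# apart from the .git directory if none were given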
my @filenames = @ARGV;
@filenames = sort grep { $_ !~ /^\.git/ } File::Find::Rule->new->file->in('.')
    unless @filenames;
my $total_bytes = sum map {-s} @filenames;

my $extract_progress = Term::ProgressBar::Simple->new(
    {   count => $total_bytes,
        name  => 'Extracting URIs',
    }
);
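
# Pull every http/ftp URI out of each file, recording which files
# mention it; known-bad or not-worth-hitting URIs are skipped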
my %uris;
foreach my $filename (@filenames) {
    next if $filename =~ /uris\.txt/;
    next if $filename =~ /check_uris/;
    next if $filename =~ /\.patch$/;
    next if $filename =~ m{cpan/Pod-Simple/t/perlfaqo?\.pod};
    next if $filename =~ /checkURL\.pl$/;
    my $contents = File::Slurp::read_file($filename);
    my @uris     = URI::Find::Simple::list_uris($contents);
    foreach my $uri (@uris) {
        next unless $uri =~ /^(http|ftp)/;
        next if $ignore{$uri};

        # no need to hit rt.perl.org
        next
            if $uri =~
            m{^https?://rt\.perl\.org/(?:rt3/)?Ticket/Display\.html\?id=\d+$};

        # no need to hit github
        next
            if $uri =~
            m{^https?://(?:www\.)?github\.com/[pP]erl/perl5/issues/\d+$};

        # no need to hit rt.cpan.org
        next
            if $uri =~
            m{^https?://rt\.cpan\.org/Public/Bug/Display\.html\?id=\d+$};

        # no need to hit google groups (weird redirect LWP does not like)
        next if $uri =~ m{^http://groups\.google\.com/};

        push @{ $uris{$uri} }, $filename;
    }
    $extract_progress += -s $filename;
}
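
# Farm the checking out to 20 parallel workers; each worker handles one
# URI at a time via work_alarmed() below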
my $bw = Parallel::Fork::BossWorkerAsync->new(
    work_handler   => \&work_alarmed,
    global_timeout => 120,
    worker_count   => 20,
);
foreach my $uri ( keys %uris ) {
    my @filenames = @{ $uris{$uri} };
    $bw->add_work( { uri => $uri, filenames => \@filenames } );
}
undef $extract_progress;

my $fetch_progress = Term::ProgressBar::Simple->new(
    {   count => scalar( keys %uris ),
        name  => 'Fetching URIs',
    }
);
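
# Collect the results as the workers finish, recording each failed URI
# against every file that mentions it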
my %filenames;
while ( $bw->pending() ) {
    my $response   = $bw->get_result();
    my $uri        = $response->{uri};
    my @filenames  = @{ $response->{filenames} };
    my $is_success = $response->{is_success};
    my $message    = $response->{message};
    unless ($is_success) {
        foreach my $filename (@filenames) {
            push @{ $filenames{$filename} },
                { uri => $uri, message => $message };
        }
    }
    $fetch_progress++;
}
$bw->shut_down();
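
# Write the report: one block per file, listing each broken URI and the
# error it produced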
my $fh = IO::File->new('> uris.txt');
foreach my $filename ( sort keys %filenames ) {
    $fh->say("* $filename");
    my @bits = @{ $filenames{$filename} };
    foreach my $bit (@bits) {
        my $uri     = $bit->{uri};
        my $message = $bit->{message};
        $fh->say("    $uri");
        $fh->say("    $message");
    }
}
$fh->close;
say 'Finished, see uris.txt';
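
# Wrap work() in a 60-second alarm so one hung connection can not stall
# a worker indefinitely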
sub work_alarmed {
    my $conf = shift;
    eval {
        local $SIG{ALRM} = sub { die "alarm\n" };    # NB: \n required
        alarm 60;
        $conf = work($conf);
        alarm 0;
    };
    if ($@) {
        $conf->{is_success} = 0;
        $conf->{message}    = 'Timed out';
    }
    return $conf;
}
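
# Check one URI: a HEAD request for http(s) links, or an anonymous FTP
# session that sizes the file or lists the directory for ftp links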
sub work {
    my $conf      = shift;
    my $uri       = $conf->{uri};
    my @filenames = @{ $conf->{filenames} };
    if ( $uri =~ /^http/ ) {

        # drop any #fragment: it is resolved client-side, so the server
        # never sees it
        my $uri_without_fragment = URI->new($uri);
        my $fragment             = $uri_without_fragment->fragment(undef);
        my $response             = $ua->head($uri_without_fragment);
        $conf->{is_success} = $response->is_success;
        $conf->{message}    = $response->status_line;
        return $conf;
    } else {
        my $uri_object = URI->new($uri);
        my $host       = $uri_object->host;
        my $path       = $uri_object->path;
        my ( $volume, $directories, $filename )
            = File::Spec->splitpath($path);
        my $ftp = Net::FTP->new( $host, Passive => 1, Timeout => 60 );
        unless ($ftp) {
            $conf->{is_success} = 0;
            $conf->{message}    = "Can not connect to $host: $@";
            return $conf;
        }
        my $can_login = $ftp->login( "anonymous", '-anonymous@' );
        unless ($can_login) {
            $conf->{is_success} = 0;
            $conf->{message}    = 'Can not login: ' . $ftp->message;
            return $conf;
        }
        my $can_binary = $ftp->binary();
        unless ($can_binary) {
            $conf->{is_success} = 0;
            $conf->{message}    = 'Can not binary: ' . $ftp->message;
            return $conf;
        }
        my $can_cwd = $ftp->cwd($directories);
        unless ($can_cwd) {
            $conf->{is_success} = 0;
            $conf->{message} = "Can not cwd to $directories: " . $ftp->message;
            return $conf;
        }
        if ($filename) {
            my $can_size = $ftp->size($filename);
            unless ($can_size) {
                $conf->{is_success} = 0;
                $conf->{message} = "Can not size $filename in $directories: "
                    . $ftp->message;
                return $conf;
            }
        } else {
            my ($can_dir) = $ftp->dir;
            unless ($can_dir) {
                my ($can_ls) = $ftp->ls;
                unless ($can_ls) {
                    $conf->{is_success} = 0;
                    $conf->{message} = "Can not dir or ls in $directories: "
                        . $ftp->message;
                    return $conf;
                }
            }
        }
        $conf->{is_success} = 1;
        return $conf;
    }
}
__DATA__
# these are fine but give errors
# these are missing, sigh
http://use.perl.org/~autrijus/journal/25768
# these are URI extraction bugs
ftp:passive-mode
ftp:
http:[-
# these are used as an example
http://www.cs.vu.nl/~tmgil/vi.html
http://w4.lns.cornell.edu/~pvhp/ptk/ptkTOC.html
http://[::]:1024/
http://[www.json::pp.org]/
# these are used to generate or match URLs
ftp://(.*?)/(.*)/(.*
http://$host/
ftp:/
http://$addr/mark?commit=$
http:/
ftp:%5Cn$url
# weird redirects that LWP doesn't like
# broken webserver that doesn't like HEAD requests
# these have been reported upstream to CPAN authors
http://www.debian.or.jp/~kubota/unicode-symbols.html.en
http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html
http://www.csse.monash.edu.au/~damian/CPAN/Filter-Simple.tar.gz
http://www.plover.com/~mjd/perl/Memoize/
http://www.plover.com/~mjd/perl/MiniMemoize/
__END__

=head1 NAME

checkURL.pl - Check that all the URLs in the Perl source are valid

=head1 DESCRIPTION

This program checks that all the URLs in the Perl source are valid. It
checks HTTP and FTP links in parallel and contains a list of known
bad example links in its source. It takes 4 minutes to run on my
machine. The results are written to 'uris.txt' and list the filename,
the URL and the error:

  * ext/Locale-Maketext/lib/Locale/Maketext.pod
    404 Not Found
    ...

It should be run every so often, with broken links fixed and upstream
authors notified.

Note that the web is unstable and some websites are temporarily down.
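
=head1 USAGE

A typical invocation, run from the top of the source tree, is either of
the following (assuming the script lives in F<Porting/>; the file list
in the second line is purely illustrative):

  perl Porting/checkURL.pl
  perl Porting/checkURL.pl README pod/perl.pod

With no arguments it scans every file in the tree apart from F<.git>;
with arguments it checks only the files named.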