The Perl Toolchain Summit 2025 Needs You: You can help 🙏 Learn more

#!/usr/bin/perl
use strict;
###################################################################################
#
# Copyright (C) 2001 Bill Moseley swishscript@hank.org
# Program to test the SWISH::Filter module
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version
# 2 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# The above lines must remain at the top of this program
#
#
#
# 5 July 2006 - perl@peknet.com added --ignore_filters option
# 13 July 2006 - perl@peknet.com added metadata dump
#
####################################################################################
use URI;
use constant ABORT => 0;
use constant DEBUG => 1;
use constant INFO => 2;
my $version = '0.02';
my (
$verbose, $show_content, @file, @url,
$help, $man, $quiet, $headers,
$path, $depreciated, $mimetypes
);
my $skip_binary = 1;
my $lines = 10;
my $max_chars = 1000;
my @ignore_filters = ();
GetOptions(
'verbose!' => \$verbose, # turn on INFO messages
'content!' => \$show_content,
'quiet' => \$quiet,
'lines=i' => \$lines,
'help|?' => \$help,
'man' => \$man,
'headers' => \$headers,
'skip_binary!' => \$skip_binary,
'path' => \$path,
'depreciated' => \$depreciated,
'mimetypes' => \$mimetypes,
'ignore_filters=s@' => \@ignore_filters
) || pod2usage(2);
pod2usage(-verbose => 1) if $help;
pod2usage(-verbose => 2) if $man;
if ($path)
{
print '/usr/local/lib/swish-e/perl', "\n";
exit;
}
pod2usage(
-verbose => 0,
-message => "Must specify a file or URL",
-exitvar => 1,
)
unless @ARGV or $mimetypes;
$ENV{FILTER_DEBUG} = 1 if $verbose; # used by SWISH::Filter
msg(INFO, "SWISH::Filter found at [%s]\n", $INC{'SWISH/Filter.pm'});
my $filter = SWISH::Filter->new(ignore_filters => \@ignore_filters);
if ($mimetypes)
{
my @filters = $filter->filter_list;
print "Mimetypes:\n\n";
for my $filter (@filters)
{
print " $filter:\n";
for my $pattern ($filter->mimetypes)
{
print " $pattern\n";
}
print "\n";
}
}
my $return = 0;
for my $doc (@ARGV)
{
eval { $depreciated ? process_doc_old($doc) : process_doc($doc) };
$return = 1 if $@;
warn "** $0:\n $@\n" if $@; # always warn on die
}
exit $return;
sub process_doc
{
my ($file) = @_;
my $uri;
eval { $uri = URI->new($file) };
my %config = !$@ && $uri->scheme ? fetch_url($file) : fetch_file($file);
my $doc =
$filter->convert(
%config,
name => $file,
meta_data => {'convertedBy' => 'swish-filter-test'}
);
die "Failed to process document [$file]\n" unless $doc;
my $content_type = $doc->content_type || "unknown";
my $parser_type = $doc->swish_parser_type || '';
my $binary = $doc->is_binary;
my $msg = $doc->was_filtered ? '' : 'not';
my $name = $doc->name;
msg(DEBUG, <<EOF );
Document $file was $msg filtered.
Document: $file ($name)
Content-Type: $content_type
Parser type: $parser_type
EOF
if (my $filters_used = $doc->filters_used)
{
for my $filter (@$filters_used)
{
my $class = ref($filter->{name});
msg(DEBUG,
" >Filter used: $class ( $filter->{start_content_type} -> $filter->{end_content_type} )"
);
msg(DEBUG, " >Metadata: " . Dumper($doc->meta_data || 'none'));
}
}
if (!$binary)
{
my @doc = split /\n/, substr(${$doc->fetch_doc}, 0, $max_chars);
my $toshow = $lines;
$toshow = scalar(@doc) - 1 if $toshow >= @doc;
msg(INFO, join "\n",
'-- Output Content Sample --',
@doc[0 .. $toshow],
'', '-- end --', '');
}
die "Skipping binary [$file]\n" if $binary && $skip_binary;
if ($headers)
{
my $len = length ${$doc->fetch_doc};
print "Path-Name: $file\nContent-Length: $len\n";
print "Document-Type: $parser_type\n" if $parser_type;
print "\n";
}
print ${$doc->fetch_doc} if $show_content;
}
sub process_doc_old
{
my ($file) = @_;
my $uri;
eval { $uri = URI->new($file) };
my %config = !$@ && $uri->scheme ? fetch_url($file) : fetch_file($file);
my $was_filtered = $filter->filter(%config, name => $file,);
my $content_type = $filter->content_type || "unknown";
my $orig_content_type = $filter->original_content_type || "unknown";
my $parser_type = $filter->swish_parser_type || '';
my $binary = $content_type !~ m[^text/];
my $msg = $was_filtered ? '' : 'not';
msg(DEBUG, <<EOF );
Document $file was $msg filtered.
Document: $file
Content-Type: $content_type (initial was $orig_content_type)
Parser type: $parser_type
EOF
if (!$binary)
{
my @doc = split /\n/, substr(${$filter->fetch_doc}, 0, $max_chars);
$lines = @doc - 1 if $lines >= @doc;
msg(INFO, join "\n",
'-- Output Content Sample --',
@doc[0 .. $lines],
'', '-- end --', '');
}
die "Skipping binary [$file]\n" if $binary && $skip_binary;
if ($headers)
{
my $len = length ${$filter->fetch_doc};
print "Path-Name: $file\nContent-Length: $len\n";
print "Document-Type: $parser_type\n" if $parser_type;
print "\n";
}
print ${$filter->fetch_doc} if $show_content;
}
sub fetch_file
{
my $file = shift;
# just try to open for error reporting
open FH, "<$file" or die "Failed to open '$file': $!\n";
close FH;
die "File '$file' has zero size\n" if -z $file;
return (document => $file);
}
sub fetch_url
{
my $url = shift;
eval { require LWP::UserAgent };
die "LWP::UserAgent is required to fetch a URL\n$@\n" if $@;
my $ua = LWP::UserAgent->new;
my $request = HTTP::Request->new('GET', $url);
my $response = $ua->request($request);
die sprintf "Error while getting '%s'. Server returned %s.",
$response->request->uri, $response->status_line
unless $response->is_success;
my $content = $response->content;
my $content_type = $response->content_type;
return (
document => \$content,
content_type => $content_type,
);
}
sub msg
{
my $msg_level = shift;
return if $quiet;
return if !$verbose && $msg_level > DEBUG;
printf(STDERR @_), print STDERR "\n";
}
__END__
=head1 NAME
swish-filter-test - program to test the SWISH::Filter module.
=head1 SYNOPSIS
swish-filter-test [options] <file or url> <...>
Options:
-quiet don't generate messages to stderr
-content output content to stdout
-(no)skip_binary skip output of binary files (default)
-lines <num> Number of lines of content to display to stderr if verbose
-headers output with headers for swish-e -S prog method
-verbose enable $ENV{FILTER_DEBUG} for verbose output
-path output @INC path to SWISH::Filter module
-help brief help message
-man full documentation
-ignore <filter> ignore <filter> in selecting a filter to use
=head1 DESCRIPTION
swish-filter-test is a program to test the Perl module SWISH::Filter.
SWISH::Filter is a module that is included with the swish-e distribution.
Documents supplied to this script (as a URL or a plain file) on the command line
are passed to the SWISH::Filter module. This is useful for testing filters.
The SWISH::Filter module works by checking a document's content-type and looking
for an available filter. Most filters require additional helper programs (e.g.
the filter to convert PDF to HTML requires the Xpdf package). Using the -verbose
options should indicate if a required program is missing.
Options to this script control how much output is generated. Options can also be specified
to generate output that can be piped directly to swish-e (see C<-headers> below).
All messages are sent to stderr unless --content or -headers are specified and then content
is sent to stdout.
=head1 OPTIONS
=over 8
=item B<-quiet>
Don't write info out to stderr. Normally not useful unless you just want to filter
a document and not really test the SWISH::Filter module.
Fatal errors are written to
stderr error regardless of the -quiet option.
=item B<-verbose>
Enables FILTER_DEBUG mode in the SWISH::Filter module, and enables extra info
including a summary of the filtered document to stderr. Enable if trying to find
out why a filter does not work.
=item B<-lines>
Number of summary lines of summary content to show. Summary lines are only showed if -verbose
is selected. Lines are sent to stderr, not stdout.
Note, the summary is limited to 1000 characters regardless of this option.
=item B<-content>
Specifying -content will output full content to stdout. The default is to only display
a few lines.
=item B<-(no)skip_binary>
The default is to not output content from binary files. -noskip_binary will disable this.
=item B<-headers>
Prints the headers used by swish-e when reading documents with -S prog. This can be used to
filter documents directly to swish-e:
swish-filter-test -headers -content http://localhost/ test.pdf | swish-e -S prog -i stdin -v1
=item B<-path>
Prints the installed location of the SWISH::Filter parent directory for use in PERL5LIB,
Allows using SWISH::Filter in other programs, or with the Swish-e -S http method with
swishspider.
For example:
PERL5LIB=`swish-filter-test -path` swish-e -S http -i http://localhost
=item B<-help>
Print a brief help message and exits.
=item B<-man>
Prints the manual page and exits.
=back
=cut