—#!/usr/bin/perl
use
strict;
use
warnings;
###################################################################################
#
# Copyright (C) 2001 Bill Moseley swishscript@hank.org
# Program to test the SWISH::Filter module
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version
# 2 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# The above lines must remain at the top of this program
#
#
#
# 5 July 2006 - perl@peknet.com added --ignore_filters option
# 13 July 2006 - perl@peknet.com added metadata dump
#
####################################################################################
use
Getopt::Long;
use
SWISH::Filter;
use
Pod::Usage;
use
URI;
use
Data::Dumper;
my
$version
=
'0.02'
;
my
(
$verbose
,
$show_content
,
@file
,
@url
,
$help
,
$man
,
$quiet
,
$headers
,
$path
,
$depreciated
,
$mimetypes
);
my
$skip_binary
= 1;
my
$lines
= 10;
my
$max_chars
= 1000;
my
@ignore_filters
= ();
GetOptions(
'verbose!'
=> \
$verbose
,
# turn on INFO messages
'content!'
=> \
$show_content
,
'quiet'
=> \
$quiet
,
'lines=i'
=> \
$lines
,
'help|?'
=> \
$help
,
'man'
=> \
$man
,
'headers'
=> \
$headers
,
'skip_binary!'
=> \
$skip_binary
,
'path'
=> \
$path
,
'depreciated'
=> \
$depreciated
,
'mimetypes'
=> \
$mimetypes
,
'ignore_filters=s@'
=> \
@ignore_filters
) || pod2usage(2);
pod2usage(
-verbose
=> 1)
if
$help
;
pod2usage(
-verbose
=> 2)
if
$man
;
if
(
$path
)
{
'/usr/local/lib/swish-e/perl'
,
"\n"
;
exit
;
}
pod2usage(
-verbose
=> 0,
-message
=>
"Must specify a file or URL"
,
-exitvar
=> 1,
)
unless
@ARGV
or
$mimetypes
;
$ENV
{FILTER_DEBUG} = 1
if
$verbose
;
# used by SWISH::Filter
msg(INFO,
"SWISH::Filter found at [%s]\n"
,
$INC
{
'SWISH/Filter.pm'
});
my
$filter
= SWISH::Filter->new(
ignore_filters
=> \
@ignore_filters
);
if
(
$mimetypes
)
{
my
@filters
=
$filter
->filter_list;
"Mimetypes:\n\n"
;
for
my
$filter
(
@filters
)
{
" $filter:\n"
;
for
my
$pattern
(
$filter
->mimetypes)
{
" $pattern\n"
;
}
"\n"
;
}
}
my
$return
= 0;
for
my
$doc
(
@ARGV
)
{
eval
{
$depreciated
? process_doc_old(
$doc
) : process_doc(
$doc
) };
$return
= 1
if
$@;
warn
"** $0:\n $@\n"
if
$@;
# always warn on die
}
exit
$return
;
sub
process_doc
{
my
(
$file
) =
@_
;
my
$uri
;
eval
{
$uri
= URI->new(
$file
) };
my
%config
= !$@ &&
$uri
->scheme ? fetch_url(
$file
) : fetch_file(
$file
);
my
$doc
=
$filter
->convert(
%config
,
name
=>
$file
,
meta_data
=> {
'convertedBy'
=>
'swish-filter-test'
}
);
die
"Failed to process document [$file]\n"
unless
$doc
;
my
$content_type
=
$doc
->content_type ||
"unknown"
;
my
$parser_type
=
$doc
->swish_parser_type ||
''
;
my
$binary
=
$doc
->is_binary;
my
$msg
=
$doc
->was_filtered ?
''
:
'not'
;
my
$name
=
$doc
->name;
msg(DEBUG,
<<EOF );
Document $file was $msg filtered.
Document: $file ($name)
Content-Type: $content_type
Parser type: $parser_type
EOF
if
(
my
$filters_used
=
$doc
->filters_used)
{
for
my
$filter
(
@$filters_used
)
{
my
$class
=
ref
(
$filter
->{name});
msg(DEBUG,
" >Filter used: $class ( $filter->{start_content_type} -> $filter->{end_content_type} )"
);
msg(DEBUG,
" >Metadata: "
. Dumper(
$doc
->meta_data ||
'none'
));
}
}
if
(!
$binary
)
{
my
@doc
=
split
/\n/,
substr
(${
$doc
->fetch_doc}, 0,
$max_chars
);
my
$toshow
=
$lines
;
$toshow
=
scalar
(
@doc
) - 1
if
$toshow
>=
@doc
;
msg(INFO,
join
"\n"
,
'-- Output Content Sample --'
,
@doc
[0 ..
$toshow
],
''
,
'-- end --'
,
''
);
}
die
"Skipping binary [$file]\n"
if
$binary
&&
$skip_binary
;
if
(
$headers
)
{
my
$len
=
length
${
$doc
->fetch_doc};
"Path-Name: $file\nContent-Length: $len\n"
;
"Document-Type: $parser_type\n"
if
$parser_type
;
"\n"
;
}
${
$doc
->fetch_doc}
if
$show_content
;
}
sub
process_doc_old
{
my
(
$file
) =
@_
;
my
$uri
;
eval
{
$uri
= URI->new(
$file
) };
my
%config
= !$@ &&
$uri
->scheme ? fetch_url(
$file
) : fetch_file(
$file
);
my
$was_filtered
=
$filter
->filter(
%config
,
name
=>
$file
,);
my
$content_type
=
$filter
->content_type ||
"unknown"
;
my
$orig_content_type
=
$filter
->original_content_type ||
"unknown"
;
my
$parser_type
=
$filter
->swish_parser_type ||
''
;
my
$binary
=
$content_type
!~ m[^text/];
my
$msg
=
$was_filtered
?
''
:
'not'
;
msg(DEBUG,
<<EOF );
Document $file was $msg filtered.
Document: $file
Content-Type: $content_type (initial was $orig_content_type)
Parser type: $parser_type
EOF
if
(!
$binary
)
{
my
@doc
=
split
/\n/,
substr
(${
$filter
->fetch_doc}, 0,
$max_chars
);
$lines
=
@doc
- 1
if
$lines
>=
@doc
;
msg(INFO,
join
"\n"
,
'-- Output Content Sample --'
,
@doc
[0 ..
$lines
],
''
,
'-- end --'
,
''
);
}
die
"Skipping binary [$file]\n"
if
$binary
&&
$skip_binary
;
if
(
$headers
)
{
my
$len
=
length
${
$filter
->fetch_doc};
"Path-Name: $file\nContent-Length: $len\n"
;
"Document-Type: $parser_type\n"
if
$parser_type
;
"\n"
;
}
${
$filter
->fetch_doc}
if
$show_content
;
}
sub
fetch_file
{
my
$file
=
shift
;
# just try to open for error reporting
open
FH,
"<$file"
or
die
"Failed to open '$file': $!\n"
;
close
FH;
die
"File '$file' has zero size\n"
if
-z
$file
;
return
(
document
=>
$file
);
}
sub
fetch_url
{
my
$url
=
shift
;
die
"LWP::UserAgent is required to fetch a URL\n$@\n"
if
$@;
my
$ua
= LWP::UserAgent->new;
my
$request
= HTTP::Request->new(
'GET'
,
$url
);
my
$response
=
$ua
->request(
$request
);
die
sprintf
"Error while getting '%s'. Server returned %s."
,
$response
->request->uri,
$response
->status_line
unless
$response
->is_success;
my
$content
=
$response
->content;
my
$content_type
=
$response
->content_type;
return
(
document
=> \
$content
,
content_type
=>
$content_type
,
);
}
sub
msg
{
my
$msg_level
=
shift
;
return
if
$quiet
;
return
if
!
$verbose
&&
$msg_level
> DEBUG;
printf
(STDERR
@_
),
STDERR
"\n"
;
}
__END__
=head1 NAME
swish-filter-test - program to test the SWISH::Filter module.
=head1 SYNOPSIS
swish-filter-test [options] <file or url> <...>
Options:
-quiet don't generate messages to stderr
-content output content to stdout
-(no)skip_binary skip output of binary files (default)
-lines <num> Number of lines of content to display to stderr if verbose
-headers output with headers for swish-e -S prog method
-verbose enable $ENV{FILTER_DEBUG} for verbose output
-path output @INC path to SWISH::Filter module
-help brief help message
-man full documentation
-ignore <filter> ignore <filter> in selecting a filter to use
=head1 DESCRIPTION
swish-filter-test is a program to test the Perl module SWISH::Filter.
SWISH::Filter is a module that is included with the swish-e distribution.
Documents supplied to this script (as a URL or a plain file) on the command line
are passed to the SWISH::Filter module. This is useful for testing filters.
The SWISH::Filter module works by checking a document's content-type and looking
for an available filter. Most filters require additional helper programs (e.g.
the filter to convert PDF to HTML requires the Xpdf package). Using the -verbose
options should indicate if a required program is missing.
Options to this script control how much output is generated. Options can also be specified
to generate output that can be piped directly to swish-e (see C<-headers> below).
All messages are sent to stderr unless --content or -headers are specified and then content
is sent to stdout.
=head1 OPTIONS
=over 8
=item B<-quiet>
Don't write info out to stderr. Normally not useful unless you just want to filter
a document and not really test the SWISH::Filter module.
Fatal errors are written to
stderr error regardless of the -quiet option.
=item B<-verbose>
Enables FILTER_DEBUG mode in the SWISH::Filter module, and enables extra info
including a summary of the filtered document to stderr. Enable if trying to find
out why a filter does not work.
=item B<-lines>
Number of summary lines of summary content to show. Summary lines are only showed if -verbose
is selected. Lines are sent to stderr, not stdout.
Note, the summary is limited to 1000 characters regardless of this option.
=item B<-content>
Specifying -content will output full content to stdout. The default is to only display
a few lines.
=item B<-(no)skip_binary>
The default is to not output content from binary files. -noskip_binary will disable this.
=item B<-headers>
Prints the headers used by swish-e when reading documents with -S prog. This can be used to
filter documents directly to swish-e:
swish-filter-test -headers -content http://localhost/ test.pdf | swish-e -S prog -i stdin -v1
=item B<-path>
Prints the installed location of the SWISH::Filter parent directory for use in PERL5LIB,
Allows using SWISH::Filter in other programs, or with the Swish-e -S http method with
swishspider.
For example:
PERL5LIB=`swish-filter-test -path` swish-e -S http -i http://localhost
=item B<-help>
Print a brief help message and exits.
=item B<-man>
Prints the manual page and exits.
=back
=cut