dta-cab-analyze.perl - metacpan.org

#!/usr/bin/perl -w

use lib '.', 'MUDL';
use DTA::CAB;
use DTA::CAB::Utils ':all';
use DTA::CAB::Datum ':all';
use DTA::CAB::Queue::Server;
use DTA::CAB::Fork::Pool;
use File::Basename qw(basename dirname);
use File::Path qw(rmtree);
use File::Temp qw();
use File::Copy qw();
use IO::File;
use Getopt::Long qw(:config no_ignore_case);
use Time::HiRes qw(gettimeofday tv_interval);
use Pod::Usage;

#BEGIN { require "./CabOpt.pm"; }
use DTA::CAB::Chain::DTA;##-- DEBUG
#use DTA::CAB::Chain::DE_free;##-- DEBUG
#use DTA::CAB::Analyzer::MootSub; ##-- DEBUG
#use DTA::CAB::Format::TEIws; ##-- DEBUG

use strict;

##==============================================================================
## Constants & Globals
## ==============================================================================

##-- program identity
##  + $prog is the basename of the running script; it prefixes several error messages below
##  + $VERSION is copied from the DTA::CAB module's version
our $prog = basename($0);
our $VERSION = $DTA::CAB::VERSION;

##--------------------------------------------------------------
## Options: Main Process

##-- Options: Main: General
##  + $help and $version are set via globalOptionSpecs()
##  + $verbose is declared here but is not referenced elsewhere in this file
our ($help,$version,$verbose);
#$verbose = 'default';

##-- Options: eval
##  + lists of perl code strings which are string-eval()ed at fixed points of the script:
##    @eval_begin before the analyzer is loaded, @eval_onload after it has been created,
##    and @eval_end just before the program exits
our @eval_begin = qw();
our @eval_onload = qw();
our @eval_end   = qw();

##-- Options: Main: forking options
##  + $qpath defaults to a temporary name containing the process ID ($$);
##    tmpfsfile() is imported, presumably from DTA::CAB::Utils (TODO confirm),
##    and is called in scalar context here
##  + $keeptmp: if true, cleanup() does not unlink the queue
our $njobs   = 0; ##-- default: 0 jobs (process everything in the main thread)
our $qpath   = tmpfsfile("dta_cab_q${$}_XXXX", UNLINK=>1);
our $keeptmp = 0;

##-- Options: Main: logging (see %DTA::CAB::Logger::defaultLogOpts)

##-- Options: Main: analysis
##  + $rcFile: analyzer configuration file (-config); if undefined, a new
##    analyzer of class $analyzeClass is created instead
our $rcFile       = undef;
our $analyzeClass = 'DTA::CAB::Analyzer';

##-- Options: Main: I/O
our $inputList = 0;      ##-- inputs are command-line lists, not filenames (main thread only)

##-- Options: Main: block-wise
##  + block-wise processing is enabled whenever $block_spec is defined (even if empty)
our $block_spec      = undef;     ##-- input block specification; see DTA::CAB::Format::blockOptions()
our %blockOpts       = qw();      ##-- parsed block options
our $logBlockInfo    = 'info';    ##-- log-level for block operations
our $logBlockTrace   = 'none';    ##-- log-level for block trace
our $logBlockProfile = 'none';    ##-- log-level for block profiling

##--------------------------------------------------------------
## Options: Subprocess Options

## %job : job-specific options
##  + these may be overridden per job (e.g. per line of a list-file);
##    a deep copy of the command-line state is kept in $job0
our %job =
  (
    ##-- Options: Child: Analysis
   analyzeOpts => {},
   doProfile   => 1,

   ##-- Options: Child: I/O
   inputClass  => undef,	##-- default parser class
   outputClass => undef,	##-- default format class
   inputWords  => 0,		##-- inputs are words, not filenames
   inputOpts   => {},
   outputOpts  => {level=>0},
   outfmt      => '-',		##-- output format
   ##  + outfmt is a filename template expanded by outfilename(); '-' contains no escapes

   ##-- Options: input (not here)
   input => undef,
  );

##==============================================================================
## Command-line

## %specs = globalOptionSpecs()
##  + returns the Getopt::Long option specifications honored by the main process only
##  + list-file lines are parsed with childOptionSpecs() alone, so none of these
##    options can be overridden per job
##  + every handler stores its value in one of the file-level option globals
sub globalOptionSpecs {
  return
    (
     ##-- General
     'help|h'     => \$help,
     'version|V'  => \$version,
     'module|M=s' => sub {
       ##-- -module MODULE is shorthand for -begin "use MODULE;"
       my (undef,$module) = @_;
       push(@eval_begin, "use $module;");
     },
     'eval-begin|begin|eb=s'   => \@eval_begin,
     'eval-onload|onload|el=s' => \@eval_onload,
     'eval-end|end|ee=s'       => \@eval_end,

     ##-- Parallelization
     'jobs|jn|j=i'                        => \$njobs,
     'job-queue|queue-path|qpath|jq|qp=s' => \$qpath,
     'input-list|il|list|l!'              => \$inputList,
     'keeptmp|keeptemp|keep!'             => \$keeptmp,

     ##-- Block-wise processing
     'block|block-size|bs|b:s' => sub {
       ##-- optional value: a missing (or false) value still enables block mode, with an empty spec
       my (undef,$spec) = @_;
       $block_spec = ($spec || '');
     },
     'noblock|B' => sub {
       ##-- an undefined spec means document-wise processing
       $block_spec = undef;
     },
     'log-block-info|lbi|block-info|bi|log-block|lb=s' => \$logBlockInfo,
     'log-block-trace|block-trace|lbt|bt=s'            => \$logBlockTrace,
     'log-block-profile|lbp|block-profile|bp=s'        => \$logBlockProfile,
     'noblock-info|nobi'    => sub { $logBlockInfo    = 'none'; },
     'noblock-trace|nobt'   => sub { $logBlockTrace   = 'none'; },
     'noblock-profile|nobp' => sub { $logBlockProfile = 'none'; },

     ##-- Analysis
     'configuration|c=s' => \$rcFile,
     'analyzer-class|analyze-class|analysis-class|ac|a=s' => \$analyzeClass,

     ##-- Log4perl stuff: specs are supplied by the logger class
     DTA::CAB::Logger->cabLogOptions('verbose'=>1),
    );
}

## %specs = childOptionSpecs()
##  + returns the Getopt::Long option specifications which may be overridden per job
##  + used both for the command line and for each line of a list-file
##  + every handler writes into the global %job
sub childOptionSpecs {
  return
    (
     ##-- Analysis
     'analyzer-option|analyze-option|analysis-option|ao|aO|O=s' => $job{analyzeOpts},
     'profile|p!' => \$job{doProfile},

     ##-- I/O: generic (sets the input and the output side at once)
     'format-class|fc=s' => sub {
       my (undef,$class) = @_;
       $job{outputClass} = $class;
       $job{inputClass}  = $class;
     },
     'format-option|fo=s%' => sub {
       my (undef,$key,$value) = @_;
       $job{outputOpts}{$key} = $value;
       $job{inputOpts}{$key}  = $value;
     },

     ##-- I/O: input
     'input-format-class|iformat|ifmt|ifc|input-class|ic|parser-class|pc=s' => \$job{inputClass},
     'input-format-option|ifoption|ifo|input-option|io|parser-option|po=s'  => $job{inputOpts},
     'tokens|t|words|w!' => \$job{inputWords},

     ##-- I/O: output
     'output-format-class|oformat|ofmt|ofc|output-class|oc=s' => \$job{outputClass},
     'output-format-option|ofoption|ofo|output-option|oo=s'   => $job{outputOpts},
     'output-level|ol|format-level|fl=s'    => \$job{outputOpts}{level},
     'output-format|output-file|output|o=s' => \$job{outfmt},
    );
}

##-- parse the command line
##  + GetOptions() returns false if the command line could not be parsed
##    (e.g. unknown option, missing or malformed option value); in that case
##    print a usage summary and exit with status 2 instead of silently
##    continuing with a partially applied configuration
GetOptions(globalOptionSpecs(), childOptionSpecs())
  or pod2usage({-exitval=>2, -verbose=>0, -msg=>"$prog: could not parse command-line options"});

##-- -version: print version information and exit successfully
if ($version) {
  print cab_version;
  exit(0);
}

#pod2usage({-exitval=>0, -verbose=>1}) if ($man);
##-- -help: print the short usage summary and exit successfully
pod2usage({-exitval=>0, -verbose=>0}) if ($help);

##==============================================================================
## MAIN: Initialize (main thread only)
##==============================================================================

##-- main: init: globals
##  + $ifmt, $ofmt : most recently created input / output format objects
##                   (assigned by new_ifmt() and new_ofmt())
##  + $fp          : fork pool serving as the job queue (created further below)
our ($ifmt,$ofmt, $fp);

##-- save per-job overridable options
##  + deep copy of %job as it stands after command-line parsing; used as the
##    template from which per-job options are reset
##  + NOTE(review): Storable is not loaded explicitly in this file; presumably
##    it is loaded by one of the DTA::CAB modules -- confirm
our $job0 = Storable::dclone(\%job);

##-- for cabeval stuff
##  + shallow copy of the command-line analysis options
our %analyzeOpts = %{$job0->{analyzeOpts}};

##-- main: init: log4perl
DTA::CAB::Logger->logInit();

##-- main: init: hack: set utf8 mode on stdio
##  + applies to STDOUT and STDERR only; STDIN is not touched here
binmode(STDOUT,':utf8');
binmode(STDERR,':utf8');

##------------------------------------------------------
## main: init: signals
## undef = cleandie()
##  + signal handler for termination signals
##  + releases queue resources via cleanup(), then exits with status 1
sub cleandie {
  cleanup();
  exit(1);
}
##-- install the handler for the catchable termination signals
##  + SIGKILL is not listed: it cannot be caught, so a handler for it would never run
$SIG{$_}=\&cleandie foreach (qw(TERM HUP INT QUIT ABRT));

##------------------------------------------------------
## main: init: user code
##  + each -begin / -module string is evaluated as perl code before the
##    analyzer class is loaded; the first failure aborts the program
foreach (@eval_begin) {
  eval "$_;";
  die("$prog: error evaluating user BEGIN code ($_): $@") if ($@);
}

##------------------------------------------------------
## main: init: analyzer
##  + a class name without any '::' is taken relative to DTA::CAB::Analyzer
##  + if loading fails and the class does not already provide new(), loading is
##    retried once with the DTA::CAB::Analyzer:: prefix
##  + NOTE(review): if the first load fails but the class already provides new(),
##    no retry happens and $@ is still set, so the die() below fires anyway;
##    confirm whether pre-defined classes were meant to be accepted here
$analyzeClass = "DTA::CAB::Analyzer::$analyzeClass" if ($analyzeClass !~ /\:\:/);
eval "use $analyzeClass;";
if ($@ && !UNIVERSAL::can($analyzeClass,'new')) {
  $analyzeClass = "DTA::CAB::Analyzer::$analyzeClass";
  eval "use $analyzeClass;";
}
die("$prog: could not load analyzer class '$analyzeClass': $@") if ($@);
##  + the analyzer object itself: loaded from the -config file if one was given,
##    otherwise constructed from the command-line analysis options
our ($cab);
if (defined($rcFile)) {
  DTA::CAB->debug("${analyzeClass}->loadFile($rcFile)");
  $cab = $analyzeClass->loadFile($rcFile)
    or die("$0: load failed for analyzer from '$rcFile': $!");
} else {
  DTA::CAB->debug("${analyzeClass}->new()");
  $cab = $analyzeClass->new(%{$job{analyzeOpts}})
    or die("$0: $analyzeClass->new() failed: $!");
}

##  + -onload code runs after the analyzer object exists but before prepare()
foreach (@eval_onload) {
  eval "$_;";
  die("$prog: error evaluating user ONLOAD code ($_): $@") if ($@);
}

##------------------------------------------------------
## main: init: prepare (load data)
##  + done once here, before the fork pool is created further below
$cab->debug("prepare()");
$cab->prepare($job{analyzeOpts})
  or die("$0: could not prepare analyzer: $!");

##------------------------------------------------------
## main: init: formats (just report)
##  + side effect: new_ifmt() and new_ofmt() also assign the globals $ifmt and $ofmt;
##    the block-wise code further below calls $ifmt->blockOptions() on this object
DTA::CAB->debug("using default input format class ", ref(new_ifmt()));
DTA::CAB->debug("using default output format class ", ref(new_ofmt()));

##------------------------------------------------------
## main: init: profiling
##  + start time of the whole run; only recorded if profiling is enabled
our $tv_started = [gettimeofday] if ($job{doProfile});

##======================================================================
## Subs: I/O

## $ext = file_extension($filename)
##  + removes one trailing newline from $filename, then inspects its basename only
##  + returns the final dot-suffix of the basename, leading '.' included
##  + returns the empty string if the basename contains no dot
sub file_extension {
  my ($path) = @_;
  chomp($path);
  my $name  = File::Basename::basename($path);
  my ($ext) = ($name =~ m/(\.[^\.]*)$/);
  return defined($ext) ? $ext : '';
}

## $outfile = outfilename($infile,$outfmt)
##  + expands the %-escapes of the output template $outfmt for the input file $infile:
##      %f : $infile itself
##      %d : dirname($infile)
##      %b : basename($infile) without its final extension
##      %x : final extension of $infile including the leading '.', or '' if there is none
##      %F : alias for %d/%b
##  + returns $outfmt unchanged if $infile is undefined
##  + all escapes are expanded in a single pass over the template, so text which was
##    inserted for one escape (e.g. a filename containing "%b") is never scanned again
sub outfilename {
  my ($infile,$outfmt) = @_;
  return $outfmt if (!defined($infile));

  ##-- split the input filename into directory, extension-less basename, and extension
  my $dir  = File::Basename::dirname($infile);
  my $base = File::Basename::basename($infile);
  my $ext  = '';
  if ($base =~ /^(.*)(\.[^\.\/]*)$/) {
    ($base,$ext) = ($1,$2);
  }

  ##-- escape letter -> replacement text
  my %expansion =
    (
     f => $infile,
     d => $dir,
     b => $base,
     x => $ext,
     F => "$dir/$base",
    );

  my $outfile = $outfmt;
  $outfile =~ s/%([fdbxF])/$expansion{$1}/g;
  return $outfile;
}

## $ifmt = new_ifmt()
## $ifmt = new_ifmt(%_job)
##   + creates an input format object and stores it in the global $ifmt
##   + %_job holds subprocess options like the global %job; given keys override
##     the corresponding entries of %job
##   + the filename passed to the reader is $_job{input}, or else $ARGV[0]
##   + dies if no reader could be created
sub new_ifmt {
  my %opts   = (%job,@_);
  my $source = ($opts{input}||$ARGV[0]);
  $ifmt = DTA::CAB::Format->newReader(class=>$opts{inputClass},file=>$source,%{$opts{inputOpts}||{}});
  die("$0: could not create input parser of class $opts{inputClass}: $!") if (!$ifmt);
  return $ifmt;
}

## $ofmt = new_ofmt()
## $ofmt = new_ofmt(%_job)
##   + creates an output format object and stores it in the global $ofmt
##   + %_job holds subprocess options like the global %job; given keys override
##     the corresponding entries of %job
##   + the filename passed to the writer is computed by outfilename() from
##     $_job{input} (or else $ARGV[0]) and the template $_job{outfmt}
##   + dies if no writer could be created
sub new_ofmt {
  my %opts   = (%job,@_);
  my $source = ($opts{input}||$ARGV[0]);
  my $target = outfilename($source, $opts{outfmt});
  $ofmt = DTA::CAB::Format->newWriter(class=>$opts{outputClass},file=>$target,%{$opts{outputOpts}||{}});
  die("$0: could not create output formatter of class $opts{outputClass}: $!") if (!$ofmt);
  return $ofmt;
}

##======================================================================
## Subs: child process callback(s)

## undef = resetOptions()
##  + replaces the contents of the global %job by a fresh deep copy of %$job0,
##    so later changes to %job never leak into the saved template
sub resetOptions {
  my $fresh = Storable::dclone($job0);
  %job = %$fresh;
}

## undef = cb_init()
##  + child process initialization callback for the fork pool
##  + operates on the file-level $fp; any arguments are ignored
##  + closes the pool's filehandle if it is open, then empties the pool's
##    block table, pid list and queue
sub cb_init {
  my $qfh = $fp->{fh};
  $qfh->close() if ($qfh->opened);
  %{$fp->{blocks}} = qw();
  @{$fp->{pids}}   = qw();
  @{$fp->{queue}}  = qw();
}

## 0 = cb_work($fp,\%qjob)
##  + worker callback: processes a single queued job
##  + $fp is the fork pool which dispatched the job
##  + queue dispatches jobs as HASH-refs \%qjob
##  + each \%qjob has a key (opts=>\%job) analogous to global %job;
##    if it is missing, the saved defaults in $job0 are used
##  + additionally \%qjob has one of the following keys:
##    (
##     block => \%block,       ##-- (block-mode only): block specification as returned by $ifmt->blockScan(),
##     input => $input,        ##-- (file-mode only): job-specific input source (filename)
##     indoc => $indoc,        ##-- (words-mode only): input document
##    )
##  + the input and output formats are created from the job's own input filename
##    whenever the job has one, rather than always from $ARGV[0]
##  + dies on parse or open failures; returns 0 otherwise
sub cb_work {
  my ($fp,$qjob) = @_;

  ##----------------------------------------------------
  ## parse job options
  %job = %{ $qjob->{opts} || $job0 };

  ##----------------------------------------------------
  ## Global (re-)initialization
  my $outfile = outfilename(($qjob->{input}||'out'),$job{outfmt}); ##-- may be overridden
  my $ntok=0;
  my $nchr=0;
  my $tv_jstarted = [gettimeofday];
  #DTA::CAB->logdie("dying to debug") if (!@{$fp->{pids}}); ##-- DEBUG

  ##----------------------------------------------------
  ## Input & Output Formats
  ##  + pass the job's own input so that each job's formats are derived from
  ##    its own filename; jobs without an input (words-mode) keep the
  ##    defaults of new_ifmt() and new_ofmt()
  my @fmt_input = defined($qjob->{input}) ? (input=>$qjob->{input}) : qw();
  new_ifmt(@fmt_input);
  new_ofmt(@fmt_input);

  if ($qjob->{block}) {
    ##--------------------------------------------------
    ## Analyze: Block-wise
    my $blk   = $qjob->{block};
    $blk->{ofile} = $outfile if (!defined($blk->{ofile}));
    my $blkid = $blk->{blkid} || "$blk->{ifile} -> $blk->{ofile} [$blk->{id}[0]/$blk->{id}[1]]";
    $fp->vlog($logBlockInfo,"BLOCK $blkid");

    ##-- slurp & parse block input buffer
    $ifmt->vlog($logBlockTrace, "BLOCK $blkid: parseBlock()");
    my $doc = $ifmt->parseBlock($blk);
    #$ifmt->vlog($logBlockInfo, "BLOCK $blkid: parsed ", $doc->nTokens, " tok(s), ", scalar(@{$doc->{body}}), " sent(s)");

    ##-- analyze
    $cab->vlog($logBlockTrace, "BLOCK $blkid: analyzeDocument()");
    $doc = $cab->analyzeDocument($doc,$job{analyzeOpts});
    #$cab->vlog($logBlockInfo, "BLOCK $blkid: analyzed ", $doc->nTokens, " tok(s), ", scalar(@{$doc->{body}}), " sent(s)");

    ##-- output
    $ofmt->vlog($logBlockTrace, "BLOCK $blkid: putDocumentBlock()");
    $ofmt->putDocumentBlock($doc,$blk);

    ##-- DEBUG
    #$ofmt->vlog($logBlockInfo, "BLOCK $blkid: wrote ", $doc->nTokens, " tok(s), ", scalar(@{$doc->{body}}), " sent(s)");
    #$ofmt->toFile("blk_$blk->{id}[0]")->putDocumentRaw($doc)->flush();

    ##-- report: statistics
    if ($job{doProfile}) {
      $ntok = $doc->nTokens();
      $nchr = $blk->{ilen};
      $fp->qaddcounts($ntok,$nchr);
      DTA::CAB::Logger->logProfile($logBlockProfile, tv_interval($tv_jstarted,[gettimeofday]), $ntok,$nchr);
    }
    undef $doc; ##-- we can free up the analyzed document now

    ##-- dump block output back to server for append
    $fp->qaddblock($blk);
  }
  elsif ($qjob->{indoc}) {
    ##--------------------------------------------------
    ## Analyze: Document: pre-parsed
    my $doc = $qjob->{indoc};

    ##-- label for log messages: text of the first token, if any
    ##  + tokens are HASH-refs with a 'text' key, so use that key rather than
    ##    stringifying the reference itself
    my $tok0  = $doc->{body}[0]{tokens}[0];
    my $text0 = ref($tok0) ? $tok0->{text} : $tok0;
    my $docid = '"'.((defined($text0) && $text0 ne '') ? $text0 : '(nil)').' ..."';

    ##-- analyze
    $cab->trace("analyzeDocument($docid)");
    $doc = $cab->analyzeDocument($doc,$job{analyzeOpts});

    ##-- output (checked, as in file-mode below)
    $ofmt->trace("putDocumentRaw($docid)");
    $ofmt->toFile($outfile)
      or die("$prog: open failed for output file '$outfile': $!");
    $ofmt->putDocumentRaw($doc)->flush;

    ##-- report: statistics
    if ($job{doProfile}) {
      use bytes; ##-- count bytes rather than characters in length() below
      $ntok  = $doc->nTokens();
      $nchr += length($_->{text}) foreach (map {@{$_->{tokens}}} @{$doc->{body}}); ##-- hack
      $fp->qaddcounts($ntok,$nchr);
    }
  }
  else {
    ##--------------------------------------------------
    ## Analyze: Document: file
    my $infile = $qjob->{input};
    $fp->info("processing file $infile");

    ##-- parse
    $ifmt->trace("parseFile($infile)");
    my $doc = $ifmt->parseFile($infile)
      or die("$prog: parse failed for input file '$infile': $!");
    $ifmt->close; ##-- ... we can free any format-local input buffers now

    ##-- analyze
    $cab->trace("analyzeDocument($infile)");
    $doc = $cab->analyzeDocument($doc,$job{analyzeOpts});

    ##-- output
    $ofmt->trace("putDocumentRaw($infile -> $outfile)");
    $ofmt->toFile($outfile)
      or die("$prog: open failed for output file '$outfile': $!");
    $ofmt->putDocumentRaw($doc)->flush;

    ##-- report: statistics
    if ($job{doProfile}) {
      $ntok = $doc->nTokens;
      $nchr = (-s $infile) if ($infile ne '-'); ##-- no file size available for stdin
      $fp->qaddcounts($ntok,$nchr);
    }
  }

  return 0;
}
##--/cb_work


##======================================================================
## MAIN: guts

##------------------------------------------------------
## main: init: queue

##-- create the fork pool which serves as the job queue
##  + cb_init() and cb_work() are registered as the pool's init and work callbacks
$fp = DTA::CAB::Fork::Pool->new(
				njobs         => $njobs,
				local         => $qpath,
				init          => \&cb_init,
				work          => \&cb_work,
				installReaper => 1,
				logBlock      => $logBlockTrace,
			       )
  or die("$0: could not create fork-pool with socket '$qpath': $!");
#DTA::CAB->info("created job queue on UNIX socket '$qpath'");

##------------------------------------------------------
## main: parse specified inputs into a job-queue
##  + without any arguments, read from standard input ('-')
my @jobs = qw();
@ARGV = ('-') if (!@ARGV);

if ($inputList) {
  ##-- list-input mode: each non-blank, non-comment line of each list-file is an
  ##   options-string; every argument left over after option parsing becomes a job
  die("$0: cannot combine -list and -words options (use TT, TJ, or TXT format to process flat word lists)") if ($job{inputWords});
  while (<>) {
    chomp;
    next if (m/^\s*$/ || m/^\s*\#/ || m/^\s*\%\%/);

    ##-- start each line from the command-line defaults, then apply the line's options
    %job = %{Storable::dclone($job0)};
    my ($rc,$argv) = Getopt::Long::GetOptionsFromString($_, childOptionSpecs());
    die("$prog: could not parse options-string '$_' at $ARGV line $.") if (!$rc);

    ##-- all inputs of one line share a single snapshot of that line's options
    my $jopts = Storable::dclone(\%job);
    push(@jobs, map { +{opts=>$jopts, input=>$_} } @$argv);
  }
}
elsif ($job{inputWords}) {
  ##-- word-input mode: all arguments together form a one-sentence document,
  ##   which is passed on the queue as a single job
  my @tokens = qw();
  foreach my $arg (@ARGV) {
    utf8::decode($arg) if (!utf8::is_utf8($arg)); ##-- decodes the @ARGV element itself
    my $word = $arg;
    push(@tokens, toToken($word));
  }
  my $doc = toDocument([ toSentence(\@tokens) ]);
  @jobs = ( {opts=>\%job, indoc=>$doc} );
}
else {
  ##-- file-input mode: one job per argument, all sharing the global %job
  push(@jobs, {opts=>\%job, input=>$_}) foreach (@ARGV);
}

##------------------------------------------------------
## main: block-scan if requested
##-- enqueue jobs, either document-wise or block-wise
##  + block-wise mode is selected whenever $block_spec is defined
if (!defined($block_spec)) {
  ##-- document-wise processing: just enqueue the parsed jobs
  $fp->enq($_) foreach (@jobs);
}
else {
  ##-- block-wise processing: scan for block boundaries and enqueue each block separately
  %blockOpts = $ifmt->blockOptions($block_spec);
  DTA::CAB->info("using block-wise I/O with eob=$blockOpts{eob}, size>=$blockOpts{bsize}");

  foreach my $job (@jobs) {
    if ($job->{input} eq '-') {
      ##-- stdin hack: spool it to the filesystem for blocking
      my ($tmpfh,$tmpfile) = tmpfsfile("dta_cab_stdin${$}_XXXX", UNLINK=>1);
      File::Copy::copy(\*STDIN,$tmpfh)
	or die("$prog: could not spool stdin to $tmpfile: $!");
      $tmpfh->close();
      $job->{input} = $tmpfile;
    }

    ##-- block-scan
    ##  + create the scanner from the job's own options (e.g. a per-line input
    ##    class given in a list-file) and its own input file; the output
    ##    filename below is computed from the same per-job options
    new_ifmt(%{$job->{opts}||{}}, input=>$job->{input});
    #$ifmt->trace("blockScan($job->{input})");
    my $ofile  = outfilename($job->{input}, $job->{opts}{outfmt});
    my $blocks = $ifmt->blockScan($job->{input}, %blockOpts);
    my $nblks  = scalar(@$blocks);
    my $idfmt  = "%s -> %s [%".length($nblks)."d/%d]"; ##-- pad block index to the width of the block count
    my $blki   = 0;
    foreach (@$blocks) {
      ##-- each block becomes its own queue item, carrying a copy of the job's keys
      $_->{ofile} = $ofile;
      $_->{blkid} = sprintf($idfmt, $_->{ifile}, $_->{ofile}, ++$blki, $nblks);
      $fp->enq({%$job,block=>$_});
    }
  }
}
$fp->info("populated job-queue with ", $fp->size, " item(s)");
#print Data::Dumper->Dump([$fp->{queue}],['QUEUE']), "\n";
#exit 0; ##-- DEBUG


##------------------------------------------------------
## main: guts: process queue

##-- run the queue until all jobs have been processed
$fp->serverMain();

#$fp->debug("waiting for subprocess(es) to terminate...");
$SIG{CHLD} = undef; ##-- remove installed reaper-sub, if any
$fp->waitall();

##-- check for any remaining unflushed data blocks
##  + every entry of $fp->{blocks} with a non-empty 'pending' list is reported;
##    if any such entry was found, the program aborts via logcroak()
my $flushok=1;
my ($bkey,$bt);
while (($bkey,$bt)=each(%{$fp->{blocks}||{}})) {
  next if (!$bt || !$bt->{pending} || !@{$bt->{pending}});
  $fp->logcarp("found ", scalar(@{$bt->{pending}}), " unflushed data block(s) for '$bkey'");
  $flushok = 0;
}
$fp->logcroak("some data blocks were not flushed to disk") if (!$flushok);


##------------------------------------------------------
## main: guts: profiling
##  + overall profile: elapsed time since $tv_started together with the
##    token and character counts stored in the pool ($fp->{ntok}, $fp->{nchr})

if ($job{doProfile}) {
  DTA::CAB::Logger->logProfile('info', tv_interval($tv_started,[gettimeofday]), @$fp{qw(ntok nchr)});
}

##======================================================================
## MAIN: cleanup

##------------------------------------------------------
## main: cleanup: user code
##  + each -end string is evaluated as perl code; the first failure aborts the program
foreach (@eval_end) {
  eval "$_;";
  die("$prog: error evaluating user END code ($_): $@") if ($@);
}

##-- be nice & say goodbye
DTA::CAB::Logger->info("program exiting normally.");

##-- disabled debugging code: the condition is constant false, so this never runs
if (0) {
  ##-- DEBUG memory usage
  my $memusg = `ps -p $$ -o rss=,vsz=`;
  chomp($memusg);
  my ($rss,$vsz) = split(' ',$memusg,2);
  DTA::CAB->info("Memory usage via ps: RSS=$rss, VSZ=$vsz");
  #$_=<STDIN>;

  ##-- dummy debug
  ##  + builds an object which references itself
  our $cyclic = bless({},'DTA::CAB');
  $cyclic->{self} = $cyclic;
}

##-- main: cleanup: queues & temporary files
##  + does nothing if the fork pool was never created, or if the pool reports
##    that the current process is a child
##  + otherwise aborts the pool and, unless -keep was given, unlinks its files
sub cleanup {
  return if (!$fp);
  return if ($fp->is_child);
  #print STDERR "$0: END block running\n"; ##-- DEBUG
  $fp->abort();
  $fp->unlink() if (!$keeptmp);
  #$statq->unlink() if ($statq && !$keeptmp);
  #File::Path::rmtree($blockdir) if ($blockdir && !$keeptmp);
}

##-- run cleanup() whenever the interpreter exits
END {
  cleanup();
}

__END__
=pod

=head1 NAME

dta-cab-analyze.perl - Command-line analysis interface for DTA::CAB

=head1 SYNOPSIS

 dta-cab-analyze.perl [OPTIONS...] DOCUMENT_FILE(s)...

 General Options
  -help                           ##-- show short usage summary
  -version                        ##-- show version & exit
  -verbose LEVEL                  ##-- alias for -log-level=LEVEL
  -begin CODE                     ##-- evaluate CODE early in script
  -onload CODE                    ##-- evaluate CODE after loading analyzer(s)
  -module MODULE                  ##-- alias for -begin="use MODULE;"
  -end CODE                       ##-- evaluate CODE late in script

 Parallelization Options
  -jobs NJOBS                     ##-- fork() off up to NJOBS parallel jobs (default=0: don't fork() at all)
  -job-queue QPATH                ##-- use QPATH as job-queue socket (default: temporary)
  -keep , -nokeep                 ##-- do/don't keep temporary queue files (default: don't)

 Analysis Options
  -config PLFILE                  ##-- load analyzer config file PLFILE
  -analysis-class  CLASS          ##-- set analyzer class (if -config is not specified)
  -analysis-option OPT=VALUE      ##-- set analysis option
  -profile , -noprofile           ##-- do/don't report profiling information (default: do)

 I/O Options
  -list                           ##-- arguments are list-files, not filenames
  -words                          ##-- arguments are word text, not filenames
  -input-class CLASS              ##-- select input parser class (default: Text)
  -input-option OPT=VALUE         ##-- set input parser option

  -output-class CLASS             ##-- select output formatter class (default: Text)
  -output-option OPT=VALUE        ##-- set output formatter option
  -output-level LEVEL             ##-- override output formatter level (default: 0)
  -output-format TEMPLATE         ##-- set output format (default=STDOUT)

  -format-class CLASS             ##-- alias for -input-class=CLASS -output-class=CLASS
  -format-option OPT=VALUE        ##-- alias for -input-option OPT=VALUE -output-option OPT=VALUE

 Block-wise Processing Options
  -block SIZE[{k,M,G,T}][@EOB]    ##-- pseudo-streaming block-wise analysis (not for all formats)
  -noblock                        ##-- disable block-wise processing
  -log-block-info LEVEL		  ##-- log block-info at LEVEL (default=INFO)
  -log-block-trace LEVEL          ##-- log block-trace at LEVEL (default=none)
  -log-block-profile LEVEL        ##-- log block-profile at LEVEL (default=none)

 Logging Options                  ##-- see Log::Log4perl(3pm)
  -log-level LEVEL                ##-- set minimum log level (default=TRACE)
  -log-stderr , -nolog-stderr     ##-- do/don't log to stderr (default=true)
  -log-syslog , -nolog-syslog     ##-- do/don't log to syslog (default=false)
  -log-file LOGFILE               ##-- log directly to FILE (default=none)
  -log-rotate , -nolog-rotate     ##-- do/don't auto-rotate log files (default=true)
  -log-config L4PFILE             ##-- log4perl config file (overrides -log-stderr, etc.)
  -log-watch  , -nowatch          ##-- do/don't watch log4perl config file (default=false)
  -log-option OPT=VALUE           ##-- set any logging option (e.g. -log-option twlevel=trace)

=cut

##==============================================================================
## Description
##==============================================================================
=pod

=head1 DESCRIPTION

dta-cab-analyze.perl is a command-line utility for analyzing
documents with the L<DTA::CAB|DTA::CAB> analysis suite, without the need
to set up and/or connect to an independent server.

=cut

##==============================================================================
## Options and Arguments
##==============================================================================
=pod

=head1 OPTIONS AND ARGUMENTS

=cut

##==============================================================================
## Options: General Options
=pod

=head2 General Options

=over 4

=item -help

Display a short help message and exit.

=item -man

Display a longer help message and exit.

=item -version

Display program and module version information and exit.

=item -verbose

Set default log level (trace|debug|info|warn|error|fatal).

=back

=cut

##==============================================================================
## Options: Parallelization Options
=pod

=head2 Parallelization Options

=over 4

=item -jobs NJOBS

Fork() off up to NJOBS parallel jobs.
If NJOBS=0 (default), doesn't fork() at all.

=item -job-queue QPATH

Use QPATH as job-queue socket.  Default is to use a temporary file.

=item -keep , -nokeep

Do/don't keep temporary queue files after program termination (default: don't)

=back

=cut

##==============================================================================
## Options: Other Options
=pod

=head2 Analysis Options

=over 4

=item -config PLFILE

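Optional. If omitted, a new analyzer of the class selected by
C<-analysis-class> is created from the C<-analysis-option> settings.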

Load analyzer configuration from PLFILE,
which should be a perl source file parseable
by L<DTA::CAB::Persistent::loadFile()|DTA::CAB::Persistent/item_loadFile>
as a L<DTA::CAB::Analyzer|DTA::CAB::Analyzer> object.
Prototypically, this file will just look like:

 our $obj = DTA::CAB->new( opt1=>$val1, ... );

=item -analysis-option OPT=VALUE

Set an arbitrary analysis option C<OPT> to C<VALUE>.
May be multiply specified.

=item -profile , -noprofile

Do/don't report profiling information (default: do)

=back

=cut

##==============================================================================
## Options: I/O Options
=pod

=head2 I/O Options

=over 4

=item -list

Arguments are list files (1 input per line), not filenames.
List-file arguments can actually contain a subset of command-line options
in addition to input filenames.
Not compatible with the C<-words> option.

=item -words

Arguments are word text, not filenames.
Not compatible with the C<-list> option.

=item -block SIZE[{k,M,G,T}][@EOB]

Do pseudo-streaming block-wise analysis.
Currently only supported for 'TT' and 'TJ' formats.
SIZE is the minimum size in bytes for non-final analysis blocks,
and may have an optional SI suffix 'k', 'M', 'G', or 'T'.
EOB indicates the desired block-boundary type; either 's' to
force all block-boundaries to be sentence boundaries,
or 't' ('w') for token (word) boundaries.  Default=128k@w.

=item -input-class CLASS

Select input parser class (default: Text).

=item -input-option OPT=VALUE

Set arbitrary input parser options.
May be multiply specified.



=item -output-class CLASS

Select output formatter class (default: Text)

=item -output-option OPT=VALUE

Set arbitrary output formatter option.
May be multiply specified.

=item -output-level LEVEL

Override output formatter level (default: 0)

=item -output-format FORMAT

Set output format (default='-' (STDOUT)), a printf-style format which may contain the following %-escapes:

 %f  : INFILE           : current input file
 %b  : basename(INFILE) : basename of current input file
 %d  : dirname(INFILE)  : directory of current input file
 %x  : extension(INFILE): extension of current input file
 %F  :                  : alias for %d/%b

=back

=cut


##======================================================================
## Footer
##======================================================================
=pod

=head1 ACKNOWLEDGEMENTS

Perl by Larry Wall.

=head1 AUTHOR

Bryan Jurish E<lt>moocow@cpan.orgE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2009-2019 by Bryan Jurish. All rights reserved.

This program is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.24.1 or,
at your option, any later version of Perl 5 you may have available.

=head1 SEE ALSO

L<dta-cab-analyze.perl(1)|dta-cab-analyze.perl>,
L<dta-cab-convert.perl(1)|dta-cab-convert.perl>,
L<dta-cab-cachegen.perl(1)|dta-cab-cachegen.perl>,
L<dta-cab-xmlrpc-server.perl(1)|dta-cab-xmlrpc-server.perl>,
L<dta-cab-xmlrpc-client.perl(1)|dta-cab-xmlrpc-client.perl>,
L<DTA::CAB(3pm)|DTA::CAB>,
L<perl(1)|perl>,
...

=cut
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	Go to GitHub issues (only if GitHub is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)
Keyboard Shortcuts