dcdb-corpus-compile.perl

#!/usr/bin/perl -w

use lib qw(. ./blib/lib ./blib/arch lib lib/blib/lib lib/blib/arch);
use DiaColloDB;
use DiaColloDB::Corpus::Compiled;
use DiaColloDB::Utils qw(:si);
use Getopt::Long qw(:config no_ignore_case);
use Pod::Usage;
use File::Basename qw(basename);
use strict;

##----------------------------------------------------------------------
## Globals
##----------------------------------------------------------------------

##-- program identity and informational flags
our $prog = basename($0); ##-- script basename, used as prefix in diagnostics
our $help;                ##-- set by -help
our $version;             ##-- set by -version

##-- logger options, handed to DiaColloDB::Logger->ensureLog() in MAIN
our %log = (
            level     => 'TRACE',
            rootLevel => 'FATAL',
           );

##-- input interpretation and reporting switches (all settable from the command-line)
our $globargs = 1; ##-- glob input corpus @ARGV?
our $listargs = 0; ##-- input args are raw corpus file-lists?
our $union    = 0; ##-- input args are pre-compiled corpora?
our $append   = 0; ##-- append to output corpus?
our $dotime   = 1; ##-- report timing?

##-- output corpus directory: required, no default
our $outdir = undef;

##-- input corpus constructor arguments: document class and per-document options
our %icorpus = (
                dclass => 'DDCTabs',
                dopts  => {},
               );

##-- content filters: shared object, also referenced by %ocorpus below
our $filters = DiaColloDB::Corpus::Filters->new();

##-- output (compiled) corpus options; 'flags' is added in MAIN depending on -append
our %ocorpus = (
                base    => undef,
                njobs   => -1,
                filters => $filters,
               );

##----------------------------------------------------------------------
## Command-line processing
##----------------------------------------------------------------------
##-- decode raw command-line bytes as UTF-8 (unless already flagged as character strings)
foreach (@ARGV) { utf8::decode($_) if (!utf8::is_utf8($_)); }

##-- parse options
##  + GetOptions() returns false if it encountered an invalid or malformed option;
##    abort with a brief usage message in that case rather than silently compiling
##    a corpus with settings the user did not ask for
##  + NOTE(review): the POD below also documents a -bypage option, but no handler
##    for it is defined here -- confirm the intended page-break regex before adding one
GetOptions(##-- general
           'h|help' => \$help,
           'V|version' => \$version,
           #'verbose|v=i' => \$verbose,
           'j|jobs|njobs|nj=f' => \$ocorpus{njobs},

           ##-- input corpus options
           'g|glob!' => \$globargs,
           'l|list!' => \$listargs,
           'u|union!' => \$union,
           ##
           'C|document-class|dclass|dc=s' => \$icorpus{dclass},
           'D|document-option|docoption|dopt|do|dO=s%' => \$icorpus{dopts},
           ##-- convenience aliases: each one just presets the 'eosre' document option
           'by-sentence|bysentence' => sub { $icorpus{dopts}{eosre}='^$' },
           'by-paragraph|byparagraph' => sub { $icorpus{dopts}{eosre}='^%%\$DDC:BREAK\.p=' },
           'by-doc|bydoc|by-file|byfile' => sub { $icorpus{dopts}{eosre}='' },

           ##-- filter options
           ##  + hash-valued option callback receives (OPTNAME, KEY, VALUE)
           'f|filter=s%' => sub { $filters->{$_[1]}=$_[2]; },
           'F|nofilters|no-filters|all|A|no-prune|noprune|use-all-the-data' => sub { $filters->clear },

           ##-- I/O and logging
           'a|append!' => \$append,
           'o|output-directory|outdir|output|out|od=s' => \$outdir,
           't|timing|times|time!' => \$dotime,
           'lf|log-file|logfile=s' => \$log{file},
           'll|log-level|level=s' => sub { $log{level} = uc($_[1]); }, ##-- level names are upper-cased
           'lo|log-option|logopt=s' => \%log,
          )
  or pod2usage({-exitval=>2, -verbose=>0, -msg=>"$prog: ERROR: invalid command-line option(s)"});

##-- -version: report and exit successfully
if ($version) {
  print STDERR "$prog version $DiaColloDB::VERSION by Bryan Jurish\n";
  exit 0;
}

##-- -help: print synopsis and exit successfully
pod2usage({-exitval=>0,-verbose=>0}) if ($help);

##-- sanity check: the output directory is mandatory
die("$prog: ERROR: no output corpus directory specified: use the -output (-o) option!\n") if (!defined($outdir));


##----------------------------------------------------------------------
## MAIN
##----------------------------------------------------------------------

##-- setup logger
DiaColloDB::Logger->ensureLog(%log);
my $logger = 'DiaColloDB::Logger';
my $timer  = DiaColloDB::Timer->start();

##-- common variables
##  + output flags: '>>' if -append was requested, '>' otherwise
$ocorpus{flags} = $append ? '>>' : '>';
my ($ocorpus);

if ($union) {
  ##-- union: merge pre-compiled corpora
  $ocorpus = DiaColloDB::Corpus::Compiled->union(\@ARGV, %ocorpus, dbdir=>$outdir)
    or die("$prog: failed to create union corpus");
}
else {
  ##-- !union: compile raw input corpus data

  ##-- open input corpus ('-' is used as the sole input if no INPUT(s) were given)
  push(@ARGV,'-') if (!@ARGV);
  my $icorpus = DiaColloDB::Corpus->new(%icorpus);
  $icorpus->open(\@ARGV, 'glob'=>$globargs, 'list'=>$listargs)
    or die("$prog: failed to open input corpus: $!");

  ##-- compile input corpus
  $ocorpus = $icorpus->compile($outdir, %ocorpus)
    or die("$prog: failed to compile output corpus '$outdir'.* from raw input corpus");
}

##-- cleanup
$ocorpus->close() if ($ocorpus);

##-- timing
if ($dotime) {
  ##-- ask du(1) for the total on-disk size of $outdir
  ##  + list-form pipe open runs du directly (no shell), so quotes, '$', backticks
  ##    etc. in the user-supplied $outdir cannot break or alter the command
  ##  + '-s' yields a single summary line for the whole tree; plain 'du -h' lists
  ##    sub-directories first, so its first field need not be the total
  ##  + '--' keeps an $outdir beginning with '-' from being parsed as a du option
  my $du = '';
  if (open(my $dufh, '-|', 'du', '-sh', '--', $outdir)) {
    my $line = <$dufh>;
    $du = $line if (defined($line));
    close($dufh);
  }
  $du =~ s/\s.*\z//s; ##-- keep only the leading size field

  ##-- report 'unknown' rather than a bare 'B' if du was unavailable or failed
  my $size = ($du ne '' ? "${du}B" : 'unknown');
  $logger->info("operation completed in ", $timer->timestr, "; compiled corpus size = $size");
}

__END__

###############################################################
## pods
###############################################################

=pod

=head1 NAME

dcdb-corpus-compile.perl - pre-compile a DiaColloDB corpus

=head1 SYNOPSIS

 dcdb-corpus-compile.perl [OPTIONS] [INPUT(s)...]

 General Options:
   -h, -help            # this help message
   -V, -version         # report version information and exit
   -j, -jobs NJOBS      # set number of worker threads

 Input Corpus Options:
   -l, -[no]list        # INPUT(s) are/aren't file-lists (default=no)
   -g, -[no]glob        # do/don't glob INPUT(s) argument(s) (default=do)
   -u, -[no]union       # do/don't treat INPUT(s) as pre-compiled corpora to be merged (default=don't)
   -C, -dclass CLASS    # set corpus document class (default=DDCTabs)
   -D, -dopt OPT=VAL    # set corpus document option, e.g.
                        #   eosre=EOSRE  # eos regex (default='^$')
                        #   foreign=BOOL # disable D*-specific heuristics
       -bysent          # default split by sentences (default)
       -byparagraph     # default split by paragraphs
       -bypage          # default split by page (NOTE: not defined by this script's option parser)
       -bydoc           # default split by document

 Content Filter Options:
   -f, -filter KEY=VAL  # set filter option for KEY = (p|w|l)(bad|good)(_file)?
                        #   (p|w|l)good=REGEX      # positive regex for (postags|words|lemmata)
                        #   (p|w|l)bad=REGEX       # negative regex for (postags|words|lemmata)
                        #   (p|w|l)goodfile=FILE   # positive list-file for (postags|words|lemmata)
                        #   (p|w|l)badfile=FILE    # negative list-file for (postags|words|lemmata)
   -F, -nofilters       # clear all filter options

 I/O and Logging Options:
   -ll, -log-level LVL  # set log-level (default=TRACE)
   -lo, -log-option K=V # set log option (e.g. logdate, logtime, file, syslog, stderr, ...)
   -t,  -[no]times      # do/don't report operation timing (default=do)
   -a,  -[no]append     # do/don't append to existing output corpus (default=don't)
   -o,  -output OUTDIR  # set output corpus directory (required)

=cut

###############################################################
## DESCRIPTION
###############################################################
=pod

=head1 DESCRIPTION

dcdb-corpus-compile.perl pre-compiles a L<DiaColloDB::Corpus::Compiled|DiaColloDB::Corpus::Compiled>
from a tokenized and annotated input corpus represented as a L<DiaColloDB::Corpus|DiaColloDB::Corpus>
object, optionally applying L<content filters|DiaColloDB::Corpus::Filters> such as stopword lists.
The resulting compiled corpus can be used with L<dcdb-create.perl(1)|dcdb-create.perl>
to compile a L<DiaColloDB|DiaColloDB> collocation database.

Note that it is B<not> necessary to pre-compile a corpus with this script in order
to create a fully functional L<DiaColloDB|DiaColloDB> database from a source corpus,
since the L<DiaColloDB::create()|DiaColloDB::compile/create> method as invoked by
the L<dcdb-create.perl(1)|dcdb-create.perl> script should
implicitly create a (temporary) C<DiaColloDB::Corpus::Compiled> object
as and when required.

=cut


###############################################################
## OPTIONS AND ARGUMENTS
###############################################################
=pod

=head1 OPTIONS AND ARGUMENTS


=cut

###############################################################
# Arguments
###############################################################
=pod

=head2 Arguments

=over 4

=item INPUT(s)

File(s), glob(s), file-list(s), or basename(s) to be compiled.
Interpretation depends on the L<-glob|/-glob>, L<-list|/-list>, and L<-union|/-union>
options.

=back

=cut

###############################################################
# General Options
###############################################################
=pod

=head2 General Options

=over 4

=item -help

Display a brief help message and exit.

=item -version

Display version information and exit.

=item -jobs NJOBS

Run C<NJOBS> parallel compilation threads.
If specified as 0, will run only a single thread.
The default value (-1) will run as many jobs as there are cores on the (unix/linux) system;
see L<DiaColloDB::Utils/nJobs> for details.

=back

=cut


###############################################################
# Input Corpus Options
=pod

=head2 Input Corpus Options

=over 4

=item -list

=item -nolist

Do/don't treat INPUT(s) as file-lists rather than corpus data files.
Default=don't.

=item -glob

=item -noglob

Do/don't expand wildcards in INPUT(s).
Default=do.

=item -union

=item -nounion

Do/don't treat INPUT(s) as pre-compiled corpora to be merged.
Note that in C<-union> mode, no corpus content filters are applied
(they are assumed to have been applied to the INPUT(s) prior to the union call).
Default=don't

=item -dclass CLASS

Set corpus document class (default=DDCTabs).
See L<DiaColloDB::Document/SUBCLASSES> for a list
of supported input formats.
If you are using the default L<DDCTabs|DiaColloDB::Document::DDCTabs> document class
on your own (non-D*) corpus, you may also want to specify
L<C<-dopt foreign=1>|/"-dopt OPT=VAL">.

Aliases: -C, -document-class, -dclass, -dc

=item -dopt OPT=VAL

Set corpus document option, e.g.
L<C<-dopt eosre=EOSRE>|DiaColloDB::Document::DDCTabs/new> sets the end-of-sentence regex
for the default L<DDCTabs|DiaColloDB::Document::DDCTabs> document class,
and L<C<-dopt foreign=1>|DiaColloDB::Document::DDCTabs/new> disables D*-specific hacks.

Aliases: -D, -document-option, -docoption, -dopt, -do, -dO

=item -bysent

Split corpus (-> track collocations in compiled database) by sentence (default).

=item -byparagraph

Split corpus (-> track collocations in compiled database) by paragraph.

=item -bypage

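Split corpus (-> track collocations in compiled database) by page.
B<Note:> the option parser in this script does not currently define a
C<-bypage> option, so it is not recognized on the command-line.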

=item -bydoc

Split corpus (-> track collocations in compiled database) by document.

=back

=cut


###############################################################
# Filter Options
=pod

=head2 Filter Options

=over 4

=item -use-all-the-data

Disables all content-filter options,
inspired by Mark Lauersdorf; equivalent to:

 -f=pgood='' \
 -f=wgood='' \
 -f=lgood='' \
 -f=pbad='' \
 -f=wbad='' \
 -f=lbad=''

Aliases: -F, -nofilters, -A, -all, -noprune

=back

=cut

###############################################################
# I/O and Logging Options
=pod

=head2 I/O and Logging Options

=over 4

=item -log-level LEVEL

Set L<DiaColloDB::Logger|DiaColloDB::Logger> log-level (default=TRACE).

Aliases: -ll, -log-level, -level

=item -log-option OPT=VAL

Set arbitrary L<DiaColloDB::Logger|DiaColloDB::Logger> option (e.g. logdate, logtime, file, syslog, stderr, ...).

Aliases: -lo, -log-option, -logopt

=item -[no]times

Do/don't report operation timing (default=do).

Aliases: -t, -timing, -times, -time

=item -output OUTDIR

Output directory for compiled corpus (required).

Aliases: -o, -output-directory, -outdir, -output, -out, -od

=back

=cut


###############################################################
# Bugs and Limitations
###############################################################
=pod

=head1 BUGS AND LIMITATIONS

Probably many.

=cut


###############################################################
# Footer
###############################################################
=pod

=head1 ACKNOWLEDGEMENTS

Perl by Larry Wall.

=head1 AUTHOR

Bryan Jurish E<lt>moocow@cpan.orgE<gt>

=head1 SEE ALSO

L<DiaColloDB(3pm)|DiaColloDB>,
L<DiaColloDB::Corpus(3pm)|DiaColloDB::Corpus>,
L<DiaColloDB::Corpus::Compiled(3pm)|DiaColloDB::Corpus::Compiled>,
L<DiaColloDB::Corpus::Filters(3pm)|DiaColloDB::Corpus::Filters>,
L<dcdb-create.perl(1)|dcdb-create.perl>,
L<perl(1)|perl>.

=cut
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	Go to GitHub issues (only if GitHub is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)
Keyboard Shortcuts