The Perl Toolchain Summit 2025 Needs You: You can help 🙏 Learn more

#!/usr/bin/perl -w
use lib qw(. ./blib/lib ./blib/arch lib lib/blib/lib lib/blib/arch);
use DiaColloDB;
use DiaColloDB::Utils qw(:math :si :jobs);
use Getopt::Long qw(:config no_ignore_case);
use Pod::Usage;
use File::Basename qw(basename);
use strict;
#use DiaColloDB::XS; ##-- DEBUG
#use DiaColloDB::Relation::TDF; ##-- DEBUG
#use DiaColloDB::Document::TCF; ##-- DEBUG
##----------------------------------------------------------------------
## Globals
##----------------------------------------------------------------------
##-- program vars
##-- script identity and mode flags
our $prog = basename($0);          ##-- script name, used as a message prefix
our ($help, $version);             ##-- set by -help / -version

##-- logger options, handed to DiaColloDB::Logger->ensureLog()
our %log = (
  level     => 'TRACE',
  rootLevel => 'FATAL',
);

our $dbdir;                        ##-- output location (-output); required
our $globargs   = 1;               ##-- expand wildcards in @ARGV?
our $listargs   = 0;               ##-- treat arguments as file-lists?
our $union      = 0;               ##-- treat arguments as db-dirs to merge?
our $lazy_union = 0;               ##-- union mode: only write a list-client config?
our $dotime     = 1;               ##-- report timing on completion?

##-- corpus options, handed to DiaColloDB::Corpus->new()
our %corpus = (
  dclass => 'DDCTabs',
  dopts  => {},
);

##-- database options, handed to DiaColloDB->new()
our %coldb = (
  ##-- pack formats
  pack_id   => 'N',
  pack_date => 'n',
  pack_f    => 'N',
  pack_off  => 'N',
  pack_len  => 'n',

  ##-- distance and frequency thresholds
  dmax   => 5,
  cfmin  => 2,
  tfmin  => 2,
  fmin_l => undef,

  ##-- runtime behaviour
  keeptmp => 0,
  mmap    => 1,
  debug   => 0,

  ##-- tdf relation options (maxDocSize is deliberately left unset here)
  tdfopts => {
    minDocFreq => 4,
    minDocSize => 8,
  },
  vbreak => '#file',
);

##-- user options given via -option, re-used when creating a lazy union
our %uopts = ();
##----------------------------------------------------------------------
## Command-line processing
##----------------------------------------------------------------------
## pack64($want64)
## pack64($optname, $want64)
##  + switches the pack formats and tdf data types in %coldb between
##    their 64-bit ($want64 true) and 32-bit ($want64 false) settings
##  + accepts both the Getopt::Long callback convention ($optname,$value)
##    and a plain single-value call; previously only $_[1] was consulted,
##    so a one-argument call always selected the 32-bit settings
sub pack64 {
  my $want64 = (@_ > 1 ? $_[1] : $_[0]);
  $coldb{$_} = ($want64 ? 'Q>' : 'N') foreach qw(pack_id pack_f pack_off);
  ##-- NOTE(review): 64-bit mode picks the narrower 'n' for pack_len and
  ##   32-bit mode the wider 'N'; mapping kept as-is, confirm it is intended
  $coldb{pack_len} = ($want64 ? 'n' : 'N');
  $coldb{tdfopts}{itype} = $want64 ? 'ccs_indx' : 'long';
  $coldb{tdfopts}{vtype} = $want64 ? 'double' : 'float';
}
## wantxs($optname, $bool)
##  + Getopt::Long-style callback: stores $bool in
##    $DiaColloDB::Relation::Cofreqs::WANT_XS and returns it
sub wantxs {
  my (undef, $enable) = @_;
  return $DiaColloDB::Relation::Cofreqs::WANT_XS = $enable;
}
## njobs($optname, $request)
##  + Getopt::Long-style callback: stores the result of nJobs($request)
##    in $DiaColloDB::NJOBS, then exports max2() of that value and 1
##    as the OMP_NUM_THREADS environment variable
sub njobs {
  my (undef, $request) = @_;
  my $resolved = $DiaColloDB::NJOBS = nJobs($request);
  $ENV{OMP_NUM_THREADS} = max2($resolved, 1);
}
##-- decode command-line arguments as UTF-8, in place
##   (arguments perl already flags as utf8 are left alone)
foreach my $arg (@ARGV) {
  utf8::decode($arg) if (!utf8::is_utf8($arg));
}

##-- parse options
##   + GetOptions() returns false after warning about unknown or malformed
##     options; abort with usage instead of building a database with
##     settings the user did not ask for
GetOptions(##-- general
  'help|h' => \$help,
  'version|V' => \$version,
  'xs!' => \&wantxs,
  'pp!' => sub { wantxs($_[0],!$_[1]) },
  #'verbose|v=i' => \$verbose,

  ##-- corpus options
  'glob|g!' => \$globargs,
  'list|l!' => \$listargs,
  'union|u|merge!' => \$union,
  'jobs|njobs|j=s' => \&njobs,
  'lazy-union|list-union|lazy|lu!' => \$lazy_union,
  'document-class|dclass|dc=s' => \$corpus{dclass},
  'document-option|docoption|dopt|do|dO=s%' => \$corpus{dopts},
  'by-sentence|bysentence' => sub { $corpus{dopts}{eosre}='^$' },
  'by-paragraph|byparagraph' => sub { $corpus{dopts}{eosre}='^%%\$DDC:BREAK\.p=' },
  'by-doc|bydoc|by-file|byfile' => sub { $corpus{dopts}{eosre}='' },

  ##-- coldb options
  'index-attributes|attributes|attrs|a=s' => \$coldb{attrs},
  'nofilters|no-filters|F|all|A|no-prune|noprune|use-all-the-data' => sub {
    ##-- zero every *fmin* threshold and clear the corpus content filters
    $coldb{$_} = 0 foreach (grep {$_ =~ /fmin/} keys %coldb);
    $coldb{$_} = '' foreach (@DiaColloDB::Corpus::Filters::NAMES);
    $coldb{$_} = undef foreach (@DiaColloDB::Corpus::Filters::FILES);
    ##-- relax the tdf thresholds; this only touches keys already present in
    ##   tdfopts, including those created by the \$coldb{tdfopts}{...}
    ##   references taken further down in this option list
    $coldb{tdfopts}{$_} = 0 foreach (grep {$_ =~ /min.*Freq/} keys %{$coldb{tdfopts}});
    $coldb{tdfopts}{$_} = 1 foreach (grep {$_ =~ /min.*Size/} keys %{$coldb{tdfopts}});
    $coldb{tdfopts}{$_} = 'inf' foreach (grep {$_ =~ /max.*(Freq|Size)/} keys %{$coldb{tdfopts}});
    $coldb{tdfopts}{$_} = '' foreach (qw(mgood mbad));
  },
  ##-- forward (name,value) in callback order, mirroring the -xs/-pp handlers;
  ##   a bare one-argument call left the flag pack64() reads ($_[1]) undefined
  '64bit|64|quad|Q!' => \&pack64,
  '32bit|32|long|L|N!' => sub { pack64($_[0],!$_[1]); },
  'mmap!' => \$coldb{mmap},
  'debug!' => \$coldb{debug},
  'max-distance|maxd|dmax|n=i' => \$coldb{dmax},
  'min-term-frequency|min-tf|mintf|tfmin|min-frequency|min-f|minf|fmin=i' => \$coldb{tfmin},
  'min-lemma-frequency|min-lf|minlf|lfmin=i' => \$coldb{fmin_l},
  'min-cofrequency|min-cf|mincf|cfmin=i' => \$coldb{cfmin},
  'index-tdf|index-tdm|tdf|tdm!' => \$coldb{index_tdf},
  'tdf-dbreak|tdf-break|dbreak|db|vbreak|vb=s' => \$coldb{dbreak},
  'tdf-min-term-frequency|tdf-tfmin|tdf-fmin=i' => \$coldb{tdfopts}{minFreq},
  'tdf-min-document-frequency|tdf-dfmin=i' => \$coldb{tdfopts}{minDocFreq},
  'tdf-break-min-size|tdf-break-min|tdf-nmin|vbnmin|vbmin=s' => \$coldb{tdfopts}{minDocSize},
  'tdf-break-max-size|tdf-break-max|tdf-nmax|vbnmax|vbmax=s' => \$coldb{tdfopts}{maxDocSize},
  'tdf-option|tdm-option|tdfopt|tdmopt|tdmo|tdfo|to|tO=s%' => sub { $coldb{tdfopts}{$_[1]}=$_[2] },
  'keeptmp|keep!' => \$coldb{keeptmp},
  ##-- -option values are also recorded in %uopts for lazy-union creation
  'option|O=s%' => sub { $coldb{$_[1]}=$uopts{$_[1]}=$_[2]; },

  ##-- I/O and logging
  'timing|times|time|t!' => \$dotime,
  'log-level|level|ll=s' => sub { $log{level} = uc($_[1]); },
  'log-option|logopt|lo=s' => \%log,
  'log-file|lf=s' => \$log{file},
  'nolog-file|nolf' => sub { $log{file}=undef; },
  'output|outdir|od|o=s' => \$dbdir,
)
  or pod2usage({-exitval=>2, -verbose=>0, -msg=>"$prog: ERROR: failed to parse command-line options"});

##-- -version: report and exit
if ($version) {
  print STDERR "$prog version $DiaColloDB::VERSION by Bryan Jurish\n";
  exit 0;
}

##-- -help: brief usage; otherwise an output location is mandatory
pod2usage({-exitval=>0,-verbose=>0}) if ($help);
die("$prog: ERROR: no output location specified: use the -output (-o) option!\n") if (!defined($dbdir));
##----------------------------------------------------------------------
## MAIN
##----------------------------------------------------------------------
##-- setup logger
DiaColloDB::Logger->ensureLog(%log);

##-- setup corpus: with no INPUT arguments, fall back to '-'
push(@ARGV,'-') if (!@ARGV);
$globargs = 0 if ($lazy_union); ##-- allow "real" remote URLs for lazy union
my $corpus = DiaColloDB::Corpus->new(%corpus);
$corpus->open(\@ARGV, 'glob'=>$globargs, 'list'=>$listargs, ($union ? (logOpen=>'off') : qw()))
  or die("$prog: failed to open corpus: $!");

##-- create db
my $timer = DiaColloDB::Timer->start();
my ($coldb);
if ($lazy_union) {
  ##-- lazy union: just create "thin" client URL
  $coldb = DiaColloDB::Client::list->new(%uopts)
    or die("$prog: failed to create lazy union list-client: $!");
  $coldb->open($corpus->{files})
    or die("$prog: failed to open lazy union list-client: $!");
  $coldb->saveHeaderFile($dbdir)
    or die("$prog: failed to save lazy union list-client configuration to 'rcfile://$dbdir': $!");
}
else {
  ##-- physical DB
  $coldb = DiaColloDB->new(%coldb)
    or die("$prog: failed to create new DiaColloDB object: $!");
  if ($union) {
    ##-- physical DB: union: create from dbdirs
    $coldb->union($corpus->{files}, dbdir=>$dbdir, flags=>'rw')
      or die("$prog: DiaColloDB::union() failed: $!");
  } else {
    ##-- physical DB: create from corpus
    $coldb->create($corpus, dbdir=>$dbdir, flags=>'rw', attrs=>($coldb{attrs}||'l,p'))
      or die("$prog: DiaColloDB::create() failed: $!");
  }
}

##-- cleanup
#my $du = si_str($coldb->du());
$coldb->close() if ($coldb);

##-- timing
if ($dotime) {
  ##-- ask du(1) for the output size through a list-form pipe:
  ##   + no shell is involved, so quotes, '$' or backticks in $dbdir are harmless
  ##   + '-s' yields one summary line even if $dbdir has subdirectories
  ##   + '--' protects against an output name beginning with '-'
  my $du = '';
  if (open(my $dufh, '-|', 'du', '-sh', '--', $dbdir)) {
    my $out = do { local $/; <$dufh> };
    close($dufh);
    $du = $out if (defined($out));
    $du =~ s/\s.*\z//s; ##-- keep only the leading size field
  }
  $coldb->info("operation completed in ", $timer->timestr, "; db size = ${du}B");
}
__END__
###############################################################
## pods
###############################################################
=pod
=head1 NAME
dcdb-create.perl - create a DiaColloDB diachronic collocation database
=head1 SYNOPSIS
dcdb-create.perl [OPTIONS] [INPUT(s)...]
General Options:
-help ##-- this help message
-version ##-- report version information and exit
-jobs NJOBS ##-- number of threads for corpus compilation (default=-1: all cores)
-xs , -pp ##-- do/don't use fast XS implementations where available (default=if available)
Corpus Options:
-list , -nolist ##-- INPUT(s) are/aren't file-lists (default=no)
-glob , -noglob ##-- do/don't glob INPUT(s) argument(s) (default=do)
-union, -nounion ##-- do/don't treat INPUT(s) as DB directories to be merged (default=don't)
-lazy , -nolazy ##-- do/don't create "lazy" list-client (union mode only; default=don't)
-dclass CLASS ##-- set corpus document class (default=DDCTabs)
-dopt OPT=VAL ##-- set corpus document option, e.g.
## eosre=EOSRE # eos regex (default='^$')
## foreign=BOOL # disable D*-specific heuristics
-bysent ##-- track collocations by sentence (default)
-byparagraph ##-- track collocations by paragraph
-bypage ##-- track collocations by page
-bydoc ##-- track collocations by document
Indexing Options:
-attrs ATTRS ##-- select index attributes (default=l,p)
## known attributes: l, p, w, doc.title, ...
-use-all-the-data ##-- disable default frequency- and regex-filters
-64bit ##-- use 64-bit quads where available
-32bit ##-- use 32-bit integers where available
-dmax DIST ##-- maximum distance for indexed co-occurrences (default=5)
-tfmin TFMIN ##-- minimum global term frequency (default=2)
-lfmin LFMIN ##-- minimum global lemma frequency (default=undef:tfmin)
-cfmin CFMIN ##-- minimum relation co-occurrence frequency (default=2)
-[no]tdf ##-- do/don't create (term x document) index relation (default=if available)
-tdf-dbreak BREAK ##-- set tdf matrix "document" granularity (e.g. s,p,page,file; default=file)
-tdf-fmin VFMIN ##-- set minimum tdf term frequency (default=undef: TFMIN)
-tdf-dfmin VDFMIN ##-- set minimum tdf term "document"-frequency (default=4)
-tdf-nmin VNMIN ##-- set minimum number of content tokens per tdf "document" (default=8)
-tdf-nmax VNMAX ##-- set maximum number of content tokens per tdf "document" (default=inf)
-tdf-option OPT=VAL ##-- set arbitrary tdf matrix option, e.g.
## minFreq=INT # minimum term frequency (default=undef: use TFMIN)
## minDocFreq=INT # minimum term document-"frequency" (default=4)
## minDocSize=INT # minimum document size (#/terms) (default=8)
## maxDocSize=INT # maximum document size (#/terms) (default=inf)
## mgood=REGEX # positive regex for document-level metadata
## mbad=REGEX # negative regex for document-level metadata
-option OPT=VAL ##-- set arbitrary DiaColloDB option, e.g.
## pack_id=PACKFMT # pack-format for IDs
## pack_f=PACKFMT # pack-format for frequencies
## pack_date=PACKFMT # pack-format for dates
## (p|w|l)good=REGEX # positive regex for (postags|words|lemmata)
## (p|w|l)bad=REGEX # negative regex for (postags|words|lemmata)
## (p|w|l)goodfile=FILE # positive list-file for (postags|words|lemmata)
## (p|w|l)badfile=FILE # negative list-file for (postags|words|lemmata)
## ddcServer=HOST:PORT # server for ddc relations
## ddcTimeout=SECONDS # timeout for ddc relations
I/O and Logging Options:
-log-level LEVEL ##-- set log-level (default=TRACE)
-log-option OPT=VAL ##-- set log option (e.g. logdate, logtime, file, syslog, stderr, ...)
-[no]keep ##-- do/don't keep temporary files (default=don't)
-[no]mmap ##-- do/don't use mmap for file access (default=do)
-[no]debug ##-- do/don't enable painful debugging checks (default=don't)
-[no]times ##-- do/don't report operation timing (default=do)
-output OUT ##-- output directory or client configuration file (required)
Environment Variables:
DIACOLLO_SORT ##-- system sort command prefix
SORT ##-- fallback for DIACOLLO_SORT
=cut
###############################################################
## DESCRIPTION
###############################################################
=pod
=head1 DESCRIPTION
dcdb-create.perl
compiles a L<DiaColloDB|DiaColloDB> diachronic collocation database
from a tokenized and annotated input corpus,
or merges multiple existing L<DiaColloDB|DiaColloDB> databases
into a single database directory.
The resulting database can be queried with
the
L<dcdb-query.perl(1)|dcdb-query.perl> script,
or wrapped into a web-service with
the help of the L<DiaColloDB::WWW|DiaColloDB::WWW> utilities,
which see for details.
=cut
###############################################################
## OPTIONS AND ARGUMENTS
###############################################################
=pod
=head1 OPTIONS AND ARGUMENTS
=cut
###############################################################
# Arguments
###############################################################
=pod
=head2 Arguments
=over 4
=item INPUT(s)
File(s), glob(s), file-list(s) to be indexed or existing indices to be merged.
Interpretation depends on the L<-glob|/-glob>, L<-list|/-list>, L<-union|/-union>,
and L<-lazy|/-lazy>
options.
=back
=cut
###############################################################
# General Options
###############################################################
=pod
=head2 General Options
=over 4
=item -help
Display a brief help message and exit.
=item -version
Display version information and exit.
=item -jobs NJOBS
Run C<NJOBS> parallel compilation threads.
If specified as 0, will run only a single thread.
The default value (-1) will run as many jobs as there are cores on the (unix/linux) system;
see L<DiaColloDB::Utils/nJobs> for details.
Also sets the environment variable C<OMP_NUM_THREADS> after interpreting
the C<NJOBS> request.
=back
=cut
###############################################################
# Corpus Options
=pod
=head2 Corpus Options
Input corpora can be either "raw" corpora using the
default L<DiaColloDB::Corpus|DiaColloDB::Corpus> class
or a single "pre-compiled" corpus directory using the
L<DiaColloDB::Corpus::Compiled|DiaColloDB::Corpus::Compiled> conventions
as created by the L<dcdb-corpus-compile.perl(1)|dcdb-corpus-compile.perl>
script.
If a pre-compiled input corpus directory is specified,
only the L<corpus content filters|DiaColloDB::Corpus::Filters> pre-compiled
into the corpus itself are used, and the corpus content filter
options to this script (C<-Opgood=REGEX> etc.) will have no effect.
For "raw" input corpora, a temporary
L<DiaColloDB::Corpus::Compiled|DiaColloDB::Corpus::Compiled> object
will be created and the L<DiaColloDB::Corpus::Filters|DiaColloDB::Corpus::Filters>
options to this script should be honored.
=over 4
=item -list
=item -nolist
Do/don't treat INPUT(s) as file-lists rather than corpus data files or
L<pre-compiled corpus directories|DiaColloDB::Corpus::Compiled>.
Default=don't.
=item -glob
=item -noglob
Do/don't expand wildcards in INPUT(s).
Has no effect for L<pre-compiled corpus directories|DiaColloDB::Corpus::Compiled>.
Default=do.
=item -union
=item -nounion
Do/don't treat INPUT(s) as DB directories to be merged.
Creates a new physical DB by merging data from the argument
INPUT(s).
Default=don't.
=item -lazy
=item -nolazy
Enable/disable "lazy union" mode.
If enabled, INPUT(s) are treated as DB URLs to be merged "lazily",
and only a simple L<DiaColloDB::Client::list|DiaColloDB::Client::list>
configuration file F<OUT> is created, suitable for passing to
L<dcdb-query.perl|dcdb-query.perl> as F<rcfile://OUT>. User
options specified with L<C<-option OPT=VAL>|/"-option OPT=VAL"> will
clobber the L<DiaColloDB::Client::list|DiaColloDB::Client::list> defaults
(e.g. C<fudge>, C<fork>, etc.). Unlike L<-union|/-union> mode,
no physical DB is created in L<-lazy|/-lazy> mode; queries to the lazy
client are deferred to the underlying DB URLs specified in the configuration
file. The lazy configuration should behave like a physical DB created with L<-union|/-union>,
can be created in near constant time,
requires only a few bytes of disk space,
and may even process queries faster than a physical DB if you have the
L<threads|threads> module installed.
Default=off.
Aliases: -lazy-union, -list-union, -lu
=item -dclass CLASS
Set corpus document class (default=DDCTabs) for raw (i.e. not L<pre-compiled|DiaColloDB::Corpus::Compiled>) corpora.
See L<DiaColloDB::Document/SUBCLASSES> for a list
of supported input formats.
If you are using the default L<DDCTabs|DiaColloDB::Document::DDCTabs> document class
on your own (non-D*) corpus, you may also want to specify
L<C<-dopt foreign=1>|/"-dopt OPT=VAL">.
Has no effect for L<pre-compiled corpus directory INPUT(s)|DiaColloDB::Corpus::Compiled>.
=item -dopt OPT=VAL
Set corpus document option for raw (i.e. not L<pre-compiled|DiaColloDB::Corpus::Compiled>) corpora, e.g.
L<C<-dopt eosre=EOSRE>|DDCTabs/new> sets the end-of-sentence regex
for the default L<DDCTabs|DiaColloDB::Document::DDCTabs> document class,
and L<C<-dopt foreign=1>|DDCTabs/new> disables D*-specific hacks.
Potentially dangerous for L<pre-compiled corpus directory INPUT(s)|DiaColloDB::Corpus::Compiled>.
Aliases: -document-option, -docoption, -dO
=item -bysent
Track collocations by sentence (default).
Has no effect for L<pre-compiled corpus directory INPUT(s)|DiaColloDB::Corpus::Compiled>.
=item -byparagraph
Track collocations by paragraph.
Has no effect for L<pre-compiled corpus directory INPUT(s)|DiaColloDB::Corpus::Compiled>.
=item -bypage
Track collocations by page.
Has no effect for L<pre-compiled corpus directory INPUT(s)|DiaColloDB::Corpus::Compiled>.
=item -bydoc
Track collocations by document.
Has no effect for L<pre-compiled corpus directory INPUT(s)|DiaColloDB::Corpus::Compiled>.
=back
=cut
###############################################################
# Indexing Options
=pod
=head2 Indexing Options
=over 4
=item -attrs ATTRS
Select attributes to be indexed (default=l,p).
Known attributes include C<l, p, w, doc.title, doc.author>, etc.
=item -use-all-the-data
Disables default frequency- and regex-based pruning filter options,
inspired by Mark Lauersdorf; equivalent to:
-tfmin=0 \
-lfmin=0 \
-cfmin=0 \
-tdf-tfmin=0 \
-tdf-dfmin=0 \
-tdf-nmin=1 \
-tdf-nmax=inf \
-O=pgood='' -O=pgoodfile='' \
-O=wgood='' -O=wgoodfile='' \
-O=lgood='' -O=lgoodfile='' \
-O=pbad='' -O=pbadfile='' \
-O=wbad='' -O=wbadfile='' \
-O=lbad='' -O=lbadfile='' \
-tO=mgood='' \
-tO=mbad=''
Corpus L<content filters|DiaColloDB::Corpus::Filters>
(C<pgood>, C<pgoodfile>, ..., C<lbad>, C<lbadfile>)
have no effect for
L<pre-compiled corpus directory INPUT(s)|DiaColloDB::Corpus::Compiled>
Aliases: -all, -noprune, -nofilters, -F
=item -64bit
Use 64-bit quads to index integer IDs where available.
=item -32bit
Use 32-bit integers where available (default).
=item -dmax DIST
Specify maximum distance for indexed co-occurrences (default=5).
=item -tfmin TFMIN
Specify minimum global term frequency (default=2).
A "term" in this sense is an n-tuple of indexed attributes
B<not including> the "date" component.
=item -lfmin LFMIN
Specify minimum global lemma frequency (default=undef:TFMIN).
=item -cfmin CFMIN
Specify minimum relation co-occurrence frequency (default=2).
=item -[no]tdf
Do/don't create (term x document) index relation (default=if available).
=item -tdf-dbreak BREAK
Set tdf matrix "document" granularity (e.g. s,p,page,file; default=file).
=item -tdf-fmin VFMIN
Set minimum tdf term frequency (default=undef: use TFMIN).
=item -tdf-dfmin VDFMIN
Set minimum term document-"frequency" (default=4).
=item -tdf-nmin VNMIN
Set minimum number of content tokens per tdf "document" (default=8).
=item -tdf-nmax VNMAX
Set maximum number of content tokens per tdf "document" (default=inf).
=item -tdf-option OPT=VAL
Set arbitrary L<tdf matrix|DiaColloDB::Relation::TDF> option, e.g.
minFreq=INT # -tdf-fmin: minimum term frequency
minDocFreq=INT # -tdf-dfmin: minimum term document-"frequency"
minDocSize=INT # -tdf-nmin: minimum document size (#/terms)
maxDocSize=INT # -tdf-nmax: maximum document size (#/terms)
mgood=REGEX # positive regex for document-level metadata
mbad=REGEX # negative regex for document-level metadata
Alias: -tO
=item -option OPT=VAL
Set arbitrary L<DiaColloDB|DiaColloDB> index option, e.g.
pack_id=PACKFMT # pack-format for IDs
pack_f=PACKFMT # pack-format for frequencies
pack_date=PACKFMT # pack-format for dates
(p|w|l)good=REGEX # (raw input only) positive regex for (postags|words|lemmata)
(p|w|l)bad=REGEX # (raw input only) negative regex for (postags|words|lemmata)
(p|w|l)goodfile=FILE # (raw input only) positive list-file for (postags|words|lemmata)
(p|w|l)badfile=FILE # (raw input only) negative list-file for (postags|words|lemmata)
ddcServer=HOST:PORT # server for ddc relations
ddcTimeout=SECONDS # timeout for ddc relations
Alias: -O
=back
=cut
###############################################################
# I/O and Logging Options
=pod
=head2 I/O and Logging Options
=over 4
=item -log-level LEVEL
Set L<DiaColloDB::Logger|DiaColloDB::Logger> log-level (default=TRACE).
=item -log-option OPT=VAL
Set arbitrary L<DiaColloDB::Logger|DiaColloDB::Logger> option (e.g. logdate, logtime, file, syslog, stderr, ...).
=item -[no]keep
Do/don't keep temporary files (default=don't)
=item -[no]mmap
Do/don't use mmap() for low-level index file access (default=do)
=item -[no]debug
Do/don't enable painful debugging checks (default=don't)
=item -[no]times
Do/don't report operation timing (default=do)
=item -output OUT
Output directory or filename (required).
=back
=cut
###############################################################
# Bugs and Limitations
###############################################################
=pod
=head1 BUGS AND LIMITATIONS
Probably many.
=cut
###############################################################
# Footer
###############################################################
=pod
=head1 ACKNOWLEDGEMENTS
Perl by Larry Wall.
=head1 AUTHOR
Bryan Jurish E<lt>moocow@cpan.orgE<gt>
=head1 SEE ALSO
L<DiaColloDB(3pm)|DiaColloDB>,
L<dcdb-corpus-compile.perl(1)|dcdb-corpus-compile.perl>,
L<dcdb-info.perl(1)|dcdb-info.perl>,
L<dcdb-query.perl(1)|dcdb-query.perl>,
L<dcdb-export.perl(1)|dcdb-export.perl>,
L<perl(1)|perl>.
=cut