#!/usr/bin/perl -w
use lib qw(. ./blib/lib ./blib/arch lib lib/blib/lib lib/blib/arch);
use DiaColloDB;
use DiaColloDB::Utils qw(:json :time);
use Getopt::Long qw(:config no_ignore_case);
use Pod::Usage;
use File::Basename qw(basename);
use strict;
#use DiaColloDB::Relation::TDF; ##-- DEBUG
BEGIN {
select STDERR; $|=1; select STDOUT;
}
##----------------------------------------------------------------------
## Globals
##----------------------------------------------------------------------
##-- program vars
our $prog = basename($0);
our ($help,$version);
our %log = (level=>'TRACE', rootLevel=>'FATAL');
our $dburl = undef;
our %cli = (opts=>{});
our $http_user = undef;
our $rel = 'cof';
our %query = (
query =>'', ##-- target query, common
date =>undef, ##-- target date(s), common
slice =>1, ##-- date slice, common
##
#aquery=>'', ##-- target query(ta), arg1
adate =>undef, ##-- target date(s), arg1
aslice =>undef, ##-- date slice, arg1
##
bquery =>'', ##-- target query, arg2
bdate =>undef, ##-- target date(s), arg2
bslice =>undef, ##-- date slice, arg2
##
groupby=>'l', ##-- result aggregation (empty:all available attributes, no restrictions)
##
eps => 0, ##-- smoothing constant (old default=0.5)
score =>'ld', ##-- score func
diff=>'adiff', ##-- diff-op
kbest =>10, ##-- k-best items per date
cutoff =>undef, ##-- minimum score cutoff
global =>0, ##-- trim globally (vs. slice-locally)?
strings => 1, ##-- debug: want strings?
onepass => 0, ##-- use fast but incorrect 1-pass method?
);
our %save = (format=>undef);
our $outfmt = 'text'; ##-- output format: 'text' or 'json'
our $pretty = 1;
our $dotime = 1; ##-- report timing?
our $niters = 1; ##-- number of benchmark iterations
##----------------------------------------------------------------------
## Command-line processing
##----------------------------------------------------------------------
GetOptions(##-- general
'help|h' => \$help,
'version|V' => \$version,
##-- general
'log-level|level|log=s' => sub { $log{level} = uc($_[1]); },
'client-option|db-option|do|O=s%' => \%cli,
'subclient-option|suboption|so|SO=s%' => \$cli{opts},
##-- query options
#'difference|diff|D|compare|comp|cmp!' => \$diff,
#'profile|prof|prf|P' => sub { $diff=0 },
'collocations|collocs|collo|col|cofreqs|cof|co|f12|f2|12|2' => sub { $rel='cof' },
'unigrams|ug|u|f1|1' => sub { $rel='xf' },
'ddc' => sub { $rel='ddc' },
'tdf|tdm|matrix|mat|vector-space|vs|vector|vec' => sub { $rel='tdf' },
##
(map {("${_}date|${_}d=s"=>\$query{"${_}date"})} ('',qw(a b))), ##-- date,adate,bdate
(map {("${_}date-slice|${_}ds|${_}slice|${_}sl|${_}s=s"=>\$query{"${_}slice"})} ('',qw(a b))), ##-- slice,aslice,bslice
##
'group-by|groupby|group|gb|g=s' => \$query{groupby},
##
'difference|diff|D|compare|comp|cmp=s' => \$query{diff},
'epsilon|eps|e=f' => \$query{eps},
'mutual-information-log-frequency|milf|mi' => sub {$query{score}='milf'},
'mutual-information-1|mi1' => sub {$query{score}='mi1'},
'mutual-information-3|mi3' => sub {$query{score}='mi3'},
'log-dice|logdice|ld|dice' => sub {$query{score}='ld'},
'log-likelihood|loglik|logl|ll' => sub {$query{score}='ll'},
'frequency|freq|f' => sub {$query{score}='f'},
'frequency-per-million|fpm|fm' => sub {$query{score}='fm'},
'log-frequency|logf|lf' => sub { $query{score}='lf' },
'log-frequency-per-million|logfm|lfm' => sub { $query{score}='lfm' },
'k-best|kbest|k=i' => \$query{kbest},
'no-k-best|nokbest|nok' => sub {$query{kbest}=undef},
'cutoff|C=f' => \$query{cutoff},
'no-cutoff|nocutoff|noc' => sub {$query{cutoff}=undef},
'global|G!' => \$query{global},
'local|L!' => sub { $query{global}=!$_[1]; },
'strings|S!' => \$query{strings},
'one-pass|onepass|1-pass|1pass|1p|single-pass|singlepass|single!' => \$query{onepass},
'two-pass|teopass|2-pass|2pass|2p|multi-pass|multipass|multi|mp!' => sub { $query{onepass}=!$_[1]; },
##-- I/O
'user|U=s' => \$http_user,
'text|t' => sub {$outfmt='text'},
'json|j' => sub {$outfmt='json'},
'html' => sub {$outfmt='html'},
'pretty|p!' => \$pretty,
'ugly!' => sub {$pretty=!$_[1]},
'null|noout' => sub {$outfmt=''},
'score-format|sf|format|fmt=s' => \$save{format},
'timing|times|time|T!' => \$dotime,
'bench|n-iterations|iterations|iters|i=i' => \$niters,
);
if ($version) {
print STDERR "$prog version $DiaColloDB::VERSION by Bryan Jurish\n";
exit 0 if ($version);
}
pod2usage({-exitval=>0,-verbose=>0}) if ($help);
pod2usage({-exitval=>1,-verbose=>0,-msg=>"$prog: ERROR: no DBURL specified!"}) if (@ARGV<1);
pod2usage({-exitval=>1,-verbose=>0,-msg=>"$prog: ERROR: no QUERY specified!"}) if (@ARGV<2);
##----------------------------------------------------------------------
## MAIN
##----------------------------------------------------------------------
##-- setup logger
DiaColloDB::Logger->ensureLog(%log);
##-- parse user options
if ($http_user) {
my ($user,$pass) = split(/:/,$http_user,2);
$pass //= '';
if ($pass eq '') {
print STDERR "Password: ";
$pass = <STDIN>;
chomp $pass;
}
@{$cli{opts}}{qw(user password)} = @cli{qw(user password)} = ($user,$pass),
}
##-- open db client
$dburl = shift(@ARGV);
my ($cli);
if ($dburl !~ m{^[a-zA-Z]+://} && -d $dburl) {
##-- hack for local directory URLs without scheme
$cli = DiaColloDB->new(dbdir=>$dburl,%cli);
} else {
##-- use client interface for any URL with a scheme
$cli = DiaColloDB::Client->new($dburl,%cli);
}
die("$prog: failed to create new DiaColloDB::Client object for $dburl: $!") if (!$cli);
##-- client query
do { utf8::decode($_) if (!utf8::is_utf8($_)) } foreach (@ARGV);
our $isDiff = (@ARGV > 1);
$query{query} = shift;
$query{bquery} = @ARGV ? shift : $query{query};
$rel = "d$rel" if ($isDiff);
##-- DEBUG queries
if (0 && $query{query} eq 'debug') {
$query{query} = 'Mann|Frau';
#$query{query} = '$p=NN !#has[textClass,/politik/i]';
#$query{query} = 'Mann #has[textClass,/zeitung/i]';
#$query{query} = '* #has[textClass,/Zeitung/i]';
#$query{query} = 'Katze && Maus';
#$query{query} = '* #has[genre,/Zeitung/]';
#$query{query} = 'Katze && Maus && Hund';
#$query{query} = 'Mann with $p=NN';
##
#($isDiff,$rel,@query{qw(query bquery slice diff groupby)}) = (1,'dtdf','* #has[author,/Habermas/]','* #has[author,/Cassirer/]',0,'min','l,p=NN');
#($isDiff,$rel,@query{qw(query bquery slice adate bdate)}) = (1,'d2','Bewegung','Bewegung',0,'1900:1910','1990:2000');
#($isDiff,$rel,@query{qw(query bquery slice onepass groupby)}) = (1,'d2','Mann','Frau',0,1,'l,p=ADJA');
#($isDiff,$rel,@query{qw(query bquery slice groupby)}) = (1,'diff-ddc','$p=PAV=2 #has[textClass,/Wiss*/]','$p=PAV=2 #has[textClass,/Bell*/]',0,'l');
##
#($rel,@query{qw(query slice)}) = ('ddc', '$p=ADJA=2 Haus', 0);
#($rel,@query{qw(query slice)}) = ('tdf', 'Haus', 0);
##
#($rel,@query{qw(query groupby slice date)}) = ('cof','Mann','l,p=ADJA',0,'1914:1915');
#($rel,@query{qw(query groupby slice date)}) = ('ug','/mann$/i','l,p=NN',0,'1914:1915');
#($rel,@query{qw(query groupby slice date)}) = ('tdf','Mann','l,p=ADJA',0,'1914:1915'); ##-- TODO
#
#($rel,@query{qw(query groupby slice)}) = ('ddc','"$p=ADJA=2 Mann"','l,p',0);
#($rel,@query{qw(query groupby slice)}) = ('ddc','"$p=ADJA=2 Kaffee"','l',0);
##
#($rel,@query{qw(query date groupby slice)}) = ('ddc','near(flood,{frequency,uncertainty,risk}=2,8) #fmin 1', 2004, '[@const]', 1);
##
#($rel,@query{qw(slice query)}) = ('tdf',0,'function &= $p=/^N/');
#($rel,@query{qw(slice query)}) = ('tdf',0,'/optics/');
##
#($rel,@query{qw(slice query)}) = ('ddc',0,'"$p=ADJA=2 @Maus" #fmin 1');
##
#($rel,@query{qw(slice query)}) = ('tdf',0,'Maschine');
#($rel,@query{qw(slice query groupby)}) = ('tdf',0,'Zahnstange && Nuth','basename');
#($rel,@query{qw(slice query groupby)}) = ('tdf',0,'Zahnstange #has[basename,fischer_w*]','basename');
##
#($rel,@query{qw(slice groupby query)}) = ('ddc',0,'l','"(ge* &= $p=ADJA)=2 $l=@Maschine" #fmin 1'); ##-- xykeys-like f2 bug
#($rel,@query{qw(slice groupby query)}) = ('ddc',0,'l','Haus=2 || Garten=1 #fmin 1'); ##-- xykeys-like f2 bug
#($rel,@query{qw(slice groupby query)}) = ('ddc',0,'l','"*=2 Mansch" #fmin 1'); ##-- missing {schön,ganz,",d} from list-client (was cutoff=>0, fixed with cutoff=>'' in Client::list)
}
##--/DEBUG queries
if ($niters != 1) {
$cli->info("performing $niters query iterations");
}
my $timer = DiaColloDB::Timer->start();
foreach my $iter (1..$niters) {
my $mp = $cli->query($rel, %query)
or die("$prog: query() failed for relation '$rel', query '$query{query}'".($isDiff ? " - '$query{bquery}'" : '').": $cli->{error}");
##-- dump stringified query
my $outfile = ($iter==1 ? '-' : '/dev/null');
if ($outfmt eq 'text') {
$mp->trace("saveTextFile()");
$mp->saveTextFile($outfile,%save);
}
elsif ($outfmt eq 'json') {
$mp->trace("saveJsonFile()");
$mp->saveJsonFile($outfile, pretty=>$pretty,canonical=>$pretty); #utf8=>0
}
elsif ($outfmt eq 'html') {
$mp->trace("saveHtmlFile()");
$mp->saveHtmlFile($outfile,verbose=>!$pretty,%save);
}
}
##-- cleanup
$cli->close();
##-- timing
if ($dotime || $niters > 1) {
$cli->info("operation completed in ", $timer->timestr,
($niters > 1 ? sprintf(" (%.2f iter/sec)", $niters/$timer->elapsed) : qw()),
);
}
__END__
###############################################################
## pods
###############################################################
=pod
=encoding utf8
=head1 NAME
dcdb-query.perl - query a DiaColloDB diachronic collocation database
=head1 SYNOPSIS
dcdb-query.perl [OPTIONS] DBURL QUERY1 [QUERY2]
General Options:
-help # display a brief usage summary
-version # display program version
-[no]time # do/don't report operation timing (default=do)
-iters NITERS # benchmark NITERS iterations of query
Query Options:
-col, -ug, -ddc, -tdf # select profile type (collocations, unigrams, ddc client, tdf matrix; default=-col)
-(a|b)?date DATES # set target DATE or /REGEX/ or MIN-MAX
-(a|b)?slice SLICE # set target date slice (default=1)
-groupby GROUPBY # set result aggregation (default=l)
-kbest KBEST # return only KBEST items per date-slice (default=10)
-nokbest # disable k-best pruning
-cutoff CUTOFF # set minimum score for returned items (default=none)
-nocutoff # disable cutoff pruning
-[no]global # do/don't trim profiles globally (vs. locally by date-slice; default=don't)
-[no]strings # debug: do/don't stringify returned profile (default=do)
-1pass , -2pass # do/don't use fast but incorrect 1-pass method (default=don't)
-O KEY=VALUE # set DiaColloDB::Client option
-SO KEY_=VALUE # set sub-client option (for list:// clients)
Scoring Options:
-f # score by raw frequency
-lf # score by log-frequency
-fm # score by frequency per million tokens
-lfm # score by log-frequency per million tokens
-milf # score by pointwise mutual information x log-frequency product
-mi1 # score by raw pointwise mutual information
-mi3 # score by pointwise mutual information^3 (RychlĂ˝ 2008)
-ld # score by scaled log-Dice coefficient (RychlĂ˝ 2008)
-ll # score by 1-sided log-likelihood ratio (Evert 2008)
-eps EPS # smoothing constant (default=0)
-diff DIFFOP # diff operation (adiff|diff|sum|min|max|avg|havg|gavg; default=adiff)
I/O Options:
-user USER[:PASSWD] # user credentials for HTTP queries
-text # use text output (default)
-json # use json output
-null # don't output profile at all
-[no]pretty # do/don't pretty-print json output (default=do)
-log-level LEVEL # set minimum DiaColloDB log-level
Arguments:
QUERY1 # space-separated target1 string(s) LIST or /REGEX/ or DDC-query
QUERY2 # space-separated target2 string(s) LIST or /REGEX/ or DDC-query (for diff profiles)
Grouping and Filtering:
GROUPBY is a space- or comma-separated list of the form ATTR1[=FILTER1] ..., where:
- ATTR is the name or alias of a supported attribute (e.g. 'lemma', 'pos', etc.), and
- FILTER is either a |-separated LIST of literal values or a /REGEX/[gimsadlu]*
Diff Operations:
DIFF is one of: adiff diff sum min max avg havg gavg lavg
=cut
###############################################################
## DESCRIPTION
###############################################################
=pod
=head1 DESCRIPTION
dcdb-query.perl
is a command-line utility for querying a
L<DiaColloDB|DiaColloDB> diachronic collocation database.
=cut
###############################################################
## OPTIONS AND ARGUMENTS
###############################################################
=pod
=head1 OPTIONS AND ARGUMENTS
=cut
###############################################################
# Arguments
###############################################################
=pod
=head2 Arguments
=over 4
=item DBURL
URL identifying the L<DiaColloDB|DiaColloDB>
database to be queried,
in a form accepted by L<DiaColloDB::Client-E<gt>open()|DiaColloDB::Client/open>.
In particular, I<DBURL> can be a local L<DiaColloDB|DiaColloDB> database directory,
in which case it will be queried via
the L<DiaColloDB::Client::file|DiaColloDB::Client::file> class.
A local L<DiaColloDB::Client|DiaColloDB::Client> configuration file L<RCFILE>
=item QUERY1
Primary target query as accepted by
L<DiaColloDB-E<gt>parseQuery|DiaColloDB/parseQuery>,
usually a space-separated of target string(s) C<LIST>,
a target C</REGEX/> or a DDC-query string.
=item QUERY2
Optional comparsion target query.
If specified, a "diff" profile is computed
as for L<DiaColloDB::compare()|DiaColloDB/compare>,
otherwise a unary profile is computed
as for L<DiaColloDB::profile()|DiaColloDB/profile>.
=back
=cut
###############################################################
# General Options
###############################################################
=pod
=head2 General Options
=over 4
=item -help
Display a brief help message and exit.
=item -version
Display version information and exit.
=item -time
=item -notime
Do/don't report operation timing (default=do).
=item -iters NITERS
Benchmark NITERS iterations of query (default=1).
=back
=cut
###############################################################
# Query Options
=pod
=head2 Query Options
=over 4
=item -col
Request "collocation" profiling via L<DiaColloDB::Relation::Cofreqs|DiaColloDB::Relation::Cofreqs> (default).
=item -ug
Request "unigram" profiling via L<DiaColloDB::Relation::Unigrams|DiaColloDB::Relation::Unigrams>
=item -ddc
Request profiling via L<DiaColloDB::Relation::DDC|DiaColloDB::Relation::DDC>.
Slow and generally inefficient, but very flexible.
Requires that the underlying DB be associated with a DDC server,
e.g. by means of the L<C<ddcServer>|DiaColloDB/new> DB key.
=item -tdf
Request (term x document) matrix profiling via L<DiaColloDB::Relation::TDF|DiaColloDB::Relation::TDF>.
Requires TDF support in the underlying DB.
=item -date DATES
=item -adate DATES
Set L<primary target|/QUERY1> date C<DATE> or C</REGEX/> or date-range C<MIN:MAX>.
Either C<MIN> or or C<MAX> may be an asterisk (C<*>) to indicate the
minimum rsp. maximum date indexed in the corpus.
=item -bdate DATES
As for L<-adate|/-adate DATES>, but specifies date for the
L<comparison target|/QUERY2>.
=item -slice SLICE
=item -aslice SLICE
Set the L<primary target|/QUERY1> date slice (default=1).
=item -bslice SLICE
Set the L<comparison target|/QUERY2> date slice (default=1).
=item -groupby GROUPBY
Aggregate collocates by the attributes specified in
I<GROUPBY>, which should be a list of indexed attributes
with optional restriction clauses as accepted by
L<DiaColloDB-E<gt>parseQuery|DiaColloDB/parseQuery>,
or (in L<-ddc|/-ddc> mode only) a DDC L<count-by list|http://odo.dwds.de/~moocow/software/ddc/ddc_query.html#rule_l_countkeys>
enclosed in square brackets C<[ I<l_countkeys> ]>.
=item -kbest KBEST
Return only KBEST items per date-slice (default=10).
=item -nokbest
Disable k-best pruning.
=item -cutoff CUTOFF
Set minimum score for returned items (unary profiles only; default=none).
=item -nocutoff
Disable cutoff pruning.
=item -[no]global
Do/don't trim profiles globally (vs. locally by date-slice; default=don't).
=item -[no]strings
Debug: do/don't stringify returned profile (default=do).
=item -1pass
Use fast but incorrect single-pass frequency acquisition method.
=item -2pass
Use slower but correct 2-pass frequency acqusition method (default).
=item -O KEY=VALUE
Set a L<DiaColloDB::Client|DiaColloDB::Client> option.
=back
=cut
###############################################################
# Scoring Options
=pod
=head2 Scoring Options
See L<DiaColloDB::Profile|DiaColloDB::Profile> for supported scoring functions.
=over 4
=item -f
score by raw frequency
=item -lf
score by log-frequency
=item -fm
score by frequency per million tokens
=item -lfm
score by log-frequency per million tokens
=item -milf
score by pointwise mutual information x log-frequency product
=item -mi1
score by raw pointwise mutual information
=item -mi3
score by pointwise mutual information^3 (RychlĂ˝ 2008)
=item -ld
score by scaled log-Dice coefficient (RychlĂ˝ 2008; default)
=item -ll
score by 1-sided log-likelihood ratio (Evert 2008)
=item -eps EPS
score function smoothing constant (default=0.5)
=item -diff DIFFOP
diff operation to use for
L<comparison profiles|/QUERY2>.
Known values:
adiff # absolute score difference (default)
diff # raw score difference
sum # sum
min # minimum
max # maximum
avg # average
havg # pseudo-harmonic average
gavg # pseudo-geometric average
=back
=cut
###############################################################
# I/O and Logging Options
=pod
=head2 I/O and Logging Options
=over 4
=item -user USER[:PASSWD]
Specify user credentials for HTTP queries
=item -text
generate text output (default).
=item -json
generate json output.
=item -html
generate HTML output.
=item -null
don't output profile data at all (for timing and debugging).
=item -[no]pretty
do/don't pretty-print json output (default=do)
=item -score-format FORMAT
L<sprintf|perlfunc/sprintf>-format for score formatting,
used by text and HTML output modes.
=item -log-level LEVEL
set minimum L<DiaColloDB::Logger|DiaColloDB::Logger> log-level.
=back
=cut
###############################################################
# Bugs and Limitations
###############################################################
=pod
=head1 BUGS AND LIMITATIONS
Probably many.
=cut
###############################################################
# Footer
###############################################################
=pod
=head1 ACKNOWLEDGEMENTS
Perl by Larry Wall.
=head1 AUTHOR
Bryan Jurish E<lt>moocow@cpan.orgE<gt>
=head1 SEE ALSO
L<DiaColloDB(3pm)|DiaColloDB>,
L<dcdb-create.perl(1)|dcdb-create.perl>,
L<dcdb-info.perl(1)|dcdb-info.perl>,
L<dcdb-export.perl(1)|dcdb-export.perl>,
perl(1).
=cut