NAME
DiaColloDB - diachronic collocation database, top-level
SYNOPSIS
##========================================================================
## PRELIMINARIES
use DiaColloDB;
##========================================================================
## Constructors etc.
$coldb = CLASS_OR_OBJECT->new(%args);
##========================================================================
## I/O: open/close
$coldb_or_undef = $coldb->open($dbdir,%opts);
@dbkeys = $coldb->dbkeys();
$coldb_or_undef = $coldb->close();
$bool = $coldb->opened();
@files = $coldb->diskFiles();
##========================================================================
## create: utils
$multimap = $coldb->create_xmap($base, \%xs2i, $packfmt, $label="multimap");
\@attrs = $coldb->attrs();
$atitle = $CLASS_OR_OBJECT->attrTitle($attr_or_alias);
$acbexpr = $CLASS_OR_OBJECT->attrCountBy($attr_or_alias,$matchid=0);
$aquery_or_filter_or_undef = $CLASS_OR_OBJECT->attrQuery($attr_or_alias,$cquery);
\@attrdata = $coldb->attrData();
$bool = $coldb->hasAttr($attr);
##========================================================================
## create: from corpus
$bool = $coldb->create($corpus,%opts);
##========================================================================
## create: union (aka merge)
$coldb = $CLASS_OR_OBJECT->union(\@coldbs_or_dbdirs,%opts);
##========================================================================
## I/O: header
@keys = $coldb->headerKeys();
$bool = $coldb->loadHeaderData();
##========================================================================
## Export/Import
$bool = $coldb->dbexport();
$coldb = $coldb->dbimport();
##========================================================================
## Info
\%info = $coldb->dbinfo();
##========================================================================
## Profiling: Utils
$relname = $coldb->relname($rel);
$obj_or_undef = $coldb->relation($rel);
\@ids = $coldb->enumIds($enum,$req,%opts);
($dfilter,$sliceLo,$sliceHi,$dateLo,$dateHi)
= $coldb->parseDateRequest($dateRequest='', $sliceRequest=0, $fill=0, $ddcMode=0);
\%slice2xids = $coldb->xidsByDate(\@xids, $dateRequest, $sliceRequest, $fill);
$compiler = $coldb->qcompiler();
$cquery_or_undef = $coldb->qparse($ddc_query_string);
$cquery = $coldb->parseQuery([[$attr1,$val1],...], %opts) ##-- compat: ARRAY-of-ARRAYs
\@aqs = $coldb->queryAttributes($cquery,%opts);
\@aqs = $coldb->parseRequest($request, %opts);
\%groupby = $coldb->groupby($groupby_request, %opts);
$cqfilter = $coldb->query2filter($attr,$cquery,%opts);
($CQCountKeyExprs,\$CQRestrict,\@CQFilters)
= $coldb->parseGroupBy($groupby_string_or_request,%opts);
##========================================================================
## Profiling: Generic
$mprf = $coldb->profile($relation, %opts);
##========================================================================
## Profiling: Comparison (diff)
$mprf = $coldb->compare($relation, %opts);
DESCRIPTION
The DiaColloDB package is the top-level module for the DiaColloDB diachronic collocation database package. As a Perl class, a DiaColloDB object can be used to create or query a local native database instance.
Globals & Constants
- Variable: $VERSION
-
Package version.
- Variable: @ISA
-
DiaColloDB inherits from DiaColloDB::Client, and provides the low-level basis for the DiaColloDB::Client API.
- Variable: $PGOOD_DEFAULT
-
Default positive pos regex for document parsing -- don't use qr// here, since Storable doesn't like pre-compiled Regexps. Default =
q/^(?:N|TRUNC|VV|ADJ)/
- Variable: $PBAD_DEFAULT
-
Default negative pos regex for document parsing. Default = undef (none).
- Variable: $WGOOD_DEFAULT
-
Default positive word regex for document parsing. Default =
q/[[:alpha:]]/
- Variable: $WBAD_DEFAULT
-
Default negative word regex for document parsing. Default = q/[\.]/
- Variable: $LGOOD_DEFAULT
-
Default positive lemma regex for document parsing. Default = undef (none).
- Variable: $LBAD_DEFAULT
-
Default negative lemma regex for document parsing. Default = undef (none).
- Variable: $ECLASS
-
Enum class. Default = 'DiaColloDB::EnumFile::MMap'.
- Variable: $XECLASS
-
fixed-length enum class. Default = 'DiaColloDB::EnumFile::FixedLen'
- Variable: $MMCLASS
-
multimap class. Default = 'DiaColloDB::MultiMapFile'
Constructors etc.
- new
-
$coldb = CLASS_OR_OBJECT->new(%args);
%args, object structure:
(
 ##-- options
 dbdir => $dbdir, ##-- database directory; REQUIRED
 flags => $fcflags, ##-- fcntl flags or open()-style mode string; default='r'
 attrs => \@attrs, ##-- index attributes (input as space-separated or array; compiled to array); default=undef (==>['l'])
 ## + each attribute can be token-attribute qw(w p l) or a document metadata attribute "doc.ATTR"
 ## + document "date" attribute is always indexed
 info => \%info, ##-- additional data to return in info() method (e.g. collection, maintainer)
 bos => $bos, ##-- special string to use for BOS, undef or empty for none (default=undef)
 eos => $eos, ##-- special string to use for EOS, undef or empty for none (default=undef)
 pack_id => $fmt, ##-- pack-format for IDs (default='N')
 pack_f => $fmt, ##-- pack-format for frequencies (default='N')
 pack_date => $fmt, ##-- pack-format for dates (default='n')
 pack_off => $fmt, ##-- pack-format for file offsets (default='N')
 pack_len => $len, ##-- pack-format for string lengths (default='n')
 dmax => $dmax, ##-- maximum distance for collocation-frequencies and implicit ddc near() queries (default=5)
 cfmin => $cfmin, ##-- minimum co-occurrence frequency for Cofreqs and ddc queries (default=2)
 keeptmp => $bool, ##-- keep temporary files? (default=0)
 ##
 ##-- runtime ddc relation options
 ddcServer => "$host:$port", ##-- server for ddc relation
 ddcTimeout => $seconds, ##-- timeout for ddc relation
 ##
 ##-- source filtering (for create())
 pgood => $regex, ##-- positive filter regex for part-of-speech tags
 pbad => $regex, ##-- negative filter regex for part-of-speech tags
 wgood => $regex, ##-- positive filter regex for word text
 wbad => $regex, ##-- negative filter regex for word text
 lgood => $regex, ##-- positive filter regex for lemma text
 lbad => $regex, ##-- negative filter regex for lemma text
 ##
 ##-- logging
 logOpen => $level, ##-- log-level for open/close (default='info')
 logCreate => $level, ##-- log-level for create messages (default='info')
 logCorpusFile => $level, ##-- log-level for corpus file-parsing (default='trace')
 logCorpusFileN => $N, ##-- log corpus file-parsing only for every N files (0 for none; default:undef ~ $corpus->size()/100)
 logExport => $level, ##-- log-level for export messages (default='info')
 logProfile => $level, ##-- log-level for verbose profiling messages (default='trace')
 logRequest => $level, ##-- log-level for request-level profiling messages (default='debug')
 ##
 ##-- runtime limits
 maxExpand => $size, ##-- maximum number of elements in query expansions (default=65535)
 ##
 ##-- attribute data
 ${a}enum => $aenum, ##-- attribute enum: $aenum : ($dbdir/${a}_enum.*) : $astr<=>$ai : A*<=>N
 ## e.g. lemmata: $lenum : ($dbdir/l_enum.* ) : $lstr<=>$li : A*<=>N
 ${a}2x => $a2x, ##-- attribute multimap: $a2x : ($dbdir/${a}_2x.*) : $ai=>@xis : N=>N*
 pack_x$a => $fmt ##-- pack format: extract attribute-id $ai from a packed tuple-string $xs ; $ai=unpack($coldb->{"pack_x$a"},$xs)
 ##
 ##-- tuple data (+dates)
 xenum => $xenum, ##-- enum: tuples ($dbdir/xenum.*) : [@ais,$di]<=>$xi : N*n<=>N
 pack_x => $fmt, ##-- symbol pack-format for $xenum : "${pack_id}[Nattrs]${pack_date}"
 xdmin => $xdmin, ##-- minimum date
 xdmax => $xdmax, ##-- maximum date
 ##
 ##-- tuple data (-dates) : TODO
 #tenum => $tenum, ##-- enum: attribute-tuples (no dates), only if $coldb->{indexAttrs}
 #pack_t => $fmt, ##-- symbol pack-format for $tenum : "${pack_id}[Nattrs]"
 ##
 ##-- relation data
 xf => $xf, ##-- ug: $xi => $f($xi) : N=>N
 cof => $cof, ##-- cf: [$xi1,$xi2] => $f12
 ddc => $ddc, ##-- ddc client relation
)
- promote
-
$cli_or_undef = $cli->promote($class,%opts);
DiaColloDB::Client method override: unsupported.
I/O: open/close
- open
-
$coldb_or_undef = $coldb->open($dbdir,%opts); $coldb_or_undef = $coldb->open();
Open the DB.
- dbkeys
-
@dbkeys = $coldb->dbkeys();
Returns list of %$coldb keys whose values are expected to be sub-objects.
- close
-
$coldb_or_undef = $coldb->close();
Close current DB, if opened.
- opened
-
$bool = $coldb->opened();
Returns true iff db is opened.
- diskFiles
-
@files = $coldb->diskFiles();
Returns list of disk files for $coldb.
create: utils
- Variables: (%ATTR_ALIAS,%ATTR_RALIAS,%ATTR_TITLE,%ATTR_CBEXPR);
-
Global attribute alias hacks.
%ATTR_ALIAS = ($name_or_alias=>$name, ...) %ATTR_RALIAS = ($name=>\@aliases, ...) %ATTR_CBEXPR = ($name=>$ddcCountByExpr, ...) %ATTR_TITLE = ($name_or_alias=>$title, ...)
- create_xmap
-
$multimap = $coldb->create_xmap($base, \%xs2i, $packfmt, $label="multimap");
Create an expansion map.
- attrs
-
\@attrs = $coldb->attrs(); \@attrs = $coldb->attrs($attrs=$coldb->{attrs}, $default=[]);
parse attributes in $attrs as array.
- attrName
-
$aname = $CLASS_OR_OBJECT->attrName($attr)
Returns canonical (short) attribute name for $attr. Supports aliases in %ATTR_ALIAS = ($alias=>$name, ...).
- attrTitle
-
$atitle = $CLASS_OR_OBJECT->attrTitle($attr_or_alias);
Returns an attribute title for $attr_or_alias
- attrCountBy
-
$acbexpr = $CLASS_OR_OBJECT->attrCountBy($attr_or_alias,$matchid=0);
Returns a DDC::XS::CQCountKeyExpr object for $attr_or_alias with match-id $matchid.
- attrQuery
-
$aquery_or_filter_or_undef = $CLASS_OR_OBJECT->attrQuery($attr_or_alias,$cquery);
returns a DDC::XS::CQuery or DDC::XS::CQFilter object for condition $cquery on $attr_or_alias.
- attrData
-
\@attrdata = $coldb->attrData(); \@attrdata = $coldb->attrData(\@attrs=$coldb->attrs)
get attribute data for \@attrs; returns @attrdata = ({a=>$a, i=>$i, enum=>$aenum, pack_x=>$pack_xa, a2x=>$a2x, ...})
- hasAttr
-
$bool = $coldb->hasAttr($attr);
Returns true iff $coldb natively supports the attribute (or alias) $attr.
create: from corpus
- create
-
$bool = $coldb->create($corpus,%opts);
%opts:
$key => $val, ##-- clobbers $coldb->{$key}
create: union (aka merge)
- union
-
$coldb = $CLASS_OR_OBJECT->union(\@coldbs_or_dbdirs,%opts);
Populates $coldb as union over @coldbs_or_dbdirs. Clobbers argument dbs {_union_${a}i2u}, {_union_xi2u}, {_union_argi}
I/O: header
Largely inherited from DiaColloDB::Persistent.
- headerKeys
-
@keys = $coldb->headerKeys();
keys to save as header
- loadHeaderData
-
$bool = $coldb->loadHeaderData(); $bool = $coldb->loadHeaderData($data)
loads header data.
Export/Import
- dbexport
-
$bool = $coldb->dbexport(); $bool = $coldb->dbexport($outdir,%opts);
$outdir defaults to "$coldb->{dbdir}/export" %opts:
export_sdat => $bool, ##-- whether to export *.sdat (stringified tuple files for debugging; default=0) export_cof => $bool, ##-- do/don't export cof.* (default=do)
- dbimport
-
$coldb = $coldb->dbimport(); $coldb = $coldb->dbimport($txtdir,%opts)
Import ColocDB data from $txtdir
TODO
Info
- dbinfo
-
\%info = $coldb->dbinfo();
get db info
Profiling: Utils
- relname
-
$relname = $coldb->relname($rel);
Returns an appropriate relation name for profile() and friends:
returns $rel if $coldb->{$rel} supports a profile() method
otherwise heuristically matches $rel against the patterns /xf|f?1|ug/ or /f1?2|c/
- relation
-
$obj_or_undef = $coldb->relation($rel);
returns an appropriate relation-like object for profile() and friends; really just wraps
$coldb->{$coldb->relname($rel)}
- enumIds
-
\@ids = $coldb->enumIds($enum,$req,%opts);
parses enum IDs for $req, which is one of:
a DDC::XS::CQTokExact, ::CQTokInfl, ::CQTokSet, ::CQTokSetInfl, or ::CQTokRegex : interpreted
an ARRAY-ref : list of literal symbol-values
a Regexp ref : regexp for target strings, passed to $enum->re2i()
a string /REGEX/ : regexp for target strings, passed to $enum->re2i()
another string : space-, comma-, or |-separated list of literal values
%opts:
logLevel => $logLevel, ##-- logging level (default=undef) logPrefix => $prefix, ##-- logging prefix (default="enumIds(): fetch ids")
- parseDateRequest
-
($dfilter,$sliceLo,$sliceHi,$dateLo,$dateHi) = $coldb->parseDateRequest($dateRequest='', $sliceRequest=0, $fill=0, $ddcMode=0);
low-level parsing for date (slice) requests.
- xidsByDate
-
\%slice2xids = $coldb->xidsByDate(\@xids, $dateRequest, $sliceRequest, $fill);
parse and filter \@xids by $dateRequest, $sliceRequest; returns a HASH-ref from slice-ids to \@xids in that date-slice. If $fill is true, returned HASH-ref has a key for each date-slice in range
- qcompiler
-
$compiler = $coldb->qcompiler();
get DDC::XS::CQueryCompiler for this object (cached in $coldb->{_qcompiler})
- qparse
-
$cquery_or_undef = $coldb->qparse($ddc_query_string);
wraps parse in an eval {...} block and sets $coldb->{error} on failure
- parseQuery
-
$cquery = $coldb->parseQuery([[$attr1,$val1],...], %opts) ##-- compat: ARRAY-of-ARRAYs; $cquery = $coldb->parseQuery(["$attr1:$val1",...], %opts) ##-- compat: ARRAY-of-requests $cquery = $coldb->parseQuery({$attr1=>$val1, ...}, %opts) ##-- compat: HASH $cquery = $coldb->parseQuery("$attr1=$val1, ...", %opts) ##-- compat: string $cquery = $coldb->parseQuery($ddcQueryString, %opts) ##-- ddc string (with shorthand ","->WITH, "&&"->WITH)
Guts for parsing user target and groupby requests; returns a DDC::XS::CQuery object representing the request. Index-only items "$l" are mapped to $l=*
%opts:
warn => $level, ##-- log-level for unknown attributes (default: 'warn') logas => $reqtype, ##-- request type for warnings default => $attr, ##-- default attribute (for query requests) mapand => $bool, ##-- map CQAnd to CQWith? (default=true unless '&&' occurs in query string) ddcmode => $bool, ##-- force ddc query mode? (default=false)
- queryAttributes
-
\@aqs = $coldb->queryAttributes($cquery,%opts);
Utility for decomposing DDC queries into attribute-wise requests; returns an ARRAY-ref [[$attr1,$val1], ...]. Each value $vali is empty or undef (all values), a CQTokSet, a CQTokExact, CQTokRegex, or CQTokAny. Chokes on unsupported query types or filters
%opts:
warn => $level, ##-- log-level for unknown attributes (default: 'warn') logas => $reqtype, ##-- request type for warnings default => $attr, ##-- default attribute (for query requests) allowUnknown => $bool, ##-- allow unknown attributes? (default: 0)
- parseRequest
-
\@aqs = $coldb->parseRequest($request, %opts);
Guts for parsing user target and groupby requests into attribute-wise ARRAY-ref [[$attr1,$val1], ...], used by native profiling methods. See parseQuery() method for supported $request formats and %opts. Wraps $coldb->queryAttributes($coldb->parseQuery($request,%opts)).
- groupby
-
\%groupby = $coldb->groupby($groupby_request, %opts); \%groupby = $coldb->groupby(\%groupby, %opts);
Parse a user groupby request, used by native profiling methods. See parseRequest() for details on syntax of $groupby_request. Returns a HASH-ref of the form:
req => $request, ##-- save request x2g => \&x2g, ##-- group-id extraction code suitable for e.g. DiaColloDB::Relation::Cofreqs::profile(groupby=>\&x2g) g2s => \&g2s, ##-- stringification object suitable for DiaColloDB::Profile::stringify() [CODE,enum, or undef] areqs => \@areqs, ##-- parsed attribute requests ([$attr,$ahaving],...) attrs => \@attrs, ##-- like $coldb->attrs($groupby_request), modulo "having" parts titles => \@titles, ##-- like map {$coldb->attrTitle($_)} @attrs
%opts:
warn => $level, ##-- log-level for unknown attributes (default: 'warn') relax => $bool, ##-- allow unsupported attributes (default=0)
- query2filter
-
$cqfilter = $coldb->query2filter($attr,$cquery,%opts);
Converts a CQToken to a CQFilter, for ddc parsing. %opts:
logas => $logas, ##-- log-prefix for warnings
- parseGroupBy
-
($CQCountKeyExprs,\$CQRestrict,\@CQFilters) = $coldb->parseGroupBy($groupby_string_or_request,%opts);
ddc-mode groupby parsing utility. %opts:
date => $date, slice => $slice, matchid => $matchid, ##-- default match-id
Profiling: Generic
- profile
-
$mprf = $coldb->profile($relation, %opts);
Get a relation profile for selected items as a DiaColloDB::Profile::Multi object. %opts:
##-- selection parameters query => $query, ##-- target request ATTR:REQ... date => $date1, ##-- string or array or range "MIN-MAX" (inclusive) : default=all ## ##-- aggregation parameters slice => $slice, ##-- date slice (default=1, 0 for global profile) groupby => $groupby, ##-- string or array "ATTR1[:HAVING1] ...": default=$coldb->attrs; see groupby() method ## ##-- scoring and trimming parameters eps => $eps, ##-- smoothing constant (default=0) score => $func, ##-- scoring function (f,fm,lf,lfm,mi,ld) : default="f" kbest => $k, ##-- return only $k best collocates per date (slice) : default=-1:all cutoff => $cutoff, ##-- minimum score global => $bool, ##-- trim profiles globally (vs. locally for each date-slice?) (default=0) ## ##-- profiling and debugging parameters strings => $bool, ##-- do/don't stringify (default=do) fill => $bool, ##-- if true, returned multi-profile will have null profiles inserted for missing slices
Sets default %opts and wraps $coldb->relation($rel)->profile($coldb, %opts).
Profiling: Comparison (diff)
- compare
-
$mprf = $coldb->compare($relation, %opts);
Get a relation comparison profile for selected items as a DiaColloDB::Profile::MultiDiff object. %opts:
##-- selection parameters (a|b)?query => $query, ##-- target query as for parseRequest() (a|b)?date => $date1, ##-- string or array or range "MIN-MAX" (inclusive) : default=all ## ##-- aggregation parameters groupby => $groupby, ##-- string or array "ATTR1[:HAVING1] ...": default=$coldb->attrs; see groupby() method (a|b)?slice => $slice, ##-- date slice (default=1, 0 for global profile) ## ##-- scoring and trimming parameters eps => $eps, ##-- smoothing constant (default=0) score => $func, ##-- scoring function (f,fm,lf,lfm,mi,ld) : default="f" kbest => $k, ##-- return only $k best collocates per date (slice) : default=-1:all cutoff => $cutoff, ##-- minimum score (UNUSED for comparison profiles) global => $bool, ##-- trim profiles globally (vs. locally for each date-slice?) (default=0) diff => $diff, ##-- low-level score-diff operation (diff|adiff|sum|min|max|avg|havg|gavg|lavg); default='adiff' ## ##-- profiling and debugging parameters strings => $bool, ##-- do/don't stringify (default=do)
Sets default %opts and wraps $coldb->relation($rel)->compare($coldb, %opts)
AUTHOR
Bryan Jurish <moocow@cpan.org>
COPYRIGHT AND LICENSE
Copyright (C) 2015-2016 by Bryan Jurish
This package is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.14.2 or, at your option, any later version of Perl 5 you may have available.
SEE ALSO
dcdb-create.perl(1), dcdb-query.perl(1), dcdb-info.perl(1), dcdb-export.perl(1), dcdb-dump.perl(1), DiaColloDB(3pm), perl(1), ...