## -*- Mode: CPerl -*-
## File: DTA::CAB::Analyzer::Dict.pm
## Author: Bryan Jurish <moocow@cpan.org>
## Description: generic analysis dictionary API using Lingua::TT::Dict
use DTA::CAB::Analyzer ':child';
use Carp;
use strict;
## Globals
##-- inheritance: this class is a DTA::CAB::Analyzer
our @ISA = ('DTA::CAB::Analyzer');
## Globals: Accessors
## + dict application is computed as:
## $dic->accessClosure($dic->{analyzeCode})->();
## + analysis closure compiled from $dic->{analyzeCode} can use vars:
## $dic ##-- analyzer object
## $anl ##-- analyzer object (alias provided by Analyzer::accessClosure)
## $lab ##-- $dic->{label}
## $dhash ##-- $dic->dictHash()
## #$doc ##-- document being analyzed
## #$types ##-- types being analyzed with analyzeTypes()
## #$opts ##-- user options to analyzeTypes()
## + the following lexical temporaries are provided:
## $key ##-- temporary; unused here
## $val ##-- temporary; unused here
## @keys ##-- temporary; unused here
## @vals ##-- temporary; unused here
## %vals ##-- temporary; unused here
## $CODE_DEFAULT
## + default {analyzeCode} string: assigns to $_->{$lab} the $dhash entry whose key expression
##   is the code fragment returned by _am_xlit()
##   (presumably an accessor for the token's transliterated text -- confirm in DTA::CAB::Analyzer)
## + a trailing _am_clean('$_->{$lab}') was dropped: useless, since expandTypes() puts undef back!
our $CODE_DEFAULT = sprintf('$_->{$lab}=$dhash->{%s};', scalar(_am_xlit()));
## Constructors etc.
## $obj = CLASS_OR_OBJ->new(%args)
## + object structure:
## (
## ##-- Filename Options
## dictFile=> $filename, ##-- default: none
## ##-- Analysis Output
## label => $lab, ##-- analyzer label
## analyzeCode => $code, ##-- pseudo-accessor ($code->()): apply dict to current token ($_)
## ##-- Analysis Options
## encoding => $enc, ##-- encoding of dict file (default='UTF-8')
## allowRegex => $re, ##-- only lookup tokens whose text matches $re (default=none)
## eqIdWeight => $w, ##-- weight for identity analyses for analyzeSet=>$DICT_SET_FST_EQ
## ##-- Analysis objects
## ttd => $ttdict, ##-- underlying Lingua::TT::Dict object
## )
## + builds the object via the inherited constructor, passing the defaults below
##   followed by the caller's %args, so that caller-supplied values take precedence
## + returns the new analyzer object
sub new {
  my $that = shift;
  my $dic  = $that->SUPER::new(
			       ##-- filenames
			       dictFile => undef,

			       ##-- analysis objects
			       ##   NOTE(review): no default {ttd} is created here; ensureLoaded() needs one
			       ##   (documented above as a Lingua::TT::Dict object) -- confirm where it is set

			       ##-- options
			       encoding => 'UTF-8',

			       ##-- analysis output
			       label       => 'dict',
			       analyzeCode => $CODE_DEFAULT,
			       allowRegex  => undef,

			       ##-- user args
			       @_,
			      );
  return $dic;
}
## $dic = $dic->clear()
## + currently only returns the object itself; no object state is touched
## + NOTE(review): the name suggests the loaded dictionary ({ttd}) should be emptied here -- confirm
sub clear {
  my $dic = shift;
  return $dic;
}
## Methods: Embedded API
## $bool = $dict->dictOk()
## + returns false iff dict is undefined or "empty"
## + also returns false (instead of dying under 'use strict') when {ttd} exists
##   but carries no {dict} table
sub dictOk {
  my $ttd = $_[0]{ttd};
  return (defined($ttd)
	  && defined($ttd->{dict})
	  && scalar(%{$ttd->{dict}}) ##-- true iff the table holds at least one key
	 );
}
## \%key2val = $dict->dictHash()
## + returns a (possibly tie()d hash) representing dict contents
## + default just returns $dic->{ttd}{dict} or a new empty hash
## + the fallback is a fresh anonymous hash on every call, so callers may always dereference the result
sub dictHash {
  my $ttd = $_[0]{ttd};
  return {} if (!$ttd || !$ttd->{dict}); ##-- nothing loaded: hand out an empty table
  return $ttd->{dict};
}
## $val_or_undef = $dict->dictLookup($key)
## + get stored value for key $key
## + default returns $dict->{ttd}{dict}{$key} or undef
## + also returns undef when no {ttd} object or no {ttd}{dict} table is present
sub dictLookup {
  my ($dict,$key) = @_;
  my $ttd = $dict->{ttd};
  return undef if (!$ttd || !$ttd->{dict}); ##-- explicit undef: always exactly one return value
  return $ttd->{dict}{$key};
}
## Methods: I/O
## Methods: I/O: Input: all
## $bool = $dic->ensureLoaded()
## + ensures analyzer data is loaded from default files
## + returns true without doing anything if no {dictFile} is set or dictOk() already holds
## + otherwise logs via info() and returns the result of {ttd}->loadFile({dictFile}, encoding=>{encoding})
## + croaks with an explicit message if a load is required but there is no {ttd} object to load into
sub ensureLoaded {
  my $dic = shift;
  my $rc  = 1;
  if ( defined($dic->{dictFile}) && !$dic->dictOk ) {
    $dic->info("loading dictionary file '$dic->{dictFile}'");

    ##-- fail with a meaningful message rather than "Can't call method ... on an undefined value"
    croak(ref($dic)."::ensureLoaded(): no {ttd} object available for loading '$dic->{dictFile}'")
      if (!defined($dic->{ttd}));

    $rc &&= $dic->{ttd}->loadFile($dic->{dictFile}, encoding=>$dic->{encoding});
  }
  return $rc;
}
## Methods: Persistence
## Methods: Persistence: Perl
## @keys = $class_or_obj->noSaveKeys()
## + returns list of keys not to be saved
## + extends the inherited list with 'ttd', so the dictionary object itself is not saved
sub noSaveKeys {
  my $that = shift;
  return ($that->SUPER::noSaveKeys, qw(ttd));
}
## $saveRef = $obj->savePerlRef()
## + inherited from DTA::CAB::Persistent
## $loadedObj = $CLASS_OR_OBJ->loadPerlRef($ref)
## + OLD: implicitly calls $obj->clear()
## + currently a plain pass-through: returns whatever the inherited loadPerlRef() returns
sub loadPerlRef {
  my ($that,$ref) = @_;
  my $obj = $that->SUPER::loadPerlRef($ref);
  return $obj;
}
## Methods: Analysis
## Methods: Analysis: Generic
## $bool = $anl->canAnalyze()
## + returns true if analyzer can perform its function (e.g. data is loaded & non-empty)
## + override calls dictOk()
## + result is exactly that of $anl->dictOk(), dispatched as a method so subclass overrides apply
sub canAnalyze {
  my $anl = shift;
  return $anl->dictOk();
}
## Methods: Analysis: v1.x: API
## $doc = $anl->analyzeTypes($doc,\%types,\%opts)
## + perform type-wise analysis of all (text) types in $doc->{types}
## + compiles the analysis closure once via analyzeCode(), then calls it for each value of %$types
##   with $_ aliased to that type
## + if {allowRegex} is defined, types whose {text} does not match it are skipped
## + $opts is accepted but not consulted here
## + returns $doc
sub analyzeTypes {
  my ($dic,$doc,$types,$opts) = @_;

  ##-- setup common variables
  my $allow_re = defined($dic->{allowRegex}) ? qr($dic->{allowRegex}) : undef;
  my $acode    = $dic->analyzeCode;

  foreach (values %$types) {
    next if (defined($allow_re) && $_->{text} !~ $allow_re);
    $acode->(); ##-- apply dict to current type ($_)
  }

  return $doc;
}
## Methods: Analysis: Utils
## $prefix = $dict->analyzePre()
## + returns the Perl source which analyzeCode() passes as 'pre' to accessClosure()
## + emits one "my ...;" line for each closure variable documented at the top of this file
##   ($dic, $lab, $dhash and the temporaries $key, $val, @keys, @vals, %vals)
## + a true $dic->{analyzePre} is appended after those declarations
##   NOTE(review): the user string is appended verbatim (not wrapped in "my ...;") -- confirm
sub analyzePre {
  my $dic = shift;

  ##-- variables promised to the analysis code; $anl itself is provided by accessClosure()
  my @decls = (
	       '$dic=$anl',
	       '$lab=$dic->{label}',
	       '$dhash=$dic->dictHash',
	       '($key,$val,@keys,@vals,%vals)',
	      );

  return join('',
	      (map {"my $_;\n"} @decls),
	      ($dic->{analyzePre} ? $dic->{analyzePre} : qw()),
	     );
}
## $coderef = $dict->analyzeCode()
## $coderef = $dict->analyzeCode($code)
## + $code defaults to $dic->{analyzeCode} if defined, else to $CODE_DEFAULT
## + the code string is compiled by accessClosure() with analyzePre() as its 'pre' option;
##   the resulting closure is returned
sub analyzeCode {
  my ($dic,$code) = @_;
  #return $dic->analyzeCode_dummy($code) if ($dic->{label} eq 'exlex'); ##-- DEBUG

  ##-- resolve defaults: explicit argument > object setting > package default
  if (!defined($code)) {
    $code = defined($dic->{analyzeCode}) ? $dic->{analyzeCode} : $CODE_DEFAULT;
  }

  my $acode = $dic->accessClosure($code, pre=>$dic->analyzePre);
  return $acode;
}
1; ##-- be happy
## POD DOCUMENTATION, auto-generated by podextract.perl, edited
=head1 NAME

DTA::CAB::Analyzer::Dict - generic analysis dictionary API using Lingua::TT::Dict

=cut

## Footer

=pod

=head1 AUTHOR

Bryan Jurish E<lt>moocow@cpan.orgE<gt>

Copyright (C) 2011-2019 by Bryan Jurish

This package is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.24.1 or,
at your option, any later version of Perl 5 you may have available.

=head1 SEE ALSO