## -*- Mode: CPerl -*-
## File:
## Author: Bryan Jurish <>
## Description: generic analysis automaton API
use DTA::CAB::Unify ':all';
use Gfsm;
use Encode qw(encode decode);
#use File::Basename qw();
use Carp;
use strict;
## Globals
our @ISA = qw(DTA::CAB::Analyzer::Dyn);
## + code string for {analyzeGet}
## + eval()d in list context, may return multiples
## + available vars:
## $tok => token object being analyzed
## $aut => analyzer (automaton)
#our $DEFAULT_ANALYZE_GET = '$_[0]{xlit} ? $_[0]{xlit}{latin1Text} : $_[0]{text}';
our $DEFAULT_ANALYZE_GET = '$tok->{xlit} ? $tok->{xlit}{latin1Text} : $tok->{text}';
## + default code string for {analyzeSet}
## + available vars:
## $tok => token object being analyzed
## $wa => analyses (array-ref, maybe blessed; undef if no analyses were found)
## $aut => analyzer (automaton)
#our $DEFAULT_ANALYZE_SET = '$_[0]{$anl->{label}}=$_[1]';
our $DEFAULT_ANALYZE_SET = '$tok->{$aut->{label}}=$wa';
## Constructors etc.
## $obj = CLASS_OR_OBJ->new(%args)
## + object structure:
## (
## ##-- Filename Options
## fstFile => $filename, ##-- source FST file (default: none)
## labFile => $filename, ##-- source labels file (default: none)
## dictFile => $filename, ##-- source dict file (default: none): clobbers $dict->{dictFile} if defined
## ##-- Exception lexicon options
## dict => $dict, ##-- exception lexicon as a DTA::CAB::Analyzer::Dict object or option hash
## ## + default=undef
## dictClass => $class, ##-- fallback class for new dict (default='DTA::CAB::Analyzer::Dict')
## ##-- Analysis Output
## analyzeGet => $code, ##-- accessor: coderef or string: source text (default=$DEFAULT_ANALYZE_GET; return undef for no analysis)
## analyzeSet => $code, ##-- accessor: coderef or string: set analyses (default=$DEFAULT_ANALYZE_SET)
## wantAnalysisLo => $bool, ##-- set to true to include 'lo' keys in analyses (default: true)
## wantAnalysisLemma => $bool, ##-- set to true to include 'lemma' keys in analyses (default: false)
## ##-- Analysis Options
## eow => $sym, ##-- EOW symbol for analysis FST
## check_symbols => $bool, ##-- check for unknown symbols? (default=1)
## labenc => $enc, ##-- encoding of labels file (default='auto')
## #dictenc => $enc, ##-- dictionary encoding (default='UTF-8') (set $aut->{dict}{encoding} instead)
## auto_connect => $bool, ##-- whether to call $result->_connect() after every lookup (default=0)
## tolower => $bool, ##-- if true, all input words will be bashed to lower-case (default=0)
## tolowerNI => $bool, ##-- if true, all non-initial characters of inputs will be lower-cased (default=0)
## toupperI => $bool, ##-- if true, initial character will be upper-cased (default=0)
## bashWS => $str, ##-- if defined, input whitespace will be bashed to '$str' (default='_')
## attInput => $bool, ##-- if true, respect AT&T lextools-style escapes in input (default=0)
## allowTextRegex => $re, ##-- if defined, only tokens with matching 'text' will be analyzed (default: none)
## ## : useful: /^(?:(?:[[:alpha:]\-\@\x{ac}]*[[:alpha:]]+)|(?:[[:alpha:]]+[[:alpha:]\-\@\x{ac}]+))(?:[\'\x{2018}\x{2019}]s)?$/
## ## : == DTA::CAB::Analyzer::_am_wordlike_regex()
## ##-- Analysis objects
## fst => $gfst, ##-- (child classes only) e.g. a Gfsm::Automaton::Dyn object (default=new)
## lab => $lab, ##-- (child classes only) e.g. a Gfsm::Alphabet object (default=new)
## labh => \%sym2lab, ##-- (?) label hash: $sym2lab{$labSym} = $labId;
## laba => \@lab2sym, ##-- (?) label array: $lab2sym[$labId] = $labSym;
## labc => \@chr2lab, ##-- (?)chr-label array: $chr2lab[ord($chr)] = $labId;, by unicode char number (e.g. unpack('U0U*'))
## result=>$resultfst, ##-- (child classes only) e.g. result fst
## ##-- INHERITED from DTA::CAB::Analyzer
## label => $label, ##-- analyzer label (default: from analyzer class name)
## typeKeys => \@keys, ##-- type-wise keys to expand
## )
sub new {
##-- constructor: hands the default option values listed below to the parent
##   class constructor (SUPER::new) and returns the object it creates
my $that = shift;
my $aut = $that->SUPER::new(
##-- filenames
fstFile => undef,
labFile => undef,
dictFile => undef,
##-- analysis objects
##-- options
##-- eow: the empty string means no EOW label is appended to lookups (see @eowlab in dynSubCode())
eow =>'',
check_symbols => 1,
##-- labenc: 'auto' makes loadLabels() guess between 'utf8' and 'latin1'
labenc => 'auto',
#dictenc => 'utf8',
auto_connect => 0,
tolower => 0,
tolowerNI => 0,
toupperI => 0,
bashWS => '_',
attInput => 0,
allowTextRegex => undef, #DTA::CAB::Analyzer::_am_wordlike_regex(),
##-- analysis I/O
##-- NOTE(review): analyzeSrc is not read by any code visible in this file;
##   the {analyzeGet}/{analyzeSet} accessors are used instead -- confirm whether it is still needed
analyzeSrc => 'text',
wantAnalysisLo => 1,
wantAnalysisLemma => 0,
##-- user args
##-- NOTE(review): the user-argument list and the end of the SUPER::new() call are not visible here
return $aut;
## $aut = $aut->clear()
## + empties the label lookup tables {labh}, {laba} and {labc} in place
##   (existing hash/array references are kept, only their contents are dropped)
## + returns the object itself
sub clear {
my $aut = shift;
##-- analysis sub(s)
##-- analysis objects
%{$aut->{labh}} = qw();
@{$aut->{laba}} = qw();
@{$aut->{labc}} = qw();
return $aut;
## Methods: Generic
## $class = $aut->fstClass()
##  + name of the class that loadFst() instantiates when no {fst} object exists yet
sub fstClass {
  return 'Gfsm::Automaton::Dyn';
}
## $class = $aut->labClass()
##  + name of the class that loadLabels() instantiates when no {lab} object exists yet
sub labClass {
  return 'Gfsm::Alphabet';
}
## $bool = $aut->fstOk()
##  + true iff {fst} is defined and reports at least one state (n_states > 0)
sub fstOk {
  my $fst = $_[0]{fst};
  return defined($fst) && $fst->n_states > 0;
}
## $bool = $aut->labOk()
##  + true iff the label-set {lab} is defined and reports a size greater than zero
sub labOk {
  my $lab = $_[0]{lab};
  return defined($lab) && $lab->size > 0;
}
## $bool = $aut->dictOk()
##  + false iff {dict} is undefined, not (yet) an object, or reports itself "empty"
##  + {dict} may legitimately be a plain option hash until loadDict() has blessed it;
##    such a hash counts as "not loaded" instead of triggering a fatal
##    "Can't call method on unblessed reference" error
sub dictOk {
  my $dict = $_[0]{dict};
  return $dict if (!$dict);        ##-- no dict at all: pass the false value through
  require Scalar::Util;            ##-- core module; loaded on demand
  return Scalar::Util::blessed($dict) && $dict->dictOk;
}
## Methods: I/O
## Methods: I/O: Input: all
## $bool = $aut->ensureLoaded()
## + ensures automaton data is loaded from default files
## + a component (fst, labels, dict) is loaded only if a file is configured for it
##   and the corresponding *Ok() check currently fails
## + returns the "&&="-accumulated result: once one step yields a false value,
##   the remaining "$rc &&= ..." steps are short-circuited and not executed
sub ensureLoaded {
my $aut = shift;
my $rc = 1;
##-- ensure: fst
if ( defined($aut->{fstFile}) && !$aut->fstOk ) {
$rc &&= $aut->loadFst($aut->{fstFile});
##-- ensure: lab
if ( defined($aut->{labFile}) && !$aut->labOk ) {
$rc &&= $aut->loadLabels($aut->{labFile});
##-- ensure: dict
##-- the dict file may come from $aut->{dictFile} or from $aut->{dict}{dictFile};
##   in the latter case {dict} may still be an unblessed option hash when dictOk()
##   is called here (it is blessed in loadDict()), so dictOk() must tolerate that
if ( (defined($aut->{dictFile}) || ($aut->{dict} && $aut->{dict}{dictFile})) && !$aut->dictOk ) {
$rc &&= $aut->loadDict();
##-- ensure: closures
##-- ensureDynSubs() is not defined in this file; presumably inherited from DTA::CAB::Analyzer::Dyn
$rc &&= $aut->ensureDynSubs();
return $rc;
## $aut = $aut->load(fst=>$fstFile, lab=>$labFile, dict=>$dictFile)
## + loads whichever of the files 'fst', 'lab', 'dict' are given (defined) in %args
## + returns 0 if none of the three is defined
## + otherwise returns the result of the last loader that ran; a false result
##   from one loader short-circuits the later "$rc &&= ..." steps
sub load {
my ($aut,%args) = @_;
return 0 if (!grep {defined($_)} @args{qw(fst lab dict)});
##-- start from a true value ($aut itself) so that "&&=" chains the loader results
my $rc = $aut;
$rc &&= $aut->loadFst($args{fst}) if (defined($args{fst}));
$rc &&= $aut->loadLabels($args{lab}) if (defined($args{lab}));
$rc &&= $aut->loadDict($args{dict}) if (defined($args{dict}));
return $rc;
## Methods: I/O: Input: FST
## $aut = $aut->loadFst($fstfile)
## + logs the file name, creates {fst} via fstClass() if it does not exist yet
## + failure to load is reported through logconfess()
## + (re-)creates {result} as $aut->{fst}->shadow
## + returns the object itself
sub loadFst {
my ($aut,$fstfile) = @_;
$aut->info("loading FST file '$fstfile'");
$aut->{fst} = $aut->fstClass->new() if (!defined($aut->{fst}));
##-- NOTE(review): the load statement this "or" clause belongs to is not visible here
or $aut->logconfess("loadFst(): load failed for '$fstfile': $!");
$aut->{result} = $aut->{fst}->shadow; #if (defined($aut->{result}) && $aut->{fst}->can('shadow'));
return $aut;
## Methods: I/O: Input: Labels
## $aut = $aut->loadLabels($labfile)
## + logs the file name, creates {lab} via labClass() if it does not exist yet
## + failure to load is reported through logconfess()
## + if {labenc} is false or 'auto', the label encoding is guessed and stored in {labenc}
## + returns the object itself
sub loadLabels {
my ($aut,$labfile) = @_;
$aut->info("loading labels file '$labfile'");
$aut->{lab} = $aut->labClass->new() if (!defined($aut->{lab}));
##-- NOTE(review): the load statement this "or" clause belongs to is not visible here
or $aut->logconfess("loadLabels(): load failed for '$labfile': $!");
if (!$aut->{labenc} || $aut->{labenc} eq 'auto') {
##-- guess label encoding
##-- all labels are concatenated; utf8::decode() returns true iff the buffer is
##   well-formed UTF-8 (pure ASCII therefore also yields 'utf8')
my $buf = join('',@{$aut->{lab}->toArray});
$aut->{labenc} = utf8::decode($buf) ? 'utf8' : 'latin1';
$aut->debug("loadLabels(): guessed label encoding '$aut->{labenc}'");
##-- NOTE(review): the statement guarded by the following "if" modifier is not visible here;
##   the condition holds when {lab} supports utf8() and {labenc} names a UTF-8 encoding
if ($aut->{lab}->can('utf8') && (($aut->{labenc}||'') =~ /^utf\-?8$/i));
return $aut;
## $aut = $aut->parseLabels()
## + sets up $aut->{labh}, $aut->{laba}, $aut->{labc}
## + fixes encoding difficulties in $aut->{labh}, $aut->{laba}
## + returns the object itself
sub parseLabels {
my $aut = shift;
##-- NOTE(review): assumes {laba} already holds an array reference; if it were undef,
##   the assignment below would fill a fresh local array instead of $aut->{laba} -- confirm
my $laba = $aut->{laba};
@$laba = @{$aut->{lab}->asArray};
my ($i);
##-- for every defined label: decode its symbol using {labenc} (if set) and
##   record the reverse mapping symbol => label-id in {labh}
foreach $i (grep { defined($laba->[$_]) } 0..$#$laba) {
$laba->[$i] = decode($aut->{labenc}, $laba->[$i]) if ($aut->{labenc});
$aut->{labh}{$laba->[$i]} = $i;
##-- setup labc: $labId = $labc->[ord($c)]; ##-- single unicode character
## : @labIds = @$labc[unpack('U0U*',$s)]; ##-- batch lookup for strings (fast)
my @csyms = grep {defined($_) && length($_)==1} @$laba; ##-- @csyms = ($sym1, ...) s.t. each sym has len==1
##-- array slice indexed by code point: $labc->[ord($sym)] = $labh->{$sym} for each single-character symbol
@{$aut->{labc}}[map {ord($_)} @csyms] = @{$aut->{labh}}{@csyms};
return $aut;
## Methods: I/O: Input: Dictionary
## $aut = $aut->loadDict()
## $aut = $aut->loadDict($dictfile)
## + file name precedence: explicit $dictfile argument, then $aut->{dictFile}, then $aut->{dict}{dictFile}
## + returns $aut unchanged if no dict file can be determined
## + otherwise replaces {dict} by a (re-)blessed dict object with forced {label} and {dictFile}
## + returns undef if the dict is not ok (dictOk() false) afterwards, else $aut
sub loadDict {
my ($aut,$dictfile) = @_;
$dictfile = $aut->{dictFile} if (!defined($dictfile));
$dictfile = $aut->{dict}{dictFile} if (!defined($dictfile));
return $aut if (!defined($dictfile)); ##-- no dict file to load
$aut->info("loading exception lexicon from '$dictfile'");
##-- sanitize dict object
##-- NOTE(review): for an unblessed option hash ref() returns 'HASH', which would be
##   used as the class name here instead of {dictClass} -- confirm whether
##   option-hash dicts are expected to reach this point
my $dclass = (ref($aut->{dict})||$aut->{dictClass}||'DTA::CAB::Analyzer::Dict');
##-- _unifyClobber() is not defined in this file (presumably imported from DTA::CAB::Unify);
##   its result is blessed into $dclass and stored back into $aut->{dict}
my $dict = $aut->{dict} = bless(_unifyClobber($dclass->new,$aut->{dict},undef), $dclass);
$dict->{label} = $aut->{label}."_dict"; ##-- force sub-analyzer label
$dict->{dictFile} = $dictfile; ##-- clobber sub-analyzer file
##-- load dict object
##-- NOTE(review): the actual load call is not visible here
return undef if (!$dict->dictOk);
return $aut;
## Methods: Persistence
## Methods: Persistence: Perl
## @keys = $class_or_obj->noSaveKeys()
## + returns list of keys not to be saved
## + result: the parent class's noSaveKeys() list followed by the runtime data keys
##   dict, fst, lab, laba, labc, labh, result
sub noSaveKeys {
my $that = shift;
return ($that->SUPER::noSaveKeys, qw(dict fst lab laba labc labh result));
## $saveRef = $obj->savePerlRef()
## + inherited from DTA::CAB::Persistent
## $loadedObj = $CLASS_OR_OBJ->loadPerlRef($ref)
## + implicitly calls $obj->clear()
## + as visible here: delegates to SUPER::loadPerlRef() and returns its result unchanged
## + NOTE(review): no clear() call is visible in this wrapper; presumably it happens
##   in the parent implementation or in code not shown here -- confirm
sub loadPerlRef {
my ($that,$ref) = @_;
my $obj = $that->SUPER::loadPerlRef($ref);
return $obj;
## Methods: Analysis
## Methods: Analysis: Generic
## $bool = $anl->canAnalyze()
## + returns true if analyzer can perform its function (e.g. data is loaded & non-empty)
## + here: a usable dict alone suffices; otherwise both the label-set and the FST must be ok
sub canAnalyze {
return $_[0]->dictOk || ($_[0]->labOk && $_[0]->fstOk);
## Methods: Analysis: v1.x
## Methods: Analysis: v1.x: dynamic (newer)
## $code = $anl->dynSubCode($which)
## + returns Perl source code (a string) for the dynamic analysis sub $which
## + every $which other than 'Types' is delegated to SUPER::dynSubCode()
## + for 'Types' the code is assembled from the template below: option values of $anl
##   (e.g. {allowTextRegex}, {tolowerNI}, {toupperI}, {bashWS}, {check_symbols},
##   {auto_connect}, {wantAnalysisLo}, {wantAnalysisLemma}) select or fill in template
##   fragments at generation time; such places carry "dyn:" markers
## + {analyzeGet} and {analyzeSet} (or their $DEFAULT_* fallbacks) are spliced into
##   the template as code strings
## + NOTE(review): since the accessors are concatenated as strings, coderef values
##   (mentioned in the constructor documentation) would presumably not work here -- confirm
## + NOTE(review): the generated bashWS substitution reads $opts->{bashWS}, but "bashWS" is
##   commented out of @analyzeOptionKeys in the template, so it is only set if the caller
##   supplies it -- confirm intent
## + NOTE(review): several template lines are not visible in this view
sub dynSubCode {
my ($anl,$which) = @_;
return $anl->SUPER::dynSubCode($which) if ($which ne 'Types');
my $aget = $anl->{analyzeGet} || $DEFAULT_ANALYZE_GET;
my $aset = $anl->{analyzeSet} || $DEFAULT_ANALYZE_SET;
return 'sub {
my ($aut,$doc,$types,$opts) = @_;
$types = $doc->types if (!$types);
##-- common vars
my $dict = $aut->dictOk ? $aut->{dict}->dictHash : undef;
my $fst = $aut->{fst};
my $fst_ok = $aut->fstOk();
my $result = $aut->{result};
my $lab = $aut->{lab};
my $labc = $aut->{labc};
my $laba = $aut->{laba};
my $labenc = $aut->{labenc};
my @eowlab = (defined($aut->{eow}) && $aut->{eow} ne "" ? ($aut->{labh}{$aut->{eow}}) : qw());
##-- ananalysis options
my @analyzeOptionKeys = (qw(check_symbols auto_connect),
#qw(tolower tolowerNI toupperI bashWS attInput),
qw(wantAnalysisLo wantAnalysisLemma),
qw(max_paths max_weight max_ops),
my ($tok,@w,$w,$wa);
my ($uword,$ua,$ulword,@wlabs,$lemma);
foreach $tok (values(%$types)) {
'.($anl->{allowTextRegex} ? ('next if ($tok->{text} !~ /'.$anl->{allowTextRegex}.'/);') : '').' ##-- dyn:allowTextRegex
@w = grep {defined($_)} '.$aget.'; ##-- dyn:analyzeGet
next if (!@w); ##-- accessor returned undef: skip this token
$wa = [];
foreach $w (@w) {
##-- BEGIN analyzeWord
$opts->{src} = $w; ##-- set $opts->{src} (hack for setLookupOptions())
##-- set default options
$opts->{$_} = $aut->{$_} foreach (grep {!defined($opts->{$_})} @analyzeOptionKeys);
$aut->setLookupOptions($opts) if ($aut->can("setLookupOptions"));
##---- analyze
##-- normalize word
? '$uword = lc($w); ##-- dyn:tolower'
: ($anl->{tolowerNI}
? '($uword = $w) =~ s/^(.)(.*)$/$1\L$2\E/; ##-- dyn:tolowerNI'
: ('$uword = $w; ##-- dyn:no-lower'))).' ##-- dyn:norm:lower
'.($anl->{toupperI} ? '$uword = ucfirst($uword); ##-- dyn:toupperI' : '').' ##-- dyn:norm:upper
'.($anl->{bashWS} ? '$uword =~ s/\s+/$opts->{bashWS}/g; ##-- dyn:bashWS' : '').' ##-- dyn:norm:bashWS
##-- check for (normalized) word in dict
if ($dict && defined($ua=$dict->{$uword})) {
sort {($a->{w}||0) <=> ($b->{w}||0)}
map {DTA::CAB::Analyzer::Dict::parseFstString($_)}
grep {$_ ne ""}
} elsif ($fst_ok) {
##-- not in dict: fst lookup (if fst is kosher)
##-- dyn:str2labels
? ('##-- get labels: att-style (requires gfsm v0.0.10-pre11, gfsm-perl v0.0217)
$ulword = $uword;
@wlabs = (@{$lab->string_to_labels($ulword, $aut->{check_symbols}, 1)}, @eowlab);')
: ($anl->{check_symbols}
? ('##-- get labels: by character: verbose
@wlabs = (@$labc[unpack("U0U*",$uword)],@eowlab);
foreach (grep { !defined($wlabs[$_]) } (0..$#wlabs)) {
$aut->warn("ignoring unknown character \`", substr($uword,$_,1), "\' in word \`$w\' (normalized to \`$uword\').\n");
@wlabs = grep {defined($_)} @wlabs;')
: ('##-- get labels: by character: quiet
@wlabs = grep {defined($_)} (@$labc[unpack("U0U*",$uword)],@eowlab);'))).' ##-- dyn:string_to_labels
##-- fst lookup
$aut->{fst}->lookup(\@wlabs, $result);
'.($anl->{auto_connect} ? '$result->_connect();' : '').' ##-- dyn:auto_connect
#$result->_rmepsilon() if ($opts->{auto_rmeps});
##-- parse analyses
map {
'.($anl->{wantAnalysisLo} ? 'lo=>$uword,' : '').' ##-- dyn:wantAnalysisLo
"hi"=> (defined($labenc)
? Encode::decode($labenc,$lab->labels_to_string($_->{hi},0,1))
: $lab->labels_to_string($_->{hi},0,1)),
"w" => $_->{w},
} @{$result->paths($Gfsm::LSUpper)}
##-- parse lemmata?
'.($anl->{wantAnalysisLemma} ?
'foreach (@$wa) {
$lemma = $_->{hi};
if (defined($lemma) && $lemma ne "") {
$lemma =~ s/\[.*$//; ##-- trim everything after first non-character symbol
$lemma =~ s/(?:\/\w+)|(?:[\\\¬\~\|\=\+\#])//g;
substr($lemma,1) = lc(substr($lemma,1));
} else {
$lemma = $uword;
$lemma =~ s/^\s*//;
$lemma =~ s/\s*$//;
$lemma =~ s/\s+/_/g;
$_->{lemma} = $lemma;
: '').' ##-- dyn: wantAnalysisLemma
##-- END analyzeWord
undef($wa) if (!@$wa);
'.($aset).'; ##-- dyn:analyzeSet
return $doc;
1; ##-- be happy
## POD DOCUMENTATION, auto-generated by podextract.perl
=head1 NAME
DTA::CAB::Analyzer::Automaton::Dyn - generic analysis automaton API
use DTA::CAB::Analyzer::Automaton::Dyn;
## Constructors etc.
$obj = CLASS_OR_OBJ->new(%args);
$aut = $aut->clear();
## Methods: Generic
$class = $aut->fstClass();
$class = $aut->labClass();
$bool = $aut->fstOk();
$bool = $aut->labOk();
$bool = $aut->dictOk();
## Methods: I/O
$bool = $aut->ensureLoaded();
$aut = $aut->load(fst=>$fstFile, lab=>$labFile, dict=>$dictFile);
$aut = $aut->loadFst($fstfile);
$aut = $aut->loadLabels($labfile);
$aut = $aut->parseLabels();
$aut = $aut->loadDict($dictfile);
## Methods: Persistence: Perl
@keys = $class_or_obj->noSaveKeys();
$loadedObj = $CLASS_OR_OBJ->loadPerlRef($ref);
## Methods: Analysis
## DESCRIPTION: DTA::CAB::Analyzer::Automaton::Dyn: Globals
=head2 Globals
=over 4
=item Variable: @ISA
inherits from L<DTA::CAB::Analyzer::Dyn>.
## DESCRIPTION: DTA::CAB::Analyzer::Automaton::Dyn: Constructors etc.
=head2 Constructors etc.
=over 4
=item new
$aut = CLASS_OR_OBJ->new(%args);
%args, %$aut:
##-- Filename Options
fstFile => $filename, ##-- default: none
labFile => $filename, ##-- default: none
dictFile=> $filename, ##-- default: none (clobbers $aut->{dict}{dictFile} if defined)
##-- Analysis Output
analysisClass => $class, ##-- default: none (ARRAY)
analyzeSrc => $key, ##-- source key for analysis (default: 'text')
analyzeDst => $key, ##-- token output key (default: from __PACKAGE__)
wantAnalysisLo => $bool, ##-- set to true to include 'lo' keys in analyses (default: true)
##-- Analysis Options
eow => $sym, ##-- EOW symbol for analysis FST
check_symbols => $bool, ##-- check for unknown symbols? (default=1)
labenc => $enc, ##-- encoding of labels file (default='auto': utf8 > latin1)
#dictenc => $enc, ##-- dictionary encoding (default='utf8') : prefer $aut->{dict}{encoding}
auto_connect => $bool, ##-- whether to call $result->_connect() after every lookup (default=0)
tolower => $bool, ##-- if true, all input words will be bashed to lower-case (default=0)
tolowerNI => $bool, ##-- if true, all non-initial characters of inputs will be lower-cased (default=0)
toupperI => $bool, ##-- if true, initial character will be upper-cased (default=0)
bashWS => $str, ##-- if defined, input whitespace will be bashed to '$str' (default='_')
attInput => $bool, ##-- if true, respect AT&T lextools-style escapes in input (default=0)
allowTextRegex => $re, ##-- if defined, only tokens with matching 'text' will be analyzed (default: none)
##-- Analysis objects
fst => $gfst, ##-- (child classes only) e.g. a Gfsm::Automaton::Dyn object (default=new)
lab => $lab, ##-- (child classes only) e.g. a Gfsm::Alphabet object (default=new)
labh => \%sym2lab, ##-- (?) label hash: $sym2lab{$labSym} = $labId;
laba => \@lab2sym, ##-- (?) label array: $lab2sym[$labId] = $labSym;
labc => \@chr2lab, ##-- (?)chr-label array: $chr2lab[ord($chr)] = $labId;, by unicode char number (e.g. unpack('U0U*'))
result=>$resultfst, ##-- (child classes only) e.g. result fst
dict => $dict, ##-- exception lexicon / static cache as DTA::CAB::Analyzer::Dict object
=item clear
$aut = $aut->clear();
Clears the object.
## DESCRIPTION: DTA::CAB::Analyzer::Automaton::Dyn: Methods: Generic
=head2 Methods: Generic
=over 4
=item fstClass
$class = $aut->fstClass();
Returns default FST class for L</loadFst>() method.
Used by sub-classes.
=item labClass
$class = $aut->labClass();
Returns default alphabet class for L</loadLabels>() method.
Used by sub-classes.
=item fstOk
$bool = $aut->fstOk();
Should return false iff fst is undefined or "empty".
=item labOk
$bool = $aut->labOk();
Should return false iff alphabet (label-set) is undefined or "empty".
=item dictOk
$bool = $aut->dictOk();
Should return false iff dict is undefined or "empty".
## DESCRIPTION: DTA::CAB::Analyzer::Automaton::Dyn: Methods: I/O
=head2 Methods: I/O
=over 4
=item ensureLoaded
$bool = $aut->ensureLoaded();
Ensures automaton data is loaded from default files.
=item load
$aut = $aut->load(fst=>$fstFile, lab=>$labFile, dict=>$dictFile);
Loads specified files.
=item loadFst
$aut = $aut->loadFst($fstfile);
Loads automaton from $fstfile.
=item loadLabels
$aut = $aut->loadLabels($labfile);
Loads labels from $labfile.
=item parseLabels
$aut = $aut->parseLabels();
Parses some information from a (newly loaded) alphabet.
=over 4
=item *
sets up $aut-E<gt>{labh}, $aut-E<gt>{laba}, $aut-E<gt>{labc}
=item *
fixes encoding difficulties in $aut-E<gt>{labh}, $aut-E<gt>{laba}
=item loadDict
$aut = $aut->loadDict($dictfile);
Loads dictionary from $dictfile.
## DESCRIPTION: DTA::CAB::Analyzer::Automaton::Dyn: Methods: Persistence: Perl
=head2 Methods: Persistence: Perl
=over 4
=item noSaveKeys
@keys = $class_or_obj->noSaveKeys();
Returns list of keys not to be saved.
This implementation returns:
qw(dict fst lab laba labc labh result)
=item loadPerlRef
$loadedObj = $CLASS_OR_OBJ->loadPerlRef($ref);
Implicitly calls $obj-E<gt>clear()
## DESCRIPTION: DTA::CAB::Analyzer::Automaton::Dyn: Methods: Analysis
=head2 Methods: Analysis
=over 4
=item canAnalyze
$bool = $anl->canAnalyze();
Returns true if analyzer can perform its function (e.g. data is loaded & non-empty)
This implementation just returns:
$anl->dictOk || ($anl->labOk && $anl->fstOk)
## END POD DOCUMENTATION, auto-generated by podextract.perl
## Footer
=head1 AUTHOR
Bryan Jurish E<lt>moocow@cpan.orgE<gt>
Copyright (C) 2009-2019 by Bryan Jurish
This package is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.24.1 or,
at your option, any later version of Perl 5 you may have available.