Dave Cross: Still Munging Data With Perl: Online event - Mar 17 Learn more

##-*- Mode: CPerl -*-
## File: DDC::Format::Kwic.pm
## Author: Bryan Jurish <moocow@cpan.org>
## Description:
## + DDC Query utilities: output formatting: keywords-in-context
##======================================================================
use Carp;
use strict;
##======================================================================
## Globals
our @ISA = qw(DDC::Format);
BEGIN {
*isa = \&UNIVERSAL::isa;
}
##======================================================================
## Constructors, etc.
## $fmt = $CLASS_OR_OBJ->new(%args)
## + %args:
## (
## start=>$previous_hit_num, ##-- pre-initial hit number (default=0)
## highlight=>[$pre,$post], ##-- highlighting substrings
## width=>$nchars, ##-- context width; default=32
## useMatchIds=>$bool, ##-- whether to use match-ids if available; undef (default) if non-trivial match-ids are specified
## )
sub new {
my $that = shift;
return bless {
highlight=>['__','__'],
width=>32,
useMatchIds=>undef,
@_
}, ref($that)||$that;
}
## $fmt = $fmt->reset()
## + reset counters, etc.
sub reset {
$_[0]{start}=0;
return $_[0]->SUPER::reset();
}
##======================================================================
## Helper functions
## $len = maxlen(@strings)
sub maxlen {
my $l = 0;
do { $l=length($_) if (length($_) > $l) } foreach (@_);
return $l;
}
##======================================================================
## API
## $str = $fmt->toString($hitList)
sub toString {
my ($fmt,$hits) = @_;
if ($hits->{counts_} && @{$hits->{counts_}}) {
##-- count-query: format as text
my ($i);
my @lens = map {$i=$_; maxlen(map {$_->[$i]} @{$hits->{counts_}})} (0..$#{$hits->{counts_}[0]});
my $fmt = join("\t", map {"%-${_}s"} @lens)."\n";
return join('', map {sprintf($fmt,@$_)} @{$hits->{counts_}});
}
my $xlen = $fmt->{width} || 2**31;
my $hnum = $hits->{start};
my $useMatchIds = defined($fmt->{useMatchIds}) ? $fmt->{useMatchIds} : (grep {$_>0 && $_!=1} map {$_->{hl_}} map {@{$_->{ctx_}[1]}} @{$hits->{hits_}});
my (@hits);
foreach my $hit (@{$hits->{hits_}}) {
##-- hit key: number + file basename + page
my $f = basename($hit->{meta_}{file_});
$f =~ s/\..*$//;
my $p = defined($hit->{meta_}{page_}) ? $hit->{meta_}{page_} : 0;
my $pagei = (grep {$hit->{meta_}{indices_}[$_] eq 'page'} (0..$#{$hit->{meta_}{indices_}}))[0];
my $targetMatchId = $useMatchIds ? (sort {$a<=>$b} grep {$_} map {$_->{hl_}} @{$hit->{ctx_}[1]})[0] : undef;
##-- hit context
my $fkey = $hits->{defaultField} || $hit->{meta_}{indices_}[0] || 'w';
my (@l,@c,@r);
my $ary = \@l;
my $hl = '__';
foreach (map {@$_} @{$hit->{ctx_}}) {
if ($ary eq \@l && ref($_) && ($useMatchIds ? $_->{hl_}==$targetMatchId : $_->{hl_})) {
$ary=\@c;
$p=$_->{page} if (!$p && defined($_->{page}));
}
elsif ($ary eq \@c) {
$ary = \@r;
}
$hl = ($ary eq \@c ? '__' : '_');
push(@$ary, (ref($_)
? ($_->{hl_}
? ($hl.$_->{$fkey}.$hl.($useMatchIds ? "/$_->{hl_}" : ''))
: $_->{$fkey})
: $_));
}
my $ls = join(' ', @l);
my $rs = join(' ', @r);
substr($ls, 0, length($ls)-$xlen+3, '...') if (length($ls) > $xlen);
substr($rs, $xlen-3, length($rs)-$xlen+3, '...') if (length($rs) > $xlen);
push(@hits,[$hnum++, "[$f:$p]", $ls, join(' ',@c), $rs]);
}
my $ln = maxlen(map {$_->[0]} @hits);
my $lf = maxlen(map {$_->[1]} @hits);
my $ll = maxlen(map {$_->[2]} @hits);
my $lc = maxlen(map {$_->[3]} @hits);
my $lr = maxlen(map {$_->[4]} @hits);
return (
"# Hit(s) $hits[0][0]-$hits[$#hits][0] of $hits->{nhits_}\n"
.join('', map {sprintf("%${ln}d: %-${lf}s %${ll}s %-${lc}s %-${lr}s\n", @$_)} @hits)
);
}
1; ##-- be happy
__END__
##======================================================================
## Docs
=pod
=head1 NAME
DDC::Format::Kwic - Keyword-in-context (KWIC) formatting for DDC hits
=head1 SYNOPSIS
use DDC::Concordance;
$hitList = DDC::Client::Distributed->new()->query('foo&&bar'); ##-- get some hits
$fmt = DDC::Format::Kwic->new(width=>$nchars);
$str = $fmt->toString($hitList); ##-- conversion to string
$fmt->toFile($hitList,$filename); ##-- output to file
$fmt->toFh($hitList,$fh); ##-- output to filehandle
=cut
##======================================================================
## Description
=pod
=head1 DESCRIPTION
Class for formatting DDC::Hit objects as plain text.
=cut
##----------------------------------------------------------------
## DESCRIPTION: DDC::Format::Kwic: Globals
=pod
=head2 Globals
=over 4
=item Variable: @ISA
DDC::Format::Kwic inherits from L<DDC::Format|DDC::Format>.
=back
=cut
##----------------------------------------------------------------
## DESCRIPTION: DDC::Format::Kwic: Constructors, etc.
=pod
=head2 Constructors, etc.
=over 4
=item new
$fmt = $CLASS_OR_OBJ->new(%args);
Accepted keywords in %args:
start => $previous_hit_num, ##-- pre-initial hit number (default=0)
highlight => [$pre,$post], ##-- highlighting substrings (default=['__','__'])
width => $nchars, ##-- context width; default=32
useMatchIds => $bool, ##-- whether to use match-ids if available; undef (default) if non-trivial match-ids are specified
=item reset
$fmt = $fmt->reset();
Resets the formatting object.
=back
=cut
##----------------------------------------------------------------
## DESCRIPTION: DDC::Format::Kwic: Helper functions
=pod
=head2 Helper functions
=over 4
=item hitString
$hitStr = $fmt->hitString($hit);
Formats a single $hit as a string,
incrementing the counter $fmt-E<gt>{start} as a side-effect.
=back
=cut
##----------------------------------------------------------------
## DESCRIPTION: DDC::Format::Kwic: API
=pod
=head2 API
=over 4
=item toString
$str = $fmt->toString($hitList);
Implements L<DDC::Format::toString()|DDC::Format/toString>.
=back
=cut
##========================================================================
## END POD DOCUMENTATION, auto-generated by podextract.perl
##======================================================================
## Footer
##======================================================================
=pod
=head1 AUTHOR
Bryan Jurish E<lt>moocow@cpan.orgE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2014-2015 by Bryan Jurish
This package is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.7 or,
at your option, any later version of Perl 5 you may have available.
=cut