#!/usr/bin/perl -w

use lib qw(.);
use DTA::CAB;
use DTA::CAB::Utils ':all';
use DTA::CAB::Format;
use JSON::XS;
use File::Basename qw(basename);
use IO::File;
use Getopt::Long qw(:config no_ignore_case);
use Time::HiRes qw(gettimeofday tv_interval);
use Pod::Usage;

use strict;

##==============================================================================
## Constants & Globals
##==============================================================================

##-- program identity: version is taken from DTA::CAB, program name is the script's basename
our $VERSION = $DTA::CAB::VERSION;
our $prog    = basename($0);

##-- General Options (flags filled in by command-line parsing; all start out undefined)
our $help;
our $man;
our $version;
our $verbose;

##-- Formats
our $inputClass;          ##-- input format class; undef: leave the choice to DTA::CAB::Format
our %inputOpts;           ##-- user-supplied input format options (OPT=>VALUE), initially empty
our $block;               ##-- block specification; undef: format-dependent default
our $outfile = '-';       ##-- output file; '-' means STDOUT

##==============================================================================
## Command-line
##-- parse the command line into the option globals.
##   GetOptions() returns false if it encountered an unknown or malformed option
##   (it has already warned about the details); in that case show the short usage
##   summary and exit with a non-zero status instead of silently continuing.
GetOptions(##-- General
	   'help|h'    => \$help,
	   'version|V' => \$version,

	   ##-- I/O: input
	   'input-class|ic|parser-class|pc=s'   => \$inputClass,
	   'input-option|io|parser-option|po=s' => \%inputOpts,
	   'block|block-size|bs|b=s'            => \$block,

	   ##-- I/O: output
	   'output-file|output|o=s' => \$outfile,

	   ##-- Log4perl: additional option specifications supplied by DTA::CAB::Logger
	   DTA::CAB::Logger->cabLogOptions('verbose'=>1),
	  )
  or pod2usage({-exitval=>2, -verbose=>0});

##-- -version: print version information and exit successfully
if ($version) {
  print cab_version();
  exit(0);
}

##-- -help: print the short usage summary and exit successfully
pod2usage({-exitval=>0, -verbose=>0}) if ($help);


##==============================================================================
## MAIN
##==============================================================================

##-- log4perl initialization
DTA::CAB::Logger->logInit();

##======================================================
## Formats

##-- input format reader, created from the requested class, the first input file name,
##   and any user-supplied input options.
##   NOTE(review): $ARGV[0] is undef here if no files were given ('-' is only appended
##   to @ARGV further below) -- confirm that newReader() is happy with file=>undef.
our $ifmt = DTA::CAB::Format->newReader(class=>$inputClass,file=>$ARGV[0],%inputOpts)
  or die("$prog: could not create input format of class "
	 .(defined($inputClass) ? $inputClass : '(default)')
	 .": $!");

##======================================================
## Main

##-- output: JSON encoder producing compact, single-line objects with sorted keys
our $jxs = JSON::XS->new->utf8->indent(0)->space_before(0)->space_after(1)->canonical(1);

##-- output: lexical handle opened with 3-argument open(), so that the file name is
##   never interpreted as part of the open mode; '-' explicitly selects STDOUT
my $outfh;
if ($outfile eq '-') {
  $outfh = \*STDOUT;
} else {
  open($outfh,'>',$outfile)
    or die("$prog: open failed for output file '$outfile': $!");
}

##-- main loop: scan each input file ('-' if none were given) for blocks
push(@ARGV,'-') if (!@ARGV);
my %blockOpts = $ifmt->blockOptions($block);
foreach my $file (@ARGV) {
  my $blocks = $ifmt->blockScan($file, %blockOpts);

  ##-- write in pseudo-tj format: one line "FILE <TAB> JSON" per block,
  ##   followed by an empty line terminating the file's record
  foreach my $blk (@$blocks) {
    delete($blk->{file});   ##-- file name is already printed as the first column
    print {$outfh} $file, "\t", $jxs->encode($blk), "\n";
  }
  print {$outfh} "\n";
}

##-- close a real output file and report buffered write errors; STDOUT is left open
if ($outfile ne '-') {
  close($outfh)
    or die("$prog: close failed for output file '$outfile': $!");
}

__END__
=pod

=head1 NAME

dta-cab-blockscan.perl - scan for block boundaries in DTA::CAB documents

=head1 SYNOPSIS

 dta-cab-blockscan.perl [OPTIONS...] DOCUMENT_FILE(s)...

 General Options:
  -help                           ##-- show short usage summary
  -version                        ##-- show version & exit
  -verbose LEVEL                  ##-- set default log level

 I/O Options:
  -input-class CLASS              ##-- select input parser class (default: TT)
  -input-option OPT=VALUE         ##-- set input parser option
  -block SIZE[{k,M,G,T}][@EOB]    ##-- select block boundary specification (default: format-dependent)
  -output-file FILE               ##-- set output file (default: STDOUT)

=cut

##==============================================================================
## Description
##==============================================================================
=pod

=head1 DESCRIPTION

dta-cab-blockscan.perl is a command-line utility for testing the
DTA::CAB block-wise I/O API.

=cut

##==============================================================================
## Options and Arguments
##==============================================================================
=pod

=head1 OPTIONS AND ARGUMENTS

=cut

##==============================================================================
## Options: General Options
=pod

=head2 General Options

=over 4

=item -help

Display a short help message and exit.

=item -version

Display program and module version information and exit.

=item -verbose LEVEL

Set default log level (trace|debug|info|warn|error|fatal).

=back

=cut

##==============================================================================
## Options: I/O Options
=pod

=head2 I/O Options

=over 4

=item -input-class CLASS

Select input parser class (default: Text)

=item -input-option OPT=VALUE

Set arbitrary input parser option C<OPT> to C<VALUE>.
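May be multiply specified.

=item -block SIZE[{k,M,G,T}][@EOB]

Select block boundary specification (default: format-dependent).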

=item -output-file FILE

Set output file (default: STDOUT).
Output is written in L<DTA::CAB::Format::TJ|DTA::CAB::Format::TJ> format,
where each "sentence" represents a single input file, and each "token"
represents a single I/O block.  Token "text" is the filename, and block
attributes are returned as JSON in the token attribute field.

=back

=cut

##======================================================================
## Footer
##======================================================================

=pod

=head1 ACKNOWLEDGEMENTS

Perl by Larry Wall.

=head1 AUTHOR

Bryan Jurish E<lt>moocow@cpan.orgE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2011-2019  by Bryan Jurish. All rights reserved.
This program is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.24.1 or,
at your option, any later version of Perl 5 you may have available.

=head1 SEE ALSO

L<dta-cab-analyze.perl(1)|dta-cab-analyze.perl>,
L<dta-cab-convert.perl(1)|dta-cab-convert.perl>,
L<dta-cab-cachegen.perl(1)|dta-cab-cachegen.perl>,
L<dta-cab-xmlrpc-server.perl(1)|dta-cab-xmlrpc-server.perl>,
L<dta-cab-xmlrpc-client.perl(1)|dta-cab-xmlrpc-client.perl>,
L<DTA::CAB(3pm)|DTA::CAB>,
L<perl(1)|perl>,
...

=cut