#!/usr/bin/perl
use
Time::HiRes
qw [
gettimeofday tv_interval ] ;
my
${ dt_start } = [ gettimeofday ] ;
my
$time0
=
time
;
use
Getopt::Std ; getopts
'g:i:jl:m:suwz=!@:#:0:2:'
=> \
my
%o
;
use
Term::ANSIColor
qw/:constants color/
;
$Term::ANSIColor::AUTORESET
= 1 ;
use
Encode
qw[ decode_utf8 encode_utf8 ]
;
my
$sdt
=
sprintf
'%04d-%02d-%02d %02d:%02d:%02d'
,
do
{
my
@t
= @{[
localtime
]}[5,4,3,2,1,0];
$t
[0]+=1900;
$t
[1]++;
@t
} ;
eval
"use PerlIO::gzip;1"
or
die
"PerlIO::gzip cannot be loaded, so -z does not work. ($Script, $sdt)\n"
if
$o
{z} ;
sub
AlignOut ( @ ) ;
sub
ColStat ( $$ ) ;
sub
d3 ($) {
$_
[0] =~ s/(?<=\d)(?=(\d\d\d)+($|\D))/,/gr } ;
$/ =
"\r\n"
if
$o
{w} ;
my
$L
=
','
;
my
$isep
=
$o
{i} //
"\t"
;
my
$nc
= 0 ;
my
$sec
=
$o
{
'@'
} // 15 ;
my
$rl
;
$SIG
{ ALRM } =
sub
{
say
STDERR GREEN + (d3
$rl
) .
" lines read. "
,
scalar
localtime
;
alarm
$sec
} ;
my
${ INT1 } =
sub
{
&{
$SIG
{ALRM} } ;
print
STDERR BRIGHT_RED
'Do you want to get the halfway result? Then type Ctrl + C again within 2 seconds. '
.
"\n"
.
'Really want to Quit? Then press Ctrl + "\" or Ctrl + Yen-Mark. (Ctrl+Z may be what you want.) '
. RESET "\n" ;
$SIG
{INT} =
sub
{
select
*STDERR
; & ColStat ;
select
*STDOUT
;
return
} ;
sleep
2 ;
return
;
} ;
$SIG
{ INT } = ${ INT1 } ;
$o
{g} = 6
if
( !
defined
$o
{g} ) ;
$| = 1
if
$o
{
'!'
} ;
* decode =
$o
{u} ? * decode_utf8 :
sub
($){
$_
[0] } ;
$o
{
'#'
} = decode (
$o
{
'#'
} )
if
defined
$o
{
'#'
} ;
my
%fOut
= (
j
=> [
map
{UNDERLINE decode(
$_
)}
qw[列番号 値の異なり 数値化平均 列名 値の範囲 最頻値 最頻値の度数 ..テールの度数(重なり) 桁数範囲 ]
] ,
e
=> [
map
{UNDERLINE
$_
}
qw[ cpos diff ave. name range frequent frequency ..lower(x_mul) digits]
] ) ;
binmode
*STDOUT
,
':utf8'
if
$o
{u} ;
alarm
$sec
;
push
@ARGV
,
'-'
unless
@ARGV
;
& eachFile (
$_
)
for
@ARGV
;
exit
0 ;
sub
eachFile ( $ ) {
sub
colnames( $ ) ;
sub
filePinfo ;
sub
ColFreq ( $$ ) ;
my
$FH
=
do
{
my
$t
=
*STDIN
if
$_
[0] eq
'-'
;
open
$t
,
'<'
,
$_
[0]
if
!
$t
;
binmode
$t
,
':gzip(gzip)'
if
$o
{z} ;
$t
} ;
$rl
= 0 ;
my
@colnames
= colnames
$FH
if
$o
{
'='
} ;
my
$maxCols
= ColFreq
$FH
,
my
$colvals
;
close
$FH
;
AlignOut @{
$fOut
{
$o
{j}?
'j'
:
'e'
} }
if
0 ne (
$o
{0}//
''
) ;
defined
$colvals
->[
$_
] and ColStat
$colvals
->[
$_
] ,
$colnames
[
$_
]
for
0 ..
$maxCols
- 1 ;
filePinfo ;
}
sub
colnames ( $ ) {
my
@F
=
split
/
$isep
/,
do
{
my
$FH
=
$_
[0] ;
my
$t
= <
$FH
> ;
$rl
++
if
defined
$t
;
$t
//=
''
;
chomp
$t
; decode (
$t
) } , -1 }
sub
filePinfo {
exit
if
(
$o
{2}//
''
) eq 0 ;
$rl
= d3 (
$rl
// 0) ;
my
$procsec
= tv_interval ${ dt_start } ;
my
$out
=
"$rl line(s) read; "
;
$out
.=
"$nc cells are not counted; "
if
$nc
;
$out
.=
sprintf
'%0.6f seconds (colsummary)'
,
$procsec
;
say
STDERR BOLD DARK ITALIC CYAN
$out
;
}
sub
ColFreq ( $$ ) {
my
$maxCols
= 0 ;
my
$col
=
undef
;
* lenlim =
defined
$o
{l} ?
sub
{
grep
{
$_
=
substr
$_
, 0,
$o
{l} }
@_
} :
sub
{} ;
* tailspacetrim =
defined
$o
{s} ?
sub
{
grep
{ s/\s+$// }
@_
} :
sub
{} ;
* negcell =
defined
$o
{
'#'
} ?
sub
{
if
(m/
$o
{
'#'
}/ ) {
$col
++ ;
$nc
++ ;
goto
EACH_CELL } } :
sub
{} ; # o{
'0'
} をやめた
for
(
my
$FH
=
$_
[0] ; <
$FH
> ;
$rl
++ ) {
chomp
;
my
@F
=
map
{ decode (
$_
) }
split
/
$isep
/ ,
$_
, -1 ;
& lenlim (
@F
) ;
& tailspacetrim (
@F
) ;
$col
= 0 ;
EACH_CELL :
while
(
defined
(
$_
=
shift
@F
) ) {
& negcell ;
++
$_
[1] -> [
$col
] {
$_
} ;
$col
++ ;
}
$maxCols
=
$col
if
$maxCols
<
$col
;
}
return
$maxCols
;
}
sub
ColStat ( $$ ) {
sub
aveft ( $$ ) ;
sub
MultSpec ( $$ ) ;
sub
minmaxstr ( $ ) ;
my
%thash
= %{
$_
[0] } ;
my
@vals
=
keys
%thash
;
my
@skeys
=
splice
@{[
sort
{
$thash
{
$b
} <=>
$thash
{
$a
} }
@vals
]} , 0,
$o
{g} ;
my
%cct
;
$cct
{
$_
} ++
foreach
values
%thash
;
my
@kcct
=
sort
{
$b
<=>
$a
}
keys
%cct
;
my
@kcct1
=
splice
@kcct
, 0, min(
$o
{g} ,
$#kcct
+1 ) ; # <- tricky!
my
@kcct2
=
splice
@kcct
, - min(
$o
{g} ,
$#kcct
+1 ) ; # <- tricky!
my
@out
;
push
@out
,
$_
+ 1,
scalar
@vals
, aveft ( \
%thash
, \
@vals
),
$_
[1]//(
$_
+1) ;
push
@out
, (
join
$L
, minmaxstr \
@vals
) , (
join
$L
,
@skeys
) ;
push
@out
, (
join
$L
, MultSpec \
@kcct1
, \
%cct
) , (
@kcct2
?
@kcct
?
'..'
:
$L
:
''
) . (
join
$L
, MultSpec \
@kcct2
, \
%cct
) ;
push
@out
, minmaxstr( \@{[
map
{
length
decode (
$_
) }
@vals
]} ) ;
AlignOut
@out
;
return
;
}
sub
AlignOut ( @ ) {
my
@p
=
@_
;
my
@P
;
push
@P
,
$p
[0] ;
push
@P
, GREEN BOLD
$p
[1] ;
push
@P
, BRIGHT_BLUE
$p
[2]
if
(
$o
{m}//
''
) ne 0 ;
push
@P
, BRIGHT_YELLOW
$p
[3] ;
push
@P
, BRIGHT_WHITE
$p
[4] ;
push
@P
,
$p
[5] ;
push
@P
, BRIGHT_GREEN
$p
[6] . GREEN
$p
[7] ;
push
@P
, BRIGHT_BLUE
$p
[8] ;
say
join
"\t"
,
@P
;
}
sub
aveft ( $$ ) {
my
(
$rHash
,
$rKeys
) =
@_
;
my
(
$tval
,
$freq
,
$asum
,
$afreq
) ;
for
( @{
$rKeys
} ) {
(
my
$num
=
$_
) =~ s/(\d),/$1/g ;
$tval
= POSIX::strtod (
$num
) ;
$freq
=
$rHash
->{
$_
} ;
$asum
+=
$tval
*
$freq
;
$afreq
+=
$freq
;
}
return
sprintf
'%5.3f'
,
$asum
/
$afreq
;
}
sub
MultSpec ( $$ ) {
my
(
$p_kc
,
$p_ccount
) =
@_
;
my
@ostr
;
my
$c
=0 ;
while
(
my
$t
=
shift
@$p_kc
) {
$c
++ ;
push
@ostr
,
$t
if
(
$p_ccount
->{
$t
} == 1 ) ;
push
@ostr
,
$t
.
'(x'
.
$p_ccount
->{
$t
} .
')'
if
(
$p_ccount
->{
$t
} >= 2 ) ;
last
if
(
$c
>=
$o
{g} ) ;
}
return
@ostr
;
} ;
sub
minmaxstr ( $ ) {
sub
part ( &@ ) ;
sub
RangeStr ( $$ ) ;
my
@gps
= part {
$_
eq
''
? 0 : looks_like_number
$_
? 1 : 2} @{
$_
[0] } ;
my
@ostr
;
push
@ostr
,
''
if
$gps
[0] ;
push
@ostr
, RangeStr( min(@{
$gps
[1]}), max(@{
$gps
[1]}) )
if
$gps
[1] ;
push
@ostr
, RangeStr( minstr(@{
$gps
[2]}), maxstr(@{
$gps
[2]}) )
if
$gps
[2] ;
return
@ostr
;
} ;
sub
part ( &@ ) {
my
(
$cd
,
@l
) =
@_
;
my
@p
;
push
@{
$p
[
$cd
->(
$_
) ] } ,
$_
for
@l
;
@p
} ;
sub
RangeStr ( $$ ) {
$_
[0] eq
$_
[1] ?
"$_[0]"
:
"$_[0]..$_[1]"
}
sub
VERSION_MESSAGE {}
sub
HELP_MESSAGE {
$ARGV
[1] //=
''
;
open
my
$FH
,
'<'
, $0 ;
while
(<
$FH
>){
s/\$0/
$Script
/g ;
print
$_
if
s/^=head1// .. s/^=cut// and
$ARGV
[1] =~ /^o(p(t(i(o(ns?)?)?)?)?)?$/i ? m/^\s+\-/ : 1;
}
close
$FH
;
exit
0 ;
}