#!perl our $DATE = '2019-07-03'; # DATE our $VERSION = '0.122'; # VERSION use 5.010001; use strict; use warnings; use Getopt::Long; my %Opts = ( op => undef, ignore_case => 0, ignore_all_space => 0, skip_chars => 0, glue => ',', ); sub parse_cmdline { my $res = GetOptions( 'ignore-case|i' => \$Opts{ignore_case}, 'ignore-all-space|w' => \$Opts{ignore_all_space}, 'op=s' => \$Opts{op}, 'union' => sub { $Opts{op} = 'union' }, 'intersect' => sub { $Opts{op} = 'intersect' }, 'diff' => sub { $Opts{op} = 'diff' }, 'symdiff' => sub { $Opts{op} = 'symdiff' }, 'cross' => sub { $Opts{op} = 'cross' }, 'glue=s' => \$Opts{glue}, 'skip-chars=i' => \$Opts{skip_chars}, 'check-chars=i' => \$Opts{check_chars}, 'version|v' => sub { no warnings; say "setop version $main::VERSION ($main::DATE)"; exit 0; }, 'help|h' => sub { print <<USAGE; Usage: setop [OPTIONS]... FILE FILE ... setop --help Options: --ignore-case, -i --ignore-all-space, -w --op=s --union (shortcut for --op=union) --intersect (shortcut for --op=intersect) --diff (shortcut for --op=diff) --symdiff (shortcut for --op=symdiff) --cross (shortcut for --op=cross) --check-chars=i --skip-chars=i For more details, see the manpage/documentation. USAGE exit 0; }, ); exit 99 if !$res; if (!$Opts{op}) { warn "Please specify an operation (--op)\n"; exit 99; } if ($Opts{op} !~ /\A(union|intersect|diff|symdiff|cross)\z/) { warn "Unknown op, please see --help for known operations\n"; exit 99; } unless ($Opts{op} ne 'cross' || @ARGV > 1) { warn "Please specify at least 2 input files\n"; exit 99; } unless (@ARGV >= 1) { warn "Please specify at least 1 input file\n"; exit 99; } } sub run { require Tie::IxHash; # a regular hash my %res0; # a hash that remembers insert order tie my(%res), 'Tie::IxHash'; my $op = $Opts{op}; my $ic = $Opts{ignore_case}; my $ias = $Opts{ignore_all_space}; my $sch = $Opts{skip_chars}; my $cch = $Opts{check_chars}; my $ign = $ic || $ias || $sch || defined($cch); my @aoaos; my $i = 0; my $stdin_specified; FILE: for my $i (1..@ARGV) { my $fh; my $fname = $ARGV[$i-1]; if ($fname eq '-') { do { warn "Can't use STDIN twice\n"; exit 99 } if $stdin_specified++; $fh = *STDIN; } else { open $fh, "<", $fname or die "Can't open input file $fname: $!\n"; } if ($op eq 'union') { if ($ign) { while (<$fh>) { my $k = $ic ? lc($_) : $_; $k =~ s/\s+//g if $ias; $res{$k} = $_ unless exists $res{$k}; } # print result if ($i == @ARGV) { print $res{$_} for keys %res; } } else { while (<$fh>) { $res{$_}++ } # print result if ($i == @ARGV) { print for keys %res; } } } elsif ($op eq 'intersect') { if ($ign) { if ($i == 1) { while (<$fh>) { my $k = $ic ? lc($_) : $_; $k =~ s/\s+//g if $ias; $res{$k} = [1,$_] unless exists $res{$k}; } } else { while (<$fh>) { my $k = $ic ? lc($_) : $_; $k =~ s/\s+//g if $ias; if ($res{$k} && $res{$k}[0] == $i-1) { $res{$k}[0]++; } } } # print result if ($i == @ARGV) { for (keys %res) { print $res{$_}[1] if $res{$_}[0] == $i; } } } else { if ($i == 1) { while (<$fh>) { $res{$_} = 1 } } else { while (<$fh>) { if ($res{$_} && $res{$_} == $i-1) { $res{$_}++; } } } # print result if ($i == @ARGV) { for (keys %res) { print if $res{$_} == $i; } } } } elsif ($op eq 'diff') { if ($ign) { if ($i == 1) { while (<$fh>) { my $k = $ic ? lc($_) : $_; chomp $k; $k = substr($k, $sch) if $sch; $k = substr($k, 0, $cch) if defined $cch; $k =~ s/[ \t]+//g if $ias; $res0{$k} //= []; push @{ $res0{$k} }, [$., $_]; } #use DD; dd \%res0; } else { while (<$fh>) { my $k = $ic ? lc($_) : $_; chomp $k; $k = substr($k, $sch) if $sch; $k = substr($k, 0, $cch) if defined $cch; $k =~ s/\s+//g if $ias; delete $res0{$k}; } } # print result if ($i == @ARGV) { my @lines = map { @{ $res0{$_} } } keys %res0; for (sort { $a->[0] <=> $b->[0] } @lines) { print $_->[1]; } } } else { if ($i == 1) { while (<$fh>) { $res{$_}++ } } else { while (<$fh>) { delete $res{$_}; } } # print result if ($i == @ARGV) { print for keys %res; } } } elsif ($op eq 'symdiff') { if ($ign) { if ($i == 1) { while (<$fh>) { my $k = $ic ? lc($_) : $_; $k =~ s/\s+//g if $ias; $res{$k} = [1,$_] unless exists $res{$k}; } } else { while (<$fh>) { my $k = $ic ? lc($_) : $_; $k =~ s/\s+//g if $ias; if (!$res{$k}) { $res{$k} = [1, $_]; } elsif ($res{$k}[0] <= 2) { $res{$k}[0]++; } } } # print result if ($i == @ARGV) { for (keys %res) { print $res{$_}[1] if $res{$_}[0] == 1; } } } else { if ($i == 1) { while (<$fh>) { $res{$_} = 1 } } else { while (<$fh>) { if (!$res{$_} || $res{$_} <= 2) { $res{$_}++; } } } # print result if ($i == @ARGV) { for (keys %res) { print if $res{$_} == 1; } } } } elsif ($op eq 'cross') { my $aos = []; while (<$fh>) { chomp; push @$aos, $_; } push @aoaos, $aos; # print result if ($i == @ARGV) { require Set::CrossProduct; my $iter = Set::CrossProduct->new(\@aoaos); my $glue = $Opts{glue}; while (my $tuple = $iter->get) { print join($glue, @$tuple), "\n"; } } } } } # MAIN parse_cmdline(); run(); 1; # ABSTRACT: Set operations (union, intersection, difference, symmetric diff) on lines of files # PODNAME: setop __END__ =pod =encoding UTF-8 =head1 NAME setop - Set operations (union, intersection, difference, symmetric diff) on lines of files =head1 VERSION This document describes version 0.122 of setop (from Perl distribution App-setop), released on 2019-07-03. =head1 SYNOPSIS setop [OPTION]... FILE FILE ... Examples: % setop --union file1 file2 file3 ;# combine files, duplicate lines removed, order preserved % setop --intersect file1 file2 file3; # show lines common in all three files % setop --diff <(ls /path1) <(ls /path2) ;# show lines in /path1 not in /path2 =head1 DESCRIPTION C<setop> treats files as a sets of lines, and performs operations between the sets. =head1 EXIT CODES 0 on success. 255 on I/O error. 99 on command-line options error. =head1 OPTIONS =over =item * --help, -h Show help message and exit. =item * --version, -v Show version and exit. =item * --op=S Pick operation. Known operations are: B<union> (return lines from the first file and others, duplicate lines removed [even duplicates from the same file], order preserved), B<intersect> (return common lines found in every file, order preserved), B<diff> (return lines found in the first file but not the second, duplicate removed, order preserved), B<symdiff> (short for "symetric difference", return lines found in either file but not both, duplicate removed, order preserved), B<cross> (short for "cross product", produce cross product from lines from each file). =item * --union Shortcut for C<--op union>. =item * --intersect Shortcut for C<--op intersect>. =item * --diff Shortcut for C<--op diff>. =item * --symdiff Shortcut for C<--op symdiff>. =item * --cross Shortcut for C<--op cross>. =item * --ignore-case, -i =item * --ignore-all-space, -w =item * --glue=s Glue character to use to combine lines when operation is B<cross>. =item * --skip-chars=i Number of characters to skip when comparing, much like the same option in B<uniq> command. Currently this is only implemented for C<diff> operation. =item * --check-chars=i Number of characters to check when comparing, much like the same option in B<uniq> command. Currently this is only implemented for C<diff> operation. =back =head1 HISTORY I first wrote C<fileop> in Ruby in 2003, since Ruby has nice C<+> and C<-> operators for arrays. Rewrote in Perl in 2014. Script renamed to C<setop>, changed command-line options a bit, now preserves order of lines. =head1 HOMEPAGE Please visit the project's homepage at L<https://metacpan.org/release/App-setop>. =head1 SOURCE Source repository is at L<https://github.com/perlancar/perl-App-setop>. =head1 BUGS Please report any bugs or feature requests on the bugtracker website L<https://rt.cpan.org/Public/Dist/Display.html?Name=App-setop> When submitting a bug or request, please include a test-file or a patch to an existing test-file that illustrates the bug or desired feature. =head1 SEE ALSO =head1 AUTHOR perlancar <perlancar@cpan.org> =head1 COPYRIGHT AND LICENSE This software is copyright (c) 2019, 2016, 2015, 2014 by perlancar@cpan.org. This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. =cut