#!/usr/bin/env perl
# Autor: Boris DÃ¤ppen, 2015-2020
# No guarantee given, use at own risk and will

# PODNAME: compare-code
# ABSTRACT: find files with similar code

use strict;
use warnings;
use v5.14; # we can use 'for/when' but not 'given/when'
use utf8;
use feature 'say';

# because 'when' throws experimental warning after perl 5.18
no if $] >= 5.018, warnings => 'experimental::smartmatch';

use File::Slurp;
use File::Find::Rule;
use DateTime;
use IO::Prompt::Simple;
use Module::Load;
use Module::Installed::Tiny 'module_installed';

use School::Code::Compare;
use School::Code::Compare::Options;
use School::Code::Compare::Charset;
use School::Code::Compare::Judge;
use School::Code::Compare::Out;

# Kombinatorisches Verhalten
# -----------------------------------------------------------------------------
#
# Anzahl Vergleiche:
# n! / ((n - m)! * m!)
# wenn
# n = Anzahl Elemente   (Anzahl Code-Dateien die zu Vergleichen sind)
# m = gezogene Elemente (immer 2, da zwei Dateien miteinander verglichen werden)
#
# Bei 100 Skripte gibt das
# 100! / (98! * 2!) = 4950
#
# Rechner: http://de.numberempire.com/combinatorialcalculator.php

##################
# OPTION PARSING #
##################

# safe options to string, as given by user
my $argv_signature = join ' ', @ARGV;

my $o = School::Code::Compare::Options->new(@ARGV);

# try not to use outside interface further down in the code...
my @dir           = defined $o->{dir} ? @{$o->{dir}} : ();
my $file          = $o->{file};
my $lang          = $o->{in};
my $output_format = $o->{out};
my $to_file       = $o->{persist};
my $verbose       = $o->{verbose};
my $algo          = $o->{charset};
my $do_prompt     = !$o->{yes};
my $hide_skipped  = !$o->{all};
my $mime_match    = $o->{mime};
my $sort          = $o->{sort};
my $split         = $o->{split};
my $basedir       = $o->{basedir};

chop $basedir if (defined $basedir and $basedir =~ /\/$/);

# some input checking...
if ($algo !~ /^visibles$|^numsignes$|^signes$/) {
    say "charset not supported";
    exit 1;
}
if ($lang !~ /hashy|python|perl|bash|slashy|php|js|cpp|cs|c|java|html|xml|txt/) {
    say "lang not supported";
    exit 1;
}

my $magic;
if ($mime_match) {
    if (module_installed 'File::LibMagic') {
        load File::LibMagic;
        $magic = File::LibMagic->new();
    }
    else {
        say 'Option --mime needs the module File::LibMagic installed';
        exit 1;
    }
}

#####################
# GATHER INPUT DATA #
#####################

my @FILE_LIST  = ();
if (defined $file) {
    @FILE_LIST = read_file( $file, binmode => ':utf8' );
}
elsif ( @dir ) {
    @FILE_LIST = File::Find::Rule->file()->in( @dir );
}
else {
    @FILE_LIST = <STDIN>;
}

say scalar @FILE_LIST . ' files to compare, aborting...'
    and exit 1
    if (@FILE_LIST <= 1);

# Calulate how many comparisons will be needed
# TODO: maybe use math insead of loop
my $comparison_count = 0;
for (my $i=0; $i < @FILE_LIST - 1; $i++) {
    for (my $j=$i+1; $j < @FILE_LIST; $j++) {
        $comparison_count++;
    }
}

# since STDIN is processed we close it
# to avoid any trouble with later user ineraction in terminal
close STDIN;

# we reopen STDIN with the users terminal attached, accoring to comment here:
# https://stackoverflow.com/questions/9484431/can-i-prompt-for-user-input-after-reading-piped-input-on-stdin-in-perl
# NOTE: This might be a problem, when trying to run on windows!
open STDIN, "<", "/dev/tty";

# (maybe) ask if job should be started with the current input
if ($do_prompt) {
    my $answer = prompt("$comparison_count comparisons needed, continue? [Y/n]"
                 , { output => *STDERR });
    exit 0 if ($answer =~ /n/);
}

# close STDIN again, to undo our sins from above
close STDIN;

###################
# PREPROCESS DATA #
###################

say STDERR 'reading and preparing files...';

# simplify all file content and store it together with the path
my @files    = ();

foreach my $filepath ( @FILE_LIST ) {
    chomp( $filepath );
    # \r (carriage return) causes a nasty error, if it occures in the path
    # for read_file (sysopen). so we ensure, that there is no \r. ever.
    # since this loop isn't to big, the additional check isn't really an issue.
    chop($filepath) if ($filepath =~ m/\r$/); # deal with dos input

    my @content;
    if ($o->{split}) {
        local $/ = $o->{split};
        @content = read_file( $filepath, binmode => ':utf8' ) ;
    }
    else {
        @content = read_file( $filepath, binmode => ':utf8' ) ;
    }

    # try to detect MIME-type
    my $mimetype = '';
    my $mini_data = '';
    if ($mime_match) {
        $mini_data .= $content[0] if (defined $content[0]);
        $mini_data .= $content[1] if (defined $content[1]);
        $mini_data .= $content[2] if (defined $content[2]);

        $mimetype = $magic->info_from_string($mini_data)->{mime_type};
    }

    my $charset  = School::Code::Compare::Charset->new();
    $charset->set_language($lang);

    my $filtered;
    $filtered = $charset->get_visibles (\@content) if ($algo  eq 'visibles');
    $filtered = $charset->get_numsignes(\@content) if ($algo  eq 'numsignes');
    $filtered = $charset->get_signes   (\@content) if ($algo  eq 'signes'  );

    # sort if required
    $filtered = $charset->sort_by_lines($filtered) if ($sort);

    my $info = {};

    $info->{"code_$algo"} = join '', @{$filtered};
    $info->{path}         = $filepath;
    $info->{mime_type}    = $mimetype if ($mimetype);

    push @files, $info;
}

################################################
# DO THE ACTUAL WORK... COMPARING ALL THE DATA #
################################################

my $now      = DateTime->now;
my $comparer = School::Code::Compare->new()
                                    ->set_max_relative_difference(1.4)
                                    ->set_min_char_total        (20)
                                    ->set_max_relative_distance(0.6);

my %info = (
    visibles =>
        "All visible chars. Whitespace removed.",
    numsignes=>
        "Words and numbers ignored in meaning, but not in position. Whitespace removed",
    signes =>
        "Only special chars. Whitespace, letters, numbers and underscore removed.",
);

# measure Levenshtein distance within all possible file combinations
print STDERR "working on $algo... ";

my @result = ();
my $judge  = School::Code::Compare::Judge->new();

my $skip_report   = '';
my $skip_count    = 0;
my $lift_count = 0;

for (my $i=0; $i < @files - 1; $i++) {
    for (my $j=$i+1; $j < @files; $j++) {

        my $comparison = {};
        my $do_comparison = 1;

        if ($basedir) {
            my $path1 = $files[$i]->{path};
            my $path2 = $files[$j]->{path};

            chop $path1 if ($path1 =~ /\/$/);
            chop $path2 if ($path2 =~ /\/$/);

            my $project1 = '';
            if ($path1 =~ qr!^$basedir/([^/]+)/! ){
                $project1 = $1
            }
            my $project2 = '';
            if ($path2 =~ qr!^$basedir/([^/]+)/! ){
                $project2 = $1
            }

            if ($project1 eq $project2) {
                $comparison = {
                    distance     => undef,
                    ratio        => undef,
                    length1      => undef,
                    length2      => undef,
                    delta_length => undef,
                    comment      => "skipped: same project: $project1",
                };
                $do_comparison = 0;
            }
        }

        if ($mime_match and $files[$i]->{mime_type}
                            ne $files[$j]->{mime_type}) {
            $comparison = {
                distance     => undef,
                ratio        => undef,
                length1      => undef,
                length2      => undef,
                delta_length => undef,
                comment      => 'skipped: different mime:'
                                . $files[$i]->{mime_type}
                                . ' ; '
                                . $files[$j]->{mime_type},
            };
            $do_comparison = 0;
        }

        if ($do_comparison) {
            # Levenshtein
            $comparison = $comparer->measure( $files[$i]->{"code_$algo"},
                                              $files[$j]->{"code_$algo"}
                                            );
        }
        $do_comparison = 1;

        if ($verbose) {
            say STDERR '';
            say STDERR "---comparison $algo $i;$j---";
            say STDERR 'path1: '     . $files[$i]->{path};
            say STDERR 'path2: '     . $files[$j]->{path};
            say STDERR 'mime1:'      . $files[$i]->{mime_type}
                            if defined $files[$i]->{mime_type};
            say STDERR 'mime2:'      . $files[$j]->{mime_type}
                            if defined $files[$j]->{mime_type};
            say STDERR 'data1: '     . $files[$i]->{"code_$algo"}
                            if defined $files[$i]->{"code_$algo"};
            say STDERR 'data2: '     . $files[$j]->{"code_$algo"}
                            if defined $files[$j]->{"code_$algo"};
            say STDERR 'distance: '  . $comparison->{distance}
                            if defined $comparison->{distance};
            say STDERR 'length1: '   . $comparison->{length1}
                            if defined $comparison->{length1};
            say STDERR 'length2: '   . $comparison->{length2}
                            if defined $comparison->{length2};
            say STDERR 'similarity: '. $comparison->{ratio}
                            if defined $comparison->{ratio};
            say STDERR 'comment: '   . $comparison->{comment};
        }

        # throw the "skipped" comparisons away, to thin out the result
        if ( $comparison->{comment} =~ /^skipped/ ) {
            $skip_count++;
            next if ($hide_skipped);
        }
        else {
            $lift_count++;
        }

        $comparison->{file1} = $files[$i]->{path};
        $comparison->{file2} = $files[$j]->{path};

        $judge->look($comparison);

        push @result, $comparison;
    }
}

say STDERR "\tdone";

####################
# RENDERING OUTPUT #
####################

print STDERR "rendering...";

my $format = 'CSV';
for ($output_format) {
    $format = 'CSV'  when /^csv/;
    $format = 'HTML' when /^html/;
    $format = 'TAB'  when /^tab/;
}

my $filename = undef;
if ($to_file) {
    $filename =    'comparison_'
                 . $now->ymd() . '_' 
                 . $now->hms('-') . '_'
                 . $algo
                 . '.' 
                 . lc $format;
}

my $version = defined $School::Code::Compare::VERSION ?
                      $School::Code::Compare::VERSION : 'unknown';
my $signature = 'Created at '
                . $now->ymd() . ' '
                . $now->hms('-') . ' '
                . "with '$0 $argv_signature' in '$version' version.";

$skip_report = "From a total of $comparison_count possible comparisons,"
             . " $skip_count where skipped and $lift_count actually compared.";
$skip_report .= $hide_skipped ?
               ' Skipped results are not shown.' :
               ' All results listed.' ;

my $out = School::Code::Compare::Out->new();

$out->set_name($filename)
    ->set_format($format)
    ->set_lines(\@result)
    ->set_title($algo)
    ->set_description($info{$algo})
    ->set_signature($signature)
    ->set_endreport($skip_report)
    ->set_link('https://metacpan.org/pod/School::Code::Compare')
;

$out->write();

if (defined $filename) {
    say STDERR "\tdone. See $filename";
}
else {
    say STDERR "\tdone.";
}

__END__

=pod

=encoding UTF-8

=head1 NAME

compare-code - find files with similar code

=head1 VERSION

version 0.201

=head1 SYNOPSIS

This program is developed in an education/school environment.
It's purpose is to help detect similiarities in the code of IT projects,
and therefore making assessments (more) fair.

The script compares files containing source code (or any plain text) to each other.
The general approach for comparison is:
whitespace and comments are always removed (see C<--help> for more), then the comparison is done using the Levenshtein algorithm.
Future releases may bring more sophisticated techniques.

This program is written in the Perl Programming Language.

If you are unfamiliar with GNU/Linux you might want to read L<doc::Windows> in the doc directory.

=head2 Example Usage

 compare-code ./lib -i perl

 compare-code -i cpp -f list_of_filepaths.txt -o html -p

 find path/to/projects -type f -name Cow.java | compare-code -i java 

=head2 Options

 compare-code [DIR...] [OPTIONS...]
  
 Arguments:
   DIR             analyse files in given directory
                   Input can otherwise also be specified over:
                     - the option --file / -f
                     - STDIN, receiving filepaths (e.g. from a find command)
  
 Options:
   --all,     -a   show all results in output
                   Don't hide skipped comparisons.
                   Will somethimes cause a lot of output.
   --basedir, -b   skip comparisons within projects under base directory
                   Folders one below will be seen as project directories.
                   Files inside projects will not be compared with each other.
                   (This will currently not work on Windows)
   --charset, -c   chars used for comparison
                   Define one or more subsets of chars, used to compare the files:
                     - visibles
                         all chars without witespace
                     - numsignes (default)
                         like visibles, but words ignored in meaning (but not in position)
                     - signes
                         only special chars, no words or numbers
   --file,    -f   file to read from (containing filepaths)
   --help,    -h   show this manual
   --in,      -i   input format, optimize for language
                   Comments get stripped from code.
                   Supportet arguments:
                     - hashy:  python, perl, bash
                     - slashy: php, js, java, cpp, cs, c
                     - html, xml
                     - txt (default, no effect)
   --mime,    -m   only compare if same MIME-type
                   This options needs the Perl Library File::LibMagic installed.
                   You will also need libmagic development files on your system.
   --out,     -o   output format
                   You can define an output format:
                     - html
                     - tab (default)
                     - csv
   --persist, -p   print result to file (instead STDOUT)
                   Saved in local directory with name pattern:
                     - comparison_[year-month-day-hour-minute]_[method].[format]
   --sort,    -s   sort data by line before comparison
                   Useful to ignore order of method declaration.
                   See --split if you need to sort by something else then by line.
   --split,   -t   Split files on something else then newline
                   You might want to split for sentences with '\.' in normal text.
                   Use this option together with --sort.
   --verbose, -v   show actually compared data on STDERR
   --yes,     -y   Don't prompt for questions
                   Program will start working without further confirmation.
                   (Answer all user prompts with [yes])

=head1 AUTHOR

Boris DÃ¤ppen <bdaeppen.perl@gmail.com>

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2023 by Boris DÃ¤ppen.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut