The Perl and Raku Conference 2025: Greenville, South Carolina - June 27-29 Learn more

#!/usr/bin/env perl
# Autor: Boris Däppen, 2015-2020
# No guarantee given, use at own risk and will
# PODNAME: compare-code
# ABSTRACT: find files with similar code
use strict;
use v5.14; # we can use 'for/when' but not 'given/when'
use utf8;
use feature 'say';
# because 'when' throws experimental warning after perl 5.18
no if $] >= 5.018, warnings => 'experimental::smartmatch';
use Module::Installed::Tiny 'module_installed';
# Kombinatorisches Verhalten
# -----------------------------------------------------------------------------
#
# Anzahl Vergleiche:
# n! / ((n - m)! * m!)
# wenn
# n = Anzahl Elemente (Anzahl Code-Dateien die zu Vergleichen sind)
# m = gezogene Elemente (immer 2, da zwei Dateien miteinander verglichen werden)
#
# Bei 100 Skripte gibt das
# 100! / (98! * 2!) = 4950
#
##################
# OPTION PARSING #
##################
# safe options to string, as given by user
my $argv_signature = join ' ', @ARGV;
my $o = School::Code::Compare::Options->new(@ARGV);
# try not to use outside interface further down in the code...
my @dir = defined $o->{dir} ? @{$o->{dir}} : ();
my $file = $o->{file};
my $lang = $o->{in};
my $output_format = $o->{out};
my $to_file = $o->{persist};
my $verbose = $o->{verbose};
my $algo = $o->{charset};
my $do_prompt = !$o->{yes};
my $hide_skipped = !$o->{all};
my $mime_match = $o->{mime};
my $sort = $o->{sort};
my $split = $o->{split};
my $basedir = $o->{basedir};
chop $basedir if (defined $basedir and $basedir =~ /\/$/);
# some input checking...
if ($algo !~ /^visibles$|^numsignes$|^signes$/) {
say "charset not supported";
exit 1;
}
if ($lang !~ /hashy|python|perl|bash|slashy|php|js|cpp|cs|c|java|html|xml|txt/) {
say "lang not supported";
exit 1;
}
my $magic;
if ($mime_match) {
if (module_installed 'File::LibMagic') {
load File::LibMagic;
$magic = File::LibMagic->new();
}
else {
say 'Option --mime needs the module File::LibMagic installed';
exit 1;
}
}
#####################
# GATHER INPUT DATA #
#####################
my @FILE_LIST = ();
if (defined $file) {
@FILE_LIST = read_file( $file, binmode => ':utf8' );
}
elsif ( @dir ) {
@FILE_LIST = File::Find::Rule->file()->in( @dir );
}
else {
@FILE_LIST = <STDIN>;
}
say scalar @FILE_LIST . ' files to compare, aborting...'
and exit 1
if (@FILE_LIST <= 1);
# Calulate how many comparisons will be needed
# TODO: maybe use math insead of loop
my $comparison_count = 0;
for (my $i=0; $i < @FILE_LIST - 1; $i++) {
for (my $j=$i+1; $j < @FILE_LIST; $j++) {
$comparison_count++;
}
}
# since STDIN is processed we close it
# to avoid any trouble with later user ineraction in terminal
close STDIN;
# we reopen STDIN with the users terminal attached, accoring to comment here:
# NOTE: This might be a problem, when trying to run on windows!
open STDIN, "<", "/dev/tty";
# (maybe) ask if job should be started with the current input
if ($do_prompt) {
my $answer = prompt("$comparison_count comparisons needed, continue? [Y/n]"
, { output => *STDERR });
exit 0 if ($answer =~ /n/);
}
# close STDIN again, to undo our sins from above
close STDIN;
###################
# PREPROCESS DATA #
###################
say STDERR 'reading and preparing files...';
# simplify all file content and store it together with the path
my @files = ();
foreach my $filepath ( @FILE_LIST ) {
chomp( $filepath );
# \r (carriage return) causes a nasty error, if it occures in the path
# for read_file (sysopen). so we ensure, that there is no \r. ever.
# since this loop isn't to big, the additional check isn't really an issue.
chop($filepath) if ($filepath =~ m/\r$/); # deal with dos input
my @content;
if ($o->{split}) {
local $/ = $o->{split};
@content = read_file( $filepath, binmode => ':utf8' ) ;
}
else {
@content = read_file( $filepath, binmode => ':utf8' ) ;
}
# try to detect MIME-type
my $mimetype = '';
my $mini_data = '';
if ($mime_match) {
$mini_data .= $content[0] if (defined $content[0]);
$mini_data .= $content[1] if (defined $content[1]);
$mini_data .= $content[2] if (defined $content[2]);
$mimetype = $magic->info_from_string($mini_data)->{mime_type};
}
my $charset = School::Code::Compare::Charset->new();
$charset->set_language($lang);
my $filtered;
$filtered = $charset->get_visibles (\@content) if ($algo eq 'visibles');
$filtered = $charset->get_numsignes(\@content) if ($algo eq 'numsignes');
$filtered = $charset->get_signes (\@content) if ($algo eq 'signes' );
# sort if required
$filtered = $charset->sort_by_lines($filtered) if ($sort);
my $info = {};
$info->{"code_$algo"} = join '', @{$filtered};
$info->{path} = $filepath;
$info->{mime_type} = $mimetype if ($mimetype);
push @files, $info;
}
################################################
# DO THE ACTUAL WORK... COMPARING ALL THE DATA #
################################################
my $now = DateTime->now;
my $comparer = School::Code::Compare->new()
->set_max_relative_difference(1.4)
->set_min_char_total (20)
->set_max_relative_distance(0.6);
my %info = (
visibles =>
"All visible chars. Whitespace removed.",
numsignes=>
"Words and numbers ignored in meaning, but not in position. Whitespace removed",
signes =>
"Only special chars. Whitespace, letters, numbers and underscore removed.",
);
# measure Levenshtein distance within all possible file combinations
print STDERR "working on $algo... ";
my @result = ();
my $judge = School::Code::Compare::Judge->new();
my $skip_report = '';
my $skip_count = 0;
my $lift_count = 0;
for (my $i=0; $i < @files - 1; $i++) {
for (my $j=$i+1; $j < @files; $j++) {
my $comparison = {};
my $do_comparison = 1;
if ($basedir) {
my $path1 = $files[$i]->{path};
my $path2 = $files[$j]->{path};
chop $path1 if ($path1 =~ /\/$/);
chop $path2 if ($path2 =~ /\/$/);
my $project1 = '';
if ($path1 =~ qr!^$basedir/([^/]+)/! ){
$project1 = $1
}
my $project2 = '';
if ($path2 =~ qr!^$basedir/([^/]+)/! ){
$project2 = $1
}
if ($project1 eq $project2) {
$comparison = {
distance => undef,
ratio => undef,
length1 => undef,
length2 => undef,
delta_length => undef,
comment => "skipped: same project: $project1",
};
$do_comparison = 0;
}
}
if ($mime_match and $files[$i]->{mime_type}
ne $files[$j]->{mime_type}) {
$comparison = {
distance => undef,
ratio => undef,
length1 => undef,
length2 => undef,
delta_length => undef,
comment => 'skipped: different mime:'
. $files[$i]->{mime_type}
. ' ; '
. $files[$j]->{mime_type},
};
$do_comparison = 0;
}
if ($do_comparison) {
# Levenshtein
$comparison = $comparer->measure( $files[$i]->{"code_$algo"},
$files[$j]->{"code_$algo"}
);
}
$do_comparison = 1;
if ($verbose) {
say STDERR '';
say STDERR "---comparison $algo $i;$j---";
say STDERR 'path1: ' . $files[$i]->{path};
say STDERR 'path2: ' . $files[$j]->{path};
say STDERR 'mime1:' . $files[$i]->{mime_type}
if defined $files[$i]->{mime_type};
say STDERR 'mime2:' . $files[$j]->{mime_type}
if defined $files[$j]->{mime_type};
say STDERR 'data1: ' . $files[$i]->{"code_$algo"}
if defined $files[$i]->{"code_$algo"};
say STDERR 'data2: ' . $files[$j]->{"code_$algo"}
if defined $files[$j]->{"code_$algo"};
say STDERR 'distance: ' . $comparison->{distance}
if defined $comparison->{distance};
say STDERR 'length1: ' . $comparison->{length1}
if defined $comparison->{length1};
say STDERR 'length2: ' . $comparison->{length2}
if defined $comparison->{length2};
say STDERR 'similarity: '. $comparison->{ratio}
if defined $comparison->{ratio};
say STDERR 'comment: ' . $comparison->{comment};
}
# throw the "skipped" comparisons away, to thin out the result
if ( $comparison->{comment} =~ /^skipped/ ) {
$skip_count++;
next if ($hide_skipped);
}
else {
$lift_count++;
}
$comparison->{file1} = $files[$i]->{path};
$comparison->{file2} = $files[$j]->{path};
$judge->look($comparison);
push @result, $comparison;
}
}
say STDERR "\tdone";
####################
# RENDERING OUTPUT #
####################
print STDERR "rendering...";
my $format = 'CSV';
for ($output_format) {
$format = 'CSV' when /^csv/;
$format = 'HTML' when /^html/;
$format = 'TAB' when /^tab/;
}
my $filename = undef;
if ($to_file) {
$filename = 'comparison_'
. $now->ymd() . '_'
. $now->hms('-') . '_'
. $algo
. '.'
. lc $format;
}
my $version = defined $School::Code::Compare::VERSION ?
$School::Code::Compare::VERSION : 'unknown';
my $signature = 'Created at '
. $now->ymd() . ' '
. $now->hms('-') . ' '
. "with '$0 $argv_signature' in '$version' version.";
$skip_report = "From a total of $comparison_count possible comparisons,"
. " $skip_count where skipped and $lift_count actually compared.";
$skip_report .= $hide_skipped ?
' Skipped results are not shown.' :
' All results listed.' ;
my $out = School::Code::Compare::Out->new();
$out->set_name($filename)
->set_format($format)
->set_lines(\@result)
->set_title($algo)
->set_description($info{$algo})
->set_signature($signature)
->set_endreport($skip_report)
;
$out->write();
if (defined $filename) {
say STDERR "\tdone. See $filename";
}
else {
say STDERR "\tdone.";
}
__END__
=pod
=encoding UTF-8
=head1 NAME
compare-code - find files with similar code
=head1 VERSION
version 0.201
=head1 SYNOPSIS
This program is developed in an education/school environment.
It's purpose is to help detect similiarities in the code of IT projects,
and therefore making assessments (more) fair.
The script compares files containing source code (or any plain text) to each other.
The general approach for comparison is:
whitespace and comments are always removed (see C<--help> for more), then the comparison is done using the Levenshtein algorithm.
Future releases may bring more sophisticated techniques.
This program is written in the Perl Programming Language.
If you are unfamiliar with GNU/Linux you might want to read L<doc::Windows> in the doc directory.
=head2 Example Usage
compare-code ./lib -i perl
compare-code -i cpp -f list_of_filepaths.txt -o html -p
find path/to/projects -type f -name Cow.java | compare-code -i java
=head2 Options
compare-code [DIR...] [OPTIONS...]
Arguments:
DIR analyse files in given directory
Input can otherwise also be specified over:
- the option --file / -f
- STDIN, receiving filepaths (e.g. from a find command)
Options:
--all, -a show all results in output
Don't hide skipped comparisons.
Will somethimes cause a lot of output.
--basedir, -b skip comparisons within projects under base directory
Folders one below will be seen as project directories.
Files inside projects will not be compared with each other.
(This will currently not work on Windows)
--charset, -c chars used for comparison
Define one or more subsets of chars, used to compare the files:
- visibles
all chars without witespace
- numsignes (default)
like visibles, but words ignored in meaning (but not in position)
- signes
only special chars, no words or numbers
--file, -f file to read from (containing filepaths)
--help, -h show this manual
--in, -i input format, optimize for language
Comments get stripped from code.
Supportet arguments:
- hashy: python, perl, bash
- slashy: php, js, java, cpp, cs, c
- html, xml
- txt (default, no effect)
--mime, -m only compare if same MIME-type
This options needs the Perl Library File::LibMagic installed.
You will also need libmagic development files on your system.
--out, -o output format
You can define an output format:
- html
- tab (default)
- csv
--persist, -p print result to file (instead STDOUT)
Saved in local directory with name pattern:
- comparison_[year-month-day-hour-minute]_[method].[format]
--sort, -s sort data by line before comparison
Useful to ignore order of method declaration.
See --split if you need to sort by something else then by line.
--split, -t Split files on something else then newline
You might want to split for sentences with '\.' in normal text.
Use this option together with --sort.
--verbose, -v show actually compared data on STDERR
--yes, -y Don't prompt for questions
Program will start working without further confirmation.
(Answer all user prompts with [yes])
=head1 AUTHOR
Boris Däppen <bdaeppen.perl@gmail.com>
=head1 COPYRIGHT AND LICENSE
This software is copyright (c) 2023 by Boris Däppen.
This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.
=cut