bin/compare-code - metacpan.org


            
              1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
—
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
              #!/usr/bin/env perl
# Autor: Boris Däppen, 2015-2020
# No guarantee given, use at own risk and will
# PODNAME: compare-code
# ABSTRACT: find files with similar code
use strict;
use warnings;
use v5.14; # we can use 'for/when' but not 'given/when'
use utf8;
use feature 'say';
# because 'when' throws experimental warning after perl 5.18
no if $] >= 5.018, warnings => 'experimental::smartmatch';
use File::Slurp;
use File::Find::Rule;
use DateTime;
use IO::Prompt::Simple;
use Module::Load;
use Module::Installed::Tiny 'module_installed';
use School::Code::Compare;
use School::Code::Compare::Options;
use School::Code::Compare::Charset;
use School::Code::Compare::Judge;
use School::Code::Compare::Out;
# Combinatorial behaviour
# -----------------------------------------------------------------------------
#
# Number of comparisons:
# n! / ((n - m)! * m!)
# where
# n = number of elements (number of code files to compare)
# m = elements drawn     (always 2, since two files are compared with each other)
#
# For 100 scripts this gives
# 100! / (98! * 2!) = 4950
#
# Calculator: http://de.numberempire.com/combinatorialcalculator.php
##################
# OPTION PARSING #
##################
# Save the command line exactly as the user typed it; it is echoed later in
# the signature of the generated report.
my $argv_signature = join ' ', @ARGV;
my $o = School::Code::Compare::Options->new(@ARGV);
# Copy the parsed options into plain lexicals, so the code further down does
# not depend on the layout of the options object more than necessary.
my @dir           = defined $o->{dir} ? @{$o->{dir}} : ();
my $file          = $o->{file};
my $lang          = $o->{in};
my $output_format = $o->{out};
my $to_file       = $o->{persist};
my $verbose       = $o->{verbose};
my $algo          = $o->{charset};
my $do_prompt     = !$o->{yes};
my $hide_skipped  = !$o->{all};
my $mime_match    = $o->{mime};
my $sort          = $o->{sort};
my $split         = $o->{split};
my $basedir       = $o->{basedir};
# Drop one trailing slash: the project pattern built from $basedir later
# appends its own separator.
chop $basedir if (defined $basedir and $basedir =~ /\/$/);
# some input checking...
# Both checks match the whole value. An undefined value is rejected with the
# regular error message instead of an "uninitialized" warning.
if (!defined $algo or $algo !~ /^(?:visibles|numsignes|signes)$/) {
    say "charset not supported";
    exit 1;
}
# The pattern is anchored on purpose: unanchored, every value that merely
# contains one of the names (for instance any word with a "c") was accepted.
if (   !defined $lang
    or $lang !~ /^(?:hashy|python|perl|bash|slashy|php|js|cpp|cs|c|java|html|xml|txt)$/) {
    say "lang not supported";
    exit 1;
}
# File::LibMagic is an optional dependency; it is only loaded when the user
# asked for MIME matching.
my $magic;
if ($mime_match) {
    if (module_installed 'File::LibMagic') {
        load File::LibMagic;
        $magic = File::LibMagic->new();
    }
    else {
        say 'Option --mime needs the module File::LibMagic installed';
        exit 1;
    }
}
#####################
# GATHER INPUT DATA #
#####################
# The list of file paths comes from exactly one source, in this order of
# precedence: a list file (--file), the given directories, or STDIN.
my @FILE_LIST  = ();
if (defined $file) {
    @FILE_LIST = read_file( $file, binmode => ':utf8' );
}
elsif ( @dir ) {
    @FILE_LIST = File::Find::Rule->file()->in( @dir );
}
else {
    @FILE_LIST = <STDIN>;
}
# At least two files are needed to have anything to compare.
if (@FILE_LIST <= 1) {
    say scalar(@FILE_LIST) . ' files to compare, aborting...';
    exit 1;
}
# Calculate how many comparisons will be needed: every unordered pair of
# files is compared once, which is "n choose 2" = n * (n - 1) / 2.
my $file_count       = scalar @FILE_LIST;
my $comparison_count = $file_count * ($file_count - 1) / 2;
# since STDIN is processed we close it
# to avoid any trouble with later user interaction in the terminal
close STDIN;
# (maybe) ask if job should be started with the current input
if ($do_prompt) {
    # we reopen STDIN with the user's terminal attached, according to:
    # https://stackoverflow.com/questions/9484431/can-i-prompt-for-user-input-after-reading-piped-input-on-stdin-in-perl
    # NOTE: This might be a problem when trying to run on Windows!
    # The terminal is only needed for the prompt, so it is not opened at all
    # when prompting is disabled. A failure is reported but not fatal; the
    # prompt is still attempted, as before.
    open STDIN, "<", "/dev/tty"
        or warn "could not reopen STDIN from /dev/tty: $!\n";
    my $answer = prompt("$comparison_count comparisons needed, continue? [Y/n]"
                 , { output => *STDERR });
    # Abort on any answer starting with "n" or "N". Matching a lower case
    # "n" anywhere in the answer ignored "N" and "NO".
    exit 0 if (defined $answer and $answer =~ /^\s*n/i);
}
# close STDIN again, it is not used any further
close STDIN;
###################
# PREPROCESS DATA #
###################
say STDERR 'reading and preparing files...';
# Reduce the content of every file to the configured charset and remember it
# together with the path (and the MIME type, if MIME matching is requested).
my @files    = ();
for my $entry ( @FILE_LIST ) {
    chomp( $entry );
    # A trailing \r (carriage return, from DOS style input) in the path makes
    # read_file (sysopen) fail badly, so it is always stripped here.
    # The list is short enough that the extra check does not matter.
    chop($entry) if ($entry =~ m/\r$/);
    # Read the file as a list of records: lines by default, or pieces
    # separated by the user supplied --split value.
    my @content = do {
        local $/ = $o->{split} if ($o->{split});
        read_file( $entry, binmode => ':utf8' );
    };
    # try to detect MIME-type from (at most) the first three records
    my $mimetype  = '';
    my $mini_data = '';
    if ($mime_match) {
        for my $idx (0 .. 2) {
            $mini_data .= $content[$idx] if (defined $content[$idx]);
        }
        $mimetype = $magic->info_from_string($mini_data)->{mime_type};
    }
    my $charset  = School::Code::Compare::Charset->new();
    $charset->set_language($lang);
    # pick the reduction that belongs to the selected charset
    my $filtered;
    if ($algo eq 'visibles') {
        $filtered = $charset->get_visibles(\@content);
    }
    elsif ($algo eq 'numsignes') {
        $filtered = $charset->get_numsignes(\@content);
    }
    elsif ($algo eq 'signes') {
        $filtered = $charset->get_signes(\@content);
    }
    # sort if required
    if ($sort) {
        $filtered = $charset->sort_by_lines($filtered);
    }
    my %record = (
        "code_$algo" => join('', @{$filtered}),
        path         => $entry,
    );
    # the MIME type is only stored when one was detected
    $record{mime_type} = $mimetype if ($mimetype);
    push @files, \%record;
}
################################################
# DO THE ACTUAL WORK... COMPARING ALL THE DATA #
################################################
# Timestamp of this run; used later for the output file name and signature.
my $now      = DateTime->now;
my $comparer = School::Code::Compare->new()
                                    ->set_max_relative_difference(1.4)
                                    ->set_min_char_total        (20)
                                    ->set_max_relative_distance(0.6);
# Human readable description of every charset, shown in the report.
my %info = (
    visibles =>
        "All visible chars. Whitespace removed.",
    numsignes=>
        "Words and numbers ignored in meaning, but not in position. Whitespace removed",
    signes =>
        "Only special chars. Whitespace, letters, numbers and underscore removed.",
);
# measure Levenshtein distance within all possible file combinations
print STDERR "working on $algo... ";
my @result = ();
my $judge  = School::Code::Compare::Judge->new();
my $skip_report   = '';
my $skip_count    = 0;
my $lift_count = 0;
# Pattern capturing the directory directly below the base directory (the
# "project"). The base directory is quoted with \Q...\E, so that characters
# like "." or "+" in the path are matched literally and cannot break the
# pattern. It is compiled once, not once per comparison.
my $project_re = $basedir ? qr!^\Q$basedir\E/([^/]+)/! : undef;
for my $i (0 .. $#files - 1) {
    for my $j ($i + 1 .. $#files) {
        my $comparison = {};
        my $do_comparison = 1;
        if ($basedir) {
            my $path1 = $files[$i]->{path};
            my $path2 = $files[$j]->{path};
            chop $path1 if ($path1 =~ /\/$/);
            chop $path2 if ($path2 =~ /\/$/);
            # NOTE(review): a path that is not inside a project directory
            # yields the empty project name, so two such files are treated
            # as belonging to the same project - confirm this is intended.
            my $project1 = $path1 =~ $project_re ? $1 : '';
            my $project2 = $path2 =~ $project_re ? $1 : '';
            if ($project1 eq $project2) {
                $comparison = {
                    distance     => undef,
                    ratio        => undef,
                    length1      => undef,
                    length2      => undef,
                    delta_length => undef,
                    comment      => "skipped: same project: $project1",
                };
                $do_comparison = 0;
            }
        }
        # The MIME type is only stored for a file when one was detected, so
        # fall back to the empty string to avoid "uninitialized" warnings.
        my $mime1 = $files[$i]->{mime_type} // '';
        my $mime2 = $files[$j]->{mime_type} // '';
        if ($mime_match and $mime1 ne $mime2) {
            $comparison = {
                distance     => undef,
                ratio        => undef,
                length1      => undef,
                length2      => undef,
                delta_length => undef,
                comment      => 'skipped: different mime:'
                                . $mime1
                                . ' ; '
                                . $mime2,
            };
            $do_comparison = 0;
        }
        if ($do_comparison) {
            # Levenshtein
            $comparison = $comparer->measure( $files[$i]->{"code_$algo"},
                                              $files[$j]->{"code_$algo"}
                                            );
        }
        if ($verbose) {
            say STDERR '';
            say STDERR "---comparison $algo $i;$j---";
            say STDERR 'path1: '     . $files[$i]->{path};
            say STDERR 'path2: '     . $files[$j]->{path};
            say STDERR 'mime1:'      . $files[$i]->{mime_type}
                            if defined $files[$i]->{mime_type};
            say STDERR 'mime2:'      . $files[$j]->{mime_type}
                            if defined $files[$j]->{mime_type};
            say STDERR 'data1: '     . $files[$i]->{"code_$algo"}
                            if defined $files[$i]->{"code_$algo"};
            say STDERR 'data2: '     . $files[$j]->{"code_$algo"}
                            if defined $files[$j]->{"code_$algo"};
            say STDERR 'distance: '  . $comparison->{distance}
                            if defined $comparison->{distance};
            say STDERR 'length1: '   . $comparison->{length1}
                            if defined $comparison->{length1};
            say STDERR 'length2: '   . $comparison->{length2}
                            if defined $comparison->{length2};
            say STDERR 'similarity: '. $comparison->{ratio}
                            if defined $comparison->{ratio};
            say STDERR 'comment: '   . $comparison->{comment};
        }
        # throw the "skipped" comparisons away, to thin out the result
        if ( $comparison->{comment} =~ /^skipped/ ) {
            $skip_count++;
            next if ($hide_skipped);
        }
        else {
            $lift_count++;
        }
        $comparison->{file1} = $files[$i]->{path};
        $comparison->{file2} = $files[$j]->{path};
        $judge->look($comparison);
        push @result, $comparison;
    }
}
say STDERR "\tdone";
####################
# RENDERING OUTPUT #
####################
print STDERR "rendering...";
# Map the requested output format to the name the output module expects.
# Anything unknown (or missing) falls back to CSV. A plain if/elsif chain is
# used instead of the 'when' statement modifier, which belongs to the
# switch/smartmatch feature and triggers warnings on newer perls.
my $requested_format = $output_format // '';
my $format = 'CSV';
if ($requested_format =~ /^csv/) {
    $format = 'CSV';
}
elsif ($requested_format =~ /^html/) {
    $format = 'HTML';
}
elsif ($requested_format =~ /^tab/) {
    $format = 'TAB';
}
# Only with --persist is a file name built; otherwise it stays undefined.
my $filename = undef;
if ($to_file) {
    $filename =    'comparison_'
                 . $now->ymd() . '_' 
                 . $now->hms('-') . '_'
                 . $algo
                 . '.' 
                 . lc $format;
}
my $version = defined $School::Code::Compare::VERSION ?
                      $School::Code::Compare::VERSION : 'unknown';
# The signature documents when and with which command line the report
# was created.
my $signature = 'Created at '
                . $now->ymd() . ' '
                . $now->hms('-') . ' '
                . "with '$0 $argv_signature' in '$version' version.";
$skip_report = "From a total of $comparison_count possible comparisons,"
             . " $skip_count where skipped and $lift_count actually compared.";
$skip_report .= $hide_skipped ?
               ' Skipped results are not shown.' :
               ' All results listed.' ;
my $out = School::Code::Compare::Out->new();
$out->set_name($filename)
    ->set_format($format)
    ->set_lines(\@result)
    ->set_title($algo)
    ->set_description($info{$algo})
    ->set_signature($signature)
    ->set_endreport($skip_report)
    ->set_link('https://metacpan.org/pod/School::Code::Compare')
;
$out->write();
if (defined $filename) {
    say STDERR "\tdone. See $filename";
}
else {
    say STDERR "\tdone.";
}
__END__
=pod
=encoding UTF-8
=head1 NAME
compare-code - find files with similar code
=head1 VERSION
version 0.201
=head1 SYNOPSIS
This program is developed in an education/school environment.
Its purpose is to help detect similarities in the code of IT projects,
and therefore making assessments (more) fair.
The script compares files containing source code (or any plain text) to each other.
The general approach for comparison is:
whitespace and comments are always removed (see C<--help> for more), then the comparison is done using the Levenshtein algorithm.
Future releases may bring more sophisticated techniques.
This program is written in the Perl Programming Language.
If you are unfamiliar with GNU/Linux you might want to read L<doc::Windows> in the doc directory.
=head2 Example Usage
 compare-code ./lib -i perl
 compare-code -i cpp -f list_of_filepaths.txt -o html -p
 find path/to/projects -type f -name Cow.java | compare-code -i java 
=head2 Options
 compare-code [DIR...] [OPTIONS...]
   
 Arguments:
   DIR             analyse files in given directory
                   Input can otherwise also be specified over:
                     - the option --file / -f
                     - STDIN, receiving filepaths (e.g. from a find command)
   
 Options:
   --all,     -a   show all results in output
                   Don't hide skipped comparisons.
                   Will sometimes cause a lot of output.
   --basedir, -b   skip comparisons within projects under base directory
                   Folders one below will be seen as project directories.
                   Files inside projects will not be compared with each other.
                   (This will currently not work on Windows)
   --charset, -c   chars used for comparison
                   Define one or more subsets of chars, used to compare the files:
                     - visibles
                         all chars without whitespace
                     - numsignes (default)
                         like visibles, but words ignored in meaning (but not in position)
                     - signes
                         only special chars, no words or numbers
   --file,    -f   file to read from (containing filepaths)
   --help,    -h   show this manual
   --in,      -i   input format, optimize for language
                   Comments get stripped from code.
                   Supported arguments:
                     - hashy:  python, perl, bash
                     - slashy: php, js, java, cpp, cs, c
                     - html, xml
                     - txt (default, no effect)
   --mime,    -m   only compare if same MIME-type
                   This option needs the Perl Library File::LibMagic installed.
                   You will also need libmagic development files on your system.
   --out,     -o   output format
                   You can define an output format:
                     - html
                     - tab (default)
                     - csv
   --persist, -p   print result to file (instead of STDOUT)
                   Saved in local directory with name pattern:
                     - comparison_[year-month-day-hour-minute]_[method].[format]
   --sort,    -s   sort data by line before comparison
                   Useful to ignore order of method declaration.
                   See --split if you need to sort by something else than by line.
   --split,   -t   Split files on something else than newline
                   You might want to split for sentences with '\.' in normal text.
                   Use this option together with --sort.
   --verbose, -v   show actually compared data on STDERR
   --yes,     -y   Don't prompt for questions
                   Program will start working without further confirmation.
                   (Answer all user prompts with [yes])
=head1 AUTHOR
Boris Däppen <bdaeppen.perl@gmail.com>
=head1 COPYRIGHT AND LICENSE
This software is copyright (c) 2023 by Boris Däppen.
This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.
=cut
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)