#!/usr/bin/env perl
# ABSTRACT: Concatenate FASTA or FASTQ files
# PODNAME: fu-cat

use 5.012;
use warnings FATAL => 'all';
use Getopt::Long;
use Pod::Usage;
use File::Basename;
use FindBin qw($RealBin);
use Digest::MD5 qw(md5_base64);
use FASTX::Reader;
# The following placeholder is to be programmatically replaced with 'use lib "$RealBin/../lib"' if needed
#~loclib~
# Prefer the in-tree library when running from a source checkout, recognised by
# the presence of both lib/Proch/N50.pm and the Changes file in the parent of
# the script directory.
# 'use lib' is a compile-time statement: wrapped in a run-time 'if' it would be
# applied unconditionally. The test is therefore done inside BEGIN, so that it
# really guards the @INC change and still runs before 'use Proch::Seqfu' is
# compiled.
BEGIN {
    if ( -e "$RealBin/../lib/Proch/N50.pm" and -e "$RealBin/../Changes" ) {
        require lib;
        lib->import("$RealBin/../lib");
    }
}
use Proch::Seqfu;
# Fall back to a placeholder when the module does not declare a version
my $VERSION = $Proch::Seqfu::VERSION // "<Dev>";
# Name of the script as invoked, used in user-facing messages
my $BASENAME = basename($0);

# Command line switches: they stay undefined unless given on the command line
my (
    $opt_force_fasta, $opt_force_fastq, $opt_upper,
    $opt_help,        $opt_verbose,     $opt_version,
);
my ($opt_minlen, $opt_maxlen, $opt_dereplicate, $opt_rename_md5);
# Parsed from -r/--rename; not referenced elsewhere in the visible code
my $opt_rename_string;
# String placed between a duplicated name and its progressive number
my $opt_separator = '.';
# Default line width is whatever Proch::Seqfu currently uses
my $opt_line_size = $Proch::Seqfu::fu_linesize;

my $options_ok = GetOptions(
    'f|fasta'       => \$opt_force_fasta,
    'q|fastq'       => \$opt_force_fastq,
    'd|dereplicate' => \$opt_dereplicate,
    '5|rename-md5'  => \$opt_rename_md5,
    'r|rename=s'    => \$opt_rename_string,
    's|separator=s' => \$opt_separator,
    'w|width=i'     => \$opt_line_size,
    'u|uppercase'   => \$opt_upper,
    'l|minlen=i'    => \$opt_minlen,
    'm|maxlen=i'    => \$opt_maxlen,
    'version'       => \$opt_version,
    'verbose'       => \$opt_verbose,
    'help'          => \$opt_help,
);

# Invalid switches: point the user to the manual and exit with an error status
unless ($options_ok) {
    say STDERR "Wrong parameters: type $BASENAME --help for full documentation.";
    exit 1;
}


# Hand the line width and the verbosity flag over to the Proch::Seqfu module
($Proch::Seqfu::fu_linesize, $Proch::Seqfu::fu_verbose) = ($opt_line_size, $opt_verbose);

# --fasta and --fastq are mutually exclusive
die " PARAMETER ERROR:\n You must specify either --fasta or --fastq (or none), not both.\n"
    if ($opt_force_fasta and $opt_force_fastq);

# --version: version() prints the version strings and terminates the program
version() if ($opt_version);

# --help: render the full manual and exit successfully
pod2usage({ -verbose => 2, -exitval => 0 }) if ($opt_help);

# No input given: print a short notice, then read from STDIN ('-')
unless (defined $ARGV[0]) {
    usage();
    push @ARGV, '-';
}


# Autodetect output format if not specified
# Autodetect the output format if neither --fasta nor --fastq was given:
# FASTA as soon as one input is FASTA or is STDIN (which is not prescanned),
# FASTQ otherwise.
if (not $opt_force_fasta and not $opt_force_fastq) {
    # Prescan all the files ...
    for my $filename (@ARGV) {

        if ($filename eq '-' or $filename eq '{{STDIN}}') {
            $opt_force_fasta = 1;
            verbose("When requiring to parse STDIN, output format is defaulted to FASTA (unless --fastq is on)");
        }

        # Nothing left to detect once FASTA has been selected
        next if ($opt_force_fasta);

        verbose("Autodetecting ");
        my $fx_reader = FASTX::Reader->new({ filename => "$filename" });
        my $format    = $fx_reader->getFileFormat("$filename");

        # NOTE(review): getFileFormat is assumed to possibly return an undefined
        # value for empty/unrecognised input (to be confirmed). Under FATAL
        # warnings, comparing undef with 'eq' would abort the program, so the
        # value is checked first. Such inputs simply do not select FASTA.
        $opt_force_fasta = 1 if (defined $format and $format eq 'fasta');
    }

    if ($opt_force_fasta) {
        verbose("Output format: FASTA");
    } else {
        verbose("Output format: FASTQ");
        $opt_force_fastq = 1;
    }
}


# Number of records already printed under each original sequence name
my %printed_seq_names = ();
# Number of times each sequence digest has been seen (used by --dereplicate)
my %printed_seqs      = ();

# Process all the files
for my $filename (@ARGV) {
    say STDERR " - $filename" if ($Proch::Seqfu::fu_verbose);

    # '-' is passed to the reader as '{{STDIN}}'. Work on a copy: the loop
    # variable aliases the elements of @ARGV.
    my $input  = ($filename eq '-') ? '{{STDIN}}' : $filename;
    my $reader = FASTX::Reader->new({ filename => "$input" });

    while (my $s = $reader->getRead()) {

        # --uppercase applies to the printed sequence whatever the output
        # format; it is done first so that dereplication and MD5 names are
        # computed on the sequence that is actually printed.
        $s->{seq} = uc($s->{seq}) if ($opt_upper);

        # Skip sequences shorter than --minlen or longer than --maxlen
        my $seq_length = length($s->{seq});
        next if (defined $opt_minlen and $seq_length < $opt_minlen);
        next if (defined $opt_maxlen and $seq_length > $opt_maxlen);

        # --dereplicate: only the first occurrence of a sequence is printed
        my $seq_sum;
        if ($opt_dereplicate) {
            $seq_sum = md5_base64($s->{seq});
            next if ($printed_seqs{$seq_sum}++);
        }

        # Duplicate name finder: only records that passed the filters above
        # are counted, so the suffix reflects names that were really printed.
        # The post-increment yields how many times the name was printed before.
        my $name            = $s->{name};
        my $times_printed   = $printed_seq_names{ $s->{name} }++;
        $name .= $opt_separator . $times_printed if ($times_printed);

        # --rename-md5 (effective with --dereplicate only): digest as name
        $name = $seq_sum if ($opt_dereplicate and $opt_rename_md5);

        # Print seq
        if ($opt_force_fasta) {
            fu_printfasta($name, $s->{comment}, $s->{seq});
        } elsif ($opt_force_fastq) {
            # A record without quality cannot be written as FASTQ
            if (not defined $s->{qual}) {
                die " FATAL ERROR [$input]:\n Trying to print sequence <$name> in FASTQ format, but no quality found\n";
            }
            fu_printfastq($name, $s->{comment}, $s->{seq}, $s->{qual});
        }

    }
}



# Print a short banner on STDERR: program name and version, a one-line
# description, a pointer to --help and a notice that input is expected
# from STDIN.
sub usage {
    my $rule   = ' ' . ('-' x 50);
    my @banner = (
        " $BASENAME $VERSION",
        " A program to concatenate sequence files",
        $rule,
        " Type \`$BASENAME --help\` to display the full manual",
        " Waiting for sequences from STDIN. Press Ctrl-C to exit.",
    );
    # One line per element; 'say' supplies the final newline
    say STDERR join("\n", @banner);
}

# Print the program name and version on STDOUT and the versions of the
# supporting modules on STDERR, then terminate the program.
sub version {
    say $BASENAME, " ", $VERSION;

    # A module version may be undefined (the script-level $VERSION already
    # falls back to "<Dev>" for that reason). Warnings are fatal in this
    # script, so printing an undefined value would abort instead of reporting.
    my $seqfu_version  = $Proch::Seqfu::VERSION  // '<Dev>';
    my $reader_version = $FASTX::Reader::VERSION // '<Dev>';
    say STDERR "Using Proch::Seqfu=", $seqfu_version, " and FASTX::Reader=", $reader_version;

    exit();
}
 ;

__END__

=pod

=encoding UTF-8

=head1 NAME

fu-cat - Concatenate FASTA or FASTQ files

=head1 VERSION

version 1.5.7

=head1 SYNOPSIS

  fu-cat [options] [FILE1 FILE2 FILE3...]

=head1 DESCRIPTION

This program parses a list of FASTA/FASTQ files and concatenates them,
ensuring consistent output. Duplicate sequence names are renamed.
The format of all the files is autodetected before executing, and the
output format is chosen accordingly (FASTA if at least one of the
files is FASTA, otherwise FASTQ). The format of STDIN is not
autodetected: when reading from STDIN the output defaults to FASTA,
unless C<--fastq> is specified.

If no files are provided the program will try reading from STDIN,
otherwise add a '-' to the list of files to also read from STDIN.

=head1 NAME

fu-cat - concatenate FASTA or FASTQ files

=head1 PARAMETERS

=over 4

=item I<-s>, I<--separator>

When a second sequence with a name that was already printed is found,
the program will append a progressive number, separated by this string.
Use `fu-rename` if you need more options.
[default: "."]

=item I<-f>, I<--fasta>

Force FASTA output

=item I<-q>, I<--fastq>

Force FASTQ output. Sequences without quality can B<not> be printed in
this format: the program stops with a fatal error when one is found.

=item I<-d>, I<--dereplicate>

Print each sequence only once

=item I<-5>, I<--rename-md5>

(use with -d) rename each sequence with the MD5 digest (Base64 encoded) of its sequence. Has no effect without -d.

=item I<-l>, I<--minlen> INT

Do not print sequences shorter (exclusive) than INT

=item I<-m>, I<--maxlen> INT

Do not print sequences longer (exclusive) than INT

=item I<-u>, I<--uppercase>

Will print the whole sequence in uppercase

=item I<-w>, I<--width> INT

Size of the FASTA lines. Specifying 0 will print the whole sequence on a single line (default: 0)

=item I<--verbose>

Print more details

=item I<--help>

Display this help page

=item I<--version>

Print version and exit

=back

=head1 MODERN ALTERNATIVE

This suite of tools has been superseded by B<SeqFu>, a compiled
program providing faster and safer tools for sequence analysis.
This suite is maintained for the higher portability of Perl scripts
under certain circumstances.

SeqFu is available at L<https://github.com/telatin/seqfu2>, and
can be installed with BioConda C<conda install -c bioconda seqfu>

=head1 CITING

Telatin A, Fariselli P, Birolo G.
I<SeqFu: A Suite of Utilities for the Robust and Reproducible Manipulation of Sequence Files>.
Bioengineering 2021, 8, 59. L<https://doi.org/10.3390/bioengineering8050059>

=head1 AUTHOR

Andrea Telatin <andrea@telatin.com>

=head1 COPYRIGHT AND LICENSE

This software is Copyright (c) 2018-2022 by Andrea Telatin.

This is free software, licensed under:

  The MIT (X11) License

=cut