#!/usr/bin/env perl
# ABSTRACT: Concatenate FASTA or FASTQ files
# PODNAME: fu-cat

use 5.012;
use warnings FATAL => 'all';
use Getopt::Long;
use Pod::Usage;
use File::Basename;
use FindBin qw($RealBin);
use Digest::MD5 qw(md5_base64);
use FASTX::Reader;

# The following placeholder is to be programmatically replaced with 'use lib "$RealBin/../lib"' if needed
#~loclib~

# When running from a source checkout, prefer the libraries shipped with it.
# 'use lib' is executed at compile time and therefore ignores a surrounding
# runtime 'if'; the check is done in a BEGIN block and lib is imported by hand
# so that the path is only added when the checkout layout is really there.
BEGIN {
    if ( -e "$RealBin/../lib/Proch/N50.pm" and -e "$RealBin/../Changes" ) {
        require lib;
        lib->import("$RealBin/../lib");
    }
}
use Proch::Seqfu;

my $VERSION  = $Proch::Seqfu::VERSION // "<Dev>";
my $BASENAME = basename($0);

# Command line options
my ($opt_force_fasta, $opt_force_fastq, $opt_upper, $opt_help, $opt_verbose, $opt_version);
my ($opt_minlen, $opt_maxlen, $opt_dereplicate, $opt_rename_md5);
my $opt_separator = '.';
my $opt_rename_string;    # accepted on the command line, currently not used below
my $opt_line_size = $Proch::Seqfu::fu_linesize;

if (!GetOptions(
    'f|fasta'       => \$opt_force_fasta,
    'q|fastq'       => \$opt_force_fastq,
    'd|dereplicate' => \$opt_dereplicate,
    '5|rename-md5'  => \$opt_rename_md5,
    'r|rename=s'    => \$opt_rename_string,
    's|separator=s' => \$opt_separator,
    'w|width=i'     => \$opt_line_size,
    'u|uppercase'   => \$opt_upper,
    'l|minlen=i'    => \$opt_minlen,
    'm|maxlen=i'    => \$opt_maxlen,
    'version'       => \$opt_version,
    'verbose'       => \$opt_verbose,
    'help'          => \$opt_help,
)) {
    say STDERR "Wrong parameters: type $BASENAME --help for full documentation.";
    exit 1;
}

# Propagate the settings to the library globals
$Proch::Seqfu::fu_linesize = $opt_line_size;
$Proch::Seqfu::fu_verbose  = $opt_verbose;

# Check consistency of parameters
if ($opt_force_fasta and $opt_force_fastq) {
    die " PARAMETER ERROR:\n You must specify either --fasta or --fastq (or none), not both.\n";
}

# Print version
if ($opt_version) {
    version();
}

# Print man (help)
pod2usage( {-exitval => 0, -verbose => 2} ) if $opt_help;

# Read from STDIN
if (not defined $ARGV[0]) {
    # Read from STDIN but also print a help message
    usage();
    push(@ARGV, '-');
}

# Autodetect output format if not specified
if (not $opt_force_fasta and not $opt_force_fastq) {
    # Prescan all the files: a single FASTA input (or STDIN) makes the output FASTA
    for my $filename (@ARGV) {
        if ($filename eq '-' or $filename eq '{{STDIN}}') {
            $opt_force_fasta = 1;
            verbose("When requiring to parse STDIN, output format is defaulted to FASTA (unless --fastq is on)");
        }

        # Nothing left to detect once a format has been settled
        next if (defined $opt_force_fastq or defined $opt_force_fasta);

        verbose("Autodetecting ");
        my $fx_reader = FASTX::Reader->new({ filename => "$filename" });
        my $format    = $fx_reader->getFileFormat("$filename");
        $opt_force_fasta = 1 if ($format eq 'fasta');
    }

    if (not $opt_force_fasta) {
        verbose("Output format: FASTQ");
        $opt_force_fastq = 1;
    }
    else {
        verbose("Output format: FASTA");
    }
}

my %printed_seq_names = ();    # original name => how many records with it were printed
my %printed_seqs      = ();    # MD5 of the sequence => times seen (for --dereplicate)

# Process all the files
for my $filename (@ARGV) {
    say STDERR " - $filename" if ($Proch::Seqfu::fu_verbose);
    $filename = '{{STDIN}}' if ($filename eq '-');
    my $reader = FASTX::Reader->new({ filename => "$filename" });

    while (my $s = $reader->getRead()) {
        # Skip short and long sequences
        my $seq_length = length($s->{seq});
        next if (defined $opt_minlen and $seq_length < $opt_minlen);
        next if (defined $opt_maxlen and $seq_length > $opt_maxlen);

        # Skip sequences already seen (compared on the sequence as read from the file)
        my $seq_sum;
        if ($opt_dereplicate) {
            $seq_sum = md5_base64($s->{seq});
            next if ($printed_seqs{$seq_sum}++);
        }

        # Duplicate name finder. It runs after the filters so that only records
        # that are actually printed consume a progressive suffix.
        my $name = $s->{name};
        if ($printed_seq_names{ $s->{name} }) {
            $name .= $opt_separator . $printed_seq_names{ $s->{name} };
        }
        $printed_seq_names{ $s->{name} }++;

        # --rename-md5 only has an effect together with --dereplicate
        $name = $seq_sum if ($opt_dereplicate and $opt_rename_md5);

        # --uppercase applies to both output formats
        $s->{seq} = uc($s->{seq}) if ($opt_upper);

        # Print seq (the fu_print* helpers are presumably exported by Proch::Seqfu)
        if ($opt_force_fasta) {
            fu_printfasta($name, $s->{comment}, $s->{seq});
        }
        elsif ($opt_force_fastq) {
            if (not defined $s->{qual}) {
                die " FATAL ERROR [$filename]:\n Trying to print sequence <$name> in FASTQ format, but no quality found\n";
            }
            fu_printfastq($name, $s->{comment}, $s->{seq}, $s->{qual});
        }
    }
}

# Print a short banner to STDERR; used when the program falls back to reading STDIN.
sub usage {
    my $horizontal_bar = " " . '-' x 50;
    say STDERR " $BASENAME $VERSION";
    say STDERR " A program to concatenate sequence files";
    say STDERR $horizontal_bar;
    say STDERR " Type \`$BASENAME --help\` to display the full manual";
    say STDERR " Waiting for sequences from STDIN. Press Ctrl-C to exit.";
    return;
}

# Print program name and version to STDOUT, library versions to STDERR, then exit.
sub version {
    say $BASENAME, " ", $VERSION;
    # Versions can be undefined in a development checkout: with fatal warnings
    # printing undef would abort the program, hence the fallbacks.
    say STDERR "Using Proch::Seqfu=", $Proch::Seqfu::VERSION // "<Dev>",
        " and FASTX::Reader=", $FASTX::Reader::VERSION // "<Dev>";
    exit();
}

__END__

=pod

=encoding UTF-8

=head1 NAME

fu-cat - Concatenate FASTA or FASTQ files

=head1 VERSION

version 1.5.7

=head1 SYNOPSIS

  fu-cat [options] [FILE1 FILE2 FILE3...]

=head1 DESCRIPTION

This program parses a list of FASTA/FASTQ and will concatenate them
ensuring consistent output. Will rename duplicate sequence names.
Will try to autodetect the format of all files before executing and
decide accordingly the output format (FASTA if at least one of the files
is FASTA, otherwise FASTQ).
When reading from STDIN the output format defaults to FASTA, unless
I<--fastq> is specified.

If no files are provided the program will try reading from STDIN,
otherwise add a '-' to the list of files to also read from STDIN.

=head1 NAME

fu-cat - concatenate FASTA or FASTQ files

=head1 PARAMETERS

=over 4

=item I<-s>, I<--separator>

When a second sequence with a name that was already printed is found,
the program will append a progressive number, separated by this string.
Use `fu-rename` if you need more options. [default: "."]

=item I<-f>, I<--fasta>

Force FASTA output

=item I<-q>, I<--fastq>

Force FASTQ output. Sequences without quality can B<not> be printed in
this format: the program stops with an error when it finds one.

=item I<-d>, I<--dereplicate>

Print each sequence only once

=item I<-5>, I<--rename-md5>

(use with -d) rename each sequence name with the MD5sum of the sequence.


=item I<-l>, I<--minlen> INT

Do not print sequences shorter (exclusive) than INT

=item I<-m>, I<--maxlen> INT

Do not print sequences longer (exclusive) than INT

=item I<-u>, I<--uppercase>

Will print the whole sequence in uppercase

=item I<-w>, I<--width> INT

Size of the FASTA lines. Specifying 0 will print the whole sequence
in the same line (default: 0)

=item I<--verbose>

Print more details

=item I<--help>

Display this help page

=item I<--version>

Print version and exit

=back

=head1 MODERN ALTERNATIVE

This suite of tools has been superseded by B<SeqFu>, a compiled
program providing faster and safer tools for sequence analysis.
This suite is maintained for the higher portability of Perl scripts
under certain circumstances.

SeqFu is available at L<https://github.com/telatin/seqfu2>, and
can be installed with BioConda: C<conda install -c bioconda seqfu>

=head1 CITING

Telatin A, Fariselli P, Birolo G.
I<SeqFu: A Suite of Utilities for the Robust and Reproducible Manipulation of Sequence Files>.
Bioengineering 2021, 8, 59. L<https://doi.org/10.3390/bioengineering8050059>

=head1 AUTHOR

Andrea Telatin <andrea@telatin.com>

=head1 COPYRIGHT AND LICENSE

This software is Copyright (c) 2018-2022 by Andrea Telatin.

This is free software, licensed under:

  The MIT (X11) License

=cut