NAME
Bio::Minimizer - minimizer package
Based on the ideas put forth by Roberts et al 2004: https://academic.oup.com/bioinformatics/article/20/18/3363/202143
SYNOPSIS
my $minimizer = Bio::Minimizer->new($sequenceString);
my $kmers = $minimizer->{kmers}; # hash of minimizer => kmer
my $minimizers= $minimizer->{minimizers};# hash of minimizer => [kmer1,kmer2,...]
# hash of minimizer => [start1,start2,...]
# Start coordinates are on the forward strand even when
# matched against the reverse strand.
my $starts = $minimizer->{starts};
# With more options
my $minimizer2= Bio::Minimizer->new($sequenceString,{k=>31,l=>21});
DESCRIPTION
Creates a set of minimizers from a sequence
EXAMPLES
Example: sort a fastq file by minimizer, potentially shrinking its gzip-compressed size.
This is implemented in this package's scripts/sort*.pl scripts.
use Bio::Minimizer;
# Read fastq file via stdin, in this example
while(my $id = <>){
# Grab an entry
($seq,$plus,$qual) = (scalar(<>), scalar(<>), scalar(<>));
chomp($id,$seq,$plus,$qual);
# minimizer object
$MINIMIZER = Bio::Minimizer->new($seq,{k=>length($seq)});
# The only minimizer in this entry because k==length(seq)
$minMinimizer = (values(%{$$MINIMIZER{minimizers}}))[0];
# combine the minimum minimizer with the entry, for
# sorting later.
# Save the entry as a string so that we don't have to
# parse it later.
my $entry = [$minMinimizer, "$id\n$seq\n$plus\n$qual\n"];
push(@entry,$entry);
}
for my $e(sort {$$a[0] cmp $$b[0]} @entry){
print $$e[1];
}