#!/usr/bin/env perl
#
# Statistics::Discrete
#
# Chiara Orsini, CAIDA, UC San Diego
# chiara@caida.org
#
# Copyright (C) 2014 The Regents of the University of California.
#
# Statistics::Discrete is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Statistics::Discrete is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Statistics::Discrete.  If not, see <http://www.gnu.org/licenses/>.
#

package Statistics::Discrete;


use 5.014002;
use strict;
use warnings;
use List::Util;
use POSIX;
use Storable 'dclone';



our $VERSION = '0.05';

use constant NO_BINNING   => 0;
use constant LIN_BINNING  => 1;
use constant LOG_BINNING  => 2;

use constant LOG_BASE => 10;
use constant DEFAULT_BINS_NUMBER => 10;


# Constructor
sub new {  
  my $class = shift;
  my $self = {};
  $self->{"data_frequency"} = {}; # anonymous hash constructor
  # binning default values
  $self->{"bin-type"} = NO_BINNING;
  $self->{"optimal-binning"} = 1;
  # $self->{"bins"} - bins' structure
  # $self->{"num-bins"} - num bins when optimal binning is off
  # on-demand stored stats
  # $self->{"stats"}{"Desc"} - descriptive statistics
  # $self->{"stats"}{"Dist"} - distributions
  # $self->{"stats"}{"Dist"}{"binned"} - binned distributions
  # bind self and class  
  bless $self, $class;
  return $self;
}

################## Methods to load/add data  ##################

sub add_data {
  my $self = shift;
  my @data_to_add = @_;
  my $data;
  foreach $data(@data_to_add) {
    if(defined($self->{"data_frequency"}{$data})) {
      $self->{"data_frequency"}{$data}++;
    }
    else {
      $self->{"data_frequency"}{$data} = 1 ;
    }
  }
  # data has changed - stats and bins need to be computed again
  delete $self->{"stats"};
  delete $self->{"bins"};
  return;
}

sub add_data_from_file {
  my $self = shift;
  my $filename = shift;
  my @cols;
  my $f; 
  my $line;
  open($f, "<", $filename) or die "Cannot open " .$filename . ": $!";
  while($line = <$f>) {
    chomp($line);
    if($line =~ /^\s*#/) {
      next;
    }
    @cols = split /\s/ , $line;
    if(scalar @cols == 1) {   # assume list of values
      if(defined($self->{"data_frequency"}{$cols[0]})) {
	$self->{"data_frequency"}{$cols[0]}++;
      }
      else {
	$self->{"data_frequency"}{$cols[0]} = 1 ;
      }
    }
    if(scalar @cols == 2) {   # assume list of id - value pairs
      if(defined($self->{"data_frequency"}{$cols[1]})) {
	$self->{"data_frequency"}{$cols[1]}++;
      }
      else {
	$self->{"data_frequency"}{$cols[1]} = 1 ;
      }
    }
    # File format not recognized
    if(scalar @cols != 1 and scalar @cols != 2){
      print "File format not recognized\n";
      close($f);
      delete $self->{"stats"};
      delete $self->{"bins"};
      return;      
    }
  }
  close($f);
  # data has changed - stats and bins need to be computed again
  delete $self->{"stats"};
  delete $self->{"bins"};
  return;
}

sub add_data_from_frequencyfile {
  my $self = shift;
  my $filename = shift;
  my @cols;
  my $f; 
  my $line;
  open($f, "<", $filename) or die "Cannot open " .$filename . ": $!";
  while($line = <$f>) {
    chomp($line);
    if($line =~ /^\s*#/) {
      next;
    }
    @cols = split /\s+/ , $line;
    #DEBUG print $line . "\t" . @cols . "\n";
    if(scalar @cols == 2) {   # assume list of value - count
      if(defined($self->{"data_frequency"}{$cols[0]})) {
	$self->{"data_frequency"}{$cols[0]} += $cols[1];
      }
      else {
	$self->{"data_frequency"}{$cols[0]} = $cols[1];
      }
      next;
    }
    # File format not recognized
    if(scalar @cols != 2){
      print "File format not recognized\n";
      close($f);
      delete $self->{"stats"};
      delete $self->{"bins"};
      return;      
    }
  }
  close($f);
  # data has changed - stats and bins need to be computed again
  delete $self->{"stats"};
  delete $self->{"bins"};
  return;
}


################## Descriptive Statistics ##################

# Minimum value
# compute the minimum value, save the statistic, and return it
sub minimum {
  my $self = shift;
  if(!defined($self->{"stats"}{"Desc"}{"min"})) {
    my $count = $self->count();
    if($count > 0) {
      $self->{"stats"}{"Desc"}{"min"} = List::Util::min(keys $self->{"data_frequency"});
    }
    else {
      $self->{"stats"}{"Desc"}{"min"} = 0;
    }
  }
  return $self->{"stats"}{"Desc"}{"min"};
}


# Maximum value
# compute the minimum value, save the statistic, and return it
sub maximum {
  my $self = shift;
  if(!defined($self->{"stats"}{"Desc"}{"max"})) {
    my $count = $self->count();
    if($count > 0) {
      $self->{"stats"}{"Desc"}{"max"} = List::Util::max(keys $self->{"data_frequency"});
    }
    else {
      $self->{"stats"}{"Desc"}{"max"} = 0;
    }
  }
  return $self->{"stats"}{"Desc"}{"max"};
}


# count the number of samples provided
# save the statistic, and return it
sub count {
  my $self = shift;
  if(!defined($self->{"stats"}{"Desc"}{"count"})) {
    my $count = 0;
    my $data;
    foreach $data(keys %{$self->{"data_frequency"}}) {
      $count += $self->{"data_frequency"}{$data};
    }
    $self->{"stats"}{"Desc"}{"count"} = $count;
  }
  return $self->{"stats"}{"Desc"}{"count"};
}


# http://en.wikipedia.org/wiki/Expected_value
#  the weighted average of the possible values, 
#  using their probabilities as their weights
sub mean {
  my $self = shift;
  if(!defined($self->{"stats"}{"Desc"}{"mean"})) {
    my $cumul_value = 0;
    my $count = $self->count();
    my $v;
    foreach $v( keys %{$self->{"data_frequency"}}) {
      $cumul_value += $v * $self->{"data_frequency"}{$v};
    }
    if($count > 0) {
      $self->{"stats"}{"Desc"}{"mean"} = $cumul_value / $count;
    }
    else {
      $self->{"stats"}{"Desc"}{"mean"} = 0;     
    }
  }
  return $self->{"stats"}{"Desc"}{"mean"};
}



# http://en.wikipedia.org/wiki/Median
# the median is the numerical value separating
# the higher half of a data sample, a population,
# or a probability distribution, from the lower half. 
sub median {
  my $self = shift;
  if(!defined($self->{"stats"}{"Desc"}{"median"})) {
    my $count = $self->count();
    my $odd = 0;
    if($count % 2 != 0) {
      $odd = 1;
    }
    my $median_index = floor($count/2) + $odd;
    my $current_index = 0;

    my ($v,$i);
    foreach $v(sort {$a<=>$b} keys %{$self->{"data_frequency"}}) {
      for($i=0; $i < $self->{"data_frequency"}{$v}; $i++) {
	$current_index++;
	if($current_index == $median_index) {
	  $self->{"stats"}{"Desc"}{"median"} = $v;
	}
	if($current_index == ($median_index+1) and
	  $odd == 0) {
	  $self->{"stats"}{"Desc"}{"median"} = ($self->{"stats"}{"Desc"}{"median"} + $v)/2;
	}

      }
    }
  }
  return $self->{"stats"}{"Desc"}{"median"};
}


# http://en.wikipedia.org/wiki/Percentile
# A percentile (or a centile) is a measure
# used in statistics indicating the value 
# below which a given percentage of observations
# in a group of observations fall.
sub percentile {
  my $self = shift;
  my $perc = shift;
  my $max_probability = $perc/100;
  my $cdf = $self->empirical_distribution_function();
  my $v = 0;
  my $previous_v = 0;
  my $epsilon = 0;
  foreach $v(sort {$a<=>$b} keys %{$cdf}) {
    $epsilon = $v - $previous_v;
    if($cdf->{$v} > $max_probability) {
      return $v;
    }
    $previous_v = $v;
  }
  # if the program is here then $perc == 1
  # then we return $v+epsilon
  return $v+$epsilon;
}



# http://en.wikipedia.org/wiki/Variance
# The variance of a random variable X is
# its second central moment, the expected 
# value of the squared deviation from the mean
sub variance {
  my $self = shift;
  if(!defined($self->{"stats"}{"Desc"}{"variance"})) {
    my $mean = $self->mean();
    my $count = $self->count();
    my $cumul_value = 0;    
    my $square_mean = 0;
    my $v;
    foreach $v(keys %{$self->{"data_frequency"}}) {
      $cumul_value += ($v**2) * $self->{"data_frequency"}{$v};
    }
    if($count > 0) {
      $square_mean = $cumul_value / $count;
    }
    $self->{"stats"}{"Desc"}{"variance"} = $square_mean - ($mean**2);
  }
  return $self->{"stats"}{"Desc"}{"variance"};
}


# http://en.wikipedia.org/wiki/Standard_deviation
# The standard deviation of a random variable
# is the square root of its variance.
sub standard_deviation {
  my $self = shift;
  if(!defined($self->{"stats"}{"Desc"}{"sdev"})) { 
    my $variance = $self->variance();
    $self->{"stats"}{"Desc"}{"sdev"} = sqrt($variance);
  }
  return $self->{"stats"}{"Desc"}{"sdev"};
}


################## Distributions Statistics ##################

# http://en.wikipedia.org/wiki/Frequency_distribution
#  the frequency distribution is an arrangement of the values
#  that one or more variables take in a sample. Each entry in
#  the table contains the frequency or count of the occurrences
#  of values within a particular group or interval
sub frequency_distribution {
  my $self = shift;
  if($self->count() == 0) {
    print "WARNING: empty dataset, returning empty frequency distribution. \n"; 
    return (); # empty hash
  }
  if($self->{"bin-type"} == NO_BINNING) { 
    # $self->{"stats"}{"Dist"}{"frequency"}
    return $self->_frequency_distribution();
  }
  else {
    # $self->{"stats"}{"Dist"}{"binned"}{"frequency"}
    return $self->_binned_frequency_distribution();
  }
}


# http://en.wikipedia.org/wiki/Probability_mass_function
#  the probability mass function (pmf) is a function that gives
#  the probability that a discrete random variable is exactly 
#  equal to some value.
sub probability_mass_function {
  my $self = shift;
  if($self->count() == 0) {
    print "WARNING: empty dataset, returning empty probability mass function. \n"; 
    return (); # empty hash
  }
  if($self->{"bin-type"} == NO_BINNING) { 
    # $self->{"stats"}{"Dist"}{"pmf"}
    return $self->_probability_mass_function();
  }
  else {
    # $self->{"stats"}{"Dist"}{"binned"}{"pmf"}
    return $self->_binned_probability_mass_function();
  }
}


# http://en.wikipedia.org/wiki/Empirical_distribution_function
#  the empirical distribution function, or empirical cdf, is the
#  cumulative distribution function associated with the empirical 
#  measure of the sample. This cdf is a step function that jumps up
#  by 1/n at each of the n data points.
sub empirical_distribution_function {
  my $self = shift;
  if($self->count() == 0) {
    print "WARNING: empty dataset, returning empty emptirical distribution function. \n"; 
    return (); # empty hash
  }
  if($self->{"bin-type"} == NO_BINNING) { 
    # $self->{"stats"}{"Dist"}{"cdf"}
    return $self->_empirical_distribution_function();
  }
  else {
    # $self->{"stats"}{"Dist"}{"binned"}{"cdf"}
    return $self->_binned_empirical_distribution_function();
  }
}


# http://en.wikipedia.org/wiki/CCDF
#  how often the random variable is above a particular level.
sub complementary_cumulative_distribution_function  {
  my $self = shift;
    if($self->count() == 0) {
    print "WARNING: empty dataset, returning empty complementary cumulative distribution function. \n"; 
    return (); # empty hash
  }
  if($self->{"bin-type"} == NO_BINNING) { 
    # $self->{"stats"}{"Dist"}{"ccdf"}
    return $self->_complementary_cumulative_distribution_function();
  }
  else {
    # $self->{"stats"}{"Dist"}{"binned"}{"ccdf"}
    return $self->_binned_complementary_cumulative_distribution_function();
  }
}



################## Regular Distributions Statistics ##################


sub _frequency_distribution {
  my $self = shift;
  if(!defined($self->{"stats"}{"Dist"}{"frequency"})) {     
    $self->{"stats"}{"Dist"}{"frequency"} = dclone $self->{"data_frequency"};
  }
  return $self->{"stats"}{"Dist"}{"frequency"};
}


sub _probability_mass_function {
  my $self = shift;
  if(!defined($self->{"stats"}{"Dist"}{"pmf"})) {  
    my $count = $self->count();
    $self->_frequency_distribution();
    my ($val,$freq);
    foreach $val(keys %{$self->{"stats"}{"Dist"}{"frequency"}}) {
      $freq =  $self->{"stats"}{"Dist"}{"frequency"}{$val};
      $self->{"stats"}{"Dist"}{"pmf"}{$val} = $freq / $count;      
    }
  } 
  return $self->{"stats"}{"Dist"}{"pmf"};
}


sub _empirical_distribution_function {
  my $self = shift;
  if(!defined($self->{"stats"}{"Dist"}{"cdf"})) {  
    my $count = $self->count();
    $self->_frequency_distribution();
    my $val;
    my $cumul_freq = 0;
    foreach $val( sort {$a<=>$b}  keys %{$self->{"stats"}{"Dist"}{"frequency"}}) {
      $cumul_freq +=  $self->{"stats"}{"Dist"}{"frequency"}{$val};
      $self->{"stats"}{"Dist"}{"cdf"}{$val} = $cumul_freq / $count;      
    }
  }   
  return $self->{"stats"}{"Dist"}{"cdf"};
}


sub _complementary_cumulative_distribution_function  {
  my $self = shift;
  # warning, this creates self->{"stats"} if it doesn't exist
  if(!defined($self->{"stats"}{"Dist"}{"ccdf"})) {  
    my $count = $self->count();
    $self->frequency_distribution();
    my $val;
    my $cumul_freq = $count;
    foreach $val( sort {$a<=>$b}  keys %{$self->{"stats"}{"Dist"}{"frequency"}}) {
      $cumul_freq -=  $self->{"stats"}{"Dist"}{"frequency"}{$val};
      $self->{"stats"}{"Dist"}{"ccdf"}{$val} = $cumul_freq / $count;      
    }
  }   
  return $self->{"stats"}{"Dist"}{"ccdf"};
}



################## Binned Distributions Statistics ##################


sub _binned_frequency_distribution {
  my $self = shift;
  if(!defined($self->{"stats"}{"Dist"}{"binned"}{"frequency"})) {     
    # 1. get bins
    $self->bins(); # set $self->{"bins"}
    # 2. get regular frequency distribution
    $self->_frequency_distribution(); # set $self->{"stats"}{"Dist"}{"frequency"}
    # 3. compute binned property
    my $binned_frequency = {};
    my $i = 0;  # vals iterator
    my @sorted_vals = (sort {$a<=>$b} keys %{$self->{"stats"}{"Dist"}{"frequency"}});
    my $num_vals = scalar @sorted_vals;
    my $bi = 0; # bins iterator
    my @sorted_bins = (sort {$a<=>$b} keys %{$self->{"bins"}});
    my $num_bins = scalar @sorted_bins;  
    my $freq_sum = 0;
    for(; $bi < $num_bins; $bi++) { # for each bin
      for(; $i < $num_vals; $i++) {
	if($sorted_vals[$i] < $self->{"bins"}->{$sorted_bins[$bi]}{"right"}) {
	  $freq_sum += $self->{"stats"}{"Dist"}{"frequency"}->{$sorted_vals[$i]}
	}
	else {
	  $binned_frequency->{$sorted_bins[$bi]} = $freq_sum;
	  $freq_sum = 0;
	  last;
	}
      }

      # if last element of vals then we save the current freq sum
      # in the current bin
      if($i == $num_vals) {
	$binned_frequency->{$sorted_bins[$bi]} = $freq_sum;
      }
    }
    $self->{"stats"}{"Dist"}{"binned"}{"frequency"} = $binned_frequency;
  }
  return $self->{"stats"}{"Dist"}{"binned"}{"frequency"};
}


sub _binned_probability_mass_function {
  my $self = shift;
  if(!defined($self->{"stats"}{"Dist"}{"binned"}{"pmf"})) {
    my $count = $self->count();
    my $bins = $self->bins(); # set $self->{"bins"}
    my $bfd = $self->_binned_frequency_distribution();
    my ($val,$freq, $interval);
    foreach $val(keys %{$bfd}) {      
      $freq =  $self->{"stats"}{"Dist"}{"binned"}{"frequency"}{$val};
      $interval = $self->{"bins"}{$val}{"right"} - $self->{"bins"}{$val}{"left"};
      if($interval > 0) { 
	$self->{"stats"}{"Dist"}{"binned"}{"pmf"}{$val} = ($freq / $count) / $interval;      
      }
      else {
	$self->{"stats"}{"Dist"}{"binned"}{"pmf"}{$val} = $freq / $count;
      }
    }
  }
  return $self->{"stats"}{"Dist"}{"binned"}{"pmf"};
}


sub _binned_empirical_distribution_function {
  my $self = shift;
  if(!defined($self->{"stats"}{"Dist"}{"binned"}{"cdf"})) {
    my $bpmf = $self->_binned_probability_mass_function();
    my $val;
    my $cumul_prob = 0;
    foreach $val(sort {$a<=>$b} keys %{$bpmf}) {      
      $cumul_prob +=  $bpmf->{$val};
      $self->{"stats"}{"Dist"}{"binned"}{"cdf"}{$val} = $cumul_prob;
    }
  }
  return $self->{"stats"}{"Dist"}{"binned"}{"cdf"};
}


sub _binned_complementary_cumulative_distribution_function {
  my $self = shift;
  if(!defined($self->{"stats"}{"Dist"}{"binned"}{"ccdf"})) {
    my $bpmf = $self->_binned_probability_mass_function();
    my $val;
    my $cumul_prob = 0;
    my $old_cumul_prob = 0;
    foreach $val(sort {$b<=>$a} keys %{$bpmf}) {         # descending sort   
      $old_cumul_prob = $cumul_prob;
      $cumul_prob += $bpmf->{$val};
      $self->{"stats"}{"Dist"}{"binned"}{"ccdf"}{$val} = $old_cumul_prob;
    }
  }
  return $self->{"stats"}{"Dist"}{"binned"}{"ccdf"};
}


################## Binned Related Functions ##################


sub set_binning_type {
  my $self = shift;
  my $bin_type = shift;
  # check bin-type validity
  if($bin_type != NO_BINNING && $bin_type != LIN_BINNING &&
     $bin_type != LOG_BINNING) {
    die "Wrong binning type provided\n";    
  }
  # if bin type is already set
  if($bin_type == $self->{"bin-type"}) {
    return;
  }
  # if bin-type is different
  $self->{"bin-type"} = $bin_type;
  delete $self->{"num-bins"};
  delete $self->{"bins"};
  delete $self->{"stats"}{"Dist"}{"binned"};
}


sub set_optimal_binning {
  my $self = shift;
  if( $self->{"optimal-binning"} == 0) {
    delete $self->{"bins"};
    delete $self->{"num-bins"};
    delete $self->{"stats"}{"Dist"}{"binned"};
  }
  $self->{"optimal-binning"} = 1;
}


sub set_custom_num_bins {
 my $self = shift;
 my $nb = shift;
 if($self->{"bin-type"} == NO_BINNING) {
   die "Set a binning type before setting the num of bins\n";
 }
 if( $nb < 2 ) {
   die "Wrong number of bins provided: " . $nb . "\n";
 } 
 # check if there is a change
 if( $self->{"optimal-binning"} == 0 and
     defined ($self->{"num-bins"}) and
     $self->{"num-bins"} == $nb) {
   # nothing changed -> nothing to do
 }
 else {
   # update binning parameters
   $self->{"optimal-binning"} = 0;
   $self->{"num-bins"} = $nb;
   delete $self->{"stats"}{"Dist"}{"binned"};
   delete $self->{"bins"};
 }
}


sub bins {
  my $self = shift;
  if($self->count() == 0) {
    print "WARNING: empty dataset, returning empty bins. \n"; 
    return {}; # empty array
  }

  if(!defined($self->{"bins"})) {    
    # CASE 1: if no_binning is set we do not
    # save the binning structure
    if( $self->{"bin-type"} == NO_BINNING) {
      my $curbins = {};
      my $v;
      foreach $v(keys %{$self->{"data_frequency"}}) {
	my $ref = $v;
	$curbins->{$v}{"left"} = $v;
	$curbins->{$v}{"right"} = $v;
      }
      return $curbins;
    }
    
    # CASE 2: if a custom number of bins is
    # provided, we lin-/log- bin the support 
    # accordingly
    if($self->{"optimal-binning"} == 0) {
      if($self->{"bin-type"} == LIN_BINNING) {
	$self->{"bins"} = $self->compute_lin_bins($self->{"num-bins"});
      }
      if($self->{"bin-type"} == LOG_BINNING) {
	$self->{"bins"} = $self->compute_log_bins($self->{"num-bins"});
      }
    }

    # CASE 3: optimal binning
    else {
      my $res = $self->_optimal_binning();
      if($res != 0) {
	print "Optimal binning was not possible on this sample, ";
	print "using the NO_BINNING mode" . "\n";
	my $curbins = {};
	my $v;
	foreach $v(keys %{$self->{"data_frequency"}}) {
	  my $ref = $v;
	  $curbins->{$v}{"left"} = $v;
	  $curbins->{$v}{"right"} = $v;
	}
	$self->{"bin-type"} = NO_BINNING;
	$self->{"bins"} = $curbins;
      }
    }
  }
  return $self->{"bins"};
}


################## Optimal Binning Related Functions ##################

# Compute the optimal binning
sub _optimal_binning { 
  my $self = shift;
  # LEGEND:
  # b_* => bins
  # f_* => binned frequency distribution
  # d_* => binned density distribution
  # s_* => sum of density distribution values  
  #
  # start from extreme values ( min_num_bins and max_num_bins)
  # and use the bisection method on the interval
  # till the optimal is found (i.e. the binning such that
  # the sum of the density function values is the closest
  # to 1)
  my $min_num_bins = 2;
  my ($b_min, $f_min, $d_min, $s_min) = $self->_bin_attempt($min_num_bins);
  my $max_num_bins = scalar $self->_full_support();
  if($max_num_bins == 1) {
    return -1;
  }
  my ($b_max, $f_max, $d_max, $s_max) = $self->_bin_attempt($max_num_bins);
  # initial check
  if( $s_min > 1 or $s_max < 1) { 
    return -1;
  }
  my $avg_num_bins;
  my ($b_avg, $f_avg, $d_avg, $s_avg);
  while( ($max_num_bins - $min_num_bins) > 1) {
    $avg_num_bins = sprintf("%.f", (($max_num_bins + $min_num_bins)/2));
    ($b_avg, $f_avg, $d_avg, $s_avg) = $self->_bin_attempt($avg_num_bins);
    if($s_avg < 1) {
      $min_num_bins = $avg_num_bins;
      ($b_min, $f_min, $d_min, $s_min) = ($b_avg, $f_avg, $d_avg, $s_avg);
    }
    else {
      $max_num_bins = $avg_num_bins;
      ($b_max, $f_max, $d_max, $s_max) = ($b_avg, $f_avg, $d_avg, $s_avg);
    }
  }
  # the binning whose density sum is closer to 1 win
  if( (1 - $s_min) < ($s_max-1) ) {
    $self->{"bins"} = $b_min;
    $self->{"Dist"}{"binned"}{"frequency"} = $f_min;
    $self->{"Dist"}{"binned"}{"pmf"} = $d_min;
  }
  else {
    $self->{"bins"} = $b_max;
    $self->{"Dist"}{"binned"}{"frequency"} = $f_max;
    $self->{"Dist"}{"binned"}{"pmf"} = $d_max;
  }
  return 0;
}


# bin the current data into $num_bins intervals
# and return: the bins, the binned frequency distribution
# the binned density distribution and the sum of the 
# density values
sub _bin_attempt {
  my $self = shift;
  my $num_bins = shift;
  # output values
  my ($cur_bins, $cur_freq, $cur_density, $cur_sum);
  $cur_sum = 0;
  # get the bins
  if( $self->{"bin-type"} == LIN_BINNING ) {
    $cur_bins = $self->compute_lin_bins($num_bins);
  }
  if( $self->{"bin-type"} == LOG_BINNING ) {
    $cur_bins  =$self->compute_log_bins($num_bins);
  }
  # assert non binned frequency distribution is computed
  $self->_frequency_distribution(); # set $self->{"stats"}{"Dist"}{"frequency"}
  my $count = $self->count();
  #
  my $s = 0; # support iterator
  my @sorted_support = (sort {$a<=>$b} $self->_full_support());
  my $sup_size = scalar @sorted_support;
  my $bi = 0; # bins iterator
  my @sorted_bins = (sort {$a<=>$b} keys %{$cur_bins});
  my $n_bins = scalar @sorted_bins;  # assert(n_bins == num_bins)
  my $ref = 0;
  my $freq_sum = 0;
  my $interval = 0;
  for(; $bi < $n_bins; $bi++) { # for each bin
    $ref = $sorted_bins[$bi];
    $interval = $cur_bins->{$ref}{"right"} - $cur_bins->{$ref}{"left"};
    for(; $s < $sup_size; $s++) { # for each support value
      # if the current support value is less than the rightmost value in the bin
      if($sorted_support[$s] < $cur_bins->{$ref}{"right"}) {
	$freq_sum += $self->{"stats"}{"Dist"}{"frequency"}->{$sorted_support[$s]};
      }
      # else we have to save the information for the current bin
      else {
	$cur_freq->{$ref} = $freq_sum;
	if($interval > 0) {
	  $cur_density->{$ref} = ($freq_sum / $count) / $interval;
	}
	else {
	  $cur_density->{$ref} =  $freq_sum / $count;
	}
	$cur_sum += $cur_density->{$ref};
	$freq_sum = 0;
	last; # we do not increment s
      }
    }    
    # if last element of the support has been reached 
    # we have to save the current freq sum in the current bin
    if($s == $sup_size) {
      $cur_freq->{$ref} = $freq_sum;
      if($interval > 0) {
	$cur_density->{$ref} = ($freq_sum / $count) / $interval;
      }
      else {
	$cur_density->{$ref} =  $freq_sum / $count;
      }
      $cur_sum += $cur_density->{$ref};	
    }
  }
  #DEBUG print "attempt: " . $num_bins . " sum: " . $cur_sum . "\n";
  return ($cur_bins, $cur_freq, $cur_density, $cur_sum);
}



sub _full_support {
  my $self = shift;
  my $support = {};
  my $v;
  foreach $v( keys %{$self->{"data_frequency"}}) {
    $support->{$v} = 1;
  }
  return (keys %{$support});
}




################## Utilities ##################

# return a structure describing a linear binning
# of the current data ($num_bins provided)
sub compute_lin_bins {
  my $self = shift;
  my $num_bins = shift;
  my $min = $self->minimum();
  my $max = $self->maximum();
  my $linbins = {}; # reference to empty hash
  # last bin includes just the maximum value
  my $delta = abs($max - $min)/($num_bins-1);
  my $i;
  my $ref;
  for($i=0; $i < $num_bins; $i++) {
    $ref = $min + $i * $delta + $delta/2;
    $linbins->{$ref}{"left"} =  $min + $i * $delta;
    $linbins->{$ref}{"right"} = $min + $i * $delta + $delta;
  }
  return $linbins;
}


# return a structure describing a logarithmic binning
# of the current data ($num_bins provided)
# bins are not set
sub compute_log_bins {
  my $self = shift;
  my $num_bins = shift;
  my $min = $self->minimum();
  my $max = $self->maximum();
  if($min == 0 or $max == 0 or $min *$max <0) {
    # if the interval contains zero, then we cannot logbin
    # we return the linear binning
    print "Logarithmic binning was not possible on this sample, ";
    print "switching to linear binning mode" . "\n";
    $self->{"bin-type"} = LIN_BINNING;
    return $self->compute_lin_bins($num_bins);    
  }
  my $logbins = {}; # reference to empty hash
  my $min_exp = log($min)/log(LOG_BASE);
  my $max_exp = log($max+1)/log(LOG_BASE);
  my $delta_exp = abs($max_exp - $min_exp)/$num_bins;
  my $cur_exp;
  my $i;
  my $ref;
  for($i=0; $i < $num_bins; $i++) {
    $cur_exp = $min_exp + $i * $delta_exp;
    $ref = LOG_BASE ** ($cur_exp + $delta_exp/2);
    $logbins->{$ref}{"left"} = LOG_BASE ** $cur_exp;
    $logbins->{$ref}{"right"} = LOG_BASE ** ($cur_exp + $delta_exp);    
  }
  return $logbins;
}


1;