NAME

PDL::Stats::Distr -- parameter estimations and probability density functions for distributions.

DESCRIPTION

Parameter estimate is maximum likelihood estimate when there is closed form estimate, otherwise it is method of moments estimate.

SYNOPSIS

use PDL::LiteF;
use PDL::Stats::Distr;

# do a frequency (probability) plot with fitted normal curve
my $data = grandom(100)->abs;

my ($xvals, $hist) = $data->hist;

  # turn frequency into probability
$hist /= $data->nelem;

  # get maximum likelihood estimates of normal curve parameters
my ($m, $v) = $data->mle_gaussian();

  # fitted normal curve probabilities
my $p = $xvals->pdf_gaussian($m, $v);

use PDL::Graphics::PGPLOT::Window;
my $win = pgwin( Dev=>"/xs" );

$win->bin( $hist );
$win->hold;
$win->line( $p, {COLOR=>2} );
$win->close;

Or, play with different distributions with plot_distr :)

$data->plot_distr( 'gaussian', 'lognormal' );
my ($a, $b) = $data->mme_beta();

beta distribution. pdf: f(x; a,b) = 1/B(a,b) x^(a-1) (1-x)^(b-1)

probability density function for beta distribution. x defined on [0,1].

my ($n, $p) = $data->mme_binomial;

binomial distribution. pmf: f(k; n,p) = (n k) p^k (1-p)^(n-k) for k = 0,1,2..n ', );

pp_def('pmf_binomial', Pars => 'ushort x(); ushort n(); p(); float+ [o]out()', GenericTypes => $F, HandleBad => 1, Code => ' PDL_IF_BAD(if ($ISBAD($x()) || $ISBAD($n()) || $ISBAD($p())) { $SETBAD( $out() ); continue; },) $GENERIC(out) bc = gsl_sf_choose($n(), $x()); $out() = bc * pow($p(), $x()) * pow(1-$p(), $n() - $x()); ', Doc => ' =for ref

probability mass function for binomial distribution. ',

);

pp_def('mle_exp', Pars => 'a(n); float+ [o]l()', GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(l) sa = 0; PDL_Indx N = PDL_IF_BAD(0,$SIZE(n)); loop (n) %{ PDL_IF_BAD(if ($ISBAD($a())) continue;,) sa += $a(); PDL_IF_BAD(N++;,) %} PDL_IF_BAD(if (sa <= 0) { $SETBAD(l()); continue; },) $l() = N / sa; ', Doc => ' =for usage

my $lamda = $data->mle_exp;

exponential distribution. mle same as method of moments estimate. ', );

pp_def('pdf_exp', Pars => 'x(); l(); float+ [o]p()', GenericTypes => $F, HandleBad => 1, Code => ' PDL_IF_BAD(if ( $ISBAD($x()) || $ISBAD($l()) ) { $SETBAD( $p() ); continue; },) $p() = $l() * exp( -1 * $l() * $x() ); ', Doc => ' =for ref

probability density function for exponential distribution. ', );

pp_def('mme_gamma', Pars => 'a(n); float+ [o]shape(); float+ [o]scale()', GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(shape) sa = 0, a2 = 0; PDL_Indx N = PDL_IF_BAD(0,$SIZE(n)); loop (n) %{ PDL_IF_BAD(if ($ISBAD($a())) continue;,) sa += $a(); a2 += pow($a(), 2); PDL_IF_BAD(N++;,) %} PDL_IF_BAD(if (N < 1) { $SETBAD(shape()); $SETBAD(scale()); continue; },) $GENERIC(shape) m = sa / N; $GENERIC(shape) v = a2 / N - pow(m, 2); $shape() = pow(m, 2) / v; $scale() = v / m; ', Doc => ' =for usage

my ($shape, $scale) = $data->mme_gamma();

two-parameter gamma distribution ', );

pp_def('pdf_gamma', Pars => 'x(); a(); t(); float+ [o]p()', GenericTypes => $F, HandleBad => 1, Code => ' PDL_IF_BAD(if ($ISBAD($x()) || $ISBAD($a()) || $ISBAD($t())) { $SETBAD( $p() ); continue; },) double g = gsl_sf_gamma( $a() ); $p() = pow($x(), $a()-1) * exp(-1*$x() / $t()) / (pow($t(), $a()) * g); ', Doc => ' =for ref

probability density function for two-parameter gamma distribution. ', );

pp_def('mle_gaussian', Pars => 'a(n); float+ [o]m(); float+ [o]v()', GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(m) sa = 0, a2 = 0; PDL_Indx N = PDL_IF_BAD(0,$SIZE(n)); loop (n) %{ PDL_IF_BAD(if ($ISBAD($a())) continue;,) sa += $a(); a2 += pow($a(), 2); PDL_IF_BAD(N++;,) %} if (N < 1) { $SETBAD(m()); $SETBAD(v()); continue; } $m() = sa / N; $v() = a2 / N - pow($m(),2); ', Doc => ' =for usage

my ($m, $v) = $data->mle_gaussian();

gaussian aka normal distribution. same results as $data->average and $data->var. mle same as method of moments estimate. ', );

pp_def('pdf_gaussian', Pars => 'x(); m(); v(); float+ [o]p()', GenericTypes => $F, HandleBad => 1, Code => ' PDL_IF_BAD(if ($ISBAD($x()) || $ISBAD($m()) || $ISBAD($v())) { $SETBAD( $p() ); continue; },) $p() = 1 / sqrt($v() * 2 * M_PI) * exp( -1 * pow($x() - $m(), 2) / (2*$v()) ); ', Doc => ' =for ref

probability density function for gaussian distribution. ', );

pp_def('mle_geo', Pars => 'a(n); float+ [o]p();', GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(p) sa = 0; PDL_Indx N = PDL_IF_BAD(0,$SIZE(n)); loop (n) %{ PDL_IF_BAD(if ($ISBAD($a())) continue;,) sa += $a(); PDL_IF_BAD(N++;,) %} if (N < 1) { $SETBAD(p()); continue; } $p() = 1 / (1 + sa/N); ', Doc => ' =for ref

geometric distribution. mle same as method of moments estimate. ', );

pp_def('pmf_geo', Pars => 'ushort x(); p(); float+ [o]out()', GenericTypes => $F, HandleBad => 1, Code => ' PDL_IF_BAD(if ($ISBAD($x()) || $ISBAD($p())) { $SETBAD( $out() ); continue; },) $out() = pow(1-$p(), $x()) * $p(); ', Doc => ' =for ref

probability mass function for geometric distribution. x >= 0. ', );

pp_def('mle_geosh', Pars => 'a(n); float+ [o]p();', GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(p) sa = 0; PDL_Indx N = PDL_IF_BAD(0,$SIZE(n)); loop (n) %{ PDL_IF_BAD(if ($ISBAD($a())) continue;,) sa += $a(); PDL_IF_BAD(N++;,) %} if (sa <= 0) { $SETBAD(p()); continue; } $p() = N / sa; ', Doc => ' =for ref

shifted geometric distribution. mle same as method of moments estimate. ', );

pp_def('pmf_geosh', Pars => 'ushort x(); p(); float+ [o]out()', GenericTypes => $F, HandleBad => 1, Code => ' PDL_IF_BAD(if ($ISBAD($x()) || $ISBAD($p())) { $SETBAD( $out() ); continue; },) if ( $x() < 1 ) { $CROAK( "x >= 1 please" ); } $out() = pow(1-$p(), $x()-1) * $p(); ', Doc => ' =for ref

probability mass function for shifted geometric distribution. x >= 1. ', );

pp_def('mle_lognormal', Pars => 'a(n); float+ [o]m(); float+ [o]v()', GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(m) sa = 0, a2 = 0; PDL_Indx N = PDL_IF_BAD(0,$SIZE(n)); loop (n) %{ PDL_IF_BAD(if ($ISBAD($a())) continue;,) sa += log($a()); PDL_IF_BAD(N++;,) %} if (N < 1) { $SETBAD(m()); $SETBAD(v()); continue; } $m() = sa / N; loop (n) %{ PDL_IF_BAD(if ($ISBAD($a())) continue;,) a2 += pow(log($a()) - $m(), 2); %} $v() = a2 / N; ', Doc => ' =for usage

my ($m, $v) = $data->mle_lognormal();

lognormal distribution. maximum likelihood estimation. ', );

pp_def('mme_lognormal', Pars => 'a(n); float+ [o]m(); float+ [o]v()', GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(m) sa = 0, a2 = 0; PDL_Indx N = PDL_IF_BAD(0,$SIZE(n)); loop (n) %{ PDL_IF_BAD(if ($ISBAD($a())) continue;,) sa += $a(); a2 += pow($a(), 2); PDL_IF_BAD(N++;,) %} if (N < 1) { $SETBAD(m()); $SETBAD(v()); continue; } $m() = 2 * log(sa / N) - 1/2 * log( a2 / N ); $v() = log( a2 / N ) - 2 * log( sa / N ); ', Doc => ' =for usage

my ($m, $v) = $data->mme_lognormal();

lognormal distribution. method of moments estimation. ', );

pp_def('pdf_lognormal', Pars => 'x(); m(); v(); float+ [o]p()', GenericTypes => $F, HandleBad => 1, Code => ' PDL_IF_BAD(if ( $ISBAD($x()) || $ISBAD($m()) || $ISBAD($v()) ) { $SETBAD( $p() ); continue; },) if (!($x() > 0 && $v() > 0)) $CROAK( "x and v > 0 please" ); $p() = 1 / ($x() * sqrt($v() * 2 * M_PI)) * exp( -1 * pow(log($x()) - $m(), 2) / (2*$v()) ); ', Doc => ' =for ref

probability density function for lognormal distribution. x > 0. v > 0. ', );

pp_def('mme_nbd', Pars => 'a(n); float+ [o]r(); float+ [o]p()', GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(p) sa = 0, a2 = 0; PDL_Indx N = PDL_IF_BAD(0,$SIZE(n)); loop (n) %{ PDL_IF_BAD(if ($ISBAD($a())) continue;,) sa += $a(); a2 += pow($a(), 2); PDL_IF_BAD(N++;,) %} PDL_IF_BAD(if (N < 1) { $SETBAD(r()); $SETBAD(p()); continue; },) $GENERIC(p) m = sa / N; $GENERIC(p) v = a2 / N - pow(m, 2); $r() = pow(m, 2) / (v - m); $p() = m / v; ', Doc => ' =for usage

my ($r, $p) = $data->mme_nbd();

negative binomial distribution. pmf: f(x; r,p) = (x+r-1 r-1) p^r (1-p)^x for x=0,1,2... ', );

pp_def('pmf_nbd', Pars => 'ushort x(); r(); p(); float+ [o]out()', GenericTypes => $F, HandleBad => 1, Code => ' PDL_IF_BAD(if ( $ISBAD($x()) || $ISBAD($r()) || $ISBAD($p()) ) { $SETBAD( $out() ); continue; },) $GENERIC(out) nbc = gsl_sf_gamma($x()+$r()) / (gsl_sf_fact($x()) * gsl_sf_gamma($r())); $out() = nbc * pow($p(),$r()) * pow(1-$p(), $x()); ', Doc => ' =for ref

probability mass function for negative binomial distribution. ', );

pp_def('mme_pareto', Pars => 'a(n); float+ [o]k(); float+ [o]xm()', GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(xm) sa = 0, min = $a(n=>0); PDL_Indx N = PDL_IF_BAD(0,$SIZE(n)); loop (n) %{ PDL_IF_BAD(if ($ISBAD($a())) continue;,) sa += $a(); if (min > $a()) min = $a(); PDL_IF_BAD(N++;,) %} if (min <= 0) $CROAK("min <= 0!"); $k() = (sa - min) / ( N*( sa/N - min ) ); $xm() = (N * $k() - 1) * min / ( N * $k() ); ', Doc => ' =for usage

my ($k, $xm) = $data->mme_pareto();

pareto distribution. pdf: f(x; k,xm) = k xm^k / x^(k+1) for x >= xm > 0. ', );

pp_def('pdf_pareto', Pars => 'x(); k(); xm(); float+ [o]p()', GenericTypes => $F, HandleBad => 1, Code => ' PDL_IF_BAD(if ( $ISBAD($x()) || $ISBAD($k()) || $ISBAD($xm()) ) { $SETBAD( $p() ); continue; },) if (!($xm() > 0 && $x() >= $xm() )) $CROAK("x >= xm > 0 please"); $p() = $k() * pow($xm(),$k()) / pow($x(), $k()+1); ', Doc => ' =for ref

probability density function for pareto distribution. x >= xm > 0. ', );

pp_def('mle_poisson', Pars => 'a(n); float+ [o]l();', GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(l) sa = 0; PDL_Indx N = PDL_IF_BAD(0,$SIZE(n)); loop (n) %{ PDL_IF_BAD(if ($ISBAD($a())) continue;,) sa += $a(); PDL_IF_BAD(N++;,) %} if (N < 1) { $SETBAD(l()); continue; } $l() = sa / N; ', Doc => ' =for usage

my $lamda = $data->mle_poisson();

poisson distribution. pmf: f(x;l) = e^(-l) * l^x / x! ', );

pp_def('pmf_poisson', Pars => 'x(); l(); float+ [o]p()', GenericTypes => $F, HandleBad => 1, Code => q{ PDL_IF_BAD(if ($ISBAD($x()) || $ISBAD($l())) { $SETBAD($p()); continue; },) if ($x() < 0) { $p() = 0; } else if ($x() < GSL_SF_FACT_NMAX / 2) { /* Exact formula */ $p() = exp( -1 * $l()) * pow($l(),$x()) / gsl_sf_fact( (unsigned int) $x() ); } else { /* Use Stirling's approximation. See * http://en.wikipedia.org/wiki/Stirling%27s_approximation */ double log_p = $x() - $l() + $x() * log($l() / $x()) - 0.5 * log(2*M_PI * $x()) - 1. / 12. / $x() + 1 / 360. / $x()/$x()/$x() - 1. / 1260. / $x()/$x()/$x()/$x()/$x(); $p() = exp(log_p); } }, Doc => q{ =for ref

Probability mass function for poisson distribution. Uses Stirling's formula for x > 85. }, );

pp_def('pmf_poisson_stirling', Pars => 'x(); l(); [o]p()', GenericTypes => $F, HandleBad => 1, Code => q{ PDL_IF_BAD(if ($ISBAD($x()) || $ISBAD($l())) { $SETBAD($p()); continue; },) if ($x() < 0) { $p() = 0; } else if ($x() == 0) { $p() = exp(-$l()); } else { /* Use Stirling's approximation. See * http://en.wikipedia.org/wiki/Stirling%27s_approximation */ double log_p = $x() - $l() + $x() * log($l() / $x()) - 0.5 * log(2*M_PI * $x()) - 1. / 12. / $x() + 1 / 360. / $x()/$x()/$x() - 1. / 1260. / $x()/$x()/$x()/$x()/$x(); $p() = exp(log_p); } }, Doc => q{ =for ref

Probability mass function for poisson distribution. Uses Stirling's formula for all values of the input. See http://en.wikipedia.org/wiki/Stirling's_approximation for more info. }, );

pp_def('pmf_poisson_factorial', Pars => 'ushort x(); l(); float+ [o]p()', GenericTypes => $F, HandleBad => 1, Code => q{ PDL_IF_BAD(if ($ISBAD($x()) || $ISBAD($l())) { $SETBAD($p()); continue; },) if ($x() < GSL_SF_FACT_NMAX) { $p() = exp( -1 * $l()) * pow($l(),$x()) / gsl_sf_fact( $x() ); } else { /* bail out */ $p() = 0; } }, PMCode => pp_line_numbers(__LINE__, <<'EOF'), sub PDL::pmf_poisson_factorial { my ($x, $l) = @_; my $pdlx = PDL->topdl($x); croak "Does not support input greater than 170. Please use pmf_poisson or pmf_poisson_stirling instead." if any($pdlx >= 170); PDL::_pmf_poisson_factorial_int($pdlx, $l, my $p = PDL->null); $p; } EOF Doc => <<'EOD', =for ref

Probability mass function for poisson distribution. Input is limited to x < 170 to avoid gsl_sf_fact() overflow. EOD );

pp_addpm {At=>'Bot'}, pp_line_numbers(__LINE__, <<'EOD');

plot_distr

Plots data distribution. When given specific distribution(s) to fit, returns % ref to sum log likelihood and parameter values under fitted distribution(s). See FUNCTIONS above for available distributions.

Default options (case insensitive):

MAXBN => 20,
  # see PDL::Graphics::PGPLOT::Window for next options
WIN   => undef,   # pgwin object. not closed here if passed
                  # allows comparing multiple distr in same plot
                  # set env before passing WIN
DEV   => '/xs' ,  # open and close dev for plotting if no WIN
                  # defaults to '/png' in Windows
COLOR => 1,       # color for data distr

Usage:

  # yes it threads :)
my $data = grandom( 500, 3 )->abs;
  # ll on plot is sum across 3 data curves
my ($ll, $pars)
  = $data->plot_distr( 'gaussian', 'lognormal', {DEV=>'/png'} );

  # pars are from normalized data (ie data / bin_size)
print "$_\t@{$pars->{$_}}\n" for (sort keys %$pars);
print "$_\t$ll->{$_}\n" for (sort keys %$ll);

DEPENDENCIES

GSL - GNU Scientific Library

SEE ALSO

PDL::Graphics::PGPLOT

PDL::GSL::CDF

AUTHOR

Copyright (C) 2009 Maggie J. Xiong <maggiexyz users.sourceforge.net>, David Mertens

All rights reserved. There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDL distribution.