package Algorithm::HyperLogLog; use strict; use warnings; use 5.008008; use Carp qw(croak); our $VERSION = '0.24'; our $PERL_ONLY; if ( !defined $PERL_ONLY ) { $PERL_ONLY = $ENV{PERL_HLL_PUREPERL} ? 1 : 0; } if ( !exists $INC{'Algorithm/HyperLogLog/PP.pm'} ) { if ( !$PERL_ONLY ) { require XSLoader; $PERL_ONLY = !eval { XSLoader::load( __PACKAGE__, $VERSION ); }; } if ($PERL_ONLY) { require 'Algorithm/HyperLogLog/PP.pm'; } } sub new_from_file { my ( $class, $filename ) = @_; open my $fh, '<', $filename or die $!; my $on_error = sub { close $fh; croak "Invalid dump file($filename)"; }; binmode $fh; my ( @dumpdata, $buf, $readed ); # Read register size data $readed = read( $fh, $buf, 1 ); $on_error->() if $readed != 1; my $k = unpack 'C', $buf; # Read register content data my $m = 2**$k; $readed = read $fh, $buf, $m; $on_error->() if $readed != $m; close $fh; @dumpdata = unpack 'C*', $buf; my $self = $class->_new_from_dump( $k, \@dumpdata ); return $self; } sub dump_to_file { my ( $self, $filename ) = @_; my $k = log( $self->register_size ) / log(2); # Calculate log2(register_size) my $dumpdata = $self->_dump_register(); open my $fh, '>', $filename or die $!; binmode $fh; my $buf = pack 'C', $k; print $fh $buf; $buf = pack 'C*', @$dumpdata; print $fh $buf; close $fh; } sub XS { !$PERL_ONLY; } 1; __END__ =pod =encoding utf8 =head1 NAME Algorithm::HyperLogLog - Implementation of the HyperLogLog cardinality estimation algorithm =head1 SYNOPSIS use Algorithm::HyperLogLog; my $hll = Algorithm::HyperLogLog->new(14); while(<>){ $hll->add($_); } my $cardinality = $hll->estimate(); # Estimate cardinality $hll->dump_to_file('hll_register.dump');# Dumps internal data Construct object from dumped file. use Algorithm::HyperLogLog; # Restore internal state my $hll = Algorithm::HyperLogLog->new_from_file('hll_register.dump'); =head1 DESCRIPTION This module is implementation of the HyperLogLog algorithm. HyperLogLog is an algorithm for estimating the cardinality of a set. =head1 METHODS =head2 new($b) Constructor. `$b` is the parameter for determining register size. (The register size is 2^$b.) `$b` must be a integer between 4 and 16. =head2 new_from_file($filename) This method constructs object and restore the internal data of object from dumped file(dumped by dump_to_file() method). =head2 dump_to_file($filename) This method dumps the internal data of an object to a file. =head2 add($data) Adds element to the cardinality estimator. =head2 estimate() Returns estimated cardinality value in floating point number. =head2 merge($other) Merges the estimate from 'other' into this object, returning the estimate of their union. =head2 register_size() Return number of register.(In the XS implementation, this equals size in bytes) =head2 XS() If using XS backend, this method return true value. =head1 SEE ALSO Philippe Flajolet, Éric Fusy, Olivier Gandouet and Frédéric Meunier. HyperLogLog: the analysis of a near-optimal cardinality estimation algorithm. 2007 Conference on Analysis of Algorithms, DMTCS proc. AH, pp. 127–146, 2007. L =head1 AUTHOR Hideaki Ohno Ehide.o.j55 {at} gmail.comE =head1 THANKS TO MurmurHash3(L) =over 4 =item Austin Appleby =item Peter Scott =back =head1 LICENSE This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut