—package
Algorithm::HyperLogLog;
use
strict;
use
warnings;
use
5.008008;
our
$VERSION
=
'0.24'
;
our
$PERL_ONLY
;
if
( !
defined
$PERL_ONLY
) {
$PERL_ONLY
=
$ENV
{PERL_HLL_PUREPERL} ? 1 : 0;
}
if
( !
exists
$INC
{
'Algorithm/HyperLogLog/PP.pm'
} ) {
if
( !
$PERL_ONLY
) {
$PERL_ONLY
= !
eval
{ XSLoader::load( __PACKAGE__,
$VERSION
); };
}
if
(
$PERL_ONLY
) {
require
'Algorithm/HyperLogLog/PP.pm'
;
}
}
sub
new_from_file {
my
(
$class
,
$filename
) =
@_
;
open
my
$fh
,
'<'
,
$filename
or
die
$!;
my
$on_error
=
sub
{
close
$fh
; croak
"Invalid dump file($filename)"
; };
binmode
$fh
;
my
(
@dumpdata
,
$buf
,
$readed
);
# Read register size data
$readed
=
read
(
$fh
,
$buf
, 1 );
$on_error
->()
if
$readed
!= 1;
my
$k
=
unpack
'C'
,
$buf
;
# Read register content data
my
$m
= 2*
*$k
;
$readed
=
read
$fh
,
$buf
,
$m
;
$on_error
->()
if
$readed
!=
$m
;
close
$fh
;
@dumpdata
=
unpack
'C*'
,
$buf
;
my
$self
=
$class
->_new_from_dump(
$k
, \
@dumpdata
);
return
$self
;
}
sub
dump_to_file {
my
(
$self
,
$filename
) =
@_
;
my
$k
=
log
(
$self
->register_size ) /
log
(2);
# Calculate log2(register_size)
my
$dumpdata
=
$self
->_dump_register();
open
my
$fh
,
'>'
,
$filename
or
die
$!;
binmode
$fh
;
my
$buf
=
pack
'C'
,
$k
;
$fh
$buf
;
$buf
=
pack
'C*'
,
@$dumpdata
;
$fh
$buf
;
close
$fh
;
}
sub
XS {
!
$PERL_ONLY
;
}
1;
__END__
=pod
=encoding utf8
=head1 NAME
Algorithm::HyperLogLog - Implementation of the HyperLogLog cardinality estimation algorithm
=head1 SYNOPSIS
use Algorithm::HyperLogLog;
my $hll = Algorithm::HyperLogLog->new(14);
while(<>){
$hll->add($_);
}
my $cardinality = $hll->estimate(); # Estimate cardinality
$hll->dump_to_file('hll_register.dump');# Dumps internal data
Construct object from dumped file.
use Algorithm::HyperLogLog;
# Restore internal state
my $hll = Algorithm::HyperLogLog->new_from_file('hll_register.dump');
=head1 DESCRIPTION
This module is implementation of the HyperLogLog algorithm.
HyperLogLog is an algorithm for estimating the cardinality of a set.
=head1 METHODS
=head2 new($b)
Constructor.
`$b` is the parameter for determining register size. (The register size is 2^$b.)
`$b` must be a integer between 4 and 16.
=head2 new_from_file($filename)
This method constructs object and restore the internal data of object from dumped file(dumped by dump_to_file() method).
=head2 dump_to_file($filename)
This method dumps the internal data of an object to a file.
=head2 add($data)
Adds element to the cardinality estimator.
=head2 estimate()
Returns estimated cardinality value in floating point number.
=head2 merge($other)
Merges the estimate from 'other' into this object, returning the estimate of their union.
=head2 register_size()
Return number of register.(In the XS implementation, this equals size in bytes)
=head2 XS()
If using XS backend, this method return true value.
=head1 SEE ALSO
Philippe Flajolet, Éric Fusy, Olivier Gandouet and Frédéric Meunier. HyperLogLog: the analysis of a near-optimal cardinality estimation algorithm. 2007 Conference on Analysis of Algorithms, DMTCS proc. AH, pp. 127–146, 2007. L<http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf>
=head1 AUTHOR
Hideaki Ohno E<lt>hide.o.j55 {at} gmail.comE<gt>
=head1 THANKS TO
MurmurHash3(L<https://github.com/PeterScott/murmur3>)
=over 4
=item Austin Appleby
=item Peter Scott
=back
=head1 LICENSE
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
=cut