package Algorithm::PageRank::XS;

use 5.008005;
use strict;
use warnings;
use Carp;

require Exporter;
use AutoLoader;

our $VERSION = '0.03';

require XSLoader;
XSLoader::load('Algorithm::PageRank::XS', $VERSION);

=head1 NAME

Algorithm::PageRank::XS - A Fast PageRank implementation

=head1 DESCRIPTION

This module implements a simple PageRank algorithm in C. The goal is
to quickly get a vector that is close to the eigenvector of the
stochastic matrix of a graph.

L<Algorithm::PageRank> does some pagerank calculations, but it's
slow and memory intensive. This module was developed to compute pagerank
on graphs with millions of arcs. It will not, however, scale
up to quadrillions of arcs (see L</TODO>).
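
Under the hood the computation is a power iteration. A minimal pure-Perl
sketch of a single iteration step is shown below, for illustration only: the
real work is done in C, the helper name C<_power_step> is made up here, and
the exact formulation in the C code may differ (for example in how dangling
nodes are handled).

    # One power-iteration step. $arcs maps a source index to an array
    # reference of destination indices; $old is the current rank vector
    # of length $n; $alpha is the damping factor.
    sub _power_step {
        my ($arcs, $old, $n, $alpha) = @_;
        my @new = ((1 - $alpha) / $n) x $n;          # teleportation term
        for my $from (keys %$arcs) {
            my @to    = @{ $arcs->{$from} };
            my $share = $alpha * $old->[$from] / scalar(@to);
            $new[$_] += $share for @to;              # spread rank along out-arcs
        }
        return \@new;
    }
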
=head1 SYNOPSIS

    use Algorithm::PageRank::XS;

    my $pr = Algorithm::PageRank::XS->new();

    $pr->graph([
           'John'  => 'Joey',
           'John'  => 'James',
           'Joey'  => 'John',
           'James' => 'Joey',
          ]);

    $pr->results();
    # {
    #    'James' => '0.569840431213379',
    #    'Joey'  => '1',
    #    'John'  => '0.754877686500549'
    # }


    # The following simple program takes arcs on standard input,
    # one tab-separated pair per line, and prints the ranks.
    use Algorithm::PageRank::XS;

    my $pr = Algorithm::PageRank::XS->new();

    while (<>) {
        chomp;
        my ($from, $to) = split(/\t/, $_);
        $pr->add_arc($from, $to);
    }

    while (my ($name, $rank) = each(%{$pr->results()})) {
        print("$name,$rank\n");
    }
=head1 METHODS

=head2 new %PARAMS

Create a new PageRank object. Possible parameters:

=over 4

=item alpha

The damping factor: C<1> minus the probability that a user jumps from one
node to another, unconnected node at random. Decreasing this number makes
convergence more likely, but brings the result further from the true
eigenvector.

=item max_tries

The maximum number of iterations before we give up trying to achieve
convergence.

=item convergence

The maximum difference allowed between two subsequent vectors before we
declare them "convergent enough". The convergence rate is the rate at which
C<alpha^t> goes to 0. Thus, if you set C<alpha> to C<0.85> and C<convergence>
to C<0.000001>, you will need about C<85> tries (see the example after this
list).

=back
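
For example, a constructor call that asks for tighter convergence (the
specific values here are only illustrative):

    use Algorithm::PageRank::XS;

    my $pr = Algorithm::PageRank::XS->new(
        alpha       => 0.85,       # default damping factor
        convergence => 0.000001,   # tighter than the default of 0.001
        max_tries   => 200,        # default
    );

    # Rough iteration estimate: log(convergence) / log(alpha)
    #   log(1e-6) / log(0.85) ~ 85 tries
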
=cut
sub new {
    my ($class, %params) = @_;

    my $self = {
        alpha       => 0.85,
        max_tries   => 200,
        convergence => 0.001,
        %params,
        dim_map     => {},
        rev_map     => {},
    };

    bless $self, $class;
    $self->init();
    return $self;
}

=head2 add_arc

Add an arc to the PageRank object before running the computation.
The actual values don't matter. So you can run:

    $pr->add_arc("Apple", "Orange");

which means that C<"Apple"> links to C<"Orange">.

=cut
sub add_arc ($$$) {
    my ($self, $from, $to) = @_;

    $from = $self->_dim_map($from);
    $to   = $self->_dim_map($to);

    pr_tableadd($self->{table}, $from, $to)
        or croak("Unable to add arc to pagerank table.");
}

=head2 graph

Add a whole graph, given as a reference to an array of C<from =E<gt> to>
pairs. This is equivalent to calling C<add_arc> for each pair, but may be
more convenient.
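
For example, the call below (using the same node names as in the C<add_arc>
example above) is the same as two C<add_arc> calls:

    $pr->graph([ 'Apple' => 'Orange', 'Orange' => 'Apple' ]);

    # equivalent to:
    $pr->add_arc('Apple', 'Orange');
    $pr->add_arc('Orange', 'Apple');
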
=cut
sub graph ($$) {
    my ($self, $graph) = @_;

    if (scalar @{$graph} % 2 == 1) {
        croak("Odd number of members of graph. Even number expected.");
    }

    for (my $i = 0; $i < scalar @{$graph}; $i += 2) {
        $self->add_arc($graph->[$i], $graph->[$i + 1]);
    }
}

=head2 results

Compute the PageRank vector and return it as a hash reference.
Whatever you called the nodes when specifying the arcs will be the keys of
this hash, and the values will be the corresponding ranks.

The result vector is normalized such that the maximum value is C<1>. This is
to prevent extremely small values for large data sets. You can re-normalize
it any other way you like if you don't like this.
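
For example, if you would rather have the ranks sum to C<1>, a minimal
re-normalization sketch:

    my $results = $pr->results();

    my $total = 0;
    $total += $_ for values %$results;

    if ($total > 0) {
        $_ /= $total for values %$results;   # values now sum to 1
    }
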
=cut
sub results ($) {
    my $self = $_[0];

    if (pr_tablesize($self->{table}) < 2) {
        carp("Unable to compute PageRank since graph size is too small.");
        return {};    # empty hash reference, matching the documented return type
    }

    my $results = pr_pagerank(
        $self->{table},
        scalar keys %{$self->{dim_map}},
        $self->{alpha},
        $self->{convergence},
        $self->{max_tries}
    );

    if (!$results or ref $results ne 'ARRAY') {
        return {};
    }

    # Translate the dense result vector back to the caller's node names.
    my %better_results = ();
    for (my $i = 0; $i < scalar @{$results}; $i++) {
        $better_results{$self->{rev_map}->{$i}} = $results->[$i];
    }

    undef $results;
    pr_tabledel($self->{table});
    $self->init();

    return \%better_results;
}

# PRIVATE METHODS

# Map a node name to a dense integer index, remembering the reverse mapping.
sub _dim_map ($$) {
    my ($self, $item) = @_;

    if (defined($self->{dim_map}->{$item})) {
        return $self->{dim_map}->{$item};
    }
    else {
        my $i = scalar keys %{$self->{dim_map}};
        $self->{dim_map}->{$item} = $i;
        $self->{rev_map}->{$i}    = $item;
        return $i;
    }
}

# Reset the node maps and allocate a fresh table on the C side.
sub init ($) {
    my $self = $_[0];
    $self->{dim_map} = {};
    $self->{rev_map} = {};
    $self->{table}   = pr_tableinit();
}

1;
__END__
=head1 BUGS

None known.

=head1 TODO

=over 4

=item * We may want to support C<double> values rather than single floats.

=item * We may want to support weighting individual arcs, which you cannot do now.

=item * At present the indexes are C<unsigned int> rather than C<size_t>, so this will not scale on 64-bit architectures.

=item * It would be nice to use C<mmap(2)> to efficiently use the hard drive and scale beyond what memory allows.

=back
=head1 PERFORMANCE

This module is pretty fast. I ran it on a set of 1 million nodes with 4.5
million arcs in 57 seconds on my 32-bit 1.8GHz laptop. Let me know if you
have any performance tips. It's orders of magnitude faster than
L<Algorithm::PageRank>, but performance tests will be here shortly.
=head1 SEE ALSO

L<Algorithm::PageRank>

=head1 AUTHOR

Michael Axiak <mike@axiak.net>

=head1 COPYRIGHT

Copyright (C) 2008 by Michael Axiak <mike@axiak.net>

This package is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut