package Plucene::Index::SegmentsReader; =head1 NAME Plucene::Index::SegmentsReader - reads the segments =head1 SYNOPSIS my $segs_reader = Plucene::Index::SegmentsReader ->new($dir, Plucene::Index::SegmentReader @readers); my $num_docs = $segs_reader->num_docs; my $doc = $segs_reader->document($id); my $norms = $seg_reader->norms($field); my $doc_freq = $segs_reader->doc_freq($term); my Plucene::Index::SegmentsTermEnum $term_enum = $segs_reader->terms($term); my Plucene::Index::SegmentsTermDocs $term_docs = $segs_reader->term_docs; my Plucene::Index::SegmentsTermPositions $term_positions = $segs_reader->term_positions; if ($segs_reader->is_deleted($id)) { ... } =head1 DESCRIPTION This is the segments reader class. =head1 METHODS =cut use strict; use warnings; use Plucene::Index::SegmentMergeInfo; use List::Util qw(sum); use Carp qw(cluck); use Tie::Array::Sorted; use base qw(Plucene::Index::Reader Class::Accessor::Fast); __PACKAGE__->mk_accessors(qw(max_doc)); use Memoize; memoize("norms"); # Saves messing with normsCache =head2 new my $segs_reader = Plucene::Index::SegmentsReader ->new($dir, Plucene::Index::SegmentReader @readers); This will create a new Plucene::Index::SegmentsReader object with the passed directory and Plucene::Index::SegmentReader objects. =cut sub new { my ($class, $dir, @readers) = @_; my $self = $class->SUPER::new($dir); $self->{readers} = \@readers; $self->{max_doc} = 0; for my $reader (@readers) { push @{ $self->{starts} }, $self->{max_doc}; $self->{max_doc} += $reader->max_doc; } return $self; } =head2 num_docs my $num_docs = $segs_reader->num_docs; This will return the number of documents in all the segments in the Reader. =cut sub num_docs { my $self = shift; return $self->{num_docs} if exists $self->{num_docs}; return $self->{num_docs} = sum(map $_->num_docs, @{ $self->{readers} }); } =head2 document my $doc = $segs_reader->document($id); This will return the document at the passed document id. =cut sub document { my ($self, $n) = @_; my $i = $self->_reader_index($n); return $self->{readers}[$i]->document($n - $self->{starts}[$i]); } =head2 is_deleted if ($segs_reader->is_deleted($id)) { ... } =cut sub is_deleted { my ($self, $n) = @_; my $i = $self->_reader_index($n); return $self->{readers}[$i]->is_deleted($n - $self->{starts}[$i]); } sub _do_delete { my ($self, $n) = @_; delete $self->{num_docs}; # Invalidate cache my $i = $self->_reader_index($n); return $self->{readers}[$i]->_do_delete($n - $self->{starts}[$i]); } sub _do_close { my $self = shift; $_->close for @{ $self->{readers} }; } sub _reader_index { my ($self, $n) = @_; my ($lo, $hi) = (0, $#{ $self->{readers} }); # Binary search while ($hi >= $lo) { my $mid = int(($lo + $hi) / 2); my $mid_val = $self->{starts}[$mid]; if ($n < $mid_val) { $hi = $mid - 1; } elsif ($n > $mid_val) { $lo = $mid + 1; } else { return $mid; } } return $hi; } =head2 norms my $norms = $seg_reader->norms($field); This returns the norms for the passed field. =cut sub norms { my ($self, $field) = @_; my $bytes = "\0" x $self->max_doc; for my $i (0 .. $#{ $self->{readers} }) { my $norm = $self->{readers}[$i]->norms($field); substr($bytes, $self->{starts}[$i], length $norm) = $norm; } return $bytes; } =head2 terms my Plucene::Index::SegmentsTermEnum $term_enum = $segs_reader->terms($term); This will return the Plucene::Index::SegmentsTermEnum onject for the passed in term. =cut sub terms { my ($self, $term) = @_; return Plucene::Index::SegmentsTermEnum->new($self->{readers}, $self->{starts}, $term); } =head2 doc_freq my $doc_freq = $segs_reader->doc_freq($term); This returns the number of documents containing the passed term. =cut sub doc_freq { my ($self, $term) = @_; return sum map $_->doc_freq($term), @{ $self->{readers} }; } =head2 term_docs my Plucene::Index::SegmentsTermDocs $term_docs = $segs_reader->term_docs; This will return the Plucene::Index::SegmentsTermDocs object. =cut sub term_docs { my $self = shift; my $term = shift; my $docs = Plucene::Index::SegmentsTermDocs->new($self->{readers}, $self->{starts}); if ($term) { $docs->seek($term) } return $docs; } =head2 term_positions my Plucene::Index::SegmentsTermPositions $term_positions = $segs_reader->term_positions; This will return the Plucene::Index::SegmentsTermPositions object. =cut sub term_positions { my $self = shift; my $term = shift; my $pos = Plucene::Index::SegmentsTermPositions->new($self->{readers}, $self->{starts}); if ($term) { $pos->seek($term) } return $pos; } # TODO # # Move these to a file of their own. Possibly. # package Plucene::Index::SegmentsTermEnum; # Wondered why I hadn't seen that before use base 'Class::Accessor::Fast'; __PACKAGE__->mk_accessors(qw(doc_freq term)); sub new { my ($class, $readers, $starts, $t) = @_; my @queue; tie @queue, "Tie::Array::Sorted"; for my $i (0 .. $#{$readers}) { my $reader = $readers->[$i]; my $term_enum = $reader->terms($t); my $smi = Plucene::Index::SegmentMergeInfo->new($starts->[$i], $term_enum, $reader); if (!$t ? $smi->next : $term_enum->term) { # ??? push @queue, $smi; } } my $self = bless { queue => \@queue }, $class; if ($t and @queue) { my $top = $queue[0]; $self->{term} = $top->term_enum->term; $self->{doc_freq} = $top->term_enum->doc_freq; } return $self; } sub next { my $self = shift; my $top = $self->{queue}[0]; if (!$top) { undef $self->{term}; return; } $self->{term} = $top->term; $self->{doc_freq} = 0; while ($top && $self->{term}->eq($top->term)) { $self->{doc_freq} += $top->term_enum->doc_freq; # This might look funny, but it's right. The pop takes $top off # the queue, and when it has ->next called on it, its comparison # value changes; the queue is tied as a Tie::Array::Sorted, so # when it gets added back on, it may be put somewhere else. pop @{ $self->{queue} }; if ($top->next) { unshift @{ $self->{queue} }, $top; } $top = $self->{queue}[0]; } return 1; } package Plucene::Index::SegmentsTermDocs; sub new { my ($class, $readers, $starts) = @_; bless { readers => $readers, starts => $starts, seg_term_docs => [], base => 0, pointer => 0, current => undef, }, $class; } sub doc { my $self = shift; return $self->{base} + $self->{current}->doc; } sub freq { return shift->{current}->freq; } sub seek { my ($self, $term) = @_; $self->{term} = $term; $self->{base} = 0; $self->{pointer} = 0; $self->{current} = undef; } sub _at_end { $_[0]->{pointer} >= @{ $_[0]->{readers} } } sub _set_base_and_advance { my $self = shift; $self->{base} = $self->{starts}[ $self->{pointer} ]; $self->{current} = $self->term_docs($self->{pointer}++); } sub next { my $self = shift; return 1 if $self->{current} && $self->{current}->next; unless ($self->_at_end) { $self->_set_base_and_advance; return $self->next; } return 0; } sub read { my ($self) = @_; my ($docs, $freqs) = ([], []); while (1) { # Get a .current, somehow while (!$self->{current}) { goto done if $self->_at_end; # Don't fall off $self->_set_base_and_advance; } my ($new_docs, $new_freqs) = $self->{current}->read($docs, $freqs); if (!scalar @$new_docs) { # It's empty undef $self->{current}; } else { # Correct the doc positions to the appropriate base $_ += $self->{base} for @$new_docs; push @$docs, @$new_docs; push @$freqs, @$new_freqs; goto done; } } done: return ($docs, $freqs); } sub skip_to { my ($self, $target) = @_; $self->next || return while $target > $self->doc; return 1; } sub term_docs { my ($self, $i) = @_; return unless $self->{term}; $self->{seg_term_docs}[$i] = $self->term_docs_r($self->{readers}[$i]) unless exists $self->{seg_term_docs}[$i]; my $result = $self->{seg_term_docs}[$i]; $result->seek($self->{term}); return $result; } sub term_docs_r { my ($self, $reader) = @_; return $reader->term_docs; } package Plucene::Index::SegmentsTermPositions; use base 'Plucene::Index::SegmentsTermDocs'; sub term_docs_r { my ($self, $reader) = @_; return $reader->term_positions; } sub next_position { return shift->{current}->next_position; } 1;