package KinoSearch::InvIndexer; use strict; use warnings; use KinoSearch::Util::ToolSet; use base qw( KinoSearch::Util::Class ); use Clone qw( clone ); use File::Spec::Functions qw( catfile tmpdir ); use File::Temp qw(); use Sort::External; use KinoSearch::Document::Doc; use KinoSearch::Document::Field; use KinoSearch::Analysis::Analyzer; use KinoSearch::Store::FSInvIndex; use KinoSearch::Index::FieldInfos; use KinoSearch::Index::FieldsReader; use KinoSearch::Index::SegWriter; use KinoSearch::Index::SegInfos; use KinoSearch::Index::IndexFileNames qw( WRITE_LOCK_NAME COMMIT_LOCK_NAME WRITE_LOCK_TIMEOUT COMMIT_LOCK_TIMEOUT ); use KinoSearch::Search::Similarity; our %instance_vars = __PACKAGE__->init_instance_vars( # constructor args / members create => undef, invindex => undef, analyzer => KinoSearch::Analysis::Analyzer->new, # members analyzers => {}, sinfos => KinoSearch::Index::SegInfos->new, finfos => KinoSearch::Index::FieldInfos->new, doc_template => KinoSearch::Document::Doc->new, similarity => undef, seg_writer => undef, write_lock => undef, initialized => 0, ); sub init_instance { my $self = shift; # get a similarity object $self->{similarity} = KinoSearch::Search::Similarity->new; # confirm or create an InvIndex object my $invindex; if ( blessed( $self->{invindex} ) and $self->{invindex}->isa('KinoSearch::Store::InvIndex') ) { $invindex = $self->{invindex}; $self->{create} = $invindex->get_create unless defined $self->{create}; } elsif ( defined $self->{invindex} ) { $invindex = $self->{invindex} = KinoSearch::Store::FSInvIndex->new( create => $self->{create}, path => $self->{invindex}, ); } else { croak("Required parameter 'invindex' not supplied"); } # get a write lock for this invindex. my $write_lock = $self->{write_lock} = $invindex->make_lock( lock_name => WRITE_LOCK_NAME, timeout => WRITE_LOCK_TIMEOUT, ); $write_lock->obtain or croak( "invindex locked: " . $write_lock->get_lock_name ); # read/write SegInfos eval { $invindex->run_while_locked( lock_name => COMMIT_LOCK_NAME, timeout => COMMIT_LOCK_TIMEOUT, do_body => sub { $self->{create} ? $self->{sinfos}->write_infos($invindex) : $self->{sinfos}->read_infos($invindex); }, ); }; if ($@) { $self->{create} ? croak("failed to create invindex: $@") : croak("failed to open existing invindex: $@"); } # more initialization is coming after fields are spec'd... } sub _delayed_init { my $self = shift; $self->{initialized} = 1; my ( $invindex, $finfos ) = @{$self}{ 'invindex', 'finfos' }; # create a Doc object which will serve as a cloning template my $doc = $self->{doc_template}; for my $field ( $doc->get_fields ) { $field->set_field_num( $finfos->get_field_num( $field->get_name ) ); } # name a new segment and create a SegWriter my $out_seg_name = $self->_new_seg_name; $self->{seg_writer} = KinoSearch::Index::SegWriter->new( invindex => $invindex, seg_name => $out_seg_name, finfos => $finfos->clone, similarity => $self->{similarity}, ); } sub spec_field { my $self = shift; # don't allow new fields to be spec'd once the seg is in motion croak("Too late to spec field (new_doc has been called)") if $self->{initialized}; # detect or define a Field object my $field; if ( blessed( $_[0] ) ) { $field = shift; } else { eval { $field = KinoSearch::Document::Field->new(@_) }; croak $@ if $@; } # cache fnm_bits and fdt_bits $field->set_fnm_bits( KinoSearch::Index::FieldInfos->encode_fnm_bits($field) ); $field->set_fdt_bits( KinoSearch::Index::FieldsReader->encode_fdt_bits($field) ); # establish which analyzer will be used against the field $self->{analyzers}{ $field->get_name } = ( $field->get_analyzer || $self->{analyzer} ); # don't copy the analyzer into the template, so that it can be overridden $field->set_analyzer(undef); # add the field to the finfos and the template. $self->{finfos}->add_field($field); $self->{doc_template}->add_field($field); } sub new_doc { my $self = shift; $self->_delayed_init unless $self->{initialized}; return clone( $self->{doc_template} ); } sub add_doc { my ( $self, $doc ) = @_; # assign analyzers for my $field ( $doc->get_fields ) { if ( $field->get_analyzed ) { next if $field->get_analyzer; my $fieldname = $field->get_name; $field->set_analyzer( $self->{analyzers}{$fieldname} ); } } # add doc to output segment $self->{seg_writer}->add_doc($doc); } sub finish { my $self = shift; my $invindex = $self->{invindex}; # init, just in case no docs were ever added $self->_delayed_init unless $self->{initialized}; # finish the segment $self->{seg_writer}->finish; # now that the seg is complete, write its info to the 'segments' file $self->{sinfos}->add_info( KinoSearch::Index::SegInfo->new( seg_name => $self->{seg_writer}->get_seg_name, doc_count => $self->{seg_writer}->get_doc_count, invindex => $self->{invindex}, ) ); $invindex->run_while_locked( lock_name => COMMIT_LOCK_NAME, timeout => COMMIT_LOCK_TIMEOUT, do_body => sub { $self->{sinfos}->write_infos($invindex); }, ); } # Release the write lock - if it's there. sub _release_locks { my $self = shift; if ( defined $self->{write_lock} ) { $self->{write_lock}->release if $self->{write_lock}->is_locked; undef $self->{write_lock}; } } # Generate segment names (no longer Lucene compatible, as of 0.06). sub _new_seg_name { my $self = shift; my $counter = $self->{sinfos}->get_counter; $self->{sinfos}->set_counter( ++$counter ); return "_$counter"; } sub DESTROY { shift->_release_locks } 1; __END__ =head1 NAME KinoSearch::InvIndexer - build inverted indexes =head1 WARNING KinoSearch is alpha test software. The API and the file format are subject to change. =head1 SYNOPSIS use KinoSearch::InvIndexer; use KinoSearch::Analysis::PolyAnalyzer; my $analyzer = KinoSearch::Analysis::PolyAnalyzer->new( language => 'en' ); my $invindexer = KinoSearch::InvIndexer->new( invindex => '/path/to/invindex', create => 1, analyzer => $analyzer, ); $invindexer->spec_field( name => 'title' boost => 3, ); $invindexer->spec_field( name => 'bodytext' ); while ( my ( $title, $bodytext ) = each %source_documents ) { my $doc = $invindexer->new_doc($title); $doc->set_value( title => $title ); $doc->set_value( bodytext => $bodytext ); $invindexer->add_doc($doc); } $invindexer->finish; =head1 DESCRIPTION The InvIndexer class is KinoSearch's primary tool for creating and modifying inverted indexes, which may be searched using L<KinoSearch::Searcher|KinoSearch::Searcher>. =head1 METHODS =head2 new my $invindexer = KinoSearch::InvIndexer->new( invindex => '/path/to/invindex', # required create => 1, # default: 0 analyzer => $analyzer, # default: no-op Analyzer ); Create an InvIndexer object. =over =item * B<invindex> - can be either a filepath, or an InvIndex subclass such as L<KinoSearch::Store::FSInvIndex|KinoSearch::Store::FSInvIndex> or L<KinoSearch::Store::RAMInvIndex|KinoSearch::Store::RAMInvIndex>. =item * B<create> - create a new invindex, clobbering an existing one if necessary. =item * B<analyzer> - an object which subclasses L<KinoSearch::Analysis::Analyzer>, such as a L<PolyAnalyzer|KinoSearch::Analysis::PolyAnalyzer>. =back =head2 spec_field $invindexer->spec_field( name => 'url', # required boost => 1, # default: 1, analyzer => undef, # default: analyzer spec'd in new() indexed => 1, # default: 1 analyzed => 0, # default: 1 stored => 0, # default: 1 compressed => 0, # default: 0 vectorized => 0, # default: see below ); Define a field. This is analogous to defining a field in a database. =over =item * B<name> - the field's name. =item * B<boost> - A multiplier which determines how much a field contributes to a document's score. =item * B<analyzer> - By default, all indexed fields are analyzed using the analyzer that was supplied to new(). Supplying an alternate for a given field overrides the primary analyzer. =item * B<indexed> - index the field, so that it can be searched later. =item * B<analyzed> - analyze the field, using the relevant Analyzer. Fields such as "category" or "product_number" might be indexed but not analyzed. =item * B<stored> - store the field, so that it can be retrieved when the document turns up in a search. =item * B<compressed> - compress the stored field, using the zlib compression algorithm. =item * B<vectorized> - store the fields "term vectors", which are required by L<KinoSearch::Highlight::Highlighter|KinoSearch::Highlight::Highlighter> for excerpt selection and search term highlighting. By default, if a field is marked as C<stored>, it will be vectorized as well. =back =head2 new_doc my $doc = $invindexer->new_doc; Spawn an empty L<KinoSearch::Document::Doc|KinoSearch::Document::Doc> object, primed to accept values for the fields spec'd by spec_field. =head2 add_doc $invindexer->add_doc($doc); Add a document to the invindex. =head2 finish $invindexer->finish; Finish the invindex. Invalidates the InvIndexer. =head1 COPYRIGHT Copyright 2005-2006 Marvin Humphrey =head1 LICENSE, DISCLAIMER, BUGS, etc. See L<KinoSearch|KinoSearch> version 0.07. =cut