NAME
KinoSearch1::Docs::Tutorial - sample indexing and search applications
DESCRIPTION
The following sample code for invindexer.plx and search.cgi can be used to create a simple search engine. It requires the HTML presentation of the US Constitution included in the KinoSearch1 distribution, under t/us_constitution.
Note that a proper indexer for HTML documents would not rely on quick-n-dirty regular expressions for stripping tags, as this one does for the sake of brevity -- it would use a dedicated parsing module such as HTML::Parser.
invindexer.plx
#!/usr/bin/perl
use strict;
use warnings;
use File::Spec;
use KinoSearch1::InvIndexer;
use KinoSearch1::Analysis::PolyAnalyzer;
### In order for invindexer.plx to work correctly, you must modify
### $source_dir, $path_to_invindex, and possibly $base_url.
###
### $source_dir must lead to the directory containing the US
### Constitution html files.
###
### $path_to_invindex is the future location of the invindex.
###
### $base_url should reflect the location of the us_constitution directory
### when accessed via a web browser.
my $source_dir       = '';
my $path_to_invindex = '';
my $base_url         = '/us_constitution';

### Collect the names of the html files found in $source_dir.
###
### The pattern is anchored at the end of the name so that strays such as
### 'article.html.bak' or 'article.html~' are not mistaken for source
### documents.  Dies if the directory cannot be opened or closed.
opendir( my $source_dh, $source_dir )
    or die "Couldn't opendir '$source_dir': $!";
my @filenames = grep {/\.html\z/} readdir $source_dh;
closedir $source_dh or die "Couldn't closedir '$source_dir': $!";
### STEP 1: Choose an Analyzer.
my $analyzer
    = KinoSearch1::Analysis::PolyAnalyzer->new( language => 'en' );

### STEP 2: Create an InvIndexer object.
my %invindexer_args = (
    analyzer => $analyzer,
    invindex => $path_to_invindex,
    create   => 1,
);
my $invindexer = KinoSearch1::InvIndexer->new(%invindexer_args);

### STEP 3: Define fields.
# Each entry is the argument list for one spec_field() call, issued in the
# order listed: 'title' with no extra options, 'bodytext' with
# vectorized => 1, and 'url' with indexed => 0.
my @field_specs = (
    [ name => 'title' ],
    [ name => 'bodytext', vectorized => 1 ],
    [ name => 'url',      indexed    => 0 ],
);
for my $field_spec (@field_specs) {
    $invindexer->spec_field(@$field_spec);
}
# Index every html file found in $source_dir, except for the index page.
for my $html_file (@filenames) {
    next if $html_file eq 'index.html';

    # Read the entire file in one go.
    my $filepath = File::Spec->catfile( $source_dir, $html_file );
    open( my $fh, '<', $filepath )
        or die "couldn't open file '$filepath': $!";
    my $html = do { local $/; <$fh> };

    ### STEP 4: Start a new document.
    my $doc = $invindexer->new_doc;

    # A match in list context returns its captures, so a failed match
    # leaves the list assignment with zero elements and triggers the die.
    my ($title) = $html =~ m#<title>(.*?)</title>#s
        or die "couldn't isolate title in '$filepath'";
    my ($bodytext)
        = $html =~ m#<div id="bodytext">(.*?)</div><!--bodytext-->#s
        or die "couldn't isolate bodytext in '$filepath'";
    $bodytext =~ s/<.*?>/ /gsm;    # quick and dirty tag stripping

    ### STEP 5: Set the value for each field.
    $doc->set_value( url      => "$base_url/$html_file" );
    $doc->set_value( title    => $title );
    $doc->set_value( bodytext => $bodytext );

    ### STEP 6: Add the document to the invindex.
    $invindexer->add_doc($doc);

    ### STEP 7: Repeat steps 4-6 for each document in the collection.
}

### STEP 8: Finalize the invindex.
$invindexer->finish;
search.cgi
#!/usr/bin/perl -T
use strict;
use warnings;
use CGI;
use List::Util qw( max min );
use POSIX qw( ceil );
use KinoSearch1::Searcher;
use KinoSearch1::Analysis::PolyAnalyzer;
use KinoSearch1::Highlight::Highlighter;
### Read the query and the result offset from the request.
my $cgi           = CGI->new;
my $q             = $cgi->param('q');
my $offset        = $cgi->param('offset');
my $hits_per_page = 10;

# A missing query is treated as an empty one.
$q = '' unless defined $q;

# 'offset' comes straight from the request and is used below both in
# arithmetic and as an argument to seek().  Anything that is not a plain
# non-negative integer (missing, empty, negative, non-numeric) falls back
# to the first page.
$offset = 0 unless ( defined($offset) && $offset =~ /\A[0-9]+\z/ );
### In order for search.cgi to work, $path_to_invindex must be modified so
### that it points to the invindex created by invindexer.plx, and
### $base_url may have to change to reflect where a web-browser should
### look for the us_constitution directory.
my $path_to_invindex = '';
my $base_url         = '/us_constitution';

### STEP 1: Specify the same Analyzer used to create the invindex.
my $analyzer
    = KinoSearch1::Analysis::PolyAnalyzer->new( language => 'en' );

### STEP 2: Create a Searcher object.
my %searcher_args = (
    invindex => $path_to_invindex,
    analyzer => $analyzer,
);
my $searcher = KinoSearch1::Searcher->new(%searcher_args);

### STEP 3: Feed a query to the Searcher object.
my $hits = $searcher->search($q);

### STEP 4: Arrange for highlighted excerpts to be created.
# The highlighter is told to draw its excerpts from the 'bodytext' field.
my $highlighter = KinoSearch1::Highlight::Highlighter->new(
    excerpt_field => 'bodytext',
);
$hits->create_excerpts( highlighter => $highlighter );

### STEP 5: Process the search.
$hits->seek( $offset, $hits_per_page );
# create result list
my $report = '';
while ( my $hit = $hits->fetch_hit_hashref ) {
my $score = sprintf( "%0.3f", $hit->{score} );
$report .= qq|
<p>
<a href="$hit->{url}"><strong>$hit->{title}</strong></a>
<em>$score</em>
<br>
$hit->{excerpt}
<br>
<span class="excerptURL">$hit->{url}</span>
</p>
|;
}
# From here on $q is only interpolated into markup, so escape it for HTML.
$q = CGI::escapeHTML($q);

# display info about the number of hits, paging links
my $total_hits = $hits->total_hits;
my $num_hits_info;
if ( !length $q ) {
    # no query, no display
    $num_hits_info = '';
}
elsif ( $total_hits == 0 ) {
    # alert the user that their search failed
    $num_hits_info = qq|<p>No matches for <strong>$q</strong></p>|;
}
else {
    # calculate the nums for the first and last hit to display
    my $last_result  = min( ( $offset + $hits_per_page ), $total_hits );
    my $first_result = min( ( $offset + 1 ), $last_result );

    # display the result nums, start paging info
    $num_hits_info = qq|
<p>
Results <strong>$first_result-$last_result</strong>
of <strong>$total_hits</strong> for <strong>$q</strong>.
</p>
<p>
Results Page:
|;

    # calculate first and last hits pages to display / link to
    #
    # $first_result is 1-based, so it is shifted to 0-based before the
    # division.  Dividing it directly reports a page too high whenever
    # $first_result is an exact multiple of $hits_per_page (for example an
    # offset at or past the end of a result set whose size is a multiple
    # of the page size), which in turn produced a bogus "Next" link.
    my $current_page = int( ( $first_result - 1 ) / $hits_per_page ) + 1;
    my $last_page    = ceil( $total_hits / $hits_per_page );
    my $first_page   = max( 1, ( $current_page - 9 ) );
    $last_page = min( $last_page, ( $current_page + 10 ) );

    # create a url for use in paging links
    my $href = $cgi->url( -relative => 1 ) . "?" . $cgi->query_string;
    $href .= ";offset=0" unless $href =~ /offset=/;

    # Returns a copy of $href with the value of its offset parameter
    # replaced.  Everything up to the next parameter separator is swapped
    # out, so the rewrite also works when the request carried an empty or
    # non-numeric offset value.
    my $href_for_offset = sub {
        my ($new_offset) = @_;
        ( my $link = $href ) =~ s/(?<=offset=)[^;&]*/$new_offset/;
        return $link;
    };

    # generate the "Prev" link; the '<' in the link text is written as a
    # character reference so that the markup stays valid
    if ( $current_page > 1 ) {
        my $prev_href
            = $href_for_offset->( ( $current_page - 2 ) * $hits_per_page );
        $num_hits_info .= qq|<a href="$prev_href">&lt;= Prev</a>\n|;
    }

    # generate paging links; the current page is shown as plain text
    for my $page_num ( $first_page .. $last_page ) {
        if ( $page_num == $current_page ) {
            $num_hits_info .= qq|$page_num \n|;
        }
        else {
            my $page_href
                = $href_for_offset->( ( $page_num - 1 ) * $hits_per_page );
            $num_hits_info .= qq|<a href="$page_href">$page_num</a>\n|;
        }
    }

    # generate the "Next" link, only when there is a later page to go to
    if ( $current_page < $last_page ) {
        my $next_href = $href_for_offset->( $current_page * $hits_per_page );
        $num_hits_info .= qq|<a href="$next_href">Next =&gt;</a>\n|;
    }

    # finish paging links
    $num_hits_info .= "</p>\n";
}
# Send the HTTP header followed by the finished page.  $base_url, $q,
# $report and $num_hits_info are interpolated into the template below.
print "Content-type: text/html\n\n", <<"END_OF_PAGE";
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv="Content-type"
content="text/html;charset=ISO-8859-1">
<link rel="stylesheet" type="text/css" href="$base_url/uscon.css">
<title>KinoSearch: $q</title>
</head>
<body>
<div id="navigation">
<form id="usconSearch" action="">
<strong>
Search the <a href="$base_url/index.html">US Constitution</a>:
</strong>
<input type="text" name="q" id="q" value="$q">
<input type="submit" value="=>">
<input type="hidden" name="offset" value="0">
</form>
</div><!--navigation-->
<div id="bodytext">
$report
$num_hits_info
<p style="font-size: smaller; color: #666">
<em>Powered by
<a href="http://www.rectangular.com/kinosearch/">
KinoSearch
</a>
</em>
</p>
</div><!--bodytext-->
</body>
</html>
END_OF_PAGE
COPYRIGHT
Copyright 2005-2010 Marvin Humphrey
LICENSE, DISCLAIMER, BUGS, etc.
See KinoSearch1 version 1.01.