package
KinoSearch::Index::DelDocs;
use
strict;
use
warnings;
# instance vars:
# Per-object deletion counts, kept outside the objects themselves.  Each
# entry is keyed by the stringified object ("$self"); new() seeds it with 0
# and DESTROY removes it.
my
%num_deletions
;
# Constructor.  Delegates object creation to the parent class (without
# forwarding any further arguments) and starts the new object's deletion
# count at zero.
sub new {
    my $class = shift;
    my $self  = $class->SUPER::new;

    # the counter lives in the file-scoped hash, keyed by the stringified
    # object
    $num_deletions{"$self"} = 0;

    return $self;
}
# Read a deletions file if one exists.
#
# Arguments: the invindex to read from and the name of the deletions file.
# When the invindex has no such file, the object is left untouched.
sub read_deldocs {
    my ( $self, $invindex, $filename ) = @_;

    # load the file into memory if it's there
    if ( $invindex->file_exists($filename) ) {
        my $instream = $invindex->open_instream($filename);

        # header: size of the bit data in bytes, then the deletion count
        my ( $byte_size, $deletion_count ) = $instream->lu_read('ii');
        $num_deletions{"$self"} = $deletion_count;

        # the remainder is the raw bit data, $byte_size bytes long
        my $bits_template = "a$byte_size";
        $self->set_bits( $instream->lu_read($bits_template) );

        $instream->close;
    }
}
# Blast out a hard copy of the deletions held in memory.
#
# Arguments: the invindex to write to, the name of the deletions file, and
# the max doc number, which determines how many bytes of bit data get
# written.  Any existing file of the same name is deleted first.
sub write_deldocs {
    my ( $self, $invindex, $filename, $max_doc ) = @_;

    # get rid of any previous copy before opening a fresh outstream
    $invindex->delete_file($filename)
        if $invindex->file_exists($filename);

    my $outstream = $invindex->open_outstream($filename);

    # pad out deldocs->bits
    $self->set_capacity($max_doc);

    # write header (byte size, deletion count) followed by deletions data
    my $byte_size = ceil( $max_doc / 8 );
    my $template  = "iia$byte_size";
    $outstream->lu_write(
        $template,
        $byte_size,
        $num_deletions{"$self"},
        $self->get_bits,
    );

    $outstream->close;
}
# Mark a doc as deleted.
#
# The parent's set() is only invoked, and the deletion count only bumped,
# when the doc is not already marked, so repeated calls for the same doc
# number count once.
sub set {
    my ( $self, $doc_num ) = @_;

    my $already_deleted = $self->get($doc_num);

    # ... only if it isn't already deleted
    if ( !$already_deleted ) {
        $self->SUPER::set($doc_num);
        $num_deletions{"$self"}++;
    }
}
# Delete all the docs represented by a TermDocs object.
#
# The XS helper _delete_by_term_docs() does the marking and reports how many
# docs it newly marked; that figure is added to this object's deletion count.
sub delete_by_term_docs {
    my ( $self, $term_docs ) = @_;

    my $newly_deleted = _delete_by_term_docs( $self, $term_docs );
    $num_deletions{"$self"} += $newly_deleted;
}
# Undelete a doc.
#
# The parent's clear() is only invoked, and the deletion count only lowered,
# when the doc is currently marked as deleted.
sub clear {
    my ( $self, $doc_num ) = @_;

    my $was_deleted = $self->get($doc_num);

    # ... only if it was deleted before
    if ($was_deleted) {
        $self->SUPER::clear($doc_num);
        $num_deletions{"$self"}--;
    }
}
# Accessor: the deletion count currently recorded for this object.
sub get_num_deletions {
    my $self = shift;
    return $num_deletions{"$self"};
}
# Map around deleted documents.
#
# Hands $max and $offset to the XS routine _generate_doc_map() and wraps
# whatever it returns in a KinoSearch::Util::IntMap, which is returned.
sub generate_doc_map {
    my ( $self, $max, $offset ) = @_;

    # scalar() keeps the XS call in scalar context
    return KinoSearch::Util::IntMap->new(
        scalar $self->_generate_doc_map( $max, $offset ) );
}
# If these get implemented, we'll need to write a range_count(first, last)
# method for BitVector.

# Not implemented: simply calls todo_death() on the invocant.
sub bulk_set {
    my $self = shift;
    return $self->todo_death;
}
# Not implemented: simply calls todo_death() on the invocant.
sub bulk_clear {
    my $self = shift;
    return $self->todo_death;
}
# Deliberate no-op; returns nothing.
sub close {
    return;
}
# Destructor.  Drops this object's entry from the file-scoped deletion-count
# hash, then lets the parent class run its own DESTROY.
sub DESTROY {
    my ($self) = @_;

    # remove the counter first so the entry keyed by this object's
    # stringification does not outlive the object
    delete $num_deletions{"$self"};

    $self->SUPER::DESTROY;
}
1;
__END__
__XS__
MODULE = KinoSearch PACKAGE = KinoSearch::Index::DelDocs
SV*
_generate_doc_map(deldocs, max, offset)
    BitVector *deldocs;
    I32        max;
    I32        offset;
CODE:
    /* Build the doc map SV in C and hand back a reference to it; the
     * reference takes over the freshly created SV without bumping its
     * refcount. */
    RETVAL = newRV_noinc(
        Kino_DelDocs_generate_doc_map(deldocs, max, offset)
    );
OUTPUT: RETVAL

I32
_delete_by_term_docs(deldocs, term_docs)
    BitVector *deldocs;
    TermDocs  *term_docs;
PREINIT:
    I32 num_deleted;
CODE:
    /* Mark every doc supplied by term_docs and report how many were newly
     * marked. */
    num_deleted = Kino_DelDocs_delete_by_term_docs(deldocs, term_docs);
    RETVAL = num_deleted;
OUTPUT: RETVAL
__H__
#ifndef H_KINOSEARCH_DELDOCS
#define H_KINOSEARCH_DELDOCS 1
#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#include "KinoSearchIndexTermDocs.h"
#include "KinoSearchUtilBitVector.h"
/* Build a string SV holding one I32 per doc number in [0, max): -1 for a
 * doc flagged in the bit vector, otherwise offset plus a running count of
 * the unflagged docs seen so far.  Arguments: bit vector, max, offset. */
SV* Kino_DelDocs_generate_doc_map(BitVector*, I32, I32);
/* Iterate the TermDocs, setting the bit for every doc it yields that is not
 * already set.  Returns the number of bits newly set. */
I32 Kino_DelDocs_delete_by_term_docs(BitVector*, TermDocs*);
#endif /* include guard */
__C__
#include "KinoSearchIndexDelDocs.h"
/* Build a doc map: a string SV whose buffer holds `max` packed I32 values,
 * one per original doc number in [0, max).
 *
 * deldocs - bit vector consulted via Kino_BitVec_get for each doc number
 * max     - number of entries to generate
 * offset  - added to the new number assigned to each surviving doc
 *
 * An entry is -1 when the doc's bit is set; otherwise it is offset plus the
 * count of unset docs that precede it.  Returns the newly created SV.
 */
SV*
Kino_DelDocs_generate_doc_map(BitVector *deldocs, I32 max, I32 offset) {
    SV  *doc_map_sv;
    I32 *doc_map;
    I32  new_doc_num;
    I32  i;   /* same type as max, so the comparison below is not mixed */

    /* allocate space for the doc map, plus one byte for a trailing NUL */
    doc_map_sv = newSV(max * sizeof(I32) + 1);
    SvCUR_set(doc_map_sv, max * sizeof(I32));
    SvPOK_on(doc_map_sv);

    /* Perl expects every PV buffer to be NUL-terminated; the spare byte
     * allocated above exists for this but was never written */
    *SvEND(doc_map_sv) = '\0';

    doc_map = (I32*)SvPVX(doc_map_sv);

    /* -1 for a deleted doc, a new number otherwise */
    new_doc_num = 0;
    for (i = 0; i < max; i++) {
        if (Kino_BitVec_get(deldocs, i))
            *doc_map++ = -1;
        else
            *doc_map++ = offset + new_doc_num++;
    }

    return doc_map_sv;
}
/* Walk a TermDocs iterator and set the bit for every doc it yields.
 *
 * deldocs   - bit vector receiving the marks
 * term_docs - iterator advanced with next() and queried with get_doc()
 *
 * Docs whose bit is already set are skipped, so the return value counts
 * only the docs newly marked by this call.
 */
I32
Kino_DelDocs_delete_by_term_docs(BitVector* deldocs, TermDocs* term_docs) {
    I32 newly_deleted = 0;

    while (term_docs->next(term_docs)) {
        const I32 doc_num = term_docs->get_doc(term_docs);

        /* only touch (and count) docs not yet marked */
        if (!Kino_BitVec_get(deldocs, doc_num)) {
            Kino_BitVec_set(deldocs, doc_num);
            newly_deleted++;
        }
    }

    return newly_deleted;
}
__POD__
=begin devdocs
=head1 NAME
KinoSearch::Index::DelDocs - manage documents deleted from an invindex
=head1 DESCRIPTION
DelDocs provides the low-level mechanisms for declaring a document deleted
from a segment, and for finding out whether or not a particular document has
been deleted.
Note that documents are not actually gone from the invindex until the segment
gets rewritten.
=head1 TODO
Consider ways to synchronize instances of this class so that there will be
exactly one instance per segment. That way, if an InvIndexer uses the instance
to delete a document, readers would have the modified vecstring available
right away without having to reread the .del file.
=head1 COPYRIGHT
Copyright 2005-2009 Marvin Humphrey
=head1 LICENSE, DISCLAIMER, BUGS, etc.
See L<KinoSearch|KinoSearch> version 0.165.
=end devdocs
=cut