#include "KinoSearch/Util/ToolSet.h"
#include <stdio.h>
#define KINO_WANT_TERMVECTORSWRITER_VTABLE
#include "KinoSearch/Index/TermVectorsWriter.r"
#include "KinoSearch/Analysis/Token.r"
#include "KinoSearch/Analysis/TokenBatch.r"
#include "KinoSearch/InvIndex.r"
#include "KinoSearch/Index/SegInfo.r"
#include "KinoSearch/Index/TermVectorsReader.r"
#include "KinoSearch/Util/Native.r"
#include "KinoSearch/Util/IntMap.r"
#include "KinoSearch/Store/Folder.r"
#include "KinoSearch/Store/OutStream.r"
#include "KinoSearch/Store/InStream.r"
/* Constructor.  Takes a reference on both the invindex and the segment info,
 * then opens the two output streams this writer feeds, both named after the
 * segment: "<seg_name>.tv" and "<seg_name>.tvxtemp".
 */
TermVectorsWriter*
TVWriter_new(InvIndex *invindex, SegInfo *seg_info)
{
    Folder  *folder = invindex->folder;
    ByteBuf *path   = BB_CLONE(seg_info->seg_name);
    CREATE(self, TermVectorsWriter, TERMVECTORSWRITER);

    /* retain the objects this writer hangs on to */
    self->invindex = REFCOUNT_INC(invindex);
    self->seg_info = REFCOUNT_INC(seg_info);

    /* first stream: "<seg_name>.tv" */
    BB_Cat_Str(path, ".tv", 3);
    self->tv_out = Folder_Open_OutStream(folder, path);

    /* second stream: extend the same buffer so ".tv" becomes ".tvxtemp" */
    BB_Cat_Str(path, "xtemp", 5);
    self->tvx_out = Folder_Open_OutStream(folder, path);

    /* the scratch filename is no longer needed */
    REFCOUNT_DEC(path);

    return self;
}
/* Serialize the token clusters of `batch` into a freshly created ByteBuf and
 * return it.
 *
 * Layout of the returned buffer, as written below:
 *
 *   - VINT_MAX_BYTES bytes reserved up front, filled in last with the number
 *     of postings via ENCODE_FULL_VINT.
 *   - then, once per cluster returned by TokenBatch_Next_Cluster:
 *       vint  overlap  (value from StrHelp_string_diff against the
 *                       previous cluster's text)
 *       vint  length of the remaining text (token->len - overlap)
 *       bytes the remaining text itself
 *       vint  freq     (count reported by TokenBatch_Next_Cluster)
 *       freq times: vint pos, vint start_offset, vint end_offset
 *
 * `self` is not consulted.  The batch is reset before iteration.
 *
 * NOTE(review): the returned ByteBuf comes straight from BB_new, so the
 * caller presumably owns the reference -- confirm against callers.
 */
ByteBuf*
TVWriter_tv_string(TermVectorsWriter *self, TokenBatch *batch)
{
/* text of the previous cluster; starts out empty so the first comparison
 * is made against a zero-length string */
char *last_text = "";
size_t last_len = 0;
ByteBuf *tv_string = BB_new(20 + batch->size * 8); /* generous */
u32_t num_postings = 0;
char *dest;
Token **tokens;
u32_t freq;
UNUSED_VAR(self); /* this routine never reads the writer's state */
/* leave space for a vint indicating the number of postings. */
tv_string->len = VINT_MAX_BYTES;
TokenBatch_Reset(batch);
/* one iteration per cluster; `freq` is set by TokenBatch_Next_Cluster and
 * is used below as the number of Token pointers to walk in `tokens` */
while ( (tokens = TokenBatch_Next_Cluster(batch, &freq)) != NULL ) {
Token *token = *tokens;
/* presumably the number of leading bytes shared with the previous
 * cluster's text -- it is used below as a prefix length */
i32_t overlap = StrHelp_string_diff(last_text, token->text,
last_len, token->len);
char *ptr;
/* upper bound on the buffer size after this cluster is appended:
 * every vint is budgeted at VINT_MAX_BYTES, three vints per position */
size_t new_size = tv_string->len
+ VINT_MAX_BYTES /* overlap */
+ VINT_MAX_BYTES /* length of string diff */
+ (token->len - overlap) /* diff char data */
+ VINT_MAX_BYTES /* num prox */
+ (VINT_MAX_BYTES * freq * 3); /* pos data */
/* allocate for worst-case scenario */
BB_GROW(tv_string, new_size);
/* take the write cursor only after growing, since it is derived from the
 * buffer (presumably ptr + len -- confirm BBEND) */
ptr = BBEND(tv_string);
/* track number of postings */
num_postings += 1;
/* append the string diff to the tv_string */
ENCODE_VINT(overlap, ptr);
ENCODE_VINT( (token->len - overlap), ptr);
memcpy(ptr, (token->text + overlap), (token->len - overlap));
ptr += token->len - overlap;
/* save text and text_len for comparison next loop; this aliases the
 * token's own text rather than copying it */
last_text = token->text;
last_len = token->len;
/* append the number of positions for this term */
ENCODE_VINT(freq, ptr);
/* NOTE(review): the do/while emits at least one position triple and
 * pre-decrements the unsigned `freq`, so it relies on freq >= 1 here --
 * confirm TokenBatch_Next_Cluster never reports 0. */
do {
/* add position, start_offset, and end_offset to tv_string */
ENCODE_VINT(token->pos, ptr);
ENCODE_VINT(token->start_offset, ptr);
ENCODE_VINT(token->end_offset, ptr);
/* `&&` short-circuits: the token pointer is only advanced while
 * positions remain, so the slot after the last one is never read */
} while (--freq && (token = *++tokens));
/* set new length to the bytes actually written */
tv_string->len = ptr - tv_string->ptr;
}
/* go back and start the term vector string with the number of postings,
 * filling the space reserved at the top of this function */
dest = tv_string->ptr;
ENCODE_FULL_VINT(num_postings, dest);
return tv_string;
}
/* Copy the term vector records of an existing segment into this writer.
 *
 * For every doc number in [0, max_doc) that `doc_map` does not map to -1,
 * the raw record is read from `tv_reader` and appended to the tv stream,
 * and an index entry of two longs -- the record's starting position in the
 * tv stream, then its length in bytes -- is appended to the tvx stream.
 *
 * Docs that `doc_map` maps to -1 are skipped and get no index entry.
 * A `max_doc` of 0 is a no-op.
 */
void
TVWriter_add_segment(TermVectorsWriter *self, TermVectorsReader *tv_reader,
                     IntMap *doc_map, u32_t max_doc)
{
    OutStream *tv_out  = self->tv_out;
    OutStream *tvx_out = self->tvx_out;
    ByteBuf   *bb;
    u32_t      orig;

    /* Bail if the supplied segment is empty.  This happens before the
     * scratch buffer is created so the early return cannot leak it. */
    if (max_doc == 0) {
        return;
    }

    /* scratch buffer reused for every record */
    bb = BB_new(0);

    for (orig = 0; orig < max_doc; orig++) {
        /* skip deleted docs */
        if (IntMap_Get(doc_map, orig) == -1)
            continue;

        /* write file pointer: where this record will start in the tv file */
        OutStream_Write_Long( tvx_out, OutStream_STell(tv_out) );

        /* copy the raw record */
        TVReader_Read_Record(tv_reader, orig, bb);
        OutStream_Write_Bytes(tv_out, bb->ptr, bb->len);

        /* write length of entry */
        OutStream_Write_Long(tvx_out, bb->len);

        /* empty the scratch buffer before the next record */
        bb->len = 0;
    }

    REFCOUNT_DEC(bb);
}
/* Finish writing term vectors for this segment.
 *
 * Closes both output streams, produces the final "<seg_name>.tvx" index
 * file, and stores a "term_vectors" metadata hash (containing "format")
 * in the segment info.
 *
 * If `doc_remap` is NULL, the temporary "<seg_name>.tvxtemp" file is simply
 * renamed to "<seg_name>.tvx".  Otherwise the temporary file is read back
 * as a sequence of fixed-size 16-byte entries, entry i is placed at slot
 * IntMap_Get(doc_remap, i), and the reordered table is written out as
 * "<seg_name>.tvx".
 */
void
TVWriter_finish(TermVectorsWriter *self, IntMap *doc_remap)
{
    Folder  *folder           = self->invindex->folder;
    Hash    *metadata         = Hash_new(0);
    ByteBuf *tvxtemp_filename = BB_CLONE(self->seg_info->seg_name);
    ByteBuf *tvx_filename     = BB_CLONE(self->seg_info->seg_name);

    /* build filenames */
    BB_Cat_Str(tvxtemp_filename, ".tvxtemp", 8);
    BB_Cat_Str(tvx_filename, ".tvx", 4);

    /* close down the output streams */
    OutStream_SClose(self->tv_out);
    OutStream_SClose(self->tvx_out);

    if (doc_remap == NULL) {
        /* no reordering needed: the temp index is already the final one */
        Folder_Rename_File(folder, tvxtemp_filename, tvx_filename);
    }
    /* remap document numbers */
    else {
        /* bytes per index entry: two u64_t slots per document */
        const size_t   entry_size = 16;
        OutStream     *final   = Folder_Open_OutStream(folder, tvx_filename);
        InStream      *orig    = Folder_Open_InStream(folder,
                                                      tvxtemp_filename);
        u32_t          max_doc = InStream_SLength(orig) / entry_size;
        /* Sizes and offsets are computed as size_t rather than u32_t so
         * they cannot wrap at 32 bits for large document counts. */
        u64_t *const   entries = MALLOCATE((size_t)max_doc * 2, u64_t);
        u32_t          i;

        for (i = 0; i < max_doc; i++) {
            /* read bytes into memory, remapping as we go.
             *
             * NOTE(review): this assumes doc_remap sends every i in
             * [0, max_doc) to a distinct slot below max_doc.  A -1 or
             * out-of-range result would place `buf` outside `entries`,
             * and an unfilled slot would be written out uninitialized
             * -- confirm callers guarantee this. */
            const u32_t new_doc = IntMap_Get(doc_remap, i);
            char *const buf     = (char*)entries
                                  + (size_t)new_doc * entry_size;
            InStream_Read_Bytes(orig, buf, entry_size);
        }

        /* blast out remapped info */
        OutStream_Write_Bytes(final, (char*)entries,
                              (size_t)max_doc * entry_size);

        /* clean up */
        InStream_SClose(orig);
        OutStream_SClose(final);
        REFCOUNT_DEC(orig);
        REFCOUNT_DEC(final);
        free(entries);
    }

    /* generate and store metadata */
    Hash_Store_I64(metadata, "format", 6, (i64_t)TVWRITER_FORMAT);
    SegInfo_Add_Metadata(self->seg_info, "term_vectors", 12, (Obj*)metadata);

    /* clean up */
    REFCOUNT_DEC(metadata);
    REFCOUNT_DEC(tvxtemp_filename);
    REFCOUNT_DEC(tvx_filename);
}
/* Destructor.  Drops this writer's references to the invindex, the segment
 * info, and both output streams, then frees the writer struct itself.
 *
 * No stream is explicitly closed here; in this file the OutStream_SClose
 * calls are made by TVWriter_finish.
 */
void
TVWriter_destroy(TermVectorsWriter *self)
{
/* release the two references retained in TVWriter_new */
REFCOUNT_DEC(self->invindex);
REFCOUNT_DEC(self->seg_info);
/* release the streams opened in TVWriter_new */
REFCOUNT_DEC(self->tv_out);
REFCOUNT_DEC(self->tvx_out);
/* finally release the object's own storage */
free(self);
}
/* Copyright 2006-2007 Marvin Humphrey
*
* This program is free software; you can redistribute it and/or modify it
* under the same terms as Perl itself.
*/