c_src/KinoSearch/Search/PhraseScorer.c

#include "KinoSearch/Util/ToolSet.h"

#define KINO_WANT_PHRASESCORER_VTABLE
#include "KinoSearch/Search/PhraseScorer.r"

#include "KinoSearch/Index/PostingList.r"
#include "KinoSearch/Posting/ScorePosting.r"
#include "KinoSearch/Search/Similarity.r"
#include "KinoSearch/Search/ScoreProx.r"
#include "KinoSearch/Search/Tally.r"
#include "KinoSearch/Util/Native.r"
#include "KinoSearch/Util/Int.r"

/* Mark this scorer as invalid/finished.
 */
static bool_t
invalidate(PhraseScorer *self);

/* Return the highest value for doc() from an array of PostingList objects.
 */
static u32_t
highest_doc(PostingList **plists, u32_t num_elements);

/* Build up the array of positions which match this query within the current
 * doc.
 */
static void
build_prox(PhraseScorer *self);

PhraseScorer*
PhraseScorer_new(Similarity *sim, VArray *plists, VArray *phrase_offsets,
                 void *weight, float weight_val, u32_t slop)
{
    u32_t i;
    CREATE(self, PhraseScorer, PHRASESCORER);

    /* init */
    self->tally            = Tally_new();
    self->sprox            = ScoreProx_new();
    Tally_Add_SProx(self->tally, self->sprox);
    self->anchor_set       = BB_new(0);
    self->raw_prox_bb      = BB_new(0);
    self->phrase_freq      = 0.0;
    self->phrase_boost     = 0.0;
    self->first_time       = true;
    self->more             = true;

    /* extract posting lists and phrase offsets for quick access */
    self->num_elements   = plists->size;
    self->plists         = MALLOCATE(self->num_elements, PostingList*);
    self->phrase_offsets = MALLOCATE(self->num_elements, u32_t);
    for (i = 0; i < self->num_elements; i++) {
        PostingList *const plist = (PostingList*)VA_Fetch(plists, i);
        Int *const offset = (Int*)VA_Fetch(phrase_offsets, i);
        if (plist == NULL || offset == NULL)
            CONFESS("Missing element %u", i);
        self->plists[i] = REFCOUNT_INC(plist);
        self->phrase_offsets[i] = offset->value;
    }

    /* assign */
    self->sim             = REFCOUNT_INC(sim);
    self->weight_value    = weight_val;
    self->weight          = Native_new(weight);
    self->slop            = slop;

    return self;
}

void
PhraseScorer_destroy(PhraseScorer *self) 
{
    size_t i;
    PostingList **plists = self->plists;

    for (i = 0; i < self->num_elements; i++) {
        REFCOUNT_DEC(plists[i]);
    }
    free(self->plists);
    free(self->phrase_offsets);

    REFCOUNT_DEC(self->sprox);
    REFCOUNT_DEC(self->tally);
    REFCOUNT_DEC(self->raw_prox_bb);
    REFCOUNT_DEC(self->sim);
    REFCOUNT_DEC(self->anchor_set);
    REFCOUNT_DEC(self->weight);

    free(self);
}

static bool_t
invalidate(PhraseScorer *self)
{
    self->more = false;
    return false;
}

static u32_t
highest_doc(PostingList **plists, u32_t num_elements)
{
    u32_t highest = 0;

    while(num_elements--) {
        u32_t candidate = PList_Get_Doc_Num(*plists); 
        if (candidate > highest)
            highest = candidate;
        plists++;
    }

    return highest;
}

bool_t
PhraseScorer_next(PhraseScorer *self)
{
    if (self->first_time) {
        return Scorer_Skip_To(self, 0);
    }
    else if (self->more) {
        const u32_t target = PList_Get_Doc_Num(self->plists[0]) + 1;
        return Scorer_Skip_To(self, target);
    }
    else {
        return false;
    }
}

bool_t
PhraseScorer_skip_to(PhraseScorer *self, u32_t target) 
{
    PostingList **const plists       = self->plists;
    const u32_t         num_elements = self->num_elements;
    u32_t               highest      = 0;

    self->phrase_freq = 0.0;
    self->doc_num    = DOC_NUM_SENTINEL; 

    if (self->first_time) {
        u32_t i;
        self->first_time = false;
        /* advance all posting lists */
        for (i = 0; i < num_elements; i++) {
            if ( !PList_Next(plists[i]) )
                return invalidate(self);
        }
        highest = highest_doc(plists, num_elements);
    }
    else {
        /* seed the search, advancing only one posting list */
        if ( !PList_Next(plists[0]) )
            return false;
        highest = PList_Get_Doc_Num(plists[0]);
    }

    /* find a doc which contains all the terms */
    while (1) {
        u32_t i;
        bool_t agreement = true;

        /* scoot all posting lists up */
        for (i = 0; i < num_elements; i++) {
            PostingList *const plist = plists[i];
            u32_t candidate = PList_Get_Doc_Num(plist);

            /* maybe raise the bar */
            if (highest < candidate)
                highest = candidate;
            if (target < highest)
                target = highest;

            /* scoot this posting list up */
            if (candidate < target) {
                if ( !PList_Skip_To(plist, target) )
                    return invalidate(self);
                /* if somebody's raised the bar, don't wait till next loop */
                highest = PList_Get_Doc_Num(plist);
            }
        }

        /* if posting lists don't agree, send back through the loop */
        for (i = 0; i < num_elements; i++) {
            PostingList *const plist = plists[i];
            const u32_t candidate    = PList_Get_Doc_Num(plist);
            if (candidate != highest)
                agreement = false;
        }

        if (agreement && highest >= target) {
            self->phrase_freq = PhraseScorer_Calc_Phrase_Freq(self);
            if (self->phrase_freq == 0.0) {
                target += 1;
            }
            else {
                /* success! */
                self->doc_num  = highest;
                build_prox(self);
                return true;
            }
        }
    }
}

static void
build_prox(PhraseScorer *self)
{
    ScoreProx *const sprox    = self->sprox;
    ByteBuf *const anchor_set = self->anchor_set;
    u32_t *anchors            = (u32_t*)anchor_set->ptr;
    u32_t *anchors_end        = (u32_t*)BBEND(anchor_set);
    u32_t *dest;

    /* set num_prox */
    sprox->num_prox = (anchor_set->len / sizeof(u32_t)) * self->num_elements;
    
    /* allocate space for the prox as needed and get a pointer to write to */
    BB_GROW(self->raw_prox_bb, sprox->num_prox * sizeof(u32_t));
    self->raw_prox_bb->len = sprox->num_prox * sizeof(u32_t);
    dest = (u32_t*)self->raw_prox_bb->ptr;
    sprox->prox = dest;

    /* BUG!  The anchor set is not necessarily the first element in the 
     * phrase -- it's the rarest. 
     */
    /* create one group for each anchor */
    for (  ; anchors < anchors_end; anchors++) {
        u32_t i;
        /* create 1 pos for each element */
        for (i = 0; i < self->num_elements; i++) {
            *dest++ = *anchors + i;
        }
    }
}

static INLINE u32_t 
winnow_anchors(u32_t *anchors_start, const u32_t *const anchors_end,
               const u32_t *candidates, const u32_t *const candidates_end,
               u32_t offset) 
{                          
    u32_t *anchors = anchors_start;
    u32_t *anchors_found = anchors_start;
    u32_t target_anchor;
    u32_t target_candidate;

    /* Safety check, so there's no chance of a bad dereference. */
    if (anchors_start == anchors_end || candidates == candidates_end)
        return 0;

    /* This function is a loop that finds terms that can continue a phrase.
     * It overwrites the anchors in place, and returns the number remaining.
     * The basic algorithm is to alternately increment the candidates' pointer
     * until it is at or beyond its target position, and then increment the 
     * anchors' pointer until it is at or beyond its target.  The non-standard
     * form is to avoid unnecessary comparisons.  I have not tested this
     * loop for speed, but glancing at the object code produced (objdump -S) 
     * it appears to be significantly faster than the nested loop alternative.
     * But given the vagaries of modern processors, it merits actual testing.*/

 SPIN_CANDIDATES:
    target_candidate = *anchors + offset;
    while (*candidates < target_candidate) {
        if (++candidates == candidates_end) goto DONE;
    } 
    if (*candidates == target_candidate) goto MATCH;
    goto SPIN_ANCHORS;  

 SPIN_ANCHORS: 
    target_anchor = *candidates - offset;
    while (*anchors < target_anchor) { 
        if (++anchors == anchors_end) goto DONE;
    };
    if (*anchors == target_anchor) goto MATCH;
    goto SPIN_CANDIDATES;  

 MATCH:       
    *anchors_found++ = *anchors;
    if (++anchors == anchors_end) goto DONE;
    goto SPIN_CANDIDATES; 

 DONE:
    /* return number of anchors remaining */
    return anchors_found - anchors_start; 
}

float
PhraseScorer_calc_phrase_freq(PhraseScorer *self) 
{
    PostingList **const plists   = self->plists;
    u32_t  *phrase_offsets       = self->phrase_offsets;
    u32_t i;

    /* create a overwriteable anchor set from the first posting */
    ScorePosting *posting = (ScorePosting*)PList_Get_Posting(plists[0]);
    u32_t anchors_remaining = posting->freq;
    ByteBuf *const anchor_set  = self->anchor_set;
    u32_t *anchors_start, *anchors_end, *anchors;

    BB_GROW(self->anchor_set, anchors_remaining * sizeof(u32_t));
    anchors_start = (u32_t *)anchor_set->ptr;
    anchors_end   = anchors_start + anchors_remaining;
    anchors       = anchors_start;
    for (i = 0; ; i++) {
        if (anchors >= anchors_end) break;
        /* copy anchor positions skipping those less than initial offset */
        else if (phrase_offsets[0] > posting->prox[i]) anchors_end--;
        /* subtract phrase_offsets[0] to account for leading virtual terms */
        else *anchors++ = posting->prox[i] - phrase_offsets[0];
    }

    /* match the positions of other terms against the anchor set */
    anchors_remaining = anchors_end - anchors_start;
    for (i = 1; i < self->num_elements && anchors_remaining; i++) {
        /* prepare the non-overwritten list of potential next terms */
        ScorePosting *posting = (ScorePosting *)PList_Get_Posting(plists[i]);
        u32_t *candidates_start = posting->prox;
        u32_t *candidates_end   = candidates_start + posting->freq;

        /* reduce anchor_set in place to those that match the next term */
        anchors_remaining = winnow_anchors(anchors_start, anchors_end,
                                           candidates_start, candidates_end,
                                           phrase_offsets[i]);
        /* adjust end for number of anchors that remain */
        anchors_end = anchors_start + anchors_remaining;
    }

    /* the number of anchors left is the phrase freq */
    return anchors_remaining;
}

u32_t
PhraseScorer_doc(PhraseScorer *self) 
{
    return self->doc_num;
}

Tally*
PhraseScorer_tally(PhraseScorer *self) 
{
    ScorePosting *posting = (ScorePosting*)PList_Get_Posting(self->plists[0]);
    self->tally->score = Sim_TF(self->sim, self->phrase_freq) 
         * self->weight_value * posting->impact;
    return self->tally;
}

/* Copyright 2006-2007 Marvin Humphrey
 *
 * This program is free software; you can redistribute it and/or modify
 * under the same terms as Perl itself.
 */
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	Go to GitHub issues (only if GitHub is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)
Keyboard Shortcuts