#include "KinoSearch/Util/ToolSet.h"
#define KINO_WANT_TOKENBATCH_VTABLE
#include "KinoSearch/Analysis/TokenBatch.r"
#include "KinoSearch/Analysis/Token.r"
#include "KinoSearch/Index/Term.h"
/* After inversion, record how many like tokens occur in each group.
*/
static void
count_clusters(TokenBatch *self);
TokenBatch*
TokenBatch_new(Token *seed_token)
{
/* Constructor.  Allocate as a generic VArray first, then grow the
 * allocation to the full TokenBatch struct size and install the
 * TokenBatch vtable -- the project's manual "subclass" construction
 * idiom.  NOTE(review): assumes REALLOCATE aborts on failure; if it can
 * return NULL, the VArray allocation would leak -- confirm in ToolSet.h.
 */
TokenBatch *self = (TokenBatch*)VA_new(10);
self = REALLOCATE(self, 1, TokenBatch);
self->_ = &TOKENBATCH;
/* init iteration bookmark and cluster-count state (clusters are only
 * built later, by TokenBatch_invert) */
self->cur = 0;
self->inverted = false;
self->cluster_counts = NULL;
self->cluster_counts_size = 0;
/* process the seed token, if supplied -- NULL means start empty */
if (seed_token != NULL)
TokenBatch_append(self, seed_token);
return self;
}
void
TokenBatch_destroy(TokenBatch *self)
{
/* Destructor.  Release the cluster counts array first (may be NULL if
 * the batch was never inverted -- free(NULL) is a no-op), then delegate
 * to the parent class, which frees the elements and self.  Order
 * matters: after VA_destroy, self is no longer valid memory. */
free(self->cluster_counts);
VA_destroy((VArray*)self);
}
/* Return the next Token in the batch, advancing the iteration bookmark,
 * or NULL once all tokens have been visited.
 */
Token*
TokenBatch_next(TokenBatch *self)
{
    if (self->cur < self->size) {
        return (Token*)self->elems[ self->cur++ ];
    }
    /* iteration exhausted */
    return NULL;
}
void
TokenBatch_reset(TokenBatch *self)
{
/* Rewind the iteration bookmark so that TokenBatch_next and
 * TokenBatch_next_cluster start over from the first element. */
self->cur = 0;
}
/* Add a Token to the end of the batch, taking a refcount on it.
 * Appending is forbidden once the batch has been inverted.
 */
void
TokenBatch_append(TokenBatch *self, Token *token)
{
    /* safety check: new tokens would invalidate cluster counts */
    if (self->inverted) {
        CONFESS("Can't append tokens after inversion");
    }

    /* Grow in chunks to minimize reallocations: jump straight to 100,
     * double while under 10k elements, then add 10k at a time. */
    if (self->size >= self->cap) {
        u32_t target_cap;
        if (self->cap < 100) {
            target_cap = 100;
        }
        else if (self->size < 10000) {
            target_cap = self->cap * 2;
        }
        else {
            target_cap = self->cap + 10000;
        }
        VA_Grow(self, target_cap);
    }

    /* take a refcount and store the token (inlined VA_Push) */
    self->elems[ self->size++ ] = (Obj*)REFCOUNT_INC(token);
}
/* Return a pointer to the start of the next cluster of like tokens and
 * place the cluster's length in *count.  Returns NULL (with *count set
 * to 0) once the batch is exhausted.  Only valid after inversion.
 */
Token**
TokenBatch_next_cluster(TokenBatch *self, u32_t *count)
{
    Token **cluster;

    /* kill the iteration if we're out of tokens */
    if (self->cur == self->size) {
        *count = 0;
        return NULL;
    }

    /* clusters only exist after inversion */
    if (!self->inverted)
        CONFESS("TokenBatch not yet inverted");

    /* Don't read past the end of the cluster counts array.  The previous
     * test used '>', which let self->cur land exactly on
     * cluster_counts_size and read a zeroed slot, producing *count == 0
     * and an iterator that never advances; '>=' catches that case too. */
    if (self->cur >= self->cluster_counts_size)
        CONFESS("Tokens were added after inversion");

    /* place cluster count in passed-in var, advance bookmark */
    cluster = (Token**)(self->elems + self->cur);
    *count  = self->cluster_counts[ self->cur ];
    self->cur += *count;

    return cluster;
}
/* Assign a position to each token from its pos_inc, sort the tokens
 * lexically, and record the size of each cluster of identical tokens.
 * May only be called once per batch.
 */
void
TokenBatch_invert(TokenBatch *self)
{
    Token **tokens  = (Token**)self->elems;
    Token **limit   = tokens + self->size;
    i32_t token_pos = 0;

    /* thwart future attempts to append */
    if (self->inverted)
        CONFESS("TokenBatch has already been inverted");
    self->inverted = true;

    /* assign token positions */
    for ( ;tokens < limit; tokens++) {
        Token *const cur_token = *tokens;
        cur_token->pos = token_pos;
        /* Do the addition on unsigned values: signed overflow is
         * undefined behavior, which could let the compiler delete the
         * sanity check below.  Unsigned wrap-around is well-defined, and
         * the comparison still detects both a negative pos_inc and an
         * integer wrap. */
        token_pos = (i32_t)((u32_t)token_pos + (u32_t)cur_token->pos_inc);
        if (token_pos < cur_token->pos) {
            CONFESS("Token positions out of order: %ld %ld",
                (long)cur_token->pos, (long)token_pos);
        }
    }

    /* sort the tokens lexically, and hand off to cluster counting routine */
    qsort(self->elems, self->size, sizeof(Token*), Token_compare);
    count_clusters(self);
}
/* After inversion, record how many like tokens occur in each group.  A
 * count is stored at the index of each cluster's first token; the other
 * slots stay zero (the array is calloc'd, with one extra slot so that it
 * is never zero-length).
 */
static void
count_clusters(TokenBatch *self)
{
    Token **const tokens = (Token**)self->elems;
    u32_t  *const counts = CALLOCATE(self->size + 1, u32_t);
    u32_t start = 0;

    /* save the cluster counts */
    self->cluster_counts_size = self->size;
    self->cluster_counts      = counts;

    while (start < self->size) {
        Token *const head      = tokens[start];
        char  *const head_text = head->text;
        const size_t head_len  = head->len;
        u32_t end = start + 1;

        /* scan forward until the text no longer matches the cluster head */
        while (   end < self->size
               && tokens[end]->len == head_len
               && memcmp(tokens[end]->text, head_text, head_len) == 0
        ) {
            end++;
        }

        /* record a count at the position of the cluster's first token,
         * then jump to the first token we haven't seen */
        counts[start] = end - start;
        start = end;
    }
}
/* Copyright 2006-2007 Marvin Humphrey
*
* This program is free software; you can redistribute it and/or modify
* under the same terms as Perl itself.
*/