#include "KinoSearch/Util/ToolSet.h"

#include "math.h"

#define KINO_WANT_SIMILARITY_VTABLE
#include "KinoSearch/Search/Similarity.r"


Similarity*
Sim_new(const char *class_name) 
{
    u32_t i;
    CREATE_SUBCLASS(self, class_name, Similarity, SIMILARITY);

    /* cache decoded norms and proximity boost factors */
    self->norm_decoder = MALLOCATE(256, float);
    self->prox_decoder = MALLOCATE(256, float);
    for (i = 0; i < 256; i++) {
        self->norm_decoder[i] = Sim_Decode_Norm(self, i);
        self->prox_decoder[i] = Sim_Prox_Boost(self, i);
    }

    return self;
}

void
Sim_destroy(Similarity *self) 
{
    free(self->norm_decoder);
    free(self->prox_decoder);
    free(self);
}

void
Sim_serialize(Similarity *self, ByteBuf *target)
{
    const u32_t class_name_len = strlen(self->_->class_name);
    const u32_t new_len = target->len + sizeof(u32_t) + class_name_len;
    char *ptr;

    /* make room */
    BB_GROW(target, new_len);
    ptr = BBEND(target);
    target->len = new_len;

    /* only the class name */
    MATH_ENCODE_U32(class_name_len, ptr);
    ptr += sizeof(u32_t);
    memcpy(ptr, (char*)self->_->class_name, class_name_len);
}

Similarity*
Sim_deserialize(ViewByteBuf *serialized)
{
    u32_t class_name_len;
    ByteBuf *class_name;
    Similarity *self;

    /* extract class name, sanity checking as we go */
    if (serialized->len < sizeof(u32_t))
        CONFESS("Not enough bytes: %u", serialized->len);
    MATH_DECODE_U32(class_name_len, serialized->ptr);
    serialized->len -= sizeof(u32_t);
    serialized->ptr += sizeof(u32_t);
    if (serialized->len < class_name_len)
        CONFESS("Not enough bytes: %u %u", class_name_len, serialized->len);
    class_name = BB_new_str(serialized->ptr, class_name_len);
    serialized->len -= class_name_len;
    serialized->ptr += class_name_len;

    /* just create a new object */
    self = Sim_new(class_name->ptr);
    REFCOUNT_DEC(class_name);
    
    return self;
}

float
Sim_tf(Similarity *self, float freq) 
{
    UNUSED_VAR(self);
    return( sqrt(freq) );
}

u32_t
Sim_encode_norm(Similarity *self, float f) 
{
    u32_t norm;
    UNUSED_VAR(self);

    if (f < 0.0)
        f = 0.0;

    if (f == 0.0) {
        norm = 0;
    }
    else {
        const u32_t bits = *(u32_t*)&f;
        u32_t mantissa   = (bits & 0xffffff) >> 21;
        u32_t exponent   = (((bits >> 24) & 0x7f)-63) + 15;

        if (exponent > 31) {
            exponent = 31;
            mantissa = 7;
        }
         
        norm = (exponent << 3) | mantissa;
    }

    return norm;
}

float
Sim_decode_norm(Similarity *self, u32_t input) 
{
    u8_t byte = input & 0xFF;
    u32_t result;
    UNUSED_VAR(self);

    if (byte == 0) {
        result = 0;
    }
    else {
        const u32_t mantissa = byte & 7;
        const u32_t exponent = (byte >> 3) & 31;
        result = ((exponent+(63-15)) << 24) | (mantissa << 21);
    }
    
    return *(float*)&result;
}

float
Sim_query_norm(Similarity *self, float sum_of_squared_weights)
{
    UNUSED_VAR(self);
    if (sum_of_squared_weights == 0.0f) /* guard against div by zero */
        return 0;
    else
        return ( 1.0f / sqrt(sum_of_squared_weights) );
}

float
Sim_prox_boost(Similarity *self, u32_t distance)
{
    UNUSED_VAR(self);
    if (distance == 0)
        return 0;
    else 
        return 1.0/(float)distance;

}

float
Sim_prox_coord(Similarity *self, u32_t *prox, u32_t num_prox)
{
    float *prox_decoder = self->prox_decoder;
    u32_t *a = prox;
    u32_t *const limit = prox + num_prox;
    float bonus = 0;

    /* add to bonus for each pair of positions within 256 tokens */
    for ( ; a < limit - 1; a++) {
        u32_t *b = a + 1;
        u32_t distance = *b - *a;
        for ( ; distance < 256 && b < limit; b++) {
            bonus += prox_decoder[distance];
            distance = *b - *a;
        }
    }

    /* Damp the scoring multiplier.  Ideally, we would want to know the number
     * of tokens in the field, but we don't have the token count available so
     * that's not a possibility.  Fortunately, using the number of positions
     * matched works OK because it's likely to be roughly proportionate to
     * field length.
     */
     return 1.0 + bonus/num_prox;
}

float
Sim_coord(Similarity *self, u32_t overlap, u32_t max_overlap) 
{
    UNUSED_VAR(self);
    if (max_overlap == 0)
        return 1;
    else 
        return (float)overlap / (float)max_overlap;
}

/* Copyright 2006-2007 Marvin Humphrey
 *
 * This program is free software; you can redistribute it and/or modify
 * under the same terms as Perl itself.
 */