#include "uri.h"
#include "ccv.h"
#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
#ifdef HAVE_TESSERACT
#include <tesseract/capi.h>
#endif
static void uri_swt_on_source_blob(void* context, ebb_buf data);
typedef struct {
ccv_swt_param_t params;
int max_dimension;
} ccv_swt_uri_param_t;
static const param_dispatch_t param_map[] = {
{
.property = "aspect_ratio",
.type = PARAM_TYPE_DOUBLE,
.offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, aspect_ratio),
},
{
.property = "breakdown",
.type = PARAM_TYPE_BOOL,
.offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, breakdown),
},
{
.property = "breakdown_ratio",
.type = PARAM_TYPE_DOUBLE,
.offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, breakdown_ratio),
},
{
.property = "distance_ratio",
.type = PARAM_TYPE_DOUBLE,
.offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, distance_ratio),
},
{
.property = "elongate_ratio",
.type = PARAM_TYPE_DOUBLE,
.offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, elongate_ratio),
},
{
.property = "height_ratio",
.type = PARAM_TYPE_DOUBLE,
.offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, height_ratio),
},
{
.property = "high_thresh",
.type = PARAM_TYPE_INT,
.offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, high_thresh),
},
{
.property = "intensity_thresh",
.type = PARAM_TYPE_INT,
.offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, intensity_thresh),
},
{
.property = "intersect_ratio",
.type = PARAM_TYPE_DOUBLE,
.offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, intersect_ratio),
},
{
.property = "interval",
.type = PARAM_TYPE_INT,
.offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, interval),
},
{
.property = "letter_occlude_thresh",
.type = PARAM_TYPE_INT,
.offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, letter_occlude_thresh),
},
{
.property = "letter_thresh",
.type = PARAM_TYPE_INT,
.offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, letter_thresh),
},
{
.property = "low_thresh",
.type = PARAM_TYPE_INT,
.offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, low_thresh),
},
{
.property = "max_dimension",
.type = PARAM_TYPE_INT,
.offset = offsetof(ccv_swt_uri_param_t, max_dimension),
},
{
.property = "max_height",
.type = PARAM_TYPE_INT,
.offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, max_height),
},
{
.property = "min_area",
.type = PARAM_TYPE_INT,
.offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, min_area),
},
{
.property = "min_height",
.type = PARAM_TYPE_INT,
.offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, min_height),
},
{
.property = "min_neighbors",
.type = PARAM_TYPE_INT,
.offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, min_neighbors),
},
{
.property = "scale_invariant",
.type = PARAM_TYPE_BOOL,
.offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, scale_invariant),
},
{
.property = "size",
.type = PARAM_TYPE_INT,
.offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, size),
},
{
.property = "source",
.type = PARAM_TYPE_BODY,
.on_blob = uri_swt_on_source_blob,
.offset = 0,
},
{
.property = "std_ratio",
.type = PARAM_TYPE_DOUBLE,
.offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, std_ratio),
},
{
.property = "thickness_ratio",
.type = PARAM_TYPE_DOUBLE,
.offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, thickness_ratio),
},
};
typedef struct {
ebb_buf desc;
#ifdef HAVE_TESSERACT
TessBaseAPI* tesseract;
#endif
} swt_context_t;
typedef struct {
param_parser_t param_parser;
ccv_swt_uri_param_t params;
ebb_buf source;
swt_context_t* context;
} swt_param_parser_t;
void* uri_swt_detect_words_init(void)
{
assert(param_parser_map_alphabet(param_map, sizeof(param_map) / sizeof(param_dispatch_t)) == 0);
swt_context_t* context = (swt_context_t*)malloc(sizeof(swt_context_t));
#ifdef HAVE_TESSERACT
context->tesseract = TessBaseAPICreate();
if (TessBaseAPIInit3(context->tesseract, 0, "eng") != 0)
context->tesseract = 0;
#endif
context->desc = param_parser_map_http_body(param_map, sizeof(param_map) / sizeof(param_dispatch_t),
"[{"
"\"x\":\"number\","
"\"y\":\"number\","
"\"width\":\"number\","
"\"height\":\"number\""
"}]");
return context;
}
void uri_swt_detect_words_destroy(void* context)
{
swt_context_t* swt_context = (swt_context_t*)context;
#ifdef HAVE_TESSERACT
TessBaseAPIDelete(swt_context->tesseract);
#endif
free(swt_context->desc.data);
free(swt_context);
}
static void uri_swt_param_parser_init(swt_param_parser_t* parser)
{
param_parser_init(&parser->param_parser, param_map, sizeof(param_map) / sizeof(param_dispatch_t), &parser->params, parser);
parser->params.params = ccv_swt_default_params;
parser->params.max_dimension = 0;
parser->source.data = 0;
}
static void uri_swt_on_source_blob(void* context, ebb_buf data)
{
swt_param_parser_t* parser = (swt_param_parser_t*)context;
parser->source = data;
}
void* uri_swt_detect_words_parse(const void* context, void* parsed, int resource_id, const char* buf, size_t len, uri_parse_state_t state, int header_index)
{
swt_param_parser_t* parser;
if (parsed)
parser = (swt_param_parser_t*)parsed;
else {
parser = (swt_param_parser_t*)malloc(sizeof(swt_param_parser_t));
parser->context = (swt_context_t*)context;
uri_swt_param_parser_init(parser);
}
switch (state)
{
case URI_QUERY_STRING:
case URI_CONTENT_BODY:
case URI_PARSE_TERMINATE:
case URI_MULTIPART_HEADER_FIELD:
case URI_MULTIPART_HEADER_VALUE:
case URI_MULTIPART_DATA:
param_parser_execute(&parser->param_parser, resource_id, buf, len, state, header_index);
break;
}
return parser;
}
int uri_swt_detect_words_intro(const void* context, const void* parsed, ebb_buf* buf)
{
swt_context_t* swt_context = (swt_context_t*)context;
buf->data = swt_context->desc.data;
buf->len = swt_context->desc.len;
return 0;
}
int uri_swt_detect_words(const void* context, const void* parsed, ebb_buf* buf)
{
if (!parsed)
return -1;
swt_param_parser_t* parser = (swt_param_parser_t*)parsed;
param_parser_terminate(&parser->param_parser);
if (parser->source.data == 0)
{
free(parser);
return -1;
}
ccv_dense_matrix_t* image = 0;
ccv_read(parser->source.data, &image, CCV_IO_ANY_STREAM | CCV_IO_GRAY, parser->source.written);
free(parser->source.data);
if (image == 0)
{
free(parser);
return -1;
}
ccv_dense_matrix_t* resize = 0;
if (parser->params.max_dimension > 0 && (image->rows > parser->params.max_dimension || image->cols > parser->params.max_dimension))
{
ccv_resample(image, &resize, 0, ccv_min(parser->params.max_dimension, (int)(image->rows * (float)parser->params.max_dimension / image->cols + 0.5)), ccv_min(parser->params.max_dimension, (int)(image->cols * (float)parser->params.max_dimension / image->rows + 0.5)), CCV_INTER_AREA);
ccv_matrix_free(image);
} else
resize = image;
ccv_array_t* seq = ccv_swt_detect_words(resize, parser->params.params);
float width = resize->cols, height = resize->rows;
if (seq == 0)
{
ccv_matrix_free(resize);
free(parser);
return -1;
}
if (seq->rnum > 0)
{
int i;
buf->len = 192 + seq->rnum * 131 + 2;
char* data = (char*)malloc(buf->len);
data[0] = '[';
buf->written = 1;
for (i = 0; i < seq->rnum; i++)
{
char cell[1024];
ccv_rect_t* rect = (ccv_rect_t*)ccv_array_get(seq, i);
#ifdef HAVE_TESSERACT
if (parser->context->tesseract)
{
char empty[] = "";
char* word = TessBaseAPIRect(parser->context->tesseract, resize->data.u8, 1, resize->step, rect->x, rect->y, rect->width, rect->height);
if (!word)
word = empty;
int wordlen = strlen(word); // trust tesseract to return correct thing
int j;
for (j = 0; j < wordlen; j++)
if (!((word[j] >= 'a' && word[j] <= 'z') ||
(word[j] >= 'A' && word[j] <= 'Z') ||
(word[j] >= '0' && word[j] <= '9') ||
word[j] == ' ' ||
word[j] == '-')) // replace unsupported char to whitespace
word[j] = ' ';
for (j = wordlen - 1; j >= 0 && word[j] == ' '; j--); // remove trailing whitespace
word[j + 1] = 0, wordlen = j + 1;
for (j = 0; j < wordlen && word[j] == ' '; j++); // remove leading whitespace
wordlen -= j;
memmove(word, word + j, wordlen + 1);
if (wordlen > 512) // if the wordlen is greater than 512, trim it
word[512] = 0;
snprintf(cell, 1024, "{\"x\":%f,\"y\":%f,\"width\":%f,\"height\":%f,\"word\":\"%s\"}", rect->x / width, rect->y / height, rect->width / width, rect->height / height, word);
} else {
#endif
snprintf(cell, 1024, "{\"x\":%f,\"y\":%f,\"width\":%f,\"height\":%f}", rect->x / width, rect->y / height, rect->width / width, rect->height / height);
#ifdef HAVE_TESSERACT
}
#endif
size_t len = strnlen(cell, 1024);
while (buf->written + len + 1 >= buf->len)
{
buf->len = (buf->len * 3 + 1) / 2;
data = (char*)realloc(data, buf->len);
}
memcpy(data + buf->written, cell, len);
buf->written += len + 1;
data[buf->written - 1] = (i == seq->rnum - 1) ? ']' : ',';
}
char http_header[192];
snprintf(http_header, 192, ebb_http_header, buf->written + 1);
size_t len = strnlen(http_header, 192);
if (buf->written + len + 1 >= buf->len)
{
buf->len = buf->written + len + 1;
data = (char*)realloc(data, buf->len);
}
memmove(data + len, data, buf->written);
memcpy(data, http_header, len);
buf->written += len + 1;
data[buf->written - 1] = '\n';
buf->data = data;
buf->len = buf->written;
buf->on_release = uri_ebb_buf_free;
} else {
buf->data = (void*)ebb_http_empty_array;
buf->len = sizeof(ebb_http_empty_array);
buf->on_release = 0;
}
ccv_matrix_free(resize);
ccv_array_free(seq);
free(parser);
return 0;
}