#include "ccv.h"
#include "ccv_internal.h"
#if defined(HAVE_SSE2)
#include <xmmintrin.h>
#elif defined(HAVE_NEON)
#include <arm_neon.h>
#endif
#ifdef HAVE_GSL
#include <gsl/gsl_rng.h>
#include <gsl/gsl_randist.h>
#endif
#ifdef USE_OPENMP
#include <omp.h>
#endif
#ifdef USE_DISPATCH
#include <dispatch/dispatch.h>
#endif
#ifdef HAVE_CUDA
#include "cuda/cwc.h"
#endif
#include "3rdparty/sqlite3/sqlite3.h"
#include "inc/ccv_convnet_internal.h"

#ifndef CASE_TESTS

ccv_convnet_t* ccv_convnet_new(int use_cwc_accel, ccv_size_t input, ccv_convnet_layer_param_t params[], int count)
{
	ccv_convnet_t* convnet = (ccv_convnet_t*)ccmalloc(sizeof(ccv_convnet_t) + sizeof(ccv_convnet_layer_t) * count + sizeof(ccv_dense_matrix_t*) * count * 2);
	convnet->use_cwc_accel = use_cwc_accel;
#ifdef HAVE_GSL
	gsl_rng_env_setup();
	gsl_rng* rng = gsl_rng_alloc(gsl_rng_default);
	gsl_rng_set(rng, (unsigned long int)convnet);
#endif
	convnet->reserved = 0;
	convnet->layers = (ccv_convnet_layer_t*)(convnet + 1);
	convnet->acts = (ccv_dense_matrix_t**)(convnet->layers + count);
	memset(convnet->acts, 0, sizeof(ccv_dense_matrix_t*) * count);
	convnet->denoms = (ccv_dense_matrix_t**)(convnet->acts + count);
	memset(convnet->denoms, 0, sizeof(ccv_dense_matrix_t*) * count);
	convnet->count = count;
	convnet->input = input;
	convnet->rows = params[0].input.matrix.rows;
	convnet->cols = params[0].input.matrix.cols;
	convnet->channels = params[0].input.matrix.channels;
	convnet->mean_activity = ccv_dense_matrix_new(convnet->input.height, convnet->input.width, convnet->channels | CCV_32F, 0, 0);
	ccv_zero(convnet->mean_activity);
	ccv_convnet_layer_t* layers = convnet->layers;
	int i, j;
	for (i = 0; i < count; i++)
	{
		layers[i].type = params[i].type;
		layers[i].input = params[i].input;
		layers[i].net = params[i].output;
		layers[i].reserved = 0;
		switch (params[i].type)
		{
			case CCV_CONVNET_CONVOLUTIONAL:
				assert(params[i].input.matrix.channels % params[i].input.matrix.partition == 0);
				assert(params[i].output.convolutional.count % params[i].output.convolutional.partition == 0);
				assert(params[i].output.convolutional.partition % params[i].input.matrix.partition == 0);
				assert(params[i].output.convolutional.partition >= params[i].input.matrix.partition);
				layers[i].wnum = params[i].output.convolutional.rows * params[i].output.convolutional.cols * params[i].output.convolutional.channels / params[i].input.matrix.partition * params[i].output.convolutional.count;
				layers[i].w = (float*)ccmalloc(sizeof(float) * (layers[i].wnum + params[i].output.convolutional.count));
				layers[i].bias = layers[i].w + layers[i].wnum;
#ifdef HAVE_GSL
				for (j = 0; j < layers[i].wnum; j++)
					layers[i].w[j] = (gsl_rng_uniform_pos(rng) * 2 - 1) * params[i].glorot / sqrtf(params[i].output.convolutional.rows * params[i].output.convolutional.cols * params[i].output.convolutional.channels / params[i].input.matrix.partition + params[i].output.convolutional.count);
#else
				for (j = 0; j < layers[i].wnum; j++)
					layers[i].w[j] = 0;
#endif
				for (j = 0; j < params[i].output.convolutional.count; j++)
					layers[i].bias[j] = params[i].bias;
				break;
			case CCV_CONVNET_FULL_CONNECT:
				layers[i].wnum = params[i].input.node.count * params[i].output.full_connect.count;
				layers[i].w = (float*)ccmalloc(sizeof(float) * (layers[i].wnum + params[i].output.full_connect.count));
				layers[i].bias = layers[i].w + layers[i].wnum;
#ifdef HAVE_GSL
				for (j = 0; j < layers[i].wnum; j++)
					layers[i].w[j] = (gsl_rng_uniform_pos(rng) * 2 - 1) * params[i].glorot / sqrtf(params[i].input.node.count + params[i].output.full_connect.count);
#else
				for (j = 0; j < layers[i].wnum; j++)
					layers[i].w[j] = 0;
#endif
				for (j = 0; j < params[i].output.full_connect.count; j++)
					layers[i].bias[j] = params[i].bias;
				break;
			default:
				layers[i].wnum = 0;
				layers[i].w = 0;
				layers[i].bias = 0;
				break;
		}
	}
#ifdef HAVE_GSL
	gsl_rng_free(rng);
#endif
	return convnet;
}

int ccv_convnet_verify(ccv_convnet_t* convnet, int output)
{
	int i, out_rows, out_cols, out_partition, out_channels;
	if (convnet->count < 1)
		return -1;
	// the last layer has to be full connect
	if (convnet->layers[convnet->count - 1].type != CCV_CONVNET_FULL_CONNECT)
		return -1;
	// you cannot enable relu on the last layer
	if (convnet->layers[convnet->count - 1].net.full_connect.relu)
		return -1;
	out_channels = 3;
	for (i = 0; i < convnet->count; i++)
	{
		ccv_convnet_layer_t* layer = convnet->layers + i;
		if (i > 0 && (out_rows != layer->input.matrix.rows || out_cols != layer->input.matrix.cols))
			return -1;
		// the input channels should be equal to the previous output channels, skip this check for full connect as it is meaningless
		if (out_channels != layer->input.matrix.channels && layer->type != CCV_CONVNET_FULL_CONNECT)
			return -1;
		ccv_convnet_make_output(layer, layer->input.matrix.rows, layer->input.matrix.cols, &out_rows, &out_cols, &out_partition);
		if (layer->type == CCV_CONVNET_CONVOLUTIONAL)
		{
			// check to see if the input matrix channel is equal to the expected input of the convolutional layer filters
			if (layer->input.matrix.channels != layer->net.convolutional.channels)
				return -1;
			// if this layer is convolutional layer, its filter output should equal to next layer's channel input
			out_channels = layer->net.convolutional.count;
		}
	}
	if (out_rows * out_cols != output)
		return -1;
	int count = 0;
	for (i = 0; i < convnet->count; i++)
	{
		ccv_convnet_layer_t* layer = convnet->layers + i;
		if (layer->type == CCV_CONVNET_FULL_CONNECT)
		{
			count = i;
			break;
		}
	}
	// all the layers after the first full connect layer should only be full connect layer
	for (i = count; i < convnet->count; i++)
		if (convnet->layers[i].type != CCV_CONVNET_FULL_CONNECT ||
			convnet->layers[i].input.matrix.rows * convnet->layers[i].input.matrix.cols * convnet->layers[i].input.matrix.channels != convnet->layers[i].input.node.count)
			return -1;
	return 0;
}

#endif

#if defined(HAVE_SSE2) || defined(HAVE_NEON)

static void _ccv_convnet_layer_simd_alloc_reserved(ccv_convnet_layer_t* layer)
{
	if (layer->reserved)
		return;
	int partition = layer->input.matrix.partition;
	int ch = layer->net.convolutional.channels;
	int count = layer->net.convolutional.count;
	int kernel_rows = layer->net.convolutional.rows;
	int kernel_cols = layer->net.convolutional.cols;
	int ch_per_partition = ch / partition;
	int count_per_4 = count / 4;
	float* simd_w = (float*)ccmalloc(sizeof(float) * layer->wnum);
	int i, j, k, c;
	for (k = 0; k < count_per_4; k++)
		for (i = 0; i < kernel_rows * kernel_cols; i++)
			for (j = 0; j < ch_per_partition; j++)
				for (c = 0; c < 4; c++)
					simd_w[(k * kernel_rows * kernel_cols * ch_per_partition + i * ch_per_partition + j) * 4 + c] = layer->w[(k * 4 + c) * kernel_rows * kernel_cols * ch_per_partition + i * ch_per_partition + j];
	layer->reserved = simd_w;
}

#endif

#define SIMD(x) ((float*)((x)->reserved))

#if defined(HAVE_SSE2)
static inline void _ccv_convnet_convolutional_forward_propagate_sse2(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* db, int rows, int cols, int ch, int count, int strides, int border, int kernel_rows, int kernel_cols, int ch_per_partition, int count_per_partition)
{
	assert(SIMD(layer));
#define main_for(block) \
	parallel_for(k, (count >> 2)) { \
		int i, j, x, y, c; \
		int p = k * 4 / count_per_partition; \
		float* ap = a->data.f32 + p * ch_per_partition; \
		float* bp = db->data.f32 + k * 4; \
		float* layer_w = SIMD(layer) + k * 4 * kernel_rows * kernel_cols * ch_per_partition; \
		float bias[4] __attribute__ ((__aligned__(16))); \
		memcpy(bias, layer->bias + k * 4, sizeof(float) * 4); \
		/* 4 accumulators */ \
		__m128 z4 = _mm_setzero_ps(); \
		for (i = 0; i < db->rows; i++) \
		{ \
			int comy = ccv_max(i * strides - border, 0) - (i * strides - border); \
			int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(a->rows + border, i * strides + kernel_rows)); \
			comy *= ch_per_partition * kernel_cols; \
			for (j = 0; j < db->cols; j++) \
			{ \
				__m128 v40 = _mm_load_ps(bias); \
				__m128 v41 = _mm_setzero_ps(); \
				__m128 v42 = _mm_setzero_ps(); \
				__m128 v43 = _mm_setzero_ps(); \
				int comx = ccv_max(j * strides - border, 0) - (j * strides - border); \
				int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(a->cols + border, j * strides + kernel_cols)); \
				float* w = layer_w + (comx * ch_per_partition + comy) * 4; \
				float* apz = ap + ccv_max(j * strides - border, 0) * ch; \
				/* when we have border, we simply do zero padding */ \
				for (y = 0; y < maxy; y++) \
				{ \
					/* special casing for these cases to speed up SIMD computation */ \
					for (x = 0; x < maxx; x++) \
					{ \
						c = 0; \
						for (; c < ch_per_partition - 3; c += 4) \
						{ \
							__m128 apz4 = _mm_loadu_ps(apz + x * ch + c); \
							__m128 w40 = _mm_loadu_ps(w + (x * ch_per_partition + c) * 4); \
							__m128 w41 = _mm_loadu_ps(w + (x * ch_per_partition + c + 1) * 4); \
							__m128 w42 = _mm_loadu_ps(w + (x * ch_per_partition + c + 2) * 4); \
							__m128 w43 = _mm_loadu_ps(w + (x * ch_per_partition + c + 3) * 4); \
							__m128 apz40 = _mm_shuffle_ps(apz4, apz4, 0x00); \
							__m128 apz41 = _mm_shuffle_ps(apz4, apz4, 0x55); \
							__m128 apz42 = _mm_shuffle_ps(apz4, apz4, 0xAA); \
							__m128 apz43 = _mm_shuffle_ps(apz4, apz4, 0xFF); \
							v40 =_mm_add_ps(_mm_mul_ps(w40, apz40), v40); \
							v41 =_mm_add_ps(_mm_mul_ps(w41, apz41), v41); \
							v42 =_mm_add_ps(_mm_mul_ps(w42, apz42), v42); \
							v43 =_mm_add_ps(_mm_mul_ps(w43, apz43), v43); \
						} \
						block /* insert executions for tail partition */ \
					} \
					w += kernel_cols * ch_per_partition * 4; \
					apz += a->cols * ch; \
				} \
				__m128 v4 = _mm_max_ps(z4, _mm_add_ps(_mm_add_ps(v40, v41), _mm_add_ps(v42, v43))); \
				_mm_storeu_ps(bp + j * count, v4); /* ReLU */ \
			} \
			bp += db->cols * count; \
			ap += a->cols * ch * (ccv_max((i + 1) * strides - border, 0) - ccv_max(i * strides - border, 0)); \
		} \
	} parallel_endfor
	if (ch_per_partition % 4 == 0)
	{
		main_for();
	} else if (ch_per_partition % 4 == 3) { // unroll the last for-loops
#define block \
		__m128 apz40 = _mm_load1_ps(apz + x * ch + c); \
		__m128 apz41 = _mm_load1_ps(apz + x * ch + c + 1); \
		__m128 apz42 = _mm_load1_ps(apz + x * ch + c + 2); \
		__m128 w40 = _mm_loadu_ps(w + (x * ch_per_partition + c) * 4); \
		__m128 w41 = _mm_loadu_ps(w + (x * ch_per_partition + c + 1) * 4); \
		__m128 w42 = _mm_loadu_ps(w + (x * ch_per_partition + c + 2) * 4); \
		v40 = _mm_add_ps(_mm_mul_ps(w40, apz40), v40); \
		v41 = _mm_add_ps(_mm_mul_ps(w41, apz41), v41); \
		v42 = _mm_add_ps(_mm_mul_ps(w42, apz42), v42);
		main_for(block);
#undef block
	} else if (ch_per_partition % 4 == 2) { // unroll the last for-loops
#define block \
		__m128 apz40 = _mm_load1_ps(apz + x * ch + c); \
		__m128 apz41 = _mm_load1_ps(apz + x * ch + c + 1); \
		__m128 w40 = _mm_loadu_ps(w + (x * ch_per_partition + c) * 4); \
		__m128 w41 = _mm_loadu_ps(w + (x * ch_per_partition + c + 1) * 4); \
		v40 = _mm_add_ps(_mm_mul_ps(w40, apz40), v40); \
		v41 = _mm_add_ps(_mm_mul_ps(w41, apz41), v41);
		main_for(block);
#undef block
	} else {
#define block \
		__m128 apz4 = _mm_load1_ps(apz + x * ch + c); \
		__m128 w4 = _mm_loadu_ps(w + (x * ch_per_partition + c) * 4); \
		v40 = _mm_add_ps(_mm_mul_ps(w4, apz4), v40);
		main_for(block);
#undef block
	}
#undef main_for
}
#elif defined(HAVE_NEON)
static inline void _ccv_convnet_convolutional_forward_propagate_neon(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* db, int rows, int cols, int ch, int count, int strides, int border, int kernel_rows, int kernel_cols, int ch_per_partition, int count_per_partition)
{
	assert(SIMD(layer));
#define main_for(block) \
	parallel_for(k, (count >> 2)) { \
		int i, j, x, y, c; \
		int p = k * 4 / count_per_partition; \
		float* ap = a->data.f32 + p * ch_per_partition; \
		float* bp = db->data.f32 + k * 4; \
		float* layer_w = SIMD(layer) + k * 4 * kernel_rows * kernel_cols * ch_per_partition; \
		float bias[4] __attribute__ ((__aligned__(16))); \
		memcpy(bias, layer->bias + k * 4, sizeof(float) * 4); \
		float32x4_t z4 = vmovq_n_f32(0); \
		for (i = 0; i < db->rows; i++) \
		{ \
			int comy = ccv_max(i * strides - border, 0) - (i * strides - border); \
			int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(a->rows + border, i * strides + kernel_rows)); \
			comy *= ch_per_partition * kernel_cols; \
			for (j = 0; j < db->cols; j++) \
			{ \
				float32x4_t v40 = vld1q_f32(bias); \
				float32x4_t v41 = vmovq_n_f32(0); \
				int comx = ccv_max(j * strides - border, 0) - (j * strides - border); \
				int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(a->cols + border, j * strides + kernel_cols)); \
				float* w = layer_w + (comx * ch_per_partition + comy) * 4; \
				float* apz = ap + ccv_max(j * strides - border, 0) * ch; \
				/* when we have border, we simply do zero padding */ \
				for (y = 0; y < maxy; y++) \
				{ \
					for (x = 0; x < maxx; x++) \
					{ \
						c = 0; \
						for (; c < ch_per_partition - 1; c += 2) \
						{ \
							float32x2_t apz4 = vld1_f32(apz + x * ch + c); \
							float32x4_t apz40 = vdupq_lane_f32(apz4, 0); \
							float32x4_t apz41 = vdupq_lane_f32(apz4, 1); \
							float32x4_t w40 = vld1q_f32(w + (x * ch_per_partition + c) * 4); \
							float32x4_t w41 = vld1q_f32(w + (x * ch_per_partition + c + 1) * 4); \
							v40 = vmlaq_f32(v40, w40, apz40); \
							v41 = vmlaq_f32(v41, w41, apz41); \
						} \
						block /* insert executions for tail partition */ \
					} \
					w += kernel_cols * ch_per_partition * 4; \
					apz += a->cols * ch; \
				} \
				float32x4_t v4 = vmaxq_f32(z4, vaddq_f32(v40, v41)); \
				vst1q_f32(bp + j * count, v4); /* ReLU */ \
			} \
			bp += db->cols * count; \
			ap += a->cols * ch * (ccv_max((i + 1) * strides - border, 0) - ccv_max(i * strides - border, 0)); \
		} \
	} parallel_endfor
	if (ch_per_partition % 2 == 0)
	{
		main_for();
	} else { // unroll the last for-loops
#define block \
		float32x4_t apz4 = vmovq_n_f32(apz[x * ch + c]); \
		float32x4_t w4 = vld1q_f32(w + (x * ch_per_partition + c) * 4); \
		v40 = vmlaq_f32(v40, w4, apz4);
		main_for(block);
#undef block
	}
#undef main_for
}
#else
static inline void _ccv_convnet_convolutional_forward_propagate_fallback(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* db, int rows, int cols, int ch, int count, int strides, int border, int kernel_rows, int kernel_cols, int ch_per_partition, int count_per_partition)
{
	parallel_for(k, count) {
		int i, j, x, y, c;
		int p = k / count_per_partition;
		float* ap = a->data.f32 + p * ch_per_partition;
		float* bp = db->data.f32 + k;
		float* layer_w = layer->w + k * kernel_rows * kernel_cols * ch_per_partition;
		float bias = layer->bias[k];
		for (i = 0; i < db->rows; i++)
		{
			int comy = ccv_max(i * strides - border, 0) - (i * strides - border);
			int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(a->rows + border, i * strides + kernel_rows));
			comy *= ch_per_partition * kernel_cols;
			for (j = 0; j < db->cols; j++)
			{
				float v = bias;
				int comx = ccv_max(j * strides - border, 0) - (j * strides - border);
				int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(a->cols + border, j * strides + kernel_cols));
				float* w = layer_w + comx * ch_per_partition + comy;
				float* apz = ap + ccv_max(j * strides - border, 0) * ch;
				// when we have border, we simply do zero padding
				for (y = 0; y < maxy; y++)
				{
					for (x = 0; x < maxx; x++)
						for (c = 0; c < ch_per_partition; c++)
							v += w[x * ch_per_partition + c] * apz[x * ch + c];
					w += kernel_cols * ch_per_partition;
					apz += a->cols * ch;
				}
				bp[j * count] = ccv_max(0, v); // ReLU
			}
			bp += db->cols * count;
			ap += a->cols * ch * (ccv_max((i + 1) * strides - border, 0) - ccv_max(i * strides - border, 0));
		}
	} parallel_endfor
}
#endif

static void _ccv_convnet_convolutional_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b)
{
	int rows, cols, partition;
	ccv_convnet_make_output(layer, a->rows, a->cols, &rows, &cols, &partition);
	int ch = layer->net.convolutional.channels;
	int count = layer->net.convolutional.count;
	int strides = layer->net.convolutional.strides;
	int border = layer->net.convolutional.border;
	int kernel_rows = layer->net.convolutional.rows;
	int kernel_cols = layer->net.convolutional.cols;
	int type = CCV_32F | count;
	assert(CCV_GET_CHANNEL(a->type) == ch);
	assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F);
	ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, rows, cols, type, type, 0);
	int ch_per_partition = ch / partition;
	int count_per_partition = count / partition;
	assert(count_per_partition % 4 == 0);
#if defined(HAVE_SSE2) || defined(HAVE_NEON)
	_ccv_convnet_layer_simd_alloc_reserved(layer);
#endif
#if defined(HAVE_SSE2)
	_ccv_convnet_convolutional_forward_propagate_sse2(layer, a, db, rows, cols, ch, count, strides, border, kernel_rows, kernel_cols, ch_per_partition, count_per_partition);
#elif defined(HAVE_NEON)
	_ccv_convnet_convolutional_forward_propagate_neon(layer, a, db, rows, cols, ch, count, strides, border, kernel_rows, kernel_cols, ch_per_partition, count_per_partition);
#else
	_ccv_convnet_convolutional_forward_propagate_fallback(layer, a, db, rows, cols, ch, count, strides, border, kernel_rows, kernel_cols, ch_per_partition, count_per_partition);
#endif
}

static void _ccv_convnet_full_connect_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b)
{
	assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F);
	ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, layer->net.full_connect.count, 1, CCV_32F | CCV_C1, CCV_32F | CCV_C1, 0);
	int ch = CCV_GET_CHANNEL(a->type);
	int rows = a->rows, cols = a->cols;
	// reshape a for gemm
	assert(a->step == a->cols * CCV_GET_DATA_TYPE_SIZE(a->type) * ch);
	a->rows = rows * cols * ch, a->cols = 1, a->type = (a->type - ch) | CCV_C1;
	assert(a->rows * db->rows == layer->wnum);
	a->step = a->cols * CCV_GET_DATA_TYPE_SIZE(a->type);
	int i;
	float* bptr = db->data.f32;
	for (i = 0; i < db->rows; i++)
		bptr[i] = layer->bias[i];
	ccv_dense_matrix_t dw = ccv_dense_matrix(db->rows, a->rows, CCV_32F | CCV_C1, layer->w, 0);
	ccv_gemm(&dw, a, 1, db, 1, 0, (ccv_matrix_t**)&db, 0); // supply db as matrix C is allowed
	if (layer->net.full_connect.relu)
		for (i = 0; i < db->rows; i++)
			bptr[i] = ccv_max(0, bptr[i]); // relu
	a->rows = rows, a->cols = cols, a->type = (a->type - CCV_GET_CHANNEL(a->type)) | ch;
	a->step = a->cols * CCV_GET_DATA_TYPE_SIZE(a->type) * CCV_GET_CHANNEL(a->type);
}

static void _ccv_convnet_rnorm_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, ccv_dense_matrix_t** denoms)
{
	int rows, cols, partition;
	ccv_convnet_make_output(layer, a->rows, a->cols, &rows, &cols, &partition);
	int size = layer->net.rnorm.size;
	float kappa = layer->net.rnorm.kappa;
	float alpha = layer->net.rnorm.alpha;
	float beta = layer->net.rnorm.beta;
	int way = size / 2;
	assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F);
	int ch = CCV_GET_CHANNEL(a->type);
	int type = CCV_32F | ch;
	ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, rows, cols, type, type, 0);
	int i, j, k, x, p;
	float* ap = a->data.f32;
	float* bp = db->data.f32;
	int ch_per_partition = ch / partition;
	if (denoms)
	{
		ccv_dense_matrix_t* ddenoms = *denoms = ccv_dense_matrix_renew(*denoms, rows, cols, type, type, 0);
		float* dp = ddenoms->data.f32;
		for (i = 0; i < db->rows; i++)
		{
			for (j = 0; j < db->cols; j++)
				for (p = 0; p < partition; p++)
					for (k = 0; k < ch_per_partition; k++)
					{
						float v = ap[j * ch + p * ch_per_partition + k];
						float denom = 0;
						for (x = ccv_max(k - way, 0); x <= ccv_min(k + way, ch_per_partition - 1); x++)
							denom += ap[j * ch + p * ch_per_partition + x] * ap[j * ch + p * ch_per_partition + x];
						denom = kappa + alpha * denom;
						dp[j * ch + p * ch_per_partition + k] = denom;
						bp[j * ch + p * ch_per_partition + k] = v * powf(denom, -beta);
					}
			ap += a->cols * ch;
			dp += ddenoms->cols * ch;
			bp += db->cols * ch;
		}
	} else {
		for (i = 0; i < db->rows; i++)
		{
			for (j = 0; j < db->cols; j++)
				for (p = 0; p < partition; p++)
					for (k = 0; k < ch_per_partition; k++)
					{
						float v = ap[j * ch + p * ch_per_partition + k];
						float denom = 0;
						for (x = ccv_max(k - way, 0); x <= ccv_min(k + way, ch_per_partition - 1); x++)
							denom += ap[j * ch + p * ch_per_partition + x] * ap[j * ch + p * ch_per_partition + x];
						denom = kappa + alpha * denom;
						bp[j * ch + p * ch_per_partition + k] = v * powf(denom, -beta);
					}
			ap += a->cols * ch;
			bp += db->cols * ch;
		}
	}
}

static void _ccv_convnet_max_pool_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b)
{
	int rows, cols, partition;
	ccv_convnet_make_output(layer, a->rows, a->cols, &rows, &cols, &partition);
	int size = layer->net.pool.size;
	int strides = layer->net.pool.strides;
	int border = layer->net.pool.border;
	assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F);
	int ch = CCV_GET_CHANNEL(a->type);
	int type = CCV_32F | ch;
	ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, rows, cols, type, type, 0);
	int i, j, k, x, y;
	float* ap = a->data.f32;
	float* bp = db->data.f32;
	for (i = 0; i < db->rows; i++)
	{
		const int start_y = ccv_max(i * strides - border, 0) - (i * strides - border);
		const int end_y = size + ccv_min(i * strides + size - border, a->rows) - (i * strides + size - border);
		for (j = 0; j < db->cols; j++)
		{
			const int start_x = ccv_max(j * strides - border, 0) - (j * strides - border);
			const int end_x = size + ccv_min(j * strides + size - border, a->cols) - (j * strides + size - border);
			for (k = 0; k < ch; k++)
			{
				float v = 0;
				for (y = start_y; y < end_y; y++)
					for (x = start_x; x < end_x; x++)
						if (x == start_x && y == start_y)
							v = ap[(j * strides - border + x + (y - border) * a->cols) * ch + k];
						else if (ap[(j * strides - border + x + (y - border) * a->cols) * ch + k] > v)
							v = ap[(j * strides - border + x + (y - border) * a->cols) * ch + k];
				bp[j * ch + k] = v;
			}
		}
		ap += a->cols * ch * strides;
		bp += db->cols * ch;
	}
}

static void _ccv_convnet_average_pool_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b)
{
	int rows, cols, partition;
	ccv_convnet_make_output(layer, a->rows, a->cols, &rows, &cols, &partition);
	int size = layer->net.pool.size;
	int strides = layer->net.pool.strides;
	int border = layer->net.pool.border;
	assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F);
	int ch = CCV_GET_CHANNEL(a->type);
	int type = CCV_32F | ch;
	ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, rows, cols, type, type, 0);
	int i, j, k, x, y;
	float* ap = a->data.f32;
	float* bp = db->data.f32;
	for (i = 0; i < db->rows; i++)
	{
		const int start_y = ccv_max(i * strides - border, 0) - (i * strides - border);
		const int end_y = size + ccv_min(i * strides + size - border, a->rows) - (i * strides + size - border);
		for (j = 0; j < db->cols; j++)
		{
			const int start_x = ccv_max(j * strides - border, 0) - (j * strides - border);
			const int end_x = size + ccv_min(j * strides + size - border, a->cols) - (j * strides + size - border);
			for (k = 0; k < ch; k++)
			{
				float v = 0;
				for (y = start_y; y < end_y; y++)
					for (x = start_x; x < end_x; x++)
						v += ap[(j * strides - border + x + (y - border) * a->cols) * ch + k];
				bp[j * ch + k] = v / ((end_x - start_x) * (end_y - start_y));
			}
		}
		ap += a->cols * ch * strides;
		bp += db->cols * ch;
	}
}

static void _ccv_convnet_layer_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, ccv_dense_matrix_t** denoms)
{
	switch(layer->type)
	{
		case CCV_CONVNET_CONVOLUTIONAL:
			_ccv_convnet_convolutional_forward_propagate(layer, a, b);
			break;
		case CCV_CONVNET_FULL_CONNECT:
			_ccv_convnet_full_connect_forward_propagate(layer, a, b);
			break;
		case CCV_CONVNET_LOCAL_RESPONSE_NORM:
			_ccv_convnet_rnorm_forward_propagate(layer, a, b, denoms);
			break;
		case CCV_CONVNET_MAX_POOL:
			_ccv_convnet_max_pool_forward_propagate(layer, a, b);
			break;
		case CCV_CONVNET_AVERAGE_POOL:
			_ccv_convnet_average_pool_forward_propagate(layer, a, b);
			break;
	}
}

static void _ccv_convnet_full_connect_forward_propagate_parallel(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b)
{
	assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F);
	ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, a->rows, layer->net.full_connect.count, CCV_32F | CCV_C1, CCV_32F | CCV_C1, 0);
	// reshape a for gemm
	int i, j;
	float* bptr = db->data.f32;
	for (i = 0; i < db->rows; i++)
	{
		for (j = 0; j < db->cols; j++)
			bptr[j] = layer->bias[j];
		bptr += db->cols;
	}
	ccv_dense_matrix_t dw = ccv_dense_matrix(db->cols, a->cols, CCV_32F | CCV_C1, layer->w, 0);
	ccv_gemm(a, &dw, 1, db, 1, CCV_B_TRANSPOSE, (ccv_matrix_t**)&db, 0); // supply db as matrix C is allowed
	bptr = db->data.f32;
	if (layer->net.full_connect.relu)
		for (i = 0; i < db->rows; i++)
		{
			for (j = 0; j < db->cols; j++)
				bptr[j] = ccv_max(0, bptr[j]); // relu
			bptr += db->cols;
		}
}

static void _ccv_convnet_compute_softmax_parallel(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type)
{
	assert(CCV_GET_CHANNEL(a->type) == CCV_C1);
	assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F);
	ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, 1, a->cols, CCV_32F | CCV_C1, CCV_32F | CCV_C1, 0);
	ccv_zero(db);
	int i, j;
	float* aptr = a->data.f32;
	float* bptr = db->data.f32;
	float* cptr = (float*)ccmalloc(sizeof(float) * a->cols);
	for (i = 0; i < a->rows; i++)
	{
		double max = aptr[0];
		for (j = 1; j < a->cols; j++)
			if (aptr[j] > max)
				max = aptr[j];
		double tt = 0;
		for (j = 0; j < a->cols; j++)
			tt += (cptr[j] = expf(aptr[j] - max));
		tt = 1.0 / tt;
		for (j = 0; j < a->cols; j++)
			bptr[j] += cptr[j] * tt;
		aptr += a->cols;
	}
	ccfree(cptr);
}

#ifndef CASE_TESTS

void ccv_convnet_encode(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, ccv_dense_matrix_t** b, int batch)
{
#ifdef HAVE_CUDA
	if (convnet->use_cwc_accel)
		cwc_convnet_encode(convnet, a, b, batch);
	else {
#endif
	assert(batch == 1);
	assert(CCV_GET_CHANNEL((*a)->type) == convnet->channels);
	assert((*a)->rows == convnet->rows);
	assert((*a)->cols == convnet->cols);
	int i;
	// save the last layer of neuron cache in case that we encode to a different matrix
	ccv_dense_matrix_t* out_neuron = convnet->acts[convnet->count - 1];
	convnet->acts[convnet->count - 1] = *b;
	_ccv_convnet_layer_forward_propagate(convnet->layers, *a, convnet->acts, convnet->denoms);
	for (i = 1; i < convnet->count; i++)
		_ccv_convnet_layer_forward_propagate(convnet->layers + i, convnet->acts[i - 1], convnet->acts + i, convnet->denoms + i);
	if (convnet->acts + convnet->count - 1 != b)
	{
		*b = convnet->acts[convnet->count - 1];
		// restore the last layer of neuron cache
		convnet->acts[convnet->count - 1] = out_neuron;
	}
#ifdef HAVE_CUDA
	}
#endif
}

// find the layer for scanning (it is the last convolutional layer)
static int _ccv_convnet_find_scan(ccv_convnet_t* convnet)
{
	int i;
	ccv_convnet_layer_t* layers = convnet->layers;
	for (i = convnet->count - 1; i >= 0; i--)
		if (layers[i].type == CCV_CONVNET_CONVOLUTIONAL)
			return i;
	return -1;
}

static int _ccv_convnet_derive_scale(ccv_convnet_t* convnet, int scan)
{
	int i, scale = 1;
	for (i = scan; i >= 0; i--)
	{
		ccv_convnet_layer_t* layer = convnet->layers + i;
		switch (layer->type)
		{
			case CCV_CONVNET_CONVOLUTIONAL:
				scale *= layer->net.convolutional.strides;
				break;
			case CCV_CONVNET_MAX_POOL:
			case CCV_CONVNET_AVERAGE_POOL:
				scale *= layer->net.pool.strides;
				break;
		}
	}
	return scale;
}

static int _ccv_convnet_find_full_connect(ccv_convnet_t* convnet)
{
	int i;
	for (i = 0; i < convnet->count; i++)
		if (convnet->layers[i].type == CCV_CONVNET_FULL_CONNECT)
			return i;
	return -1;
}

void ccv_convnet_classify(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, int symmetric, ccv_array_t** ranks, int tops, int batch)
{
#ifdef HAVE_CUDA
	if (convnet->use_cwc_accel)
		cwc_convnet_classify(convnet, a, symmetric, ranks, tops, batch);
	else {
#endif
	int i, j, k, t;
	ccv_dense_matrix_t** b = (ccv_dense_matrix_t**)alloca(sizeof(ccv_dense_matrix_t*) * (convnet->count + 1));
	int scan = _ccv_convnet_find_scan(convnet);
	int scale = _ccv_convnet_derive_scale(convnet, scan);
	int full_connect = _ccv_convnet_find_full_connect(convnet);
	assert(scan >= 0 && scan < convnet->count);
	assert(full_connect >= 0 && full_connect < convnet->count);
	memset(b, 0, sizeof(ccv_dense_matrix_t*) * (convnet->count + 1));
	for (i = 0; i < batch; i++)
	{
		assert(CCV_GET_CHANNEL(a[i]->type) == convnet->channels);
		assert(a[i]->rows == convnet->input.height || a[i]->cols == convnet->input.width);
		assert(a[i]->rows >= convnet->input.height && a[i]->cols >= convnet->input.width);
		// find optimal rows and cols to slice to
		int rows = convnet->rows + ((a[i]->rows - convnet->rows) / scale) * scale;
		int cols = convnet->cols + ((a[i]->cols - convnet->cols) / scale) * scale;
		assert(rows == convnet->input.height || cols == convnet->input.width);
		assert(rows <= a[i]->rows && cols <= a[i]->cols);
		ccv_dense_matrix_t* slice = 0;
		ccv_slice(a[i], (ccv_matrix_t**)&slice, CCV_32F, (a[i]->rows - rows) / 2, (a[i]->cols - cols) / 2, rows, cols);
		ccv_dense_matrix_t* mean_activity = 0;
		// scale mean activity up to be substractable (from this one, the CPU implementation is an approximation of GPU implementation)
		ccv_resample(convnet->mean_activity, &mean_activity, 0, rows, cols, CCV_INTER_CUBIC);
		ccv_subtract(slice, mean_activity, (ccv_matrix_t**)b, CCV_32F);
		ccv_matrix_free(mean_activity);
		ccv_matrix_free(slice);
		// doing the first few layers until the first scan layer
		int out_rows, out_cols, out_partition;
		ccv_dense_matrix_t* c = ccv_dense_matrix_new(5 * (!!symmetric + 1), convnet->layers[full_connect].input.node.count, CCV_32F | CCV_C1, 0, 0);
		for (t = 0; t <= !!symmetric; t++)
		{
			rows = b[0]->rows, cols = b[0]->cols;
			for (j = 0; j < scan + 1; j++)
			{
				ccv_convnet_layer_t* layer = convnet->layers + j;
				ccv_convnet_make_output(layer, rows, cols, &out_rows, &out_cols, &out_partition);
				_ccv_convnet_layer_forward_propagate(layer, b[j], b + j + 1, 0);
				assert(b[j + 1]->rows == out_rows && b[j + 1]->cols == out_cols);
				if (j > 0)
					ccv_matrix_free(b[j]);
				rows = out_rows, cols = out_cols;
			}
			int offsets[5][2] = {
				{0, 0},
				{cols - convnet->layers[scan + 1].input.matrix.cols, 0},
				{(cols - convnet->layers[scan + 1].input.matrix.cols) / 2, (rows - convnet->layers[scan + 1].input.matrix.rows) / 2},
				{0, rows - convnet->layers[scan + 1].input.matrix.rows},
				{cols - convnet->layers[scan + 1].input.matrix.cols, rows - convnet->layers[scan + 1].input.matrix.rows},
			};
			for (k = 0; k < 5; k++)
			{
				ccv_dense_matrix_t* input = 0;
				ccv_convnet_layer_t* layer = convnet->layers + scan + 1;
				ccv_slice(b[scan + 1], (ccv_matrix_t**)&input, CCV_32F, offsets[k][1], offsets[k][0], layer->input.matrix.rows, layer->input.matrix.cols);
				// copy the last layer for full connect compute
				b[full_connect] = ccv_dense_matrix_new(convnet->layers[full_connect].input.matrix.rows, convnet->layers[full_connect].input.matrix.cols, CCV_NO_DATA_ALLOC | CCV_32F | convnet->layers[full_connect].input.matrix.channels, c->data.f32 + (t * 5 + k) * convnet->layers[full_connect].input.node.count, 0);
				for (j = scan + 1; j < full_connect; j++)
				{
					layer = convnet->layers + j;
					_ccv_convnet_layer_forward_propagate(layer, j > scan + 1 ? b[j] : input, b + j + 1, 0);
					if (j > scan + 1)
						ccv_matrix_free(b[j]);
					else
						ccv_matrix_free(input);
				}
				ccv_matrix_free(b[full_connect]);
				// set it to 0
				memset(b + scan + 2, 0, sizeof(ccv_dense_matrix_t*) * (full_connect - scan - 1));
			}
			ccv_matrix_free(b[scan + 1]);
			memset(b + 1, 0, sizeof(ccv_dense_matrix_t*) * (scan + 1));
			if (t < !!symmetric)
				ccv_flip(b[0], &b[0], 0, CCV_FLIP_X);
		}
		ccv_matrix_free(b[0]);
		// now have everything in c, do the last full connect propagate
		b[full_connect] = c;
		for (j = full_connect; j < convnet->count; j++)
		{
			ccv_convnet_layer_t* layer = convnet->layers + j;
			assert(layer->type == CCV_CONVNET_FULL_CONNECT);
			_ccv_convnet_full_connect_forward_propagate_parallel(layer, b[j], b + j + 1);
			ccv_matrix_free(b[j]);
		}
		ccv_dense_matrix_t* softmax = 0;
		_ccv_convnet_compute_softmax_parallel(b[convnet->count], &softmax, 0);
		ccv_matrix_free(b[convnet->count]);
		ranks[i] = ccv_array_new(sizeof(ccv_classification_t), tops, 0);
		float* r = softmax->data.f32;
		assert(tops <= softmax->cols);
		for (j = 0; j < tops; j++)
		{
			float max_val = -1;
			int max_idx = -1;
			for (k = 0; k < softmax->cols; k++)
				if (r[k] >= 0 && r[k] > max_val)
					max_val = r[k], max_idx = k;
			assert(max_idx >= 0);
			r[max_idx] = -1;
			ccv_classification_t classification = {
				.id = max_idx,
				.confidence = max_val / ((!!symmetric + 1) * 5),
			};
			ccv_array_push(ranks[i], &classification);
		}
		ccv_matrix_free(softmax);
		memset(b, 0, sizeof(ccv_dense_matrix_t*) * (convnet->count + 1));
	}
#ifdef HAVE_CUDA
	}
#endif
}

#endif

#ifdef HAVE_GSL

// compute back propagated gradient & weight update delta
static void _ccv_convnet_convolutional_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* n, ccv_dense_matrix_t* m, ccv_dense_matrix_t** b, ccv_convnet_layer_t* update_params)
{
	// a is the input gradient (for back prop).
	// x is the input (for forward prop), b is the output gradient (gradient, or known as propagated error)
	// note that y (the output from forward prop) is not included because the full connect net is simple enough that we don't need it
	int rows, cols, partition;
	ccv_convnet_make_output(layer, layer->input.matrix.rows, layer->input.matrix.cols, &rows, &cols, &partition);
	int ch = layer->net.convolutional.channels;
	int count = layer->net.convolutional.count;
	int strides = layer->net.convolutional.strides;
	int border = layer->net.convolutional.border;
	int kernel_rows = layer->net.convolutional.rows;
	int kernel_cols = layer->net.convolutional.cols;
	assert(a->rows == rows);
	assert(a->cols == cols);
	assert(CCV_GET_CHANNEL(a->type) == count);
	int a_rows = a->rows, a_cols = a->cols, a_ch = CCV_GET_CHANNEL(a->type);
	a->rows = rows, a->cols = cols, a->type = (a->type - a_ch) | count;
	assert(CCV_GET_CHANNEL(m->type) == ch);
	assert(CCV_GET_DATA_TYPE(m->type) == CCV_32F);
	int count_per_partition = count / partition;
	int ch_per_partition = ch / partition;
	// update weight gradient
	parallel_for(k, count) {
		int i, j, x, y, c;
		int p = k / count_per_partition;
		float* mp = m->data.f32 + p * ch_per_partition;
		float* ap = a->data.f32 + k;
		float* np = n->data.f32 + k;
		float* update_w = update_params->w + k * kernel_rows * kernel_cols * ch_per_partition;
		float bias = 0;
		for (i = 0; i < rows; i++)
		{
			int comy = ccv_max(i * strides - border, 0) - (i * strides - border);
			int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(m->rows + border, i * strides + kernel_rows));
			comy *= ch_per_partition * kernel_cols;
			for (j = 0; j < cols; j++)
			{
				if (np[j * count] > 0)
				{ /* when np is bigger than 0, relu continues to update the weight, otherwise it stops */
					float v = ap[j * count];
					bias += v;
					int comx = ccv_max(j * strides - border, 0) - (j * strides - border);
					int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(m->cols + border, j * strides + kernel_cols));
					float* w = update_w + comx * ch_per_partition + comy;
					float* mpz = mp + ccv_max(j * strides - border, 0) * ch;
					/* when we have border, we simply do zero padding */
					for (y = 0; y < maxy; y++)
					{
						for (x = 0; x < maxx; x++)
							for (c = 0; c < ch_per_partition; c++)
								w[x * ch_per_partition + c] += v * mpz[x * ch + c];
						w += kernel_cols * ch_per_partition;
						mpz += m->cols * ch;
					}
				}
			}
			ap += a->cols * count;
			np += n->cols * count;
			mp += m->cols * ch * (ccv_max((i + 1) * strides - border, 0) - ccv_max(i * strides - border, 0));
		}
		update_params->bias[k] += bias;
	} parallel_endfor
	if (b)
	{
		ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, m->rows, m->cols, CCV_32F | CCV_GET_CHANNEL(m->type), CCV_32F | CCV_GET_CHANNEL(m->type), 0);
		// clear it up before propagate result
		ccv_zero(db);
		int k;
		for (k = 0; k < count; k++)
		{
			int i, j, x, y, c;
			int p = k / count_per_partition;
			float* bp = db->data.f32 + p * ch_per_partition;
			float* ap = a->data.f32 + k;
			float* np = n->data.f32 + k;
			float* layer_w = layer->w + k * kernel_rows * kernel_cols * ch_per_partition;
			for (i = 0; i < rows; i++)
			{
				int comy = ccv_max(i * strides - border, 0) - (i * strides - border);
				int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(db->rows + border, i * strides + kernel_rows));
				comy *= ch_per_partition * kernel_cols;
				for (j = 0; j < cols; j++)
				{
					if (np[j * count] > 0)
					{ /* when np is bigger than 0, relu continues to update the weight, otherwise it stops */
						float v = ap[j * count];
						int comx = ccv_max(j * strides - border, 0) - (j * strides - border);
						int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(db->cols + border, j * strides + kernel_cols));
						float* w = layer_w + comx * ch_per_partition + comy;
						float* bpz = bp + ccv_max(j * strides - border, 0) * ch;
						/* when we have border, we simply do zero padding */
						for (y = 0; y < maxy; y++)
						{
							for (x = 0; x < maxx; x++)
								for (c = 0; c < ch_per_partition; c++)
									bpz[x * ch + c] += v * w[x * ch_per_partition + c];
							w += kernel_cols * ch_per_partition;
							bpz += db->cols * ch;
						}
					}
				}
				ap += a->cols * count;
				np += n->cols * count;
				bp += db->cols * ch * (ccv_max((i + 1) * strides - border, 0) - ccv_max(i * strides - border, 0));
			}
		}
	}
	a->rows = a_rows, a->cols = a_cols, a->type = (a->type - CCV_GET_CHANNEL(a->type)) | a_ch;
}

static void _ccv_convnet_full_connect_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* y, ccv_dense_matrix_t* x, ccv_dense_matrix_t** b, ccv_convnet_layer_t* update_params)
{
	// a is the input gradient (for back prop), y is the output (for forward prop)
	// x is the input (for forward prop), b is the output gradient (gradient, or known as propagated error)
	ccv_dense_matrix_t* db = 0;
	if (b)
		db = *b = ccv_dense_matrix_renew(*b, x->rows, x->cols, CCV_32F | CCV_GET_CHANNEL(x->type), CCV_32F | CCV_GET_CHANNEL(x->type), 0);
	int x_rows = x->rows, x_cols = x->cols, x_ch = CCV_GET_CHANNEL(x->type);
	x->rows = x_rows * x_cols * x_ch, x->cols = 1, x->type = (x->type - x_ch) | CCV_C1;
	x->step = x->cols * CCV_GET_DATA_TYPE_SIZE(x->type);
	int i;
	if (layer->net.full_connect.relu)
		for (i = 0; i < y->rows; i++)
			if (y->data.f32[i] <= 0)
				a->data.f32[i] = 0;
	ccv_dense_matrix_t w = ccv_dense_matrix(a->rows, x->rows, CCV_32F | CCV_C1, update_params->w, 0);
	ccv_dense_matrix_t* dw = &w;
	// compute bias gradient
	ccv_dense_matrix_t bias = ccv_dense_matrix(a->rows, 1, CCV_32F | CCV_C1, update_params->bias, 0);
	ccv_dense_matrix_t* dbias = &bias;
	ccv_add(a, dbias, (ccv_matrix_t**)&dbias, 0);
	// compute weight gradient
	ccv_gemm(a, x, 1, dw, 1, CCV_B_TRANSPOSE, (ccv_matrix_t**)&dw, 0);
	w = ccv_dense_matrix(a->rows, x->rows, CCV_32F | CCV_C1, layer->w, 0);
	// propagate error
	if (db)
	{
		db->rows = x->rows, db->cols = x->cols, db->type = (db->type - x_ch) | CCV_C1;
		db->step = db->cols * CCV_GET_DATA_TYPE_SIZE(db->type);
		ccv_gemm(&w, a, 1, 0, 0, CCV_A_TRANSPOSE, (ccv_matrix_t**)&db, 0);
		db->rows = x_rows, db->cols = x_cols, db->type = (db->type - CCV_GET_CHANNEL(db->type)) | x_ch;
		db->step = db->cols * CCV_GET_DATA_TYPE_SIZE(db->type) * CCV_GET_CHANNEL(db->type);
	}
	x->rows = x_rows, x->cols = x_cols, x->type = (x->type - CCV_GET_CHANNEL(x->type)) | x_ch;
	x->step = x->cols * CCV_GET_DATA_TYPE_SIZE(x->type) * CCV_GET_CHANNEL(x->type);
}

static void _ccv_convnet_rnorm_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* n, ccv_dense_matrix_t* m, ccv_dense_matrix_t* denoms, ccv_dense_matrix_t** b)
{
	int rows, cols, partition;
	ccv_convnet_make_output(layer, layer->input.matrix.rows, layer->input.matrix.cols, &rows, &cols, &partition);
	int size = layer->net.rnorm.size;
	float alpha = layer->net.rnorm.alpha;
	float beta = layer->net.rnorm.beta;
	int way = size / 2;
	assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F);
	int ch = CCV_GET_CHANNEL(a->type);
	int type = CCV_32F | ch;
	ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, rows, cols, type, type, 0);
	int i, j, k, x, p;
	float* ap = a->data.f32;
	float* np = n->data.f32;
	float* mp = m->data.f32;
	float* dp = denoms->data.f32;
	float* bp = db->data.f32;
	int ch_per_partition = ch / partition;
	for (i = 0; i < db->rows; i++)
	{
		for (j = 0; j < db->cols; j++)
			for (p = 0; p < partition; p++)
				for (k = 0; k < ch_per_partition; k++)
				{
					float nom = 0;
					for (x = ccv_max(k - way, 0); x <= ccv_min(k + way, ch_per_partition - 1); x++)
						nom += -2 * alpha * beta * ap[j * ch + x + p * ch_per_partition] * np[j * ch + x + p * ch_per_partition] / dp[j * ch + x + p * ch_per_partition];
					bp[j * ch + k + p * ch_per_partition] = mp[j * ch + k + p * ch_per_partition] * nom + ap[j * ch + k + p * ch_per_partition] * powf(dp[j * ch + k + p * ch_per_partition], -beta);
				}
		ap += a->cols * ch;
		np += n->cols * ch;
		mp += m->cols * ch;
		dp += denoms->cols * ch;
		bp += db->cols * ch;
	}
}

static void _ccv_convnet_max_pool_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* n, ccv_dense_matrix_t* m, ccv_dense_matrix_t** b)
{
	// a is the input gradient (for back prop), y is the output (from forward prop),
	// x is the input (for forward prop), b is the output gradient (gradient, or known as propagated error)
	// pooling layer doesn't need the dropout
	if (b)
	{
		assert(CCV_GET_CHANNEL(a->type) == CCV_GET_CHANNEL(n->type));
		assert(CCV_GET_CHANNEL(a->type) == CCV_GET_CHANNEL(m->type));
		int ch = CCV_GET_CHANNEL(a->type);
		ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, m->rows, m->cols, CCV_32F | ch, CCV_32F | ch, 0);
		ccv_zero(db);
		int size = layer->net.pool.size;
		int strides = layer->net.pool.strides;
		int border = layer->net.pool.border;
		int i, j, k, x, y;
		float* ap = a->data.f32;
		float* bp = db->data.f32;
		float* np = n->data.f32;
		float* mp = m->data.f32;
		for (i = 0; i < a->rows; i++)
		{
			const int start_y = ccv_max(i * strides - border, 0) - (i * strides - border);
			const int end_y = size + ccv_min(i * strides + size - border, db->rows) - (i * strides + size - border);
			for (j = 0; j < a->cols; j++)
			{
				const int start_x = ccv_max(j * strides - border, 0) - (j * strides - border);
				const int end_x = size + ccv_min(j * strides + size - border, db->cols) - (j * strides + size - border);
				for (k = 0; k < ch; k++)
				{
					float v = np[j * ch + k];
					float u = ap[j * ch + k];
					for (y = start_y; y < end_y; y++)
						for (x = start_x; x < end_x; x++)
							// we have to do direct comparison otherwise it will contribute to too many cells
							// and the propagation won't work. But CPU will have different result comparing with GPU
							if (mp[(j * strides - border + x + (y - border) * m->cols) * ch + k] == v)
								bp[(j * strides - border + x + (y - border) * db->cols) * ch + k] += u;
				}
			}
			ap += a->cols * ch;
			np += n->cols * ch;
			bp += db->cols * ch * strides;
			mp += m->cols * ch * strides;
		}
	}
}

static void _ccv_convnet_average_pool_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* m, ccv_dense_matrix_t** b)
{
	// a is the input gradient (for back prop), y is the output (from forward prop),
	// x is the input (for forward prop), b is the output gradient (gradient, or known as propagated error)
	// pooling layer doesn't need the dropout
	if (b)
	{
		assert(CCV_GET_CHANNEL(a->type) == CCV_GET_CHANNEL(m->type));
		int ch = CCV_GET_CHANNEL(a->type);
		ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, m->rows, m->cols, CCV_32F | ch, CCV_32F | ch, 0);
		ccv_zero(db);
		int size = layer->net.pool.size;
		int strides = layer->net.pool.strides;
		int border = layer->net.pool.border;
		int i, j, k, x, y;
		float* ap = a->data.f32;
		float* bp = db->data.f32;
		for (i = 0; i < a->rows; i++)
		{
			const int start_y = ccv_max(i * strides - border, 0) - (i * strides - border);
			const int end_y = size + ccv_min(i * strides + size - border, db->rows) - (i * strides + size - border);
			for (j = 0; j < a->cols; j++)
			{
				const int start_x = ccv_max(j * strides - border, 0) - (j * strides - border);
				const int end_x = size + ccv_min(j * strides + size - border, db->cols) - (j * strides + size - border);
				for (k = 0; k < ch; k++)
				{
					float u = ap[j * ch + k] / ((end_x - start_x) * (end_y - start_y));
					for (y = start_y; y < end_y; y++)
						for (x = start_x; x < end_x; x++)
							bp[(j * strides - border + x + (y - border) * db->cols) * ch + k] += u;
				}
			}
			ap += a->cols * ch;
			bp += db->cols * ch * strides;
		}
	}
}

static void _ccv_convnet_propagate_loss(ccv_convnet_t* convnet, ccv_dense_matrix_t* a, ccv_dense_matrix_t* dloss, ccv_convnet_t* update_params)
{
	int i;
	ccv_convnet_layer_t* layer = convnet->layers + convnet->count - 1;
	assert(layer->type == CCV_CONVNET_FULL_CONNECT); // the last layer has too be a full connect one to generate softmax result
	_ccv_convnet_full_connect_backward_propagate(layer, dloss, convnet->acts[convnet->count - 1], convnet->acts[convnet->count - 2], convnet->count - 1 > 0 ? update_params->acts + convnet->count - 2 : 0, update_params->layers + convnet->count - 1);
	for (i = convnet->count - 2; i >= 0; i--)
	{
		layer = convnet->layers + i;
		switch (layer->type)
		{
			case CCV_CONVNET_CONVOLUTIONAL:
				_ccv_convnet_convolutional_backward_propagate(layer, update_params->acts[i], convnet->acts[i], i > 0 ? convnet->acts[i - 1] : a, i > 0 ? update_params->acts + i - 1 : 0, update_params->layers + i);
				break;
			case CCV_CONVNET_FULL_CONNECT:
				_ccv_convnet_full_connect_backward_propagate(layer, update_params->acts[i], convnet->acts[i], i > 0 ? convnet->acts[i - 1] : a, i > 0 ? update_params->acts + i - 1 : 0, update_params->layers + i);
				break;
			case CCV_CONVNET_LOCAL_RESPONSE_NORM:
				_ccv_convnet_rnorm_backward_propagate(layer, update_params->acts[i], convnet->acts[i], i > 0 ? convnet->acts[i - 1] : a, convnet->denoms[i], i > 0 ? update_params->acts + i - 1 : 0);
				break;
			case CCV_CONVNET_MAX_POOL:
				_ccv_convnet_max_pool_backward_propagate(layer, update_params->acts[i], convnet->acts[i], i > 0 ? convnet->acts[i - 1] : a, i > 0 ? update_params->acts + i - 1 : 0);
				break;
			case CCV_CONVNET_AVERAGE_POOL:
				_ccv_convnet_average_pool_backward_propagate(layer, update_params->acts[i], i > 0 ? convnet->acts[i - 1] : a, i > 0 ? update_params->acts + i - 1 : 0);
				break;
		}
	}
}

static void _ccv_convnet_update(ccv_convnet_t* convnet, int batch, ccv_convnet_t* momentum, ccv_convnet_t* update_params, ccv_convnet_layer_train_param_t* layer_params)
{
	int i, j;
	float learn_rate;
	for (i = 0; i < convnet->count; i++)
		switch (update_params->layers[i].type)
		{
			case CCV_CONVNET_CONVOLUTIONAL:
			{
				float* w = convnet->layers[i].w;
				float* vw = momentum->layers[i].w;
				float* dw = update_params->layers[i].w;
				learn_rate = layer_params[i].w.learn_rate / batch;
				for (j = 0; j < convnet->layers[i].wnum; j++)
				{
					vw[j] = layer_params[i].w.momentum * vw[j] - layer_params[i].w.decay * layer_params[i].w.learn_rate * w[j] + learn_rate * dw[j];
					w[j] += vw[j];
				}
				float* bias = convnet->layers[i].bias;
				float* vbias = momentum->layers[i].bias;
				float* dbias = update_params->layers[i].bias;
				learn_rate = layer_params[i].bias.learn_rate / batch;
				for (j = 0; j < convnet->layers[i].net.convolutional.count; j++)
				{
					vbias[j] = layer_params[i].bias.momentum * vbias[j] - layer_params[i].bias.decay * layer_params[i].bias.learn_rate * bias[j] + learn_rate * dbias[j];
					bias[j] += vbias[j];
				}
				break;
			}
			case CCV_CONVNET_FULL_CONNECT:
			{
				float* w = convnet->layers[i].w;
				float* vw = momentum->layers[i].w;
				float* dw = update_params->layers[i].w;
				learn_rate = layer_params[i].w.learn_rate / batch;
				for (j = 0; j < convnet->layers[i].wnum; j++)
				{
					vw[j] = layer_params[i].w.momentum * vw[j] - layer_params[i].w.decay * layer_params[i].w.learn_rate * w[j] + learn_rate * dw[j];
					w[j] += vw[j];
				}
				float* bias = convnet->layers[i].bias;
				float* vbias = momentum->layers[i].bias;
				float* dbias = update_params->layers[i].bias;
				learn_rate = layer_params[i].bias.learn_rate / batch;
				for (j = 0; j < convnet->layers[i].net.full_connect.count; j++)
				{
					vbias[j] = layer_params[i].bias.momentum * vbias[j] - layer_params[i].bias.decay * layer_params[i].bias.learn_rate * bias[j] + learn_rate * dbias[j];
					bias[j] += vbias[j];
				}
				break;
			}
		}
}

static void _ccv_convnet_update_zero(ccv_convnet_t* update_params)
{
	int i;
	for (i = 0; i < update_params->count; i++)
		switch (update_params->layers[i].type)
		{
			case CCV_CONVNET_CONVOLUTIONAL:
				memset(update_params->layers[i].w, 0, sizeof(float) * update_params->layers[i].wnum);
				memset(update_params->layers[i].bias, 0, sizeof(float) * update_params->layers[i].net.convolutional.count);
				break;
			case CCV_CONVNET_FULL_CONNECT:
				assert(update_params->layers[i].wnum % update_params->layers[i].net.full_connect.count == 0);
				memset(update_params->layers[i].w, 0, sizeof(float) * update_params->layers[i].wnum);
				memset(update_params->layers[i].bias, 0, sizeof(float) * update_params->layers[i].net.full_connect.count);
				break;
		}
}

static ccv_convnet_t* _ccv_convnet_update_new(ccv_convnet_t* convnet)
{
	ccv_convnet_t* update_params = (ccv_convnet_t*)ccmalloc(sizeof(ccv_convnet_t) + sizeof(ccv_convnet_layer_t) * convnet->count + sizeof(ccv_dense_matrix_t*) * convnet->count);
	update_params->reserved = 0;
	update_params->layers = (ccv_convnet_layer_t*)(update_params + 1);
	update_params->acts = (ccv_dense_matrix_t**)(update_params->layers + convnet->count);
	memset(update_params->acts, 0, sizeof(ccv_dense_matrix_t*) * convnet->count);
	update_params->denoms = 0;
	update_params->input = convnet->input;
	update_params->rows = convnet->rows;
	update_params->cols = convnet->cols;
	update_params->count = convnet->count;
	update_params->channels = convnet->channels;
	update_params->mean_activity = 0;
	int i;
	for (i = 0; i < convnet->count; i++)
	{
		update_params->layers[i].type = convnet->layers[i].type;
		update_params->layers[i].input = convnet->layers[i].input;
		update_params->layers[i].net = convnet->layers[i].net;
		update_params->layers[i].wnum = convnet->layers[i].wnum;
		update_params->layers[i].reserved = 0;
		switch (update_params->layers[i].type)
		{
			case CCV_CONVNET_CONVOLUTIONAL:
				update_params->layers[i].w = (float*)cccalloc(update_params->layers[i].wnum + update_params->layers[i].net.convolutional.count, sizeof(float));
				update_params->layers[i].bias = update_params->layers[i].w + update_params->layers[i].wnum;
				break;
			case CCV_CONVNET_FULL_CONNECT:
				assert(update_params->layers[i].wnum % update_params->layers[i].net.full_connect.count == 0);
				update_params->layers[i].w = (float*)cccalloc(update_params->layers[i].wnum + update_params->layers[i].net.full_connect.count, sizeof(float));
				update_params->layers[i].bias = update_params->layers[i].w + update_params->layers[i].wnum;
				break;
			case CCV_CONVNET_LOCAL_RESPONSE_NORM:
			case CCV_CONVNET_MAX_POOL:
			case CCV_CONVNET_AVERAGE_POOL:
				update_params->layers[i].w = 0;
				update_params->layers[i].bias = 0;
				break;
		}
	}
	return update_params;
}

static void _ccv_convnet_compute_softmax(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type)
{
	int ch = CCV_GET_CHANNEL(a->type);
	assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F);
	ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, a->rows, a->cols, CCV_32F | ch, CCV_32F | ch, 0);
	int i;
	float* aptr = a->data.f32;
	float* bptr = db->data.f32;
	double max = aptr[0];
	for (i = 1; i < a->rows * a->cols * ch; i++)
		if (aptr[i] > max)
			max = aptr[i];
	double tt = 0;
	for (i = 0; i < a->rows * a->cols * ch; i++)
		tt += (bptr[i] = expf(aptr[i] - max));
	tt = 1.0 / tt;
	for (i = 0; i < a->rows * a->cols * ch; i++)
		bptr[i] *= tt;
}

static void _ccv_convnet_classify(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, int* labels, int batch)
{
	assert(batch == 1);
	ccv_convnet_encode(convnet, a, convnet->acts + convnet->count - 1, 1);
	int i, c = 0;
	ccv_dense_matrix_t* b = convnet->acts[convnet->count - 1];
	float maxc = b->data.f32[0];
	for (i = 1; i < b->rows; i++)
		if (b->data.f32[i] > maxc)
			maxc = b->data.f32[i], c = i;
	labels[0] = c;
}

#endif

#ifndef CASE_TESTS

void ccv_convnet_supervised_train(ccv_convnet_t* convnet, ccv_array_t* categorizeds, ccv_array_t* tests, const char* filename, ccv_convnet_train_param_t params)
{
#ifdef HAVE_GSL
#ifdef HAVE_CUDA
	if (convnet->use_cwc_accel)
		cwc_convnet_supervised_train(convnet, categorizeds, tests, filename, params);
	else {
#endif
	int i, j, t;
	gsl_rng_env_setup();
	gsl_rng* rng = gsl_rng_alloc(gsl_rng_default);
	int aligned_padding = categorizeds->rnum % params.mini_batch;
	int aligned_rnum = categorizeds->rnum - aligned_padding;
	int* idx = (int*)ccmalloc(sizeof(int) * (categorizeds->rnum + aligned_padding));
	for (i = 0; i < categorizeds->rnum; i++)
		idx[i] = i;
	gsl_ran_shuffle(rng, idx, categorizeds->rnum, sizeof(int));
	// the last layer has to be full connect, thus we can use it as softmax layer
	assert(convnet->layers[convnet->count - 1].type == CCV_CONVNET_FULL_CONNECT);
	int category_count = convnet->layers[convnet->count - 1].net.full_connect.count;
	ccv_convnet_t* update_params = _ccv_convnet_update_new(convnet);
	ccv_convnet_t* momentum = _ccv_convnet_update_new(convnet);
	for (t = 0; t < params.max_epoch; t++)
	{
		for (i = 0; i < aligned_rnum; i++)
		{
			// dropout the first hidden layer
			ccv_categorized_t* categorized = (ccv_categorized_t*)ccv_array_get(categorizeds, idx[i]);
			ccv_convnet_encode(convnet, &categorized->matrix, convnet->acts + convnet->count - 1, 1);
			ccv_dense_matrix_t* softmax = convnet->acts[convnet->count - 1];
			float* dloss = softmax->data.f32;
			_ccv_convnet_compute_softmax(softmax, &softmax, 0);
			assert(softmax->rows == category_count && softmax->cols == 1);
			// this mashes softmax and logistic regression together
			// also, it gives you -D[loss w.r.t. to x_i] (note the negative sign)
			for (j = 0; j < category_count; j++)
				dloss[j] = (j == categorized->c) - dloss[j];
			_ccv_convnet_propagate_loss(convnet, categorized->matrix, softmax, update_params);
			if ((i + 1) % params.mini_batch == 0)
			{
				FLUSH(CCV_CLI_INFO, " - at epoch %03d / %d => stochastic gradient descent at %d / %d", t + 1, params.max_epoch, (i + 1) / params.mini_batch, aligned_rnum / params.mini_batch);
				// update weights
				_ccv_convnet_update(convnet, params.mini_batch, momentum, update_params, params.layer_params);
				_ccv_convnet_update_zero(update_params);
				// compact the convnet to avoid any staled temporary resource
				ccv_convnet_compact(convnet);
			}
		}
		int miss = 0;
		for (i = 0; i < tests->rnum; i++)
		{
			FLUSH(CCV_CLI_INFO, " - at epoch %03d / %d => going through %d / %d for tests", t + 1, params.max_epoch, i + 1, tests->rnum);
			ccv_categorized_t* test = (ccv_categorized_t*)ccv_array_get(tests, i);
			int c = 0;
			_ccv_convnet_classify(convnet, &test->matrix, &c, 1);
			if (c != test->c)
				++miss;
		}
		FLUSH(CCV_CLI_INFO, " - at epoch %03d / %d => with miss rate %.2f%%\n", t + 1, params.max_epoch, miss * 100.0f / tests->rnum);
		if (t + 1 < params.max_epoch)
		{
			// reshuffle the parts we visited and move the rest to the beginning
			memcpy(idx + categorizeds->rnum, idx + aligned_rnum, sizeof(int) * aligned_padding);
			memmove(idx + aligned_padding, idx, sizeof(int) * aligned_rnum);
			memcpy(idx, idx + categorizeds->rnum, sizeof(int) * aligned_padding);
			gsl_ran_shuffle(rng, idx + aligned_padding, aligned_rnum, sizeof(int));
		}
	}
	ccfree(idx);
	ccv_convnet_free(momentum);
	ccv_convnet_free(update_params);
	gsl_rng_free(rng);
#ifdef HAVE_CUDA
	}
#endif
#else
	assert(0 && "ccv_convnet_supervised_train requires GSL library support");
#endif
}

void ccv_convnet_compact(ccv_convnet_t* convnet)
{
#ifdef HAVE_CUDA
	cwc_convnet_compact(convnet);
#endif
	int i;
	for (i = 0; i < convnet->count; i++)
	{
		if (convnet->acts[i])
			ccv_matrix_free(convnet->acts[i]);
		convnet->acts[i] = 0;
		if (convnet->denoms)
		{
			if (convnet->denoms[i])
				ccv_matrix_free(convnet->denoms[i]);
			convnet->denoms[i] = 0;
		}
		if (SIMD(convnet->layers + i))
		{
			ccfree(convnet->layers[i].reserved);
			convnet->layers[i].reserved = 0;
		}
	}
}

void ccv_convnet_write(ccv_convnet_t* convnet, const char* filename, ccv_convnet_write_param_t params)
{
	sqlite3* db = 0;
	if (SQLITE_OK == sqlite3_open(filename, &db))
	{
		const char layer_create_table_qs[] =
			"CREATE TABLE IF NOT EXISTS layer_params "
			"(layer INTEGER PRIMARY KEY ASC, type INTEGER, "
			"input_matrix_rows INTEGER, input_matrix_cols INTEGER, input_matrix_channels INTEGER, input_matrix_partition INTEGER, input_node_count INTEGER, "
			"output_rows INTEGER, output_cols INTEGER, output_channels INTEGER, output_partition INTEGER, output_count INTEGER, output_strides INTEGER, output_border INTEGER, "
			"output_size INTEGER, output_kappa REAL, output_alpha REAL, output_beta REAL, output_relu INTEGER);"
			"CREATE TABLE IF NOT EXISTS convnet_params "
			"(convnet INTEGER PRIMARY KEY ASC, input_height INTEGER, input_width INTEGER, mean_activity BLOB);"
			"CREATE TABLE IF NOT EXISTS layer_data "
			"(layer INTEGER PRIMARY KEY ASC, weight BLOB, bias BLOB, half_precision INTEGER);";
		assert(SQLITE_OK == sqlite3_exec(db, layer_create_table_qs, 0, 0, 0));
		const char layer_params_insert_qs[] = 
			"REPLACE INTO layer_params "
			"(layer, type, "
			"input_matrix_rows, input_matrix_cols, input_matrix_channels, input_matrix_partition, input_node_count, "
			"output_rows, output_cols, output_channels, output_partition, output_count, output_strides, output_border, "
			"output_size, output_kappa, output_alpha, output_beta, output_relu) VALUES "
			"($layer, $type, " // 1
			"$input_matrix_rows, $input_matrix_cols, $input_matrix_channels, $input_matrix_partition, $input_node_count, " // 6
			"$output_rows, $output_cols, $output_channels, $output_partition, $output_count, $output_strides, $output_border, " // 13
			"$output_size, $output_kappa, $output_alpha, $output_beta, $output_relu);"; // 18
		sqlite3_stmt* layer_params_insert_stmt = 0;
		assert(SQLITE_OK == sqlite3_prepare_v2(db, layer_params_insert_qs, sizeof(layer_params_insert_qs), &layer_params_insert_stmt, 0));
		const char layer_data_insert_qs[] =
			"REPLACE INTO layer_data "
			"(layer, weight, bias, half_precision) VALUES ($layer, $weight, $bias, $half_precision);";
		sqlite3_stmt* layer_data_insert_stmt = 0;
		assert(SQLITE_OK == sqlite3_prepare_v2(db, layer_data_insert_qs, sizeof(layer_data_insert_qs), &layer_data_insert_stmt, 0));
		int i;
		for (i = 0; i < convnet->count; i++)
		{
			ccv_convnet_layer_t* layer = convnet->layers + i;
			// insert layer params
			sqlite3_bind_int(layer_params_insert_stmt, 1, i);
			sqlite3_bind_int(layer_params_insert_stmt, 2, layer->type);
			sqlite3_bind_int(layer_params_insert_stmt, 3, layer->input.matrix.rows);
			sqlite3_bind_int(layer_params_insert_stmt, 4, layer->input.matrix.cols);
			sqlite3_bind_int(layer_params_insert_stmt, 5, layer->input.matrix.channels);
			sqlite3_bind_int(layer_params_insert_stmt, 6, layer->input.matrix.partition);
			sqlite3_bind_int(layer_params_insert_stmt, 7, layer->input.node.count);
			switch (layer->type)
			{
				case CCV_CONVNET_CONVOLUTIONAL:
					sqlite3_bind_int(layer_params_insert_stmt, 8, layer->net.convolutional.rows);
					sqlite3_bind_int(layer_params_insert_stmt, 9, layer->net.convolutional.cols);
					sqlite3_bind_int(layer_params_insert_stmt, 10, layer->net.convolutional.channels);
					sqlite3_bind_int(layer_params_insert_stmt, 11, layer->net.convolutional.partition);
					sqlite3_bind_int(layer_params_insert_stmt, 12, layer->net.convolutional.count);
					sqlite3_bind_int(layer_params_insert_stmt, 13, layer->net.convolutional.strides);
					sqlite3_bind_int(layer_params_insert_stmt, 14, layer->net.convolutional.border);
					break;
				case CCV_CONVNET_FULL_CONNECT:
					sqlite3_bind_int(layer_params_insert_stmt, 12, layer->net.full_connect.count);
					sqlite3_bind_int(layer_params_insert_stmt, 19, layer->net.full_connect.relu);
					break;
				case CCV_CONVNET_MAX_POOL:
				case CCV_CONVNET_AVERAGE_POOL:
					sqlite3_bind_int(layer_params_insert_stmt, 13, layer->net.pool.strides);
					sqlite3_bind_int(layer_params_insert_stmt, 14, layer->net.pool.border);
					sqlite3_bind_int(layer_params_insert_stmt, 15, layer->net.pool.size);
					break;
				case CCV_CONVNET_LOCAL_RESPONSE_NORM:
					sqlite3_bind_int(layer_params_insert_stmt, 15, layer->net.rnorm.size);
					sqlite3_bind_double(layer_params_insert_stmt, 16, layer->net.rnorm.kappa);
					sqlite3_bind_double(layer_params_insert_stmt, 17, layer->net.rnorm.alpha);
					sqlite3_bind_double(layer_params_insert_stmt, 18, layer->net.rnorm.beta);
					break;
			}
			assert(SQLITE_DONE == sqlite3_step(layer_params_insert_stmt));
			sqlite3_reset(layer_params_insert_stmt);
			sqlite3_clear_bindings(layer_params_insert_stmt);
			// insert layer data
			if (layer->type == CCV_CONVNET_CONVOLUTIONAL || layer->type == CCV_CONVNET_FULL_CONNECT)
			{
				sqlite3_bind_int(layer_data_insert_stmt, 1, i);
				if (params.half_precision)
				{
					uint16_t* w = (uint16_t*)ccmalloc(sizeof(uint16_t) * layer->wnum);
					ccv_float_to_half_precision(layer->w, w, layer->wnum);
					uint16_t* bias = (uint16_t*)ccmalloc(sizeof(uint16_t) * (layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count));
					ccv_float_to_half_precision(layer->bias, bias, layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count);
					sqlite3_bind_blob(layer_data_insert_stmt, 2, w, sizeof(uint16_t) * layer->wnum, ccfree);
					sqlite3_bind_blob(layer_data_insert_stmt, 3, bias, sizeof(uint16_t) * (layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count), ccfree);
				} else {
					sqlite3_bind_blob(layer_data_insert_stmt, 2, layer->w, sizeof(float) * layer->wnum, SQLITE_STATIC);
					sqlite3_bind_blob(layer_data_insert_stmt, 3, layer->bias, sizeof(float) * (layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count), SQLITE_STATIC);
				}
				sqlite3_bind_int(layer_data_insert_stmt, 4, params.half_precision);
				assert(SQLITE_DONE == sqlite3_step(layer_data_insert_stmt));
				sqlite3_reset(layer_data_insert_stmt);
				sqlite3_clear_bindings(layer_data_insert_stmt);
			}
		}
		// insert convnet related params
		const char convnet_params_insert_qs[] =
			"REPLACE INTO convnet_params "
			"(convnet, mean_activity, input_height, input_width) VALUES (0, $mean_activity, $input_height, $input_width);";
		sqlite3_stmt* convnet_params_insert_stmt = 0;
		assert(SQLITE_OK == sqlite3_prepare_v2(db, convnet_params_insert_qs, sizeof(convnet_params_insert_qs), &convnet_params_insert_stmt, 0));
		assert(convnet->mean_activity->rows == convnet->input.height);
		assert(convnet->mean_activity->cols == convnet->input.width);
		assert(CCV_GET_CHANNEL(convnet->mean_activity->type) == convnet->channels);
		assert(CCV_GET_DATA_TYPE(convnet->mean_activity->type) == CCV_32F);
		sqlite3_bind_blob(convnet_params_insert_stmt, 1, convnet->mean_activity->data.f32, sizeof(float) * convnet->input.height * convnet->input.width * convnet->channels, SQLITE_STATIC);
		sqlite3_bind_int(convnet_params_insert_stmt, 2, convnet->input.height);
		sqlite3_bind_int(convnet_params_insert_stmt, 3, convnet->input.width);
		assert(SQLITE_DONE == sqlite3_step(convnet_params_insert_stmt));
		sqlite3_reset(convnet_params_insert_stmt);
		sqlite3_clear_bindings(convnet_params_insert_stmt);

		sqlite3_finalize(layer_params_insert_stmt);
		sqlite3_finalize(layer_data_insert_stmt);
		sqlite3_finalize(convnet_params_insert_stmt);
		sqlite3_close(db);
	}
}

ccv_convnet_t* ccv_convnet_read(int use_cwc_accel, const char* filename)
{
	sqlite3* db = 0;
	if (SQLITE_OK == sqlite3_open(filename, &db))
	{
		ccv_convnet_t* convnet = 0;
		sqlite3_stmt* layer_params_stmt = 0;
		// load layer params
		const char layer_params_qs[] =
			"SELECT type, " // 1
			"input_matrix_rows, input_matrix_cols, input_matrix_channels, input_matrix_partition, input_node_count, " // 6
			"output_rows, output_cols, output_channels, output_partition, output_count, output_strides, output_border, " // 13
			"output_size, output_kappa, output_alpha, output_beta, output_relu FROM layer_params ORDER BY layer ASC;"; // 18
		if (SQLITE_OK == sqlite3_prepare_v2(db, layer_params_qs, sizeof(layer_params_qs), &layer_params_stmt, 0))
		{
			ccv_array_t* layer_params = ccv_array_new(sizeof(ccv_convnet_layer_param_t), 3, 0);
			while (sqlite3_step(layer_params_stmt) == SQLITE_ROW)
			{
				ccv_convnet_layer_param_t layer_param;
				layer_param.type = sqlite3_column_int(layer_params_stmt, 0);
				layer_param.input.matrix.rows = sqlite3_column_int(layer_params_stmt, 1);
				layer_param.input.matrix.cols = sqlite3_column_int(layer_params_stmt, 2);
				layer_param.input.matrix.channels = sqlite3_column_int(layer_params_stmt, 3);
				layer_param.input.matrix.partition = sqlite3_column_int(layer_params_stmt, 4);
				layer_param.input.node.count = sqlite3_column_int(layer_params_stmt, 5);
				layer_param.bias = layer_param.glorot = 0; // this is irrelevant to read convnet
				switch (layer_param.type)
				{
					case CCV_CONVNET_CONVOLUTIONAL:
						layer_param.output.convolutional.rows = sqlite3_column_int(layer_params_stmt, 6);
						layer_param.output.convolutional.cols = sqlite3_column_int(layer_params_stmt, 7);
						layer_param.output.convolutional.channels = sqlite3_column_int(layer_params_stmt, 8);
						layer_param.output.convolutional.partition = sqlite3_column_int(layer_params_stmt, 9);
						layer_param.output.convolutional.count = sqlite3_column_int(layer_params_stmt, 10);
						layer_param.output.convolutional.strides = sqlite3_column_int(layer_params_stmt, 11);
						layer_param.output.convolutional.border = sqlite3_column_int(layer_params_stmt, 12);
						break;
					case CCV_CONVNET_FULL_CONNECT:
						layer_param.output.full_connect.count = sqlite3_column_int(layer_params_stmt, 10);
						layer_param.output.full_connect.relu = sqlite3_column_int(layer_params_stmt, 17);
						break;
					case CCV_CONVNET_MAX_POOL:
					case CCV_CONVNET_AVERAGE_POOL:
						layer_param.output.pool.strides = sqlite3_column_int(layer_params_stmt, 11);
						layer_param.output.pool.border = sqlite3_column_int(layer_params_stmt, 12);
						layer_param.output.pool.size = sqlite3_column_int(layer_params_stmt, 13);
						break;
					case CCV_CONVNET_LOCAL_RESPONSE_NORM:
						layer_param.output.rnorm.size = sqlite3_column_int(layer_params_stmt, 13);
						layer_param.output.rnorm.kappa = sqlite3_column_double(layer_params_stmt, 14);
						layer_param.output.rnorm.alpha = sqlite3_column_double(layer_params_stmt, 15);
						layer_param.output.rnorm.beta = sqlite3_column_double(layer_params_stmt, 16);
						break;
				}
				ccv_array_push(layer_params, &layer_param);
			}
			sqlite3_finalize(layer_params_stmt);
			sqlite3_stmt* convnet_params_input_stmt = 0;
			// load convnet params for input
			const char convnet_params_input_qs[] =
				"SELECT input_height, input_width FROM convnet_params WHERE convnet = 0;";
			ccv_size_t input = ccv_size(0, 0);
			if (SQLITE_OK == sqlite3_prepare_v2(db, convnet_params_input_qs, sizeof(convnet_params_input_qs), &convnet_params_input_stmt, 0))
			{
				if (sqlite3_step(convnet_params_input_stmt) == SQLITE_ROW)
				{
					input.height = sqlite3_column_int(convnet_params_input_stmt, 0);
					input.width = sqlite3_column_int(convnet_params_input_stmt, 1);
				}
				sqlite3_finalize(convnet_params_input_stmt);
			}
			assert(input.height != 0 && input.width != 0);
			convnet = ccv_convnet_new(use_cwc_accel, input, (ccv_convnet_layer_param_t*)ccv_array_get(layer_params, 0), layer_params->rnum);
			ccv_array_free(layer_params);
			// load layer data
			sqlite3_stmt* layer_data_stmt = 0;
			const char layer_data_qs[] =
				"SELECT layer, weight, bias, half_precision FROM layer_data;";
			if (SQLITE_OK == sqlite3_prepare_v2(db, layer_data_qs, sizeof(layer_data_qs), &layer_data_stmt, 0))
			{
				while (sqlite3_step(layer_data_stmt) == SQLITE_ROW)
				{
					ccv_convnet_layer_t* layer = convnet->layers + sqlite3_column_int(layer_data_stmt, 0);
					int half_precision = sqlite3_column_int(layer_data_stmt, 3);
					int wnum = sqlite3_column_bytes(layer_data_stmt, 1) / (half_precision ? sizeof(uint16_t) : sizeof(float));
					// if weights available, load weights
					if (wnum == layer->wnum)
					{
						const void* w = sqlite3_column_blob(layer_data_stmt, 1);
						if (half_precision)
						{
							float* f = (float*)ccmalloc(sizeof(float) * layer->wnum);
							ccv_half_precision_to_float((uint16_t*)w, f, layer->wnum);
							w = f;
						}
						switch (layer->type)
						{
							case CCV_CONVNET_CONVOLUTIONAL:
								memcpy(layer->w, w, sizeof(float) * layer->wnum);
								break;
							case CCV_CONVNET_FULL_CONNECT:
								memcpy(layer->w, w, sizeof(float) * layer->wnum);
								break;
						}
						if (half_precision)
							ccfree((void*)w);
					}
					int bnum = sqlite3_column_bytes(layer_data_stmt, 2) / (half_precision ? sizeof(uint16_t) : sizeof(float));
					// if bias available, load bias
					if (bnum == (layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count))
					{
						const void* bias = sqlite3_column_blob(layer_data_stmt, 2);
						if (half_precision)
						{
							float* f = (float*)ccmalloc(sizeof(float) * (layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count));
							ccv_half_precision_to_float((uint16_t*)bias, f, layer->type == CCV_CONVNET_CONVOLUTIONAL ? layer->net.convolutional.count : layer->net.full_connect.count);
							bias = f;
						}
						switch (layer->type)
						{
							case CCV_CONVNET_CONVOLUTIONAL:
								memcpy(layer->bias, bias, sizeof(float) * layer->net.convolutional.count);
								break;
							case CCV_CONVNET_FULL_CONNECT:
								memcpy(layer->bias, bias, sizeof(float) * layer->net.full_connect.count);
								break;
						}
						if (half_precision)
							ccfree((void*)bias);
					}
				}
				sqlite3_finalize(layer_data_stmt);
			}
			sqlite3_stmt* convnet_params_mean_activity_stmt = 0;
			// load convnet params for mean activity
			const char convnet_params_mean_activity_qs[] =
				"SELECT mean_activity FROM convnet_params WHERE convnet = 0;";
			if (SQLITE_OK == sqlite3_prepare_v2(db, convnet_params_mean_activity_qs, sizeof(convnet_params_mean_activity_qs), &convnet_params_mean_activity_stmt, 0))
			{
				if (sqlite3_step(convnet_params_mean_activity_stmt) == SQLITE_ROW)
				{
					int elems = sqlite3_column_bytes(convnet_params_mean_activity_stmt, 0) / sizeof(float);
					if (elems == convnet->input.height * convnet->input.width * convnet->channels)
						memcpy(convnet->mean_activity->data.f32, sqlite3_column_blob(convnet_params_mean_activity_stmt, 0), sizeof(float) * elems);
				}
				sqlite3_finalize(convnet_params_mean_activity_stmt);
			}
		}
		sqlite3_close(db);
		return convnet;
	}
	return 0;
}

void ccv_convnet_input_formation(ccv_size_t input, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b)
{
	if (a->rows > input.height && a->cols > input.width)
		ccv_resample(a, b, CCV_32F, ccv_max(input.height, (int)(a->rows * (float)input.height / a->cols + 0.5)), ccv_max(input.width, (int)(a->cols * (float)input.width / a->rows + 0.5)), CCV_INTER_AREA);
	else if (a->rows < input.height || a->cols < input.width)
		ccv_resample(a, b, CCV_32F, ccv_max(input.height, (int)(a->rows * (float)input.height / a->cols + 0.5)), ccv_max(input.width, (int)(a->cols * (float)input.width / a->rows + 0.5)), CCV_INTER_CUBIC);
	else
		ccv_shift(a, (ccv_matrix_t**)b, CCV_32F, 0, 0); // converting to 32f
}

void ccv_convnet_free(ccv_convnet_t* convnet)
{
	ccv_convnet_compact(convnet);
	int i;
	for (i = 0; i < convnet->count; i++)
		if (convnet->layers[i].w)
			ccfree(convnet->layers[i].w);
	if (convnet->mean_activity)
		ccv_matrix_free(convnet->mean_activity);
	ccfree(convnet);
}

#endif