#include "ccv_nnc.h"
#include "ccv_nnc_easy.h"
#include "ccv_nnc_internal.h"
#include "ccv_internal.h"
#ifdef HAVE_CUDA
#include "gpu/ccv_nnc_compat.h"
#endif
#include "_ccv_nnc_symbolic_graph.h"

/**
 * Level-4 API
 */

typedef struct {
	int f_wrt; // Check if both f_symbols and wrt_symbols flow through this node.
	ccv_array_t* outgoings; // backward traverse nodes.
	uint64_t* input_bitmasks;
	int input_bitmask_size;
	uint64_t* output_bitmasks;
	int output_bitmask_size;
} ccv_nnc_graph_backward_info_t;

typedef struct {
	int input_size;
	int* inputs;
	int output;
	ccv_array_t* outgoings;
	float value;
	ccv_nnc_graph_exec_symbol_t symbol;
} ccv_nnc_sum_graph_exec_symbol_t;

typedef struct {
	int input_size;
	int output_size;
	int* inputs;
	int* outputs;
	ccv_array_t* outgoings;
	ccv_nnc_cmd_t cmd;
	ccv_nnc_graph_exec_symbol_t symbol;
} ccv_nnc_autograd_graph_exec_symbol_t;

typedef struct {
	int d; // The pointer to the forward level object.
	int alias_ref; // The alias ref to itself (autograd_tensor_symbols array).
	int flags; // Flags for this symbol.
	ccv_nnc_tensor_symbol_t symbol;
} ccv_nnc_autograd_tensor_symbol_t;

typedef struct {
	int d; // The tensor symbol ref.
	int x; // The exec symbol ref.
	ccv_array_t* exec_registry; // Additional exec symbol refs, similar to x, only useful for aliasing.
	ccv_array_t* alias_registry; // int point to all the alias (if this is not an alias). The alias is the object in autograd_tensor_symbols, you need another level of indirection to get the actual forward level alias.
} ccv_nnc_tensor_ref_t;

typedef struct {
	int c; // The start non-accumulated version.
	ccv_array_t* ref_version; // tensor ref point to the reverse tensor symbol.
} ccv_nnc_autograd_tensor_version_t;

typedef struct {
	int d;
	int alias_ref;
} ccv_nnc_sum_variable_t;

// This method tries to figure out if a set of aliases can cover the whole tensor dim.
// This is not a precise implementation though. The requirement is to answer this question
// with a given memory constraint, therefore, only allow up to 65536 different tensor locations.
// If you have more than that, it will assume that it doesn't have fully assigned aliases,
// and will return 0.

// Return 1 if inserted successfully.
static inline int _ccv_nnc_try_mix(int* const md, const int ins, const int c)
{
	if (!c)
	{
		md[0] = ins;
		return 1;
	}
	int ll = 0, uu = c - 1;
	int mm;
	do {
		mm = ll + ((uu - ll) >> 1);
		if (ins == md[mm])
			return 0;
		else if (ins < md[mm])
			uu = mm - 1;
		else if (ins > md[mm])
			ll = mm + 1;
	} while (ll <= uu);
	if (ll < c)
		memmove(md + ll + 1, md + ll, sizeof(int) * (c - ll));
	md[ll] = ins;
	return 1;
}

static inline int _ccv_nnc_mix_idx(const int* const md, const int ins, const int c)
{
	if (c <= 1)
		return 0;
	int ll = 0, uu = c - 1;
	int mm;
	do {
		mm = ll + ((uu - ll) >> 1);
		if (ins == md[mm])
			return mm;
		else if (ins < md[mm])
			uu = mm - 1;
		else if (ins > md[mm])
			ll = mm + 1;
	} while (ll <= uu);
	assert(0 && "Shouldn't reach here");
	return -1;
}

static inline void _ccv_nnc_try_set_pix_0(const int* const ofs, const int* const dim, const int* const tensor_dim, int* const* const scmd, const int* const cube_dim, const int* const cube_step, uint8_t* const cube, int offset)
{
	const int s = (ofs[0] == 0) ? 0 : _ccv_nnc_mix_idx(scmd[0], ofs[0], cube_dim[0]) + 1;
	const int d = ((ofs[0] + dim[0] == tensor_dim[0]) ? cube_dim[0] : _ccv_nnc_mix_idx(scmd[0], ofs[0] + ccv_max(1, dim[0]), cube_dim[0])) + 1;
	assert(s >= 0 && d > s);
	int i;
	for (i = s; i < d; i++)
		// Fill this pix. I can make this faster by loop through full ones (divided by 8), but too lazy.
		cube[(offset + i) >> 3] |= (1 << ((offset + i) & 0x7));
}

static inline void _ccv_nnc_try_set_pix_1(const int* const ofs, const int* const dim, const int* const tensor_dim, int* const* const scmd, const int* const cube_dim, const int* const cube_step, uint8_t* const cube, int offset)
{
	const int s0 = (ofs[0] == 0) ? 0 : _ccv_nnc_mix_idx(scmd[0], ofs[0], cube_dim[0]) + 1;
	const int d0 = ((ofs[0] + dim[0] == tensor_dim[0]) ? cube_dim[0] : _ccv_nnc_mix_idx(scmd[0], ofs[0] + ccv_max(1, dim[0]), cube_dim[0])) + 1;
	assert(s0 >= 0 && d0 > s0);
	const int s1 = (ofs[1] == 0) ? 0 : _ccv_nnc_mix_idx(scmd[1], ofs[1], cube_dim[1]) + 1;
	const int d1 = ((ofs[1] + dim[1] == tensor_dim[1]) ? cube_dim[1] : _ccv_nnc_mix_idx(scmd[1], ofs[1] + ccv_max(1, dim[1]), cube_dim[1])) + 1;
	assert(s1 >= 0 && d1 > s1);
	int i, j;
	const int step1 = cube_step[1];
	if (step1 == d0 - s0)
	{
		// Faster one, we can simply loop through.
		for (i = s1 * step1; i < d1 * step1; i++)
			cube[(offset + i) >> 3] |= (1 << ((offset + i) & 0x7));
	} else {
		offset += s1 * step1;
		// There are gaps, slow one.
		for (i = s1; i < d1; i++, offset += step1)
			for (j = s0; j < d0; j++)
				cube[(offset + j) >> 3] |= (1 << ((offset + j) & 0x7));
	}
}

static inline void _ccv_nnc_try_set_pix(const int* const ofs, const int* const dim, const int* const tensor_dim, int* const* const scmd, const int* const cube_dim, const int* const cube_step, uint8_t* const cube, int offset, const int dim_idx)
{
	switch (dim_idx)
	{
		case 1:
			_ccv_nnc_try_set_pix_1(ofs, dim, tensor_dim, scmd, cube_dim, cube_step, cube, offset);
			return;
		case 0:
			_ccv_nnc_try_set_pix_0(ofs, dim, tensor_dim, scmd, cube_dim, cube_step, cube, offset);
			return;
	}
	int i;
	const int s = (ofs[dim_idx] == 0) ? 0 : _ccv_nnc_mix_idx(scmd[dim_idx], ofs[dim_idx], cube_dim[dim_idx]) + 1;
	const int d = ((ofs[dim_idx] + dim[dim_idx] == tensor_dim[dim_idx]) ? cube_dim[dim_idx] : _ccv_nnc_mix_idx(scmd[dim_idx], ofs[dim_idx] + ccv_max(1, dim[dim_idx]), cube_dim[dim_idx])) + 1;
	assert(s >= 0 && d > s);
	for (i = s; i < d; i++)
		_ccv_nnc_try_set_pix(ofs, dim, tensor_dim, scmd, cube_dim, cube_step, cube, offset + i * cube_step[dim_idx], dim_idx - 1);
}

static int _ccv_nnc_tensor_ref_fully_assigned_with_aliases(const ccv_nnc_tensor_ref_t* const tensor_ref, const ccv_array_t* const autograd_tensor_symbols, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info)
{
	// Only work with tensor_ref of aliases.
	assert(tensor_ref->alias_registry);
	const ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d);
	assert(tensor_symbol_info[autograd->d].alias_ref == 0);
	const int* tensor_dim = tensor_symbol_info[autograd->d].info.dim;
	int i, j;
	for (i = 0; i < tensor_ref->alias_registry->rnum; i++)
	{
		const int d = *(int*)ccv_array_get(tensor_ref->alias_registry, i);
		assert(d < autograd_tensor_symbols->rnum);
		const ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, d);
		assert(tensor_symbol_info[autograd->d].alias_ref);
		const int* inc = tensor_symbol_info[autograd->d].inc;
		if (memcmp(inc, tensor_dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) != 0)
			return 0;
	}
	/* We need a solid cube (potentially hyper dimensional) to compute if there are overlaps.
	 * To make this cube as small as possible, we need to map the actual tensor dimension
	 * (therefore, we don't actually allocate the whole tensor to compute overlaps) to a smaller
	 * cube given the ofs and dim size of its aliases.
	 *
	 * The following code generated the dimension mapping (using scratch space) with binary search + insertion
	 * and then we fill the cube with a given tensor alias's dimensional information (ofs, dim).
	 * Afterwards, we simply need to check if the cube is totally filled up to know if this tensor
	 * is fully assigned with its aliases (if that is the case, we can skip zeroing for this tensor).
	 *
	 * There are several restrictions though to make this faster: 1). I cannot handle any cube that all side
	 * lengths combined larger than 1023 (scm only have 1024 scratch space). 2). I cannot handle any cube
	 * that the total volume is larger than 2048 * 8 (I only allocate 2K on stack for this).
	 * */
	int scm[1024]; // Having 1024 int scratch space for mapping dimensions. (Or sparse coordinate mapping).
	int cube_dim[CCV_NNC_MAX_DIM_ALLOC] = {}; // Mapping dimension size.
	int cube_size = 1;
	int* scmptr = scm;
	for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && tensor_dim[i]; i++)
	{
		int head = 0, tail = 0; // Note that we touched both the head and tail (otherwise this dimension is not fully covered).
		int len = 0;
		for (j = 0; j < tensor_ref->alias_registry->rnum; j++)
		{
			const int d = *(int*)ccv_array_get(tensor_ref->alias_registry, j);
			assert(d < autograd_tensor_symbols->rnum);
			const ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, d);
			assert(tensor_symbol_info[autograd->d].alias_ref);
			const int* ofs = tensor_symbol_info[autograd->d].ofs;
			const int* dim = tensor_symbol_info[autograd->d].info.dim;
			head = head || (ofs[i] == 0);
			tail = tail || (ofs[i] + ccv_max(1, dim[i]) == tensor_dim[i]);
			if (ofs[i] != 0)
				len += _ccv_nnc_try_mix(scmptr, ofs[i], len);
			if (scmptr - scm + len >= 1024) // Cannot handle that much, abort.
				return 0;
			if (ofs[i] + ccv_max(1, dim[i]) < tensor_dim[i])
				len += _ccv_nnc_try_mix(scmptr, ofs[i] + ccv_max(1, dim[i]), len);
			if (scmptr - scm + len >= 1024) // Cannot handle that much, abort.
				return 0;
		}
		if (!head || !tail)
			return 0;
		cube_size *= (len + 1);
		cube_dim[i] = len;
		scmptr += len; // Moving to next level.
	}
	// The cube map is too large, cannot do the computation, assume it is not fully assigned.
	if (cube_size > 2048 * 8)
		return 0;
	// binary map to see if it fills up.
	uint8_t* cube = (uint8_t*)alloca(sizeof(uint8_t) * ((cube_size + 7) >> 3));
	memset(cube, 0, sizeof(uint8_t) * ((cube_size + 7) >> 3));
	int* scmd[CCV_NNC_MAX_DIM_ALLOC] = {}; // Sparse coordinate map at dimension x.
	int cube_step[CCV_NNC_MAX_DIM_ALLOC] = {};
	for (i = 0; i < CCV_NNC_MAX_DIM_ALLOC && tensor_dim[i]; i++)
	{
		cube_step[i] = (i > 0) ? cube_step[i - 1] * (cube_dim[i - 1] + 1) : 1;
		scmd[i] = (i > 0) ? scmd[i - 1] + cube_dim[i - 1] : scm;
	}
	const int max_dim = i;
	for (i = 0; i < tensor_ref->alias_registry->rnum; i++)
	{
		const int d = *(int*)ccv_array_get(tensor_ref->alias_registry, i);
		assert(d < autograd_tensor_symbols->rnum);
		const ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, d);
		assert(tensor_symbol_info[autograd->d].alias_ref);
		const int* ofs = tensor_symbol_info[autograd->d].ofs;
		const int* dim = tensor_symbol_info[autograd->d].info.dim;
		_ccv_nnc_try_set_pix(ofs, dim, tensor_dim, scmd, cube_dim, cube_step, cube, 0, max_dim - 1);
	}
	// Compare to see now if the binary map filled up. If it filled up, we know it is fully assigned.
	for (i = 0; i < (cube_size >> 3); i++)
		if (cube[i] < 0xff)
			return 0;
	if ((cube_size & 0x7) > 0)
	{
		// Fetch the rest.
		uint8_t r = 0;
		for (i = 0; i < (cube_size & 0x7); i++)
			r |= (1 << i);
		assert(cube[((cube_size + 7) >> 3) - 1] <= r);
		if (cube[((cube_size + 7) >> 3) - 1] < r)
			return 0;
	}
	return 1;
}

static void _ccv_nnc_graph_sum_autograd_tensor_versions(const int idx, const int d, const int exec_symbol_info_size, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, ccv_nnc_autograd_tensor_version_t* const tensor_ver, ccv_nnc_autograd_graph_exec_symbol_t* const autograd_execs, ccv_array_t* const autograd_tensor_symbols, ccv_array_t* const sum_execs)
{
	int i, j;
	assert(tensor_ver->c < tensor_ver->ref_version->rnum);
	const int input_size = tensor_ver->ref_version->rnum - tensor_ver->c;
	int* inputs = (int*)ccmalloc(sizeof(int) * input_size);
	for (i = tensor_ver->c; i < tensor_ver->ref_version->rnum; i++)
		inputs[i] = ((ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, i))->d;
	ccv_nnc_autograd_tensor_symbol_t tensor_sym = {
		.d = d
	};
	ccv_array_push(autograd_tensor_symbols, &tensor_sym);
	ccv_nnc_sum_graph_exec_symbol_t sum_exec = {
		.input_size = input_size,
		.inputs = inputs,
		.output = autograd_tensor_symbols->rnum - 1
	};
	if (idx >= 0)
	{
		sum_exec.outgoings = ccv_array_new(sizeof(int), 1, 0);
		ccv_array_push(sum_exec.outgoings, &idx);
	}
	ccv_array_push(sum_execs, &sum_exec);
	const int outgoing = exec_symbol_info_size + sum_execs->rnum - 1;
	for (i = tensor_ver->c; i < tensor_ver->ref_version->rnum; i++)
	{
		const ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, i);
		const int x = tensor_ref->x;
		assert(x >= 0); /* Otherwise, this is initialization tensor, which is impossible to be summed up by. */
		if (x < exec_symbol_info_size)
		{
			ccv_nnc_autograd_graph_exec_symbol_t* back_exec = autograd_execs + x;
			if (!back_exec->outgoings)
				back_exec->outgoings = ccv_array_new(sizeof(int), 1, 0);
			ccv_array_replace_int(back_exec->outgoings, idx, outgoing);
		} else {
			// This tensor_ref is generated by the sum operation.
			ccv_nnc_sum_graph_exec_symbol_t* sum_or_set = (ccv_nnc_sum_graph_exec_symbol_t*)ccv_array_get(sum_execs, x - exec_symbol_info_size);
			ccv_array_replace_int(sum_or_set->outgoings, idx, outgoing);
		}
		// If this tensor have associated alias, we need to init it to zeros when it is allocated (we only need to set a flag here)
		// it is handled at compilation phase.
		if (tensor_ref->alias_registry &&
			// Loop over to see if this tensor is fully occupied to avoid extra zero step.
			!_ccv_nnc_tensor_ref_fully_assigned_with_aliases(tensor_ref, autograd_tensor_symbols, tensor_symbol_info))
		{
			ccv_nnc_autograd_tensor_symbol_t* tensor_sym = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d);
			// By having alias_registry, what this symbol represents must not by an alias.
			assert(tensor_sym->alias_ref == 0);
			tensor_sym->flags = CCV_NNC_SYM_TENSOR_INIT_ZEROS;
		}
		if (tensor_ref->exec_registry)
			for (j = 0; j < tensor_ref->exec_registry->rnum; j++)
			{
				const int x = *(int*)ccv_array_get(tensor_ref->exec_registry, j);
				assert(x >= 0);
				// The exec_registry can only be generated by alias registry, therefore, it cannot reference to a sum operation.
				assert(x < exec_symbol_info_size);
				ccv_nnc_autograd_graph_exec_symbol_t* back_exec = autograd_execs + x;
				if (!back_exec->outgoings)
					back_exec->outgoings = ccv_array_new(sizeof(int), 1, 0);
				ccv_array_replace_int(back_exec->outgoings, idx, outgoing);
			}
	}
	const ccv_nnc_tensor_ref_t tensor_ref = {
		.d = autograd_tensor_symbols->rnum - 1,
		.x = outgoing
	};
	ccv_array_push(tensor_ver->ref_version, &tensor_ref);
	/* Move the c pointer up to the latest summed result. */
	tensor_ver->c = tensor_ver->ref_version->rnum - 1;
}

static int _ccv_nnc_tensor_ref_version_involve_alias(const ccv_nnc_tensor_ref_t* const tensor_ref, const ccv_array_t* const autograd_tensor_symbols, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_tensor_symbol_info_t* const alias)
{
	assert(alias->alias_ref > 0);
	// No alias_registry, must conflict (owns the whole band).
	if (!tensor_ref->alias_registry)
		return 1;
	int i, j;
	for (i = 0; i < tensor_ref->alias_registry->rnum; i++)
	{
		const int d = *(int*)ccv_array_get(tensor_ref->alias_registry, i);
		assert(d < autograd_tensor_symbols->rnum);
		ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, d);
		// This must reference to an alias.
		assert(tensor_symbol_info[autograd->d].alias_ref);
		const int* inc = tensor_symbol_info[autograd->d].inc;
		// Only can compare if the inc is the same, otherwise, we can only assume it overlaps.
		if (memcmp(inc, alias->inc, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) != 0)
			return 1;
		const int* ofs = tensor_symbol_info[autograd->d].ofs;
		const int* dim = tensor_symbol_info[autograd->d].info.dim;
		int none = 0;
		for (j = 0; j < CCV_NNC_MAX_DIM_ALLOC && dim[j] && alias->info.dim[j]; j++)
			if (ccv_min(ofs[j] + dim[j], alias->ofs[j] + alias->info.dim[j]) <= ccv_max(ofs[j], alias->ofs[j]))
			{
				none = 1;
				break;
			}
		// If it overlaps with this alias, nope.
		if (!none)
			return 1;
	}
	// All aliases referenced by this ref_version doesn't overlap with the provided one, thus, there is no conflict at all.
	return 0;
}

static int _ccv_nnc_tensor_ref_version_find_alias(const ccv_nnc_tensor_ref_t* const tensor_ref, const ccv_array_t* const autograd_tensor_symbols, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_tensor_symbol_info_t* const alias)
{
	assert(alias->alias_ref > 0);
	// No alias_registry, thus, cannot find the exact matched alias.
	if (!tensor_ref->alias_registry)
		return -1;
	int i;
	for (i = 0; i < tensor_ref->alias_registry->rnum; i++)
	{
		const int d = *(int*)ccv_array_get(tensor_ref->alias_registry, i);
		assert(d < autograd_tensor_symbols->rnum);
		ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, d);
		// This must reference to an alias.
		assert(tensor_symbol_info[autograd->d].alias_ref);
		const int* inc = tensor_symbol_info[autograd->d].inc;
		const int* ofs = tensor_symbol_info[autograd->d].ofs;
		const int* dim = tensor_symbol_info[autograd->d].info.dim;
		// If everything matches, this is the required alias.
		if (memcmp(inc, alias->inc, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0 &&
			memcmp(ofs, alias->ofs, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0 &&
			memcmp(dim, alias->info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) == 0)
			return d;
	}
	return -1;
}

static int _ccv_nnc_tensor_ref_version_has_this_alias_exclusively(const ccv_nnc_tensor_ref_t* const tensor_ref, const ccv_array_t* const autograd_tensor_symbols, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_tensor_symbol_info_t* const alias)
{
	assert(alias->alias_ref > 0);
	// No alias_registry, thus, cannot find the exact matched alias.
	if (!tensor_ref->alias_registry)
		return 0;
	int i;
	for (i = 0; i < tensor_ref->alias_registry->rnum; i++)
	{
		const int d = *(int*)ccv_array_get(tensor_ref->alias_registry, i);
		assert(d < autograd_tensor_symbols->rnum);
		ccv_nnc_autograd_tensor_symbol_t* autograd = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, d);
		// This must reference to an alias.
		assert(tensor_symbol_info[autograd->d].alias_ref);
		const int* inc = tensor_symbol_info[autograd->d].inc;
		const int* ofs = tensor_symbol_info[autograd->d].ofs;
		const int* dim = tensor_symbol_info[autograd->d].info.dim;
		if (memcmp(inc, alias->inc, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) != 0 ||
			memcmp(ofs, alias->ofs, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) != 0 ||
			memcmp(dim, alias->info.dim, sizeof(int) * CCV_NNC_MAX_DIM_ALLOC) != 0)
			return 0;
	}
	// If everything matches for every alias in registry, we can use any of the alias directly.
	return 1;
}

static int _ccv_nnc_graph_sum_autograd_tensor_versions_alias(const int idx, const int d, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const int exec_symbol_info_size, const ccv_nnc_tensor_symbol_info_t* const alias, ccv_nnc_autograd_tensor_version_t* const tensor_ver, ccv_nnc_autograd_graph_exec_symbol_t* const autograd_execs, ccv_array_t* const autograd_tensor_symbols, ccv_array_t* const sum_execs)
{
	assert(tensor_ver->c < tensor_ver->ref_version->rnum);
	int i, j = 0;
	struct {
		int k;
		int i;
	} kd[tensor_ver->ref_version->rnum - tensor_ver->c];
	for (i = tensor_ver->c; i < tensor_ver->ref_version->rnum; i++)
	{
		ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, i);
		const int k = _ccv_nnc_tensor_ref_version_find_alias(tensor_ref, autograd_tensor_symbols, tensor_symbol_info, alias);
		if (k >= 0)
			kd[j++] = (typeof(kd[0])){
				.k = k, .i = i
			};
		else if (_ccv_nnc_tensor_ref_version_involve_alias(tensor_ref, autograd_tensor_symbols, tensor_symbol_info, alias))
			kd[j++] = (typeof(kd[0])) {
				.k = -1, .i = i // It has dependency to the original tensor (non-alias) now, label this with highest bit.
			};
	}
	// Can only find one. This is the easy case, we can simply return that symbol (or its alias).
	if (j == 1)
	{
		if (kd[0].k >= 0)
			return kd[0].k; // Only can find one alias, that is the one.
		// Otherwise, need to create a new alias.
		ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, kd[0].i);
		ccv_nnc_autograd_tensor_symbol_t* ref = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d);
		// Since we create new alias, we need to set the referenced one to be allocated with 0s.
		if (ref->alias_ref) // If this is an alias, it has to be zero initialized.
		{
			ref = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, ref->alias_ref - 1);
			assert(ref->alias_ref == 0); // This is original.
			ref->flags = CCV_NNC_SYM_TENSOR_INIT_ZEROS;
		} else if (tensor_ref->alias_registry && // Otherwise, to see if this symbol is fully occupied.
				// Loop over to see if this tensor is fully occupied to avoid extra zero step.
				!_ccv_nnc_tensor_ref_fully_assigned_with_aliases(tensor_ref, autograd_tensor_symbols, tensor_symbol_info)) {
			ref->flags = CCV_NNC_SYM_TENSOR_INIT_ZEROS;
		}
		ccv_nnc_autograd_tensor_symbol_t tensor_sym = {
			.d = d,
			.alias_ref = tensor_ref->d + 1
		};
		ccv_array_push(autograd_tensor_symbols, &tensor_sym);
		const int ad = autograd_tensor_symbols->rnum - 1;
		if (tensor_ref->alias_registry) // Only push this when it has an alias registry (otherwise it already conflict with everyone).
			ccv_array_push(tensor_ref->alias_registry, &ad);
		// The newly inserted tensor symbol.
		return ad;
	}
	// Otherwise, we need to create the sum operation out of these.
	const int input_size = j;
	int has_this_alias_exclusively = 1;
	int* inputs = input_size > 0 ? (int*)ccmalloc(sizeof(int) * input_size) : 0;
	for (i = 0; i < input_size; i++)
	{
		ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, kd[i].i);
		// Can take a fast path if every ref involved has the same alias, our sum operation can be faster (using alias directly).
		if (has_this_alias_exclusively && kd[i].k >= 0 && _ccv_nnc_tensor_ref_version_has_this_alias_exclusively(tensor_ref, autograd_tensor_symbols, tensor_symbol_info, alias))
			inputs[i] = *(int*)ccv_array_get(tensor_ref->alias_registry, 0); // Assigning the alias.
		else {
			if (has_this_alias_exclusively)
			{
				has_this_alias_exclusively = 0;
				for (j = 0; j < i; j++)
					inputs[j] = ((ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, kd[j].i))->d;
			}
			inputs[i] = tensor_ref->d;
		}
	}
	ccv_nnc_autograd_tensor_symbol_t tensor_sym = {
		.d = alias->alias_ref - 1
	};
	ccv_array_push(autograd_tensor_symbols, &tensor_sym);
	const int tensor_ref_d = autograd_tensor_symbols->rnum - 1;
	tensor_sym.d = d;
	tensor_sym.alias_ref = tensor_ref_d + 1;
	ccv_array_push(autograd_tensor_symbols, &tensor_sym);
	const int ad = autograd_tensor_symbols->rnum - 1;
	ccv_nnc_sum_graph_exec_symbol_t sum_exec = {
		.input_size = input_size,
		.inputs = inputs,
		.output = has_this_alias_exclusively ? ad : tensor_ref_d /* If has this alias exclusively, the output should be alias as well. Otherwise the output is the real tensor. */
	};
	if (idx >= 0)
	{
		sum_exec.outgoings = ccv_array_new(sizeof(int), 1, 0);
		ccv_array_push(sum_exec.outgoings, &idx);
	}
	ccv_array_push(sum_execs, &sum_exec);
	const int outgoing = exec_symbol_info_size + sum_execs->rnum - 1;
	int no_alias_registry = 0;
	for (i = 0; i < input_size; i++)
	{
		ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, kd[i].i);
		if (!has_this_alias_exclusively)
		{
			// If the sum operation is not operating on one alias. I need to zero this tensor out when it is first
			// allocated (see discussions around the flags I use).
			ccv_nnc_autograd_tensor_symbol_t* tensor_sym = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d);
			if (tensor_sym->alias_ref)
			{
				// Find the original tensor_sym and set its flags (I prefer to set flags on its original).
				ccv_nnc_autograd_tensor_symbol_t* ref = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_sym->alias_ref - 1);
				assert(ref->alias_ref == 0); // This is original.
				ref->flags = CCV_NNC_SYM_TENSOR_INIT_ZEROS;
			} else if (tensor_ref->alias_registry && // Otherwise, to see if this symbol is fully occupied.
					// Loop over to see if this tensor is fully occupied to avoid extra zero step.
					!_ccv_nnc_tensor_ref_fully_assigned_with_aliases(tensor_ref, autograd_tensor_symbols, tensor_symbol_info)) {
				tensor_sym->flags = CCV_NNC_SYM_TENSOR_INIT_ZEROS;
			}
		}
		// Check to see if any of these tensors doesn't have alias.
		no_alias_registry |= (!tensor_ref->alias_registry);
		const int x = tensor_ref->x;
		assert(x >= 0); /* Otherwise, this is initialization tensor, which is impossible to be summed up by. */
		if (x < exec_symbol_info_size)
		{
			ccv_nnc_autograd_graph_exec_symbol_t* back_exec = autograd_execs + x;
			if (!back_exec->outgoings)
				back_exec->outgoings = ccv_array_new(sizeof(int), 1, 0);
			ccv_array_push(back_exec->outgoings, &outgoing);
		} else {
			ccv_nnc_sum_graph_exec_symbol_t* sum_or_set = (ccv_nnc_sum_graph_exec_symbol_t*)ccv_array_get(sum_execs, x - exec_symbol_info_size);
			ccv_array_push(sum_or_set->outgoings, &outgoing);
		}
		if (tensor_ref->exec_registry)
			for (j = 0; j < tensor_ref->exec_registry->rnum; j++)
			{
				const int x = *(int*)ccv_array_get(tensor_ref->exec_registry, j);
				assert(x >= 0); /* Otherwise, this is initialization tensor, which is impossible to be summed up by. */
				assert(x < exec_symbol_info_size); // exec_registry is only used by alias_registry, it simply cannot reference to a sum operation.
				ccv_nnc_autograd_graph_exec_symbol_t* back_exec = autograd_execs + x;
				if (!back_exec->outgoings)
					back_exec->outgoings = ccv_array_new(sizeof(int), 1, 0);
				ccv_array_push(back_exec->outgoings, &outgoing);
			}
	}
	const ccv_nnc_tensor_ref_t tensor_ref = {
		.d = tensor_ref_d,
		.x = outgoing,
		.exec_registry = 0, // I don't need to take execution dependencies because this tensor is generated by sum, therefore, we already take that dependency.
		.alias_registry = !no_alias_registry || has_this_alias_exclusively ? ccv_array_new(sizeof(int), 1, 0) : 0
	};
	// If there is no alias registry, then we take the whole tensor ref as one.
	if (!no_alias_registry || has_this_alias_exclusively)
	{
		// If this tensor ref contains multiple different types of alias, have to add them together (otherwise
		// the computation for if there is an empty slot in this tensor ref is not correct without all the
		// occupancy availability information).
		if (!has_this_alias_exclusively)
			for (i = 0; i < input_size; i++)
			{
				ccv_nnc_tensor_ref_t* ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, kd[i].i);
				assert(ref->alias_registry);
				// It may get duplicates. But whatever, won't matter the computation.
				for (j = 0; j < ref->alias_registry->rnum; j++)
					ccv_array_push(tensor_ref.alias_registry, ccv_array_get(ref->alias_registry, j));
			}
		ccv_array_push(tensor_ref.alias_registry, &ad);
	}
	assert(input_size <= tensor_ver->ref_version->rnum - tensor_ver->c);
	ccv_nnc_tensor_ref_t x;
	for (i = 0; i < input_size; i++)
		// If the current one (i + tensor_ver->c) is smaller than the one referenced to, exchange.
		if (kd[i].i > i + tensor_ver->c)
			CCV_SWAP(*(ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, i + tensor_ver->c), *(ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, kd[i].i), x);
	ccv_array_push(tensor_ver->ref_version, &tensor_ref);
	// We've consumed input_size tensor refs, now move c up to the pointer of non-consumed tensors.
	tensor_ver->c += input_size;
	return ad;
}

typedef struct {
	int exec_symbol_info_size; // Number of graph exec symbols before adding any new symbols related to automatic differentiation.
	int tensor_symbol_info_size; // Number of tensor symbols before adding anything new.
	ccv_nnc_graph_backward_info_t* backward_info; // Corresponding to forward graph exec symbol info, it is exactly in reverse.
	ccv_nnc_graph_visit_t* backward_visit; // The visitor structure (top sorted index) when doing reverse traversal.
	ccv_nnc_autograd_graph_exec_symbol_t* autograd_execs; // The graph exec symbols we need for automatic differentiation. This is a 1:1 mapping for forward graph exec symbols, however, unlike backward_info, its outgoings may be more complex (may contain outgoing flows to sum nodes).
	ccv_nnc_autograd_tensor_version_t* autograd_tensor_versions; // Corresponding to forward tensor symbols, each may contain multiple versions (due to multi-write).
	ccv_array_t* autograd_tensor_symbols; // The tensor symbols we need for automatic differentiation (it may not be 1:1 mapping).
	ccv_array_t* sum_execs; // The sum nodes, because in reverse mode, a tensor could have multiple versions, we need to sum them up before use.
} ccv_nnc_symbolic_graph_backward_prep_t;

static ccv_nnc_symbolic_graph_backward_prep_t _ccv_nnc_symbolic_graph_backward_prep(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_graph_visit_t* const forward_visit, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_t* const f_symbols, const int f_symbol_size, const ccv_nnc_tensor_symbol_t* const wrt_symbols, const int wrt_symbol_size)
{
	const int exec_symbol_info_size = graph->exec_symbol_info->rnum;
	const int tensor_symbol_info_size = graph->tensor_symbol_info->rnum;
	int i, j;
	// Now, for each one of these, find a reverse graph.
	ccv_nnc_graph_backward_info_t* backward_info = (ccv_nnc_graph_backward_info_t*)cccalloc(exec_symbol_info_size, sizeof(ccv_nnc_graph_backward_info_t));
	ccv_nnc_graph_visit_for(forward_visit, exec_symbol_info, node, idx) {
		assert(ccv_nnc_cmd_is_forward(node->cmd) || node->cmd.cmd == CCV_NNC_NOOP);
		if (node->outgoings)
			for (i = 0; i < node->outgoings->rnum; i++)
			{
				int d = *(int*)ccv_array_get(node->outgoings, i);
				if (backward_info[d].outgoings == 0)
					backward_info[d].outgoings = ccv_array_new(sizeof(int32_t), 1, 0);
				ccv_array_push(backward_info[d].outgoings, &idx);
			}
	} ccv_nnc_graph_visit_endfor
	// Find the f_symbols, and tag its flows.
	ccv_nnc_graph_visit_t* backward_visit = ccv_nnc_graph_visit_new(graph, backward_info, exec_symbol_info_size, destinations, destination_size, sources, source_size);
	ccv_nnc_graph_visit_for(backward_visit, backward_info, node, idx) {
		int f = node->f_wrt & 0x1;
		for (i = 0; i < exec_symbol_info[idx].output_size && !f; i++)
		{
			int d = exec_symbol_info[idx].outputs[i];
			while (tensor_symbol_info[d].alias_ref)
				d = tensor_symbol_info[d].alias_ref - 1;
			for (j = 0; j < f_symbol_size && !f; j++)
				if (d == f_symbols[j].d)
					f = 1;
		}
		if (f)
		{
			node->f_wrt |= f;
			if (node->outgoings)
				for (i = 0; i < node->outgoings->rnum; i++)
				{
					int d = *(int*)ccv_array_get(node->outgoings, i);
					backward_info[d].f_wrt |= f;
				}
		}
	} ccv_nnc_graph_visit_endfor
	// Find the wrt_symbols, and tag its flows.
	ccv_nnc_graph_visit_for(forward_visit, exec_symbol_info, node, idx) {
		int wrt = backward_info[idx].f_wrt & 0x2;
		for (i = 0; i < node->input_size && !wrt; i++)
		{
			int d = node->inputs[i];
			while (tensor_symbol_info[d].alias_ref)
				d = tensor_symbol_info[d].alias_ref - 1;
			for (j = 0; j < wrt_symbol_size && !wrt; j++)
				if (d == wrt_symbols[j].d)
					wrt = 0x2;
		}
		if (wrt)
		{
			backward_info[idx].f_wrt |= wrt;
			if (node->outgoings)
				for (i = 0; i < node->outgoings->rnum; i++)
				{
					int d = *(int*)ccv_array_get(node->outgoings, i);
					backward_info[d].f_wrt |= wrt;
				}
		}
	} ccv_nnc_graph_visit_endfor
	// Also mark only the output bits that we use.
	for (i = 0; i < exec_symbol_info_size; i++)
	{
		backward_info[i].input_bitmask_size = ((exec_symbol_info[i].output_size * 2 + exec_symbol_info[i].input_size + 63) >> 6);
		backward_info[i].output_bitmask_size = ((exec_symbol_info[i].input_size + 63) >> 6);
		// Allocate input / output bitmasks
		if (backward_info[i].input_bitmask_size + backward_info[i].output_bitmask_size > 0)
		{
			backward_info[i].input_bitmasks = (uint64_t*)cccalloc(backward_info[i].input_bitmask_size + backward_info[i].output_bitmask_size, sizeof(uint64_t));
			if (backward_info[i].output_bitmask_size)
				backward_info[i].output_bitmasks = backward_info[i].input_bitmasks + backward_info[i].input_bitmask_size;
		}
	}
	enum {
		WRT_SYMBOL_USE = 1,
		F_SYMBOL_USE = 2
	};
	uint8_t* used = (uint8_t*)cccalloc(tensor_symbol_info_size, sizeof(uint8_t));
	// First, all f_symbols and wrt_symbols are used.
	for (i = 0; i < f_symbol_size; i++)
		used[tensor_symbol_info[f_symbols[i].d].alias_ref ? tensor_symbol_info[f_symbols[i].d].alias_ref - 1 : f_symbols[i].d] |= F_SYMBOL_USE;
	for (i = 0; i < wrt_symbol_size; i++)
		used[tensor_symbol_info[wrt_symbols[i].d].alias_ref ? tensor_symbol_info[wrt_symbols[i].d].alias_ref - 1 : wrt_symbols[i].d] |= WRT_SYMBOL_USE;
	ccv_nnc_graph_visit_for(forward_visit, exec_symbol_info, _, idx) {
		ccv_nnc_graph_backward_info_t* node = backward_info + idx;
		/* Only interested in the ones on the f / wrt flow */
		if (node->f_wrt == 0x3)
		{
			const ccv_nnc_graph_exec_symbol_info_t* forw_exec = exec_symbol_info + idx;
			ccv_nnc_cmd_t cmd = forw_exec->cmd;
			if (cmd.cmd != CCV_NNC_NOOP)
				cmd.cmd += 1; /* Backward command is the one after forward command. */
			assert(ccv_nnc_cmd_is_backward(cmd) || cmd.cmd == CCV_NNC_NOOP);
			for (i = 0; i < forw_exec->output_size * 2 + forw_exec->input_size; i++)
				node->input_bitmasks[i >> 6] |= ((uint64_t)1 << i);
			for (i = 0; i < forw_exec->input_size; i++)
				node->output_bitmasks[i >> 6] |= ((uint64_t)1 << i);
			int maybe_noop = 1;
			for (i = 0; i < forw_exec->input_size; i++)
				/* See if it is used. */
				if (used[tensor_symbol_info[forw_exec->inputs[i]].alias_ref ? tensor_symbol_info[forw_exec->inputs[i]].alias_ref - 1 : forw_exec->inputs[i]] & WRT_SYMBOL_USE)
				{
					maybe_noop = 0;
					break;
				}
			if (maybe_noop)
			{
				node->output_bitmasks = 0;
				node->output_bitmask_size = 0;
			/* If this is ok, try something else. Otherwise, this is it, no trick. */
			} else if (ccv_nnc_cmd_bitmask(cmd, node->input_bitmasks, node->input_bitmask_size, node->output_bitmasks, node->output_bitmask_size)) {
				int flag; /* Only continue if it changed */
				do {
					flag = 0;
					/* Check if the output first */
					for (i = 0; i < forw_exec->input_size; i++)
						/* Only try to eliminate the one that is not used. */
						if (!(used[tensor_symbol_info[forw_exec->inputs[i]].alias_ref ? tensor_symbol_info[forw_exec->inputs[i]].alias_ref - 1 : forw_exec->inputs[i]] & WRT_SYMBOL_USE) &&
							(node->output_bitmasks[i >> 6] & ((uint64_t)1 << i)))
						{
							node->output_bitmasks[i >> 6] &= ~((uint64_t)1 << i);
							/* If it worked, mark it as flagged. */
							if (ccv_nnc_cmd_bitmask(cmd, node->input_bitmasks, node->input_bitmask_size, node->output_bitmasks, node->output_bitmask_size))
								flag = 1;
							else /* Refit this with the bit back again. */
								node->output_bitmasks[i >> 6] |= ((uint64_t)1 << i);
						}
					for (i = 0; i < forw_exec->output_size * 2 + forw_exec->input_size; i++)
						if ((i >= forw_exec->output_size ||
							 !(used[tensor_symbol_info[forw_exec->outputs[i]].alias_ref ? tensor_symbol_info[forw_exec->outputs[i]].alias_ref - 1 : forw_exec->outputs[i]] & F_SYMBOL_USE)) &&
							node->input_bitmasks[i >> 6] & ((uint64_t)1 << i))
						{ /* Try to eliminate one of the input. */
							node->input_bitmasks[i >> 6] &= ~((uint64_t)1 << i);
							/* If it worked, mark it as flagged. */
							if (ccv_nnc_cmd_bitmask(cmd, node->input_bitmasks, node->input_bitmask_size, node->output_bitmasks, node->output_bitmask_size))
								flag = 1;
							else /* Refit this with the bit back again. */
								node->input_bitmasks[i >> 6] |= ((uint64_t)1 << i);
						}
				} while (flag);
				for (i = 0; i < forw_exec->output_size; i++)
					if (node->input_bitmasks[i >> 6] & ((uint64_t)1 << i))
						/* Mark it as used. */
						used[tensor_symbol_info[forw_exec->outputs[i]].alias_ref ? tensor_symbol_info[forw_exec->outputs[i]].alias_ref - 1 : forw_exec->outputs[i]] |= WRT_SYMBOL_USE;
			}
		}
	} ccv_nnc_graph_visit_endfor
	ccfree(used);
	// Now, only the flow from f_symbols back to wrt_symbols are interested to us.
	// Visit the graph in reverse order, build the AD nodes.
	ccv_nnc_autograd_graph_exec_symbol_t* const autograd_execs = (ccv_nnc_autograd_graph_exec_symbol_t*)cccalloc(exec_symbol_info_size, sizeof(ccv_nnc_autograd_graph_exec_symbol_t));
	for (i = 0; i < exec_symbol_info_size; i++)
		if (backward_info[i].f_wrt == 0x3 && backward_info[i].outgoings)
		{
			// Copy over the outgoing bits.
			autograd_execs[i].outgoings = ccv_array_new(sizeof(int), backward_info[i].outgoings->rnum, 0);
			for (j = 0; j < backward_info[i].outgoings->rnum; j++)
			{
				const int d = *(int*)ccv_array_get(backward_info[i].outgoings, j);
				// Only push the outgoing node if it is in the f_wrt path.
				if (backward_info[d].f_wrt == 0x3)
					ccv_array_push(autograd_execs[i].outgoings, &d);
			}
		}
	ccv_nnc_autograd_tensor_version_t* const autograd_tensor_versions = (ccv_nnc_autograd_tensor_version_t*)cccalloc(tensor_symbol_info_size, sizeof(ccv_nnc_autograd_tensor_version_t));
	ccv_array_t* autograd_tensor_symbols = ccv_array_new(sizeof(ccv_nnc_autograd_tensor_symbol_t), tensor_symbol_info_size, 0);
	ccv_array_t* sum_execs = ccv_array_new(sizeof(ccv_nnc_sum_graph_exec_symbol_t), 0, 0);
	ccv_nnc_graph_visit_for(backward_visit, backward_info, node, idx) {
		/* This is required by both f flow and wrt flow, therefore, an interest to us */
		if (node->f_wrt == 0x3)
		{
			const ccv_nnc_graph_exec_symbol_info_t* forw_exec = exec_symbol_info + idx;
			const ccv_nnc_graph_backward_info_t* back_info = backward_info + idx;
			ccv_nnc_autograd_graph_exec_symbol_t* back_exec = autograd_execs + idx;
			back_exec->cmd = forw_exec->cmd;
			if (back_exec->cmd.cmd != CCV_NNC_NOOP)
				back_exec->cmd.cmd += 1; /* Backward command is the one after forward command. */
			assert(ccv_nnc_cmd_is_backward(back_exec->cmd) || back_exec->cmd.cmd == CCV_NNC_NOOP);
			if (!back_info->output_bitmasks) /* This has no output, can be a noop. */
				back_exec->cmd.cmd = CCV_NNC_NOOP;
			else {
				back_exec->output_size = forw_exec->input_size;
				back_exec->input_size = forw_exec->output_size;
				back_exec->inputs = ccmalloc(sizeof(int) * (back_exec->input_size + back_exec->output_size));
				back_exec->outputs = back_exec->inputs + back_exec->input_size;
				/* Need to compute input before we compute output */
				for (i = 0; i < forw_exec->output_size; i++)
				{
					/* If we can skip this input, do that. */
					if (!(back_info->input_bitmasks[i >> 6] & ((uint64_t)1 << i)))
						continue;
					const int d = forw_exec->outputs[i];
					const int alias_ref = tensor_symbol_info[d].alias_ref;
					ccv_nnc_autograd_tensor_version_t* tensor_ver = alias_ref ? autograd_tensor_versions + (alias_ref - 1) : autograd_tensor_versions + d;
					/* Initialization tensor, should corresponding to f symbols */
					if (!tensor_ver->ref_version)
					{
						ccv_nnc_autograd_tensor_symbol_t tensor_sym = {};
						if (!alias_ref)
						{
							tensor_sym.d = d;
							ccv_array_push(autograd_tensor_symbols, &tensor_sym);
							const ccv_nnc_tensor_ref_t tensor_ref = {
								.d = autograd_tensor_symbols->rnum - 1,
								.x = idx,
								.alias_registry = 0
							};
							tensor_ver->ref_version = ccv_array_new(sizeof(ccv_nnc_tensor_ref_t), 1, 0);
							ccv_array_push(tensor_ver->ref_version, &tensor_ref);
						} else {
							tensor_sym.d = alias_ref - 1;
							ccv_array_push(autograd_tensor_symbols, &tensor_sym);
							const ccv_nnc_tensor_ref_t tensor_ref = {
								.d = autograd_tensor_symbols->rnum - 1,
								.x = idx,
								.alias_registry = ccv_array_new(sizeof(int), 1, 0)
							};
							tensor_ver->ref_version = ccv_array_new(sizeof(ccv_nnc_tensor_ref_t), 1, 0);
							ccv_array_push(tensor_ver->ref_version, &tensor_ref);
							tensor_sym.d = d; /* set back */
							tensor_sym.alias_ref = tensor_ref.d + 1;
							ccv_array_push(autograd_tensor_symbols, &tensor_sym);
							const int ad = autograd_tensor_symbols->rnum - 1;
							ccv_array_push(tensor_ref.alias_registry, &ad);
						}
					}
					/* The simplest case (most common), it is not an alias. */
					if (!alias_ref)
					{
						/* Even simpler, this only have one reference tensor, thus, pass this as input. */
						if (tensor_ver->c == tensor_ver->ref_version->rnum - 1)
						{
							ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, tensor_ver->c);
							/* There are alias associated with this tensor ref, zero it out when this tensor is allocated. */
							/* This is is required. Consider the case that we have an alias of this tensor used somehwere */
							/* on forward pass, when we compute backward, we have that alias computed first, however, its */
							/* underlying tensor is not zero initialized, and we will end up with garbage values here. */
							if (tensor_ref->alias_registry &&
								/* Loop over to see if this tensor is fully occupied to avoid extra zero step. */
								!_ccv_nnc_tensor_ref_fully_assigned_with_aliases(tensor_ref, autograd_tensor_symbols, tensor_symbol_info))
							{
								ccv_nnc_autograd_tensor_symbol_t* tensor_sym = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d);
								assert(tensor_sym->alias_ref == 0);
								tensor_sym->flags = CCV_NNC_SYM_TENSOR_INIT_ZEROS;
							}
							back_exec->inputs[i] = tensor_ref->d;
						} else {
							/* Otherwise, we need to sum them up, and then pass the summed result to the computation. */
							_ccv_nnc_graph_sum_autograd_tensor_versions(idx, d, exec_symbol_info_size, tensor_symbol_info, tensor_ver, autograd_execs, autograd_tensor_symbols, sum_execs);
							ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, tensor_ver->c);
							back_exec->inputs[i] = tensor_ref->d;
						}
					} else
						/* If this is an alias, go through all available tensor ref versions */
						back_exec->inputs[i] = _ccv_nnc_graph_sum_autograd_tensor_versions_alias(idx, d, tensor_symbol_info, exec_symbol_info_size, tensor_symbol_info + d, tensor_ver, autograd_execs, autograd_tensor_symbols, sum_execs);
				}
				for (i = 0; i < forw_exec->input_size; i++)
				{
					/* If we can skip this output, do that. */
					if (!(back_info->output_bitmasks[i >> 6] & ((uint64_t)1 << i)))
						continue;
					const int d = forw_exec->inputs[i];
					const int alias_ref = tensor_symbol_info[d].alias_ref;
					ccv_nnc_autograd_tensor_symbol_t tensor_sym = {
						.d = d
					};
					/* The simplest case (most common), it is not an alias. */
					if (!alias_ref)
					{
						ccv_array_push(autograd_tensor_symbols, &tensor_sym);
						const ccv_nnc_tensor_ref_t tensor_ref = {
							.d = autograd_tensor_symbols->rnum - 1,
							.x = idx,
							.exec_registry = 0,
							.alias_registry = 0
						};
						ccv_nnc_autograd_tensor_version_t* tensor_ver = autograd_tensor_versions + d;
						if (!tensor_ver->ref_version)
							tensor_ver->ref_version = ccv_array_new(sizeof(ccv_nnc_tensor_ref_t), 1, 0);
						ccv_array_push(tensor_ver->ref_version, &tensor_ref);
						back_exec->outputs[i] = tensor_ref.d;
					} else {
						/* Otherwise, in case that this is an alias, we try to find the existing one (in tensor_ver
						 * see if can meet the need (thus, for the tensor info / ofs, it fits). */
						ccv_nnc_autograd_tensor_version_t* tensor_ver = autograd_tensor_versions + (alias_ref - 1);
						if (!tensor_ver->ref_version)
							tensor_ver->ref_version = ccv_array_new(sizeof(ccv_nnc_tensor_ref_t), 1, 0);
						/* If already exists a ref version, check if any of these not-sealed tensors have free space. */
						int found = 0;
						for (j = tensor_ver->c; !found && j < tensor_ver->ref_version->rnum; j++)
						{
							ccv_nnc_tensor_ref_t* tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, j);
							if (!_ccv_nnc_tensor_ref_version_involve_alias(tensor_ref, autograd_tensor_symbols, tensor_symbol_info, tensor_symbol_info + d))
							{
								tensor_sym.alias_ref = tensor_ref->d + 1;
								ccv_array_push(autograd_tensor_symbols, &tensor_sym);
								const int ad = autograd_tensor_symbols->rnum - 1;
								ccv_array_push(tensor_ref->alias_registry, &ad);
								if (!tensor_ref->exec_registry)
									tensor_ref->exec_registry = ccv_array_new(sizeof(int), 1, 0);
								ccv_array_push(tensor_ref->exec_registry, &idx);
								back_exec->outputs[i] = ad;
								found = 1;
							}
						}
						if (!found) /* Cannot find an tensor ref to insert, create one first */
						{
							tensor_sym.d = alias_ref - 1; /* Reference back to the non-alias. */
							ccv_array_push(autograd_tensor_symbols, &tensor_sym);
							const ccv_nnc_tensor_ref_t tensor_ref = {
								.d = autograd_tensor_symbols->rnum - 1,
								.x = idx,
								.exec_registry = 0,
								.alias_registry = ccv_array_new(sizeof(int), 1, 0)
							};
							ccv_array_push(tensor_ver->ref_version, &tensor_ref);
							tensor_sym.d = d; /* set back */
							tensor_sym.alias_ref = tensor_ref.d + 1;
							ccv_array_push(autograd_tensor_symbols, &tensor_sym);
							const int ad = autograd_tensor_symbols->rnum - 1;
							ccv_array_push(tensor_ref.alias_registry, &ad);
							back_exec->outputs[i] = ad;
						}
					}
				}
			}
		}
	} ccv_nnc_graph_visit_endfor
	// Find all relevant wrt symbols, generate sum for them if needed.
	for (i = 0; i < wrt_symbol_size; i++)
	{
		const int d = wrt_symbols[i].d;
		ccv_nnc_autograd_tensor_version_t* tensor_ver = (!tensor_symbol_info[d].alias_ref) ? autograd_tensor_versions + d : autograd_tensor_versions + (tensor_symbol_info[d].alias_ref - 1);
		// If there are more than one tensor in the list, it is possible to sum them up.
		if (tensor_ver->c < tensor_ver->ref_version->rnum - 1)
			_ccv_nnc_graph_sum_autograd_tensor_versions(-1, d, exec_symbol_info_size, tensor_symbol_info, tensor_ver, autograd_execs, autograd_tensor_symbols, sum_execs);
		// The tensor version should have ref_version, and only one now (after sum up).
		assert(tensor_ver->c == tensor_ver->ref_version->rnum - 1);
	}
	ccv_nnc_symbolic_graph_backward_prep_t backward_prep = {
		.exec_symbol_info_size = exec_symbol_info_size,
		.tensor_symbol_info_size = tensor_symbol_info_size,
		.backward_info = backward_info,
		.backward_visit = backward_visit,
		.autograd_execs = autograd_execs,
		.autograd_tensor_versions = autograd_tensor_versions,
		.autograd_tensor_symbols = autograd_tensor_symbols,
		.sum_execs = sum_execs
	};
	return backward_prep;
}

static void _ccv_nnc_symbolic_graph_backward_prep_free(const ccv_nnc_symbolic_graph_backward_prep_t backward_prep)
{
	int i, j;
	const int exec_symbol_info_size = backward_prep.exec_symbol_info_size;
	const int tensor_symbol_info_size = backward_prep.tensor_symbol_info_size;
	ccv_nnc_autograd_graph_exec_symbol_t* const autograd_execs = backward_prep.autograd_execs;
	for (i = 0; i < exec_symbol_info_size; i++)
	{
		if (autograd_execs[i].inputs)
			ccfree(autograd_execs[i].inputs);
		if (autograd_execs[i].outgoings)
			ccv_array_free(autograd_execs[i].outgoings);
	}
	ccfree(autograd_execs);
	ccv_nnc_autograd_tensor_version_t* const autograd_tensor_versions = backward_prep.autograd_tensor_versions;
	for (i = 0; i < tensor_symbol_info_size; i++)
	{
		if (autograd_tensor_versions[i].ref_version)
		{
			for (j = 0; j < autograd_tensor_versions[i].ref_version->rnum; j++)
			{
				ccv_nnc_tensor_ref_t* ref_version = (ccv_nnc_tensor_ref_t*)ccv_array_get(autograd_tensor_versions[i].ref_version, j);
				if (ref_version->exec_registry)
					ccv_array_free(ref_version->exec_registry);
				if (ref_version->alias_registry)
					ccv_array_free(ref_version->alias_registry);
			}
			ccv_array_free(autograd_tensor_versions[i].ref_version);
		}
	}
	ccfree(autograd_tensor_versions);
	ccv_array_free(backward_prep.autograd_tensor_symbols);
	ccv_array_t* const sum_execs = backward_prep.sum_execs;
	for (i = 0; i < sum_execs->rnum; i++)
	{
		ccv_nnc_sum_graph_exec_symbol_t* sum_or_set = (ccv_nnc_sum_graph_exec_symbol_t*)ccv_array_get(sum_execs, i);
		if (sum_or_set->inputs)
			ccfree(sum_or_set->inputs);
		if (sum_or_set->outgoings)
			ccv_array_free(sum_or_set->outgoings);
	}
	ccv_array_free(sum_execs);
	ccv_nnc_graph_backward_info_t* const backward_info = backward_prep.backward_info;
	for (i = 0; i < exec_symbol_info_size; i++)
	{
		if (backward_info[i].outgoings)
			ccv_array_free(backward_info[i].outgoings);
		if (backward_info[i].input_bitmasks)
			ccfree(backward_info[i].input_bitmasks);
	}
	ccfree(backward_info);
	ccv_nnc_graph_visit_free(backward_prep.backward_visit);
}

static void _ccv_nnc_symbolic_graph_backward_gen(const ccv_nnc_symbolic_graph_backward_prep_t backward_prep, const ccv_nnc_graph_exec_symbol_info_t* const exec_symbol_info, const ccv_nnc_tensor_symbol_info_t* const tensor_symbol_info, const ccv_nnc_tensor_symbol_t* const f_symbols, const int f_symbol_size, const ccv_nnc_tensor_symbol_t* const wrt_symbols, const int wrt_symbol_size, ccv_nnc_symbolic_graph_t* const graph)
{
	const int exec_symbol_info_size = graph->exec_symbol_info->rnum;
	const int tensor_symbol_info_size = graph->tensor_symbol_info->rnum;
	int i, j;
	ccv_array_t* const autograd_tensor_symbols = backward_prep.autograd_tensor_symbols;
	// Generate required symbols based on the information gathered above.
	for (i = 0; i < autograd_tensor_symbols->rnum; i++)
	{
		ccv_nnc_autograd_tensor_symbol_t* symbol = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, i);
		assert(symbol->d >= 0);
		assert(symbol->d < tensor_symbol_info_size);
		const ccv_nnc_tensor_symbol_info_t* const forw_symbol = tensor_symbol_info + symbol->d;
		if (!symbol->alias_ref)
		{
			assert(!forw_symbol->alias_ref);
			symbol->symbol = ccv_nnc_tensor_symbol_new(graph, forw_symbol->info, 0);
			ccv_nnc_tensor_symbol_set_flags(graph, symbol->symbol, symbol->flags);
		} else {
			assert(forw_symbol->alias_ref);
			assert(symbol->flags == 0); // We don't set flags on alias.
			// Due to our generation order, this must be after the original symbol is created.
			ccv_nnc_autograd_tensor_symbol_t* ref = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, symbol->alias_ref - 1);
			symbol->symbol = ccv_nnc_tensor_symbol_alias_new(graph, ref->symbol, forw_symbol->ofs, forw_symbol->inc, forw_symbol->info, 0);
		}
	}
	ccv_nnc_graph_backward_info_t* const backward_info = backward_prep.backward_info;
	ccv_nnc_autograd_graph_exec_symbol_t* const autograd_execs = backward_prep.autograd_execs;
	ccv_array_t* symbols = ccv_array_new(sizeof(ccv_nnc_tensor_symbol_t), 0, 0);
	for (i = 0; i < exec_symbol_info_size; i++)
	{
		// This is not going to be an interesting node. Skip.
		if (backward_info[i].f_wrt != 0x3)
			continue;
		ccv_nnc_graph_backward_info_t* const back_info = backward_info + i;
		ccv_nnc_autograd_graph_exec_symbol_t* const back_exec = autograd_execs + i;
		if (back_exec->cmd.cmd == CCV_NNC_NOOP)
		{
			back_exec->symbol = ccv_nnc_graph_exec_symbol_new(graph, back_exec->cmd, 0, 0, 0, 0, 0);
			continue;
		}
		ccv_array_clear(symbols);
		// Gradient inputs.
		for (j = 0; j < back_exec->input_size; j++)
			if (back_info->input_bitmasks[j >> 6] & ((uint64_t)1 << j))
				ccv_array_push(symbols, &(((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->inputs[j]))->symbol));
			else
				ccv_array_push(symbols, &NO_TENSOR_SYMBOL);
		const ccv_nnc_graph_exec_symbol_info_t* const forw_exec = exec_symbol_info + i;
		// Inputs from forward function.
		for (j = 0; j < forw_exec->input_size; j++)
			if (!(back_info->input_bitmasks[(j + back_exec->input_size) >> 6] & ((uint64_t)1 << (j + back_exec->input_size))))
				ccv_array_push(symbols, &NO_TENSOR_SYMBOL);
			else {
				ccv_nnc_tensor_symbol_t symbol = {
					.info = tensor_symbol_info[forw_exec->inputs[j]].info,
					.d = forw_exec->inputs[j],
					.graph = graph
				};
				ccv_array_push(symbols, &symbol);
			}
		// Outputs from forward function.
		for (j = 0; j < forw_exec->output_size; j++)
			if (!(back_info->input_bitmasks[(j + back_exec->input_size + forw_exec->input_size) >> 6] & ((uint64_t)1 << (j + back_exec->input_size + forw_exec->input_size))))
				ccv_array_push(symbols, &NO_TENSOR_SYMBOL);
			else {
				ccv_nnc_tensor_symbol_t symbol = {
					.info = tensor_symbol_info[forw_exec->outputs[j]].info,
					.d = forw_exec->outputs[j],
					.graph = graph
				};
				ccv_array_push(symbols, &symbol);
			}
		for (j = 0; j < back_exec->output_size; j++)
			if (back_info->output_bitmasks[j >> 6] & ((uint64_t)1 << j))
				ccv_array_push(symbols, &(((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, back_exec->outputs[j]))->symbol));
			else
				ccv_array_push(symbols, &NO_TENSOR_SYMBOL);
		back_exec->symbol = ccv_nnc_graph_exec_symbol_new(graph, back_exec->cmd, ccv_array_get(symbols, 0), back_exec->input_size + forw_exec->input_size + forw_exec->output_size, ccv_array_get(symbols, back_exec->input_size + forw_exec->input_size + forw_exec->output_size), back_exec->output_size, 0);
	}
	ccv_array_t* const sum_execs = backward_prep.sum_execs;
	for (i = 0; i < sum_execs->rnum; i++)
	{
		ccv_nnc_sum_graph_exec_symbol_t* exec = (ccv_nnc_sum_graph_exec_symbol_t*)ccv_array_get(sum_execs, i);
		assert(exec->input_size);
		ccv_array_clear(symbols);
		// This is to sum.
		for (j = 0; j < exec->input_size; j++)
			ccv_array_push(symbols, &(((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, exec->inputs[j]))->symbol));
		ccv_nnc_cmd_t cmd = ccv_nnc_cmd(CCV_NNC_EWSUM_FORWARD, 0, CMD_GENERIC(), 0);
		exec->symbol = ccv_nnc_graph_exec_symbol_new(graph, cmd, ccv_array_get(symbols, 0), exec->input_size, &(((ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, exec->output))->symbol), 1, 0);
	}
	ccv_array_free(symbols);
	for (i = 0; i < exec_symbol_info_size; i++)
	{
		// This is not going to be an interesting node. Skip.
		if (backward_info[i].f_wrt != 0x3)
			continue;
		ccv_nnc_graph_exec_symbol_t forw_exec = {
			.d = i,
			.graph = graph
		};
		ccv_nnc_autograd_graph_exec_symbol_t* const back_exec = autograd_execs + i;
		ccv_nnc_graph_exec_symbol_concat(graph, forw_exec, back_exec->symbol);
		if (back_exec->outgoings)
			for (j = 0; j < back_exec->outgoings->rnum; j++)
			{
				int d = *(int*)ccv_array_get(back_exec->outgoings, j);
				if (d < exec_symbol_info_size)
					ccv_nnc_graph_exec_symbol_concat(graph, back_exec->symbol, autograd_execs[d].symbol);
				else
					ccv_nnc_graph_exec_symbol_concat(graph, back_exec->symbol, ((ccv_nnc_sum_graph_exec_symbol_t*)ccv_array_get(sum_execs, d - exec_symbol_info_size))->symbol);
			}
	}
	for (i = 0; i < sum_execs->rnum; i++)
	{
		ccv_nnc_sum_graph_exec_symbol_t* exec = (ccv_nnc_sum_graph_exec_symbol_t*)ccv_array_get(sum_execs, i);
		if (exec->outgoings)
			for (j = 0; j < exec->outgoings->rnum; j++)
			{
				int d = *(int*)ccv_array_get(exec->outgoings, j);
				if (d < exec_symbol_info_size)
					ccv_nnc_graph_exec_symbol_concat(graph, exec->symbol, autograd_execs[d].symbol);
				else
					ccv_nnc_graph_exec_symbol_concat(graph, exec->symbol, ((ccv_nnc_sum_graph_exec_symbol_t*)ccv_array_get(sum_execs, d - exec_symbol_info_size))->symbol);
			}
	}
	// Now, everything is done, set the metadata on graph so that we can lookup later for backward symbols
	if (graph->backward_tensor_symbols)
		graph->backward_tensor_symbols = (int*)ccrealloc(graph->backward_tensor_symbols, sizeof(int) * graph->tensor_symbol_info->rnum);
	else
		graph->backward_tensor_symbols = (int*)ccmalloc(sizeof(int) * graph->tensor_symbol_info->rnum);
	graph->backward_exec_symbols = graph->backward_tensor_symbols + tensor_symbol_info_size;
	graph->forward_symbol_size = tensor_symbol_info_size;
	graph->backward_symbol_size = graph->tensor_symbol_info->rnum - tensor_symbol_info_size;
	for (i = 0; i < graph->forward_symbol_size; i++)
		graph->backward_tensor_symbols[i] = -1;
	for (i = 0; i < graph->backward_symbol_size; i++)
		graph->backward_exec_symbols[i] = -1;
	ccv_nnc_autograd_tensor_version_t* const autograd_tensor_versions = backward_prep.autograd_tensor_versions;
	// Assigning for wrt symbols.
	for (i = 0; i < wrt_symbol_size; i++)
	{
		const int d = wrt_symbols[i].d;
		assert(d >= 0);
		assert(d < tensor_symbol_info_size);
		// If this wrt symbol is an alias, create extra alias for this.
		ccv_nnc_tensor_ref_t* tensor_ref;
		int dd;
		ccv_nnc_autograd_tensor_version_t* tensor_ver = autograd_tensor_versions + d;
		assert(tensor_ver->ref_version);
		tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, tensor_ver->c);
		ccv_nnc_autograd_tensor_symbol_t* autograd_symbol = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d);
		graph->backward_tensor_symbols[d] = autograd_symbol->symbol.d;
		assert(autograd_symbol->symbol.d >= tensor_symbol_info_size);
		dd = autograd_symbol->symbol.d - tensor_symbol_info_size;
		const int x = tensor_ref->x;
		if (tensor_ref->exec_registry && tensor_ref->exec_registry->rnum) // Create no-op node.
		{
			ccv_nnc_graph_exec_symbol_t noop = ccv_nnc_graph_exec_symbol_new(graph, ccv_nnc_cmd(CCV_NNC_NOOP, 0, CMD_GENERIC(), 0), 0, 0, 0, 0, 0);
			if (x < exec_symbol_info_size)
				ccv_nnc_graph_exec_symbol_concat(graph, autograd_execs[x].symbol, noop);
			else
				ccv_nnc_graph_exec_symbol_concat(graph, ((ccv_nnc_sum_graph_exec_symbol_t*)ccv_array_get(sum_execs, x - exec_symbol_info_size))->symbol, noop);
			for (j = 0; j < tensor_ref->exec_registry->rnum; j++)
			{
				const int x = *(int*)ccv_array_get(tensor_ref->exec_registry, j);
				assert(x >= 0); /* Otherwise, this is initialization tensor, which is impossible to be summed up by. */
				assert(x < exec_symbol_info_size); // exec_registry is only used by alias_registry, it simply cannot reference to a sum operation.
				ccv_nnc_graph_exec_symbol_concat(graph, autograd_execs[x].symbol, noop);
			}
			graph->backward_exec_symbols[dd] = noop.d;
		} else {
			if (x < exec_symbol_info_size)
				graph->backward_exec_symbols[dd] = autograd_execs[x].symbol.d;
			else
				graph->backward_exec_symbols[dd] = ((ccv_nnc_sum_graph_exec_symbol_t*)ccv_array_get(sum_execs, x - exec_symbol_info_size))->symbol.d;
		}
	}
	// Assigning for f symbols.
	for (i = 0; i < f_symbol_size; i++)
	{
		const int d = f_symbols[i].d;
		assert(d >= 0);
		assert(d < tensor_symbol_info_size);
		ccv_nnc_autograd_tensor_version_t* tensor_ver = autograd_tensor_versions + d;
		assert(tensor_ver->ref_version);
		ccv_nnc_tensor_ref_t* const tensor_ref = (ccv_nnc_tensor_ref_t*)ccv_array_get(tensor_ver->ref_version, tensor_ver->c);
		ccv_nnc_autograd_tensor_symbol_t* autograd_symbol = (ccv_nnc_autograd_tensor_symbol_t*)ccv_array_get(autograd_tensor_symbols, tensor_ref->d);
		graph->backward_tensor_symbols[d] = autograd_symbol->symbol.d;
		// Cannot find relevant backward exec symbols for f.
	}
}

void ccv_nnc_symbolic_graph_backward(ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_graph_exec_symbol_t* const sources, const int source_size, const ccv_nnc_graph_exec_symbol_t* const destinations, const int destination_size, const ccv_nnc_tensor_symbol_t* const f_symbols, const int f_symbol_size, const ccv_nnc_tensor_symbol_t* const wrt_symbols, const int wrt_symbol_size)
{
	// First, fill all the "auto" holes.
	// This is the symbol table that with "auto" info filled up.
	const int exec_symbol_info_size = graph->exec_symbol_info->rnum;
	const int tensor_symbol_info_size = graph->tensor_symbol_info->rnum;
	assert(exec_symbol_info_size > 0);
	assert(tensor_symbol_info_size > 0);
	ccv_nnc_graph_exec_symbol_info_t* exec_symbol_info = (ccv_nnc_graph_exec_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_graph_exec_symbol_info_t) * exec_symbol_info_size);
	ccv_nnc_tensor_symbol_info_t* tensor_symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccmalloc(sizeof(ccv_nnc_tensor_symbol_info_t) * tensor_symbol_info_size);
	ccv_nnc_graph_visit_t* forward_visit = ccv_nnc_graph_visit_new(graph, (ccv_nnc_graph_exec_symbol_info_t*)ccv_array_get(graph->exec_symbol_info, 0), exec_symbol_info_size, sources, source_size, destinations, destination_size);
	ccv_nnc_symbolic_graph_symbol_infer(graph, forward_visit, sources, source_size, destinations, destination_size, 0, 0, tensor_symbol_info, exec_symbol_info);
	int i;
	// f symbols cannot be alias.
	for (i = 0; i < f_symbol_size; i++)
		{ assert(!tensor_symbol_info[f_symbols[i].d].alias_ref); }
	// wrt symbols cannot be alias.
	for (i = 0; i < wrt_symbol_size; i++)
		{ assert(!tensor_symbol_info[wrt_symbols[i].d].alias_ref); }
	ccv_nnc_symbolic_graph_backward_prep_t backward_prep = _ccv_nnc_symbolic_graph_backward_prep(graph, forward_visit, exec_symbol_info, tensor_symbol_info, sources, source_size, destinations, destination_size, f_symbols, f_symbol_size, wrt_symbols, wrt_symbol_size);
	ccv_nnc_graph_visit_free(forward_visit);
	_ccv_nnc_symbolic_graph_backward_gen(backward_prep, exec_symbol_info, tensor_symbol_info, f_symbols, f_symbol_size, wrt_symbols, wrt_symbol_size, graph);
	_ccv_nnc_symbolic_graph_backward_prep_free(backward_prep);
	ccfree(exec_symbol_info);
	ccfree(tensor_symbol_info);
}

ccv_nnc_tensor_symbol_t ccv_nnc_tensor_symbol_for_backward(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t symbol)
{
	assert(symbol.d >= 0);
	assert(symbol.d < graph->forward_symbol_size);
	assert(graph->backward_tensor_symbols[symbol.d] >= 0);
	ccv_nnc_tensor_symbol_t tensor = {
		.d = graph->backward_tensor_symbols[symbol.d],
		.graph = graph,
		.info = symbol.info
	};
	return tensor;
}

ccv_nnc_graph_exec_symbol_t ccv_nnc_graph_exec_symbol_for_backward(const ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t symbol)
{
	assert(symbol.d >= graph->forward_symbol_size);
	assert(symbol.d < graph->forward_symbol_size + graph->backward_symbol_size);
	const int dd = symbol.d - graph->forward_symbol_size;
	assert(graph->backward_exec_symbols[dd] >= 0);
	ccv_nnc_graph_exec_symbol_t exec = {
		.d = graph->backward_exec_symbols[dd],
		.graph = graph
	};
	return exec;
}