#include <stdlib.h>
#include <stdint.h>   /* uint32_t for fsigndiff and vecncpy */
#include <string.h>   /* memset */
#include <math.h>     /* fabs */
#ifdef _MSC_VER
#include <malloc.h>   /* _aligned_malloc, _aligned_free */
#if 1400 <= _MSC_VER
#include <intrin.h>
#endif
#else
#include <xmmintrin.h>   /* SSE intrinsics */
#include <pmmintrin.h>   /* SSE3 intrinsics (_mm_hadd_ps) */
#endif
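/*
    fsigndiff(x, y) is nonzero iff *x and *y have opposite signs. When the
    values are IEEE-754 single-precision floats, XORing the two sign bits is
    enough; otherwise fall back to an arithmetic test.
 */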
#if LBFGS_FLOAT == 32 && LBFGS_IEEE_FLOAT
#define fsigndiff(x, y) (((*(uint32_t*)(x)) ^ (*(uint32_t*)(y))) & 0x80000000U)
#else
#define fsigndiff(x, y) (*(x) * (*(y) / fabs(*(y))) < 0.)
#endif/*LBFGS_IEEE_FLOAT*/
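/*
    Allocate a zero-initialized block aligned to a 16-byte boundary, as
    required by the aligned SSE loads/stores (_mm_load_ps/_mm_store_ps)
    used throughout this file.
 */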
inline static void* vecalloc(size_t size)
{
#ifdef _MSC_VER
    void *memblock = _aligned_malloc(size, 16);
#else
    /* _aligned_malloc() is MSVC-only; use posix_memalign() elsewhere. */
    void *memblock = NULL;
    if (posix_memalign(&memblock, 16, size) != 0) {
        memblock = NULL;
    }
#endif
    if (memblock != NULL) {
        memset(memblock, 0, size);
    }
    return memblock;
}
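/*
    Release a block obtained from vecalloc().
 */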
inline static void vecfree(void *memblock)
{
#ifdef _MSC_VER
    _aligned_free(memblock);
#else
    free(memblock);
#endif
}
#define vecset(x, c, n) \
{ \
    int i; \
    __m128 XMM0 = _mm_set_ps1(c); \
    for (i = 0;i < (n);i += 16) { \
        _mm_store_ps((x)+i   , XMM0); \
        _mm_store_ps((x)+i+ 4, XMM0); \
        _mm_store_ps((x)+i+ 8, XMM0); \
        _mm_store_ps((x)+i+12, XMM0); \
    } \
}
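/*
    veccpy: y[i] = x[i]. Processes 16 floats per iteration; n must be a
    multiple of 16.
 */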
#define veccpy(y, x, n) \
{ \
    int i; \
    for (i = 0;i < (n);i += 16) { \
        __m128 XMM0 = _mm_load_ps((x)+i   ); \
        __m128 XMM1 = _mm_load_ps((x)+i+ 4); \
        __m128 XMM2 = _mm_load_ps((x)+i+ 8); \
        __m128 XMM3 = _mm_load_ps((x)+i+12); \
        _mm_store_ps((y)+i   , XMM0); \
        _mm_store_ps((y)+i+ 4, XMM1); \
        _mm_store_ps((y)+i+ 8, XMM2); \
        _mm_store_ps((y)+i+12, XMM3); \
    } \
}
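/*
    vecncpy: y[i] = -x[i]. Negation is done by XORing the IEEE-754 sign bit
    (0x80000000) into every lane; n must be a multiple of 16.
 */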
#define vecncpy(y, x, n) \
{ \
    int i; \
    const uint32_t mask = 0x80000000; \
    __m128 XMM4 = _mm_load_ps1((float*)&mask); \
    for (i = 0;i < (n);i += 16) { \
        __m128 XMM0 = _mm_load_ps((x)+i   ); \
        __m128 XMM1 = _mm_load_ps((x)+i+ 4); \
        __m128 XMM2 = _mm_load_ps((x)+i+ 8); \
        __m128 XMM3 = _mm_load_ps((x)+i+12); \
        XMM0 = _mm_xor_ps(XMM0, XMM4); \
        XMM1 = _mm_xor_ps(XMM1, XMM4); \
        XMM2 = _mm_xor_ps(XMM2, XMM4); \
        XMM3 = _mm_xor_ps(XMM3, XMM4); \
        _mm_store_ps((y)+i   , XMM0); \
        _mm_store_ps((y)+i+ 4, XMM1); \
        _mm_store_ps((y)+i+ 8, XMM2); \
        _mm_store_ps((y)+i+12, XMM3); \
    } \
}
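/*
    vecadd: y[i] += c * x[i]. Processes 8 floats per iteration; n must be a
    multiple of 8.
 */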
#define vecadd(y, x, c, n) \
{ \
    int i; \
    __m128 XMM7 = _mm_set_ps1(c); \
    for (i = 0;i < (n);i += 8) { \
        __m128 XMM0 = _mm_load_ps((x)+i  ); \
        __m128 XMM1 = _mm_load_ps((x)+i+4); \
        __m128 XMM2 = _mm_load_ps((y)+i  ); \
        __m128 XMM3 = _mm_load_ps((y)+i+4); \
        XMM0 = _mm_mul_ps(XMM0, XMM7); \
        XMM1 = _mm_mul_ps(XMM1, XMM7); \
        XMM2 = _mm_add_ps(XMM2, XMM0); \
        XMM3 = _mm_add_ps(XMM3, XMM1); \
        _mm_store_ps((y)+i  , XMM2); \
        _mm_store_ps((y)+i+4, XMM3); \
    } \
}
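/*
    vecdiff: z[i] = x[i] - y[i]. Processes 16 floats per iteration; n must be
    a multiple of 16.
 */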
#define vecdiff(z, x, y, n) \
{ \
    int i; \
    for (i = 0;i < (n);i += 16) { \
        __m128 XMM0 = _mm_load_ps((x)+i   ); \
        __m128 XMM1 = _mm_load_ps((x)+i+ 4); \
        __m128 XMM2 = _mm_load_ps((x)+i+ 8); \
        __m128 XMM3 = _mm_load_ps((x)+i+12); \
        __m128 XMM4 = _mm_load_ps((y)+i   ); \
        __m128 XMM5 = _mm_load_ps((y)+i+ 4); \
        __m128 XMM6 = _mm_load_ps((y)+i+ 8); \
        __m128 XMM7 = _mm_load_ps((y)+i+12); \
        XMM0 = _mm_sub_ps(XMM0, XMM4); \
        XMM1 = _mm_sub_ps(XMM1, XMM5); \
        XMM2 = _mm_sub_ps(XMM2, XMM6); \
        XMM3 = _mm_sub_ps(XMM3, XMM7); \
        _mm_store_ps((z)+i   , XMM0); \
        _mm_store_ps((z)+i+ 4, XMM1); \
        _mm_store_ps((z)+i+ 8, XMM2); \
        _mm_store_ps((z)+i+12, XMM3); \
    } \
}
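/*
    vecscale: y[i] *= c. Processes 8 floats per iteration; n must be a
    multiple of 8.
 */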
#define vecscale(y, c, n) \
{ \
    int i; \
    __m128 XMM7 = _mm_set_ps1(c); \
    for (i = 0;i < (n);i += 8) { \
        __m128 XMM0 = _mm_load_ps((y)+i  ); \
        __m128 XMM1 = _mm_load_ps((y)+i+4); \
        XMM0 = _mm_mul_ps(XMM0, XMM7); \
        XMM1 = _mm_mul_ps(XMM1, XMM7); \
        _mm_store_ps((y)+i  , XMM0); \
        _mm_store_ps((y)+i+4, XMM1); \
    } \
}
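/*
    vecmul: y[i] *= x[i] (element-wise product). Processes 16 floats per
    iteration; n must be a multiple of 16.
 */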
#define vecmul(y, x, n) \
{ \
    int i; \
    for (i = 0;i < (n);i += 16) { \
        __m128 XMM0 = _mm_load_ps((x)+i   ); \
        __m128 XMM1 = _mm_load_ps((x)+i+ 4); \
        __m128 XMM2 = _mm_load_ps((x)+i+ 8); \
        __m128 XMM3 = _mm_load_ps((x)+i+12); \
        __m128 XMM4 = _mm_load_ps((y)+i   ); \
        __m128 XMM5 = _mm_load_ps((y)+i+ 4); \
        __m128 XMM6 = _mm_load_ps((y)+i+ 8); \
        __m128 XMM7 = _mm_load_ps((y)+i+12); \
        XMM4 = _mm_mul_ps(XMM4, XMM0); \
        XMM5 = _mm_mul_ps(XMM5, XMM1); \
        XMM6 = _mm_mul_ps(XMM6, XMM2); \
        XMM7 = _mm_mul_ps(XMM7, XMM3); \
        _mm_store_ps((y)+i   , XMM4); \
        _mm_store_ps((y)+i+ 4, XMM5); \
        _mm_store_ps((y)+i+ 8, XMM6); \
        _mm_store_ps((y)+i+12, XMM7); \
    } \
}
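/*
    __horizontal_sum(r, rw): reduce the four lanes of r to their sum,
    broadcast into every lane. With SSE3, two haddps instructions suffice and
    the work register rw is unused; the SSE-only fallback uses shuffles.
 */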
#if 3 <= __SSE__ || defined(__SSE3__)
#define __horizontal_sum(r, rw) \
r = _mm_hadd_ps(r, r); \
r = _mm_hadd_ps(r, r);
#else
#define __horizontal_sum(r, rw) \
rw = r; \
r = _mm_shuffle_ps(r, rw, _MM_SHUFFLE(1, 0, 3, 2)); \
r = _mm_add_ps(r, rw); \
rw = r; \
r = _mm_shuffle_ps(r, rw, _MM_SHUFFLE(2, 3, 0, 1)); \
r = _mm_add_ps(r, rw);
#endif
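/*
    vecdot: *s = sum of x[i] * y[i]. Two independent accumulators hide
    addition latency; n must be a multiple of 8.
 */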
#define vecdot(s, x, y, n) \
{ \
    int i; \
    __m128 XMM0 = _mm_setzero_ps(); \
    __m128 XMM1 = _mm_setzero_ps(); \
    __m128 XMM2, XMM3, XMM4, XMM5; \
    for (i = 0;i < (n);i += 8) { \
        XMM2 = _mm_load_ps((x)+i  ); \
        XMM3 = _mm_load_ps((x)+i+4); \
        XMM4 = _mm_load_ps((y)+i  ); \
        XMM5 = _mm_load_ps((y)+i+4); \
        XMM2 = _mm_mul_ps(XMM2, XMM4); \
        XMM3 = _mm_mul_ps(XMM3, XMM5); \
        XMM0 = _mm_add_ps(XMM0, XMM2); \
        XMM1 = _mm_add_ps(XMM1, XMM3); \
    } \
    XMM0 = _mm_add_ps(XMM0, XMM1); \
    __horizontal_sum(XMM0, XMM1); \
    _mm_store_ss((s), XMM0); \
}
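/*
    vecnorm: *s = sqrt(sum of x[i]^2), the Euclidean norm. The scalar sqrt
    is computed as q * rsqrt(q): _mm_rsqrt_ss gives a ~12-bit estimate a,
    which one Newton-Raphson step refines to a' = a*(1.5 - 0.5*q*a^2)
    before the final multiply by q. n must be a multiple of 8.
 */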
#define vecnorm(s, x, n) \
{ \
    int i; \
    __m128 XMM0 = _mm_setzero_ps(); \
    __m128 XMM1 = _mm_setzero_ps(); \
    __m128 XMM2, XMM3; \
    for (i = 0;i < (n);i += 8) { \
        XMM2 = _mm_load_ps((x)+i  ); \
        XMM3 = _mm_load_ps((x)+i+4); \
        XMM2 = _mm_mul_ps(XMM2, XMM2); \
        XMM3 = _mm_mul_ps(XMM3, XMM3); \
        XMM0 = _mm_add_ps(XMM0, XMM2); \
        XMM1 = _mm_add_ps(XMM1, XMM3); \
    } \
    XMM0 = _mm_add_ps(XMM0, XMM1); \
    __horizontal_sum(XMM0, XMM1); \
    XMM2 = XMM0; \
    XMM1 = _mm_rsqrt_ss(XMM0); \
    XMM3 = XMM1; \
    XMM1 = _mm_mul_ss(XMM1, XMM1); \
    XMM1 = _mm_mul_ss(XMM1, XMM3); \
    XMM1 = _mm_mul_ss(XMM1, XMM0); \
    XMM1 = _mm_mul_ss(XMM1, _mm_set_ss(-0.5f)); \
    XMM3 = _mm_mul_ss(XMM3, _mm_set_ss(1.5f)); \
    XMM3 = _mm_add_ss(XMM3, XMM1); \
    XMM3 = _mm_mul_ss(XMM3, XMM2); \
    _mm_store_ss((s), XMM3); \
}
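/*
    vecrnorm: *s = 1 / sqrt(sum of x[i]^2), the reciprocal norm, using the
    same rsqrt estimate plus one Newton-Raphson step as vecnorm, but without
    the final multiply by the squared sum. n must be a multiple of 8.
 */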
#define vecrnorm(s, x, n) \
{ \
    int i; \
    __m128 XMM0 = _mm_setzero_ps(); \
    __m128 XMM1 = _mm_setzero_ps(); \
    __m128 XMM2, XMM3; \
    for (i = 0;i < (n);i += 8) { \
        XMM2 = _mm_load_ps((x)+i  ); \
        XMM3 = _mm_load_ps((x)+i+4); \
        XMM2 = _mm_mul_ps(XMM2, XMM2); \
        XMM3 = _mm_mul_ps(XMM3, XMM3); \
        XMM0 = _mm_add_ps(XMM0, XMM2); \
        XMM1 = _mm_add_ps(XMM1, XMM3); \
    } \
    XMM0 = _mm_add_ps(XMM0, XMM1); \
    __horizontal_sum(XMM0, XMM1); \
    XMM2 = XMM0; \
    XMM1 = _mm_rsqrt_ss(XMM0); \
    XMM3 = XMM1; \
    XMM1 = _mm_mul_ss(XMM1, XMM1); \
    XMM1 = _mm_mul_ss(XMM1, XMM3); \
    XMM1 = _mm_mul_ss(XMM1, XMM0); \
    XMM1 = _mm_mul_ss(XMM1, _mm_set_ss(-0.5f)); \
    XMM3 = _mm_mul_ss(XMM3, _mm_set_ss(1.5f)); \
    XMM3 = _mm_add_ss(XMM3, XMM1); \
    _mm_store_ss((s), XMM3); \
}
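/*
    Usage sketch (hypothetical caller, not part of this header). Buffers must
    come from vecalloc() so loads/stores are 16-byte aligned, and n should be
    padded to a multiple of 16 so every macro's stride divides it:

        float *x = (float*)vecalloc(sizeof(float) * 16);
        float *y = (float*)vecalloc(sizeof(float) * 16);
        float dot;
        vecset(x, 1.f, 16);
        vecset(y, 2.f, 16);
        vecdot(&dot, x, y, 16);    // dot == 32.f
        vecfree(x);
        vecfree(y);
 */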