#define OVERRIDE_INNER_PRODUCT_SINGLE
/*
 * Single-precision dot product of a[0..len-1] and b[0..len-1] using SSE.
 *
 * a, b: input vectors; loaded with unaligned loads, so no alignment is
 *       required.
 * len:  number of elements to accumulate.
 *
 * Returns the sum of a[i]*b[i] as a float. Full groups of 8 elements go
 * through the vector loop; any remaining 1..7 elements are added by a scalar
 * tail so that nothing is read beyond index len-1.
 */
static inline float inner_product_single(
   const float *a,
   const float *b,
   unsigned int len)
{
   const int n = (int)len;
   int i;
   float ret;
   __m128 sum = _mm_setzero_ps();

   /* Two 4-float multiply-accumulates per iteration (8 elements). */
   for (i = 0; n - i >= 8; i += 8) {
      sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
      sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i+4), _mm_loadu_ps(b+i+4)));
   }

   /* Horizontal reduction: fold the upper two lanes onto the lower two,
      then add lane 1 into lane 0. */
   sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
   sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
   _mm_store_ss(&ret, sum);

   /* Scalar tail for lengths that are not a multiple of 8. */
   for (; i < n; i++) {
      ret += a[i] * b[i];
   }
   return ret;
}
57#define OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
58static inline float interpolate_product_single(
const float *a,
const float *b,
unsigned int len,
const spx_uint32_t oversample,
float *frac) {
61 __m128 sum = _mm_setzero_ps();
62 __m128 f = _mm_loadu_ps(frac);
63 for(i=0;i<(int)len;i+=2)
65 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i), _mm_loadu_ps(b+i*oversample)));
66 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_load1_ps(a+i+1), _mm_loadu_ps(b+(i+1)*oversample)));
68 sum = _mm_mul_ps(f, sum);
69 sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
70 sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
71 _mm_store_ss(&ret, sum);
#define OVERRIDE_INNER_PRODUCT_DOUBLE
/*
 * Dot product of a[0..len-1] and b[0..len-1] with double-precision
 * accumulation, using SSE2.
 *
 * Each product a[i]*b[i] is formed in single precision, then widened to
 * double before being added to the accumulator.
 *
 * a, b: input vectors; loaded with unaligned loads.
 * len:  number of elements to accumulate.
 *
 * Returns the accumulated sum as a double. Full groups of 8 elements go
 * through the vector loop; any remaining 1..7 elements are added by a scalar
 * tail so that nothing is read beyond index len-1.
 */
static inline double inner_product_double(
   const float *a,
   const float *b,
   unsigned int len)
{
   const int n = (int)len;
   int i;
   double ret;
   __m128d sum = _mm_setzero_pd();
   __m128 t;

   for (i = 0; n - i >= 8; i += 8) {
      /* Lower two products, then upper two, widened to double. */
      t = _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i));
      sum = _mm_add_pd(sum, _mm_cvtps_pd(t));
      sum = _mm_add_pd(sum, _mm_cvtps_pd(_mm_movehl_ps(t, t)));

      t = _mm_mul_ps(_mm_loadu_ps(a+i+4), _mm_loadu_ps(b+i+4));
      sum = _mm_add_pd(sum, _mm_cvtps_pd(t));
      sum = _mm_add_pd(sum, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
   }

   /* Add the high lane into the low lane and extract it. */
   sum = _mm_add_sd(sum, _mm_unpackhi_pd(sum, sum));
   _mm_store_sd(&ret, sum);

   /* Scalar tail for lengths that are not a multiple of 8; the product is
      formed in float first, matching the vector path. */
   for (; i < n; i++) {
      const float prod = a[i] * b[i];
      ret += (double)prod;
   }
   return ret;
}
100#define OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
101static inline double interpolate_product_double(
const float *a,
const float *b,
unsigned int len,
const spx_uint32_t oversample,
float *frac) {
105 __m128d sum1 = _mm_setzero_pd();
106 __m128d sum2 = _mm_setzero_pd();
107 __m128 f = _mm_loadu_ps(frac);
108 __m128d f1 = _mm_cvtps_pd(f);
109 __m128d f2 = _mm_cvtps_pd(_mm_movehl_ps(f,f));
113 t = _mm_mul_ps(_mm_load1_ps(a+i), _mm_loadu_ps(b+i*oversample));
114 sum1 = _mm_add_pd(sum1, _mm_cvtps_pd(t));
115 sum2 = _mm_add_pd(sum2, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
117 t = _mm_mul_ps(_mm_load1_ps(a+i+1), _mm_loadu_ps(b+(i+1)*oversample));
118 sum1 = _mm_add_pd(sum1, _mm_cvtps_pd(t));
119 sum2 = _mm_add_pd(sum2, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
121 sum1 = _mm_mul_pd(f1, sum1);
122 sum2 = _mm_mul_pd(f2, sum2);
123 sum = _mm_add_pd(sum1, sum2);
124 sum = _mm_add_sd(sum, _mm_unpackhi_pd(sum, sum));
125 _mm_store_sd(&ret, sum);