SAF
Loading...
Searching...
No Matches
saf_utility_veclib.c
Go to the documentation of this file.
1/*
2 * Copyright 2016-2018 Leo McCormack
3 *
4 * Permission to use, copy, modify, and/or distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
7 *
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
9 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
10 * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
11 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
12 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
13 * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
14 * PERFORMANCE OF THIS SOFTWARE.
15 */
16
49
50#include "saf_utilities.h"
51#include "saf_externals.h"
52
53/* Specify which LAPACK interface should be used: */
54#if defined(SAF_USE_INTEL_MKL_LP64)
55/* Note that Intel MKL LP64 supports Fortran LAPACK and LAPACKE interfaces: */
56# define SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE
57#elif defined(SAF_USE_INTEL_MKL_ILP64)
58/* Note that Intel MKL ILP64 will only work with the LAPACKE interface: */
59# define SAF_VECLIB_USE_LAPACKE_INTERFACE
60#elif defined(SAF_USE_OPEN_BLAS_AND_LAPACKE)
61# define SAF_VECLIB_USE_LAPACKE_INTERFACE
62#elif defined(SAF_USE_ATLAS)
63# define SAF_VECLIB_USE_CLAPACK_INTERFACE
64#elif defined(__APPLE__) && (defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64))
65# define SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE
66#elif defined(SAF_USE_GSL)
67# define SAF_VECLIB_USE_GSL_LINALG
68#else
69# error No LAPACK interface was specified!
70#endif
71
72/* Additional flags for Intel MKL */
73#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
74# ifndef SAF_INTEL_MKL_VML_MODE
94# define SAF_INTEL_MKL_VML_MODE ( VML_LA | VML_FTZDAZ_ON )
95# endif
96#endif
97
98/* These are mainly just to remove compiler warnings: */
99#if defined(__APPLE__) && defined(SAF_USE_APPLE_ACCELERATE_LP64) && !defined(ACCELERATE_NEW_LAPACK)
100 typedef __CLPK_integer veclib_int;
101 typedef __CLPK_real veclib_float;
102 typedef __CLPK_doublereal veclib_double;
103 typedef __CLPK_complex veclib_float_complex;
104 typedef __CLPK_doublecomplex veclib_double_complex;
105#elif defined(__APPLE__) && (defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64))
106/* Using ACCELERATE_NEW_LAPACK */
107# ifdef SAF_USE_APPLE_ACCELERATE_LP64
108 typedef __LAPACK_int veclib_int;
109# else /* SAF_USE_APPLE_ACCELERATE_ILP64: */
110 typedef __LAPACK_int veclib_int;
111# endif
112 typedef float veclib_float;
113 typedef double veclib_double;
114 typedef __LAPACK_float_complex veclib_float_complex;
115 typedef __LAPACK_double_complex veclib_double_complex;
116#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
117# ifdef SAF_USE_INTEL_MKL_LP64
118 typedef MKL_INT veclib_int;
119# else /* SAF_USE_INTEL_MKL_ILP64: */
120 typedef MKL_INT veclib_int;
121# endif
122 typedef float veclib_float;
123 typedef double veclib_double;
124 typedef MKL_Complex8 veclib_float_complex;
125 typedef MKL_Complex16 veclib_double_complex;
126#elif defined(SAF_USE_OPEN_BLAS_AND_LAPACKE)
127 typedef lapack_int veclib_int;
128 typedef float veclib_float;
129 typedef double veclib_double;
130 typedef lapack_complex_float veclib_float_complex;
131 typedef lapack_complex_double veclib_double_complex;
132#else
133 typedef int veclib_int;
134 typedef float veclib_float;
135 typedef double veclib_double;
136 typedef float_complex veclib_float_complex;
137 typedef double_complex veclib_double_complex;
138#endif
139
140
141/* ========================================================================== */
142/* Built-in CBLAS Functions (Level 0) */
143/* ========================================================================== */
144
145#ifdef SAF_USE_BUILT_IN_NAIVE_CBLAS
146void cblas_scopy(const int N, const float *X, const int incX, float *Y, const int incY){
147 int i;
148#if defined(SAF_ENABLE_SIMD)
149 if(incX==1 && incY==1){
150 for(i=0; i<(N-3); i+=4)
151 _mm_storeu_ps(Y+i, _mm_loadu_ps(X+i));
152 for(;i<N; i++) /* The residual (if N was not divisable by 4): */
153 Y[i] = X[i];
154 }
155 else
156 for(i=0; i<N; i++)
157 Y[i*incY] = X[i*incX];
158#else
159 for(i=0; i<N; i++)
160 Y[i*incY] = X[i*incX];
161#endif
162}
163
164void cblas_dcopy(const int N, const double *X, const int incX, double *Y, const int incY){
165 int i,j;
166 for(i=j=0; i<N; i+=incX, j+=incY)
167 Y[i] = X[j];
168}
169
170void cblas_ccopy(const int N, const void *X, const int incX, void *Y, const int incY){
171 int i,j;
172 float_complex *cX, *cY;
173 cX = (float_complex*)X;
174 cY = (float_complex*)Y;
175 for(i=j=0; i<N; i+=incX, j+=incY)
176 cY[i] = cX[j];
177}
178
179void cblas_zcopy(const int N, const void *X, const int incX, void *Y, const int incY){
180 int i,j;
181 double_complex *cX, *cY;
182 cX = (double_complex*)X;
183 cY = (double_complex*)Y;
184 for(i=j=0; i<N; i+=incX, j+=incY)
185 cY[i] = cX[j];
186}
187
188void cblas_saxpy(const int N, const float alpha, const float* X, const int incX, float* Y, const int incY) {
189 int i,j;
190 for (i=j=0; i<N; i+=incX, j+=incY)
191 Y[i] = alpha * X[i] + Y[i];
192}
193
194void cblas_daxpy(const int N, const double alpha, const double* X, const int incX, double* Y, const int incY) {
195 int i,j;
196 for (i=j=0; i<N; i+=incX, j+=incY)
197 Y[i] = alpha * X[i] + Y[i];
198}
199
200float cblas_sdot(const int N, const float* X, const int incX, const float* Y, const int incY){
201 int i;
202 float ret;
203#if defined(SAF_ENABLE_SIMD)
204 float sum2;
205 if(incX==1 && incY==1){
206 sum2 = 0.0f;
207 __m128 sum = _mm_setzero_ps();
208 /* Sum over all elements in groups of 4 */
209 for(i=0; i<(N-3); i+=4)
210 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(X+i), _mm_loadu_ps(Y+i)));
211 /* sum the first 2 elements with the second 2 elements and store in the first 2 elements */
212 sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum)/* copy elements 3 and 4 to 1 and 2 */);
213 /* sum the first 2 elements and store in the first element */
214 sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 2, 0, 1))/* shuffle so that element 2 goes to element 1 */);
215 _mm_store_ss(&ret, sum); /* get the first element (the result) */
216 for(;i<N; i++) /* The residual (if N was not divisable by 4): */
217 sum2 += X[i] * Y[i];
218 ret += sum2;
219 }
220 else
221 for(i=0; i<N; i++)
222 ret += X[i*incX] * Y[i*incY];
223#else
224 ret = 0.0f;
225 for(i=0; i<N; i++)
226 ret += X[i*incX] * Y[i*incY];
227#endif
228 return ret;
229}
230
231#endif
232
233
234/* ========================================================================== */
235/* Built-in CBLAS Functions (Level 3) */
236/* ========================================================================== */
237
238#ifdef SAF_USE_BUILT_IN_NAIVE_CBLAS
239void cblas_sgemm(const CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE TransA,
240 const CBLAS_TRANSPOSE TransB, const int M, const int N,
241 const int K, const float alpha, const float *A,
242 const int lda, const float *B, const int ldb,
243 const float beta, float *C, const int ldc)
244{
245 saf_assert(0, "INCOMPLETE and UNTESTED");
246 int i,j,k;
247 int _transA;
248 int _transB;
249
250 switch(Layout){
251 case CblasRowMajor:
252 _transA = (TransA == CblasNoTrans) ? SAF_FALSE : SAF_TRUE;
253 _transB = (TransB == CblasNoTrans) ? SAF_FALSE : SAF_TRUE;
254 break;
255 case CblasColMajor:
256 _transA = (TransA == CblasNoTrans) ? SAF_TRUE : SAF_FALSE;
257 _transB = (TransB == CblasNoTrans) ? SAF_TRUE : SAF_FALSE;
258 break;
259 }
260
261 if(!_transA && !_transB){
262 for(i=0; i<M; i++){
263 for(j=0; j<N; j++){
264 C[i*ldc+j] = 0.0f;
265 for(k=0; k<K; k++)
266 C[i*ldc+j] += A[i*lda+k] * B[k*ldb+j];
267 }
268 }
269 }
270 else if(_transA && !_transB){
271 }
272 else if(_transA && _transB){
273 }
274}
275#endif
276
277
278/* ========================================================================== */
279/* Find Index of Min-Abs-Value (?iminv) */
280/* ========================================================================== */
281
283(
284 const float* a,
285 const int len,
286 int* index
287)
288{
289#if defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
290 float minVal;
291 vDSP_Length ind_tmp;
292 vDSP_minmgvi(a, 1, &minVal, &ind_tmp, (vDSP_Length)len);
293 *index = (int)ind_tmp;
294#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
295 *index = (int)cblas_isamin(len, a, 1);
296#else
297 int i;
298 *index = 0;
299 float minVal=FLT_MAX;
300 for(i=0; i<len; i++){
301 if(fabsf(a[i])<minVal){
302 minVal = fabsf(a[i]);
303 *index = i;
304 }
305 }
306#endif
307}
308
310(
311 const float_complex* a,
312 const int len,
313 int* index
314)
315{
316#if defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64) /* Unfortunately requires a malloc call */
317 float* abs_a;
318 float minVal;
319 abs_a = malloc1d(len*sizeof(float));
320 vDSP_vdist((float*)a/*real*/, 2, (float*)a+1/*imag*/, 2, abs_a, 1, (vDSP_Length)len); /* cabsf */
321 vDSP_Length ind_tmp;
322 vDSP_minmgvi(abs_a, 1, &minVal, &ind_tmp, (vDSP_Length)len);
323 *index = (int)ind_tmp;
324 free(abs_a);
325#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
326 *index = (int)cblas_icamin(len, a, 1);
327#else
328 int i;
329 *index = 0;
330 float minVal=FLT_MAX;
331 for(i=0; i<len; i++){
332 if(cabsf(a[i])<minVal){
333 minVal = cabsf(a[i]);
334 *index = i;
335 }
336 }
337#endif
338}
339
341(
342 const double* a,
343 const int len,
344 int* index
345)
346{
347#if defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
348 double minVal;
349 vDSP_Length ind_tmp;
350 vDSP_minmgviD(a, 1, &minVal, &ind_tmp, (vDSP_Length)len);
351 *index = (int)ind_tmp;
352#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
353 *index = (int)cblas_idamin(len, a, 1);
354#else
355 int i;
356 *index = 0;
357 double minVal=DBL_MAX;
358 for(i=0; i<len; i++){
359 if(fabs(a[i])<minVal){
360 minVal = fabs(a[i]);
361 *index = i;
362 }
363 }
364#endif
365}
366
368(
369 const double_complex* a,
370 const int len,
371 int* index
372)
373{
374#if defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64) /* Unfortunately requires a malloc call */
375 double* abs_a;
376 double minVal;
377 abs_a = malloc1d(len*sizeof(double));
378 vDSP_vdistD((double*)a/*real*/, 2, (double*)a+1/*imag*/, 2, abs_a, 1, (vDSP_Length)len); /* cabs */
379 vDSP_Length ind_tmp;
380 vDSP_minmgviD(abs_a, 1, &minVal, &ind_tmp, (vDSP_Length)len);
381 *index = (int)ind_tmp;
382 free(abs_a);
383#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
384 *index = (int)cblas_izamin(len, a, 1);
385#else
386 int i;
387 *index = 0;
388 double minVal=DBL_MAX;
389 for(i=0; i<len; i++){
390 if(cabs(a[i])<minVal){
391 minVal = cabs(a[i]);
392 *index = i;
393 }
394 }
395#endif
396}
397
398
399/* ========================================================================== */
400/* Find Index of Max-Abs-Value (?imaxv) */
401/* ========================================================================== */
402
404(
405 const float* a,
406 const int len,
407 int* index
408)
409{
410 *index = (int)cblas_isamax(len, a, 1);
411}
412
414(
415 const float_complex* a,
416 const int len,
417 int* index
418)
419{
420 *index = (int)cblas_icamax(len, a, 1);
421}
422
424(
425 const double* a,
426 const int len,
427 int* index
428)
429{
430 *index = (int)cblas_idamax(len, a, 1);
431}
432
434(
435 const double_complex* a,
436 const int len,
437 int* index
438)
439{
440 *index = (int)cblas_izamax(len, a, 1);
441}
442
443
444/* ========================================================================== */
445/* Vector-Abs (?vabs) */
446/* ========================================================================== */
447
449(
450 const float* a,
451 const int len,
452 float* c
453)
454{
455#if defined(SAF_USE_INTEL_IPP)
456 ippsAbs_32f((Ipp32f*)a, (Ipp32f*)c, len);
457#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
458 vDSP_vabs(a, 1, c, 1, (vDSP_Length)len);
459#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
460 vmsAbs(len, a, c, SAF_INTEL_MKL_VML_MODE);
461#else
462 int i;
463 for(i=0; i<len; i++)
464 c[i] = fabsf(a[i]);
465#endif
466}
467
469(
470 const float_complex* a,
471 const int len,
472 float* c
473)
474{
475#if defined(SAF_USE_INTEL_IPP)
476 ippsMagnitude_32fc((Ipp32fc*)a, (Ipp32f*)c, len);
477#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
478 vDSP_vdist((float*)a/*real*/, 2, (float*)a+1/*imag*/, 2, c, 1, (vDSP_Length)len);
479#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
480 vmcAbs(len, (MKL_Complex8*)a, c, SAF_INTEL_MKL_VML_MODE);
481#else
482 int i;
483 for(i=0; i<len; i++)
484 c[i] = cabsf(a[i]);
485#endif
486}
487
488
489/* ========================================================================== */
490/* Vector-Modulus (?vmod) */
491/* ========================================================================== */
492
494(
495 const float* a,
496 const float* b,
497 const int len,
498 float* c
499)
500{
501#if defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
502 vvfmodf(c, a, b, &len);
503#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
504 vmsFmod(len, a, b, c, SAF_INTEL_MKL_VML_MODE);
505#else
506 int i;
507 for(i=0; i<len; i++)
508 c[i] = fmodf(a[i], b[i]);
509#endif
510}
511
512
513/* ========================================================================== */
514/* Vector-Reciprocal (?vrecip) */
515/* ========================================================================== */
516
518(
519 const float* a,
520 const int len,
521 float* c
522)
523{
524#if defined(SAF_USE_INTEL_IPP)
525 ippsDivCRev_32f((Ipp32f*)a, 1.0f, (Ipp32f*)c, len);
526#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
527 float one;
528 one = 1.0f;
529 vDSP_svdiv(&one, a, 1, c, 1, (vDSP_Length)len);
530#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
531 vmsInv(len, a, c, SAF_INTEL_MKL_VML_MODE);
532#elif defined(SAF_ENABLE_SIMD)
533 int i;
534 i = 0;
535# if defined(__AVX512F__)
536 for(; i<(len-15); i+=16)
537 _mm512_storeu_ps(c+i, _mm512_rcp14_ps(_mm512_loadu_ps(a+i)));
538# endif
539# if defined(__AVX__) && defined(__AVX2__)
540 for(; i<(len-7); i+=8)
541 _mm256_storeu_ps(c+i, _mm256_rcp_ps(_mm256_loadu_ps(a+i)));
542# endif
543# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
544 for(; i<(len-3); i+=4)
545 _mm_storeu_ps(c+i, _mm_rcp_ps(_mm_loadu_ps(a+i)));
546# endif
547 for(;i<len; i++) /* The residual (if len was not divisable by the step size): */
548 c[i] = 1.0f/a[i];
549#else
550 int i;
551 for(i=0; i<len; i++)
552 c[i] = 1.0f/a[i];
553#endif
554}
555
556
557/* ========================================================================== */
558/* Vector-Conjugate (?vconj) */
559/* ========================================================================== */
560
562(
563 const float_complex* a,
564 const int len,
565 float_complex* c
566)
567{
568#if defined(SAF_USE_INTEL_IPP)
569 ippsConj_32fc((Ipp32fc*)a, (Ipp32fc*)c, len);
570#else
571 cblas_ccopy(len, a, 1, c, 1);
572 cblas_sscal(len, -1.0f, ((float*)c)+1, 2);
573#endif
574}
575
577(
578 const double_complex* a,
579 const int len,
580 double_complex* c
581)
582{
583#if defined(SAF_USE_INTEL_IPP)
584 ippsConj_64fc((Ipp64fc*)a, (Ipp64fc*)c, len);
585#else
586 cblas_zcopy(len, a, 1, c, 1);
587 cblas_dscal(len, -1.0, ((double*)c)+1, 2);
588#endif
589}
590
591
592/* ========================================================================== */
593/* Vector-Vector Copy (?vvcopy) */
594/* ========================================================================== */
595
597(
598 const float* a,
599 const int len,
600 float* c
601)
602{
603 cblas_scopy(len, a, 1, c, 1);
604}
605
607(
608 const float_complex* a,
609 const int len,
610 float_complex* c
611)
612{
613 cblas_ccopy(len, a, 1, c, 1);
614}
615
617(
618 const double* a,
619 const int len,
620 double* c
621)
622{
623 cblas_dcopy(len, a, 1, c, 1);
624}
625
627(
628 const double_complex* a,
629 const int len,
630 double_complex* c
631)
632{
633 cblas_zcopy(len, a, 1, c, 1);
634}
635
636
637/* ========================================================================== */
638/* Vector-Vector Addition (?vvadd) */
639/* ========================================================================== */
640
642(
643 const float* a,
644 const float* b,
645 const int len,
646 float* c
647)
648{
649 /* Checks: */
650#ifndef NDEBUG
651 saf_assert(c!=NULL, "'c' can no longer be NULL");
652 saf_assert(a!=c && b!=c, "In-place operation is no longer supported, use e.g. cblas_saxby() instead.");
653#endif
654
655 /* The operation: */
656#if defined(SAF_USE_INTEL_IPP)
657 ippsAdd_32f((Ipp32f*)a, (Ipp32f*)b, (Ipp32f*)c, len);
658#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
659 vDSP_vadd(a, 1, b, 1, c, 1, (vDSP_Length)len);
660#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
661 vmsAdd(len, a, b, c, SAF_INTEL_MKL_VML_MODE);
662#elif defined(SAF_ENABLE_SIMD)
663 int i;
664 i = 0;
665# if defined(__AVX512F__)
666 for(; i<(len-15); i+=16)
667 _mm512_storeu_ps(c+i, _mm512_add_ps(_mm512_loadu_ps(a+i), _mm512_loadu_ps(b+i)));
668# endif
669# if defined(__AVX__) && defined(__AVX2__)
670 for(; i<(len-7); i+=8)
671 _mm256_storeu_ps(c+i, _mm256_add_ps(_mm256_loadu_ps(a+i), _mm256_loadu_ps(b+i)));
672# endif
673# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
674 for(; i<(len-3); i+=4)
675 _mm_storeu_ps(c+i, _mm_add_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
676# endif
677 for(; i<len; i++) /* The residual (if len was not divisable by the step size): */
678 c[i] = a[i] + b[i];
679#elif defined(NDEBUG)
680 int i;
681 /* try to indirectly "trigger" some compiler optimisations */
682 for(i=0; i<len-3; i+=4){
683 c[i] = a[i] + b[i];
684 c[i+1] = a[i+1] + b[i+1];
685 c[i+2] = a[i+2] + b[i+2];
686 c[i+3] = a[i+3] + b[i+3];
687 }
688 for(; i<len; i++) /* The residual (if len was not divisable by the step size): */
689 c[i] = a[i] + b[i];
690#else
691 int j;
692 for (j = 0; j < len; j++)
693 c[j] = a[j] + b[j];
694#endif
695}
696
698(
699 const float_complex* a,
700 const float_complex* b,
701 const int len,
702 float_complex* c
703)
704{
705 /* Checks: */
706#ifndef NDEBUG
707 saf_assert(c!=NULL, "'c' can no longer be NULL");
708 saf_assert(a!=c && b!=c, "In-place operation is no longer supported, use e.g. cblas_caxby() instead.");
709#endif
710
711 /* The operation: */
712#if defined(SAF_USE_INTEL_IPP)
713 ippsAdd_32fc((Ipp32fc*)a, (Ipp32fc*)b, (Ipp32fc*)c, len);
714#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
715 vDSP_vadd((float*)a, 1, (float*)b, 1, (float*)c, 1, /*re+im*/2*(vDSP_Length)len);
716#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
717 vmcAdd(len, (MKL_Complex8*)a, (MKL_Complex8*)b, (MKL_Complex8*)c, SAF_INTEL_MKL_VML_MODE);
718#elif defined(SAF_ENABLE_SIMD)
719 int i, len2;
720 float* sa, *sb, *sc;
721 len2 = len*2;
722 sa = (float*)a; sb = (float*)b; sc = (float*)c;
723 i = 0;
724# if defined(__AVX512F__)
725 for(; i<(len2-15); i+=16)
726 _mm512_storeu_ps(sc+i, _mm512_add_ps(_mm512_loadu_ps(sa+i), _mm512_loadu_ps(sb+i)));
727# endif
728# if defined(__AVX__) && defined(__AVX2__)
729 for(; i<(len2-7); i+=8)
730 _mm256_storeu_ps(sc+i, _mm256_add_ps(_mm256_loadu_ps(sa+i), _mm256_loadu_ps(sb+i)));
731# endif
732# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
733 for(; i<(len2-3); i+=4)
734 _mm_storeu_ps(sc+i, _mm_add_ps(_mm_loadu_ps(sa+i), _mm_loadu_ps(sb+i)));
735# endif
736 for(; i<len2; i++) /* The residual (if len2 was not divisable by the step size): */
737 sc[i] = sa[i] + sb[i];
738#elif __STDC_VERSION__ >= 199901L && defined(NDEBUG)
739 int i;
740 /* try to indirectly "trigger" some compiler optimisations */
741 for(i=0; i<len-3; i+=4){
742 c[i] = a[i] + b[i];
743 c[i+1] = a[i+1] + b[i+1];
744 c[i+2] = a[i+2] + b[i+2];
745 c[i+3] = a[i+3] + b[i+3];
746 }
747 for(; i<len; i++) /* The residual (if len was not divisable by the step size): */
748 c[i] = a[i] + b[i];
749#else
750 int j;
751 for (j = 0; j < len; j++)
752 c[j] = ccaddf(a[j], b[j]);
753#endif
754}
755
757(
758 const double* a,
759 const double* b,
760 const int len,
761 double* c
762)
763{
764 /* Checks: */
765#ifndef NDEBUG
766 saf_assert(c!=NULL, "'c' can no longer be NULL");
767 saf_assert(a!=c && b!=c, "In-place operation is no longer supported, use e.g. cblas_daxby() instead.");
768#endif
769
770 /* The operation: */
771#if defined(SAF_USE_INTEL_IPP)
772 ippsAdd_64f((Ipp64f*)a, (Ipp64f*)b, (Ipp64f*)c, len);
773#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
774 vDSP_vaddD(a, 1, b, 1, c, 1, (vDSP_Length)len);
775#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
776 vmdAdd(len, a, b, c, SAF_INTEL_MKL_VML_MODE);
777#elif defined(SAF_ENABLE_SIMD)
778 int i;
779 i = 0;
780# if defined(__AVX512F__)
781 for(; i<(len-7); i+=8)
782 _mm512_storeu_pd(c+i, _mm512_add_pd(_mm512_loadu_pd(a+i), _mm512_loadu_pd(b+i)));
783# endif
784# if defined(__AVX__) && defined(__AVX2__)
785 for(; i<(len-3); i+=4)
786 _mm256_storeu_pd(c+i, _mm256_add_pd(_mm256_loadu_pd(a+i), _mm256_loadu_pd(b+i)));
787# endif
788# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
789 for(; i<(len-1); i+=2)
790 _mm_storeu_pd(c+i, _mm_add_pd(_mm_loadu_pd(a+i), _mm_loadu_pd(b+i)));
791# endif
792 for(; i<len; i++) /* The residual (if len was not divisable by the step size): */
793 c[i] = a[i] + b[i];
794#else
795 int j;
796 for (j = 0; j < len; j++)
797 c[j] = a[j] + b[j];
798#endif
799}
800
802(
803 const double_complex* a,
804 const double_complex* b,
805 const int len,
806 double_complex* c
807)
808{
809 /* Checks: */
810#ifndef NDEBUG
811 saf_assert(c!=NULL, "'c' can no longer be NULL");
812 saf_assert(a!=c && b!=c, "In-place operation is no longer supported, use e.g. cblas_zaxby() instead.");
813#endif
814
815 /* The operation: */
816#if defined(SAF_USE_INTEL_IPP)
817 ippsAdd_64fc((Ipp64fc*)a, (Ipp64fc*)b, (Ipp64fc*)c, len);
818#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
819 vDSP_vaddD((double*)a, 1, (double*)b, 1, (double*)c, 1, /*re+im*/2*(vDSP_Length)len);
820#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
821 vmzAdd(len, (MKL_Complex16*)a, (MKL_Complex16*)b, (MKL_Complex16*)c, SAF_INTEL_MKL_VML_MODE);
822#elif defined(SAF_ENABLE_SIMD)
823 int i, len2;
824 double* sa, *sb, *sc;
825 len2 = len*2;
826 sa = (double*)a; sb = (double*)b; sc = (double*)c;
827 i = 0;
828# if defined(__AVX512F__)
829 for(; i<(len2-7); i+=8)
830 _mm512_storeu_pd(sc+i, _mm512_add_pd(_mm512_loadu_pd(sa+i), _mm512_loadu_pd(sb+i)));
831# endif
832# if defined(__AVX__) && defined(__AVX2__)
833 for(; i<(len2-3); i+=4)
834 _mm256_storeu_pd(sc+i, _mm256_add_pd(_mm256_loadu_pd(sa+i), _mm256_loadu_pd(sb+i)));
835# endif
836# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
837 for(; i<(len2-1); i+=2)
838 _mm_storeu_pd(sc+i, _mm_add_pd(_mm_loadu_pd(sa+i), _mm_loadu_pd(sb+i)));
839# endif
840 for(; i<len2; i++) /* The residual (if len2 was not divisable by the step size): */
841 sc[i] = sa[i] + sb[i];
842#else
843 int j;
844 for (j = 0; j < len; j++)
845 c[j] = ccadd(a[j], b[j]);
846#endif
847}
848
849
850/* ========================================================================== */
851/* Vector-Vector Subtraction (?vvsub) */
852/* ========================================================================== */
853
855(
856 const float* a,
857 const float* b,
858 const int len,
859 float* c
860)
861{
862 /* Checks: */
863#ifndef NDEBUG
864 saf_assert(c!=NULL, "'c' cannot be NULL");
865 saf_assert(a!=c && b!=c, "In-place operation is not supported.");
866#endif
867
868 /* The operation: */
869#if defined(SAF_USE_INTEL_IPP)
870 ippsSub_32f((Ipp32f*)b, (Ipp32f*)a, (Ipp32f*)c, len); /* 'a' and 'b' are switched */
871#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
872 vDSP_vsub(b, 1, a, 1, c, 1, (vDSP_Length)len); /* 'a' and 'b' are switched */
873#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
874 vmsSub(len, a, b, c, SAF_INTEL_MKL_VML_MODE);
875#elif defined(SAF_ENABLE_SIMD)
876 int i;
877 i = 0;
878# if defined(__AVX512F__)
879 for(; i<(len-15); i+=16)
880 _mm512_storeu_ps(c+i, _mm512_sub_ps(_mm512_loadu_ps(a+i), _mm512_loadu_ps(b+i)));
881# endif
882# if defined(__AVX__) && defined(__AVX2__)
883 for(; i<(len-7); i+=8)
884 _mm256_storeu_ps(c+i, _mm256_sub_ps(_mm256_loadu_ps(a+i), _mm256_loadu_ps(b+i)));
885# endif
886# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
887 for(; i<(len-3); i+=4)
888 _mm_storeu_ps(c+i, _mm_sub_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
889# endif
890 for(; i<len; i++) /* The residual (if len was not divisable by the step size): */
891 c[i] = a[i] - b[i];
892#elif defined(NDEBUG)
893 int i;
894 /* try to indirectly "trigger" some compiler optimisations */
895 for(i=0; i<len-3; i+=4){
896 c[i] = a[i] - b[i];
897 c[i+1] = a[i+1] - b[i+1];
898 c[i+2] = a[i+2] - b[i+2];
899 c[i+3] = a[i+3] - b[i+3];
900 }
901 for(; i<len; i++) /* The residual (if len was not divisable by the step size): */
902 c[i] = a[i] - b[i];
903#else
904 int j;
905 for (j = 0; j < len; j++)
906 c[j] = a[j] - b[j];
907#endif
908}
909
911(
912 const float_complex* a,
913 const float_complex* b,
914 const int len,
915 float_complex* c
916)
917{
918 /* Checks: */
919#ifndef NDEBUG
920 saf_assert(c!=NULL, "'c' cannot be NULL");
921 saf_assert(a!=c && b!=c, "In-place operation is not supported.");
922#endif
923
924 /* The operation: */
925#if defined(SAF_USE_INTEL_IPP)
926 ippsSub_32fc((Ipp32fc*)b, (Ipp32fc*)a, (Ipp32fc*)c, len); /* 'a' and 'b' are switched */
927#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
928 vDSP_vsub((float*)b, 1, (float*)a, 1, (float*)c, 1, /*re+im*/2*(vDSP_Length)len); /* 'a' and 'b' are switched */
929#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
930 vmcSub(len, (MKL_Complex8*)a, (MKL_Complex8*)b, (MKL_Complex8*)c, SAF_INTEL_MKL_VML_MODE);
931#elif defined(SAF_ENABLE_SIMD)
932 int i, len2;
933 float* sa, *sb, *sc;
934 len2 = len*2;
935 sa = (float*)a; sb = (float*)b; sc = (float*)c;
936 i = 0;
937# if defined(__AVX512F__)
938 for(; i<(len2-15); i+=16)
939 _mm512_storeu_ps(sc+i, _mm512_sub_ps(_mm512_loadu_ps(sa+i), _mm512_loadu_ps(sb+i)));
940# endif
941# if defined(__AVX__) && defined(__AVX2__)
942 for(; i<(len2-7); i+=8)
943 _mm256_storeu_ps(sc+i, _mm256_sub_ps(_mm256_loadu_ps(sa+i), _mm256_loadu_ps(sb+i)));
944# endif
945# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
946 for(; i<(len2-3); i+=4)
947 _mm_storeu_ps(sc+i, _mm_sub_ps(_mm_loadu_ps(sa+i), _mm_loadu_ps(sb+i)));
948# endif
949 for(; i<len2; i++) /* The residual (if len2 was not divisable by the step size): */
950 sc[i] = sa[i] - sb[i];
951#elif __STDC_VERSION__ >= 199901L && defined(NDEBUG)
952 int i;
953 /* try to indirectly "trigger" some compiler optimisations */
954 for(i=0; i<len-3; i+=4){
955 c[i] = a[i] - b[i];
956 c[i+1] = a[i+1] - b[i+1];
957 c[i+2] = a[i+2] - b[i+2];
958 c[i+3] = a[i+3] - b[i+3];
959 }
960 for(; i<len; i++) /* The residual (if len was not divisable by the step size): */
961 c[i] = a[i] - b[i];
962#else
963 int j;
964 for (j = 0; j < len; j++)
965 c[j] = ccsubf(a[j], b[j]);
966#endif
967}
968
970(
971 const double* a,
972 const double* b,
973 const int len,
974 double* c
975)
976{
977 /* Checks: */
978#ifndef NDEBUG
979 saf_assert(c!=NULL, "'c' cannot be NULL");
980 saf_assert(a!=c && b!=c, "In-place operation is not supported.");
981#endif
982
983 /* The operation: */
984#if defined(SAF_USE_INTEL_IPP)
985 ippsSub_64f((Ipp64f*)b, (Ipp64f*)a, (Ipp64f*)c, len); /* 'a' and 'b' are switched */
986#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
987 vDSP_vsubD(b, 1, a, 1, c, 1, (vDSP_Length)len); /* 'a' and 'b' are switched */
988#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
989 vmdSub(len, a, b, c, SAF_INTEL_MKL_VML_MODE);
990#elif defined(SAF_ENABLE_SIMD)
991 int i;
992 i = 0;
993# if defined(__AVX512F__)
994 for(; i<(len-7); i+=8)
995 _mm512_storeu_pd(c+i, _mm512_sub_pd(_mm512_loadu_pd(a+i), _mm512_loadu_pd(b+i)));
996# endif
997# if defined(__AVX__) && defined(__AVX2__)
998 for(; i<(len-3); i+=4)
999 _mm256_storeu_pd(c+i, _mm256_sub_pd(_mm256_loadu_pd(a+i), _mm256_loadu_pd(b+i)));
1000# endif
1001# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
1002 for(; i<(len-1); i+=2)
1003 _mm_storeu_pd(c+i, _mm_sub_pd(_mm_loadu_pd(a+i), _mm_loadu_pd(b+i)));
1004# endif
1005 for(;i<len; i++) /* The residual (if len was not divisable by the step size): */
1006 c[i] = a[i] - b[i];
1007#else
1008 int j;
1009 for (j = 0; j < len; j++)
1010 c[j] = a[j] - b[j];
1011#endif
1012}
1013
1015(
1016 const double_complex* a,
1017 const double_complex* b,
1018 const int len,
1019 double_complex* c
1020)
1021{
1022 /* Checks: */
1023#ifndef NDEBUG
1024 saf_assert(c!=NULL, "'c' cannot be NULL");
1025 saf_assert(a!=c && b!=c, "In-place operation is not supported.");
1026#endif
1027
1028 /* The operation: */
1029#if defined(SAF_USE_INTEL_IPP)
1030 ippsSub_64fc((Ipp64fc*)b, (Ipp64fc*)a, (Ipp64fc*)c, len); /* 'a' and 'b' are switched */
1031#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
1032 vDSP_vsubD((double*)b, 1, (double*)a, 1, (double*)c, 1, /*re+im*/2*(vDSP_Length)len); /* 'a' and 'b' are switched */
1033#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1034 vmzSub(len, (MKL_Complex16*)a, (MKL_Complex16*)b, (MKL_Complex16*)c, SAF_INTEL_MKL_VML_MODE);
1035#elif defined(SAF_ENABLE_SIMD)
1036 int i, len2;
1037 double* sa, *sb, *sc;
1038 len2 = len*2;
1039 sa = (double*)a; sb = (double*)b; sc = (double*)c;
1040 i = 0;
1041# if defined(__AVX512F__)
1042 for(; i<(len2-7); i+=8)
1043 _mm512_storeu_pd(sc+i, _mm512_sub_pd(_mm512_loadu_pd(sa+i), _mm512_loadu_pd(sb+i)));
1044# endif
1045# if defined(__AVX__) && defined(__AVX2__)
1046 for(; i<(len2-3); i+=4)
1047 _mm256_storeu_pd(sc+i, _mm256_sub_pd(_mm256_loadu_pd(sa+i), _mm256_loadu_pd(sb+i)));
1048# endif
1049# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
1050 for(; i<(len2-1); i+=2)
1051 _mm_storeu_pd(sc+i, _mm_sub_pd(_mm_loadu_pd(sa+i), _mm_loadu_pd(sb+i)));
1052# endif
1053 for(;i<len2; i++) /* The residual (if len2 was not divisable by the step size): */
1054 sc[i] = sa[i] - sb[i];
1055#else
1056 int j;
1057 for (j = 0; j < len; j++)
1058 c[j] = ccsub(a[j], b[j]);
1059#endif
1060}
1061
1062
1063/* ========================================================================== */
1064/* Vector-Vector Multiplication (?vvmul) */
1065/* ========================================================================== */
1066
1068(
1069 const float* a,
1070 const float* b,
1071 const int len,
1072 float* c
1073)
1074{
1075 /* Checks: */
1076#ifndef NDEBUG
1077 saf_assert(c!=NULL, "'c' can no longer be NULL");
1078 saf_assert(a!=c && b!=c, "In-place operation is no longer supported.");
1079#endif
1080
1081 /* The operation: */
1082#if defined(SAF_USE_INTEL_IPP)
1083 ippsMul_32f((Ipp32f*)a, (Ipp32f*)b, (Ipp32f*)c, len);
1084#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
1085 vDSP_vmul(a, 1, b, 1, c, 1, (vDSP_Length)len);
1086#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1087 vmsMul(len, a, b, c, SAF_INTEL_MKL_VML_MODE);
1088#elif defined(SAF_ENABLE_SIMD)
1089 int i;
1090 i = 0;
1091# if defined(__AVX512F__)
1092 for(; i<(len-15); i+=16)
1093 _mm512_storeu_ps(c+i, _mm512_mul_ps(_mm512_loadu_ps(a+i), _mm512_loadu_ps(b+i)));
1094# endif
1095# if defined(__AVX__) && defined(__AVX2__)
1096 for(; i<(len-7); i+=8)
1097 _mm256_storeu_ps(c+i, _mm256_mul_ps(_mm256_loadu_ps(a+i), _mm256_loadu_ps(b+i)));
1098# endif
1099# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
1100 for(; i<(len-3); i+=4)
1101 _mm_storeu_ps(c+i, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
1102# endif
1103 for(;i<len; i++) /* The residual (if len was not divisable by the step size): */
1104 c[i] = a[i] * b[i];
1105#elif defined(NDEBUG)
1106 int i;
1107 /* try to indirectly "trigger" some compiler optimisations */
1108 for(i=0; i<len-3; i+=4){
1109 c[i] = a[i] * b[i];
1110 c[i+1] = a[i+1] * b[i+1];
1111 c[i+2] = a[i+2] * b[i+2];
1112 c[i+3] = a[i+3] * b[i+3];
1113 }
1114 for(; i<len; i++) /* The residual (if len was not divisable by the step size): */
1115 c[i] = a[i] * b[i];
1116#else
1117 int j;
1118 for (j = 0; j < len; j++)
1119 c[j] = a[j] * b[j];
1120#endif
1121}
1122
1124(
1125 const float_complex* a,
1126 const float_complex* b,
1127 const int len,
1128 float_complex* c
1129)
1130{
1131 /* Checks: */
1132#ifndef NDEBUG
1133 saf_assert(c!=NULL, "'c' can no longer be NULL");
1134 saf_assert(a!=c && b!=c, "In-place operation is no longer supported.");
1135#endif
1136
1137 /* The operation: */
1138#if defined(SAF_USE_INTEL_IPP)
1139 ippsMul_32fc((Ipp32fc*)a, (Ipp32fc*)b, (Ipp32fc*)c, len);
1140#elif (defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)) && 0
1141 /* Imaginary part of the output */
1142 vDSP_vmul((float*)a/*real*/, 2, (float*)b+1/*imag*/, 2, (float*)c+1/*imag*/, 2, (vDSP_Length)len);
1143 vDSP_vma((float*)a+1/*imag*/, 2, (float*)b/*real*/, 2, (float*)c+1/*imag*/, 2, (float*)c/*real*/, 2, (vDSP_Length)len); /* Use the real part of c as a temporary buffer */
1144 cblas_scopy(len, (float*)c/*real*/, 2, (float*)c+1/*imag*/, 2); /* Copy the imaginary part from the temporary buffer */
1145 /* Real part of the output */
1146 vDSP_vmmsb((float*)a/*real*/, 2, (float*)b/*real*/, 2, (float*)a+1/*imag*/, 2, (float*)b+1/*imag*/, 2, (float*)c/*real*/, 2, (vDSP_Length)len);
1147#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1148 vmcMul(len, (MKL_Complex8*)a, (MKL_Complex8*)b, (MKL_Complex8*)c, SAF_INTEL_MKL_VML_MODE);
1149#elif defined(SAF_ENABLE_SIMD)
1150 int i;
1151 float* sa, *sb, *sc;
1152 sa = (float*)a; sb = (float*)b; sc = (float*)c;
1153# if (defined(__AVX__) && defined(__AVX2__)) || defined(__AVX512F__) /* I couldn't figure out an alternative for addsub with AVX-512... */
1154 __m256i permute_ri = _mm256_set_epi32(6, 7, 4, 5, 2, 3, 0, 1);
1155 for(i=0; i<(len-3); i+=4){
1156 /* Load only the real parts of a */
1157 __m256 src1 = _mm256_moveldup_ps(_mm256_loadu_ps(sa+2*i)/*|a1|b1|a2|b2|a3|b3|a4|b4|*/); /*|a1|a1|a2|a2|a3|a3|a4|a4|*/
1158 /* Load real+imag parts of b */
1159 __m256 src2 = _mm256_loadu_ps(sb+2*i); /*|c1|d1|c2|d2|c3|d3|c4|d4|*/
1160 /* Multiply together */
1161 __m256 tmp1 = _mm256_mul_ps(src1, src2);
1162 /* Swap the real+imag parts of b to be imag+real instead: */
1163 __m256 b1 = _mm256_permutevar8x32_ps(src2, permute_ri);
1164 /* Load only the imag parts of a */
1165 src1 = _mm256_movehdup_ps(_mm256_loadu_ps(sa+2*i)/*|a1|b1|a2|b2|a3|b3|a4|b4|*/); /*|b1|b1|b2|b2|b3|b3|b4|b4|*/
1166 /* Multiply together */
1167 __m256 tmp2 = _mm256_mul_ps(src1, b1);
1168 /* Add even indices, subtract odd indices */
1169 _mm256_storeu_ps(sc+2*i, _mm256_addsub_ps(tmp1, tmp2));
1170 }
1171# elif defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
1172 for(i=0; i<(len-1); i+=2){
1173 /* Load only the real parts of a */
1174 __m128 src1 = _mm_moveldup_ps(_mm_loadu_ps(sa+2*i)/*|a1|b1|a2|b2|*/); /*|a1|a1|a2|a2|*/
1175 /* Load real+imag parts of b */
1176 __m128 src2 = _mm_loadu_ps(sb+2*i); /*|c1|d1|c2|d2|*/
1177 /* Multiply together */
1178 __m128 tmp1 = _mm_mul_ps(src1, src2);
1179 /* Swap the real+imag parts of b to be imag+real instead: */
1180 __m128 b1 = _mm_shuffle_ps(src2, src2, _MM_SHUFFLE(2, 3, 0, 1));
1181 /* Load only the imag parts of a */
1182 src1 = _mm_movehdup_ps(_mm_loadu_ps(sa+2*i)/*|a1|b1|a2|b2|*/); /*|b1|b1|b2|b2|*/
1183 /* Multiply together */
1184 __m128 tmp2 = _mm_mul_ps(src1, b1);
1185 /* Add even indices, subtract odd indices */
1186 _mm_storeu_ps(sc+2*i, _mm_addsub_ps(tmp1, tmp2));
1187 }
1188# endif
1189 for(;i<len; i++){ /* The residual (if len was not divisable by the step size): */
1190 sc[2*i] = sa[2*i] * sb[2*i] - sa[2*i+1] * sb[2*i+1];
1191 sc[2*i+1] = sa[2*i] * sb[2*i+1] + sa[2*i+1] * sb[2*i];
1192 }
1193#elif __STDC_VERSION__ >= 199901L && defined(NDEBUG)
1194 int i;
1195 /* try to indirectly "trigger" some compiler optimisations */
1196 for(i=0; i<len-3; i+=4){
1197 c[i] = a[i] * b[i];
1198 c[i+1] = a[i+1] * b[i+1];
1199 c[i+2] = a[i+2] * b[i+2];
1200 c[i+3] = a[i+3] * b[i+3];
1201 }
1202 for(; i<len; i++) /* The residual (if len was not divisable by the step size): */
1203 c[i] = a[i] * b[i];
1204#elif __STDC_VERSION__ >= 199901L
1205 int i;
1206 for (i = 0; i < len; i++)
1207 c[i] = a[i] * b[i];
1208#else
1209 int i;
1210 for (i = 0; i < len; i++)
1211 c[i] = ccmulf(a[i], b[i]);
1212#endif
1213}
1214
1215
1216/* ========================================================================== */
1217/* Vector-Vector Dot Product (?vvdot) */
1218/* ========================================================================== */
1219
1221(
1222 const float* a,
1223 const float* b,
1224 const int len,
1225 float* c
1226)
1227{
1228 c[0] = cblas_sdot (len, a, 1, b, 1);
1229}
1230
1232(
1233 const float_complex* a,
1234 const float_complex* b,
1235 const int len,
1236 CONJ_FLAG flag,
1237 float_complex* c
1238)
1239{
1240 switch(flag){
1241 default:
1242 case NO_CONJ:
1243 cblas_cdotu_sub(len, a, 1, b, 1, c);
1244 break;
1245 case CONJ:
1246 cblas_cdotc_sub(len, a, 1, b, 1, c);
1247 break;
1248 }
1249}
1250
1251
1252/* ========================================================================== */
1253/* Vector-Scalar Product (?vsmul) */
1254/* ========================================================================== */
1255
1257(
1258 float* a,
1259 const float* s,
1260 const int len,
1261 float* c
1262)
1263{
1264#if defined(SAF_USE_INTEL_IPP)
1265 if(c==NULL)
1266 ippsMulC_32f_I((Ipp32f)s[0], (Ipp32f*)a, len);
1267 else
1268 ippsMulC_32f((Ipp32f*)a, (Ipp32f)s[0], (Ipp32f*)c, len);
1269#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
1270 if(c==NULL)
1271 cblas_sscal(len, s[0], a, 1);
1272 else
1273 vDSP_vsmul(a, 1, s, c, 1, (vDSP_Length)len);
1274#else
1275 if (c == NULL)
1276 cblas_sscal(len, s[0], a, 1);
1277 else {
1278 utility_svvcopy(a, len, c);
1279 cblas_sscal(len, s[0], c, 1);
1280 }
1281#endif
1282}
1283
1285(
1286 float_complex* a,
1287 const float_complex* s,
1288 const int len,
1289 float_complex* c
1290)
1291{
1292 if (c == NULL)
1293 cblas_cscal(len, s, a, 1);
1294 else {
1295 cblas_ccopy(len, a, 1, c, 1);
1296 cblas_cscal(len, s, c, 1);
1297 }
1298}
1299
1301(
1302 double* a,
1303 const double* s,
1304 const int len,
1305 double* c
1306)
1307{
1308#if defined(SAF_USE_INTEL_IPP)
1309 if(c==NULL)
1310 ippsMulC_64f_I((Ipp64f)s[0], (Ipp64f*)a, len);
1311 else
1312 ippsMulC_64f((Ipp64f*)a, (Ipp64f)s[0], (Ipp64f*)c, len);
1313#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
1314 if(c==NULL)
1315 cblas_dscal(len, s[0], a, 1);
1316 else
1317 vDSP_vsmulD(a, 1, s, c, 1, (vDSP_Length)len);
1318#else
1319 if (c == NULL)
1320 cblas_dscal(len, s[0], a, 1);
1321 else {
1322 utility_dvvcopy(a, len, c);
1323 cblas_dscal(len, s[0], c, 1);
1324 }
1325#endif
1326}
1327
1329(
1330 double_complex* a,
1331 const double_complex* s,
1332 const int len,
1333 double_complex* c
1334)
1335{
1336 if (c == NULL)
1337 cblas_zscal(len, s, a, 1);
1338 else {
1339 cblas_zcopy(len, a, 1, c, 1);
1340 cblas_zscal(len, s, c, 1);
1341 }
1342}
1343
1344
1345/* ========================================================================== */
1346/* Vector-Scalar Division (?vsdiv) */
1347/* ========================================================================== */
1348
1350(
1351 const float* a,
1352 const float* s,
1353 const int len,
1354 float* c
1355)
1356{
1357 if(s[0] == 0.0f){
1358 memset(c, 0, len*sizeof(float));
1359 return;
1360 }
1361#if defined(SAF_USE_INTEL_IPP)
1362 ippsDivC_32f((Ipp32f*)a, (Ipp32f)s[0], (Ipp32f*)c, len);
1363#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
1364 vDSP_vsdiv(a, 1, s, c, 1, (vDSP_Length)len);
1365#else
1366 cblas_scopy(len, a, 1, c, 1);
1367 cblas_sscal(len, 1.0f/s[0], c, 1);
1368#endif
1369}
1370
1371
1372/* ========================================================================== */
1373/* Vector-Scalar Addition (?vsadd) */
1374/* ========================================================================== */
1375
1377(
1378 float* a,
1379 const float* s,
1380 const int len,
1381 float* c
1382)
1383{
1384#if defined(SAF_USE_INTEL_IPP)
1385 ippsAddC_32f((Ipp32f*)a, (Ipp32f)s[0], (Ipp32f*)c, len);
1386#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
1387 vDSP_vsadd(a, 1, s, c, 1, (vDSP_Length)len);
1388#elif defined(SAF_ENABLE_SIMD)
1389 int i;
1390# if defined(__AVX512F__)
1391 __m512 s16 = _mm512_set1_ps(s[0]);
1392 for(i=0; i<(len-15); i+=16)
1393 _mm512_storeu_ps(c+i, _mm512_add_ps(_mm512_loadu_ps(a+i), s16));
1394# elif defined(__AVX__) && defined(__AVX2__)
1395 __m256 s8 = _mm256_set1_ps(s[0]);
1396 for(i=0; i<(len-7); i+=8)
1397 _mm256_storeu_ps(c+i, _mm256_add_ps(_mm256_loadu_ps(a+i), s8));
1398# elif defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
1399 __m128 s4 = _mm_load_ps1(s);
1400 for(i=0; i<(len-3); i+=4)
1401 _mm_storeu_ps(c+i, _mm_add_ps(_mm_loadu_ps(a+i), s4));
1402# endif
1403 for(;i<len; i++) /* The residual (if len was not divisable by the step size): */
1404 c[i] = a[i] + s[0];
1405#else
1406 int i;
1407 for(i=0; i<len; i++)
1408 c[i] = a[i] + s[0];
1409#endif
1410}
1411
1412
1413/* ========================================================================== */
1414/* Vector-Scalar Subtraction (?vssub) */
1415/* ========================================================================== */
1416
1418(
1419 float* a,
1420 const float* s,
1421 const int len,
1422 float* c
1423)
1424{
1425#if defined(SAF_USE_INTEL_IPP)
1426 ippsSubC_32f((Ipp32f*)a, (Ipp32f)s[0], (Ipp32f*)c, len);
1427#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
1428 float inv_s;
1429 inv_s = -s[0];
1430 vDSP_vsadd(a, 1, &inv_s, c, 1, (vDSP_Length)len);
1431#elif defined(SAF_ENABLE_SIMD)
1432 int i;
1433# if defined(__AVX512F__)
1434 __m512 s16 = _mm512_set1_ps(s[0]);
1435 for(i=0; i<(len-15); i+=16)
1436 _mm512_storeu_ps(c+i, _mm512_sub_ps(_mm512_loadu_ps(a+i), s16));
1437# elif defined(__AVX__) && defined(__AVX2__)
1438 __m256 s8 = _mm256_set1_ps(s[0]);
1439 for(i=0; i<(len-7); i+=8)
1440 _mm256_storeu_ps(c+i, _mm256_sub_ps(_mm256_loadu_ps(a+i), s8));
1441# elif defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
1442 __m128 s4 = _mm_load_ps1(s);
1443 for(i=0; i<(len-3); i+=4)
1444 _mm_storeu_ps(c+i, _mm_sub_ps(_mm_loadu_ps(a+i), s4));
1445# endif
1446 for(;i<len; i++) /* The residual (if len was not divisable by the step size): */
1447 c[i] = a[i] - s[0];
1448#else
1449 int i;
1450 for(i=0; i<len; i++)
1451 c[i] = a[i] - s[0];
1452#endif
1453}
1454
1455
1456/* ========================================================================== */
1457/* Sparse-Vector to Compressed-Vector (Known Indices) (?sv2cv_inds) */
1458/* ========================================================================== */
1459
1461(
1462 const float* sv,
1463 const int* inds,
1464 const int len,
1465 float* cv
1466)
1467{
1468 int i;
1469#if defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64) /* Unfortunately requires a malloc call */
1470 /* Due to Apple "logic", we first need to add 1 to all of the indicies, since "vDSP_vgathr" is going to then subtract 1 from them all... */
1471 vDSP_Length* inds_vDSP;
1472 inds_vDSP = malloc1d(len*sizeof(vDSP_Length));
1473 for(i=0; i<len; i++)
1474 inds_vDSP[i] = (vDSP_Length)(inds[i] + 1);
1475 vDSP_vgathr(sv, inds_vDSP, 1, cv, 1, (vDSP_Length)len);
1476 free(inds_vDSP);
1477#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1478 veclib_int* inds_tmp;
1479 if(sizeof(veclib_int)==sizeof(int)) /* LP64 MKL */
1480 cblas_sgthr(len, sv, cv, (veclib_int*)inds);
1481 else{ /* ILP64 MKL */
1482 inds_tmp = malloc1d(len*sizeof(veclib_int)); /* Unfortunately requires a malloc call */
1483 for(i=0; i<len; i++)
1484 inds_tmp[i] = (veclib_int)inds[i];
1485 cblas_sgthr(len, sv, cv, (veclib_int*)inds_tmp);
1486 free(inds_tmp);
1487 }
1488#elif defined(NDEBUG)
1489 /* try to indirectly "trigger" some compiler optimisations */
1490 for(i=0; i<len-3; i+=4){
1491 cv[i] = sv[inds[i]];
1492 cv[i+1] = sv[inds[i+1]];
1493 cv[i+2] = sv[inds[i+2]];
1494 cv[i+3] = sv[inds[i+3]];
1495 }
1496 for(; i<len; i++) /* The residual (if len was not divisable by the step size): */
1497 cv[i] = sv[inds[i]];
1498#else
1499 for(i=0; i<len; i++)
1500 cv[i] = sv[inds[i]];
1501#endif
1502}
1503
1505(
1506 const float_complex* sv,
1507 const int* inds,
1508 const int len,
1509 float_complex* cv
1510)
1511{
1512 int i;
1513#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1514 veclib_int* inds_tmp;
1515 if(sizeof(veclib_int)==sizeof(int)) /* LP64 MKL */
1516 cblas_cgthr(len, sv, cv, (veclib_int*)inds);
1517 else{ /* ILP64 MKL */
1518 inds_tmp = malloc1d(len*sizeof(veclib_int)); /* Unfortunately requires a malloc call */
1519 for(i=0; i<len; i++)
1520 inds_tmp[i] = (veclib_int)inds[i];
1521 cblas_cgthr(len, sv, cv, (veclib_int*)inds_tmp);
1522 free(inds_tmp);
1523 }
1524#elif defined(NDEBUG)
1525 /* try to indirectly "trigger" some compiler optimisations */
1526 for(i=0; i<len-3; i+=4){
1527 cv[i] = sv[inds[i]];
1528 cv[i+1] = sv[inds[i+1]];
1529 cv[i+2] = sv[inds[i+2]];
1530 cv[i+3] = sv[inds[i+3]];
1531 }
1532 for(; i<len; i++) /* The residual (if len was not divisable by the step size): */
1533 cv[i] = sv[inds[i]];
1534#else
1535 for(i=0; i<len; i++)
1536 cv[i] = sv[inds[i]];
1537#endif
1538}
1539
1541(
1542 const double* sv,
1543 const int* inds,
1544 const int len,
1545 double* cv
1546)
1547{
1548 int i;
1549#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1550 veclib_int* inds_tmp;
1551 if(sizeof(veclib_int)==sizeof(int)) /* LP64 MKL */
1552 cblas_dgthr(len, sv, cv, (veclib_int*)inds);
1553 else{ /* ILP64 MKL */
1554 inds_tmp = malloc1d(len*sizeof(veclib_int)); /* Unfortunately requires a malloc call */
1555 for(i=0; i<len; i++)
1556 inds_tmp[i] = (veclib_int)inds[i];
1557 cblas_dgthr(len, sv, cv, (veclib_int*)inds_tmp);
1558 free(inds_tmp);
1559 }
1560#elif defined(NDEBUG)
1561 /* try to indirectly "trigger" some compiler optimisations */
1562 for(i=0; i<len-3; i+=4){
1563 cv[i] = sv[inds[i]];
1564 cv[i+1] = sv[inds[i+1]];
1565 cv[i+2] = sv[inds[i+2]];
1566 cv[i+3] = sv[inds[i+3]];
1567 }
1568 for(; i<len; i++) /* The residual (if len was not divisable by the step size): */
1569 cv[i] = sv[inds[i]];
1570#else
1571 for(i=0; i<len; i++)
1572 cv[i] = sv[inds[i]];
1573#endif
1574}
1575
1577(
1578 const double_complex* sv,
1579 const int* inds,
1580 const int len,
1581 double_complex* cv
1582)
1583{
1584 int i;
1585#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1586 veclib_int* inds_tmp;
1587 if(sizeof(veclib_int)==sizeof(int)) /* LP64 MKL */
1588 cblas_zgthr(len, sv, cv, (veclib_int*)inds);
1589 else{ /* ILP64 MKL */
1590 inds_tmp = malloc1d(len*sizeof(veclib_int)); /* Unfortunately requires a malloc call */
1591 for(i=0; i<len; i++)
1592 inds_tmp[i] = (veclib_int)inds[i];
1593 cblas_zgthr(len, sv, cv, (veclib_int*)inds_tmp);
1594 free(inds_tmp);
1595 }
1596#elif defined(NDEBUG)
1597 /* try to indirectly "trigger" some compiler optimisations */
1598 for(i=0; i<len-3; i+=4){
1599 cv[i] = sv[inds[i]];
1600 cv[i+1] = sv[inds[i+1]];
1601 cv[i+2] = sv[inds[i+2]];
1602 cv[i+3] = sv[inds[i+3]];
1603 }
1604 for(; i<len; i++) /* The residual (if len was not divisable by the step size): */
1605 cv[i] = sv[inds[i]];
1606#else
1607 for(i=0; i<len; i++)
1608 cv[i] = sv[inds[i]];
1609#endif
1610}
1611
1612
1613/* ========================================================================== */
1614/* Singular-Value Decomposition (?svd) */
1615/* ========================================================================== */
1616
1618typedef struct _utility_ssvd_data {
1619 int maxDim1, maxDim2;
1620 veclib_int currentWorkSize;
1621 float* a, *s, *u, *vt;
1622 float* work;
1624
1625void utility_ssvd_create(void ** const phWork, int maxDim1, int maxDim2)
1626{
1627 *phWork = malloc1d(sizeof(utility_ssvd_data));
1628 utility_ssvd_data *h = (utility_ssvd_data*)(*phWork);
1629
1630 h->maxDim1 = maxDim1;
1631 h->maxDim2 = maxDim2;
1632 h->currentWorkSize = 0;
1633 h->a = malloc1d(maxDim1*maxDim2*sizeof(float));
1634 h->s = malloc1d(SAF_MIN(maxDim2,maxDim1)*sizeof(float));
1635 h->u = malloc1d(maxDim1*maxDim1*sizeof(float));
1636 h->vt = malloc1d(maxDim2*maxDim2*sizeof(float));
1637 h->work = NULL;
1638}
1639
1640void utility_ssvd_destroy(void ** const phWork)
1641{
1642 utility_ssvd_data *h = (utility_ssvd_data*)(*phWork);
1643
1644 if(h!=NULL){
1645 free(h->a);
1646 free(h->s);
1647 free(h->u);
1648 free(h->vt);
1649 free(h->work);
1650
1651 free(h);
1652 h=NULL;
1653 *phWork = NULL;
1654 }
1655}
1656
1658(
1659 void* const hWork,
1660 const float* A,
1661 const int dim1,
1662 const int dim2,
1663 float* U,
1664 float* S,
1665 float* V,
1666 float* sing
1667)
1668{
1670 veclib_int i, j, m, n, lda, ldu, ldvt, info;
1671 veclib_int lwork;
1672 float wkopt;
1673
1674 m = dim1; n = dim2; lda = dim1; ldu = dim1; ldvt = dim2;
1675
1676 /* Work struct */
1677 if(hWork==NULL)
1678 utility_ssvd_create((void**)&h, dim1, dim2);
1679 else{
1680 h = (utility_ssvd_data*)(hWork);
1681#ifndef NDEBUG
1682 saf_assert(dim1<=h->maxDim1, "dim1 exceeds the maximum length specified");
1683 saf_assert(dim2<=h->maxDim2, "dim2 exceeds the maximum length specified");
1684#endif
1685 }
1686
1687 /* store in column major order */
1688 for(i=0; i<dim1; i++)
1689 for(j=0; j<dim2; j++)
1690 h->a[j*dim1+i] = A[i*dim2 +j];
1691
1692 /* Query how much "work" memory is required */
1693 lwork = -1;
1694#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
1695 sgesvd_( "A", "A", &m, &n, h->a, &lda, h->s, h->u, &ldu, h->vt, &ldvt, &wkopt, &lwork, &info );
1696#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
1697 saf_print_error("No such implementation available in ATLAS CLAPACK");
1698#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
1699 info = LAPACKE_sgesvd_work(CblasColMajor, 'A', 'A', m, n, h->a, lda, h->s, h->u, ldu, h->vt, ldvt, &wkopt, lwork);
1700#endif
1701 lwork = (veclib_int)wkopt;
1702 if(lwork>h->currentWorkSize){
1703 h->currentWorkSize = lwork;
1704 h->work = realloc1d(h->work, h->currentWorkSize*sizeof(float));
1705 }
1706
1707 /* perform the singular value decomposition */
1708#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
1709 sgesvd_( "A", "A", &m, &n, h->a, &lda, h->s, h->u, &ldu, h->vt, &ldvt, h->work, &lwork, &info );
1710#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
1711 saf_print_error("No such implementation available in ATLAS CLAPACK");
1712#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
1713 info = LAPACKE_sgesvd_work(CblasColMajor, 'A', 'A', m, n, h->a, lda, h->s, h->u, ldu, h->vt, ldvt, h->work, lwork);
1714#endif
1715
1716 /* svd failed to converge */
1717 if( info != 0 ) {
1718 if (S != NULL)
1719 memset(S, 0, dim1*dim2*sizeof(float));
1720 if (U != NULL)
1721 memset(U, 0, dim1*dim1*sizeof(float));
1722 if (V != NULL)
1723 memset(V, 0, dim2*dim2*sizeof(float));
1724 if (sing != NULL)
1725 memset(sing, 0, SAF_MIN(dim1, dim2)*sizeof(float));
1726#ifndef NDEBUG
1727 /* The SVD failed to converge, or the input matrix contained illegal
1728 * values so no solution was attempted. In these cases this function
1729 * will zero all output matrices and/or vectors. */
1730 saf_print_warning("Could not compute SVD in utility_ssvd(). Output matrices/vectors have been zeroed.");
1731#endif
1732 }
1733 /* svd successful */
1734 else {
1735 if (S != NULL){
1736 memset(S, 0, dim1*dim2*sizeof(float));
1737 /* singular values on the diagonal MIN(dim1, dim2). The remaining elements are 0. */
1738 for(i=0; i<SAF_MIN(dim1, dim2); i++)
1739 S[i*dim2+i] = h->s[i];
1740 }
1741
1742 /*return as row-major*/
1743 if (U != NULL)
1744 for(i=0; i<dim1; i++)
1745 for(j=0; j<dim1; j++)
1746 U[i*dim1+j] = h->u[j*dim1+i];
1747
1748 /* lapack returns VT, i.e. row-major V already */
1749 if (V != NULL)
1750 for(i=0; i<dim2; i++)
1751 for(j=0; j<dim2; j++)
1752 V[i*dim2+j] = h->vt[i*dim2+j];
1753 if (sing != NULL)
1754 for(i=0; i<SAF_MIN(dim1, dim2); i++)
1755 sing[i] = h->s[i];
1756 }
1757
1758 if(hWork == NULL)
1759 utility_ssvd_destroy((void**)&h);
1760}
1761
1763typedef struct _utility_csvd_data {
1764 int maxDim1, maxDim2;
1765 veclib_int currentWorkSize;
1766 float_complex* a, *u, *vt, *work;
1767 float *s, *rwork;
1769
1770void utility_csvd_create(void ** const phWork, int maxDim1, int maxDim2)
1771{
1772 *phWork = malloc1d(sizeof(utility_csvd_data));
1773 utility_csvd_data *h = (utility_csvd_data*)(*phWork);
1774
1775 h->maxDim1 = maxDim1;
1776 h->maxDim2 = maxDim2;
1777 h->currentWorkSize = 0;
1778 h->a = malloc1d(maxDim1*maxDim2*sizeof(float_complex));
1779 h->s = malloc1d(SAF_MIN(maxDim2,maxDim1)*sizeof(float));
1780 h->u = malloc1d(maxDim1*maxDim1*sizeof(float_complex));
1781 h->vt = malloc1d(maxDim2*maxDim2*sizeof(float_complex));
1782 h->rwork = malloc1d(maxDim1*SAF_MAX(1, 5*SAF_MIN(maxDim2,maxDim1))*sizeof(float));
1783 h->work = NULL;
1784}
1785
1786void utility_csvd_destroy(void ** const phWork)
1787{
1788 utility_csvd_data *h = (utility_csvd_data*)(*phWork);
1789
1790 if(h!=NULL){
1791 free(h->a);
1792 free(h->s);
1793 free(h->u);
1794 free(h->vt);
1795 free(h->work);
1796 free(h->rwork);
1797
1798 free(h);
1799 h=NULL;
1800 *phWork = NULL;
1801 }
1802}
1803
1805(
1806 void* const hWork,
1807 const float_complex* A,
1808 const int dim1,
1809 const int dim2,
1810 float_complex* U,
1811 float_complex* S,
1812 float_complex* V,
1813 float* sing
1814)
1815{
1817 veclib_int m, n, lda, ldu, ldvt, info;
1818 veclib_int lwork;
1819 float_complex wkopt;
1820#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1821 const MKL_Complex8 calpha = {1.0f, 0.0f};
1822#else
1823 int i, j;
1824#endif
1825
1826 m = dim1; n = dim2; lda = dim1; ldu = dim1; ldvt = dim2;
1827
1828 /* Work struct */
1829 if(hWork==NULL)
1830 utility_csvd_create((void**)&h, dim1, dim2);
1831 else{
1832 h = (utility_csvd_data*)(hWork);
1833#ifndef NDEBUG
1834 saf_assert(dim1<=h->maxDim1, "dim1 exceeds the maximum length specified");
1835 saf_assert(dim2<=h->maxDim2, "dim2 exceeds the maximum length specified");
1836#endif
1837 }
1838
1839 /* store in column major order (i.e. transpose) */
1840#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1841 MKL_Comatcopy('R', 'T', dim1, dim2, calpha, (veclib_float_complex*)A, dim2, (veclib_float_complex*)h->a, dim1);
1842#else
1843 for(i=0; i<dim1; i++)
1844 for(j=0; j<dim2; j++)
1845 h->a[j*dim1+i] = A[i*dim2+j];
1846#endif
1847
1848 /* Query how much "work" memory is required */
1849 lwork = -1;
1850#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
1851 cgesvd_( "A", "A", (veclib_int*)&m, (veclib_int*)&n, (veclib_float_complex*)h->a, (veclib_int*)&lda, h->s, (veclib_float_complex*)h->u, (veclib_int*)&ldu,
1852 (veclib_float_complex*)h->vt, &ldvt, (veclib_float_complex*)&wkopt, &lwork, h->rwork, (veclib_int*)&info );
1853#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
1854 saf_print_error("No such implementation available in ATLAS CLAPACK");
1855#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
1856 info = LAPACKE_cgesvd_work(CblasColMajor, 'A', 'A', m, n, (veclib_float_complex*)h->a, lda, h->s, (veclib_float_complex*)h->u, ldu,
1857 (veclib_float_complex*)h->vt, ldvt, (veclib_float_complex*)&wkopt, lwork, h->rwork);
1858#endif
1859 lwork = (veclib_int)(crealf(wkopt)+0.01f);
1860 if(lwork>h->currentWorkSize){
1861 h->currentWorkSize = lwork;
1862 h->work = realloc1d(h->work, h->currentWorkSize*sizeof(float_complex));
1863 }
1864
1865 /* perform the singular value decomposition */
1866#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
1867 cgesvd_( "A", "A", &m, &n, (veclib_float_complex*)h->a, &lda, h->s, (veclib_float_complex*)h->u, &ldu, (veclib_float_complex*)h->vt, &ldvt,
1868 (veclib_float_complex*)h->work, &lwork, h->rwork, &info);
1869#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
1870 saf_print_error("No such implementation available in ATLAS CLAPACK");
1871#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
1872 info = LAPACKE_cgesvd_work(CblasColMajor, 'A', 'A', m, n, (veclib_float_complex*)h->a, lda, h->s, (veclib_float_complex*)h->u, ldu, (veclib_float_complex*)h->vt, ldvt,
1873 (veclib_float_complex*)h->work, lwork, h->rwork);
1874#endif
1875
1876 /* svd failed to converge */
1877 if( info != 0 ) {
1878 if (S != NULL)
1879 memset(S, 0, dim1*dim2*sizeof(float_complex));
1880 if (U != NULL)
1881 memset(U, 0, dim1*dim1*sizeof(float_complex));
1882 if (V != NULL)
1883 memset(V, 0, dim2*dim2*sizeof(float_complex));
1884 if (sing != NULL)
1885 memset(sing, 0, SAF_MIN(dim1, dim2)*sizeof(float_complex));
1886#ifndef NDEBUG
1887 /* The SVD failed to converge, or the input matrix contained illegal
1888 * values so no solution was attempted. In these cases this function
1889 * will zero all output matrices and/or vectors. */
1890 saf_print_warning("Could not compute SVD in utility_csvd(). Output matrices/vectors have been zeroed.");
1891#endif
1892 }
1893 /* svd successful */
1894 else {
1895 if (S != NULL){
1896 /* singular values on the diagonal MIN(dim1, dim2). The remaining elements are 0. */
1897 memset(S, 0, dim1*dim2*sizeof(float_complex));
1898 cblas_scopy(SAF_MIN(dim1, dim2), h->s, 1, (float*)S, 2*(dim2+1));
1899 }
1900
1901 /*return as row-major*/
1902 if (U!=NULL){
1903#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1904 MKL_Comatcopy('R', 'T', dim1, dim1, calpha, (veclib_float_complex*)h->u, dim1, (veclib_float_complex*)U, dim1);
1905#else
1906 for(i=0; i<dim1; i++)
1907 for(j=0; j<dim1; j++)
1908 U[i*dim1+j] = h->u[j*dim1+i];
1909#endif
1910 }
1911
1912 /* lapack returns V^T, i.e. row-major V already, but we need V^H! */
1913 if (V != NULL){
1914 cblas_ccopy(dim2*dim2, h->vt, 1, V, 1);
1915 cblas_sscal(dim2*dim2, -1.0f, ((float*)V)+1, 2); /* conj */
1916 }
1917
1918 if (sing != NULL)
1919 cblas_scopy(SAF_MIN(dim1, dim2), h->s, 1, sing, 1);
1920 }
1921
1922 if(hWork == NULL)
1923 utility_csvd_destroy((void**)&h);
1924}
1925
1926
1927/* ========================================================================== */
1928/* Symmetric Eigenvalue Decomposition (?seig) */
1929/* ========================================================================== */
1930
1932typedef struct _utility_sseig_data {
1933 int maxDim;
1934 veclib_int currentWorkSize;
1935 float* w;
1936 float* a;
1937 float* work;
1939
1940void utility_sseig_create(void ** const phWork, int maxDim)
1941{
1942 *phWork = malloc1d(sizeof(utility_sseig_data));
1943 utility_sseig_data *h = (utility_sseig_data*)(*phWork);
1944
1945 h->maxDim = maxDim;
1946 h->currentWorkSize = 0;
1947 h->w = malloc1d(maxDim*sizeof(float));
1948 h->a = malloc1d(maxDim*maxDim*sizeof(float));
1949 h->work = NULL;
1950}
1951
1952void utility_sseig_destroy(void ** const phWork)
1953{
1954 utility_sseig_data *h = (utility_sseig_data*)(*phWork);
1955
1956 if(h!=NULL){
1957 free(h->w);
1958 free(h->a);
1959 free(h->work);
1960
1961 free(h);
1962 h=NULL;
1963 *phWork = NULL;
1964 }
1965}
1966
1968(
1969 void* const hWork,
1970 const float* A,
1971 const int dim,
1972 int sortDecFLAG,
1973 float* V,
1974 float* D,
1975 float* eig
1976)
1977{
1979 veclib_int i, j, n, lda, info;
1980 veclib_int lwork;
1981 float wkopt;
1982
1983 n = dim;
1984 lda = dim;
1985
1986 /* Work struct */
1987 if(hWork==NULL)
1988 utility_sseig_create((void**)&h, dim);
1989 else{
1990 h = (utility_sseig_data*)(hWork);
1991#ifndef NDEBUG
1992 saf_assert(dim<=h->maxDim, "dim exceeds the maximum length specified");
1993#endif
1994 }
1995
1996 /* store in column major order (i.e. transpose) */
1997 for(i=0; i<dim; i++)
1998 for(j=0; j<dim; j++)
1999 h->a[i*dim+j] = A[j*dim+i];
2000
2001 /* Query how much "work" memory is required */
2002 lwork = -1;
2003#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2004 ssyev_( "Vectors", "Upper", &n, h->a, &lda, h->w, &wkopt, &lwork, &info );
2005#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2006 saf_print_error("No such implementation available in ATLAS CLAPACK");
2007#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2008 info = LAPACKE_ssyev_work(CblasColMajor, 'V', 'U', n, h->a, lda, h->w, &wkopt, lwork);
2009#endif
2010 lwork = (veclib_int)wkopt;
2011 if(lwork>h->currentWorkSize){
2012 h->currentWorkSize = lwork;
2013 h->work = realloc1d(h->work, h->currentWorkSize*sizeof(float));
2014 }
2015
2016 /* solve the eigenproblem */
2017#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2018 ssyev_( "Vectors", "Upper", &n, h->a, &lda, h->w, h->work, &lwork, &info );
2019#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2020 saf_print_error("No such implementation available in ATLAS CLAPACK");
2021#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2022 info = LAPACKE_ssyev_work(CblasColMajor, 'V', 'U', n, h->a, lda, h->w, h->work, lwork);
2023#endif
2024
2025 /* output */
2026 if(D!=NULL)
2027 memset(D, 0, dim*dim*sizeof(float));
2028 if( info != 0 ) {
2029 /* failed to converge and find the eigenvalues */
2030 if(V!=NULL)
2031 memset(V, 0, dim*dim*sizeof(float));
2032#ifndef NDEBUG
2033 /* Failed to compute all of the eigenvalues, some eigenvectors have not
2034 * been computed, or the input matrix contained illegal values so no
2035 * solution was attempted. In these cases the function will zero all
2036 * output matrices and vectors. */
2037 saf_print_warning("Could not compute EVD in utility_sseig(). Output matrices/vectors have been zeroed.");
2038#endif
2039 }
2040 else{
2041 if(sortDecFLAG){
2042 for(i=0; i<dim; i++){
2043 if(V!=NULL)
2044 for(j=0; j<dim; j++)
2045 V[i*dim+j] = h->a[(dim-j-1)*dim+i]; /* transpose, back to row-major and reverse order */
2046 if(D!=NULL)
2047 D[i*dim+i] = h->w[dim-i-1]; /* store along the diagonal, reversing the order */
2048 if(eig!=NULL)
2049 eig[i] = h->w[dim-i-1];
2050 }
2051 }
2052 else{
2053 for(i=0; i<dim; i++){
2054 if(V!=NULL)
2055 for(j=0; j<dim; j++)
2056 V[i*dim+j] = h->a[j*dim+i]; /* transpose, back to row-major */
2057 if(D!=NULL)
2058 D[i*dim+i] = h->w[i]; /* store along the diagonal */
2059 if(eig!=NULL)
2060 eig[i] = h->w[i];
2061 }
2062 }
2063 }
2064
2065 if(hWork == NULL)
2066 utility_sseig_destroy((void**)&h);
2067}
2068
2070typedef struct _utility_cseig_data {
2071 int maxDim;
2072 veclib_int currentWorkSize;
2073 float* rwork;
2074 float* w;
2075 float_complex* a;
2076 float_complex* work;
2078
2079void utility_cseig_create(void ** const phWork, int maxDim)
2080{
2081 *phWork = malloc1d(sizeof(utility_cseig_data));
2082 utility_cseig_data *h = (utility_cseig_data*)(*phWork);
2083
2084 h->maxDim = maxDim;
2085 h->currentWorkSize = SAF_MAX(1, 2*maxDim-1);
2086 h->rwork = malloc1d((3*maxDim-2)*sizeof(float));
2087 h->w = malloc1d(maxDim*sizeof(float));
2088 h->a = malloc1d(maxDim*maxDim*sizeof(float_complex));
2089 h->work = malloc1d(h->currentWorkSize*sizeof(float_complex));
2090}
2091
2092void utility_cseig_destroy(void ** const phWork)
2093{
2094 utility_cseig_data *h = (utility_cseig_data*)(*phWork);
2095
2096 if(h!=NULL){
2097 free(h->rwork);
2098 free(h->w);
2099 free(h->a);
2100 free(h->work);
2101
2102 free(h);
2103 h=NULL;
2104 *phWork = NULL;
2105 }
2106}
2107
2109(
2110 void* const hWork,
2111 const float_complex* A,
2112 const int dim,
2113 int sortDecFLAG,
2114 float_complex* V,
2115 float_complex* D,
2116 float* eig
2117)
2118{
2120 veclib_int i, n, lda, info;
2121 veclib_int lwork;
2122 float_complex wkopt;
2123#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
2124 const MKL_Complex8 calpha = {1.0f, 0.0f};
2125#else
2126 int j;
2127#endif
2128
2129 n = dim;
2130 lda = dim;
2131
2132 /* Work struct */
2133 if(hWork==NULL)
2134 utility_cseig_create((void**)&h, dim);
2135 else{
2136 h = (utility_cseig_data*)(hWork);
2137#ifndef NDEBUG
2138 saf_assert(dim<=h->maxDim, "dim exceeds the maximum length specified");
2139#endif
2140 }
2141
2142 /* store in column major order (i.e. transpose) */
2143#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
2144 MKL_Comatcopy('R', 'T', dim, dim, calpha, (veclib_float_complex*)A, dim, (veclib_float_complex*)h->a, dim);
2145#else
2146 for(i=0; i<dim; i++)
2147 for(j=0; j<dim; j++)
2148 h->a[i*dim+j] = A[j*dim+i];
2149#endif
2150
2151 /* Query how much "work" memory is required */
2152 lwork = -1;
2153#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2154 cheev_( "Vectors", "Upper", &n, (veclib_float_complex*)h->a, &lda, h->w, (veclib_float_complex*)&wkopt, &lwork, h->rwork, &info );
2155#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2156 saf_print_error("No such implementation available in ATLAS CLAPACK");
2157#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2158 info = LAPACKE_cheev_work(CblasColMajor, 'V', 'U', n, (veclib_float_complex*)h->a, lda, h->w, (veclib_float_complex*)&wkopt, lwork, h->rwork);
2159#endif
2160 lwork = (veclib_int)crealf(wkopt);
2161 if(lwork>h->currentWorkSize){
2162 h->currentWorkSize = lwork;
2163 h->work = realloc1d(h->work, h->currentWorkSize*sizeof(float_complex));
2164 }
2165
2166 /* solve the eigenproblem */
2167#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2168 cheev_( "Vectors", "Upper", &n, (veclib_float_complex*)h->a, &lda, h->w, (veclib_float_complex*)h->work, &lwork, h->rwork, &info );
2169#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2170 saf_print_error("No such implementation available in ATLAS CLAPACK");
2171#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2172 info = LAPACKE_cheev_work(CblasColMajor, 'V', 'U', n, (veclib_float_complex*)h->a, lda, h->w, (veclib_float_complex*)h->work, lwork, h->rwork);
2173#endif
2174
2175 /* output */
2176 if(D!=NULL)
2177 memset(D, 0, dim*dim*sizeof(float_complex));
2178 if( info != 0 ) {
2179 /* failed to converge and find the eigenvalues */
2180 if(V!=NULL)
2181 memset(V, 0, dim*dim*sizeof(float_complex));
2182#ifndef NDEBUG
2183 /* Failed to compute all of the eigenvalues, some eigenvectors have not
2184 * been computed, or the input matrix contained illegal values so no
2185 * solution was attempted. In these cases the function will zero all
2186 * output matrices and vectors. */
2187 saf_print_warning("Could not compute EVD in utility_cseig(). Output matrices/vectors have been zeroed.");
2188#endif
2189 }
2190
2191 /* transpose, back to row-major and reverse order if sortDecFlag==1 */
2192 else{
2193 if(sortDecFLAG && V!=NULL)
2194 for(i=0; i<(int)((float)dim/2.0f); i++)
2195 cblas_cswap(dim, &h->a[i*dim], 1, &h->a[(dim-i-1)*dim], 1);
2196
2197 if(V!=NULL){
2198#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
2199 MKL_Comatcopy('R', 'T', dim, dim, calpha, (veclib_float_complex*)h->a, dim, (veclib_float_complex*)V, dim);
2200#else
2201 for(i=0; i<dim; i++)
2202 for(j=0; j<dim; j++)
2203 V[i*dim+j] = h->a[j*dim+i];
2204#endif
2205 }
2206
2207 if(sortDecFLAG){
2208 for(i=0; i<dim; i++) {
2209 if(D!=NULL)
2210 D[i*dim+i] = cmplxf(h->w[dim-i-1], 0.0f); /* store along the diagonal, reversing the order */
2211 if(eig!=NULL)
2212 eig[i] = h->w[dim-i-1];
2213 }
2214 }
2215 else {
2216 for(i=0; i<dim; i++){
2217 if(D!=NULL)
2218 D[i*dim+i] = cmplxf(h->w[i], 0.0f); /* store along the diagonal */
2219 if(eig!=NULL)
2220 eig[i] = h->w[i];
2221 }
2222 }
2223 }
2224
2225 if(hWork == NULL)
2226 utility_cseig_destroy((void**)&h);
2227}
2228
2229
2230/* ========================================================================== */
2231/* Eigenvalues of Matrix Pair (?eigmp) */
2232/* ========================================================================== */
2233
2235typedef struct _utility_ceigmp_data {
2236 int maxDim;
2237 veclib_int currentWorkSize;
2238 float_complex* a, *b, *vl, *vr, *alpha, *beta;
2239 float* rwork;
2240 float_complex* work;
2242
2243void utility_ceigmp_create(void** const phWork, int maxDim)
2244{
2245 *phWork = malloc1d(sizeof(utility_ceigmp_data));
2247
2248 h->maxDim = maxDim;
2249 h->currentWorkSize = 4*maxDim;
2250 h->rwork = malloc1d(4*(h->currentWorkSize)*sizeof(float));
2251 h->a = malloc1d(maxDim*maxDim*sizeof(float_complex));
2252 h->b = malloc1d(maxDim*maxDim*sizeof(float_complex));
2253 h->vl = malloc1d(maxDim*maxDim*sizeof(float_complex));
2254 h->vr = malloc1d(maxDim*maxDim*sizeof(float_complex));
2255 h->alpha = malloc1d(maxDim*sizeof(float_complex));
2256 h->beta = malloc1d(maxDim*sizeof(float_complex));
2257 h->work = malloc1d(h->currentWorkSize*sizeof(float_complex));
2258}
2259
2260void utility_ceigmp_destroy(void ** const phWork)
2261{
2263
2264 if(h!=NULL){
2265 free(h->rwork);
2266 free(h->a);
2267 free(h->b);
2268 free(h->vl);
2269 free(h->vr);
2270 free(h->alpha);
2271 free(h->beta);
2272 free(h->work);
2273
2274 free(h);
2275 h=NULL;
2276 *phWork = NULL;
2277 }
2278}
2279
2281(
2282 void* const hWork,
2283 const float_complex* A,
2284 const float_complex* B,
2285 const int dim,
2286 float_complex* VL,
2287 float_complex* VR,
2288 float_complex* D
2289)
2290{
2292 veclib_int i, j;
2293 veclib_int n, lda, ldb, ldvl, ldvr, info;
2294 veclib_int lwork;
2295
2296 n = lda = ldb = ldvl = ldvr = dim;
2297
2298 /* Work struct */
2299 if(hWork==NULL)
2300 utility_ceigmp_create((void**)&h, dim);
2301 else{
2302 h = (utility_ceigmp_data*)(hWork);
2303#ifndef NDEBUG
2304 saf_assert(dim<=h->maxDim, "dim exceeds the maximum length specified");
2305#endif
2306 }
2307
2308 /* store in column major order */
2309 for(i=0; i<dim; i++)
2310 for(j=0; j<dim; j++)
2311 h->a[j*dim+i] = A[i*dim+j];
2312 for(i=0; i<dim; i++)
2313 for(j=0; j<dim; j++)
2314 h->b[j*dim+i] = B[i*dim+j];
2315
2316 /* solve eigen problem */
2317 lwork = h->currentWorkSize;
2318#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2319 cggev_("V", "V", &n, (veclib_float_complex*)h->a, &lda, (veclib_float_complex*)h->b, &ldb, (veclib_float_complex*)h->alpha, (veclib_float_complex*)h->beta,
2320 (veclib_float_complex*)h->vl, &ldvl, (veclib_float_complex*)h->vr, &ldvr, (veclib_float_complex*)h->work, &lwork, h->rwork, &info);
2321#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2322 saf_print_error("No such implementation available in ATLAS CLAPACK");
2323#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2324 info = LAPACKE_cggev_work(CblasColMajor, 'V', 'V', n, (veclib_float_complex*)h->a, lda, (veclib_float_complex*)h->b, ldb, (veclib_float_complex*)h->alpha, (veclib_float_complex*)h->beta,
2325 (veclib_float_complex*)h->vl, ldvl, (veclib_float_complex*)h->vr, ldvr, (veclib_float_complex*)h->work, lwork, h->rwork);
2326#endif
2327
2328 if(D!=NULL)
2329 memset(D, 0, dim*dim*sizeof(float_complex));
2330
2331 /* failed to converge and find the eigenvalues */
2332 if( info != 0 ) {
2333 if(VL!=NULL)
2334 memset(VL, 0, dim*dim*sizeof(float_complex));
2335 if(VR!=NULL)
2336 memset(VR, 0, dim*dim*sizeof(float_complex));
2337#ifndef NDEBUG
2338 /* Failed to compute all of the eigenvalues, some eigenvectors have not
2339 * been computed, or the input matrix contained illegal values so no
2340 * solution was attempted. In these cases the function will zero all
2341 * output matrices and vectors. */
2342 saf_print_warning("Could not compute EVD in utility_ceigmp(). Output matrices/vectors have been zeroed.");
2343#endif
2344 }
2345 /* transpose, back to row-major */
2346 else{
2347 if(D!=NULL)
2348 for(i=0; i<dim; i++)
2349 D[i*dim+i] = ccdivf(h->alpha[i],h->beta[i]);
2350
2351 if(VL!=NULL)
2352 for(i=0; i<dim; i++)
2353 for(j=0; j<dim; j++)
2354 VL[i*dim+j] = h->vl[j*dim+i];
2355 if(VR!=NULL)
2356 for(i=0; i<dim; i++)
2357 for(j=0; j<dim; j++)
2358 VR[i*dim+j] = h->vr[j*dim+i];
2359 }
2360
2361 if(hWork == NULL)
2362 utility_ceigmp_destroy((void**)&h);
2363}
2364
2366typedef struct _utility_zeigmp_data {
2367 int maxDim;
2368 veclib_int currentWorkSize;
2369 double_complex* a, *b, *vl, *vr, *alpha, *beta;
2370 double* rwork;
2371 double_complex* work;
2373
2374void utility_zeigmp_create(void** const phWork, int maxDim)
2375{
2376 *phWork = malloc1d(sizeof(utility_zeigmp_data));
2378
2379 h->maxDim = maxDim;
2380 h->currentWorkSize = 4*maxDim;
2381 h->rwork = malloc1d(4*(h->currentWorkSize)*sizeof(double));
2382 h->a = malloc1d(maxDim*maxDim*sizeof(double_complex));
2383 h->b = malloc1d(maxDim*maxDim*sizeof(double_complex));
2384 h->vl = malloc1d(maxDim*maxDim*sizeof(double_complex));
2385 h->vr = malloc1d(maxDim*maxDim*sizeof(double_complex));
2386 h->alpha = malloc1d(maxDim*sizeof(double_complex));
2387 h->beta = malloc1d(maxDim*sizeof(double_complex));
2388 h->work = malloc1d(h->currentWorkSize*sizeof(double_complex));
2389}
2390
2391void utility_zeigmp_destroy(void ** const phWork)
2392{
2394
2395 if(h!=NULL){
2396 free(h->rwork);
2397 free(h->a);
2398 free(h->b);
2399 free(h->vl);
2400 free(h->vr);
2401 free(h->alpha);
2402 free(h->beta);
2403 free(h->work);
2404
2405 free(h);
2406 h=NULL;
2407 *phWork = NULL;
2408 }
2409}
2410
2412(
2413 void* const hWork,
2414 const double_complex* A,
2415 const double_complex* B,
2416 const int dim,
2417 double_complex* VL,
2418 double_complex* VR,
2419 double_complex* D
2420)
2421{
2423 veclib_int i, j;
2424 veclib_int n, lda, ldb, ldvl, ldvr, info;
2425 veclib_int lwork;
2426
2427 n = lda = ldb = ldvl = ldvr = dim;
2428
2429 /* Work struct */
2430 if(hWork==NULL)
2431 utility_zeigmp_create((void**)&h, dim);
2432 else{
2433 h = (utility_zeigmp_data*)(hWork);
2434#ifndef NDEBUG
2435 saf_assert(dim<=h->maxDim, "dim exceeds the maximum length specified");
2436#endif
2437 }
2438
2439 /* store in column major order */
2440 for(i=0; i<dim; i++)
2441 for(j=0; j<dim; j++)
2442 h->a[j*dim+i] = A[i*dim+j];
2443 for(i=0; i<dim; i++)
2444 for(j=0; j<dim; j++)
2445 h->b[j*dim+i] = B[i*dim+j]; /* store in column major order */
2446
2447 /* solve eigen problem */
2448 lwork = h->currentWorkSize;
2449#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2450 zggev_("V", "V", &n, (veclib_double_complex*)h->a, &lda, (veclib_double_complex*)h->b, &ldb, (veclib_double_complex*)h->alpha, (veclib_double_complex*)h->beta,
2451 (veclib_double_complex*)h->vl, &ldvl, (veclib_double_complex*)h->vr, &ldvr, (veclib_double_complex*)h->work, &lwork, h->rwork, &info);
2452#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2453 saf_print_error("No such implementation available in ATLAS CLAPACK");
2454#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2455 info = LAPACKE_zggev_work(CblasColMajor, 'V', 'V', n, (veclib_double_complex*)h->a, lda, (veclib_double_complex*)h->b, ldb, (veclib_double_complex*)h->alpha, (veclib_double_complex*)h->beta,
2456 (veclib_double_complex*)h->vl, ldvl, (veclib_double_complex*)h->vr, ldvr, (veclib_double_complex*)h->work, lwork, h->rwork);
2457#endif
2458
2459 if(D!=NULL)
2460 memset(D, 0, dim*dim*sizeof(double_complex));
2461
2462 /* failed to converge and find the eigenvalues */
2463 if( info != 0 ) {
2464 if(VL!=NULL)
2465 memset(VL, 0, dim*dim*sizeof(double_complex));
2466 if(VR!=NULL)
2467 memset(VR, 0, dim*dim*sizeof(double_complex));
2468#ifndef NDEBUG
2469 /* Failed to compute all of the eigenvalues, some eigenvectors have not
2470 * been computed, or the input matrix contained illegal values so no
2471 * solution was attempted. In these cases the function will zero all
2472 * output matrices and vectors. */
2473 saf_print_warning("Could not compute EVD in utility_zeigmp(). Output matrices/vectors have been zeroed.");
2474#endif
2475 }
2476 /* transpose, back to row-major */
2477 else{
2478 if(D!=NULL)
2479 for(i=0; i<dim; i++)
2480 D[i*dim+i] = ccdiv(h->alpha[i],h->beta[i]);
2481
2482 if(VL!=NULL)
2483 for(i=0; i<dim; i++)
2484 for(j=0; j<dim; j++)
2485 VL[i*dim+j] = h->vl[j*dim+i];
2486 if(VR!=NULL)
2487 for(i=0; i<dim; i++)
2488 for(j=0; j<dim; j++)
2489 VR[i*dim+j] = h->vr[j*dim+i];
2490 }
2491
2492 if(hWork == NULL)
2493 utility_zeigmp_destroy((void**)&h);
2494}
2495
2496
2497/* ========================================================================== */
2498/* Eigenvalue Decomposition (?eig) */
2499/* ========================================================================== */
2500
2502typedef struct _utility_ceig_data {
2503 int maxDim;
2504 veclib_int currentWorkSize;
2505 float_complex *w, *vl, *vr, *a;
2506 float* rwork;
2507 float_complex* work;
2509
2510void utility_ceig_create(void ** const phWork, int maxDim)
2511{
2512 *phWork = malloc1d(sizeof(utility_ceig_data));
2513 utility_ceig_data *h = (utility_ceig_data*)(*phWork);
2514
2515 h->maxDim = maxDim;
2516 h->currentWorkSize = 0;
2517 h->rwork = malloc1d(4*maxDim*sizeof(float));
2518 h->w = malloc1d(maxDim*sizeof(float_complex));
2519 h->vl = malloc1d(maxDim*maxDim*sizeof(float_complex));
2520 h->vr = malloc1d(maxDim*maxDim*sizeof(float_complex));
2521 h->a = malloc1d(maxDim*maxDim*sizeof(float_complex));
2522 h->work = NULL;
2523}
2524
2525void utility_ceig_destroy(void ** const phWork)
2526{
2527 utility_ceig_data *h = (utility_ceig_data*)(*phWork);
2528
2529 if(h!=NULL){
2530 free(h->rwork);
2531 free(h->w);
2532 free(h->vl);
2533 free(h->vr);
2534 free(h->a);
2535 free(h->work);
2536
2537 free(h);
2538 h=NULL;
2539 *phWork = NULL;
2540 }
2541}
2542
2544(
2545 void* const hWork,
2546 const float_complex* A,
2547 const int dim,
2548 float_complex* VL,
2549 float_complex* VR,
2550 float_complex* D,
2551 float_complex* eig
2552)
2553{
2555 veclib_int i, j, n, lda, ldvl, ldvr, info;
2556 veclib_int lwork;
2557 float_complex wkopt;
2558
2559 n = lda = ldvl = ldvr = dim;
2560
2561 /* Work struct */
2562 if(hWork==NULL)
2563 utility_ceig_create((void**)&h, dim);
2564 else{
2565 h = (utility_ceig_data*)(hWork);
2566#ifndef NDEBUG
2567 saf_assert(dim<=h->maxDim, "dim exceeds the maximum length specified");
2568#endif
2569 }
2570
2571 /* store in column major order (i.e. transpose) */
2572 for(i=0; i<dim; i++)
2573 for(j=0; j<dim; j++)
2574 h->a[i*dim+j] = A[j*dim+i];
2575
2576 /* Query how much "work" memory is required */
2577 lwork = -1;
2578#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2579 cgeev_( "Vectors", "Vectors", &n, (veclib_float_complex*)h->a, &lda, (veclib_float_complex*)h->w, (veclib_float_complex*)h->vl, &ldvl,
2580 (veclib_float_complex*)h->vr, &ldvr, (veclib_float_complex*)&wkopt, &lwork, h->rwork, &info );
2581#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2582 saf_print_error("No such implementation available in ATLAS CLAPACK");
2583#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2584 info = LAPACKE_cgeev_work(CblasColMajor, 'V', 'V', n, (veclib_float_complex*)h->a, lda, (veclib_float_complex*)h->w, (veclib_float_complex*)h->vl, ldvl,
2585 (veclib_float_complex*)h->vr, ldvr, (veclib_float_complex*)&wkopt, lwork, h->rwork);
2586#endif
2587 lwork = (veclib_int)crealf(wkopt);
2588 if(lwork>h->currentWorkSize){
2589 h->currentWorkSize = lwork;
2590 h->work = realloc1d(h->work, h->currentWorkSize*sizeof(float_complex));
2591 }
2592
2593 /* solve the eigenproblem */
2594#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2595 cgeev_( "Vectors", "Vectors", &n, (veclib_float_complex*)h->a, &lda, (veclib_float_complex*)h->w, (veclib_float_complex*)h->vl, &ldvl,
2596 (veclib_float_complex*)h->vr, &ldvr, (veclib_float_complex*)h->work, &lwork, h->rwork, &info );
2597#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2598 saf_print_error("No such implementation available in ATLAS CLAPACK");
2599#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2600 info = LAPACKE_cgeev_work(CblasColMajor, 'V', 'V', n, (veclib_float_complex*)h->a, lda, (veclib_float_complex*)h->w, (veclib_float_complex*)h->vl, ldvl,
2601 (veclib_float_complex*)h->vr, ldvr, (veclib_float_complex*)h->work, lwork, h->rwork);
2602#endif
2603
2604 /* output */
2605 if(D!=NULL)
2606 memset(D, 0, dim*dim*sizeof(float_complex));
2607
2608 /* failed to converge and find the eigenvalues */
2609 if( info != 0 ) {
2610 if(VL!=NULL)
2611 memset(VL, 0, dim*dim*sizeof(float_complex));
2612 if(VR!=NULL)
2613 memset(VR, 0, dim*dim*sizeof(float_complex));
2614 if(eig!=NULL)
2615 memset(eig, 0, dim*sizeof(float_complex));
2616#ifndef NDEBUG
2617 /* Failed to compute all of the eigenvalues, some eigenvectors have not
2618 * been computed, or the input matrix contained illegal values so no
2619 * solution was attempted. In these cases the function will zero all
2620 * output matrices and vectors. */
2621 saf_print_warning("Could not compute EVD in utility_ceig(). Output matrices/vectors have been zeroed.");
2622#endif
2623 }
2624 /* transpose, back to row-major */
2625 else{
2626 for(i=0; i<dim; i++){
2627 if(VL!=NULL)
2628 for(j=0; j<dim; j++)
2629 VL[i*dim+j] = h->vl[j*dim+i];
2630 if(VR!=NULL)
2631 for(j=0; j<dim; j++)
2632 VR[i*dim+j] = h->vr[j*dim+i];
2633 if(D!=NULL)
2634 D[i*dim+i] = h->w[i]; /* store along the diagonal */
2635 if(eig!=NULL)
2636 eig[i] = h->w[i];
2637 }
2638 }
2639
2640 if(hWork == NULL)
2641 utility_ceig_destroy((void**)&h);
2642}
2643
2645typedef struct _utility_zeig_data {
2646 int maxDim;
2647 veclib_int currentWorkSize;
2648 double_complex *w, *vl, *vr, *a;
2649 double* rwork;
2650 double_complex* work;
2652
2653void utility_zeig_create(void ** const phWork, int maxDim)
2654{
2655 *phWork = malloc1d(sizeof(utility_zeig_data));
2656 utility_zeig_data *h = (utility_zeig_data*)(*phWork);
2657
2658 h->maxDim = maxDim;
2659 h->currentWorkSize = 0;
2660 h->rwork = malloc1d(4*maxDim*sizeof(double));
2661 h->w = malloc1d(maxDim*sizeof(double_complex));
2662 h->vl = malloc1d(maxDim*maxDim*sizeof(double_complex));
2663 h->vr = malloc1d(maxDim*maxDim*sizeof(double_complex));
2664 h->a = malloc1d(maxDim*maxDim*sizeof(double_complex));
2665 h->work = NULL;
2666}
2667
2668void utility_zeig_destroy(void ** const phWork)
2669{
2670 utility_zeig_data *h = (utility_zeig_data*)(*phWork);
2671
2672 if(h!=NULL){
2673 free(h->rwork);
2674 free(h->w);
2675 free(h->vl);
2676 free(h->vr);
2677 free(h->a);
2678 free(h->work);
2679
2680 free(h);
2681 h=NULL;
2682 *phWork = NULL;
2683 }
2684}
2685
2687(
2688 void* const hWork,
2689 const double_complex* A,
2690 const int dim,
2691 double_complex* VL,
2692 double_complex* VR,
2693 double_complex* D,
2694 double_complex* eig
2695)
2696{
2698 veclib_int i, j, n, lda, ldvl, ldvr, info;
2699 veclib_int lwork;
2700 double_complex wkopt;
2701
2702 n = lda = ldvl = ldvr = dim;
2703
2704 /* Work struct */
2705 if(hWork==NULL)
2706 utility_zeig_create((void**)&h, dim);
2707 else{
2708 h = (utility_zeig_data*)(hWork);
2709#ifndef NDEBUG
2710 saf_assert(dim<=h->maxDim, "dim exceeds the maximum length specified");
2711#endif
2712 }
2713
2714 /* store in column major order (i.e. transpose) */
2715 for(i=0; i<dim; i++)
2716 for(j=0; j<dim; j++)
2717 h->a[i*dim+j] = A[j*dim+i];
2718
2719 /* Query how much "work" memory is required */
2720 lwork = -1;
2721#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2722 zgeev_( "Vectors", "Vectors", &n, (veclib_double_complex*)h->a, &lda, (veclib_double_complex*)h->w, (veclib_double_complex*)h->vl, &ldvl,
2723 (veclib_double_complex*)h->vr, &ldvr, (veclib_double_complex*)&wkopt, &lwork, h->rwork, &info );
2724#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2725 saf_print_error("No such implementation available in ATLAS CLAPACK");
2726#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2727 info = LAPACKE_zgeev_work(CblasColMajor, 'V', 'V', n, (veclib_double_complex*)h->a, lda, (veclib_double_complex*)h->w, (veclib_double_complex*)h->vl, ldvl,
2728 (veclib_double_complex*)h->vr, ldvr, (veclib_double_complex*)&wkopt, lwork, h->rwork);
2729#endif
2730 lwork = (veclib_int)creal(wkopt);
2731 if(lwork>h->currentWorkSize){
2732 h->currentWorkSize = lwork;
2733 h->work = realloc1d(h->work, h->currentWorkSize*sizeof(double_complex));
2734 }
2735
2736 /* solve the eigenproblem */
2737#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2738 zgeev_( "Vectors", "Vectors", &n, (veclib_double_complex*)h->a, &lda, (veclib_double_complex*)h->w, (veclib_double_complex*)h->vl, &ldvl,
2739 (veclib_double_complex*)h->vr, &ldvr, (veclib_double_complex*)h->work, &lwork, h->rwork, &info );
2740#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2741 saf_print_error("No such implementation available in ATLAS CLAPACK");
2742#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2743 info = LAPACKE_zgeev_work(CblasColMajor, 'V', 'V', n, (veclib_double_complex*)h->a, lda, (veclib_double_complex*)h->w, (veclib_double_complex*)h->vl, ldvl,
2744 (veclib_double_complex*)h->vr, ldvr, (veclib_double_complex*)h->work, lwork, h->rwork);
2745#endif
2746
2747 /* output */
2748 if(D!=NULL)
2749 memset(D, 0, dim*dim*sizeof(double_complex));
2750
2751 /* failed to converge and find the eigenvalues */
2752 if( info != 0 ) {
2753 if(VL!=NULL)
2754 memset(VL, 0, dim*dim*sizeof(double_complex));
2755 if(VR!=NULL)
2756 memset(VR, 0, dim*dim*sizeof(double_complex));
2757 if(eig!=NULL)
2758 memset(eig, 0, dim*sizeof(double_complex));
2759#ifndef NDEBUG
2760 /* Failed to compute all of the eigenvalues, some eigenvectors have not
2761 * been computed, or the input matrix contained illegal values so no
2762 * solution was attempted. In these cases the function will zero all
2763 * output matrices and vectors. */
2764 saf_print_warning("Could not compute EVD in utility_zeig(). Output matrices/vectors have been zeroed.");
2765#endif
2766 }
2767 /* transpose, back to row-major */
2768 else{
2769 for(i=0; i<dim; i++){
2770 if(VL!=NULL)
2771 for(j=0; j<dim; j++)
2772 VL[i*dim+j] = h->vl[j*dim+i];
2773 if(VR!=NULL)
2774 for(j=0; j<dim; j++)
2775 VR[i*dim+j] = h->vr[j*dim+i];
2776 if(D!=NULL)
2777 D[i*dim+i] = h->w[i]; /* store along the diagonal */
2778 if(eig!=NULL)
2779 eig[i] = h->w[i];
2780 }
2781 }
2782
2783 if(hWork == NULL)
2784 utility_zeig_destroy((void**)&h);
2785}
2786
2787
2788/* ========================================================================== */
2789/* General Linear Solver (?glslv) */
2790/* ========================================================================== */
2791
2793typedef struct _utility_sglslv_data {
2794 int maxDim;
2795 int maxNCol;
2796 veclib_int* IPIV;
2797 float* a;
2798 float* b;
2800
2801void utility_sglslv_create(void ** const phWork, int maxDim, int maxNCol)
2802{
2803 *phWork = malloc1d(sizeof(utility_sglslv_data));
2805
2806 h->maxDim = maxDim;
2807 h->maxNCol = maxNCol;
2808
2809 h->IPIV = malloc1d(maxDim*sizeof(veclib_int));
2810 h->a = malloc1d(maxDim*maxDim*sizeof(float));
2811 h->b = malloc1d(maxDim*maxNCol*sizeof(float));
2812}
2813
2814void utility_sglslv_destroy(void ** const phWork)
2815{
2817
2818 if(h!=NULL){
2819 free(h->IPIV);
2820 free(h->a);
2821 free(h->b);
2822
2823 free(h);
2824 h=NULL;
2825 *phWork = NULL;
2826 }
2827}
2828
2830(
2831 void* const hWork,
2832 const float* A,
2833 const int dim,
2834 float* B,
2835 int nCol,
2836 float* X
2837)
2838{
2840 veclib_int n = dim, nrhs = nCol, lda = dim, ldb = dim, info;
2841#if !defined(SAF_VECLIB_USE_LAPACKE_INTERFACE) && !(defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64))
2842 veclib_int i, j;
2843#endif
2844
2845 /* Work struct */
2846 if(hWork==NULL)
2847 utility_sglslv_create((void**)&h, dim, nCol);
2848 else {
2849 h = (utility_sglslv_data*)(hWork);
2850#ifndef NDEBUG
2851 saf_assert(dim<=h->maxDim, "dim exceeds the maximum length specified");
2852 saf_assert(nCol<=h->maxNCol, "nCol exceeds the maximum length specified");
2853#endif
2854 }
2855
2856#if defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2857 /* Copy locally */
2858 cblas_scopy(dim*dim, A, 1, h->a, 1);
2859 cblas_scopy(dim*nCol, B, 1, h->b, 1);
2860#else
2861 /* store in column major order */
2862# if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
2863 MKL_Somatcopy('R', 'T', dim, dim, 1.0f, A, dim, h->a, dim);
2864 MKL_Somatcopy('R', 'T', dim, nCol, 1.0f, B, nCol, h->b, dim);
2865# else
2866 for(i=0; i<dim; i++)
2867 for(j=0; j<dim; j++)
2868 h->a[i*dim+j] = A[j*dim+i];
2869 for(i=0; i<dim; i++)
2870 for(j=0; j<nCol; j++)
2871 h->b[j*dim+i] = B[i*nCol+j];
2872# endif
2873#endif
2874
2875 /* solve Ax = b for each column in b (b is replaced by the solution: x) */
2876#ifdef SAF_VECLIB_USE_CLAPACK_INTERFACE
2877 info = clapack_sgesv(CblasColMajor, n, nrhs, h->a, lda, h->IPIV, h->b, ldb);
2878#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2879 info = LAPACKE_sgesv_work(CblasRowMajor, n, nrhs, h->a, lda, h->IPIV, h->b, ldb);
2880#elif defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2881 sgesv_( &n, &nrhs, h->a, &lda, h->IPIV, h->b, &ldb, &info );
2882#endif
2883
2884 if(info!=0){
2885 /* A is singular, solution not possible */
2886 memset(X, 0, dim*nCol*sizeof(float));
2887#ifndef NDEBUG
2888 /* Input matrix was singular, solution was unable to be computed, or the
2889 * input matrix contained illegal values so no solution was attempted.
2890 * In these cases the function will zero the output matrix. */
2891 saf_print_warning("Could not solve the linear equation in utility_sglslv(). Output matrices/vectors have been zeroed.");
2892#endif
2893 }
2894 else{
2895#if defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2896 cblas_scopy(dim*nCol, h->b, 1, X, 1);
2897#else
2898 /* store solution in row-major order */
2899# if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
2900 MKL_Somatcopy('R', 'T', nCol, dim, 1.0f, h->b, dim, X, nCol);
2901# else
2902 for(i=0; i<dim; i++)
2903 for(j=0; j<nCol; j++)
2904 X[i*nCol+j] = h->b[j*dim+i];
2905# endif
2906#endif
2907 }
2908
2909 if(hWork == NULL)
2910 utility_sglslv_destroy((void**)&h);
2911}
2912
2914typedef struct _utility_cglslv_data {
2915 int maxDim;
2916 int maxNCol;
2917 veclib_int* IPIV;
2918 float_complex* a;
2919 float_complex* b;
2921
2922void utility_cglslv_create(void ** const phWork, int maxDim, int maxNCol)
2923{
2924 *phWork = malloc1d(sizeof(utility_cglslv_data));
2926
2927 h->maxDim = maxDim;
2928 h->maxNCol = maxNCol;
2929
2930 h->IPIV = malloc1d(maxDim*sizeof(veclib_int));
2931 h->a = malloc1d(maxDim*maxDim*sizeof(float_complex));
2932 h->b = malloc1d(maxDim*maxNCol*sizeof(float_complex));
2933}
2934
2935void utility_cglslv_destroy(void ** const phWork)
2936{
2938
2939 if(h!=NULL){
2940 free(h->IPIV);
2941 free(h->a);
2942 free(h->b);
2943
2944 free(h);
2945 h=NULL;
2946 *phWork = NULL;
2947 }
2948}
2949
2951(
2952 void* const hWork,
2953 const float_complex* A,
2954 const int dim,
2955 float_complex* B,
2956 int nCol,
2957 float_complex* X
2958)
2959{
2961 veclib_int i, j, n, nrhs, lda, ldb, info;
2962
2963 n = dim;
2964 nrhs = nCol;
2965 lda = dim;
2966 ldb = dim;
2967
2968 /* Work struct */
2969 if(hWork==NULL)
2970 utility_cglslv_create((void**)&h, dim, nCol);
2971 else {
2972 h = (utility_cglslv_data*)(hWork);
2973#ifndef NDEBUG
2974 saf_assert(dim<=h->maxDim, "dim exceeds the maximum length specified");
2975 saf_assert(nCol<=h->maxNCol, "nCol exceeds the maximum length specified");
2976#endif
2977 }
2978
2979 /* store in column major order */
2980 for(i=0; i<dim; i++)
2981 for(j=0; j<dim; j++)
2982 h->a[j*dim+i] = A[i*dim+j];
2983 for(i=0; i<dim; i++)
2984 for(j=0; j<nCol; j++)
2985 h->b[j*dim+i] = B[i*nCol+j];
2986
2987 /* solve Ax = b for each column in b (b is replaced by the solution: x) */
2988#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2989 cgesv_( &n, &nrhs, (veclib_float_complex*)h->a, &lda, h->IPIV, (veclib_float_complex*)h->b, &ldb, &info );
2990#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2991 info = clapack_cgesv(CblasColMajor, n, nrhs, (veclib_float_complex*)h->a, lda, h->IPIV, (veclib_float_complex*)h->b, ldb);
2992#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2993 info = LAPACKE_cgesv_work(CblasColMajor, n, nrhs, (veclib_float_complex*)h->a, lda, h->IPIV, (veclib_float_complex*)h->b, ldb);
2994#endif
2995
2996 /* A is singular, solution not possible */
2997 if(info!=0){
2998 memset(X, 0, dim*nCol*sizeof(float_complex));
2999#ifndef NDEBUG
3000 /* Input matrix was singular, solution was unable to be computed, or the
3001 * input matrix contained illegal values so no solution was attempted.
3002 * In these cases the function will zero the output matrix. */
3003 saf_print_warning("Could not solve the linear equation in utility_cglslv(). Output matrices/vectors have been zeroed.");
3004#endif
3005 }
3006 /* store solution in row-major order */
3007 else{
3008 for(i=0; i<dim; i++)
3009 for(j=0; j<nCol; j++)
3010 X[i*nCol+j] = h->b[j*dim+i];
3011 }
3012
3013 if(hWork == NULL)
3014 utility_cglslv_destroy((void**)&h);
3015}
3016
3018typedef struct _utility_dglslv_data {
3019 int maxDim;
3020 int maxNCol;
3021 veclib_int* IPIV;
3022 double* a;
3023 double* b;
3025
3026void utility_dglslv_create(void ** const phWork, int maxDim, int maxNCol)
3027{
3028 *phWork = malloc1d(sizeof(utility_dglslv_data));
3030
3031 h->maxDim = maxDim;
3032 h->maxNCol = maxNCol;
3033 h->IPIV = malloc1d(maxDim*sizeof(veclib_int));
3034 h->a = malloc1d(maxDim*maxDim*sizeof(double));
3035 h->b = malloc1d(maxDim*maxNCol*sizeof(double));
3036}
3037
3038void utility_dglslv_destroy(void ** const phWork)
3039{
3041
3042 if(h!=NULL){
3043 free(h->IPIV);
3044 free(h->a);
3045 free(h->b);
3046
3047 free(h);
3048 h=NULL;
3049 *phWork = NULL;
3050 }
3051}
3052
3054(
3055 void* const hWork,
3056 const double* A,
3057 const int dim,
3058 double* B,
3059 int nCol,
3060 double* X
3061)
3062{
3064 veclib_int i, j, n, nrhs, lda, ldb, info;
3065
3066 n = dim;
3067 nrhs = nCol;
3068 lda = dim;
3069 ldb = dim;
3070
3071 /* Work struct */
3072 if(hWork==NULL)
3073 utility_dglslv_create((void**)&h, dim, nCol);
3074 else {
3075 h = (utility_dglslv_data*)(hWork);
3076#ifndef NDEBUG
3077 saf_assert(dim<=h->maxDim, "dim exceeds the maximum length specified");
3078 saf_assert(nCol<=h->maxNCol, "nCol exceeds the maximum length specified");
3079#endif
3080 }
3081
3082 /* store in column major order */
3083 for(i=0; i<dim; i++)
3084 for(j=0; j<dim; j++)
3085 h->a[j*dim+i] = A[i*dim+j];
3086 for(i=0; i<dim; i++)
3087 for(j=0; j<nCol; j++)
3088 h->b[j*dim+i] = B[i*nCol+j];
3089
3090 /* solve Ax = b for each column in b (b is replaced by the solution: x) */
3091#ifdef SAF_VECLIB_USE_CLAPACK_INTERFACE
3092 info = clapack_dgesv(CblasColMajor, n, nrhs, h->a, lda, h->IPIV, h->b, ldb);
3093#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3094 info = LAPACKE_dgesv_work(CblasColMajor, n, nrhs, h->a, lda, h->IPIV, h->b, ldb);
3095#elif defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3096 dgesv_( &n, &nrhs, h->a, &lda, h->IPIV, h->b, &ldb, &info );
3097#endif
3098
3099 /* A is singular, solution not possible */
3100 if(info!=0){
3101 memset(X, 0, dim*nCol*sizeof(double));
3102#ifndef NDEBUG
3103 /* Input matrix was singular, solution was unable to be computed, or the
3104 * input matrix contained illegal values so no solution was attempted.
3105 * In these cases the function will zero the output matrix. */
3106 saf_print_warning("Could not solve the linear equation in utility_dglslv(). Output matrices/vectors have been zeroed.");
3107#endif
3108 }
3109 /* store solution in row-major order */
3110 else{
3111 for(i=0; i<dim; i++)
3112 for(j=0; j<nCol; j++)
3113 X[i*nCol+j] = h->b[j*dim+i];
3114 }
3115
3116 if(hWork == NULL)
3117 utility_dglslv_destroy((void**)&h);
3118}
3119
3121typedef struct _utility_zglslv_data {
3122 int maxDim;
3123 int maxNCol;
3124 veclib_int* IPIV;
3125 double_complex* a;
3126 double_complex* b;
3128
3129void utility_zglslv_create(void ** const phWork, int maxDim, int maxNCol)
3130{
3131 *phWork = malloc1d(sizeof(utility_zglslv_data));
3133
3134 h->maxDim = maxDim;
3135 h->maxNCol = maxNCol;
3136 h->IPIV = malloc1d(maxDim*sizeof(veclib_int));
3137 h->a = malloc1d(maxDim*maxDim*sizeof(double_complex));
3138 h->b = malloc1d(maxDim*maxNCol*sizeof(double_complex));
3139}
3140
3141void utility_zglslv_destroy(void ** const phWork)
3142{
3144
3145 if(h!=NULL){
3146 free(h->IPIV);
3147 free(h->a);
3148 free(h->b);
3149
3150 free(h);
3151 h=NULL;
3152 *phWork = NULL;
3153 }
3154}
3155
3157(
3158 void* const hWork,
3159 const double_complex* A,
3160 const int dim,
3161 double_complex* B,
3162 int nCol,
3163 double_complex* X
3164)
3165{
3167 veclib_int i, j, n = dim, nrhs = nCol, lda = dim, ldb = dim, info;
3168
3169 n = dim;
3170 nrhs = nCol;
3171 lda = dim;
3172 ldb = dim;
3173
3174 /* Work struct */
3175 if(hWork==NULL)
3176 utility_zglslv_create((void**)&h, dim, nCol);
3177 else {
3178 h = (utility_zglslv_data*)(hWork);
3179#ifndef NDEBUG
3180 saf_assert(dim<=h->maxDim, "dim exceeds the maximum length specified");
3181 saf_assert(nCol<=h->maxNCol, "nCol exceeds the maximum length specified");
3182#endif
3183 }
3184
3185 /* store in column major order */
3186 for(i=0; i<dim; i++)
3187 for(j=0; j<dim; j++)
3188 h->a[j*dim+i] = A[i*dim+j];
3189 for(i=0; i<dim; i++)
3190 for(j=0; j<nCol; j++)
3191 h->b[j*dim+i] = B[i*nCol+j];
3192
3193 /* solve Ax = b for each column in b (b is replaced by the solution: x) */
3194#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3195 zgesv_( &n, &nrhs, (veclib_double_complex*)h->a, &lda, h->IPIV, (veclib_double_complex*)h->b, &ldb, &info );
3196#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3197 info = clapack_zgesv(CblasColMajor, n, nrhs, (veclib_double_complex*)h->a, lda, h->IPIV, (veclib_double_complex*)h->b, ldb);
3198#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3199 info = LAPACKE_zgesv_work(CblasColMajor, n, nrhs, (veclib_double_complex*)h->a, lda, h->IPIV, (veclib_double_complex*)h->b, ldb);
3200#endif
3201
3202 /* A is singular, solution not possible */
3203 if(info!=0){
3204 memset(X, 0, dim*nCol*sizeof(double_complex));
3205#ifndef NDEBUG
3206 /* Input matrix was singular, solution was unable to be computed, or the
3207 * input matrix contained illegal values so no solution was attempted.
3208 * In these cases the function will zero the output matrix. */
3209 saf_print_warning("Could not solve the linear equation in utility_zglslv(). Output matrices/vectors have been zeroed.");
3210#endif
3211 }
3212 /* store solution in row-major order */
3213 else{
3214 for(i=0; i<dim; i++)
3215 for(j=0; j<nCol; j++)
3216 X[i*nCol+j] = h->b[j*dim+i];
3217 }
3218
3219 if(hWork == NULL)
3220 utility_zglslv_destroy((void**)&h);
3221}
3222
3223
3224/* ========================================================================== */
3225/* General Linear Solver (?glslvt) */
3226/* ========================================================================== */
3227
3229typedef struct _utility_sglslvt_data {
3230 int maxDim;
3231 int maxNCol;
3232 veclib_int* IPIV;
3233 float* a;
3234 float* b;
3236
3237void utility_sglslvt_create(void ** const phWork, int maxDim, int maxNCol)
3238{
3239 *phWork = malloc1d(sizeof(utility_sglslvt_data));
3241
3242 h->maxDim = maxDim;
3243 h->maxNCol = maxNCol;
3244 h->IPIV = malloc1d(maxDim*sizeof(veclib_int));
3245 h->a = malloc1d(maxDim*maxDim*sizeof(float));
3246 h->b = malloc1d(maxDim*maxNCol*sizeof(float));
3247}
3248
3249void utility_sglslvt_destroy(void ** const phWork)
3250{
3252
3253 if(h!=NULL){
3254 free(h->IPIV);
3255 free(h->a);
3256 free(h->b);
3257
3258 free(h);
3259 h=NULL;
3260 *phWork = NULL;
3261 }
3262}
3263
3265(
3266 void* const hWork,
3267 const float* A,
3268 const int dim,
3269 float* B,
3270 int nCol,
3271 float* X
3272)
3273{
3275 veclib_int n = nCol, nrhs = dim, lda = nCol, ldb = nCol, info;
3276
3277 /* Work struct */
3278 if(hWork==NULL)
3279 utility_sglslvt_create((void**)&h, dim, nCol);
3280 else {
3281 h = (utility_sglslvt_data*)(hWork);
3282#ifndef NDEBUG
3283 saf_assert(dim<=h->maxDim, "dim exceeds the maximum length specified");
3284 saf_assert(nCol<=h->maxNCol, "nCol exceeds the maximum length specified");
3285#endif
3286 }
3287
3288 /* store locally */
3289 cblas_scopy(dim*dim, A, 1, h->a, 1);
3290 cblas_scopy(dim*nCol, B, 1, h->b, 1);
3291
3292 /* solve Ax = b for each column in b (b is replaced by the solution: x) */
3293#ifdef SAF_VECLIB_USE_CLAPACK_INTERFACE
3294 info = clapack_sgesv(CblasColMajor, n, nrhs, h->b, ldb, h->IPIV, h->a, lda);
3295#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3296 info = LAPACKE_sgesv_work(CblasColMajor, n, nrhs, h->b, ldb, h->IPIV, h->a, lda);
3297#elif defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3298 //sposv_("U", &n, &nrhs, h->b, &ldb, h->a, &lda, &info );
3299 sgesv_( &n, &nrhs, h->b, &ldb, h->IPIV, h->a, &lda, &info );
3300#endif
3301
3302 if(info!=0){
3303 /* A is singular, solution not possible */
3304 memset(X, 0, dim*nCol*sizeof(float));
3305#ifndef NDEBUG
3306 /* Input matrix was singular, solution was unable to be computed, or the
3307 * input matrix contained illegal values so no solution was attempted.
3308 * In these cases the function will zero the output matrix. */
3309 saf_print_warning("Could not solve the linear equation in utility_sglslvt(). Output matrices/vectors have been zeroed.");
3310#endif
3311 }
3312 else
3313 cblas_scopy(dim*nCol, h->a, 1, X, 1);
3314
3315 if(hWork == NULL)
3316 utility_sglslvt_destroy((void**)&h);
3317}
3318
3319
3320/* ========================================================================== */
3321/* Symmetric Linear Solver (?slslv) */
3322/* ========================================================================== */
3323
3325typedef struct _utility_sslslv_data {
3326 int maxDim;
3327 int maxNCol;
3328 float* a;
3329 float* b;
3331
3332void utility_sslslv_create(void ** const phWork, int maxDim, int maxNCol)
3333{
3334 *phWork = malloc1d(sizeof(utility_sslslv_data));
3336
3337 h->maxDim = maxDim;
3338 h->maxNCol = maxNCol;
3339 h->a = malloc1d(maxDim*maxDim*sizeof(float));
3340 h->b = malloc1d(maxDim*maxNCol*sizeof(float));
3341}
3342
3343void utility_sslslv_destroy(void ** const phWork)
3344{
3346
3347 if(h!=NULL){
3348 free(h->a);
3349 free(h->b);
3350
3351 free(h);
3352 h=NULL;
3353 *phWork = NULL;
3354 }
3355}
3356
3358(
3359 void* const hWork,
3360 const float* A,
3361 const int dim,
3362 float* B,
3363 int nCol,
3364 float* X
3365)
3366{
3368 veclib_int i, j, n, nrhs, lda, ldb, info;
3369
3370 n = dim;
3371 nrhs = nCol;
3372 lda = dim;
3373 ldb = dim;
3374
3375 /* Work struct */
3376 if(hWork==NULL)
3377 utility_sslslv_create((void**)&h, dim, nCol);
3378 else {
3379 h = (utility_sslslv_data*)(hWork);
3380#ifndef NDEBUG
3381 saf_assert(dim<=h->maxDim, "dim exceeds the maximum length specified");
3382 saf_assert(nCol<=h->maxNCol, "nCol exceeds the maximum length specified");
3383#endif
3384 }
3385
3386 /* store in column major order */
3387 for(i=0; i<dim; i++)
3388 for(j=0; j<dim; j++)
3389 h->a[j*dim+i] = A[i*dim+j];
3390 for(i=0; i<dim; i++)
3391 for(j=0; j<nCol; j++)
3392 h->b[j*dim+i] = B[i*nCol+j];
3393
3394 /* solve Ax = b for each column in b (b is replaced by the solution: x) */
3395#ifdef SAF_VECLIB_USE_CLAPACK_INTERFACE
3396 info = clapack_sposv(CblasColMajor, CblasUpper, n, nrhs, h->a, lda, h->b, ldb);
3397#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3398 info = LAPACKE_sposv_work(CblasColMajor, CblasUpper, n, nrhs, h->a, lda, h->b, ldb);
3399#elif defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3400 sposv_( "U", &n, &nrhs, h->a, &lda, h->b, &ldb, &info );
3401#endif
3402
3403 /* A is not symmetric positive definate, solution not possible */
3404 if(info!=0){
3405 memset(X, 0, dim*nCol*sizeof(float));
3406#ifndef NDEBUG
3407 /* Input matrix was singular, solution was unable to be computed, or the
3408 * input matrix contained illegal values so no solution was attempted.
3409 * In these cases the function will zero the output matrix. */
3410 saf_print_warning("Could not solve the linear equation in utility_sslslv(). Output matrices/vectors have been zeroed.");
3411#endif
3412 }
3413 /* store solution in row-major order */
3414 else{
3415 for(i=0; i<dim; i++)
3416 for(j=0; j<nCol; j++)
3417 X[i*nCol+j] = h->b[j*dim+i];
3418 }
3419
3420 if(hWork == NULL)
3421 utility_sslslv_destroy((void**)&h);
3422}
3423
3425typedef struct _utility_cslslv_data {
3426 int maxDim;
3427 int maxNCol;
3428 float_complex* a;
3429 float_complex* b;
3431
3432void utility_cslslv_create(void ** const phWork, int maxDim, int maxNCol)
3433{
3434 *phWork = malloc1d(sizeof(utility_cslslv_data));
3436
3437 h->maxDim = maxDim;
3438 h->maxNCol = maxNCol;
3439 h->a = malloc1d(maxDim*maxDim*sizeof(float_complex));
3440 h->b = malloc1d(maxDim*maxNCol*sizeof(float_complex));
3441}
3442
3443void utility_cslslv_destroy(void ** const phWork)
3444{
3446
3447 if(h!=NULL){
3448 free(h->a);
3449 free(h->b);
3450
3451 free(h);
3452 h=NULL;
3453 *phWork = NULL;
3454 }
3455}
3456
3458(
3459 void* const hWork,
3460 const float_complex* A,
3461 const int dim,
3462 float_complex* B,
3463 int nCol,
3464 float_complex* X
3465)
3466{
3468 veclib_int i, j, n, nrhs, lda, ldb, info;
3469
3470 n = dim;
3471 nrhs = nCol;
3472 lda = dim;
3473 ldb = dim;
3474
3475 /* Work struct */
3476 if(hWork==NULL)
3477 utility_cslslv_create((void**)&h, dim, nCol);
3478 else {
3479 h = (utility_cslslv_data*)(hWork);
3480#ifndef NDEBUG
3481 saf_assert(dim<=h->maxDim, "dim exceeds the maximum length specified");
3482 saf_assert(nCol<=h->maxNCol, "nCol exceeds the maximum length specified");
3483#endif
3484 }
3485
3486 /* store in column major order */
3487 for(i=0; i<dim; i++)
3488 for(j=0; j<dim; j++)
3489 h->a[j*dim+i] = A[i*dim+j];
3490 for(i=0; i<dim; i++)
3491 for(j=0; j<nCol; j++)
3492 h->b[j*dim+i] = B[i*nCol+j];
3493
3494 /* solve Ax = b for each column in b (b is replaced by the solution: x) */
3495#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3496 cposv_( "U", &n, &nrhs, (veclib_float_complex*)h->a, &lda, (veclib_float_complex*)h->b, &ldb, &info );
3497#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3498 info = clapack_cposv(CblasColMajor, CblasUpper, n, nrhs, (veclib_float_complex*)h->a, lda, (veclib_float_complex*)h->b, ldb);
3499#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3500 info = LAPACKE_cposv_work(CblasColMajor, CblasUpper, n, nrhs, (veclib_float_complex*)h->a, lda, (veclib_float_complex*)h->b, ldb);
3501#endif
3502
3503 /* A is not symmetric positive definate, solution not possible */
3504 if(info!=0){
3505 memset(X, 0, dim*nCol*sizeof(float_complex));
3506#ifndef NDEBUG
3507 /* Input matrix was singular, solution was unable to be computed, or the
3508 * input matrix contained illegal values so no solution was attempted.
3509 * In these cases the function will zero the output matrix. */
3510 saf_print_warning("Could not solve the linear equation in utility_cslslv(). Output matrices/vectors have been zeroed.");
3511#endif
3512 }
3513 /* store solution in row-major order */
3514 else{
3515 for(i=0; i<dim; i++)
3516 for(j=0; j<nCol; j++)
3517 X[i*nCol+j] = h->b[j*dim+i];
3518 }
3519
3520 if(hWork == NULL)
3521 utility_cslslv_destroy((void**)&h);
3522}
3523
3524
3525/* ========================================================================== */
3526/* Matrix Pseudo-Inverse (?pinv) */
3527/* ========================================================================== */
3528
3530typedef struct _utility_spinv_data {
3531 int maxDim1, maxDim2;
3532 veclib_int currentWorkSize;
3533 float* a, *s, *u, *vt, *inva;
3534 float* work;
3536
3537void utility_spinv_create(void ** const phWork, int maxDim1, int maxDim2)
3538{
3539 *phWork = malloc1d(sizeof(utility_spinv_data));
3540 utility_spinv_data *h = (utility_spinv_data*)(*phWork);
3541
3542 h->maxDim1 = maxDim1;
3543 h->maxDim2 = maxDim2;
3544 h->currentWorkSize = 0;
3545 h->a = malloc1d(maxDim1*maxDim2*sizeof(float));
3546 h->s = malloc1d(SAF_MIN(maxDim2,maxDim1)*sizeof(float));
3547 h->u = malloc1d(maxDim1*maxDim1*sizeof(float));
3548 h->vt = malloc1d(maxDim2*maxDim2*sizeof(float));
3549 h->inva = malloc1d(maxDim1*maxDim2*sizeof(float));
3550 h->work = NULL;
3551}
3552
3553void utility_spinv_destroy(void ** const phWork)
3554{
3555 utility_spinv_data *h = (utility_spinv_data*)(*phWork);
3556
3557 if(h!=NULL){
3558 free(h->a);
3559 free(h->s);
3560 free(h->u);
3561 free(h->vt);
3562 free(h->inva);
3563 free(h->work);
3564
3565 free(h);
3566 h=NULL;
3567 *phWork = NULL;
3568 }
3569}
3570
3572(
3573 void* const hWork,
3574 const float* inM,
3575 const int dim1,
3576 const int dim2,
3577 float* outM
3578)
3579{
3581 veclib_int i, j, m, n, k, lda, ldu, ldvt, ld_inva, info;
3582 float ss;
3583 veclib_int lwork;
3584 float wkopt;
3585
3586 m = lda = ldu = dim1;
3587 n = ldvt = dim2;
3588 k = m < n ? m : n;
3589
3590 /* Work struct */
3591 if(hWork==NULL)
3592 utility_spinv_create((void**)&h, dim1, dim2);
3593 else{
3594 h = (utility_spinv_data*)(hWork);
3595#ifndef NDEBUG
3596 saf_assert(dim1<=h->maxDim1, "dim1 exceeds the maximum length specified");
3597 saf_assert(dim2<=h->maxDim2, "dim2 exceeds the maximum length specified");
3598#endif
3599 }
3600
3601 /* store in column major order */
3602 for(i=0; i<m; i++)
3603 for(j=0; j<n; j++)
3604 h->a[j*m+i] = inM[i*n+j];
3605
3606 /* Query how much "work" memory is required */
3607 lwork = -1;
3608#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3609 sgesvd_("S", "S", &m, &n, h->a, &lda, h->s, h->u, &ldu, h->vt, &ldvt, &wkopt, &lwork, &info);
3610#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3611 saf_print_error("No such implementation available in ATLAS CLAPACK");
3612#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3613 info = LAPACKE_sgesvd_work(CblasColMajor, 'S', 'S', m, n, h->a, lda, h->s, h->u, ldu, h->vt, ldvt, &wkopt, lwork);
3614#endif
3615 lwork = (veclib_int)wkopt;
3616 if(lwork>h->currentWorkSize){
3617 h->currentWorkSize = lwork;
3618 h->work = realloc1d(h->work, h->currentWorkSize*sizeof(float));
3619 }
3620
3621 /* Perform the singular value decomposition */
3622#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3623 sgesvd_("S", "S", &m, &n, h->a, &lda, h->s, h->u, &ldu, h->vt, &ldvt, h->work, &lwork, &info );
3624#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3625 saf_print_error("No such implementation available in ATLAS CLAPACK");
3626#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3627 info = LAPACKE_sgesvd_work(CblasColMajor, 'S', 'S', m, n, h->a, lda, h->s, h->u, ldu, h->vt, ldvt, h->work, lwork);
3628#endif
3629
3630 if( info != 0 ) {
3631 memset(outM, 0, dim1*dim2*sizeof(float));
3632#ifndef NDEBUG
3633 /* The SVD failed to converge, or the input matrix contained illegal
3634 * values so no solution was attempted. In these cases this function
3635 * will zero all output matrices and/or vectors. */
3636 saf_print_warning("Could not compute SVD in utility_spinv(). Output matrices/vectors have been zeroed.");
3637#endif
3638 }
3639 else{
3640 for(i=0; i<k; i++){
3641 if(h->s[i] > 1.0e-5f)
3642 ss=1.0f/h->s[i];
3643 else
3644 ss=h->s[i];
3645 cblas_sscal(m, ss, &h->u[i*m], 1);
3646 }
3647 ld_inva=n;
3648 cblas_sgemm(CblasColMajor, CblasTrans, CblasTrans, n, m, k, 1.0f,
3649 h->vt, ldvt,
3650 h->u, ldu, 0.0f,
3651 h->inva, ld_inva);
3652
3653 /* return in row-major order */
3654 for(i=0; i<m; i++)
3655 for(j=0; j<n; j++)
3656 outM[j*m+i] = h->inva[i*n+j];
3657 }
3658
3659 /* clean-up */
3660 if(hWork == NULL)
3661 utility_spinv_destroy((void**)&h);
3662}
3663
3665typedef struct _utility_cpinv_data {
3666 int maxDim1, maxDim2;
3667 veclib_int currentWorkSize;
3668 float_complex* a, *u, *vt, *inva;
3669 float* s, *rwork;
3670 float_complex* work;
3672
3673void utility_cpinv_create(void ** const phWork, int maxDim1, int maxDim2)
3674{
3675 *phWork = malloc1d(sizeof(utility_cpinv_data));
3676 utility_cpinv_data *h = (utility_cpinv_data*)(*phWork);
3677
3678 h->maxDim1 = maxDim1;
3679 h->maxDim2 = maxDim2;
3680 h->currentWorkSize = 0;
3681 h->a = malloc1d(maxDim1*maxDim2*sizeof(float_complex));
3682 h->s = malloc1d(SAF_MIN(maxDim2,maxDim1)*sizeof(float));
3683 h->u = malloc1d(maxDim1*maxDim1*sizeof(float_complex));
3684 h->vt = malloc1d(maxDim2*maxDim2*sizeof(float_complex));
3685 h->inva = malloc1d(maxDim1*maxDim2*sizeof(float_complex));
3686 h->rwork = malloc1d(maxDim1*SAF_MAX(1, 5*SAF_MIN(maxDim2,maxDim1))*sizeof(float));
3687 h->work = NULL;
3688}
3689
3690void utility_cpinv_destroy(void ** const phWork)
3691{
3692 utility_cpinv_data *h = (utility_cpinv_data*)(*phWork);
3693
3694 if(h!=NULL){
3695 free(h->a);
3696 free(h->s);
3697 free(h->u);
3698 free(h->vt);
3699 free(h->inva);
3700 free(h->work);
3701
3702 free(h);
3703 h=NULL;
3704 *phWork = NULL;
3705 }
3706}
3707
3709(
3710 void* const hWork,
3711 const float_complex* inM,
3712 const int dim1,
3713 const int dim2,
3714 float_complex* outM
3715)
3716{
3718 veclib_int i, j, m, n, k, lda, ldu, ldvt, ld_inva, info;
3719 float_complex ss_cmplx;
3720 const float_complex calpha = cmplxf(1.0f, 0.0f); const float_complex cbeta = cmplxf(0.0f, 0.0f); /* blas */
3721 float ss;
3722 veclib_int lwork;
3723 float_complex wkopt;
3724
3725 m = lda = ldu = dim1;
3726 n = ldvt = dim2;
3727 k = m < n ? m : n;
3728
3729 /* Work struct */
3730 if(hWork==NULL)
3731 utility_cpinv_create((void**)&h, dim1, dim2);
3732 else{
3733 h = (utility_cpinv_data*)(hWork);
3734#ifndef NDEBUG
3735 saf_assert(dim1<=h->maxDim1, "dim1 exceeds the maximum length specified");
3736 saf_assert(dim2<=h->maxDim2, "dim2 exceeds the maximum length specified");
3737#endif
3738 }
3739
3740 /* store in column major order */
3741 for(i=0; i<dim1; i++)
3742 for(j=0; j<dim2; j++)
3743 h->a[j*dim1+i] = inM[i*dim2 +j];
3744
3745 /* Query how much "work" memory is required */
3746 lwork = -1;
3747#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3748 cgesvd_( "A", "A", &m, &n, (veclib_float_complex*)h->a, &lda, h->s, (veclib_float_complex*)h->u, &ldu, (veclib_float_complex*)h->vt, &ldvt,
3749 (veclib_float_complex*)&wkopt, &lwork, h->rwork, &info );
3750#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3751 saf_print_error("No such implementation available in ATLAS CLAPACK");
3752#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3753 info = LAPACKE_cgesvd_work(CblasColMajor, 'S', 'S', m, n, (veclib_float_complex*)h->a, lda, h->s, (veclib_float_complex*)h->u, ldu, (veclib_float_complex*)h->vt, ldvt,
3754 (veclib_float_complex*)&wkopt, lwork, h->rwork);
3755#endif
3756 lwork = (veclib_int)(crealf(wkopt)+0.01f);
3757 if(lwork>h->currentWorkSize){
3758 h->currentWorkSize = lwork;
3759 h->work = realloc1d(h->work, h->currentWorkSize*sizeof(float_complex));
3760 }
3761
3762 /* Perform the singular value decomposition */
3763#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3764 cgesvd_( "A", "A", &m, &n, (veclib_float_complex*)h->a, &lda, h->s, (veclib_float_complex*)h->u, &ldu, (veclib_float_complex*)h->vt, &ldvt,
3765 (veclib_float_complex*)h->work, &lwork, h->rwork, &info);
3766#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3767 saf_print_error("No such implementation available in ATLAS CLAPACK");
3768#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3769 info = LAPACKE_cgesvd_work(CblasColMajor, 'S', 'S', m, n, (veclib_float_complex*)h->a, lda, h->s, (veclib_float_complex*)h->u, ldu, (veclib_float_complex*)h->vt, ldvt,
3770 (veclib_float_complex*)h->work, lwork, h->rwork);
3771#endif
3772
3773 if( info != 0 ) {
3774 memset(outM, 0, dim1*dim2*sizeof(float_complex));
3775#ifndef NDEBUG
3776 /* The SVD failed to converge, or the input matrix contained illegal
3777 * values so no solution was attempted. In these cases this function
3778 * will zero all output matrices and/or vectors. */
3779 saf_print_warning("Could not compute SVD in utility_cpinv(). Output matrices/vectors have been zeroed.");
3780#endif
3781 }
3782 else{
3783 for(i=0; i<k; i++){
3784 if(h->s[i] > 1.0e-5f)
3785 ss=1.0f/h->s[i];
3786 else
3787 ss=h->s[i];
3788 ss_cmplx = cmplxf(ss, 0.0f);
3789 cblas_cscal(m, &ss_cmplx, &h->u[i*m], 1);
3790 }
3791 ld_inva=n;
3792 cblas_cgemm(CblasColMajor, CblasConjTrans, CblasConjTrans, n, m, k, &calpha,
3793 h->vt, ldvt,
3794 h->u, ldu, &cbeta,
3795 h->inva, ld_inva);
3796
3797 /* return in row-major order */
3798 for(i=0; i<m; i++)
3799 for(j=0; j<n; j++)
3800 outM[j*m+i] = h->inva[i*n+j];
3801 }
3802
3803 /* clean-up */
3804 if(hWork == NULL)
3805 utility_cpinv_destroy((void**)&h);
3806}
3807
3809typedef struct _utility_dpinv_data {
3810 int maxDim1, maxDim2;
3811 veclib_int currentWorkSize;
3812 double* a, *s, *u, *vt, *inva;
3813 double* work;
3815
3816void utility_dpinv_create(void ** const phWork, int maxDim1, int maxDim2)
3817{
3818 *phWork = malloc1d(sizeof(utility_dpinv_data));
3819 utility_dpinv_data *h = (utility_dpinv_data*)(*phWork);
3820
3821 h->maxDim1 = maxDim1;
3822 h->maxDim2 = maxDim2;
3823 h->currentWorkSize = 0;
3824 h->a = malloc1d(maxDim1*maxDim2*sizeof(double));
3825 h->s = malloc1d(SAF_MIN(maxDim2,maxDim1)*sizeof(double));
3826 h->u = malloc1d(maxDim1*maxDim1*sizeof(double));
3827 h->vt = malloc1d(maxDim2*maxDim2*sizeof(double));
3828 h->inva = malloc1d(maxDim1*maxDim2*sizeof(double));
3829 h->work = NULL;
3830}
3831
3832void utility_dpinv_destroy(void ** const phWork)
3833{
3834 utility_dpinv_data *h = (utility_dpinv_data*)(*phWork);
3835
3836 if(h!=NULL){
3837 free(h->a);
3838 free(h->s);
3839 free(h->u);
3840 free(h->vt);
3841 free(h->inva);
3842 free(h->work);
3843
3844 free(h);
3845 h=NULL;
3846 *phWork = NULL;
3847 }
3848}
3849
3851(
3852 void* const hWork,
3853 const double* inM,
3854 const int dim1,
3855 const int dim2,
3856 double* outM
3857)
3858{
3860 veclib_int i, j, m, n, k, lda, ldu, ldvt, ld_inva, info;
3861 double ss;
3862 veclib_int lwork;
3863 double wkopt;
3864
3865 m = lda = ldu = dim1;
3866 n = ldvt = dim2;
3867 k = m < n ? m : n;
3868
3869 /* Work struct */
3870 if(hWork==NULL)
3871 utility_dpinv_create((void**)&h, dim1, dim2);
3872 else{
3873 h = (utility_dpinv_data*)(hWork);
3874#ifndef NDEBUG
3875 saf_assert(dim1<=h->maxDim1, "dim1 exceeds the maximum length specified");
3876 saf_assert(dim2<=h->maxDim2, "dim2 exceeds the maximum length specified");
3877#endif
3878 }
3879
3880 /* store in column major order */
3881 for(i=0; i<m; i++)
3882 for(j=0; j<n; j++)
3883 h->a[j*m+i] = inM[i*n+j];
3884
3885 /* Query how much "work" memory is required */
3886 lwork = -1;
3887#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3888 dgesvd_("S", "S", &m, &n, h->a, &lda, h->s, h->u, &ldu, h->vt, &ldvt, &wkopt, &lwork, &info);
3889#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3890 saf_print_error("No such implementation available in ATLAS CLAPACK");
3891#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3892 info = LAPACKE_dgesvd_work(CblasColMajor, 'S', 'S', m, n, h->a, lda, h->s, h->u, ldu, h->vt, ldvt, &wkopt, lwork);
3893#endif
3894 lwork = (veclib_int)wkopt;
3895 if(lwork>h->currentWorkSize){
3896 h->currentWorkSize = lwork;
3897 h->work = realloc1d(h->work, h->currentWorkSize*sizeof(double));
3898 }
3899
3900 /* Perform the singular value decomposition */
3901#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3902 dgesvd_("S", "S", &m, &n, h->a, &lda, h->s, h->u, &ldu, h->vt, &ldvt, h->work, &lwork, &info );
3903#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3904 saf_print_error("No such implementation available in ATLAS CLAPACK");
3905#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3906 info = LAPACKE_dgesvd_work(CblasColMajor, 'S', 'S', m, n, h->a, lda, h->s, h->u, ldu, h->vt, ldvt, h->work, lwork);
3907#endif
3908
3909 if( info != 0 ) {
3910 memset(outM, 0, dim1*dim2*sizeof(double));
3911#ifndef NDEBUG
3912 /* The SVD failed to converge, or the input matrix contained illegal
3913 * values so no solution was attempted. In these cases this function
3914 * will zero all output matrices and/or vectors. */
3915 saf_print_warning("Could not compute SVD in utility_dpinv(). Output matrices/vectors have been zeroed.");
3916#endif
3917 }
3918 else{
3919 for(i=0; i<k; i++){
3920 if(h->s[i] > 1.0e-9)
3921 ss=1.0/h->s[i];
3922 else
3923 ss=h->s[i];
3924 cblas_dscal(m, ss, &h->u[i*m], 1);
3925 }
3926 ld_inva=n;
3927 cblas_dgemm(CblasColMajor, CblasTrans, CblasTrans, n, m, k, 1.0,
3928 h->vt, ldvt,
3929 h->u, ldu, 0.0,
3930 h->inva, ld_inva);
3931
3932 /* return in row-major order */
3933 for(i=0; i<m; i++)
3934 for(j=0; j<n; j++)
3935 outM[j*m+i] = h->inva[i*n+j];
3936 }
3937
3938 /* clean-up */
3939 if(hWork == NULL)
3940 utility_dpinv_destroy((void**)&h);
3941}
3942
3944typedef struct _utility_zpinv_data {
3945 int maxDim1, maxDim2;
3946 veclib_int currentWorkSize;
3947 double_complex* a, *u, *vt, *inva;
3948 double* s, *rwork;
3949 double_complex* work;
3951
3952void utility_zpinv_create(void ** const phWork, int maxDim1, int maxDim2)
3953{
3954 *phWork = malloc1d(sizeof(utility_zpinv_data));
3955 utility_zpinv_data *h = (utility_zpinv_data*)(*phWork);
3956
3957 h->maxDim1 = maxDim1;
3958 h->maxDim2 = maxDim2;
3959 h->currentWorkSize = 0;
3960 h->a = malloc1d(maxDim1*maxDim2*sizeof(double_complex));
3961 h->s = malloc1d(SAF_MIN(maxDim2,maxDim1)*sizeof(double));
3962 h->u = malloc1d(maxDim1*maxDim1*sizeof(double_complex));
3963 h->vt = malloc1d(maxDim2*maxDim2*sizeof(double_complex));
3964 h->inva = malloc1d(maxDim1*maxDim2*sizeof(double_complex));
3965 h->rwork = malloc1d(maxDim1*SAF_MAX(1, 5*SAF_MIN(maxDim2,maxDim1))*sizeof(double));
3966 h->work = NULL;
3967}
3968
3969void utility_zpinv_destroy(void ** const phWork)
3970{
3971 utility_zpinv_data *h = (utility_zpinv_data*)(*phWork);
3972
3973 if(h!=NULL){
3974 free(h->a);
3975 free(h->s);
3976 free(h->u);
3977 free(h->vt);
3978 free(h->inva);
3979 free(h->work);
3980
3981 free(h);
3982 h=NULL;
3983 *phWork = NULL;
3984 }
3985}
3986
3988(
3989 void* const hWork,
3990 const double_complex* inM,
3991 const int dim1,
3992 const int dim2,
3993 double_complex* outM
3994)
3995{
3997 veclib_int i, j, m, n, k, lda, ldu, ldvt, info, ld_inva;
3998 double_complex ss_cmplx;
3999 const double_complex calpha = cmplx(1.0, 0.0); const double_complex cbeta = cmplx(0.0, 0.0); /* blas */
4000 double ss;
4001 veclib_int lwork;
4002 double_complex wkopt;
4003
4004 m = lda = ldu = dim1;
4005 n = ldvt = dim2;
4006 k = m < n ? m : n;
4007
4008 /* Work struct */
4009 if(hWork==NULL)
4010 utility_zpinv_create((void**)&h, dim1, dim2);
4011 else{
4012 h = (utility_zpinv_data*)(hWork);
4013#ifndef NDEBUG
4014 saf_assert(dim1<=h->maxDim1, "dim1 exceeds the maximum length specified");
4015 saf_assert(dim2<=h->maxDim2, "dim2 exceeds the maximum length specified");
4016#endif
4017 }
4018
4019 /* store in column major order */
4020 for(i=0; i<dim1; i++)
4021 for(j=0; j<dim2; j++)
4022 h->a[j*dim1+i] = inM[i*dim2 +j];
4023
4024 /* Query how much "work" memory is required */
4025 lwork = -1;
4026#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4027 zgesvd_( "A", "A", &m, &n, (veclib_double_complex*)h->a, &lda, h->s, (veclib_double_complex*)h->u, &ldu, (veclib_double_complex*)h->vt, &ldvt,
4028 (veclib_double_complex*)&wkopt, &lwork, h->rwork, &info );
4029#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4030 saf_print_error("No such implementation available in ATLAS CLAPACK");
4031#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4032 info = LAPACKE_zgesvd_work(CblasColMajor, 'A', 'A', m, n, (veclib_double_complex*)h->a, lda, h->s, (veclib_double_complex*)h->u, ldu, (veclib_double_complex*)h->vt, ldvt,
4033 (veclib_double_complex*)&wkopt, lwork, h->rwork);
4034#endif
4035 lwork = (veclib_int)(creal(wkopt)+0.01);
4036 if(lwork>h->currentWorkSize){
4037 h->currentWorkSize = lwork;
4038 h->work = realloc1d(h->work, h->currentWorkSize*sizeof(double_complex));
4039 }
4040
4041 /* Perform the singular value decomposition */
4042#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4043 zgesvd_( "A", "A", &m, &n, (veclib_double_complex*)h->a, &lda, h->s, (veclib_double_complex*)h->u, &ldu, (veclib_double_complex*)h->vt, &ldvt,
4044 (veclib_double_complex*)h->work, &lwork, h->rwork, &info);
4045#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4046 saf_print_error("No such implementation available in ATLAS CLAPACK");
4047#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4048 info = LAPACKE_zgesvd_work(CblasColMajor, 'A', 'A', m, n, (veclib_double_complex*)h->a, lda, h->s, (veclib_double_complex*)h->u, ldu, (veclib_double_complex*)h->vt, ldvt,
4049 (veclib_double_complex*)h->work, lwork, h->rwork);
4050#endif
4051
4052 if( info != 0 ) {
4053 memset(outM, 0, dim1*dim2*sizeof(double_complex));
4054#ifndef NDEBUG
4055 /* The SVD failed to converge, or the input matrix contained illegal
4056 * values so no solution was attempted. In these cases this function
4057 * will zero all output matrices and/or vectors. */
4058 saf_print_warning("Could not compute SVD in utility_zpinv(). Output matrices/vectors have been zeroed.");
4059#endif
4060 }
4061 else{
4062 for(i=0; i<k; i++){
4063 if(h->s[i] > 1.0e-5)
4064 ss=1.0/h->s[i];
4065 else
4066 ss=h->s[i];
4067 ss_cmplx = cmplx(ss, 0.0);
4068 cblas_zscal(m, &ss_cmplx, &h->u[i*m], 1);
4069 }
4070 ld_inva=n;
4071 cblas_zgemm(CblasColMajor, CblasConjTrans, CblasConjTrans, n, m, k, &calpha,
4072 h->vt, ldvt,
4073 h->u, ldu, &cbeta,
4074 h->inva, ld_inva);
4075
4076 /* return in row-major order */
4077 for(i=0; i<m; i++)
4078 for(j=0; j<n; j++)
4079 outM[j*m+i] = h->inva[i*n+j];
4080 }
4081
4082 /* clean-up */
4083 if(hWork == NULL)
4084 utility_zpinv_destroy((void**)&h);
4085}
4086
4087
4088/* ========================================================================== */
4089/* Cholesky Factorisation (?chol) */
4090/* ========================================================================== */
4091
4093typedef struct _utility_schol_data {
4094 int maxDim;
4095 float* a;
4097
4098void utility_schol_create(void ** const phWork, int maxDim)
4099{
4100 *phWork = malloc1d(sizeof(utility_schol_data));
4101 utility_schol_data *h = (utility_schol_data*)(*phWork);
4102
4103 h->maxDim = maxDim;
4104 h->a = malloc1d(maxDim*maxDim*sizeof(float));
4105}
4106
4107void utility_schol_destroy(void ** const phWork)
4108{
4109 utility_schol_data *h = (utility_schol_data*)(*phWork);
4110
4111 if(h!=NULL){
4112 free(h->a);
4113
4114 free(h);
4115 h=NULL;
4116 *phWork = NULL;
4117 }
4118}
4119
4121(
4122 void* const hWork,
4123 const float* A,
4124 const int dim,
4125 float* X
4126)
4127{
4129 veclib_int i, j, info, n, lda;
4130
4131 n = lda = dim;
4132
4133 /* Work struct */
4134 if(hWork==NULL)
4135 utility_schol_create((void**)&h, dim);
4136 else{
4137 h = (utility_schol_data*)(hWork);
4138#ifndef NDEBUG
4139 saf_assert(dim<=h->maxDim, "dim exceeds the maximum length specified");
4140#endif
4141 }
4142
4143 /* store in column major order */
4144 for(i=0; i<dim; i++)
4145 for(j=0; j<dim; j++)
4146 h->a[j*dim+i] = A[i*dim+j];
4147
4148 /* a is replaced by solution */
4149#ifdef SAF_VECLIB_USE_CLAPACK_INTERFACE
4150 info = clapack_spotrf(CblasColMajor, CblasUpper, n, h->a, lda);
4151#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4152 info = LAPACKE_spotrf_work(CblasColMajor, CblasUpper, n, h->a, lda);
4153#elif defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4154 spotrf_( "U", &n, h->a, &lda, &info );
4155#endif
4156
4157 /* A is not positive definate, solution not possible */
4158 if(info!=0){
4159 memset(X, 0, dim*dim*sizeof(float));
4160#ifndef NDEBUG
4161 /* Input matrix is not positive definite so the Cholesky factorisation
4162 * could not be computed, or the input matrix contained illegal values
4163 * so no solution was attempted. In these cases the function will zero
4164 * the output matrix. */
4165 saf_print_warning("Could not compute the Cholesky factorisation in utility_schol(). Output matrices/vectors have been zeroed.");
4166#endif
4167 }
4168 /* store solution in row-major order */
4169 else{
4170 for(i=0; i<dim; i++)
4171 for(j=0; j<dim; j++)
4172 X[i*dim+j] = j>=i ? h->a[j*dim+i] : 0.0f;
4173 }
4174
4175 /* clean-up */
4176 if(hWork == NULL)
4177 utility_schol_destroy((void**)&h);
4178}
4179
4181typedef struct _utility_cchol_data {
4182 int maxDim;
4183 float_complex* a;
4185
4186void utility_cchol_create(void ** const phWork, int maxDim)
4187{
4188 *phWork = malloc1d(sizeof(utility_cchol_data));
4189 utility_cchol_data *h = (utility_cchol_data*)(*phWork);
4190
4191 h->maxDim = maxDim;
4192 h->a = malloc1d(maxDim*maxDim*sizeof(float_complex));
4193}
4194
4195void utility_cchol_destroy(void ** const phWork)
4196{
4197 utility_cchol_data *h = (utility_cchol_data*)(*phWork);
4198
4199 if(h!=NULL){
4200 free(h->a);
4201
4202 free(h);
4203 h=NULL;
4204 *phWork = NULL;
4205 }
4206}
4207
4209(
4210 void* const hWork,
4211 const float_complex* A,
4212 const int dim,
4213 float_complex* X
4214)
4215{
4217 veclib_int i, j, info, n, lda;
4218
4219 n = lda = dim;
4220
4221 /* Work struct */
4222 if(hWork==NULL)
4223 utility_cchol_create((void**)&h, dim);
4224 else{
4225 h = (utility_cchol_data*)(hWork);
4226#ifndef NDEBUG
4227 saf_assert(dim<=h->maxDim, "dim exceeds the maximum length specified");
4228#endif
4229 }
4230
4231 /* store in column major order */
4232 for(i=0; i<dim; i++)
4233 for(j=0; j<dim; j++)
4234 h->a[j*dim+i] = A[i*dim+j];
4235
4236 /* a is replaced by solution */
4237#if defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4238 info = clapack_cpotrf(CblasColMajor, CblasUpper, n, (veclib_float_complex*)h->a, lda);
4239#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4240 info = LAPACKE_cpotrf_work(CblasColMajor, CblasUpper, n, (veclib_float_complex*)h->a, lda);
4241#elif defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4242 cpotrf_( "U", &n, (veclib_float_complex*)h->a, &lda, &info );
4243#endif
4244
4245 /* A is not positive definate, solution not possible */
4246 if(info!=0){
4247 memset(X, 0, dim*dim*sizeof(float_complex));
4248#ifndef NDEBUG
4249 /* Input matrix is not positive definite so the Cholesky factorisation
4250 * could not be computed, or the input matrix contained illegal values
4251 * so no solution was attempted. In these cases the function will zero
4252 * the output matrix. */
4253 saf_print_warning("Could not compute the Cholesky factorisation in utility_cchol(). Output matrices/vectors have been zeroed.");
4254#endif
4255 }
4256 /* store solution in row-major order */
4257 else{
4258 for(i=0; i<dim; i++)
4259 for(j=0; j<dim; j++)
4260 X[i*dim+j] = j>=i ? h->a[j*dim+i] : cmplxf(0.0f, 0.0f);
4261 }
4262
4263 /* clean-up */
4264 if(hWork == NULL)
4265 utility_cchol_destroy((void**)&h);
4266}
4267
4268/* ========================================================================== */
4269/* Determinant of a Matrix (?det) */
4270/* ========================================================================== */
4271
4273typedef struct _utility_sdet_data {
4274 veclib_int maxN;
4275 veclib_int *IPIV;
4276 float *tmp;
4278
4279void utility_sdet_create(void ** const phWork, int maxN)
4280{
4281 *phWork = malloc1d(sizeof(utility_sdet_data));
4282 utility_sdet_data *h = (utility_sdet_data*)(*phWork);
4283
4284 h->maxN = maxN;
4285 h->IPIV = malloc1d(maxN*sizeof(veclib_int));
4286 h->tmp = malloc1d(maxN*maxN*sizeof(float));
4287}
4288
4289void utility_sdet_destroy(void ** const phWork)
4290{
4291 utility_sdet_data *h = (utility_sdet_data*)(*phWork);
4292
4293 if(h!=NULL){
4294 free(h->IPIV);
4295 free(h->tmp);
4296
4297 free(h);
4298 h=NULL;
4299 *phWork = NULL;
4300 }
4301}
4302
4304(
4305 void* const hWork,
4306 float* A,
4307 int N
4308)
4309{
4311 veclib_int i, j, INFO;
4312 float det;
4313
4314 /* Simple cases: */
4315 if(N==2){
4316 return A[0]*A[3] - A[2]*A[1];
4317 }
4318 else if(N==3){
4319 return A[0] * ((A[4]*A[8]) - (A[7]*A[5])) -A[1] * (A[3]
4320 * A[8] - A[6] * A[5]) + A[2] * (A[3] * A[7] - A[6] * A[4]);
4321 }
4322 else if (N==4){
4323 return
4324 A[3] * A[6] * A[9] * A[12] - A[2] * A[7] * A[9] * A[12] -
4325 A[3] * A[5] * A[10] * A[12] + A[1] * A[7] * A[10] * A[12] +
4326 A[2] * A[5] * A[11] * A[12] - A[1] * A[6] * A[11] * A[12] -
4327 A[3] * A[6] * A[8] * A[13] + A[2] * A[7] * A[8] * A[13] +
4328 A[3] * A[4] * A[10] * A[13] - A[0] * A[7] * A[10] * A[13] -
4329 A[2] * A[4] * A[11] * A[13] + A[0] * A[6] * A[11] * A[13] +
4330 A[3] * A[5] * A[8] * A[14] - A[1] * A[7] * A[8] * A[14] -
4331 A[3] * A[4] * A[9] * A[14] + A[0] * A[7] * A[9] * A[14] +
4332 A[1] * A[4] * A[11] * A[14] - A[0] * A[5] * A[11] * A[14] -
4333 A[2] * A[5] * A[8] * A[15] + A[1] * A[6] * A[8] * A[15] +
4334 A[2] * A[4] * A[9] * A[15] - A[0] * A[6] * A[9] * A[15] -
4335 A[1] * A[4] * A[10] * A[15] + A[0] * A[5] * A[10] * A[15];
4336 }
4337
4338 /* Otherwise, the arbitrary NxN solution: */
4339 /* Work struct */
4340 if(hWork==NULL)
4341 utility_sdet_create((void**)&h, N);
4342 else{
4343 h = (utility_sdet_data*)(hWork);
4344#ifndef NDEBUG
4345 saf_assert(N<=h->maxN, "N exceeds the maximum length specified");
4346#endif
4347 }
4348
4349 /* Store in column major order */
4350 for(i=0; i<N; i++)
4351 for(j=0; j<N; j++)
4352 h->tmp[j*N+i] = A[i*N+j];
4353
4354#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4355 sgetrf_((veclib_int*)&N, (veclib_int*)&N, h->tmp, (veclib_int*)&N, h->IPIV, &INFO);
4356#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4357 INFO = clapack_sgetrf(CblasColMajor, N, N, h->tmp, N, h->IPIV);
4358#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4359 INFO = LAPACKE_sgetrf_work(CblasColMajor, N, N, h->tmp, N, h->IPIV);
4360#endif
4361
4362 if(INFO!=0) {
4363 det=0.0f;
4364#ifndef NDEBUG
4365 /* Input matrix was singular or contained illegal values so no solution
4366 * was attempted. In these cases the determinant is returned as 0. */
4367 saf_print_warning("Unable to compute determinant of input matrix. The function utility_sdet() returned 0. ");
4368#endif
4369 }
4370 else {
4371 det = 1.0f;
4372 for( i = 0; i < N; i++ ) {
4373 det*=h->tmp[i*N+i];
4374 if(h->IPIV[i] != i+1)
4375 det *= -1.0f;
4376 }
4377 }
4378
4379 /* clean-up */
4380 if(hWork == NULL)
4381 utility_sdet_destroy((void**)&h);
4382
4383 return det;
4384}
4385
4387typedef struct _utility_ddet_data {
4388 veclib_int currentWorkSize;
4389 veclib_int maxN;
4390 veclib_int *IPIV;
4391 double *tmp, *TAU, *WORK;
4393
4394void utility_ddet_create(void ** const phWork, int maxN)
4395{
4396 *phWork = malloc1d(sizeof(utility_ddet_data));
4397 utility_ddet_data *h = (utility_ddet_data*)(*phWork);
4398
4399 h->maxN = maxN;
4400 h->currentWorkSize = 0;
4401 h->IPIV = malloc1d(maxN*sizeof(veclib_int));
4402 h->tmp = malloc1d(maxN*maxN*sizeof(double));
4403 h->TAU = malloc1d(maxN*sizeof(double));
4404 h->WORK = NULL;
4405}
4406
4407void utility_ddet_destroy(void ** const phWork)
4408{
4409 utility_ddet_data *h = (utility_ddet_data*)(*phWork);
4410
4411 if(h!=NULL){
4412 free(h->IPIV);
4413 free(h->tmp);
4414 free(h->TAU);
4415 free(h->WORK);
4416
4417 free(h);
4418 h=NULL;
4419 *phWork = NULL;
4420 }
4421}
4422
4424(
4425 void* const hWork,
4426 double* A,
4427 int N
4428)
4429{
4431 veclib_int i,j,INFO, LWORK, lwork3;
4432 double lwork2, det;
4433
4434 /* Simple cases: */
4435 if(N==2){
4436 return A[0]*A[3] - A[2]*A[1];
4437 }
4438 else if(N==3){
4439 return A[0] * ((A[4]*A[8]) - (A[7]*A[5])) -A[1] * (A[3]
4440 * A[8] - A[6] * A[5]) + A[2] * (A[3] * A[7] - A[6] * A[4]);
4441 }
4442 else if (N==4){
4443 return
4444 A[3] * A[6] * A[9] * A[12] - A[2] * A[7] * A[9] * A[12] -
4445 A[3] * A[5] * A[10] * A[12] + A[1] * A[7] * A[10] * A[12] +
4446 A[2] * A[5] * A[11] * A[12] - A[1] * A[6] * A[11] * A[12] -
4447 A[3] * A[6] * A[8] * A[13] + A[2] * A[7] * A[8] * A[13] +
4448 A[3] * A[4] * A[10] * A[13] - A[0] * A[7] * A[10] * A[13] -
4449 A[2] * A[4] * A[11] * A[13] + A[0] * A[6] * A[11] * A[13] +
4450 A[3] * A[5] * A[8] * A[14] - A[1] * A[7] * A[8] * A[14] -
4451 A[3] * A[4] * A[9] * A[14] + A[0] * A[7] * A[9] * A[14] +
4452 A[1] * A[4] * A[11] * A[14] - A[0] * A[5] * A[11] * A[14] -
4453 A[2] * A[5] * A[8] * A[15] + A[1] * A[6] * A[8] * A[15] +
4454 A[2] * A[4] * A[9] * A[15] - A[0] * A[6] * A[9] * A[15] -
4455 A[1] * A[4] * A[10] * A[15] + A[0] * A[5] * A[10] * A[15];
4456 }
4457
4458 /* Otherwise, the arbitrary NxN solution: */
4459 /* Work struct */
4460 if(hWork==NULL)
4461 utility_ddet_create((void**)&h, N);
4462 else{
4463 h = (utility_ddet_data*)(hWork);
4464#ifndef NDEBUG
4465 saf_assert(N<=h->maxN, "N exceeds the maximum length specified");
4466#endif
4467 }
4468
4469 /* Store in column major order */
4470 for(i=0; i<N; i++)
4471 for(j=0; j<N; j++)
4472 h->tmp[j*N+i] = A[i*N+j];
4473
4474 /* Query how much "work" memory is required */
4475 LWORK=-1;
4476#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4477 veclib_int _N = N;
4478 dgeqrf_(&_N, &_N, h->tmp, &_N, h->TAU, &lwork2, &LWORK, &INFO);
4479#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4480 saf_print_error("No such implementation in ATLAS CLAPACK");
4481#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4482 INFO = LAPACKE_dgeqrf_work(CblasColMajor, N, N, h->tmp, N, h->TAU, &lwork2, LWORK);
4483#endif
4484 lwork3=(veclib_int)lwork2;
4485 if(lwork3>h->currentWorkSize){
4486 h->currentWorkSize = lwork3;
4487 h->WORK = realloc1d(h->WORK, h->currentWorkSize*sizeof(double));
4488 }
4489
4490 /* Decompose matrix */
4491#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4492 dgeqrf_(&_N, &_N, h->tmp, &_N, h->TAU, h->WORK, &lwork3, &INFO);
4493#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4494 saf_print_error("No such implementation in ATLAS CLAPACK");
4495#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4496 INFO = LAPACKE_dgeqrf_work(CblasColMajor, N, N, h->tmp, N, h->TAU, h->WORK, lwork3);
4497#endif
4498
4499 if(INFO!=0) {
4500 det=0.0;
4501#ifndef NDEBUG
4502 /* Input matrix was singular or contained illegal values so no solution
4503 * was attempted. In these cases the determinant is returned as 0. */
4504 saf_print_warning("Unable to compute determinant of input matrix. The function utility_ddet() returned 0. ");
4505#endif
4506 }
4507 else {
4508 det=1;
4509 for (i=0;i<N;i++)
4510 det*=h->tmp[i*N+i];
4511 /* Householder algorithm */
4512 if(N%2==0)
4513 det*=-1;
4514 }
4515
4516 /* clean-up */
4517 if(hWork == NULL)
4518 utility_ddet_destroy((void**)&h);
4519
4520 return det;
4521}
4522
4523
4524/* ========================================================================== */
4525/* Matrix Inversion (?inv) */
4526/* ========================================================================== */
4527
4529typedef struct _utility_sinv_data {
4530 int maxN;
4531 veclib_int *IPIV;
4532 float *WORK, *tmp;
4534
4535void utility_sinv_create(void ** const phWork, int maxN)
4536{
4537 *phWork = malloc1d(sizeof(utility_sinv_data));
4538 utility_sinv_data *h = (utility_sinv_data*)(*phWork);
4539
4540 h->maxN = maxN;
4541 h->IPIV = malloc1d(maxN*sizeof(veclib_int));
4542 h->tmp = malloc1d(maxN*maxN*sizeof(float));
4543 h->WORK = malloc1d(maxN*maxN*sizeof(float));
4544}
4545
4546void utility_sinv_destroy(void ** const phWork)
4547{
4548 utility_sinv_data *h = (utility_sinv_data*)(*phWork);
4549
4550 if(h!=NULL){
4551 free(h->IPIV);
4552 free(h->tmp);
4553 free(h->WORK);
4554
4555 free(h);
4556 h=NULL;
4557 *phWork = NULL;
4558 }
4559}
4560
4562(
4563 void* const hWork,
4564 float* A,
4565 float* B,
4566 const int N
4567)
4568{
4570 veclib_int i, j, N_;
4571 veclib_int LWORK;
4572 veclib_int INFO;
4573
4574 LWORK = N*N;
4575 N_ = (veclib_int)N;
4576
4577 /* Work struct */
4578 if(hWork==NULL)
4579 utility_sinv_create((void**)&h, N);
4580 else{
4581 h = (utility_sinv_data*)(hWork);
4582#ifndef NDEBUG
4583 saf_assert(N<=h->maxN, "N exceeds the maximum length specified");
4584#endif
4585 }
4586
4587 /* Store in column major order */
4588 for(i=0; i<N; i++)
4589 for(j=0; j<N; j++)
4590 h->tmp[j*N+i] = A[i*N+j];
4591
4592#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4593 sgetrf_((veclib_int*)&N_, (veclib_int*)&N_, h->tmp, (veclib_int*)&N_, h->IPIV, &INFO);
4594 sgetri_((veclib_int*)&N_, h->tmp, (veclib_int*)&N_, h->IPIV, h->WORK, &LWORK, &INFO);
4595#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4596 INFO = clapack_sgetrf(CblasColMajor, N_, N_, h->tmp, N_, h->IPIV);
4597 INFO = clapack_sgetri(CblasColMajor, N_, h->tmp, N_, h->IPIV);
4598#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4599 INFO = LAPACKE_sgetrf_work(CblasColMajor, N_, N_, h->tmp, N_, h->IPIV);
4600 INFO = LAPACKE_sgetri_work(CblasColMajor, N_, h->tmp, N_, h->IPIV, h->WORK, LWORK);
4601#endif
4602
4603 if(INFO!=0) {
4604 memset(B, 0, N*N*sizeof(float));
4605#ifndef NDEBUG
4606 /* Input matrix was singular or contained illegal values so no inversion
4607 * was attempted. In these cases the function will zero the output
4608 * matrix. */
4609 saf_print_warning("Unable to compute the inverse of input matrix. The function utility_sinv() returned a matrix of zeros. ");
4610#endif
4611 }
4612 else {
4613 /* Output in row major order */
4614 for(i=0; i<N; i++)
4615 for(j=0; j<N; j++)
4616 B[j*N+i] = h->tmp[i*N+j];
4617 }
4618
4619 /* clean-up */
4620 if(hWork == NULL)
4621 utility_sinv_destroy((void**)&h);
4622}
4623
4625typedef struct _utility_dinv_data {
4626 int maxN;
4627 veclib_int *IPIV;
4628 double *WORK, *tmp;
4630
4631void utility_dinv_create(void ** const phWork, int maxN)
4632{
4633 *phWork = malloc1d(sizeof(utility_dinv_data));
4634 utility_dinv_data *h = (utility_dinv_data*)(*phWork);
4635
4636 h->maxN = maxN;
4637 h->IPIV = malloc1d(maxN*sizeof(veclib_int));
4638 h->tmp = malloc1d(maxN*maxN*sizeof(double));
4639 h->WORK = malloc1d(maxN*maxN*sizeof(double));
4640}
4641
4642void utility_dinv_destroy(void ** const phWork)
4643{
4644 utility_dinv_data *h = (utility_dinv_data*)(*phWork);
4645
4646 if(h!=NULL){
4647 free(h->IPIV);
4648 free(h->tmp);
4649 free(h->WORK);
4650
4651 free(h);
4652 h=NULL;
4653 *phWork = NULL;
4654 }
4655}
4656
4658(
4659 void* const hWork,
4660 double* A,
4661 double* B,
4662 const int N
4663)
4664{
4666 veclib_int i, j, N_;
4667 veclib_int LWORK;
4668 veclib_int INFO;
4669
4670 LWORK = N*N;
4671 N_ = (veclib_int)N;
4672
4673 /* Work struct */
4674 if(hWork==NULL)
4675 utility_dinv_create((void**)&h, N);
4676 else{
4677 h = (utility_dinv_data*)(hWork);
4678#ifndef NDEBUG
4679 saf_assert(N<=h->maxN, "N exceeds the maximum length specified");
4680#endif
4681 }
4682
4683 /* Store in column major order */
4684 for(i=0; i<N; i++)
4685 for(j=0; j<N; j++)
4686 h->tmp[j*N+i] = A[i*N+j];
4687
4688#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4689 dgetrf_((veclib_int*)&N_, (veclib_int*)&N_, h->tmp, (veclib_int*)&N_, h->IPIV, &INFO);
4690 dgetri_((veclib_int*)&N_, h->tmp, (veclib_int*)&N_, h->IPIV, h->WORK, &LWORK, &INFO);
4691#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4692 INFO = clapack_dgetrf(CblasColMajor, N_, N_, h->tmp, N_, h->IPIV);
4693 INFO = clapack_dgetri(CblasColMajor, N_, h->tmp, N_, h->IPIV);
4694#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4695 INFO = LAPACKE_dgetrf_work(CblasColMajor, N_, N_, h->tmp, N_, h->IPIV);
4696 INFO = LAPACKE_dgetri_work(CblasColMajor, N_, h->tmp, N_, h->IPIV, h->WORK, LWORK);
4697#endif
4698
4699 if(INFO!=0) {
4700 memset(B, 0, N*N*sizeof(double));
4701#ifndef NDEBUG
4702 /* Input matrix was singular or contained illegal values so no inversion
4703 * was attempted. In these cases the function will zero the output
4704 * matrix. */
4705 saf_print_warning("Unable to compute the inverse of input matrix. The function utility_dinv() returned a matrix of zeros. ");
4706#endif
4707 }
4708 else {
4709 /* Output in row major order */
4710 for(i=0; i<N; i++)
4711 for(j=0; j<N; j++)
4712 B[j*N+i] = h->tmp[i*N+j];
4713 }
4714
4715 /* clean-up */
4716 if(hWork == NULL)
4717 utility_dinv_destroy((void**)&h);
4718}
4719
4721typedef struct _utility_cinv_data {
4722 int maxN;
4723 veclib_int *IPIV;
4724 float_complex *WORK, *tmp;
4726
4727void utility_cinv_create(void ** const phWork, int maxN)
4728{
4729 *phWork = malloc1d(sizeof(utility_cinv_data));
4730 utility_cinv_data *h = (utility_cinv_data*)(*phWork);
4731
4732 h->maxN = maxN;
4733 h->IPIV = malloc1d(maxN*sizeof(veclib_int));
4734 h->tmp = malloc1d(maxN*maxN*sizeof(float_complex));
4735 h->WORK = malloc1d(maxN*maxN*sizeof(float_complex));
4736}
4737
4738void utility_cinv_destroy(void ** const phWork)
4739{
4740 utility_cinv_data *h = (utility_cinv_data*)(*phWork);
4741
4742 if(h!=NULL){
4743 free(h->IPIV);
4744 free(h->tmp);
4745 free(h->WORK);
4746
4747 free(h);
4748 h=NULL;
4749 *phWork = NULL;
4750 }
4751}
4752
4754(
4755 void* const hWork,
4756 float_complex* A,
4757 float_complex* B,
4758 const int N
4759)
4760{
4762 veclib_int i, j, N_;
4763 veclib_int LWORK;
4764 veclib_int INFO;
4765
4766 N_ = (veclib_int)N;
4767 LWORK = N*N;
4768
4769 /* Work struct */
4770 if(hWork==NULL)
4771 utility_cinv_create((void**)&h, N);
4772 else{
4773 h = (utility_cinv_data*)(hWork);
4774#ifndef NDEBUG
4775 saf_assert(N<=h->maxN, "N exceeds the maximum length specified");
4776#endif
4777 }
4778
4779 /* Store in column major order */
4780 for(i=0; i<N; i++)
4781 for(j=0; j<N; j++)
4782 h->tmp[j*N+i] = A[i*N+j];
4783
4784#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4785 cgetrf_((veclib_int*)&N_, (veclib_int*)&N_, (veclib_float_complex*)h->tmp, (veclib_int*)&N_, h->IPIV, &INFO);
4786 cgetri_((veclib_int*)&N_, (veclib_float_complex*)h->tmp, (veclib_int*)&N_, h->IPIV, (veclib_float_complex*)h->WORK, &LWORK, &INFO);
4787#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4788 INFO = clapack_cgetrf(CblasColMajor, N_, N_, (veclib_float_complex*)h->tmp, N_, h->IPIV);
4789 INFO = clapack_cgetri(CblasColMajor, N_, (veclib_float_complex*)h->tmp, N_, h->IPIV);
4790#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4791 INFO = LAPACKE_cgetrf_work(CblasColMajor, N_, N_, (veclib_float_complex*)h->tmp, N_, h->IPIV);
4792 INFO = LAPACKE_cgetri_work(CblasColMajor, N_, (veclib_float_complex*)h->tmp, N_, h->IPIV, (veclib_float_complex*)h->WORK, LWORK);
4793#endif
4794
4795 if(INFO!=0) {
4796 memset(B, 0, N*N*sizeof(float_complex));
4797#ifndef NDEBUG
4798 /* Input matrix was singular or contained illegal values so no inversion
4799 * was attempted. In these cases the function will zero the output
4800 * matrix. */
4801 saf_print_warning("Unable to compute the inverse of input matrix. The function utility_cinv() returned a matrix of zeros. ");
4802#endif
4803 }
4804 else {
4805 /* Output in row major order */
4806 for(i=0; i<N; i++)
4807 for(j=0; j<N; j++)
4808 B[j*N+i] = h->tmp[i*N+j];
4809 }
4810
4811 /* clean-up */
4812 if(hWork == NULL)
4813 utility_cinv_destroy((void**)&h);
4814}
void utility_dimaxv(const double *a, const int len, int *index)
Double-precision, index of maximum absolute value in a vector, i.e.
#define saf_print_error(message)
Macro to print a error message along with the filename and line number.
void utility_spinv_create(void **const phWork, int maxDim1, int maxDim2)
(Optional) Pre-allocate the working struct used by utility_spinv()
void utility_dinv(void *const hWork, double *A, double *B, const int N)
Matrix inversion: double precision, i.e.
double utility_ddet(void *const hWork, double *A, int N)
Determinant of a Matrix, double precision, i,e.
void utility_dinv_create(void **const phWork, int maxN)
(Optional) Pre-allocate the working struct used by utility_dinv()
void utility_zvvsub(const double_complex *a, const double_complex *b, const int len, double_complex *c)
Double-precision, complex, vector-vector subtraction, i.e.
void utility_ceigmp(void *const hWork, const float_complex *A, const float_complex *B, const int dim, float_complex *VL, float_complex *VR, float_complex *D)
Computes eigenvalues of a matrix pair using the QZ method, single precision complex,...
#define saf_assert(x, message)
Macro to make an assertion, along with a string explaining its purpose.
void utility_cpinv_create(void **const phWork, int maxDim1, int maxDim2)
(Optional) Pre-allocate the working struct used by utility_cpinv()
void utility_zsv2cv_inds(const double_complex *sv, const int *inds, const int len, double_complex *cv)
Double-precision complex, sparse-vector to compressed vector given known indices.
void utility_cglslv_destroy(void **const phWork)
De-allocate the working struct used by utility_cglslv()
void utility_cglslv(void *const hWork, const float_complex *A, const int dim, float_complex *B, int nCol, float_complex *X)
General linear solver: single precision complex, i.e.
void utility_ddet_destroy(void **const phWork)
De-allocate the working struct used by utility_ddet()
void utility_zeig_destroy(void **const phWork)
De-allocate the working struct used by utility_zeig()
void utility_cvvmul(const float_complex *a, const float_complex *b, const int len, float_complex *c)
Single-precision, complex, element-wise vector-vector multiplication i.e.
void utility_svvmul(const float *a, const float *b, const int len, float *c)
Single-precision, element-wise vector-vector multiplication i.e.
void utility_ceig_destroy(void **const phWork)
De-allocate the working struct used by utility_ceig()
void utility_cinv(void *const hWork, float_complex *A, float_complex *B, const int N)
Matrix inversion: double precision complex, i.e.
void utility_dvsmul(double *a, const double *s, const int len, double *c)
Double-precision, multiplies each element in vector 'a' with a scalar 's', i.e.
void utility_csv2cv_inds(const float_complex *sv, const int *inds, const int len, float_complex *cv)
Single-precision complex, sparse-vector to compressed vector given known indices.
void utility_svmod(const float *a, const float *b, const int len, float *c)
Single-precision, modulus of vector elements, i.e.
void utility_ziminv(const double_complex *a, const int len, int *index)
Double-precision, complex, index of maximum absolute value in a vector, i.e.
void utility_dvvsub(const double *a, const double *b, const int len, double *c)
Double-precision, vector-vector subtraction, i.e.
void utility_dsv2cv_inds(const double *sv, const int *inds, const int len, double *cv)
Double-precision, sparse-vector to compressed vector given known indices i.e.
#define SAF_FALSE
Boolean false.
void utility_csvd_destroy(void **const phWork)
De-allocate the working struct used by utility_csvd()
#define SAF_MAX(a, b)
Returns the maximum of the two values.
void utility_ceigmp_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_ceigmp()
void utility_dglslv(void *const hWork, const double *A, const int dim, double *B, int nCol, double *X)
General linear solver: double precision, i.e.
void utility_zvsmul(double_complex *a, const double_complex *s, const int len, double_complex *c)
Double-precision, complex, multiplies each element in vector 'a' with a scalar 's',...
void utility_dglslv_destroy(void **const phWork)
De-allocate the working struct used by utility_dglslv()
void utility_schol_destroy(void **const phWork)
De-allocate the working struct used by utility_schol()
void utility_zeig_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_zeig()
void utility_svsadd(float *a, const float *s, const int len, float *c)
Single-precision, adds each element in vector 'a' with a scalar 's', i.e.
void utility_cseig_destroy(void **const phWork)
De-allocate the working struct used by utility_cseig()
void utility_ciminv(const float_complex *a, const int len, int *index)
Single-precision, complex, index of maximum absolute value in a vector, i.e.
void utility_dvvadd(const double *a, const double *b, const int len, double *c)
Double-precision, vector-vector addition, i.e.
void utility_svssub(float *a, const float *s, const int len, float *c)
Single-precision, subtracts each element in vector 'a' with a scalar 's', i.e.
void utility_svvsub(const float *a, const float *b, const int len, float *c)
Single-precision, vector-vector subtraction, i.e.
void utility_ssvd_create(void **const phWork, int maxDim1, int maxDim2)
(Optional) Pre-allocate the working struct used by utility_ssvd()
void utility_sglslvt(void *const hWork, const float *A, const int dim, float *B, int nCol, float *X)
General linear solver (the other way): single precision, i.e.
void utility_ssvd(void *const hWork, const float *A, const int dim1, const int dim2, float *U, float *S, float *V, float *sing)
Singular value decomposition: single precision, i.e.
void utility_zimaxv(const double_complex *a, const int len, int *index)
Double-precision, complex, index of maximum absolute value in a vector, i.e.
void utility_zeigmp(void *const hWork, const double_complex *A, const double_complex *B, const int dim, double_complex *VL, double_complex *VR, double_complex *D)
Computes eigenvalues of a matrix pair using the QZ method, double precision complex,...
void utility_cinv_destroy(void **const phWork)
De-allocate the working struct used by utility_cinv()
void utility_ceig(void *const hWork, const float_complex *A, const int dim, float_complex *VL, float_complex *VR, float_complex *D, float_complex *eig)
Eigenvalue decomposition of a NON-SYMMETRIC matrix: single precision complex, i.e.
void utility_cvvadd(const float_complex *a, const float_complex *b, const int len, float_complex *c)
Single-precision, complex, vector-vector addition, i.e.
void utility_sinv_create(void **const phWork, int maxN)
(Optional) Pre-allocate the working struct used by utility_sinv()
void utility_zglslv(void *const hWork, const double_complex *A, const int dim, double_complex *B, int nCol, double_complex *X)
General linear solver: double precision complex, i.e.
void utility_sglslv(void *const hWork, const float *A, const int dim, float *B, int nCol, float *X)
General linear solver: single precision, i.e.
CONJ_FLAG
Whether a vector should be conjugated or not (e.g.
void utility_svvcopy(const float *a, const int len, float *c)
Single-precision, vector-vector copy, i.e.
void utility_cimaxv(const float_complex *a, const int len, int *index)
Single-precision, complex, index of maximum absolute value in a vector, i.e.
#define SAF_TRUE
Boolean true.
void utility_spinv(void *const hWork, const float *inM, const int dim1, const int dim2, float *outM)
General matrix pseudo-inverse (the svd way): single precision, i.e.
void utility_dpinv_create(void **const phWork, int maxDim1, int maxDim2)
(Optional) Pre-allocate the working struct used by utility_dpinv()
void utility_cpinv(void *const hWork, const float_complex *inM, const int dim1, const int dim2, float_complex *outM)
General matrix pseudo-inverse (the svd way): single precision complex, i.e.
void utility_cslslv_create(void **const phWork, int maxDim, int maxNCol)
(Optional) Pre-allocate the working struct used by utility_cslslv()
void utility_sseig_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_sseig()
void utility_sinv(void *const hWork, float *A, float *B, const int N)
Matrix inversion: single precision, i.e.
void utility_zglslv_destroy(void **const phWork)
De-allocate the working struct used by utility_zglslv()
void utility_cpinv_destroy(void **const phWork)
De-allocate the working struct used by utility_cpinv()
void utility_sglslvt_destroy(void **const phWork)
De-allocate the working struct used by utility_sglslvt()
void utility_dpinv_destroy(void **const phWork)
De-allocate the working struct used by utility_dpinv()
void utility_cseig(void *const hWork, const float_complex *A, const int dim, int sortDecFLAG, float_complex *V, float_complex *D, float *eig)
Eigenvalue decomposition of a SYMMETRIC/HERMITION matrix: single precision complex,...
void utility_csvd(void *const hWork, const float_complex *A, const int dim1, const int dim2, float_complex *U, float_complex *S, float_complex *V, float *sing)
Singular value decomposition: single precision complex, i.e.
void utility_dpinv(void *const hWork, const double *inM, const int dim1, const int dim2, double *outM)
General matrix pseudo-inverse (the svd way): double precision, i.e.
void utility_cseig_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_cseig()
void utility_diminv(const double *a, const int len, int *index)
Double-precision, index of minimum absolute value in a vector, i.e.
void utility_sslslv_destroy(void **const phWork)
De-allocate the working struct used by utility_sslslv()
void utility_schol(void *const hWork, const float *A, const int dim, float *X)
Cholesky factorisation of a symmetric matrix positive-definate matrix: single precision,...
#define SAF_MIN(a, b)
Returns the minimum of the two values.
void utility_zeig(void *const hWork, const double_complex *A, const int dim, double_complex *VL, double_complex *VR, double_complex *D, double_complex *eig)
Eigenvalue decomposition of a NON-SYMMETRIC matrix: double precision complex, i.e.
void utility_cvabs(const float_complex *a, const int len, float *c)
Single-precision, complex, absolute values of vector elements, i.e.
void utility_cvvdot(const float_complex *a, const float_complex *b, const int len, CONJ_FLAG flag, float_complex *c)
Single-precision, complex, vector-vector dot product, i.e.
void utility_sglslv_create(void **const phWork, int maxDim, int maxNCol)
(Optional) Pre-allocate the working struct used by utility_sglslv()
void utility_cinv_create(void **const phWork, int maxN)
(Optional) Pre-allocate the working struct used by utility_cinv()
void utility_cvvsub(const float_complex *a, const float_complex *b, const int len, float_complex *c)
Single-precision, complex, vector-vector subtraction, i.e.
void utility_zpinv(void *const hWork, const double_complex *inM, const int dim1, const int dim2, double_complex *outM)
General matrix pseudo-inverse (the svd way): double precision complex, i.e.
void utility_zpinv_create(void **const phWork, int maxDim1, int maxDim2)
(Optional) Pre-allocate the working struct used by utility_zpinv()
void utility_sseig(void *const hWork, const float *A, const int dim, int sortDecFLAG, float *V, float *D, float *eig)
Eigenvalue decomposition of a SYMMETRIC matrix: single precision, i.e.
void utility_dinv_destroy(void **const phWork)
De-allocate the working struct used by utility_dinv()
void utility_cchol(void *const hWork, const float_complex *A, const int dim, float_complex *X)
Cholesky factorisation of a hermitian positive-definate matrix: single precision complex,...
#define saf_print_warning(message)
Macro to print a warning message along with the filename and line number.
void utility_cchol_destroy(void **const phWork)
De-allocate the working struct used by utility_cchol()
void utility_zvvcopy(const double_complex *a, const int len, double_complex *c)
double-precision, complex, vector-vector copy, i.e.
float utility_sdet(void *const hWork, float *A, int N)
Determinant of a Matrix, single precision, i,e.
void utility_zeigmp_destroy(void **const phWork)
De-allocate the working struct used by utility_zeigmp()
void utility_schol_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_schol()
void utility_sslslv_create(void **const phWork, int maxDim, int maxNCol)
(Optional) Pre-allocate the working struct used by utility_sslslv()
void utility_zvvadd(const double_complex *a, const double_complex *b, const int len, double_complex *c)
Double-precision, complex, vector-vector addition, i.e.
void utility_ceigmp_destroy(void **const phWork)
De-allocate the working struct used by utility_ceigmp()
void utility_csvd_create(void **const phWork, int maxDim1, int maxDim2)
(Optional) Pre-allocate the working struct used by utility_csvd()
void utility_cvvcopy(const float_complex *a, const int len, float_complex *c)
Single-precision, complex, vector-vector copy, i.e.
void utility_svvadd(const float *a, const float *b, const int len, float *c)
Single-precision, vector-vector addition, i.e.
void utility_svrecip(const float *a, const int len, float *c)
Single-precision, vector-reciprocal/inversion, i.e.
void utility_sglslvt_create(void **const phWork, int maxDim, int maxNCol)
(Optional) Pre-allocate the working struct used by utility_sglslvt()
void utility_spinv_destroy(void **const phWork)
De-allocate the working struct used by utility_spinv()
void utility_sslslv(void *const hWork, const float *A, const int dim, float *B, int nCol, float *X)
Linear solver for SYMMETRIC positive-definate 'A': single precision, i.e.
void utility_ddet_create(void **const phWork, int maxN)
(Optional) Pre-allocate the working struct used by utility_ddet()
void utility_sdet_create(void **const phWork, int maxN)
(Optional) Pre-allocate the working struct used by utility_sdet()
void utility_cvsmul(float_complex *a, const float_complex *s, const int len, float_complex *c)
Single-precision, complex, multiplies each element in vector 'a' with a scalar 's',...
void utility_cglslv_create(void **const phWork, int maxDim, int maxNCol)
(Optional) Pre-allocate the working struct used by utility_cglslv()
void utility_ceig_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_ceig()
void utility_dglslv_create(void **const phWork, int maxDim, int maxNCol)
(Optional) Pre-allocate the working struct used by utility_dglslv()
void utility_svvdot(const float *a, const float *b, const int len, float *c)
Single-precision, vector-vector dot product, i.e.
void utility_svsdiv(const float *a, const float *s, const int len, float *c)
Single-precision, divides each element in vector 'a' with a scalar 's', i.e.
void utility_cslslv_destroy(void **const phWork)
De-allocate the working struct used by utility_cslslv()
void utility_zglslv_create(void **const phWork, int maxDim, int maxNCol)
(Optional) Pre-allocate the working struct used by utility_zglslv()
void utility_cvconj(const float_complex *a, const int len, float_complex *c)
Single-precision, complex, vector-conjugate, i.e.
void utility_dvvcopy(const double *a, const int len, double *c)
double-precision, vector-vector copy, i.e.
void utility_sseig_destroy(void **const phWork)
De-allocate the working struct used by utility_sseig()
void utility_svabs(const float *a, const int len, float *c)
Single-precision, absolute values of vector elements, i.e.
void utility_svsmul(float *a, const float *s, const int len, float *c)
Single-precision, multiplies each element in vector 'a' with a scalar 's', i.e.
void utility_zvconj(const double_complex *a, const int len, double_complex *c)
Double-precision, complex, vector-conjugate, i.e.
void utility_sinv_destroy(void **const phWork)
De-allocate the working struct used by utility_sinv()
void utility_cchol_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_cchol()
void utility_sdet_destroy(void **const phWork)
De-allocate the working struct used by utility_sdet()
void utility_siminv(const float *a, const int len, int *index)
Single-precision, index of minimum absolute value in a vector, i.e.
void utility_zeigmp_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_zeigmp()
void utility_ssv2cv_inds(const float *sv, const int *inds, const int len, float *cv)
Single-precision, sparse-vector to compressed vector given known indices i.e.
void utility_sglslv_destroy(void **const phWork)
De-allocate the working struct used by utility_sglslv()
void utility_simaxv(const float *a, const int len, int *index)
Single-precision, index of maximum absolute value in a vector, i.e.
void utility_ssvd_destroy(void **const phWork)
De-allocate the working struct used by utility_ssvd()
void utility_zpinv_destroy(void **const phWork)
De-allocate the working struct used by utility_zpinv()
void utility_cslslv(void *const hWork, const float_complex *A, const int dim, float_complex *B, int nCol, float_complex *X)
Linear solver for HERMITIAN positive-definate 'A': single precision complex, i.e.
@ NO_CONJ
Do not take the conjugate.
@ CONJ
Take the conjugate.
void * malloc1d(size_t dim1_data_size)
1-D malloc (same as malloc, but with error checking)
Definition md_malloc.c:59
void * realloc1d(void *ptr, size_t dim1_data_size)
1-D realloc (same as realloc, but with error checking)
Definition md_malloc.c:79
Include header for SAF externals.
Main header for the utilities module (SAF_UTILITIES_MODULE)
double veclib_double
real: 8-bytes
MKL_INT veclib_int
integer: 4-bytes
MKL_Complex16 veclib_double_complex
complex: 16-bytes
MKL_Complex8 veclib_float_complex
complex: 8-bytes
#define SAF_INTEL_MKL_VML_MODE
Taken from the Intel MKL VML developers reference documentation: The flag(s) that can be passed to In...
float veclib_float
real: 4-bytes
Data structure for utility_cchol()
Data structure for utility_ceig()
Data structure for utility_ceigmp()
Data structure for utility_cglslv()
Data structure for utility_cinv()
Data structure for utility_cpinv()
Data structure for utility_cseig()
Data structure for utility_cslslv()
Data structure for utility_csvd()
Data structure for utility_ddet()
Data structure for utility_dglslv()
Data structure for utility_dinv()
Data structure for utility_dpinv()
Data structure for utility_schol()
Data structure for utility_sdet()
Data structure for utility_sglslv()
Data structure for utility_sglslvt()
Data structure for utility_sinv()
Data structure for utility_spinv()
Data structure for utility_sseig()
Data structure for utility_sslslv()
Data structure for utility_ssvd()
Data structure for utility_zeig()
Data structure for utility_zeigmp()
Data structure for utility_zglslv()
Data structure for utility_zpinv()