SAF
Loading...
Searching...
No Matches
saf_utility_veclib.c
Go to the documentation of this file.
1/*
2 * Copyright 2016-2018 Leo McCormack
3 *
4 * Permission to use, copy, modify, and/or distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
7 *
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
9 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
10 * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
11 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
12 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
13 * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
14 * PERFORMANCE OF THIS SOFTWARE.
15 */
16
48#include "saf_utilities.h"
49#include "saf_externals.h"
50
51/* Specify which LAPACK interface should be used: */
52#if defined(SAF_USE_INTEL_MKL_LP64)
53/* Note that Intel MKL LP64 supports Fortran LAPACK and LAPACKE interfaces: */
54# define SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE
55#elif defined(SAF_USE_INTEL_MKL_ILP64)
56/* Note that Intel MKL ILP64 will only work with the LAPACKE interface: */
57# define SAF_VECLIB_USE_LAPACKE_INTERFACE
58#elif defined(SAF_USE_OPEN_BLAS_AND_LAPACKE)
59# define SAF_VECLIB_USE_LAPACKE_INTERFACE
60#elif defined(SAF_USE_ATLAS)
61# define SAF_VECLIB_USE_CLAPACK_INTERFACE
62#elif defined(__APPLE__) && defined(SAF_USE_APPLE_ACCELERATE)
63# define SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE
64#elif defined(SAF_USE_GSL)
65# define SAF_VECLIB_USE_GSL_LINALG
66#else
67# error No LAPACK interface was specified!
68#endif
69
70/* Additional flags for Intel MKL */
71#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
72# ifndef SAF_INTEL_MKL_VML_MODE
92# define SAF_INTEL_MKL_VML_MODE ( VML_LA | VML_FTZDAZ_ON )
93# endif
94#endif
95
96/* These are mainly just to remove compiler warnings: */
97#if defined(__APPLE__) && defined(SAF_USE_APPLE_ACCELERATE)
98 typedef __CLPK_integer veclib_int;
99 typedef __CLPK_real veclib_float;
100 typedef __CLPK_doublereal veclib_double;
101 typedef __CLPK_complex veclib_float_complex;
102 typedef __CLPK_doublecomplex veclib_double_complex;
103#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
104# ifdef SAF_USE_INTEL_MKL_LP64
105 typedef MKL_INT veclib_int;
106# else /* SAF_USE_INTEL_MKL_ILP64: */
107 typedef MKL_INT veclib_int;
108# endif
109 typedef float veclib_float;
110 typedef double veclib_double;
111 typedef MKL_Complex8 veclib_float_complex;
112 typedef MKL_Complex16 veclib_double_complex;
113#elif defined(SAF_USE_OPEN_BLAS_AND_LAPACKE)
114 typedef lapack_int veclib_int;
115 typedef float veclib_float;
116 typedef double veclib_double;
117 typedef lapack_complex_float veclib_float_complex;
118 typedef lapack_complex_double veclib_double_complex;
119#else
120 typedef int veclib_int;
121 typedef float veclib_float;
122 typedef double veclib_double;
123 typedef float_complex veclib_float_complex;
124 typedef double_complex veclib_double_complex;
125#endif
126
127
128/* ========================================================================== */
129/* Built-in CBLAS Functions (Level 0) */
130/* ========================================================================== */
131
132#ifdef SAF_USE_BUILT_IN_NAIVE_CBLAS
133void cblas_scopy(const int N, const float *X, const int incX, float *Y, const int incY){
134 int i;
135#if defined(SAF_ENABLE_SIMD)
136 if(incX==1 && incY==1){
137 for(i=0; i<(N-3); i+=4)
138 _mm_storeu_ps(Y+i, _mm_loadu_ps(X+i));
139 for(;i<N; i++) /* The residual (if N was not divisable by 4): */
140 Y[i] = X[i];
141 }
142 else
143 for(i=0; i<N; i++)
144 Y[i*incY] = X[i*incX];
145#else
146 for(i=0; i<N; i++)
147 Y[i*incY] = X[i*incX];
148#endif
149}
150
151void cblas_dcopy(const int N, const double *X, const int incX, double *Y, const int incY){
152 int i,j;
153 for(i=j=0; i<N; i+=incX, j+=incY)
154 Y[i] = X[j];
155}
156
157void cblas_ccopy(const int N, const void *X, const int incX, void *Y, const int incY){
158 int i,j;
159 float_complex *cX, *cY;
160 cX = (float_complex*)X;
161 cY = (float_complex*)Y;
162 for(i=j=0; i<N; i+=incX, j+=incY)
163 cY[i] = cX[j];
164}
165
166void cblas_zcopy(const int N, const void *X, const int incX, void *Y, const int incY){
167 int i,j;
168 double_complex *cX, *cY;
169 cX = (double_complex*)X;
170 cY = (double_complex*)Y;
171 for(i=j=0; i<N; i+=incX, j+=incY)
172 cY[i] = cX[j];
173}
174
175void cblas_saxpy(const int N, const float alpha, const float* X, const int incX, float* Y, const int incY) {
176 int i,j;
177 for (i=j=0; i<N; i+=incX, j+=incY)
178 Y[i] = alpha * X[i] + Y[i];
179}
180
181void cblas_daxpy(const int N, const double alpha, const double* X, const int incX, double* Y, const int incY) {
182 int i,j;
183 for (i=j=0; i<N; i+=incX, j+=incY)
184 Y[i] = alpha * X[i] + Y[i];
185}
186
187float cblas_sdot(const int N, const float* X, const int incX, const float* Y, const int incY){
188 int i;
189 float ret;
190#if defined(SAF_ENABLE_SIMD)
191 float sum2;
192 if(incX==1 && incY==1){
193 sum2 = 0.0f;
194 __m128 sum = _mm_setzero_ps();
195 /* Sum over all elements in groups of 4 */
196 for(i=0; i<(N-3); i+=4)
197 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(X+i), _mm_loadu_ps(Y+i)));
198 /* sum the first 2 elements with the second 2 elements and store in the first 2 elements */
199 sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum)/* copy elements 3 and 4 to 1 and 2 */);
200 /* sum the first 2 elements and store in the first element */
201 sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 2, 0, 1))/* shuffle so that element 2 goes to element 1 */);
202 _mm_store_ss(&ret, sum); /* get the first element (the result) */
203 for(;i<N; i++) /* The residual (if N was not divisable by 4): */
204 sum2 += X[i] * Y[i];
205 ret += sum2;
206 }
207 else
208 for(i=0; i<N; i++)
209 ret += X[i*incX] * Y[i*incY];
210#else
211 ret = 0.0f;
212 for(i=0; i<N; i++)
213 ret += X[i*incX] * Y[i*incY];
214#endif
215 return ret;
216}
217
218#endif
219
220
221/* ========================================================================== */
222/* Built-in CBLAS Functions (Level 3) */
223/* ========================================================================== */
224
225#ifdef SAF_USE_BUILT_IN_NAIVE_CBLAS
226void cblas_sgemm(const CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE TransA,
227 const CBLAS_TRANSPOSE TransB, const int M, const int N,
228 const int K, const float alpha, const float *A,
229 const int lda, const float *B, const int ldb,
230 const float beta, float *C, const int ldc)
231{
232 saf_assert(0, "INCOMPLETE and UNTESTED");
233 int i,j,k;
234 int _transA;
235 int _transB;
236
237 switch(Layout){
238 case CblasRowMajor:
239 _transA = (TransA == CblasNoTrans) ? SAF_FALSE : SAF_TRUE;
240 _transB = (TransB == CblasNoTrans) ? SAF_FALSE : SAF_TRUE;
241 break;
242 case CblasColMajor:
243 _transA = (TransA == CblasNoTrans) ? SAF_TRUE : SAF_FALSE;
244 _transB = (TransB == CblasNoTrans) ? SAF_TRUE : SAF_FALSE;
245 break;
246 }
247
248 if(!_transA && !_transB){
249 for(i=0; i<M; i++){
250 for(j=0; j<N; j++){
251 C[i*ldc+j] = 0.0f;
252 for(k=0; k<K; k++)
253 C[i*ldc+j] += A[i*lda+k] * B[k*ldb+j];
254 }
255 }
256 }
257 else if(_transA && !_transB){
258 }
259 else if(_transA && _transB){
260 }
261}
262#endif
263
264
265/* ========================================================================== */
266/* Find Index of Min-Abs-Value (?iminv) */
267/* ========================================================================== */
268
270(
271 const float* a,
272 const int len,
273 int* index
274)
275{
276#if defined(SAF_USE_APPLE_ACCELERATE)
277 float minVal;
278 vDSP_Length ind_tmp;
279 vDSP_minmgvi(a, 1, &minVal, &ind_tmp, (vDSP_Length)len);
280 *index = (int)ind_tmp;
281#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
282 *index = (int)cblas_isamin(len, a, 1);
283#else
284 int i;
285 *index = 0;
286 float minVal=FLT_MAX;
287 for(i=0; i<len; i++){
288 if(fabsf(a[i])<minVal){
289 minVal = fabsf(a[i]);
290 *index = i;
291 }
292 }
293#endif
294}
295
297(
298 const float_complex* a,
299 const int len,
300 int* index
301)
302{
303#if defined(SAF_USE_APPLE_ACCELERATE) /* Unfortunately requires a malloc call */
304 float* abs_a;
305 float minVal;
306 abs_a = malloc1d(len*sizeof(float));
307 vDSP_vdist((float*)a/*real*/, 2, (float*)a+1/*imag*/, 2, abs_a, 1, (vDSP_Length)len); /* cabsf */
308 vDSP_Length ind_tmp;
309 vDSP_minmgvi(abs_a, 1, &minVal, &ind_tmp, (vDSP_Length)len);
310 *index = (int)ind_tmp;
311 free(abs_a);
312#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
313 *index = (int)cblas_icamin(len, a, 1);
314#else
315 int i;
316 *index = 0;
317 float minVal=FLT_MAX;
318 for(i=0; i<len; i++){
319 if(cabsf(a[i])<minVal){
320 minVal = cabsf(a[i]);
321 *index = i;
322 }
323 }
324#endif
325}
326
328(
329 const double* a,
330 const int len,
331 int* index
332)
333{
334#if defined(SAF_USE_APPLE_ACCELERATE)
335 double minVal;
336 vDSP_Length ind_tmp;
337 vDSP_minmgviD(a, 1, &minVal, &ind_tmp, (vDSP_Length)len);
338 *index = (int)ind_tmp;
339#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
340 *index = (int)cblas_idamin(len, a, 1);
341#else
342 int i;
343 *index = 0;
344 double minVal=DBL_MAX;
345 for(i=0; i<len; i++){
346 if(fabs(a[i])<minVal){
347 minVal = fabs(a[i]);
348 *index = i;
349 }
350 }
351#endif
352}
353
355(
356 const double_complex* a,
357 const int len,
358 int* index
359)
360{
361#if defined(SAF_USE_APPLE_ACCELERATE) /* Unfortunately requires a malloc call */
362 double* abs_a;
363 double minVal;
364 abs_a = malloc1d(len*sizeof(double));
365 vDSP_vdistD((double*)a/*real*/, 2, (double*)a+1/*imag*/, 2, abs_a, 1, (vDSP_Length)len); /* cabs */
366 vDSP_Length ind_tmp;
367 vDSP_minmgviD(abs_a, 1, &minVal, &ind_tmp, (vDSP_Length)len);
368 *index = (int)ind_tmp;
369 free(abs_a);
370#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
371 *index = (int)cblas_izamin(len, a, 1);
372#else
373 int i;
374 *index = 0;
375 double minVal=DBL_MAX;
376 for(i=0; i<len; i++){
377 if(cabs(a[i])<minVal){
378 minVal = cabs(a[i]);
379 *index = i;
380 }
381 }
382#endif
383}
384
385
386/* ========================================================================== */
387/* Find Index of Max-Abs-Value (?imaxv) */
388/* ========================================================================== */
389
391(
392 const float* a,
393 const int len,
394 int* index
395)
396{
397 *index = (int)cblas_isamax(len, a, 1);
398}
399
401(
402 const float_complex* a,
403 const int len,
404 int* index
405)
406{
407 *index = (int)cblas_icamax(len, a, 1);
408}
409
411(
412 const double* a,
413 const int len,
414 int* index
415)
416{
417 *index = (int)cblas_idamax(len, a, 1);
418}
419
421(
422 const double_complex* a,
423 const int len,
424 int* index
425)
426{
427 *index = (int)cblas_izamax(len, a, 1);
428}
429
430
431/* ========================================================================== */
432/* Vector-Abs (?vabs) */
433/* ========================================================================== */
434
436(
437 const float* a,
438 const int len,
439 float* c
440)
441{
442#if defined(SAF_USE_INTEL_IPP)
443 ippsAbs_32f((Ipp32f*)a, (Ipp32f*)c, len);
444#elif defined(SAF_USE_APPLE_ACCELERATE)
445 vDSP_vabs(a, 1, c, 1, (vDSP_Length)len);
446#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
447 vmsAbs(len, a, c, SAF_INTEL_MKL_VML_MODE);
448#else
449 int i;
450 for(i=0; i<len; i++)
451 c[i] = fabsf(a[i]);
452#endif
453}
454
456(
457 const float_complex* a,
458 const int len,
459 float* c
460)
461{
462#if defined(SAF_USE_INTEL_IPP)
463 ippsMagnitude_32fc((Ipp32fc*)a, (Ipp32f*)c, len);
464#elif defined(SAF_USE_APPLE_ACCELERATE)
465 vDSP_vdist((float*)a/*real*/, 2, (float*)a+1/*imag*/, 2, c, 1, (vDSP_Length)len);
466#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
467 vmcAbs(len, (MKL_Complex8*)a, c, SAF_INTEL_MKL_VML_MODE);
468#else
469 int i;
470 for(i=0; i<len; i++)
471 c[i] = cabsf(a[i]);
472#endif
473}
474
475
476/* ========================================================================== */
477/* Vector-Modulus (?vmod) */
478/* ========================================================================== */
479
481(
482 const float* a,
483 const float* b,
484 const int len,
485 float* c
486)
487{
488#if defined(SAF_USE_APPLE_ACCELERATE)
489 vvfmodf(c, a, b, &len);
490#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
491 vmsFmod(len, a, b, c, SAF_INTEL_MKL_VML_MODE);
492#else
493 int i;
494 for(i=0; i<len; i++)
495 c[i] = fmodf(a[i], b[i]);
496#endif
497}
498
499
500/* ========================================================================== */
501/* Vector-Reciprocal (?vrecip) */
502/* ========================================================================== */
503
505(
506 const float* a,
507 const int len,
508 float* c
509)
510{
511#if defined(SAF_USE_INTEL_IPP)
512 ippsDivCRev_32f((Ipp32f*)a, 1.0f, (Ipp32f*)c, len);
513#elif defined(SAF_USE_APPLE_ACCELERATE)
514 float one;
515 one = 1.0f;
516 vDSP_svdiv(&one, a, 1, c, 1, (vDSP_Length)len);
517#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
518 vmsInv(len, a, c, SAF_INTEL_MKL_VML_MODE);
519#elif defined(SAF_ENABLE_SIMD)
520 int i;
521 i = 0;
522# if defined(__AVX512F__)
523 for(; i<(len-15); i+=16)
524 _mm512_storeu_ps(c+i, _mm512_rcp14_ps(_mm512_loadu_ps(a+i)));
525# endif
526# if defined(__AVX__) && defined(__AVX2__)
527 for(; i<(len-7); i+=8)
528 _mm256_storeu_ps(c+i, _mm256_rcp_ps(_mm256_loadu_ps(a+i)));
529# endif
530# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
531 for(; i<(len-3); i+=4)
532 _mm_storeu_ps(c+i, _mm_rcp_ps(_mm_loadu_ps(a+i)));
533# endif
534 for(;i<len; i++) /* The residual (if len was not divisable by the step size): */
535 c[i] = 1.0f/a[i];
536#else
537 int i;
538 for(i=0; i<len; i++)
539 c[i] = 1.0f/a[i];
540#endif
541}
542
543
544/* ========================================================================== */
545/* Vector-Conjugate (?vconj) */
546/* ========================================================================== */
547
549(
550 const float_complex* a,
551 const int len,
552 float_complex* c
553)
554{
555#if defined(SAF_USE_INTEL_IPP)
556 ippsConj_32fc((Ipp32fc*)a, (Ipp32fc*)c, len);
557#else
558 cblas_ccopy(len, a, 1, c, 1);
559 cblas_sscal(len, -1.0f, ((float*)c)+1, 2);
560#endif
561}
562
564(
565 const double_complex* a,
566 const int len,
567 double_complex* c
568)
569{
570#if defined(SAF_USE_INTEL_IPP)
571 ippsConj_64fc((Ipp64fc*)a, (Ipp64fc*)c, len);
572#else
573 cblas_zcopy(len, a, 1, c, 1);
574 cblas_dscal(len, -1.0, ((double*)c)+1, 2);
575#endif
576}
577
578
579/* ========================================================================== */
580/* Vector-Vector Copy (?vvcopy) */
581/* ========================================================================== */
582
584(
585 const float* a,
586 const int len,
587 float* c
588)
589{
590 cblas_scopy(len, a, 1, c, 1);
591}
592
594(
595 const float_complex* a,
596 const int len,
597 float_complex* c
598)
599{
600 cblas_ccopy(len, a, 1, c, 1);
601}
602
604(
605 const double* a,
606 const int len,
607 double* c
608)
609{
610 cblas_dcopy(len, a, 1, c, 1);
611}
612
614(
615 const double_complex* a,
616 const int len,
617 double_complex* c
618)
619{
620 cblas_zcopy(len, a, 1, c, 1);
621}
622
623
624/* ========================================================================== */
625/* Vector-Vector Addition (?vvadd) */
626/* ========================================================================== */
627
629(
630 const float* a,
631 const float* b,
632 const int len,
633 float* c
634)
635{
636 /* Checks: */
637#ifndef NDEBUG
638 saf_assert(c!=NULL, "'c' can no longer be NULL");
639 saf_assert(a!=c && b!=c, "In-place operation is no longer supported, use e.g. cblas_saxby() instead.");
640#endif
641
642 /* The operation: */
643#if defined(SAF_USE_INTEL_IPP)
644 ippsAdd_32f((Ipp32f*)a, (Ipp32f*)b, (Ipp32f*)c, len);
645#elif defined(SAF_USE_APPLE_ACCELERATE)
646 vDSP_vadd(a, 1, b, 1, c, 1, (vDSP_Length)len);
647#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
648 vmsAdd(len, a, b, c, SAF_INTEL_MKL_VML_MODE);
649#elif defined(SAF_ENABLE_SIMD)
650 int i;
651 i = 0;
652# if defined(__AVX512F__)
653 for(; i<(len-15); i+=16)
654 _mm512_storeu_ps(c+i, _mm512_add_ps(_mm512_loadu_ps(a+i), _mm512_loadu_ps(b+i)));
655# endif
656# if defined(__AVX__) && defined(__AVX2__)
657 for(; i<(len-7); i+=8)
658 _mm256_storeu_ps(c+i, _mm256_add_ps(_mm256_loadu_ps(a+i), _mm256_loadu_ps(b+i)));
659# endif
660# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
661 for(; i<(len-3); i+=4)
662 _mm_storeu_ps(c+i, _mm_add_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
663# endif
664 for(; i<len; i++) /* The residual (if len was not divisable by the step size): */
665 c[i] = a[i] + b[i];
666#elif defined(NDEBUG)
667 int i;
668 /* try to indirectly "trigger" some compiler optimisations */
669 for(i=0; i<len-3; i+=4){
670 c[i] = a[i] + b[i];
671 c[i+1] = a[i+1] + b[i+1];
672 c[i+2] = a[i+2] + b[i+2];
673 c[i+3] = a[i+3] + b[i+3];
674 }
675 for(; i<len; i++) /* The residual (if len was not divisable by the step size): */
676 c[i] = a[i] + b[i];
677#else
678 int j;
679 for (j = 0; j < len; j++)
680 c[j] = a[j] + b[j];
681#endif
682}
683
685(
686 const float_complex* a,
687 const float_complex* b,
688 const int len,
689 float_complex* c
690)
691{
692 /* Checks: */
693#ifndef NDEBUG
694 saf_assert(c!=NULL, "'c' can no longer be NULL");
695 saf_assert(a!=c && b!=c, "In-place operation is no longer supported, use e.g. cblas_caxby() instead.");
696#endif
697
698 /* The operation: */
699#if defined(SAF_USE_INTEL_IPP)
700 ippsAdd_32fc((Ipp32fc*)a, (Ipp32fc*)b, (Ipp32fc*)c, len);
701#elif defined(SAF_USE_APPLE_ACCELERATE)
702 vDSP_vadd((float*)a, 1, (float*)b, 1, (float*)c, 1, /*re+im*/2*(vDSP_Length)len);
703#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
704 vmcAdd(len, (MKL_Complex8*)a, (MKL_Complex8*)b, (MKL_Complex8*)c, SAF_INTEL_MKL_VML_MODE);
705#elif defined(SAF_ENABLE_SIMD)
706 int i, len2;
707 float* sa, *sb, *sc;
708 len2 = len*2;
709 sa = (float*)a; sb = (float*)b; sc = (float*)c;
710 i = 0;
711# if defined(__AVX512F__)
712 for(; i<(len2-15); i+=16)
713 _mm512_storeu_ps(sc+i, _mm512_add_ps(_mm512_loadu_ps(sa+i), _mm512_loadu_ps(sb+i)));
714# endif
715# if defined(__AVX__) && defined(__AVX2__)
716 for(; i<(len2-7); i+=8)
717 _mm256_storeu_ps(sc+i, _mm256_add_ps(_mm256_loadu_ps(sa+i), _mm256_loadu_ps(sb+i)));
718# endif
719# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
720 for(; i<(len2-3); i+=4)
721 _mm_storeu_ps(sc+i, _mm_add_ps(_mm_loadu_ps(sa+i), _mm_loadu_ps(sb+i)));
722# endif
723 for(; i<len2; i++) /* The residual (if len2 was not divisable by the step size): */
724 sc[i] = sa[i] + sb[i];
725#elif __STDC_VERSION__ >= 199901L && defined(NDEBUG)
726 int i;
727 /* try to indirectly "trigger" some compiler optimisations */
728 for(i=0; i<len-3; i+=4){
729 c[i] = a[i] + b[i];
730 c[i+1] = a[i+1] + b[i+1];
731 c[i+2] = a[i+2] + b[i+2];
732 c[i+3] = a[i+3] + b[i+3];
733 }
734 for(; i<len; i++) /* The residual (if len was not divisable by the step size): */
735 c[i] = a[i] + b[i];
736#else
737 int j;
738 for (j = 0; j < len; j++)
739 c[j] = ccaddf(a[j], b[j]);
740#endif
741}
742
744(
745 const double* a,
746 const double* b,
747 const int len,
748 double* c
749)
750{
751 /* Checks: */
752#ifndef NDEBUG
753 saf_assert(c!=NULL, "'c' can no longer be NULL");
754 saf_assert(a!=c && b!=c, "In-place operation is no longer supported, use e.g. cblas_daxby() instead.");
755#endif
756
757 /* The operation: */
758#if defined(SAF_USE_INTEL_IPP)
759 ippsAdd_64f((Ipp64f*)a, (Ipp64f*)b, (Ipp64f*)c, len);
760#elif defined(SAF_USE_APPLE_ACCELERATE)
761 vDSP_vaddD(a, 1, b, 1, c, 1, (vDSP_Length)len);
762#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
763 vmdAdd(len, a, b, c, SAF_INTEL_MKL_VML_MODE);
764#elif defined(SAF_ENABLE_SIMD)
765 int i;
766 i = 0;
767# if defined(__AVX512F__)
768 for(; i<(len-7); i+=8)
769 _mm512_storeu_pd(c+i, _mm512_add_pd(_mm512_loadu_pd(a+i), _mm512_loadu_pd(b+i)));
770# endif
771# if defined(__AVX__) && defined(__AVX2__)
772 for(; i<(len-3); i+=4)
773 _mm256_storeu_pd(c+i, _mm256_add_pd(_mm256_loadu_pd(a+i), _mm256_loadu_pd(b+i)));
774# endif
775# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
776 for(; i<(len-1); i+=2)
777 _mm_storeu_pd(c+i, _mm_add_pd(_mm_loadu_pd(a+i), _mm_loadu_pd(b+i)));
778# endif
779 for(; i<len; i++) /* The residual (if len was not divisable by the step size): */
780 c[i] = a[i] + b[i];
781#else
782 int j;
783 for (j = 0; j < len; j++)
784 c[j] = a[j] + b[j];
785#endif
786}
787
789(
790 const double_complex* a,
791 const double_complex* b,
792 const int len,
793 double_complex* c
794)
795{
796 /* Checks: */
797#ifndef NDEBUG
798 saf_assert(c!=NULL, "'c' can no longer be NULL");
799 saf_assert(a!=c && b!=c, "In-place operation is no longer supported, use e.g. cblas_zaxby() instead.");
800#endif
801
802 /* The operation: */
803#if defined(SAF_USE_INTEL_IPP)
804 ippsAdd_64fc((Ipp64fc*)a, (Ipp64fc*)b, (Ipp64fc*)c, len);
805#elif defined(SAF_USE_APPLE_ACCELERATE)
806 vDSP_vaddD((double*)a, 1, (double*)b, 1, (double*)c, 1, /*re+im*/2*(vDSP_Length)len);
807#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
808 vmzAdd(len, (MKL_Complex16*)a, (MKL_Complex16*)b, (MKL_Complex16*)c, SAF_INTEL_MKL_VML_MODE);
809#elif defined(SAF_ENABLE_SIMD)
810 int i, len2;
811 double* sa, *sb, *sc;
812 len2 = len*2;
813 sa = (double*)a; sb = (double*)b; sc = (double*)c;
814 i = 0;
815# if defined(__AVX512F__)
816 for(; i<(len2-7); i+=8)
817 _mm512_storeu_pd(sc+i, _mm512_add_pd(_mm512_loadu_pd(sa+i), _mm512_loadu_pd(sb+i)));
818# endif
819# if defined(__AVX__) && defined(__AVX2__)
820 for(; i<(len2-3); i+=4)
821 _mm256_storeu_pd(sc+i, _mm256_add_pd(_mm256_loadu_pd(sa+i), _mm256_loadu_pd(sb+i)));
822# endif
823# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
824 for(; i<(len2-1); i+=2)
825 _mm_storeu_pd(sc+i, _mm_add_pd(_mm_loadu_pd(sa+i), _mm_loadu_pd(sb+i)));
826# endif
827 for(; i<len2; i++) /* The residual (if len2 was not divisable by the step size): */
828 sc[i] = sa[i] + sb[i];
829#else
830 int j;
831 for (j = 0; j < len; j++)
832 c[j] = ccadd(a[j], b[j]);
833#endif
834}
835
836
837/* ========================================================================== */
838/* Vector-Vector Subtraction (?vvsub) */
839/* ========================================================================== */
840
842(
843 const float* a,
844 const float* b,
845 const int len,
846 float* c
847)
848{
849 /* Checks: */
850#ifndef NDEBUG
851 saf_assert(c!=NULL, "'c' cannot be NULL");
852 saf_assert(a!=c && b!=c, "In-place operation is not supported.");
853#endif
854
855 /* The operation: */
856#if defined(SAF_USE_INTEL_IPP)
857 ippsSub_32f((Ipp32f*)b, (Ipp32f*)a, (Ipp32f*)c, len); /* 'a' and 'b' are switched */
858#elif defined(SAF_USE_APPLE_ACCELERATE)
859 vDSP_vsub(b, 1, a, 1, c, 1, (vDSP_Length)len); /* 'a' and 'b' are switched */
860#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
861 vmsSub(len, a, b, c, SAF_INTEL_MKL_VML_MODE);
862#elif defined(SAF_ENABLE_SIMD)
863 int i;
864 i = 0;
865# if defined(__AVX512F__)
866 for(; i<(len-15); i+=16)
867 _mm512_storeu_ps(c+i, _mm512_sub_ps(_mm512_loadu_ps(a+i), _mm512_loadu_ps(b+i)));
868# endif
869# if defined(__AVX__) && defined(__AVX2__)
870 for(; i<(len-7); i+=8)
871 _mm256_storeu_ps(c+i, _mm256_sub_ps(_mm256_loadu_ps(a+i), _mm256_loadu_ps(b+i)));
872# endif
873# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
874 for(; i<(len-3); i+=4)
875 _mm_storeu_ps(c+i, _mm_sub_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
876# endif
877 for(; i<len; i++) /* The residual (if len was not divisable by the step size): */
878 c[i] = a[i] - b[i];
879#elif defined(NDEBUG)
880 int i;
881 /* try to indirectly "trigger" some compiler optimisations */
882 for(i=0; i<len-3; i+=4){
883 c[i] = a[i] - b[i];
884 c[i+1] = a[i+1] - b[i+1];
885 c[i+2] = a[i+2] - b[i+2];
886 c[i+3] = a[i+3] - b[i+3];
887 }
888 for(; i<len; i++) /* The residual (if len was not divisable by the step size): */
889 c[i] = a[i] - b[i];
890#else
891 int j;
892 for (j = 0; j < len; j++)
893 c[j] = a[j] - b[j];
894#endif
895}
896
898(
899 const float_complex* a,
900 const float_complex* b,
901 const int len,
902 float_complex* c
903)
904{
905 /* Checks: */
906#ifndef NDEBUG
907 saf_assert(c!=NULL, "'c' cannot be NULL");
908 saf_assert(a!=c && b!=c, "In-place operation is not supported.");
909#endif
910
911 /* The operation: */
912#if defined(SAF_USE_INTEL_IPP)
913 ippsSub_32fc((Ipp32fc*)b, (Ipp32fc*)a, (Ipp32fc*)c, len); /* 'a' and 'b' are switched */
914#elif defined(SAF_USE_APPLE_ACCELERATE)
915 vDSP_vsub((float*)b, 1, (float*)a, 1, (float*)c, 1, /*re+im*/2*(vDSP_Length)len); /* 'a' and 'b' are switched */
916#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
917 vmcSub(len, (MKL_Complex8*)a, (MKL_Complex8*)b, (MKL_Complex8*)c, SAF_INTEL_MKL_VML_MODE);
918#elif defined(SAF_ENABLE_SIMD)
919 int i, len2;
920 float* sa, *sb, *sc;
921 len2 = len*2;
922 sa = (float*)a; sb = (float*)b; sc = (float*)c;
923 i = 0;
924# if defined(__AVX512F__)
925 for(; i<(len2-15); i+=16)
926 _mm512_storeu_ps(sc+i, _mm512_sub_ps(_mm512_loadu_ps(sa+i), _mm512_loadu_ps(sb+i)));
927# endif
928# if defined(__AVX__) && defined(__AVX2__)
929 for(; i<(len2-7); i+=8)
930 _mm256_storeu_ps(sc+i, _mm256_sub_ps(_mm256_loadu_ps(sa+i), _mm256_loadu_ps(sb+i)));
931# endif
932# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
933 for(; i<(len2-3); i+=4)
934 _mm_storeu_ps(sc+i, _mm_sub_ps(_mm_loadu_ps(sa+i), _mm_loadu_ps(sb+i)));
935# endif
936 for(; i<len2; i++) /* The residual (if len2 was not divisable by the step size): */
937 sc[i] = sa[i] - sb[i];
938#elif __STDC_VERSION__ >= 199901L && defined(NDEBUG)
939 int i;
940 /* try to indirectly "trigger" some compiler optimisations */
941 for(i=0; i<len-3; i+=4){
942 c[i] = a[i] - b[i];
943 c[i+1] = a[i+1] - b[i+1];
944 c[i+2] = a[i+2] - b[i+2];
945 c[i+3] = a[i+3] - b[i+3];
946 }
947 for(; i<len; i++) /* The residual (if len was not divisable by the step size): */
948 c[i] = a[i] - b[i];
949#else
950 int j;
951 for (j = 0; j < len; j++)
952 c[j] = ccsubf(a[j], b[j]);
953#endif
954}
955
957(
958 const double* a,
959 const double* b,
960 const int len,
961 double* c
962)
963{
964 /* Checks: */
965#ifndef NDEBUG
966 saf_assert(c!=NULL, "'c' cannot be NULL");
967 saf_assert(a!=c && b!=c, "In-place operation is not supported.");
968#endif
969
970 /* The operation: */
971#if defined(SAF_USE_INTEL_IPP)
972 ippsSub_64f((Ipp64f*)b, (Ipp64f*)a, (Ipp64f*)c, len); /* 'a' and 'b' are switched */
973#elif defined(SAF_USE_APPLE_ACCELERATE)
974 vDSP_vsubD(b, 1, a, 1, c, 1, (vDSP_Length)len); /* 'a' and 'b' are switched */
975#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
976 vmdSub(len, a, b, c, SAF_INTEL_MKL_VML_MODE);
977#elif defined(SAF_ENABLE_SIMD)
978 int i;
979 i = 0;
980# if defined(__AVX512F__)
981 for(; i<(len-7); i+=8)
982 _mm512_storeu_pd(c+i, _mm512_sub_pd(_mm512_loadu_pd(a+i), _mm512_loadu_pd(b+i)));
983# endif
984# if defined(__AVX__) && defined(__AVX2__)
985 for(; i<(len-3); i+=4)
986 _mm256_storeu_pd(c+i, _mm256_sub_pd(_mm256_loadu_pd(a+i), _mm256_loadu_pd(b+i)));
987# endif
988# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
989 for(; i<(len-1); i+=2)
990 _mm_storeu_pd(c+i, _mm_sub_pd(_mm_loadu_pd(a+i), _mm_loadu_pd(b+i)));
991# endif
992 for(;i<len; i++) /* The residual (if len was not divisable by the step size): */
993 c[i] = a[i] - b[i];
994#else
995 int j;
996 for (j = 0; j < len; j++)
997 c[j] = a[j] - b[j];
998#endif
999}
1000
1002(
1003 const double_complex* a,
1004 const double_complex* b,
1005 const int len,
1006 double_complex* c
1007)
1008{
1009 /* Checks: */
1010#ifndef NDEBUG
1011 saf_assert(c!=NULL, "'c' cannot be NULL");
1012 saf_assert(a!=c && b!=c, "In-place operation is not supported.");
1013#endif
1014
1015 /* The operation: */
1016#if defined(SAF_USE_INTEL_IPP)
1017 ippsSub_64fc((Ipp64fc*)b, (Ipp64fc*)a, (Ipp64fc*)c, len); /* 'a' and 'b' are switched */
1018#elif defined(SAF_USE_APPLE_ACCELERATE)
1019 vDSP_vsubD((double*)b, 1, (double*)a, 1, (double*)c, 1, /*re+im*/2*(vDSP_Length)len); /* 'a' and 'b' are switched */
1020#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1021 vmzSub(len, (MKL_Complex16*)a, (MKL_Complex16*)b, (MKL_Complex16*)c, SAF_INTEL_MKL_VML_MODE);
1022#elif defined(SAF_ENABLE_SIMD)
1023 int i, len2;
1024 double* sa, *sb, *sc;
1025 len2 = len*2;
1026 sa = (double*)a; sb = (double*)b; sc = (double*)c;
1027 i = 0;
1028# if defined(__AVX512F__)
1029 for(; i<(len2-7); i+=8)
1030 _mm512_storeu_pd(sc+i, _mm512_sub_pd(_mm512_loadu_pd(sa+i), _mm512_loadu_pd(sb+i)));
1031# endif
1032# if defined(__AVX__) && defined(__AVX2__)
1033 for(; i<(len2-3); i+=4)
1034 _mm256_storeu_pd(sc+i, _mm256_sub_pd(_mm256_loadu_pd(sa+i), _mm256_loadu_pd(sb+i)));
1035# endif
1036# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
1037 for(; i<(len2-1); i+=2)
1038 _mm_storeu_pd(sc+i, _mm_sub_pd(_mm_loadu_pd(sa+i), _mm_loadu_pd(sb+i)));
1039# endif
1040 for(;i<len2; i++) /* The residual (if len2 was not divisable by the step size): */
1041 sc[i] = sa[i] - sb[i];
1042#else
1043 int j;
1044 for (j = 0; j < len; j++)
1045 c[j] = ccsub(a[j], b[j]);
1046#endif
1047}
1048
1049
1050/* ========================================================================== */
1051/* Vector-Vector Multiplication (?vvmul) */
1052/* ========================================================================== */
1053
1055(
1056 const float* a,
1057 const float* b,
1058 const int len,
1059 float* c
1060)
1061{
1062 /* Checks: */
1063#ifndef NDEBUG
1064 saf_assert(c!=NULL, "'c' can no longer be NULL");
1065 saf_assert(a!=c && b!=c, "In-place operation is no longer supported.");
1066#endif
1067
1068 /* The operation: */
1069#if defined(SAF_USE_INTEL_IPP)
1070 ippsMul_32f((Ipp32f*)a, (Ipp32f*)b, (Ipp32f*)c, len);
1071#elif defined(SAF_USE_APPLE_ACCELERATE)
1072 vDSP_vmul(a, 1, b, 1, c, 1, (vDSP_Length)len);
1073#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1074 vmsMul(len, a, b, c, SAF_INTEL_MKL_VML_MODE);
1075#elif defined(SAF_ENABLE_SIMD)
1076 int i;
1077 i = 0;
1078# if defined(__AVX512F__)
1079 for(; i<(len-15); i+=16)
1080 _mm512_storeu_ps(c+i, _mm512_mul_ps(_mm512_loadu_ps(a+i), _mm512_loadu_ps(b+i)));
1081# endif
1082# if defined(__AVX__) && defined(__AVX2__)
1083 for(; i<(len-7); i+=8)
1084 _mm256_storeu_ps(c+i, _mm256_mul_ps(_mm256_loadu_ps(a+i), _mm256_loadu_ps(b+i)));
1085# endif
1086# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
1087 for(; i<(len-3); i+=4)
1088 _mm_storeu_ps(c+i, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
1089# endif
1090 for(;i<len; i++) /* The residual (if len was not divisable by the step size): */
1091 c[i] = a[i] * b[i];
1092#elif defined(NDEBUG)
1093 int i;
1094 /* try to indirectly "trigger" some compiler optimisations */
1095 for(i=0; i<len-3; i+=4){
1096 c[i] = a[i] * b[i];
1097 c[i+1] = a[i+1] * b[i+1];
1098 c[i+2] = a[i+2] * b[i+2];
1099 c[i+3] = a[i+3] * b[i+3];
1100 }
1101 for(; i<len; i++) /* The residual (if len was not divisable by the step size): */
1102 c[i] = a[i] * b[i];
1103#else
1104 int j;
1105 for (j = 0; j < len; j++)
1106 c[j] = a[j] * b[j];
1107#endif
1108}
1109
1111(
1112 const float_complex* a,
1113 const float_complex* b,
1114 const int len,
1115 float_complex* c
1116)
1117{
1118 /* Checks: */
1119#ifndef NDEBUG
1120 saf_assert(c!=NULL, "'c' can no longer be NULL");
1121 saf_assert(a!=c && b!=c, "In-place operation is no longer supported.");
1122#endif
1123
1124 /* The operation: */
1125#if defined(SAF_USE_INTEL_IPP)
1126 ippsMul_32fc((Ipp32fc*)a, (Ipp32fc*)b, (Ipp32fc*)c, len);
1127#elif defined(SAF_USE_APPLE_ACCELERATE) /* Due to Apple "logic", this is unfortunately quite complicated, and probably slower than it should be... */
1128 /* Imaginary part of the output */
1129 vDSP_vmul((float*)a/*real*/, 2, (float*)b+1/*imag*/, 2, (float*)c+1/*imag*/, 2, (vDSP_Length)len);
1130 vDSP_vma((float*)a+1/*imag*/, 2, (float*)b/*real*/, 2, (float*)c+1/*imag*/, 2, (float*)c/*real*/, 2, (vDSP_Length)len); /* Use the real part of c as a temporary buffer */
1131 cblas_scopy(len, (float*)c/*real*/, 2, (float*)c+1/*imag*/, 2); /* Copy the imaginary part from the temporary buffer */
1132 /* Real part of the output */
1133 vDSP_vmmsb((float*)a/*real*/, 2, (float*)b/*real*/, 2, (float*)a+1/*imag*/, 2, (float*)b+1/*imag*/, 2, (float*)c/*real*/, 2, (vDSP_Length)len);
1134#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1135 vmcMul(len, (MKL_Complex8*)a, (MKL_Complex8*)b, (MKL_Complex8*)c, SAF_INTEL_MKL_VML_MODE);
1136#elif defined(SAF_ENABLE_SIMD)
1137 int i;
1138 float* sa, *sb, *sc;
1139 sa = (float*)a; sb = (float*)b; sc = (float*)c;
1140# if (defined(__AVX__) && defined(__AVX2__)) || defined(__AVX512F__) /* I couldn't figure out an alternative for addsub with AVX-512... */
1141 __m256i permute_ri = _mm256_set_epi32(6, 7, 4, 5, 2, 3, 0, 1);
1142 for(i=0; i<(len-3); i+=4){
1143 /* Load only the real parts of a */
1144 __m256 src1 = _mm256_moveldup_ps(_mm256_loadu_ps(sa+2*i)/*|a1|b1|a2|b2|a3|b3|a4|b4|*/); /*|a1|a1|a2|a2|a3|a3|a4|a4|*/
1145 /* Load real+imag parts of b */
1146 __m256 src2 = _mm256_loadu_ps(sb+2*i); /*|c1|d1|c2|d2|c3|d3|c4|d4|*/
1147 /* Multiply together */
1148 __m256 tmp1 = _mm256_mul_ps(src1, src2);
1149 /* Swap the real+imag parts of b to be imag+real instead: */
1150 __m256 b1 = _mm256_permutevar8x32_ps(src2, permute_ri);
1151 /* Load only the imag parts of a */
1152 src1 = _mm256_movehdup_ps(_mm256_loadu_ps(sa+2*i)/*|a1|b1|a2|b2|a3|b3|a4|b4|*/); /*|b1|b1|b2|b2|b3|b3|b4|b4|*/
1153 /* Multiply together */
1154 __m256 tmp2 = _mm256_mul_ps(src1, b1);
1155 /* Add even indices, subtract odd indices */
1156 _mm256_storeu_ps(sc+2*i, _mm256_addsub_ps(tmp1, tmp2));
1157 }
1158# elif defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
1159 for(i=0; i<(len-1); i+=2){
1160 /* Load only the real parts of a */
1161 __m128 src1 = _mm_moveldup_ps(_mm_loadu_ps(sa+2*i)/*|a1|b1|a2|b2|*/); /*|a1|a1|a2|a2|*/
1162 /* Load real+imag parts of b */
1163 __m128 src2 = _mm_loadu_ps(sb+2*i); /*|c1|d1|c2|d2|*/
1164 /* Multiply together */
1165 __m128 tmp1 = _mm_mul_ps(src1, src2);
1166 /* Swap the real+imag parts of b to be imag+real instead: */
1167 __m128 b1 = _mm_shuffle_ps(src2, src2, _MM_SHUFFLE(2, 3, 0, 1));
1168 /* Load only the imag parts of a */
1169 src1 = _mm_movehdup_ps(_mm_loadu_ps(sa+2*i)/*|a1|b1|a2|b2|*/); /*|b1|b1|b2|b2|*/
1170 /* Multiply together */
1171 __m128 tmp2 = _mm_mul_ps(src1, b1);
1172 /* Add even indices, subtract odd indices */
1173 _mm_storeu_ps(sc+2*i, _mm_addsub_ps(tmp1, tmp2));
1174 }
1175# endif
1176 for(;i<len; i++){ /* The residual (if len was not divisable by the step size): */
1177 sc[2*i] = sa[2*i] * sb[2*i] - sa[2*i+1] * sb[2*i+1];
1178 sc[2*i+1] = sa[2*i] * sb[2*i+1] + sa[2*i+1] * sb[2*i];
1179 }
1180#elif __STDC_VERSION__ >= 199901L && defined(NDEBUG)
1181 int i;
1182 /* try to indirectly "trigger" some compiler optimisations */
1183 for(i=0; i<len-3; i+=4){
1184 c[i] = a[i] * b[i];
1185 c[i+1] = a[i+1] * b[i+1];
1186 c[i+2] = a[i+2] * b[i+2];
1187 c[i+3] = a[i+3] * b[i+3];
1188 }
1189 for(; i<len; i++) /* The residual (if len was not divisable by the step size): */
1190 c[i] = a[i] * b[i];
1191#elif __STDC_VERSION__ >= 199901L
1192 int i;
1193 for (i = 0; i < len; i++)
1194 c[i] = a[i] * b[i];
1195#else
1196 int i;
1197 for (i = 0; i < len; i++)
1198 c[i] = ccmulf(a[i], b[i]);
1199#endif
1200}
1201
1202
1203/* ========================================================================== */
1204/* Vector-Vector Dot Product (?vvdot) */
1205/* ========================================================================== */
1206
1208(
1209 const float* a,
1210 const float* b,
1211 const int len,
1212 float* c
1213)
1214{
1215 c[0] = cblas_sdot (len, a, 1, b, 1);
1216}
1217
1219(
1220 const float_complex* a,
1221 const float_complex* b,
1222 const int len,
1223 CONJ_FLAG flag,
1224 float_complex* c
1225)
1226{
1227 switch(flag){
1228 default:
1229 case NO_CONJ:
1230 cblas_cdotu_sub(len, a, 1, b, 1, c);
1231 break;
1232 case CONJ:
1233 cblas_cdotc_sub(len, a, 1, b, 1, c);
1234 break;
1235 }
1236}
1237
1238
1239/* ========================================================================== */
1240/* Vector-Scalar Product (?vsmul) */
1241/* ========================================================================== */
1242
1244(
1245 float* a,
1246 const float* s,
1247 const int len,
1248 float* c
1249)
1250{
1251#if defined(SAF_USE_INTEL_IPP)
1252 if(c==NULL)
1253 ippsMulC_32f_I((Ipp32f)s[0], (Ipp32f*)a, len);
1254 else
1255 ippsMulC_32f((Ipp32f*)a, (Ipp32f)s[0], (Ipp32f*)c, len);
1256#elif defined(SAF_USE_APPLE_ACCELERATE)
1257 if(c==NULL)
1258 cblas_sscal(len, s[0], a, 1);
1259 else
1260 vDSP_vsmul(a, 1, s, c, 1, (vDSP_Length)len);
1261#else
1262 if (c == NULL)
1263 cblas_sscal(len, s[0], a, 1);
1264 else {
1265 utility_svvcopy(a, len, c);
1266 cblas_sscal(len, s[0], c, 1);
1267 }
1268#endif
1269}
1270
1272(
1273 float_complex* a,
1274 const float_complex* s,
1275 const int len,
1276 float_complex* c
1277)
1278{
1279 if (c == NULL)
1280 cblas_cscal(len, s, a, 1);
1281 else {
1282 cblas_ccopy(len, a, 1, c, 1);
1283 cblas_cscal(len, s, c, 1);
1284 }
1285}
1286
1288(
1289 double* a,
1290 const double* s,
1291 const int len,
1292 double* c
1293)
1294{
1295#if defined(SAF_USE_INTEL_IPP)
1296 if(c==NULL)
1297 ippsMulC_64f_I((Ipp64f)s[0], (Ipp64f*)a, len);
1298 else
1299 ippsMulC_64f((Ipp64f*)a, (Ipp64f)s[0], (Ipp64f*)c, len);
1300#elif defined(SAF_USE_APPLE_ACCELERATE)
1301 if(c==NULL)
1302 cblas_dscal(len, s[0], a, 1);
1303 else
1304 vDSP_vsmulD(a, 1, s, c, 1, (vDSP_Length)len);
1305#else
1306 if (c == NULL)
1307 cblas_dscal(len, s[0], a, 1);
1308 else {
1309 utility_dvvcopy(a, len, c);
1310 cblas_dscal(len, s[0], c, 1);
1311 }
1312#endif
1313}
1314
1316(
1317 double_complex* a,
1318 const double_complex* s,
1319 const int len,
1320 double_complex* c
1321)
1322{
1323 if (c == NULL)
1324 cblas_zscal(len, s, a, 1);
1325 else {
1326 cblas_zcopy(len, a, 1, c, 1);
1327 cblas_zscal(len, s, c, 1);
1328 }
1329}
1330
1331
1332/* ========================================================================== */
1333/* Vector-Scalar Division (?vsdiv) */
1334/* ========================================================================== */
1335
1337(
1338 const float* a,
1339 const float* s,
1340 const int len,
1341 float* c
1342)
1343{
1344 if(s[0] == 0.0f){
1345 memset(c, 0, len*sizeof(float));
1346 return;
1347 }
1348#if defined(SAF_USE_INTEL_IPP)
1349 ippsDivC_32f((Ipp32f*)a, (Ipp32f)s[0], (Ipp32f*)c, len);
1350#elif defined(SAF_USE_APPLE_ACCELERATE)
1351 vDSP_vsdiv(a, 1, s, c, 1, (vDSP_Length)len);
1352#else
1353 cblas_scopy(len, a, 1, c, 1);
1354 cblas_sscal(len, 1.0f/s[0], c, 1);
1355#endif
1356}
1357
1358
1359/* ========================================================================== */
1360/* Vector-Scalar Addition (?vsadd) */
1361/* ========================================================================== */
1362
1364(
1365 float* a,
1366 const float* s,
1367 const int len,
1368 float* c
1369)
1370{
1371#if defined(SAF_USE_INTEL_IPP)
1372 ippsAddC_32f((Ipp32f*)a, (Ipp32f)s[0], (Ipp32f*)c, len);
1373#elif defined(SAF_USE_APPLE_ACCELERATE)
1374 vDSP_vsadd(a, 1, s, c, 1, (vDSP_Length)len);
1375#elif defined(SAF_ENABLE_SIMD)
1376 int i;
1377# if defined(__AVX512F__)
1378 __m512 s16 = _mm512_set1_ps(s[0]);
1379 for(i=0; i<(len-15); i+=16)
1380 _mm512_storeu_ps(c+i, _mm512_add_ps(_mm512_loadu_ps(a+i), s16));
1381# elif defined(__AVX__) && defined(__AVX2__)
1382 __m256 s8 = _mm256_set1_ps(s[0]);
1383 for(i=0; i<(len-7); i+=8)
1384 _mm256_storeu_ps(c+i, _mm256_add_ps(_mm256_loadu_ps(a+i), s8));
1385# elif defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
1386 __m128 s4 = _mm_load_ps1(s);
1387 for(i=0; i<(len-3); i+=4)
1388 _mm_storeu_ps(c+i, _mm_add_ps(_mm_loadu_ps(a+i), s4));
1389# endif
1390 for(;i<len; i++) /* The residual (if len was not divisable by the step size): */
1391 c[i] = a[i] + s[0];
1392#else
1393 int i;
1394 for(i=0; i<len; i++)
1395 c[i] = a[i] + s[0];
1396#endif
1397}
1398
1399
1400/* ========================================================================== */
1401/* Vector-Scalar Subtraction (?vssub) */
1402/* ========================================================================== */
1403
1405(
1406 float* a,
1407 const float* s,
1408 const int len,
1409 float* c
1410)
1411{
1412#if defined(SAF_USE_INTEL_IPP)
1413 ippsSubC_32f((Ipp32f*)a, (Ipp32f)s[0], (Ipp32f*)c, len);
1414#elif defined(SAF_USE_APPLE_ACCELERATE)
1415 float inv_s;
1416 inv_s = -s[0];
1417 vDSP_vsadd(a, 1, &inv_s, c, 1, (vDSP_Length)len);
1418#elif defined(SAF_ENABLE_SIMD)
1419 int i;
1420# if defined(__AVX512F__)
1421 __m512 s16 = _mm512_set1_ps(s[0]);
1422 for(i=0; i<(len-15); i+=16)
1423 _mm512_storeu_ps(c+i, _mm512_sub_ps(_mm512_loadu_ps(a+i), s16));
1424# elif defined(__AVX__) && defined(__AVX2__)
1425 __m256 s8 = _mm256_set1_ps(s[0]);
1426 for(i=0; i<(len-7); i+=8)
1427 _mm256_storeu_ps(c+i, _mm256_sub_ps(_mm256_loadu_ps(a+i), s8));
1428# elif defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
1429 __m128 s4 = _mm_load_ps1(s);
1430 for(i=0; i<(len-3); i+=4)
1431 _mm_storeu_ps(c+i, _mm_sub_ps(_mm_loadu_ps(a+i), s4));
1432# endif
1433 for(;i<len; i++) /* The residual (if len was not divisable by the step size): */
1434 c[i] = a[i] - s[0];
1435#else
1436 int i;
1437 for(i=0; i<len; i++)
1438 c[i] = a[i] - s[0];
1439#endif
1440}
1441
1442
1443/* ========================================================================== */
1444/* Sparse-Vector to Compressed-Vector (Known Indices) (?sv2cv_inds) */
1445/* ========================================================================== */
1446
1448(
1449 const float* sv,
1450 const int* inds,
1451 const int len,
1452 float* cv
1453)
1454{
1455 int i;
1456#ifdef SAF_USE_APPLE_ACCELERATE /* Unfortunately requires a malloc call */
1457 /* Due to Apple "logic", we first need to add 1 to all of the indicies, since "vDSP_vgathr" is going to then subtract 1 from them all... */
1458 vDSP_Length* inds_vDSP;
1459 inds_vDSP = malloc1d(len*sizeof(vDSP_Length));
1460 for(i=0; i<len; i++)
1461 inds_vDSP[i] = (vDSP_Length)(inds[i] + 1);
1462 vDSP_vgathr(sv, inds_vDSP, 1, cv, 1, (vDSP_Length)len);
1463 free(inds_vDSP);
1464#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1465 veclib_int* inds_tmp;
1466 if(sizeof(veclib_int)==sizeof(int)) /* LP64 MKL */
1467 cblas_sgthr(len, sv, cv, (veclib_int*)inds);
1468 else{ /* ILP64 MKL */
1469 inds_tmp = malloc1d(len*sizeof(veclib_int)); /* Unfortunately requires a malloc call */
1470 for(i=0; i<len; i++)
1471 inds_tmp[i] = (veclib_int)inds[i];
1472 cblas_sgthr(len, sv, cv, (veclib_int*)inds_tmp);
1473 free(inds_tmp);
1474 }
1475#elif defined(NDEBUG)
1476 /* try to indirectly "trigger" some compiler optimisations */
1477 for(i=0; i<len-3; i+=4){
1478 cv[i] = sv[inds[i]];
1479 cv[i+1] = sv[inds[i+1]];
1480 cv[i+2] = sv[inds[i+2]];
1481 cv[i+3] = sv[inds[i+3]];
1482 }
1483 for(; i<len; i++) /* The residual (if len was not divisable by the step size): */
1484 cv[i] = sv[inds[i]];
1485#else
1486 for(i=0; i<len; i++)
1487 cv[i] = sv[inds[i]];
1488#endif
1489}
1490
1492(
1493 const float_complex* sv,
1494 const int* inds,
1495 const int len,
1496 float_complex* cv
1497)
1498{
1499 int i;
1500#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1501 veclib_int* inds_tmp;
1502 if(sizeof(veclib_int)==sizeof(int)) /* LP64 MKL */
1503 cblas_cgthr(len, sv, cv, (veclib_int*)inds);
1504 else{ /* ILP64 MKL */
1505 inds_tmp = malloc1d(len*sizeof(veclib_int)); /* Unfortunately requires a malloc call */
1506 for(i=0; i<len; i++)
1507 inds_tmp[i] = (veclib_int)inds[i];
1508 cblas_cgthr(len, sv, cv, (veclib_int*)inds_tmp);
1509 free(inds_tmp);
1510 }
1511#elif defined(NDEBUG)
1512 /* try to indirectly "trigger" some compiler optimisations */
1513 for(i=0; i<len-3; i+=4){
1514 cv[i] = sv[inds[i]];
1515 cv[i+1] = sv[inds[i+1]];
1516 cv[i+2] = sv[inds[i+2]];
1517 cv[i+3] = sv[inds[i+3]];
1518 }
1519 for(; i<len; i++) /* The residual (if len was not divisable by the step size): */
1520 cv[i] = sv[inds[i]];
1521#else
1522 for(i=0; i<len; i++)
1523 cv[i] = sv[inds[i]];
1524#endif
1525}
1526
1528(
1529 const double* sv,
1530 const int* inds,
1531 const int len,
1532 double* cv
1533)
1534{
1535 int i;
1536#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1537 veclib_int* inds_tmp;
1538 if(sizeof(veclib_int)==sizeof(int)) /* LP64 MKL */
1539 cblas_dgthr(len, sv, cv, (veclib_int*)inds);
1540 else{ /* ILP64 MKL */
1541 inds_tmp = malloc1d(len*sizeof(veclib_int)); /* Unfortunately requires a malloc call */
1542 for(i=0; i<len; i++)
1543 inds_tmp[i] = (veclib_int)inds[i];
1544 cblas_dgthr(len, sv, cv, (veclib_int*)inds_tmp);
1545 free(inds_tmp);
1546 }
1547#elif defined(NDEBUG)
1548 /* try to indirectly "trigger" some compiler optimisations */
1549 for(i=0; i<len-3; i+=4){
1550 cv[i] = sv[inds[i]];
1551 cv[i+1] = sv[inds[i+1]];
1552 cv[i+2] = sv[inds[i+2]];
1553 cv[i+3] = sv[inds[i+3]];
1554 }
1555 for(; i<len; i++) /* The residual (if len was not divisable by the step size): */
1556 cv[i] = sv[inds[i]];
1557#else
1558 for(i=0; i<len; i++)
1559 cv[i] = sv[inds[i]];
1560#endif
1561}
1562
1564(
1565 const double_complex* sv,
1566 const int* inds,
1567 const int len,
1568 double_complex* cv
1569)
1570{
1571 int i;
1572#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1573 veclib_int* inds_tmp;
1574 if(sizeof(veclib_int)==sizeof(int)) /* LP64 MKL */
1575 cblas_zgthr(len, sv, cv, (veclib_int*)inds);
1576 else{ /* ILP64 MKL */
1577 inds_tmp = malloc1d(len*sizeof(veclib_int)); /* Unfortunately requires a malloc call */
1578 for(i=0; i<len; i++)
1579 inds_tmp[i] = (veclib_int)inds[i];
1580 cblas_zgthr(len, sv, cv, (veclib_int*)inds_tmp);
1581 free(inds_tmp);
1582 }
1583#elif defined(NDEBUG)
1584 /* try to indirectly "trigger" some compiler optimisations */
1585 for(i=0; i<len-3; i+=4){
1586 cv[i] = sv[inds[i]];
1587 cv[i+1] = sv[inds[i+1]];
1588 cv[i+2] = sv[inds[i+2]];
1589 cv[i+3] = sv[inds[i+3]];
1590 }
1591 for(; i<len; i++) /* The residual (if len was not divisable by the step size): */
1592 cv[i] = sv[inds[i]];
1593#else
1594 for(i=0; i<len; i++)
1595 cv[i] = sv[inds[i]];
1596#endif
1597}
1598
1599
1600/* ========================================================================== */
1601/* Singular-Value Decomposition (?svd) */
1602/* ========================================================================== */
1603
1605typedef struct _utility_ssvd_data {
1606 int maxDim1, maxDim2;
1607 veclib_int currentWorkSize;
1608 float* a, *s, *u, *vt;
1609 float* work;
1611
1612void utility_ssvd_create(void ** const phWork, int maxDim1, int maxDim2)
1613{
1614 *phWork = malloc1d(sizeof(utility_ssvd_data));
1615 utility_ssvd_data *h = (utility_ssvd_data*)(*phWork);
1616
1617 h->maxDim1 = maxDim1;
1618 h->maxDim2 = maxDim2;
1619 h->currentWorkSize = 0;
1620 h->a = malloc1d(maxDim1*maxDim2*sizeof(float));
1621 h->s = malloc1d(SAF_MIN(maxDim2,maxDim1)*sizeof(float));
1622 h->u = malloc1d(maxDim1*maxDim1*sizeof(float));
1623 h->vt = malloc1d(maxDim2*maxDim2*sizeof(float));
1624 h->work = NULL;
1625}
1626
1627void utility_ssvd_destroy(void ** const phWork)
1628{
1629 utility_ssvd_data *h = (utility_ssvd_data*)(*phWork);
1630
1631 if(h!=NULL){
1632 free(h->a);
1633 free(h->s);
1634 free(h->u);
1635 free(h->vt);
1636 free(h->work);
1637
1638 free(h);
1639 h=NULL;
1640 *phWork = NULL;
1641 }
1642}
1643
1645(
1646 void* const hWork,
1647 const float* A,
1648 const int dim1,
1649 const int dim2,
1650 float* U,
1651 float* S,
1652 float* V,
1653 float* sing
1654)
1655{
1657 veclib_int i, j, m, n, lda, ldu, ldvt, info;
1658 veclib_int lwork;
1659 float wkopt;
1660
1661 m = dim1; n = dim2; lda = dim1; ldu = dim1; ldvt = dim2;
1662
1663 /* Work struct */
1664 if(hWork==NULL)
1665 utility_ssvd_create((void**)&h, dim1, dim2);
1666 else{
1667 h = (utility_ssvd_data*)(hWork);
1668#ifndef NDEBUG
1669 saf_assert(dim1<=h->maxDim1, "dim1 exceeds the maximum length specified");
1670 saf_assert(dim2<=h->maxDim2, "dim2 exceeds the maximum length specified");
1671#endif
1672 }
1673
1674 /* store in column major order */
1675 for(i=0; i<dim1; i++)
1676 for(j=0; j<dim2; j++)
1677 h->a[j*dim1+i] = A[i*dim2 +j];
1678
1679 /* Query how much "work" memory is required */
1680 lwork = -1;
1681#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
1682 sgesvd_( "A", "A", &m, &n, h->a, &lda, h->s, h->u, &ldu, h->vt, &ldvt, &wkopt, &lwork, &info );
1683#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
1684 saf_print_error("No such implementation available in ATLAS CLAPACK");
1685#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
1686 info = LAPACKE_sgesvd_work(CblasColMajor, 'A', 'A', m, n, h->a, lda, h->s, h->u, ldu, h->vt, ldvt, &wkopt, lwork);
1687#endif
1688 lwork = (veclib_int)wkopt;
1689 if(lwork>h->currentWorkSize){
1690 h->currentWorkSize = lwork;
1691 h->work = realloc1d(h->work, h->currentWorkSize*sizeof(float));
1692 }
1693
1694 /* perform the singular value decomposition */
1695#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
1696 sgesvd_( "A", "A", &m, &n, h->a, &lda, h->s, h->u, &ldu, h->vt, &ldvt, h->work, &lwork, &info );
1697#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
1698 saf_print_error("No such implementation available in ATLAS CLAPACK");
1699#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
1700 info = LAPACKE_sgesvd_work(CblasColMajor, 'A', 'A', m, n, h->a, lda, h->s, h->u, ldu, h->vt, ldvt, h->work, lwork);
1701#endif
1702
1703 /* svd failed to converge */
1704 if( info != 0 ) {
1705 if (S != NULL)
1706 memset(S, 0, dim1*dim2*sizeof(float));
1707 if (U != NULL)
1708 memset(U, 0, dim1*dim1*sizeof(float));
1709 if (V != NULL)
1710 memset(V, 0, dim2*dim2*sizeof(float));
1711 if (sing != NULL)
1712 memset(sing, 0, SAF_MIN(dim1, dim2)*sizeof(float));
1713#ifndef NDEBUG
1714 /* The SVD failed to converge, or the input matrix contained illegal
1715 * values so no solution was attempted. In these cases this function
1716 * will zero all output matrices and/or vectors. */
1717 saf_print_warning("Could not compute SVD in utility_ssvd(). Output matrices/vectors have been zeroed.");
1718#endif
1719 }
1720 /* svd successful */
1721 else {
1722 if (S != NULL){
1723 memset(S, 0, dim1*dim2*sizeof(float));
1724 /* singular values on the diagonal MIN(dim1, dim2). The remaining elements are 0. */
1725 for(i=0; i<SAF_MIN(dim1, dim2); i++)
1726 S[i*dim2+i] = h->s[i];
1727 }
1728
1729 /*return as row-major*/
1730 if (U != NULL)
1731 for(i=0; i<dim1; i++)
1732 for(j=0; j<dim1; j++)
1733 U[i*dim1+j] = h->u[j*dim1+i];
1734
1735 /* lapack returns VT, i.e. row-major V already */
1736 if (V != NULL)
1737 for(i=0; i<dim2; i++)
1738 for(j=0; j<dim2; j++)
1739 V[i*dim2+j] = h->vt[i*dim2+j];
1740 if (sing != NULL)
1741 for(i=0; i<SAF_MIN(dim1, dim2); i++)
1742 sing[i] = h->s[i];
1743 }
1744
1745 if(hWork == NULL)
1746 utility_ssvd_destroy((void**)&h);
1747}
1748
1750typedef struct _utility_csvd_data {
1751 int maxDim1, maxDim2;
1752 veclib_int currentWorkSize;
1753 float_complex* a, *u, *vt, *work;
1754 float *s, *rwork;
1756
1757void utility_csvd_create(void ** const phWork, int maxDim1, int maxDim2)
1758{
1759 *phWork = malloc1d(sizeof(utility_csvd_data));
1760 utility_csvd_data *h = (utility_csvd_data*)(*phWork);
1761
1762 h->maxDim1 = maxDim1;
1763 h->maxDim2 = maxDim2;
1764 h->currentWorkSize = 0;
1765 h->a = malloc1d(maxDim1*maxDim2*sizeof(float_complex));
1766 h->s = malloc1d(SAF_MIN(maxDim2,maxDim1)*sizeof(float));
1767 h->u = malloc1d(maxDim1*maxDim1*sizeof(float_complex));
1768 h->vt = malloc1d(maxDim2*maxDim2*sizeof(float_complex));
1769 h->rwork = malloc1d(maxDim1*SAF_MAX(1, 5*SAF_MIN(maxDim2,maxDim1))*sizeof(float));
1770 h->work = NULL;
1771}
1772
1773void utility_csvd_destroy(void ** const phWork)
1774{
1775 utility_csvd_data *h = (utility_csvd_data*)(*phWork);
1776
1777 if(h!=NULL){
1778 free(h->a);
1779 free(h->s);
1780 free(h->u);
1781 free(h->vt);
1782 free(h->work);
1783 free(h->rwork);
1784
1785 free(h);
1786 h=NULL;
1787 *phWork = NULL;
1788 }
1789}
1790
1792(
1793 void* const hWork,
1794 const float_complex* A,
1795 const int dim1,
1796 const int dim2,
1797 float_complex* U,
1798 float_complex* S,
1799 float_complex* V,
1800 float* sing
1801)
1802{
1804 veclib_int m, n, lda, ldu, ldvt, info;
1805 veclib_int lwork;
1806 float_complex wkopt;
1807#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1808 const MKL_Complex8 calpha = {1.0f, 0.0f};
1809#else
1810 int i, j;
1811#endif
1812
1813 m = dim1; n = dim2; lda = dim1; ldu = dim1; ldvt = dim2;
1814
1815 /* Work struct */
1816 if(hWork==NULL)
1817 utility_csvd_create((void**)&h, dim1, dim2);
1818 else{
1819 h = (utility_csvd_data*)(hWork);
1820#ifndef NDEBUG
1821 saf_assert(dim1<=h->maxDim1, "dim1 exceeds the maximum length specified");
1822 saf_assert(dim2<=h->maxDim2, "dim2 exceeds the maximum length specified");
1823#endif
1824 }
1825
1826 /* store in column major order (i.e. transpose) */
1827#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1828 MKL_Comatcopy('R', 'T', dim1, dim2, calpha, (veclib_float_complex*)A, dim2, (veclib_float_complex*)h->a, dim1);
1829#else
1830 for(i=0; i<dim1; i++)
1831 for(j=0; j<dim2; j++)
1832 h->a[j*dim1+i] = A[i*dim2+j];
1833#endif
1834
1835 /* Query how much "work" memory is required */
1836 lwork = -1;
1837#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
1838 cgesvd_( "A", "A", (veclib_int*)&m, (veclib_int*)&n, (veclib_float_complex*)h->a, (veclib_int*)&lda, h->s, (veclib_float_complex*)h->u, (veclib_int*)&ldu,
1839 (veclib_float_complex*)h->vt, &ldvt, (veclib_float_complex*)&wkopt, &lwork, h->rwork, (veclib_int*)&info );
1840#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
1841 saf_print_error("No such implementation available in ATLAS CLAPACK");
1842#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
1843 info = LAPACKE_cgesvd_work(CblasColMajor, 'A', 'A', m, n, (veclib_float_complex*)h->a, lda, h->s, (veclib_float_complex*)h->u, ldu,
1844 (veclib_float_complex*)h->vt, ldvt, (veclib_float_complex*)&wkopt, lwork, h->rwork);
1845#endif
1846 lwork = (veclib_int)(crealf(wkopt)+0.01f);
1847 if(lwork>h->currentWorkSize){
1848 h->currentWorkSize = lwork;
1849 h->work = realloc1d(h->work, h->currentWorkSize*sizeof(float_complex));
1850 }
1851
1852 /* perform the singular value decomposition */
1853#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
1854 cgesvd_( "A", "A", &m, &n, (veclib_float_complex*)h->a, &lda, h->s, (veclib_float_complex*)h->u, &ldu, (veclib_float_complex*)h->vt, &ldvt,
1855 (veclib_float_complex*)h->work, &lwork, h->rwork, &info);
1856#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
1857 saf_print_error("No such implementation available in ATLAS CLAPACK");
1858#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
1859 info = LAPACKE_cgesvd_work(CblasColMajor, 'A', 'A', m, n, (veclib_float_complex*)h->a, lda, h->s, (veclib_float_complex*)h->u, ldu, (veclib_float_complex*)h->vt, ldvt,
1860 (veclib_float_complex*)h->work, lwork, h->rwork);
1861#endif
1862
1863 /* svd failed to converge */
1864 if( info != 0 ) {
1865 if (S != NULL)
1866 memset(S, 0, dim1*dim2*sizeof(float_complex));
1867 if (U != NULL)
1868 memset(U, 0, dim1*dim1*sizeof(float_complex));
1869 if (V != NULL)
1870 memset(V, 0, dim2*dim2*sizeof(float_complex));
1871 if (sing != NULL)
1872 memset(sing, 0, SAF_MIN(dim1, dim2)*sizeof(float_complex));
1873#ifndef NDEBUG
1874 /* The SVD failed to converge, or the input matrix contained illegal
1875 * values so no solution was attempted. In these cases this function
1876 * will zero all output matrices and/or vectors. */
1877 saf_print_warning("Could not compute SVD in utility_csvd(). Output matrices/vectors have been zeroed.");
1878#endif
1879 }
1880 /* svd successful */
1881 else {
1882 if (S != NULL){
1883 /* singular values on the diagonal MIN(dim1, dim2). The remaining elements are 0. */
1884 memset(S, 0, dim1*dim2*sizeof(float_complex));
1885 cblas_scopy(SAF_MIN(dim1, dim2), h->s, 1, (float*)S, 2*(dim2+1));
1886 }
1887
1888 /*return as row-major*/
1889 if (U!=NULL){
1890#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1891 MKL_Comatcopy('R', 'T', dim1, dim1, calpha, (veclib_float_complex*)h->u, dim1, (veclib_float_complex*)U, dim1);
1892#else
1893 for(i=0; i<dim1; i++)
1894 for(j=0; j<dim1; j++)
1895 U[i*dim1+j] = h->u[j*dim1+i];
1896#endif
1897 }
1898
1899 /* lapack returns V^T, i.e. row-major V already, but we need V^H! */
1900 if (V != NULL){
1901 cblas_ccopy(dim2*dim2, h->vt, 1, V, 1);
1902 cblas_sscal(dim2*dim2, -1.0f, ((float*)V)+1, 2); /* conj */
1903 }
1904
1905 if (sing != NULL)
1906 cblas_scopy(SAF_MIN(dim1, dim2), h->s, 1, sing, 1);
1907 }
1908
1909 if(hWork == NULL)
1910 utility_csvd_destroy((void**)&h);
1911}
1912
1913
1914/* ========================================================================== */
1915/* Symmetric Eigenvalue Decomposition (?seig) */
1916/* ========================================================================== */
1917
1919typedef struct _utility_sseig_data {
1920 int maxDim;
1921 veclib_int currentWorkSize;
1922 float* w;
1923 float* a;
1924 float* work;
1926
1927void utility_sseig_create(void ** const phWork, int maxDim)
1928{
1929 *phWork = malloc1d(sizeof(utility_sseig_data));
1930 utility_sseig_data *h = (utility_sseig_data*)(*phWork);
1931
1932 h->maxDim = maxDim;
1933 h->currentWorkSize = 0;
1934 h->w = malloc1d(maxDim*sizeof(float));
1935 h->a = malloc1d(maxDim*maxDim*sizeof(float));
1936 h->work = NULL;
1937}
1938
1939void utility_sseig_destroy(void ** const phWork)
1940{
1941 utility_sseig_data *h = (utility_sseig_data*)(*phWork);
1942
1943 if(h!=NULL){
1944 free(h->w);
1945 free(h->a);
1946 free(h->work);
1947
1948 free(h);
1949 h=NULL;
1950 *phWork = NULL;
1951 }
1952}
1953
1955(
1956 void* const hWork,
1957 const float* A,
1958 const int dim,
1959 int sortDecFLAG,
1960 float* V,
1961 float* D,
1962 float* eig
1963)
1964{
1966 veclib_int i, j, n, lda, info;
1967 veclib_int lwork;
1968 float wkopt;
1969
1970 n = dim;
1971 lda = dim;
1972
1973 /* Work struct */
1974 if(hWork==NULL)
1975 utility_sseig_create((void**)&h, dim);
1976 else{
1977 h = (utility_sseig_data*)(hWork);
1978#ifndef NDEBUG
1979 saf_assert(dim<=h->maxDim, "dim exceeds the maximum length specified");
1980#endif
1981 }
1982
1983 /* store in column major order (i.e. transpose) */
1984 for(i=0; i<dim; i++)
1985 for(j=0; j<dim; j++)
1986 h->a[i*dim+j] = A[j*dim+i];
1987
1988 /* Query how much "work" memory is required */
1989 lwork = -1;
1990#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
1991 ssyev_( "Vectors", "Upper", &n, h->a, &lda, h->w, &wkopt, &lwork, &info );
1992#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
1993 saf_print_error("No such implementation available in ATLAS CLAPACK");
1994#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
1995 info = LAPACKE_ssyev_work(CblasColMajor, 'V', 'U', n, h->a, lda, h->w, &wkopt, lwork);
1996#endif
1997 lwork = (veclib_int)wkopt;
1998 if(lwork>h->currentWorkSize){
1999 h->currentWorkSize = lwork;
2000 h->work = realloc1d(h->work, h->currentWorkSize*sizeof(float));
2001 }
2002
2003 /* solve the eigenproblem */
2004#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2005 ssyev_( "Vectors", "Upper", &n, h->a, &lda, h->w, h->work, &lwork, &info );
2006#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2007 saf_print_error("No such implementation available in ATLAS CLAPACK");
2008#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2009 info = LAPACKE_ssyev_work(CblasColMajor, 'V', 'U', n, h->a, lda, h->w, h->work, lwork);
2010#endif
2011
2012 /* output */
2013 if(D!=NULL)
2014 memset(D, 0, dim*dim*sizeof(float));
2015 if( info != 0 ) {
2016 /* failed to converge and find the eigenvalues */
2017 if(V!=NULL)
2018 memset(V, 0, dim*dim*sizeof(float));
2019#ifndef NDEBUG
2020 /* Failed to compute all of the eigenvalues, some eigenvectors have not
2021 * been computed, or the input matrix contained illegal values so no
2022 * solution was attempted. In these cases the function will zero all
2023 * output matrices and vectors. */
2024 saf_print_warning("Could not compute EVD in utility_sseig(). Output matrices/vectors have been zeroed.");
2025#endif
2026 }
2027 else{
2028 if(sortDecFLAG){
2029 for(i=0; i<dim; i++){
2030 if(V!=NULL)
2031 for(j=0; j<dim; j++)
2032 V[i*dim+j] = h->a[(dim-j-1)*dim+i]; /* transpose, back to row-major and reverse order */
2033 if(D!=NULL)
2034 D[i*dim+i] = h->w[dim-i-1]; /* store along the diagonal, reversing the order */
2035 if(eig!=NULL)
2036 eig[i] = h->w[dim-i-1];
2037 }
2038 }
2039 else{
2040 for(i=0; i<dim; i++){
2041 if(V!=NULL)
2042 for(j=0; j<dim; j++)
2043 V[i*dim+j] = h->a[j*dim+i]; /* transpose, back to row-major */
2044 if(D!=NULL)
2045 D[i*dim+i] = h->w[i]; /* store along the diagonal */
2046 if(eig!=NULL)
2047 eig[i] = h->w[i];
2048 }
2049 }
2050 }
2051
2052 if(hWork == NULL)
2053 utility_sseig_destroy((void**)&h);
2054}
2055
2057typedef struct _utility_cseig_data {
2058 int maxDim;
2059 veclib_int currentWorkSize;
2060 float* rwork;
2061 float* w;
2062 float_complex* a;
2063 float_complex* work;
2065
2066void utility_cseig_create(void ** const phWork, int maxDim)
2067{
2068 *phWork = malloc1d(sizeof(utility_cseig_data));
2069 utility_cseig_data *h = (utility_cseig_data*)(*phWork);
2070
2071 h->maxDim = maxDim;
2072 h->currentWorkSize = SAF_MAX(1, 2*maxDim-1);
2073 h->rwork = malloc1d((3*maxDim-2)*sizeof(float));
2074 h->w = malloc1d(maxDim*sizeof(float));
2075 h->a = malloc1d(maxDim*maxDim*sizeof(float_complex));
2076 h->work = malloc1d(h->currentWorkSize*sizeof(float_complex));
2077}
2078
2079void utility_cseig_destroy(void ** const phWork)
2080{
2081 utility_cseig_data *h = (utility_cseig_data*)(*phWork);
2082
2083 if(h!=NULL){
2084 free(h->rwork);
2085 free(h->w);
2086 free(h->a);
2087 free(h->work);
2088
2089 free(h);
2090 h=NULL;
2091 *phWork = NULL;
2092 }
2093}
2094
2096(
2097 void* const hWork,
2098 const float_complex* A,
2099 const int dim,
2100 int sortDecFLAG,
2101 float_complex* V,
2102 float_complex* D,
2103 float* eig
2104)
2105{
2107 veclib_int i, n, lda, info;
2108 veclib_int lwork;
2109 float_complex wkopt;
2110#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
2111 const MKL_Complex8 calpha = {1.0f, 0.0f};
2112#else
2113 int j;
2114#endif
2115
2116 n = dim;
2117 lda = dim;
2118
2119 /* Work struct */
2120 if(hWork==NULL)
2121 utility_cseig_create((void**)&h, dim);
2122 else{
2123 h = (utility_cseig_data*)(hWork);
2124#ifndef NDEBUG
2125 saf_assert(dim<=h->maxDim, "dim exceeds the maximum length specified");
2126#endif
2127 }
2128
2129 /* store in column major order (i.e. transpose) */
2130#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
2131 MKL_Comatcopy('R', 'T', dim, dim, calpha, (veclib_float_complex*)A, dim, (veclib_float_complex*)h->a, dim);
2132#else
2133 for(i=0; i<dim; i++)
2134 for(j=0; j<dim; j++)
2135 h->a[i*dim+j] = A[j*dim+i];
2136#endif
2137
2138 /* Query how much "work" memory is required */
2139 lwork = -1;
2140#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2141 cheev_( "Vectors", "Upper", &n, (veclib_float_complex*)h->a, &lda, h->w, (veclib_float_complex*)&wkopt, &lwork, h->rwork, &info );
2142#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2143 saf_print_error("No such implementation available in ATLAS CLAPACK");
2144#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2145 info = LAPACKE_cheev_work(CblasColMajor, 'V', 'U', n, (veclib_float_complex*)h->a, lda, h->w, (veclib_float_complex*)&wkopt, lwork, h->rwork);
2146#endif
2147 lwork = (veclib_int)crealf(wkopt);
2148 if(lwork>h->currentWorkSize){
2149 h->currentWorkSize = lwork;
2150 h->work = realloc1d(h->work, h->currentWorkSize*sizeof(float_complex));
2151 }
2152
2153 /* solve the eigenproblem */
2154#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2155 cheev_( "Vectors", "Upper", &n, (veclib_float_complex*)h->a, &lda, h->w, (veclib_float_complex*)h->work, &lwork, h->rwork, &info );
2156#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2157 saf_print_error("No such implementation available in ATLAS CLAPACK");
2158#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2159 info = LAPACKE_cheev_work(CblasColMajor, 'V', 'U', n, (veclib_float_complex*)h->a, lda, h->w, (veclib_float_complex*)h->work, lwork, h->rwork);
2160#endif
2161
2162 /* output */
2163 if(D!=NULL)
2164 memset(D, 0, dim*dim*sizeof(float_complex));
2165 if( info != 0 ) {
2166 /* failed to converge and find the eigenvalues */
2167 if(V!=NULL)
2168 memset(V, 0, dim*dim*sizeof(float_complex));
2169#ifndef NDEBUG
2170 /* Failed to compute all of the eigenvalues, some eigenvectors have not
2171 * been computed, or the input matrix contained illegal values so no
2172 * solution was attempted. In these cases the function will zero all
2173 * output matrices and vectors. */
2174 saf_print_warning("Could not compute EVD in utility_cseig(). Output matrices/vectors have been zeroed.");
2175#endif
2176 }
2177
2178 /* transpose, back to row-major and reverse order if sortDecFlag==1 */
2179 else{
2180 if(sortDecFLAG && V!=NULL)
2181 for(i=0; i<(int)((float)dim/2.0f); i++)
2182 cblas_cswap(dim, &h->a[i*dim], 1, &h->a[(dim-i-1)*dim], 1);
2183
2184 if(V!=NULL){
2185#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
2186 MKL_Comatcopy('R', 'T', dim, dim, calpha, (veclib_float_complex*)h->a, dim, (veclib_float_complex*)V, dim);
2187#else
2188 for(i=0; i<dim; i++)
2189 for(j=0; j<dim; j++)
2190 V[i*dim+j] = h->a[j*dim+i];
2191#endif
2192 }
2193
2194 if(sortDecFLAG){
2195 for(i=0; i<dim; i++) {
2196 if(D!=NULL)
2197 D[i*dim+i] = cmplxf(h->w[dim-i-1], 0.0f); /* store along the diagonal, reversing the order */
2198 if(eig!=NULL)
2199 eig[i] = h->w[dim-i-1];
2200 }
2201 }
2202 else {
2203 for(i=0; i<dim; i++){
2204 if(D!=NULL)
2205 D[i*dim+i] = cmplxf(h->w[i], 0.0f); /* store along the diagonal */
2206 if(eig!=NULL)
2207 eig[i] = h->w[i];
2208 }
2209 }
2210 }
2211
2212 if(hWork == NULL)
2213 utility_cseig_destroy((void**)&h);
2214}
2215
2216
2217/* ========================================================================== */
2218/* Eigenvalues of Matrix Pair (?eigmp) */
2219/* ========================================================================== */
2220
2222typedef struct _utility_ceigmp_data {
2223 int maxDim;
2224 veclib_int currentWorkSize;
2225 float_complex* a, *b, *vl, *vr, *alpha, *beta;
2226 float* rwork;
2227 float_complex* work;
2229
2230void utility_ceigmp_create(void** const phWork, int maxDim)
2231{
2232 *phWork = malloc1d(sizeof(utility_ceigmp_data));
2234
2235 h->maxDim = maxDim;
2236 h->currentWorkSize = 4*maxDim;
2237 h->rwork = malloc1d(4*(h->currentWorkSize)*sizeof(float));
2238 h->a = malloc1d(maxDim*maxDim*sizeof(float_complex));
2239 h->b = malloc1d(maxDim*maxDim*sizeof(float_complex));
2240 h->vl = malloc1d(maxDim*maxDim*sizeof(float_complex));
2241 h->vr = malloc1d(maxDim*maxDim*sizeof(float_complex));
2242 h->alpha = malloc1d(maxDim*sizeof(float_complex));
2243 h->beta = malloc1d(maxDim*sizeof(float_complex));
2244 h->work = malloc1d(h->currentWorkSize*sizeof(float_complex));
2245}
2246
2247void utility_ceigmp_destroy(void ** const phWork)
2248{
2250
2251 if(h!=NULL){
2252 free(h->rwork);
2253 free(h->a);
2254 free(h->b);
2255 free(h->vl);
2256 free(h->vr);
2257 free(h->alpha);
2258 free(h->beta);
2259 free(h->work);
2260
2261 free(h);
2262 h=NULL;
2263 *phWork = NULL;
2264 }
2265}
2266
2268(
2269 void* const hWork,
2270 const float_complex* A,
2271 const float_complex* B,
2272 const int dim,
2273 float_complex* VL,
2274 float_complex* VR,
2275 float_complex* D
2276)
2277{
2279 veclib_int i, j;
2280 veclib_int n, lda, ldb, ldvl, ldvr, info;
2281 veclib_int lwork;
2282
2283 n = lda = ldb = ldvl = ldvr = dim;
2284
2285 /* Work struct */
2286 if(hWork==NULL)
2287 utility_ceigmp_create((void**)&h, dim);
2288 else{
2289 h = (utility_ceigmp_data*)(hWork);
2290#ifndef NDEBUG
2291 saf_assert(dim<=h->maxDim, "dim exceeds the maximum length specified");
2292#endif
2293 }
2294
2295 /* store in column major order */
2296 for(i=0; i<dim; i++)
2297 for(j=0; j<dim; j++)
2298 h->a[j*dim+i] = A[i*dim+j];
2299 for(i=0; i<dim; i++)
2300 for(j=0; j<dim; j++)
2301 h->b[j*dim+i] = B[i*dim+j];
2302
2303 /* solve eigen problem */
2304 lwork = h->currentWorkSize;
2305#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2306 cggev_("V", "V", &n, (veclib_float_complex*)h->a, &lda, (veclib_float_complex*)h->b, &ldb, (veclib_float_complex*)h->alpha, (veclib_float_complex*)h->beta,
2307 (veclib_float_complex*)h->vl, &ldvl, (veclib_float_complex*)h->vr, &ldvr, (veclib_float_complex*)h->work, &lwork, h->rwork, &info);
2308#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2309 saf_print_error("No such implementation available in ATLAS CLAPACK");
2310#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2311 info = LAPACKE_cggev_work(CblasColMajor, 'V', 'V', n, (veclib_float_complex*)h->a, lda, (veclib_float_complex*)h->b, ldb, (veclib_float_complex*)h->alpha, (veclib_float_complex*)h->beta,
2312 (veclib_float_complex*)h->vl, ldvl, (veclib_float_complex*)h->vr, ldvr, (veclib_float_complex*)h->work, lwork, h->rwork);
2313#endif
2314
2315 if(D!=NULL)
2316 memset(D, 0, dim*dim*sizeof(float_complex));
2317
2318 /* failed to converge and find the eigenvalues */
2319 if( info != 0 ) {
2320 if(VL!=NULL)
2321 memset(VL, 0, dim*dim*sizeof(float_complex));
2322 if(VR!=NULL)
2323 memset(VR, 0, dim*dim*sizeof(float_complex));
2324#ifndef NDEBUG
2325 /* Failed to compute all of the eigenvalues, some eigenvectors have not
2326 * been computed, or the input matrix contained illegal values so no
2327 * solution was attempted. In these cases the function will zero all
2328 * output matrices and vectors. */
2329 saf_print_warning("Could not compute EVD in utility_ceigmp(). Output matrices/vectors have been zeroed.");
2330#endif
2331 }
2332 /* transpose, back to row-major */
2333 else{
2334 if(D!=NULL)
2335 for(i=0; i<dim; i++)
2336 D[i*dim+i] = ccdivf(h->alpha[i],h->beta[i]);
2337
2338 if(VL!=NULL)
2339 for(i=0; i<dim; i++)
2340 for(j=0; j<dim; j++)
2341 VL[i*dim+j] = h->vl[j*dim+i];
2342 if(VR!=NULL)
2343 for(i=0; i<dim; i++)
2344 for(j=0; j<dim; j++)
2345 VR[i*dim+j] = h->vr[j*dim+i];
2346 }
2347
2348 if(hWork == NULL)
2349 utility_ceigmp_destroy((void**)&h);
2350}
2351
2353typedef struct _utility_zeigmp_data {
2354 int maxDim;
2355 veclib_int currentWorkSize;
2356 double_complex* a, *b, *vl, *vr, *alpha, *beta;
2357 double* rwork;
2358 double_complex* work;
2360
2361void utility_zeigmp_create(void** const phWork, int maxDim)
2362{
2363 *phWork = malloc1d(sizeof(utility_zeigmp_data));
2365
2366 h->maxDim = maxDim;
2367 h->currentWorkSize = 4*maxDim;
2368 h->rwork = malloc1d(4*(h->currentWorkSize)*sizeof(double));
2369 h->a = malloc1d(maxDim*maxDim*sizeof(double_complex));
2370 h->b = malloc1d(maxDim*maxDim*sizeof(double_complex));
2371 h->vl = malloc1d(maxDim*maxDim*sizeof(double_complex));
2372 h->vr = malloc1d(maxDim*maxDim*sizeof(double_complex));
2373 h->alpha = malloc1d(maxDim*sizeof(double_complex));
2374 h->beta = malloc1d(maxDim*sizeof(double_complex));
2375 h->work = malloc1d(h->currentWorkSize*sizeof(double_complex));
2376}
2377
2378void utility_zeigmp_destroy(void ** const phWork)
2379{
2381
2382 if(h!=NULL){
2383 free(h->rwork);
2384 free(h->a);
2385 free(h->b);
2386 free(h->vl);
2387 free(h->vr);
2388 free(h->alpha);
2389 free(h->beta);
2390 free(h->work);
2391
2392 free(h);
2393 h=NULL;
2394 *phWork = NULL;
2395 }
2396}
2397
2399(
2400 void* const hWork,
2401 const double_complex* A,
2402 const double_complex* B,
2403 const int dim,
2404 double_complex* VL,
2405 double_complex* VR,
2406 double_complex* D
2407)
2408{
2410 veclib_int i, j;
2411 veclib_int n, lda, ldb, ldvl, ldvr, info;
2412 veclib_int lwork;
2413
2414 n = lda = ldb = ldvl = ldvr = dim;
2415
2416 /* Work struct */
2417 if(hWork==NULL)
2418 utility_zeigmp_create((void**)&h, dim);
2419 else{
2420 h = (utility_zeigmp_data*)(hWork);
2421#ifndef NDEBUG
2422 saf_assert(dim<=h->maxDim, "dim exceeds the maximum length specified");
2423#endif
2424 }
2425
2426 /* store in column major order */
2427 for(i=0; i<dim; i++)
2428 for(j=0; j<dim; j++)
2429 h->a[j*dim+i] = A[i*dim+j];
2430 for(i=0; i<dim; i++)
2431 for(j=0; j<dim; j++)
2432 h->b[j*dim+i] = B[i*dim+j]; /* store in column major order */
2433
2434 /* solve eigen problem */
2435 lwork = h->currentWorkSize;
2436#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2437 zggev_("V", "V", &n, (veclib_double_complex*)h->a, &lda, (veclib_double_complex*)h->b, &ldb, (veclib_double_complex*)h->alpha, (veclib_double_complex*)h->beta,
2438 (veclib_double_complex*)h->vl, &ldvl, (veclib_double_complex*)h->vr, &ldvr, (veclib_double_complex*)h->work, &lwork, h->rwork, &info);
2439#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2440 saf_print_error("No such implementation available in ATLAS CLAPACK");
2441#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2442 info = LAPACKE_zggev_work(CblasColMajor, 'V', 'V', n, (veclib_double_complex*)h->a, lda, (veclib_double_complex*)h->b, ldb, (veclib_double_complex*)h->alpha, (veclib_double_complex*)h->beta,
2443 (veclib_double_complex*)h->vl, ldvl, (veclib_double_complex*)h->vr, ldvr, (veclib_double_complex*)h->work, lwork, h->rwork);
2444#endif
2445
2446 if(D!=NULL)
2447 memset(D, 0, dim*dim*sizeof(double_complex));
2448
2449 /* failed to converge and find the eigenvalues */
2450 if( info != 0 ) {
2451 if(VL!=NULL)
2452 memset(VL, 0, dim*dim*sizeof(double_complex));
2453 if(VR!=NULL)
2454 memset(VR, 0, dim*dim*sizeof(double_complex));
2455#ifndef NDEBUG
2456 /* Failed to compute all of the eigenvalues, some eigenvectors have not
2457 * been computed, or the input matrix contained illegal values so no
2458 * solution was attempted. In these cases the function will zero all
2459 * output matrices and vectors. */
2460 saf_print_warning("Could not compute EVD in utility_zeigmp(). Output matrices/vectors have been zeroed.");
2461#endif
2462 }
2463 /* transpose, back to row-major */
2464 else{
2465 if(D!=NULL)
2466 for(i=0; i<dim; i++)
2467 D[i*dim+i] = ccdiv(h->alpha[i],h->beta[i]);
2468
2469 if(VL!=NULL)
2470 for(i=0; i<dim; i++)
2471 for(j=0; j<dim; j++)
2472 VL[i*dim+j] = h->vl[j*dim+i];
2473 if(VR!=NULL)
2474 for(i=0; i<dim; i++)
2475 for(j=0; j<dim; j++)
2476 VR[i*dim+j] = h->vr[j*dim+i];
2477 }
2478
2479 if(hWork == NULL)
2480 utility_zeigmp_destroy((void**)&h);
2481}
2482
2483
2484/* ========================================================================== */
2485/* Eigenvalue Decomposition (?eig) */
2486/* ========================================================================== */
2487
2489typedef struct _utility_ceig_data {
2490 int maxDim;
2491 veclib_int currentWorkSize;
2492 float_complex *w, *vl, *vr, *a;
2493 float* rwork;
2494 float_complex* work;
2496
2497void utility_ceig_create(void ** const phWork, int maxDim)
2498{
2499 *phWork = malloc1d(sizeof(utility_ceig_data));
2500 utility_ceig_data *h = (utility_ceig_data*)(*phWork);
2501
2502 h->maxDim = maxDim;
2503 h->currentWorkSize = 0;
2504 h->rwork = malloc1d(4*maxDim*sizeof(float));
2505 h->w = malloc1d(maxDim*sizeof(float_complex));
2506 h->vl = malloc1d(maxDim*maxDim*sizeof(float_complex));
2507 h->vr = malloc1d(maxDim*maxDim*sizeof(float_complex));
2508 h->a = malloc1d(maxDim*maxDim*sizeof(float_complex));
2509 h->work = NULL;
2510}
2511
2512void utility_ceig_destroy(void ** const phWork)
2513{
2514 utility_ceig_data *h = (utility_ceig_data*)(*phWork);
2515
2516 if(h!=NULL){
2517 free(h->rwork);
2518 free(h->w);
2519 free(h->vl);
2520 free(h->vr);
2521 free(h->a);
2522 free(h->work);
2523
2524 free(h);
2525 h=NULL;
2526 *phWork = NULL;
2527 }
2528}
2529
2531(
2532 void* const hWork,
2533 const float_complex* A,
2534 const int dim,
2535 float_complex* VL,
2536 float_complex* VR,
2537 float_complex* D,
2538 float_complex* eig
2539)
2540{
2542 veclib_int i, j, n, lda, ldvl, ldvr, info;
2543 veclib_int lwork;
2544 float_complex wkopt;
2545
2546 n = lda = ldvl = ldvr = dim;
2547
2548 /* Work struct */
2549 if(hWork==NULL)
2550 utility_ceig_create((void**)&h, dim);
2551 else{
2552 h = (utility_ceig_data*)(hWork);
2553#ifndef NDEBUG
2554 saf_assert(dim<=h->maxDim, "dim exceeds the maximum length specified");
2555#endif
2556 }
2557
2558 /* store in column major order (i.e. transpose) */
2559 for(i=0; i<dim; i++)
2560 for(j=0; j<dim; j++)
2561 h->a[i*dim+j] = A[j*dim+i];
2562
2563 /* Query how much "work" memory is required */
2564 lwork = -1;
2565#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2566 cgeev_( "Vectors", "Vectors", &n, (veclib_float_complex*)h->a, &lda, (veclib_float_complex*)h->w, (veclib_float_complex*)h->vl, &ldvl,
2567 (veclib_float_complex*)h->vr, &ldvr, (veclib_float_complex*)&wkopt, &lwork, h->rwork, &info );
2568#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2569 saf_print_error("No such implementation available in ATLAS CLAPACK");
2570#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2571 info = LAPACKE_cgeev_work(CblasColMajor, 'V', 'V', n, (veclib_float_complex*)h->a, lda, (veclib_float_complex*)h->w, (veclib_float_complex*)h->vl, ldvl,
2572 (veclib_float_complex*)h->vr, ldvr, (veclib_float_complex*)&wkopt, lwork, h->rwork);
2573#endif
2574 lwork = (veclib_int)crealf(wkopt);
2575 if(lwork>h->currentWorkSize){
2576 h->currentWorkSize = lwork;
2577 h->work = realloc1d(h->work, h->currentWorkSize*sizeof(float_complex));
2578 }
2579
2580 /* solve the eigenproblem */
2581#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2582 cgeev_( "Vectors", "Vectors", &n, (veclib_float_complex*)h->a, &lda, (veclib_float_complex*)h->w, (veclib_float_complex*)h->vl, &ldvl,
2583 (veclib_float_complex*)h->vr, &ldvr, (veclib_float_complex*)h->work, &lwork, h->rwork, &info );
2584#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2585 saf_print_error("No such implementation available in ATLAS CLAPACK");
2586#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2587 info = LAPACKE_cgeev_work(CblasColMajor, 'V', 'V', n, (veclib_float_complex*)h->a, lda, (veclib_float_complex*)h->w, (veclib_float_complex*)h->vl, ldvl,
2588 (veclib_float_complex*)h->vr, ldvr, (veclib_float_complex*)h->work, lwork, h->rwork);
2589#endif
2590
2591 /* output */
2592 if(D!=NULL)
2593 memset(D, 0, dim*dim*sizeof(float_complex));
2594
2595 /* failed to converge and find the eigenvalues */
2596 if( info != 0 ) {
2597 if(VL!=NULL)
2598 memset(VL, 0, dim*dim*sizeof(float_complex));
2599 if(VR!=NULL)
2600 memset(VR, 0, dim*dim*sizeof(float_complex));
2601 if(eig!=NULL)
2602 memset(eig, 0, dim*sizeof(float_complex));
2603#ifndef NDEBUG
2604 /* Failed to compute all of the eigenvalues, some eigenvectors have not
2605 * been computed, or the input matrix contained illegal values so no
2606 * solution was attempted. In these cases the function will zero all
2607 * output matrices and vectors. */
2608 saf_print_warning("Could not compute EVD in utility_ceig(). Output matrices/vectors have been zeroed.");
2609#endif
2610 }
2611 /* transpose, back to row-major */
2612 else{
2613 for(i=0; i<dim; i++){
2614 if(VL!=NULL)
2615 for(j=0; j<dim; j++)
2616 VL[i*dim+j] = h->vl[j*dim+i];
2617 if(VR!=NULL)
2618 for(j=0; j<dim; j++)
2619 VR[i*dim+j] = h->vr[j*dim+i];
2620 if(D!=NULL)
2621 D[i*dim+i] = h->w[i]; /* store along the diagonal */
2622 if(eig!=NULL)
2623 eig[i] = h->w[i];
2624 }
2625 }
2626
2627 if(hWork == NULL)
2628 utility_ceig_destroy((void**)&h);
2629}
2630
2632typedef struct _utility_zeig_data {
2633 int maxDim;
2634 veclib_int currentWorkSize;
2635 double_complex *w, *vl, *vr, *a;
2636 double* rwork;
2637 double_complex* work;
2639
2640void utility_zeig_create(void ** const phWork, int maxDim)
2641{
2642 *phWork = malloc1d(sizeof(utility_zeig_data));
2643 utility_zeig_data *h = (utility_zeig_data*)(*phWork);
2644
2645 h->maxDim = maxDim;
2646 h->currentWorkSize = 0;
2647 h->rwork = malloc1d(4*maxDim*sizeof(double));
2648 h->w = malloc1d(maxDim*sizeof(double_complex));
2649 h->vl = malloc1d(maxDim*maxDim*sizeof(double_complex));
2650 h->vr = malloc1d(maxDim*maxDim*sizeof(double_complex));
2651 h->a = malloc1d(maxDim*maxDim*sizeof(double_complex));
2652 h->work = NULL;
2653}
2654
2655void utility_zeig_destroy(void ** const phWork)
2656{
2657 utility_zeig_data *h = (utility_zeig_data*)(*phWork);
2658
2659 if(h!=NULL){
2660 free(h->rwork);
2661 free(h->w);
2662 free(h->vl);
2663 free(h->vr);
2664 free(h->a);
2665 free(h->work);
2666
2667 free(h);
2668 h=NULL;
2669 *phWork = NULL;
2670 }
2671}
2672
2674(
2675 void* const hWork,
2676 const double_complex* A,
2677 const int dim,
2678 double_complex* VL,
2679 double_complex* VR,
2680 double_complex* D,
2681 double_complex* eig
2682)
2683{
2685 veclib_int i, j, n, lda, ldvl, ldvr, info;
2686 veclib_int lwork;
2687 double_complex wkopt;
2688
2689 n = lda = ldvl = ldvr = dim;
2690
2691 /* Work struct */
2692 if(hWork==NULL)
2693 utility_zeig_create((void**)&h, dim);
2694 else{
2695 h = (utility_zeig_data*)(hWork);
2696#ifndef NDEBUG
2697 saf_assert(dim<=h->maxDim, "dim exceeds the maximum length specified");
2698#endif
2699 }
2700
2701 /* store in column major order (i.e. transpose) */
2702 for(i=0; i<dim; i++)
2703 for(j=0; j<dim; j++)
2704 h->a[i*dim+j] = A[j*dim+i];
2705
2706 /* Query how much "work" memory is required */
2707 lwork = -1;
2708#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2709 zgeev_( "Vectors", "Vectors", &n, (veclib_double_complex*)h->a, &lda, (veclib_double_complex*)h->w, (veclib_double_complex*)h->vl, &ldvl,
2710 (veclib_double_complex*)h->vr, &ldvr, (veclib_double_complex*)&wkopt, &lwork, h->rwork, &info );
2711#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2712 saf_print_error("No such implementation available in ATLAS CLAPACK");
2713#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2714 info = LAPACKE_zgeev_work(CblasColMajor, 'V', 'V', n, (veclib_double_complex*)h->a, lda, (veclib_double_complex*)h->w, (veclib_double_complex*)h->vl, ldvl,
2715 (veclib_double_complex*)h->vr, ldvr, (veclib_double_complex*)&wkopt, lwork, h->rwork);
2716#endif
2717 lwork = (veclib_int)creal(wkopt);
2718 if(lwork>h->currentWorkSize){
2719 h->currentWorkSize = lwork;
2720 h->work = realloc1d(h->work, h->currentWorkSize*sizeof(double_complex));
2721 }
2722
2723 /* solve the eigenproblem */
2724#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2725 zgeev_( "Vectors", "Vectors", &n, (veclib_double_complex*)h->a, &lda, (veclib_double_complex*)h->w, (veclib_double_complex*)h->vl, &ldvl,
2726 (veclib_double_complex*)h->vr, &ldvr, (veclib_double_complex*)h->work, &lwork, h->rwork, &info );
2727#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2728 saf_print_error("No such implementation available in ATLAS CLAPACK");
2729#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2730 info = LAPACKE_zgeev_work(CblasColMajor, 'V', 'V', n, (veclib_double_complex*)h->a, lda, (veclib_double_complex*)h->w, (veclib_double_complex*)h->vl, ldvl,
2731 (veclib_double_complex*)h->vr, ldvr, (veclib_double_complex*)h->work, lwork, h->rwork);
2732#endif
2733
2734 /* output */
2735 if(D!=NULL)
2736 memset(D, 0, dim*dim*sizeof(double_complex));
2737
2738 /* failed to converge and find the eigenvalues */
2739 if( info != 0 ) {
2740 if(VL!=NULL)
2741 memset(VL, 0, dim*dim*sizeof(double_complex));
2742 if(VR!=NULL)
2743 memset(VR, 0, dim*dim*sizeof(double_complex));
2744 if(eig!=NULL)
2745 memset(eig, 0, dim*sizeof(double_complex));
2746#ifndef NDEBUG
2747 /* Failed to compute all of the eigenvalues, some eigenvectors have not
2748 * been computed, or the input matrix contained illegal values so no
2749 * solution was attempted. In these cases the function will zero all
2750 * output matrices and vectors. */
2751 saf_print_warning("Could not compute EVD in utility_zeig(). Output matrices/vectors have been zeroed.");
2752#endif
2753 }
2754 /* transpose, back to row-major */
2755 else{
2756 for(i=0; i<dim; i++){
2757 if(VL!=NULL)
2758 for(j=0; j<dim; j++)
2759 VL[i*dim+j] = h->vl[j*dim+i];
2760 if(VR!=NULL)
2761 for(j=0; j<dim; j++)
2762 VR[i*dim+j] = h->vr[j*dim+i];
2763 if(D!=NULL)
2764 D[i*dim+i] = h->w[i]; /* store along the diagonal */
2765 if(eig!=NULL)
2766 eig[i] = h->w[i];
2767 }
2768 }
2769
2770 if(hWork == NULL)
2771 utility_zeig_destroy((void**)&h);
2772}
2773
2774
2775/* ========================================================================== */
2776/* General Linear Solver (?glslv) */
2777/* ========================================================================== */
2778
2780typedef struct _utility_sglslv_data {
2781 int maxDim;
2782 int maxNCol;
2783 veclib_int* IPIV;
2784 float* a;
2785 float* b;
2787
2788void utility_sglslv_create(void ** const phWork, int maxDim, int maxNCol)
2789{
2790 *phWork = malloc1d(sizeof(utility_sglslv_data));
2792
2793 h->maxDim = maxDim;
2794 h->maxNCol = maxNCol;
2795
2796 h->IPIV = malloc1d(maxDim*sizeof(veclib_int));
2797 h->a = malloc1d(maxDim*maxDim*sizeof(float));
2798 h->b = malloc1d(maxDim*maxNCol*sizeof(float));
2799}
2800
2801void utility_sglslv_destroy(void ** const phWork)
2802{
2804
2805 if(h!=NULL){
2806 free(h->IPIV);
2807 free(h->a);
2808 free(h->b);
2809
2810 free(h);
2811 h=NULL;
2812 *phWork = NULL;
2813 }
2814}
2815
2817(
2818 void* const hWork,
2819 const float* A,
2820 const int dim,
2821 float* B,
2822 int nCol,
2823 float* X
2824)
2825{
2827 veclib_int n = dim, nrhs = nCol, lda = dim, ldb = dim, info;
2828#if !defined(SAF_VECLIB_USE_LAPACKE_INTERFACE) && !(defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64))
2829 veclib_int i, j;
2830#endif
2831
2832 /* Work struct */
2833 if(hWork==NULL)
2834 utility_sglslv_create((void**)&h, dim, nCol);
2835 else {
2836 h = (utility_sglslv_data*)(hWork);
2837#ifndef NDEBUG
2838 saf_assert(dim<=h->maxDim, "dim exceeds the maximum length specified");
2839 saf_assert(nCol<=h->maxNCol, "nCol exceeds the maximum length specified");
2840#endif
2841 }
2842
2843#if defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2844 /* Copy locally */
2845 cblas_scopy(dim*dim, A, 1, h->a, 1);
2846 cblas_scopy(dim*nCol, B, 1, h->b, 1);
2847#else
2848 /* store in column major order */
2849# if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
2850 MKL_Somatcopy('R', 'T', dim, dim, 1.0f, A, dim, h->a, dim);
2851 MKL_Somatcopy('R', 'T', dim, nCol, 1.0f, B, nCol, h->b, dim);
2852# else
2853 for(i=0; i<dim; i++)
2854 for(j=0; j<dim; j++)
2855 h->a[i*dim+j] = A[j*dim+i];
2856 for(i=0; i<dim; i++)
2857 for(j=0; j<nCol; j++)
2858 h->b[j*dim+i] = B[i*nCol+j];
2859# endif
2860#endif
2861
2862 /* solve Ax = b for each column in b (b is replaced by the solution: x) */
2863#ifdef SAF_VECLIB_USE_CLAPACK_INTERFACE
2864 info = clapack_sgesv(CblasColMajor, n, nrhs, h->a, lda, h->IPIV, h->b, ldb);
2865#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2866 info = LAPACKE_sgesv_work(CblasRowMajor, n, nrhs, h->a, lda, h->IPIV, h->b, ldb);
2867#elif defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2868 sgesv_( &n, &nrhs, h->a, &lda, h->IPIV, h->b, &ldb, &info );
2869#endif
2870
2871 if(info!=0){
2872 /* A is singular, solution not possible */
2873 memset(X, 0, dim*nCol*sizeof(float));
2874#ifndef NDEBUG
2875 /* Input matrix was singular, solution was unable to be computed, or the
2876 * input matrix contained illegal values so no solution was attempted.
2877 * In these cases the function will zero the output matrix. */
2878 saf_print_warning("Could not solve the linear equation in utility_sglslv(). Output matrices/vectors have been zeroed.");
2879#endif
2880 }
2881 else{
2882#if defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2883 cblas_scopy(dim*nCol, h->b, 1, X, 1);
2884#else
2885 /* store solution in row-major order */
2886# if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
2887 MKL_Somatcopy('R', 'T', nCol, dim, 1.0f, h->b, dim, X, nCol);
2888# else
2889 for(i=0; i<dim; i++)
2890 for(j=0; j<nCol; j++)
2891 X[i*nCol+j] = h->b[j*dim+i];
2892# endif
2893#endif
2894 }
2895
2896 if(hWork == NULL)
2897 utility_sglslv_destroy((void**)&h);
2898}
2899
2901typedef struct _utility_cglslv_data {
2902 int maxDim;
2903 int maxNCol;
2904 veclib_int* IPIV;
2905 float_complex* a;
2906 float_complex* b;
2908
2909void utility_cglslv_create(void ** const phWork, int maxDim, int maxNCol)
2910{
2911 *phWork = malloc1d(sizeof(utility_cglslv_data));
2913
2914 h->maxDim = maxDim;
2915 h->maxNCol = maxNCol;
2916
2917 h->IPIV = malloc1d(maxDim*sizeof(veclib_int));
2918 h->a = malloc1d(maxDim*maxDim*sizeof(float_complex));
2919 h->b = malloc1d(maxDim*maxNCol*sizeof(float_complex));
2920}
2921
2922void utility_cglslv_destroy(void ** const phWork)
2923{
2925
2926 if(h!=NULL){
2927 free(h->IPIV);
2928 free(h->a);
2929 free(h->b);
2930
2931 free(h);
2932 h=NULL;
2933 *phWork = NULL;
2934 }
2935}
2936
2938(
2939 void* const hWork,
2940 const float_complex* A,
2941 const int dim,
2942 float_complex* B,
2943 int nCol,
2944 float_complex* X
2945)
2946{
2948 veclib_int i, j, n, nrhs, lda, ldb, info;
2949
2950 n = dim;
2951 nrhs = nCol;
2952 lda = dim;
2953 ldb = dim;
2954
2955 /* Work struct */
2956 if(hWork==NULL)
2957 utility_cglslv_create((void**)&h, dim, nCol);
2958 else {
2959 h = (utility_cglslv_data*)(hWork);
2960#ifndef NDEBUG
2961 saf_assert(dim<=h->maxDim, "dim exceeds the maximum length specified");
2962 saf_assert(nCol<=h->maxNCol, "nCol exceeds the maximum length specified");
2963#endif
2964 }
2965
2966 /* store in column major order */
2967 for(i=0; i<dim; i++)
2968 for(j=0; j<dim; j++)
2969 h->a[j*dim+i] = A[i*dim+j];
2970 for(i=0; i<dim; i++)
2971 for(j=0; j<nCol; j++)
2972 h->b[j*dim+i] = B[i*nCol+j];
2973
2974 /* solve Ax = b for each column in b (b is replaced by the solution: x) */
2975#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2976 cgesv_( &n, &nrhs, (veclib_float_complex*)h->a, &lda, h->IPIV, (veclib_float_complex*)h->b, &ldb, &info );
2977#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2978 info = clapack_cgesv(CblasColMajor, n, nrhs, (veclib_float_complex*)h->a, lda, h->IPIV, (veclib_float_complex*)h->b, ldb);
2979#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2980 info = LAPACKE_cgesv_work(CblasColMajor, n, nrhs, (veclib_float_complex*)h->a, lda, h->IPIV, (veclib_float_complex*)h->b, ldb);
2981#endif
2982
2983 /* A is singular, solution not possible */
2984 if(info!=0){
2985 memset(X, 0, dim*nCol*sizeof(float_complex));
2986#ifndef NDEBUG
2987 /* Input matrix was singular, solution was unable to be computed, or the
2988 * input matrix contained illegal values so no solution was attempted.
2989 * In these cases the function will zero the output matrix. */
2990 saf_print_warning("Could not solve the linear equation in utility_cglslv(). Output matrices/vectors have been zeroed.");
2991#endif
2992 }
2993 /* store solution in row-major order */
2994 else{
2995 for(i=0; i<dim; i++)
2996 for(j=0; j<nCol; j++)
2997 X[i*nCol+j] = h->b[j*dim+i];
2998 }
2999
3000 if(hWork == NULL)
3001 utility_cglslv_destroy((void**)&h);
3002}
3003
3005typedef struct _utility_dglslv_data {
3006 int maxDim;
3007 int maxNCol;
3008 veclib_int* IPIV;
3009 double* a;
3010 double* b;
3012
3013void utility_dglslv_create(void ** const phWork, int maxDim, int maxNCol)
3014{
3015 *phWork = malloc1d(sizeof(utility_dglslv_data));
3017
3018 h->maxDim = maxDim;
3019 h->maxNCol = maxNCol;
3020 h->IPIV = malloc1d(maxDim*sizeof(veclib_int));
3021 h->a = malloc1d(maxDim*maxDim*sizeof(double));
3022 h->b = malloc1d(maxDim*maxNCol*sizeof(double));
3023}
3024
3025void utility_dglslv_destroy(void ** const phWork)
3026{
3028
3029 if(h!=NULL){
3030 free(h->IPIV);
3031 free(h->a);
3032 free(h->b);
3033
3034 free(h);
3035 h=NULL;
3036 *phWork = NULL;
3037 }
3038}
3039
3041(
3042 void* const hWork,
3043 const double* A,
3044 const int dim,
3045 double* B,
3046 int nCol,
3047 double* X
3048)
3049{
3051 veclib_int i, j, n, nrhs, lda, ldb, info;
3052
3053 n = dim;
3054 nrhs = nCol;
3055 lda = dim;
3056 ldb = dim;
3057
3058 /* Work struct */
3059 if(hWork==NULL)
3060 utility_dglslv_create((void**)&h, dim, nCol);
3061 else {
3062 h = (utility_dglslv_data*)(hWork);
3063#ifndef NDEBUG
3064 saf_assert(dim<=h->maxDim, "dim exceeds the maximum length specified");
3065 saf_assert(nCol<=h->maxNCol, "nCol exceeds the maximum length specified");
3066#endif
3067 }
3068
3069 /* store in column major order */
3070 for(i=0; i<dim; i++)
3071 for(j=0; j<dim; j++)
3072 h->a[j*dim+i] = A[i*dim+j];
3073 for(i=0; i<dim; i++)
3074 for(j=0; j<nCol; j++)
3075 h->b[j*dim+i] = B[i*nCol+j];
3076
3077 /* solve Ax = b for each column in b (b is replaced by the solution: x) */
3078#ifdef SAF_VECLIB_USE_CLAPACK_INTERFACE
3079 info = clapack_dgesv(CblasColMajor, n, nrhs, h->a, lda, h->IPIV, h->b, ldb);
3080#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3081 info = LAPACKE_dgesv_work(CblasColMajor, n, nrhs, h->a, lda, h->IPIV, h->b, ldb);
3082#elif defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3083 dgesv_( &n, &nrhs, h->a, &lda, h->IPIV, h->b, &ldb, &info );
3084#endif
3085
3086 /* A is singular, solution not possible */
3087 if(info!=0){
3088 memset(X, 0, dim*nCol*sizeof(double));
3089#ifndef NDEBUG
3090 /* Input matrix was singular, solution was unable to be computed, or the
3091 * input matrix contained illegal values so no solution was attempted.
3092 * In these cases the function will zero the output matrix. */
3093 saf_print_warning("Could not solve the linear equation in utility_dglslv(). Output matrices/vectors have been zeroed.");
3094#endif
3095 }
3096 /* store solution in row-major order */
3097 else{
3098 for(i=0; i<dim; i++)
3099 for(j=0; j<nCol; j++)
3100 X[i*nCol+j] = h->b[j*dim+i];
3101 }
3102
3103 if(hWork == NULL)
3104 utility_dglslv_destroy((void**)&h);
3105}
3106
3108typedef struct _utility_zglslv_data {
3109 int maxDim;
3110 int maxNCol;
3111 veclib_int* IPIV;
3112 double_complex* a;
3113 double_complex* b;
3115
3116void utility_zglslv_create(void ** const phWork, int maxDim, int maxNCol)
3117{
3118 *phWork = malloc1d(sizeof(utility_zglslv_data));
3120
3121 h->maxDim = maxDim;
3122 h->maxNCol = maxNCol;
3123 h->IPIV = malloc1d(maxDim*sizeof(veclib_int));
3124 h->a = malloc1d(maxDim*maxDim*sizeof(double_complex));
3125 h->b = malloc1d(maxDim*maxNCol*sizeof(double_complex));
3126}
3127
3128void utility_zglslv_destroy(void ** const phWork)
3129{
3131
3132 if(h!=NULL){
3133 free(h->IPIV);
3134 free(h->a);
3135 free(h->b);
3136
3137 free(h);
3138 h=NULL;
3139 *phWork = NULL;
3140 }
3141}
3142
3144(
3145 void* const hWork,
3146 const double_complex* A,
3147 const int dim,
3148 double_complex* B,
3149 int nCol,
3150 double_complex* X
3151)
3152{
3154 veclib_int i, j, n = dim, nrhs = nCol, lda = dim, ldb = dim, info;
3155
3156 n = dim;
3157 nrhs = nCol;
3158 lda = dim;
3159 ldb = dim;
3160
3161 /* Work struct */
3162 if(hWork==NULL)
3163 utility_zglslv_create((void**)&h, dim, nCol);
3164 else {
3165 h = (utility_zglslv_data*)(hWork);
3166#ifndef NDEBUG
3167 saf_assert(dim<=h->maxDim, "dim exceeds the maximum length specified");
3168 saf_assert(nCol<=h->maxNCol, "nCol exceeds the maximum length specified");
3169#endif
3170 }
3171
3172 /* store in column major order */
3173 for(i=0; i<dim; i++)
3174 for(j=0; j<dim; j++)
3175 h->a[j*dim+i] = A[i*dim+j];
3176 for(i=0; i<dim; i++)
3177 for(j=0; j<nCol; j++)
3178 h->b[j*dim+i] = B[i*nCol+j];
3179
3180 /* solve Ax = b for each column in b (b is replaced by the solution: x) */
3181#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3182 zgesv_( &n, &nrhs, (veclib_double_complex*)h->a, &lda, h->IPIV, (veclib_double_complex*)h->b, &ldb, &info );
3183#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3184 info = clapack_zgesv(CblasColMajor, n, nrhs, (veclib_double_complex*)h->a, lda, h->IPIV, (veclib_double_complex*)h->b, ldb);
3185#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3186 info = LAPACKE_zgesv_work(CblasColMajor, n, nrhs, (veclib_double_complex*)h->a, lda, h->IPIV, (veclib_double_complex*)h->b, ldb);
3187#endif
3188
3189 /* A is singular, solution not possible */
3190 if(info!=0){
3191 memset(X, 0, dim*nCol*sizeof(double_complex));
3192#ifndef NDEBUG
3193 /* Input matrix was singular, solution was unable to be computed, or the
3194 * input matrix contained illegal values so no solution was attempted.
3195 * In these cases the function will zero the output matrix. */
3196 saf_print_warning("Could not solve the linear equation in utility_zglslv(). Output matrices/vectors have been zeroed.");
3197#endif
3198 }
3199 /* store solution in row-major order */
3200 else{
3201 for(i=0; i<dim; i++)
3202 for(j=0; j<nCol; j++)
3203 X[i*nCol+j] = h->b[j*dim+i];
3204 }
3205
3206 if(hWork == NULL)
3207 utility_zglslv_destroy((void**)&h);
3208}
3209
3210
3211/* ========================================================================== */
3212/* General Linear Solver (?glslvt) */
3213/* ========================================================================== */
3214
3216typedef struct _utility_sglslvt_data {
3217 int maxDim;
3218 int maxNCol;
3219 veclib_int* IPIV;
3220 float* a;
3221 float* b;
3223
3224void utility_sglslvt_create(void ** const phWork, int maxDim, int maxNCol)
3225{
3226 *phWork = malloc1d(sizeof(utility_sglslvt_data));
3228
3229 h->maxDim = maxDim;
3230 h->maxNCol = maxNCol;
3231 h->IPIV = malloc1d(maxDim*sizeof(veclib_int));
3232 h->a = malloc1d(maxDim*maxDim*sizeof(float));
3233 h->b = malloc1d(maxDim*maxNCol*sizeof(float));
3234}
3235
3236void utility_sglslvt_destroy(void ** const phWork)
3237{
3239
3240 if(h!=NULL){
3241 free(h->IPIV);
3242 free(h->a);
3243 free(h->b);
3244
3245 free(h);
3246 h=NULL;
3247 *phWork = NULL;
3248 }
3249}
3250
3252(
3253 void* const hWork,
3254 const float* A,
3255 const int dim,
3256 float* B,
3257 int nCol,
3258 float* X
3259)
3260{
3262 veclib_int n = nCol, nrhs = dim, lda = nCol, ldb = nCol, info;
3263
3264 /* Work struct */
3265 if(hWork==NULL)
3266 utility_sglslvt_create((void**)&h, dim, nCol);
3267 else {
3268 h = (utility_sglslvt_data*)(hWork);
3269#ifndef NDEBUG
3270 saf_assert(dim<=h->maxDim, "dim exceeds the maximum length specified");
3271 saf_assert(nCol<=h->maxNCol, "nCol exceeds the maximum length specified");
3272#endif
3273 }
3274
3275 /* store locally */
3276 cblas_scopy(dim*dim, A, 1, h->a, 1);
3277 cblas_scopy(dim*nCol, B, 1, h->b, 1);
3278
3279 /* solve Ax = b for each column in b (b is replaced by the solution: x) */
3280#ifdef SAF_VECLIB_USE_CLAPACK_INTERFACE
3281 info = clapack_sgesv(CblasColMajor, n, nrhs, h->b, ldb, h->IPIV, h->a, lda);
3282#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3283 info = LAPACKE_sgesv_work(CblasColMajor, n, nrhs, h->b, ldb, h->IPIV, h->a, lda);
3284#elif defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3285 //sposv_("U", &n, &nrhs, h->b, &ldb, h->a, &lda, &info );
3286 sgesv_( &n, &nrhs, h->b, &ldb, h->IPIV, h->a, &lda, &info );
3287#endif
3288
3289 if(info!=0){
3290 /* A is singular, solution not possible */
3291 memset(X, 0, dim*nCol*sizeof(float));
3292#ifndef NDEBUG
3293 /* Input matrix was singular, solution was unable to be computed, or the
3294 * input matrix contained illegal values so no solution was attempted.
3295 * In these cases the function will zero the output matrix. */
3296 saf_print_warning("Could not solve the linear equation in utility_sglslvt(). Output matrices/vectors have been zeroed.");
3297#endif
3298 }
3299 else
3300 cblas_scopy(dim*nCol, h->a, 1, X, 1);
3301
3302 if(hWork == NULL)
3303 utility_sglslvt_destroy((void**)&h);
3304}
3305
3306
3307/* ========================================================================== */
3308/* Symmetric Linear Solver (?slslv) */
3309/* ========================================================================== */
3310
3312typedef struct _utility_sslslv_data {
3313 int maxDim;
3314 int maxNCol;
3315 float* a;
3316 float* b;
3318
3319void utility_sslslv_create(void ** const phWork, int maxDim, int maxNCol)
3320{
3321 *phWork = malloc1d(sizeof(utility_sslslv_data));
3323
3324 h->maxDim = maxDim;
3325 h->maxNCol = maxNCol;
3326 h->a = malloc1d(maxDim*maxDim*sizeof(float));
3327 h->b = malloc1d(maxDim*maxNCol*sizeof(float));
3328}
3329
3330void utility_sslslv_destroy(void ** const phWork)
3331{
3333
3334 if(h!=NULL){
3335 free(h->a);
3336 free(h->b);
3337
3338 free(h);
3339 h=NULL;
3340 *phWork = NULL;
3341 }
3342}
3343
3345(
3346 void* const hWork,
3347 const float* A,
3348 const int dim,
3349 float* B,
3350 int nCol,
3351 float* X
3352)
3353{
3355 veclib_int i, j, n, nrhs, lda, ldb, info;
3356
3357 n = dim;
3358 nrhs = nCol;
3359 lda = dim;
3360 ldb = dim;
3361
3362 /* Work struct */
3363 if(hWork==NULL)
3364 utility_sslslv_create((void**)&h, dim, nCol);
3365 else {
3366 h = (utility_sslslv_data*)(hWork);
3367#ifndef NDEBUG
3368 saf_assert(dim<=h->maxDim, "dim exceeds the maximum length specified");
3369 saf_assert(nCol<=h->maxNCol, "nCol exceeds the maximum length specified");
3370#endif
3371 }
3372
3373 /* store in column major order */
3374 for(i=0; i<dim; i++)
3375 for(j=0; j<dim; j++)
3376 h->a[j*dim+i] = A[i*dim+j];
3377 for(i=0; i<dim; i++)
3378 for(j=0; j<nCol; j++)
3379 h->b[j*dim+i] = B[i*nCol+j];
3380
3381 /* solve Ax = b for each column in b (b is replaced by the solution: x) */
3382#ifdef SAF_VECLIB_USE_CLAPACK_INTERFACE
3383 info = clapack_sposv(CblasColMajor, CblasUpper, n, nrhs, h->a, lda, h->b, ldb);
3384#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3385 info = LAPACKE_sposv_work(CblasColMajor, CblasUpper, n, nrhs, h->a, lda, h->b, ldb);
3386#elif defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3387 sposv_( "U", &n, &nrhs, h->a, &lda, h->b, &ldb, &info );
3388#endif
3389
3390 /* A is not symmetric positive definate, solution not possible */
3391 if(info!=0){
3392 memset(X, 0, dim*nCol*sizeof(float));
3393#ifndef NDEBUG
3394 /* Input matrix was singular, solution was unable to be computed, or the
3395 * input matrix contained illegal values so no solution was attempted.
3396 * In these cases the function will zero the output matrix. */
3397 saf_print_warning("Could not solve the linear equation in utility_sslslv(). Output matrices/vectors have been zeroed.");
3398#endif
3399 }
3400 /* store solution in row-major order */
3401 else{
3402 for(i=0; i<dim; i++)
3403 for(j=0; j<nCol; j++)
3404 X[i*nCol+j] = h->b[j*dim+i];
3405 }
3406
3407 if(hWork == NULL)
3408 utility_sslslv_destroy((void**)&h);
3409}
3410
3412typedef struct _utility_cslslv_data {
3413 int maxDim;
3414 int maxNCol;
3415 float_complex* a;
3416 float_complex* b;
3418
3419void utility_cslslv_create(void ** const phWork, int maxDim, int maxNCol)
3420{
3421 *phWork = malloc1d(sizeof(utility_cslslv_data));
3423
3424 h->maxDim = maxDim;
3425 h->maxNCol = maxNCol;
3426 h->a = malloc1d(maxDim*maxDim*sizeof(float_complex));
3427 h->b = malloc1d(maxDim*maxNCol*sizeof(float_complex));
3428}
3429
3430void utility_cslslv_destroy(void ** const phWork)
3431{
3433
3434 if(h!=NULL){
3435 free(h->a);
3436 free(h->b);
3437
3438 free(h);
3439 h=NULL;
3440 *phWork = NULL;
3441 }
3442}
3443
3445(
3446 void* const hWork,
3447 const float_complex* A,
3448 const int dim,
3449 float_complex* B,
3450 int nCol,
3451 float_complex* X
3452)
3453{
3455 veclib_int i, j, n, nrhs, lda, ldb, info;
3456
3457 n = dim;
3458 nrhs = nCol;
3459 lda = dim;
3460 ldb = dim;
3461
3462 /* Work struct */
3463 if(hWork==NULL)
3464 utility_cslslv_create((void**)&h, dim, nCol);
3465 else {
3466 h = (utility_cslslv_data*)(hWork);
3467#ifndef NDEBUG
3468 saf_assert(dim<=h->maxDim, "dim exceeds the maximum length specified");
3469 saf_assert(nCol<=h->maxNCol, "nCol exceeds the maximum length specified");
3470#endif
3471 }
3472
3473 /* store in column major order */
3474 for(i=0; i<dim; i++)
3475 for(j=0; j<dim; j++)
3476 h->a[j*dim+i] = A[i*dim+j];
3477 for(i=0; i<dim; i++)
3478 for(j=0; j<nCol; j++)
3479 h->b[j*dim+i] = B[i*nCol+j];
3480
3481 /* solve Ax = b for each column in b (b is replaced by the solution: x) */
3482#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3483 cposv_( "U", &n, &nrhs, (veclib_float_complex*)h->a, &lda, (veclib_float_complex*)h->b, &ldb, &info );
3484#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3485 info = clapack_cposv(CblasColMajor, CblasUpper, n, nrhs, (veclib_float_complex*)h->a, lda, (veclib_float_complex*)h->b, ldb);
3486#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3487 info = LAPACKE_cposv_work(CblasColMajor, CblasUpper, n, nrhs, (veclib_float_complex*)h->a, lda, (veclib_float_complex*)h->b, ldb);
3488#endif
3489
3490 /* A is not symmetric positive definate, solution not possible */
3491 if(info!=0){
3492 memset(X, 0, dim*nCol*sizeof(float_complex));
3493#ifndef NDEBUG
3494 /* Input matrix was singular, solution was unable to be computed, or the
3495 * input matrix contained illegal values so no solution was attempted.
3496 * In these cases the function will zero the output matrix. */
3497 saf_print_warning("Could not solve the linear equation in utility_cslslv(). Output matrices/vectors have been zeroed.");
3498#endif
3499 }
3500 /* store solution in row-major order */
3501 else{
3502 for(i=0; i<dim; i++)
3503 for(j=0; j<nCol; j++)
3504 X[i*nCol+j] = h->b[j*dim+i];
3505 }
3506
3507 if(hWork == NULL)
3508 utility_cslslv_destroy((void**)&h);
3509}
3510
3511
3512/* ========================================================================== */
3513/* Matrix Pseudo-Inverse (?pinv) */
3514/* ========================================================================== */
3515
3517typedef struct _utility_spinv_data {
3518 int maxDim1, maxDim2;
3519 veclib_int currentWorkSize;
3520 float* a, *s, *u, *vt, *inva;
3521 float* work;
3523
3524void utility_spinv_create(void ** const phWork, int maxDim1, int maxDim2)
3525{
3526 *phWork = malloc1d(sizeof(utility_spinv_data));
3527 utility_spinv_data *h = (utility_spinv_data*)(*phWork);
3528
3529 h->maxDim1 = maxDim1;
3530 h->maxDim2 = maxDim2;
3531 h->currentWorkSize = 0;
3532 h->a = malloc1d(maxDim1*maxDim2*sizeof(float));
3533 h->s = malloc1d(SAF_MIN(maxDim2,maxDim1)*sizeof(float));
3534 h->u = malloc1d(maxDim1*maxDim1*sizeof(float));
3535 h->vt = malloc1d(maxDim2*maxDim2*sizeof(float));
3536 h->inva = malloc1d(maxDim1*maxDim2*sizeof(float));
3537 h->work = NULL;
3538}
3539
3540void utility_spinv_destroy(void ** const phWork)
3541{
3542 utility_spinv_data *h = (utility_spinv_data*)(*phWork);
3543
3544 if(h!=NULL){
3545 free(h->a);
3546 free(h->s);
3547 free(h->u);
3548 free(h->vt);
3549 free(h->inva);
3550 free(h->work);
3551
3552 free(h);
3553 h=NULL;
3554 *phWork = NULL;
3555 }
3556}
3557
3559(
3560 void* const hWork,
3561 const float* inM,
3562 const int dim1,
3563 const int dim2,
3564 float* outM
3565)
3566{
3568 veclib_int i, j, m, n, k, lda, ldu, ldvt, ld_inva, info;
3569 float ss;
3570 veclib_int lwork;
3571 float wkopt;
3572
3573 m = lda = ldu = dim1;
3574 n = ldvt = dim2;
3575 k = m < n ? m : n;
3576
3577 /* Work struct */
3578 if(hWork==NULL)
3579 utility_spinv_create((void**)&h, dim1, dim2);
3580 else{
3581 h = (utility_spinv_data*)(hWork);
3582#ifndef NDEBUG
3583 saf_assert(dim1<=h->maxDim1, "dim1 exceeds the maximum length specified");
3584 saf_assert(dim2<=h->maxDim2, "dim2 exceeds the maximum length specified");
3585#endif
3586 }
3587
3588 /* store in column major order */
3589 for(i=0; i<m; i++)
3590 for(j=0; j<n; j++)
3591 h->a[j*m+i] = inM[i*n+j];
3592
3593 /* Query how much "work" memory is required */
3594 lwork = -1;
3595#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3596 sgesvd_("S", "S", &m, &n, h->a, &lda, h->s, h->u, &ldu, h->vt, &ldvt, &wkopt, &lwork, &info);
3597#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3598 saf_print_error("No such implementation available in ATLAS CLAPACK");
3599#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3600 info = LAPACKE_sgesvd_work(CblasColMajor, 'S', 'S', m, n, h->a, lda, h->s, h->u, ldu, h->vt, ldvt, &wkopt, lwork);
3601#endif
3602 lwork = (veclib_int)wkopt;
3603 if(lwork>h->currentWorkSize){
3604 h->currentWorkSize = lwork;
3605 h->work = realloc1d(h->work, h->currentWorkSize*sizeof(float));
3606 }
3607
3608 /* Perform the singular value decomposition */
3609#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3610 sgesvd_("S", "S", &m, &n, h->a, &lda, h->s, h->u, &ldu, h->vt, &ldvt, h->work, &lwork, &info );
3611#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3612 saf_print_error("No such implementation available in ATLAS CLAPACK");
3613#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3614 info = LAPACKE_sgesvd_work(CblasColMajor, 'S', 'S', m, n, h->a, lda, h->s, h->u, ldu, h->vt, ldvt, h->work, lwork);
3615#endif
3616
3617 if( info != 0 ) {
3618 memset(outM, 0, dim1*dim2*sizeof(float));
3619#ifndef NDEBUG
3620 /* The SVD failed to converge, or the input matrix contained illegal
3621 * values so no solution was attempted. In these cases this function
3622 * will zero all output matrices and/or vectors. */
3623 saf_print_warning("Could not compute SVD in utility_spinv(). Output matrices/vectors have been zeroed.");
3624#endif
3625 }
3626 else{
3627 for(i=0; i<k; i++){
3628 if(h->s[i] > 1.0e-5f)
3629 ss=1.0f/h->s[i];
3630 else
3631 ss=h->s[i];
3632 cblas_sscal(m, ss, &h->u[i*m], 1);
3633 }
3634 ld_inva=n;
3635 cblas_sgemm(CblasColMajor, CblasTrans, CblasTrans, n, m, k, 1.0f,
3636 h->vt, ldvt,
3637 h->u, ldu, 0.0f,
3638 h->inva, ld_inva);
3639
3640 /* return in row-major order */
3641 for(i=0; i<m; i++)
3642 for(j=0; j<n; j++)
3643 outM[j*m+i] = h->inva[i*n+j];
3644 }
3645
3646 /* clean-up */
3647 if(hWork == NULL)
3648 utility_spinv_destroy((void**)&h);
3649}
3650
3652typedef struct _utility_cpinv_data {
3653 int maxDim1, maxDim2;
3654 veclib_int currentWorkSize;
3655 float_complex* a, *u, *vt, *inva;
3656 float* s, *rwork;
3657 float_complex* work;
3659
3660void utility_cpinv_create(void ** const phWork, int maxDim1, int maxDim2)
3661{
3662 *phWork = malloc1d(sizeof(utility_cpinv_data));
3663 utility_cpinv_data *h = (utility_cpinv_data*)(*phWork);
3664
3665 h->maxDim1 = maxDim1;
3666 h->maxDim2 = maxDim2;
3667 h->currentWorkSize = 0;
3668 h->a = malloc1d(maxDim1*maxDim2*sizeof(float_complex));
3669 h->s = malloc1d(SAF_MIN(maxDim2,maxDim1)*sizeof(float));
3670 h->u = malloc1d(maxDim1*maxDim1*sizeof(float_complex));
3671 h->vt = malloc1d(maxDim2*maxDim2*sizeof(float_complex));
3672 h->inva = malloc1d(maxDim1*maxDim2*sizeof(float_complex));
3673 h->rwork = malloc1d(maxDim1*SAF_MAX(1, 5*SAF_MIN(maxDim2,maxDim1))*sizeof(float));
3674 h->work = NULL;
3675}
3676
3677void utility_cpinv_destroy(void ** const phWork)
3678{
3679 utility_cpinv_data *h = (utility_cpinv_data*)(*phWork);
3680
3681 if(h!=NULL){
3682 free(h->a);
3683 free(h->s);
3684 free(h->u);
3685 free(h->vt);
3686 free(h->inva);
3687 free(h->work);
3688
3689 free(h);
3690 h=NULL;
3691 *phWork = NULL;
3692 }
3693}
3694
3696(
3697 void* const hWork,
3698 const float_complex* inM,
3699 const int dim1,
3700 const int dim2,
3701 float_complex* outM
3702)
3703{
3705 veclib_int i, j, m, n, k, lda, ldu, ldvt, ld_inva, info;
3706 float_complex ss_cmplx;
3707 const float_complex calpha = cmplxf(1.0f, 0.0f); const float_complex cbeta = cmplxf(0.0f, 0.0f); /* blas */
3708 float ss;
3709 veclib_int lwork;
3710 float_complex wkopt;
3711
3712 m = lda = ldu = dim1;
3713 n = ldvt = dim2;
3714 k = m < n ? m : n;
3715
3716 /* Work struct */
3717 if(hWork==NULL)
3718 utility_cpinv_create((void**)&h, dim1, dim2);
3719 else{
3720 h = (utility_cpinv_data*)(hWork);
3721#ifndef NDEBUG
3722 saf_assert(dim1<=h->maxDim1, "dim1 exceeds the maximum length specified");
3723 saf_assert(dim2<=h->maxDim2, "dim2 exceeds the maximum length specified");
3724#endif
3725 }
3726
3727 /* store in column major order */
3728 for(i=0; i<dim1; i++)
3729 for(j=0; j<dim2; j++)
3730 h->a[j*dim1+i] = inM[i*dim2 +j];
3731
3732 /* Query how much "work" memory is required */
3733 lwork = -1;
3734#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3735 cgesvd_( "A", "A", &m, &n, (veclib_float_complex*)h->a, &lda, h->s, (veclib_float_complex*)h->u, &ldu, (veclib_float_complex*)h->vt, &ldvt,
3736 (veclib_float_complex*)&wkopt, &lwork, h->rwork, &info );
3737#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3738 saf_print_error("No such implementation available in ATLAS CLAPACK");
3739#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3740 info = LAPACKE_cgesvd_work(CblasColMajor, 'S', 'S', m, n, (veclib_float_complex*)h->a, lda, h->s, (veclib_float_complex*)h->u, ldu, (veclib_float_complex*)h->vt, ldvt,
3741 (veclib_float_complex*)&wkopt, lwork, h->rwork);
3742#endif
3743 lwork = (veclib_int)(crealf(wkopt)+0.01f);
3744 if(lwork>h->currentWorkSize){
3745 h->currentWorkSize = lwork;
3746 h->work = realloc1d(h->work, h->currentWorkSize*sizeof(float_complex));
3747 }
3748
3749 /* Perform the singular value decomposition */
3750#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3751 cgesvd_( "A", "A", &m, &n, (veclib_float_complex*)h->a, &lda, h->s, (veclib_float_complex*)h->u, &ldu, (veclib_float_complex*)h->vt, &ldvt,
3752 (veclib_float_complex*)h->work, &lwork, h->rwork, &info);
3753#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3754 saf_print_error("No such implementation available in ATLAS CLAPACK");
3755#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3756 info = LAPACKE_cgesvd_work(CblasColMajor, 'S', 'S', m, n, (veclib_float_complex*)h->a, lda, h->s, (veclib_float_complex*)h->u, ldu, (veclib_float_complex*)h->vt, ldvt,
3757 (veclib_float_complex*)h->work, lwork, h->rwork);
3758#endif
3759
3760 if( info != 0 ) {
3761 memset(outM, 0, dim1*dim2*sizeof(float_complex));
3762#ifndef NDEBUG
3763 /* The SVD failed to converge, or the input matrix contained illegal
3764 * values so no solution was attempted. In these cases this function
3765 * will zero all output matrices and/or vectors. */
3766 saf_print_warning("Could not compute SVD in utility_cpinv(). Output matrices/vectors have been zeroed.");
3767#endif
3768 }
3769 else{
3770 for(i=0; i<k; i++){
3771 if(h->s[i] > 1.0e-5f)
3772 ss=1.0f/h->s[i];
3773 else
3774 ss=h->s[i];
3775 ss_cmplx = cmplxf(ss, 0.0f);
3776 cblas_cscal(m, &ss_cmplx, &h->u[i*m], 1);
3777 }
3778 ld_inva=n;
3779 cblas_cgemm(CblasColMajor, CblasConjTrans, CblasConjTrans, n, m, k, &calpha,
3780 h->vt, ldvt,
3781 h->u, ldu, &cbeta,
3782 h->inva, ld_inva);
3783
3784 /* return in row-major order */
3785 for(i=0; i<m; i++)
3786 for(j=0; j<n; j++)
3787 outM[j*m+i] = h->inva[i*n+j];
3788 }
3789
3790 /* clean-up */
3791 if(hWork == NULL)
3792 utility_cpinv_destroy((void**)&h);
3793}
3794
3796typedef struct _utility_dpinv_data {
3797 int maxDim1, maxDim2;
3798 veclib_int currentWorkSize;
3799 double* a, *s, *u, *vt, *inva;
3800 double* work;
3802
3803void utility_dpinv_create(void ** const phWork, int maxDim1, int maxDim2)
3804{
3805 *phWork = malloc1d(sizeof(utility_dpinv_data));
3806 utility_dpinv_data *h = (utility_dpinv_data*)(*phWork);
3807
3808 h->maxDim1 = maxDim1;
3809 h->maxDim2 = maxDim2;
3810 h->currentWorkSize = 0;
3811 h->a = malloc1d(maxDim1*maxDim2*sizeof(double));
3812 h->s = malloc1d(SAF_MIN(maxDim2,maxDim1)*sizeof(double));
3813 h->u = malloc1d(maxDim1*maxDim1*sizeof(double));
3814 h->vt = malloc1d(maxDim2*maxDim2*sizeof(double));
3815 h->inva = malloc1d(maxDim1*maxDim2*sizeof(double));
3816 h->work = NULL;
3817}
3818
3819void utility_dpinv_destroy(void ** const phWork)
3820{
3821 utility_dpinv_data *h = (utility_dpinv_data*)(*phWork);
3822
3823 if(h!=NULL){
3824 free(h->a);
3825 free(h->s);
3826 free(h->u);
3827 free(h->vt);
3828 free(h->inva);
3829 free(h->work);
3830
3831 free(h);
3832 h=NULL;
3833 *phWork = NULL;
3834 }
3835}
3836
3838(
3839 void* const hWork,
3840 const double* inM,
3841 const int dim1,
3842 const int dim2,
3843 double* outM
3844)
3845{
3847 veclib_int i, j, m, n, k, lda, ldu, ldvt, ld_inva, info;
3848 double ss;
3849 veclib_int lwork;
3850 double wkopt;
3851
3852 m = lda = ldu = dim1;
3853 n = ldvt = dim2;
3854 k = m < n ? m : n;
3855
3856 /* Work struct */
3857 if(hWork==NULL)
3858 utility_dpinv_create((void**)&h, dim1, dim2);
3859 else{
3860 h = (utility_dpinv_data*)(hWork);
3861#ifndef NDEBUG
3862 saf_assert(dim1<=h->maxDim1, "dim1 exceeds the maximum length specified");
3863 saf_assert(dim2<=h->maxDim2, "dim2 exceeds the maximum length specified");
3864#endif
3865 }
3866
3867 /* store in column major order */
3868 for(i=0; i<m; i++)
3869 for(j=0; j<n; j++)
3870 h->a[j*m+i] = inM[i*n+j];
3871
3872 /* Query how much "work" memory is required */
3873 lwork = -1;
3874#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3875 dgesvd_("S", "S", &m, &n, h->a, &lda, h->s, h->u, &ldu, h->vt, &ldvt, &wkopt, &lwork, &info);
3876#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3877 saf_print_error("No such implementation available in ATLAS CLAPACK");
3878#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3879 info = LAPACKE_dgesvd_work(CblasColMajor, 'S', 'S', m, n, h->a, lda, h->s, h->u, ldu, h->vt, ldvt, &wkopt, lwork);
3880#endif
3881 lwork = (veclib_int)wkopt;
3882 if(lwork>h->currentWorkSize){
3883 h->currentWorkSize = lwork;
3884 h->work = realloc1d(h->work, h->currentWorkSize*sizeof(double));
3885 }
3886
3887 /* Perform the singular value decomposition */
3888#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3889 dgesvd_("S", "S", &m, &n, h->a, &lda, h->s, h->u, &ldu, h->vt, &ldvt, h->work, &lwork, &info );
3890#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3891 saf_print_error("No such implementation available in ATLAS CLAPACK");
3892#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3893 info = LAPACKE_dgesvd_work(CblasColMajor, 'S', 'S', m, n, h->a, lda, h->s, h->u, ldu, h->vt, ldvt, h->work, lwork);
3894#endif
3895
3896 if( info != 0 ) {
3897 memset(outM, 0, dim1*dim2*sizeof(double));
3898#ifndef NDEBUG
3899 /* The SVD failed to converge, or the input matrix contained illegal
3900 * values so no solution was attempted. In these cases this function
3901 * will zero all output matrices and/or vectors. */
3902 saf_print_warning("Could not compute SVD in utility_dpinv(). Output matrices/vectors have been zeroed.");
3903#endif
3904 }
3905 else{
3906 for(i=0; i<k; i++){
3907 if(h->s[i] > 1.0e-9)
3908 ss=1.0/h->s[i];
3909 else
3910 ss=h->s[i];
3911 cblas_dscal(m, ss, &h->u[i*m], 1);
3912 }
3913 ld_inva=n;
3914 cblas_dgemm(CblasColMajor, CblasTrans, CblasTrans, n, m, k, 1.0,
3915 h->vt, ldvt,
3916 h->u, ldu, 0.0,
3917 h->inva, ld_inva);
3918
3919 /* return in row-major order */
3920 for(i=0; i<m; i++)
3921 for(j=0; j<n; j++)
3922 outM[j*m+i] = h->inva[i*n+j];
3923 }
3924
3925 /* clean-up */
3926 if(hWork == NULL)
3927 utility_dpinv_destroy((void**)&h);
3928}
3929
3931typedef struct _utility_zpinv_data {
3932 int maxDim1, maxDim2;
3933 veclib_int currentWorkSize;
3934 double_complex* a, *u, *vt, *inva;
3935 double* s, *rwork;
3936 double_complex* work;
3938
3939void utility_zpinv_create(void ** const phWork, int maxDim1, int maxDim2)
3940{
3941 *phWork = malloc1d(sizeof(utility_zpinv_data));
3942 utility_zpinv_data *h = (utility_zpinv_data*)(*phWork);
3943
3944 h->maxDim1 = maxDim1;
3945 h->maxDim2 = maxDim2;
3946 h->currentWorkSize = 0;
3947 h->a = malloc1d(maxDim1*maxDim2*sizeof(double_complex));
3948 h->s = malloc1d(SAF_MIN(maxDim2,maxDim1)*sizeof(double));
3949 h->u = malloc1d(maxDim1*maxDim1*sizeof(double_complex));
3950 h->vt = malloc1d(maxDim2*maxDim2*sizeof(double_complex));
3951 h->inva = malloc1d(maxDim1*maxDim2*sizeof(double_complex));
3952 h->rwork = malloc1d(maxDim1*SAF_MAX(1, 5*SAF_MIN(maxDim2,maxDim1))*sizeof(double));
3953 h->work = NULL;
3954}
3955
3956void utility_zpinv_destroy(void ** const phWork)
3957{
3958 utility_zpinv_data *h = (utility_zpinv_data*)(*phWork);
3959
3960 if(h!=NULL){
3961 free(h->a);
3962 free(h->s);
3963 free(h->u);
3964 free(h->vt);
3965 free(h->inva);
3966 free(h->work);
3967
3968 free(h);
3969 h=NULL;
3970 *phWork = NULL;
3971 }
3972}
3973
3975(
3976 void* const hWork,
3977 const double_complex* inM,
3978 const int dim1,
3979 const int dim2,
3980 double_complex* outM
3981)
3982{
3984 veclib_int i, j, m, n, k, lda, ldu, ldvt, info, ld_inva;
3985 double_complex ss_cmplx;
3986 const double_complex calpha = cmplx(1.0, 0.0); const double_complex cbeta = cmplx(0.0, 0.0); /* blas */
3987 double ss;
3988 veclib_int lwork;
3989 double_complex wkopt;
3990
3991 m = lda = ldu = dim1;
3992 n = ldvt = dim2;
3993 k = m < n ? m : n;
3994
3995 /* Work struct */
3996 if(hWork==NULL)
3997 utility_zpinv_create((void**)&h, dim1, dim2);
3998 else{
3999 h = (utility_zpinv_data*)(hWork);
4000#ifndef NDEBUG
4001 saf_assert(dim1<=h->maxDim1, "dim1 exceeds the maximum length specified");
4002 saf_assert(dim2<=h->maxDim2, "dim2 exceeds the maximum length specified");
4003#endif
4004 }
4005
4006 /* store in column major order */
4007 for(i=0; i<dim1; i++)
4008 for(j=0; j<dim2; j++)
4009 h->a[j*dim1+i] = inM[i*dim2 +j];
4010
4011 /* Query how much "work" memory is required */
4012 lwork = -1;
4013#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4014 zgesvd_( "A", "A", &m, &n, (veclib_double_complex*)h->a, &lda, h->s, (veclib_double_complex*)h->u, &ldu, (veclib_double_complex*)h->vt, &ldvt,
4015 (veclib_double_complex*)&wkopt, &lwork, h->rwork, &info );
4016#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4017 saf_print_error("No such implementation available in ATLAS CLAPACK");
4018#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4019 info = LAPACKE_zgesvd_work(CblasColMajor, 'A', 'A', m, n, (veclib_double_complex*)h->a, lda, h->s, (veclib_double_complex*)h->u, ldu, (veclib_double_complex*)h->vt, ldvt,
4020 (veclib_double_complex*)&wkopt, lwork, h->rwork);
4021#endif
4022 lwork = (veclib_int)(creal(wkopt)+0.01);
4023 if(lwork>h->currentWorkSize){
4024 h->currentWorkSize = lwork;
4025 h->work = realloc1d(h->work, h->currentWorkSize*sizeof(double_complex));
4026 }
4027
4028 /* Perform the singular value decomposition */
4029#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4030 zgesvd_( "A", "A", &m, &n, (veclib_double_complex*)h->a, &lda, h->s, (veclib_double_complex*)h->u, &ldu, (veclib_double_complex*)h->vt, &ldvt,
4031 (veclib_double_complex*)h->work, &lwork, h->rwork, &info);
4032#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4033 saf_print_error("No such implementation available in ATLAS CLAPACK");
4034#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4035 info = LAPACKE_zgesvd_work(CblasColMajor, 'A', 'A', m, n, (veclib_double_complex*)h->a, lda, h->s, (veclib_double_complex*)h->u, ldu, (veclib_double_complex*)h->vt, ldvt,
4036 (veclib_double_complex*)h->work, lwork, h->rwork);
4037#endif
4038
4039 if( info != 0 ) {
4040 memset(outM, 0, dim1*dim2*sizeof(double_complex));
4041#ifndef NDEBUG
4042 /* The SVD failed to converge, or the input matrix contained illegal
4043 * values so no solution was attempted. In these cases this function
4044 * will zero all output matrices and/or vectors. */
4045 saf_print_warning("Could not compute SVD in utility_zpinv(). Output matrices/vectors have been zeroed.");
4046#endif
4047 }
4048 else{
4049 for(i=0; i<k; i++){
4050 if(h->s[i] > 1.0e-5)
4051 ss=1.0/h->s[i];
4052 else
4053 ss=h->s[i];
4054 ss_cmplx = cmplx(ss, 0.0);
4055 cblas_zscal(m, &ss_cmplx, &h->u[i*m], 1);
4056 }
4057 ld_inva=n;
4058 cblas_zgemm(CblasColMajor, CblasConjTrans, CblasConjTrans, n, m, k, &calpha,
4059 h->vt, ldvt,
4060 h->u, ldu, &cbeta,
4061 h->inva, ld_inva);
4062
4063 /* return in row-major order */
4064 for(i=0; i<m; i++)
4065 for(j=0; j<n; j++)
4066 outM[j*m+i] = h->inva[i*n+j];
4067 }
4068
4069 /* clean-up */
4070 if(hWork == NULL)
4071 utility_zpinv_destroy((void**)&h);
4072}
4073
4074
4075/* ========================================================================== */
4076/* Cholesky Factorisation (?chol) */
4077/* ========================================================================== */
4078
4080typedef struct _utility_schol_data {
4081 int maxDim;
4082 float* a;
4084
4085void utility_schol_create(void ** const phWork, int maxDim)
4086{
4087 *phWork = malloc1d(sizeof(utility_schol_data));
4088 utility_schol_data *h = (utility_schol_data*)(*phWork);
4089
4090 h->maxDim = maxDim;
4091 h->a = malloc1d(maxDim*maxDim*sizeof(float));
4092}
4093
4094void utility_schol_destroy(void ** const phWork)
4095{
4096 utility_schol_data *h = (utility_schol_data*)(*phWork);
4097
4098 if(h!=NULL){
4099 free(h->a);
4100
4101 free(h);
4102 h=NULL;
4103 *phWork = NULL;
4104 }
4105}
4106
4108(
4109 void* const hWork,
4110 const float* A,
4111 const int dim,
4112 float* X
4113)
4114{
4116 veclib_int i, j, info, n, lda;
4117
4118 n = lda = dim;
4119
4120 /* Work struct */
4121 if(hWork==NULL)
4122 utility_schol_create((void**)&h, dim);
4123 else{
4124 h = (utility_schol_data*)(hWork);
4125#ifndef NDEBUG
4126 saf_assert(dim<=h->maxDim, "dim exceeds the maximum length specified");
4127#endif
4128 }
4129
4130 /* store in column major order */
4131 for(i=0; i<dim; i++)
4132 for(j=0; j<dim; j++)
4133 h->a[j*dim+i] = A[i*dim+j];
4134
4135 /* a is replaced by solution */
4136#ifdef SAF_VECLIB_USE_CLAPACK_INTERFACE
4137 info = clapack_spotrf(CblasColMajor, CblasUpper, n, h->a, lda);
4138#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4139 info = LAPACKE_spotrf_work(CblasColMajor, CblasUpper, n, h->a, lda);
4140#elif defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4141 spotrf_( "U", &n, h->a, &lda, &info );
4142#endif
4143
4144 /* A is not positive definate, solution not possible */
4145 if(info!=0){
4146 memset(X, 0, dim*dim*sizeof(float));
4147#ifndef NDEBUG
4148 /* Input matrix is not positive definite so the Cholesky factorisation
4149 * could not be computed, or the input matrix contained illegal values
4150 * so no solution was attempted. In these cases the function will zero
4151 * the output matrix. */
4152 saf_print_warning("Could not compute the Cholesky factorisation in utility_schol(). Output matrices/vectors have been zeroed.");
4153#endif
4154 }
4155 /* store solution in row-major order */
4156 else{
4157 for(i=0; i<dim; i++)
4158 for(j=0; j<dim; j++)
4159 X[i*dim+j] = j>=i ? h->a[j*dim+i] : 0.0f;
4160 }
4161
4162 /* clean-up */
4163 if(hWork == NULL)
4164 utility_schol_destroy((void**)&h);
4165}
4166
4168typedef struct _utility_cchol_data {
4169 int maxDim;
4170 float_complex* a;
4172
4173void utility_cchol_create(void ** const phWork, int maxDim)
4174{
4175 *phWork = malloc1d(sizeof(utility_cchol_data));
4176 utility_cchol_data *h = (utility_cchol_data*)(*phWork);
4177
4178 h->maxDim = maxDim;
4179 h->a = malloc1d(maxDim*maxDim*sizeof(float_complex));
4180}
4181
4182void utility_cchol_destroy(void ** const phWork)
4183{
4184 utility_cchol_data *h = (utility_cchol_data*)(*phWork);
4185
4186 if(h!=NULL){
4187 free(h->a);
4188
4189 free(h);
4190 h=NULL;
4191 *phWork = NULL;
4192 }
4193}
4194
4196(
4197 void* const hWork,
4198 const float_complex* A,
4199 const int dim,
4200 float_complex* X
4201)
4202{
4204 veclib_int i, j, info, n, lda;
4205
4206 n = lda = dim;
4207
4208 /* Work struct */
4209 if(hWork==NULL)
4210 utility_cchol_create((void**)&h, dim);
4211 else{
4212 h = (utility_cchol_data*)(hWork);
4213#ifndef NDEBUG
4214 saf_assert(dim<=h->maxDim, "dim exceeds the maximum length specified");
4215#endif
4216 }
4217
4218 /* store in column major order */
4219 for(i=0; i<dim; i++)
4220 for(j=0; j<dim; j++)
4221 h->a[j*dim+i] = A[i*dim+j];
4222
4223 /* a is replaced by solution */
4224#if defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4225 info = clapack_cpotrf(CblasColMajor, CblasUpper, n, (veclib_float_complex*)h->a, lda);
4226#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4227 info = LAPACKE_cpotrf_work(CblasColMajor, CblasUpper, n, (veclib_float_complex*)h->a, lda);
4228#elif defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4229 cpotrf_( "U", &n, (veclib_float_complex*)h->a, &lda, &info );
4230#endif
4231
4232 /* A is not positive definate, solution not possible */
4233 if(info!=0){
4234 memset(X, 0, dim*dim*sizeof(float_complex));
4235#ifndef NDEBUG
4236 /* Input matrix is not positive definite so the Cholesky factorisation
4237 * could not be computed, or the input matrix contained illegal values
4238 * so no solution was attempted. In these cases the function will zero
4239 * the output matrix. */
4240 saf_print_warning("Could not compute the Cholesky factorisation in utility_cchol(). Output matrices/vectors have been zeroed.");
4241#endif
4242 }
4243 /* store solution in row-major order */
4244 else{
4245 for(i=0; i<dim; i++)
4246 for(j=0; j<dim; j++)
4247 X[i*dim+j] = j>=i ? h->a[j*dim+i] : cmplxf(0.0f, 0.0f);
4248 }
4249
4250 /* clean-up */
4251 if(hWork == NULL)
4252 utility_cchol_destroy((void**)&h);
4253}
4254
4255/* ========================================================================== */
4256/* Determinant of a Matrix (?det) */
4257/* ========================================================================== */
4258
4260typedef struct _utility_sdet_data {
4261 veclib_int maxN;
4262 veclib_int *IPIV;
4263 float *tmp;
4265
4266void utility_sdet_create(void ** const phWork, int maxN)
4267{
4268 *phWork = malloc1d(sizeof(utility_sdet_data));
4269 utility_sdet_data *h = (utility_sdet_data*)(*phWork);
4270
4271 h->maxN = maxN;
4272 h->IPIV = malloc1d(maxN*sizeof(veclib_int));
4273 h->tmp = malloc1d(maxN*maxN*sizeof(float));
4274}
4275
4276void utility_sdet_destroy(void ** const phWork)
4277{
4278 utility_sdet_data *h = (utility_sdet_data*)(*phWork);
4279
4280 if(h!=NULL){
4281 free(h->IPIV);
4282 free(h->tmp);
4283
4284 free(h);
4285 h=NULL;
4286 *phWork = NULL;
4287 }
4288}
4289
4291(
4292 void* const hWork,
4293 float* A,
4294 int N
4295)
4296{
4298 veclib_int i, j, INFO;
4299 float det;
4300
4301 /* Simple cases: */
4302 if(N==2){
4303 return A[0]*A[3] - A[2]*A[1];
4304 }
4305 else if(N==3){
4306 return A[0] * ((A[4]*A[8]) - (A[7]*A[5])) -A[1] * (A[3]
4307 * A[8] - A[6] * A[5]) + A[2] * (A[3] * A[7] - A[6] * A[4]);
4308 }
4309 else if (N==4){
4310 return
4311 A[3] * A[6] * A[9] * A[12] - A[2] * A[7] * A[9] * A[12] -
4312 A[3] * A[5] * A[10] * A[12] + A[1] * A[7] * A[10] * A[12] +
4313 A[2] * A[5] * A[11] * A[12] - A[1] * A[6] * A[11] * A[12] -
4314 A[3] * A[6] * A[8] * A[13] + A[2] * A[7] * A[8] * A[13] +
4315 A[3] * A[4] * A[10] * A[13] - A[0] * A[7] * A[10] * A[13] -
4316 A[2] * A[4] * A[11] * A[13] + A[0] * A[6] * A[11] * A[13] +
4317 A[3] * A[5] * A[8] * A[14] - A[1] * A[7] * A[8] * A[14] -
4318 A[3] * A[4] * A[9] * A[14] + A[0] * A[7] * A[9] * A[14] +
4319 A[1] * A[4] * A[11] * A[14] - A[0] * A[5] * A[11] * A[14] -
4320 A[2] * A[5] * A[8] * A[15] + A[1] * A[6] * A[8] * A[15] +
4321 A[2] * A[4] * A[9] * A[15] - A[0] * A[6] * A[9] * A[15] -
4322 A[1] * A[4] * A[10] * A[15] + A[0] * A[5] * A[10] * A[15];
4323 }
4324
4325 /* Otherwise, the arbitrary NxN solution: */
4326 /* Work struct */
4327 if(hWork==NULL)
4328 utility_sdet_create((void**)&h, N);
4329 else{
4330 h = (utility_sdet_data*)(hWork);
4331#ifndef NDEBUG
4332 saf_assert(N<=h->maxN, "N exceeds the maximum length specified");
4333#endif
4334 }
4335
4336 /* Store in column major order */
4337 for(i=0; i<N; i++)
4338 for(j=0; j<N; j++)
4339 h->tmp[j*N+i] = A[i*N+j];
4340
4341#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4342 sgetrf_((veclib_int*)&N, (veclib_int*)&N, h->tmp, (veclib_int*)&N, h->IPIV, &INFO);
4343#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4344 INFO = clapack_sgetrf(CblasColMajor, N, N, h->tmp, N, h->IPIV);
4345#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4346 INFO = LAPACKE_sgetrf_work(CblasColMajor, N, N, h->tmp, N, h->IPIV);
4347#endif
4348
4349 if(INFO!=0) {
4350 det=0.0f;
4351#ifndef NDEBUG
4352 /* Input matrix was singular or contained illegal values so no solution
4353 * was attempted. In these cases the determinant is returned as 0. */
4354 saf_print_warning("Unable to compute determinant of input matrix. The function utility_sdet() returned 0. ");
4355#endif
4356 }
4357 else {
4358 det = 1.0f;
4359 for( i = 0; i < N; i++ ) {
4360 det*=h->tmp[i*N+i];
4361 if(h->IPIV[i] != i+1)
4362 det *= -1.0f;
4363 }
4364 }
4365
4366 /* clean-up */
4367 if(hWork == NULL)
4368 utility_sdet_destroy((void**)&h);
4369
4370 return det;
4371}
4372
4374typedef struct _utility_ddet_data {
4375 int currentWorkSize;
4376 veclib_int maxN;
4377 veclib_int *IPIV;
4378 double *tmp, *TAU, *WORK;
4380
4381void utility_ddet_create(void ** const phWork, int maxN)
4382{
4383 *phWork = malloc1d(sizeof(utility_ddet_data));
4384 utility_ddet_data *h = (utility_ddet_data*)(*phWork);
4385
4386 h->maxN = maxN;
4387 h->currentWorkSize = 0;
4388 h->IPIV = malloc1d(maxN*sizeof(veclib_int));
4389 h->tmp = malloc1d(maxN*maxN*sizeof(double));
4390 h->TAU = malloc1d(maxN*sizeof(double));
4391 h->WORK = NULL;
4392}
4393
4394void utility_ddet_destroy(void ** const phWork)
4395{
4396 utility_ddet_data *h = (utility_ddet_data*)(*phWork);
4397
4398 if(h!=NULL){
4399 free(h->IPIV);
4400 free(h->tmp);
4401 free(h->TAU);
4402 free(h->WORK);
4403
4404 free(h);
4405 h=NULL;
4406 *phWork = NULL;
4407 }
4408}
4409
4411(
4412 void* const hWork,
4413 double* A,
4414 int N
4415)
4416{
4418 veclib_int i,j,INFO, LWORK, lwork3;
4419 double lwork2, det;
4420
4421 /* Simple cases: */
4422 if(N==2){
4423 return A[0]*A[3] - A[2]*A[1];
4424 }
4425 else if(N==3){
4426 return A[0] * ((A[4]*A[8]) - (A[7]*A[5])) -A[1] * (A[3]
4427 * A[8] - A[6] * A[5]) + A[2] * (A[3] * A[7] - A[6] * A[4]);
4428 }
4429 else if (N==4){
4430 return
4431 A[3] * A[6] * A[9] * A[12] - A[2] * A[7] * A[9] * A[12] -
4432 A[3] * A[5] * A[10] * A[12] + A[1] * A[7] * A[10] * A[12] +
4433 A[2] * A[5] * A[11] * A[12] - A[1] * A[6] * A[11] * A[12] -
4434 A[3] * A[6] * A[8] * A[13] + A[2] * A[7] * A[8] * A[13] +
4435 A[3] * A[4] * A[10] * A[13] - A[0] * A[7] * A[10] * A[13] -
4436 A[2] * A[4] * A[11] * A[13] + A[0] * A[6] * A[11] * A[13] +
4437 A[3] * A[5] * A[8] * A[14] - A[1] * A[7] * A[8] * A[14] -
4438 A[3] * A[4] * A[9] * A[14] + A[0] * A[7] * A[9] * A[14] +
4439 A[1] * A[4] * A[11] * A[14] - A[0] * A[5] * A[11] * A[14] -
4440 A[2] * A[5] * A[8] * A[15] + A[1] * A[6] * A[8] * A[15] +
4441 A[2] * A[4] * A[9] * A[15] - A[0] * A[6] * A[9] * A[15] -
4442 A[1] * A[4] * A[10] * A[15] + A[0] * A[5] * A[10] * A[15];
4443 }
4444
4445 /* Otherwise, the arbitrary NxN solution: */
4446 /* Work struct */
4447 if(hWork==NULL)
4448 utility_ddet_create((void**)&h, N);
4449 else{
4450 h = (utility_ddet_data*)(hWork);
4451#ifndef NDEBUG
4452 saf_assert(N<=h->maxN, "N exceeds the maximum length specified");
4453#endif
4454 }
4455
4456 /* Store in column major order */
4457 for(i=0; i<N; i++)
4458 for(j=0; j<N; j++)
4459 h->tmp[j*N+i] = A[i*N+j];
4460
4461 /* Query how much "work" memory is required */
4462 LWORK=-1;
4463#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4464 dgeqrf_(&N, &N, h->tmp, &N, h->TAU, &lwork2, &LWORK, &INFO);
4465#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4466 saf_print_error("No such implementation in ATLAS CLAPACK");
4467#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4468 INFO = LAPACKE_dgeqrf_work(CblasColMajor, N, N, h->tmp, N, h->TAU, &lwork2, LWORK);
4469#endif
4470 lwork3=(veclib_int)lwork2;
4471 if(lwork3>h->currentWorkSize){
4472 h->currentWorkSize = lwork3;
4473 h->WORK = realloc1d(h->WORK, h->currentWorkSize*sizeof(double));
4474 }
4475
4476 /* Decompose matrix */
4477#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4478 dgeqrf_(&N, &N, h->tmp, &N, h->TAU, h->WORK, &lwork3, &INFO);
4479#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4480 saf_print_error("No such implementation in ATLAS CLAPACK");
4481#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4482 INFO = LAPACKE_dgeqrf_work(CblasColMajor, N, N, h->tmp, N, h->TAU, h->WORK, lwork3);
4483#endif
4484
4485 if(INFO!=0) {
4486 det=0.0;
4487#ifndef NDEBUG
4488 /* Input matrix was singular or contained illegal values so no solution
4489 * was attempted. In these cases the determinant is returned as 0. */
4490 saf_print_warning("Unable to compute determinant of input matrix. The function utility_ddet() returned 0. ");
4491#endif
4492 }
4493 else {
4494 det=1;
4495 for (i=0;i<N;i++)
4496 det*=h->tmp[i*N+i];
4497 /* Householder algorithm */
4498 if(N%2==0)
4499 det*=-1;
4500 }
4501
4502 /* clean-up */
4503 if(hWork == NULL)
4504 utility_ddet_destroy((void**)&h);
4505
4506 return det;
4507}
4508
4509
4510/* ========================================================================== */
4511/* Matrix Inversion (?inv) */
4512/* ========================================================================== */
4513
4515typedef struct _utility_sinv_data {
4516 int maxN;
4517 veclib_int *IPIV;
4518 float *WORK, *tmp;
4520
4521void utility_sinv_create(void ** const phWork, int maxN)
4522{
4523 *phWork = malloc1d(sizeof(utility_sinv_data));
4524 utility_sinv_data *h = (utility_sinv_data*)(*phWork);
4525
4526 h->maxN = maxN;
4527 h->IPIV = malloc1d(maxN*sizeof(veclib_int));
4528 h->tmp = malloc1d(maxN*maxN*sizeof(float));
4529 h->WORK = malloc1d(maxN*maxN*sizeof(float));
4530}
4531
4532void utility_sinv_destroy(void ** const phWork)
4533{
4534 utility_sinv_data *h = (utility_sinv_data*)(*phWork);
4535
4536 if(h!=NULL){
4537 free(h->IPIV);
4538 free(h->tmp);
4539 free(h->WORK);
4540
4541 free(h);
4542 h=NULL;
4543 *phWork = NULL;
4544 }
4545}
4546
4548(
4549 void* const hWork,
4550 float* A,
4551 float* B,
4552 const int N
4553)
4554{
4556 veclib_int i, j, N_;
4557 veclib_int LWORK;
4558 veclib_int INFO;
4559
4560 LWORK = N*N;
4561 N_ = (veclib_int)N;
4562
4563 /* Work struct */
4564 if(hWork==NULL)
4565 utility_sinv_create((void**)&h, N);
4566 else{
4567 h = (utility_sinv_data*)(hWork);
4568#ifndef NDEBUG
4569 saf_assert(N<=h->maxN, "N exceeds the maximum length specified");
4570#endif
4571 }
4572
4573 /* Store in column major order */
4574 for(i=0; i<N; i++)
4575 for(j=0; j<N; j++)
4576 h->tmp[j*N+i] = A[i*N+j];
4577
4578#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4579 sgetrf_((veclib_int*)&N_, (veclib_int*)&N_, h->tmp, (veclib_int*)&N_, h->IPIV, &INFO);
4580 sgetri_((veclib_int*)&N_, h->tmp, (veclib_int*)&N_, h->IPIV, h->WORK, &LWORK, &INFO);
4581#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4582 INFO = clapack_sgetrf(CblasColMajor, N_, N_, h->tmp, N_, h->IPIV);
4583 INFO = clapack_sgetri(CblasColMajor, N_, h->tmp, N_, h->IPIV);
4584#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4585 INFO = LAPACKE_sgetrf_work(CblasColMajor, N_, N_, h->tmp, N_, h->IPIV);
4586 INFO = LAPACKE_sgetri_work(CblasColMajor, N_, h->tmp, N_, h->IPIV, h->WORK, LWORK);
4587#endif
4588
4589 if(INFO!=0) {
4590 memset(B, 0, N*N*sizeof(float));
4591#ifndef NDEBUG
4592 /* Input matrix was singular or contained illegal values so no inversion
4593 * was attempted. In these cases the function will zero the output
4594 * matrix. */
4595 saf_print_warning("Unable to compute the inverse of input matrix. The function utility_sinv() returned a matrix of zeros. ");
4596#endif
4597 }
4598 else {
4599 /* Output in row major order */
4600 for(i=0; i<N; i++)
4601 for(j=0; j<N; j++)
4602 B[j*N+i] = h->tmp[i*N+j];
4603 }
4604
4605 /* clean-up */
4606 if(hWork == NULL)
4607 utility_sinv_destroy((void**)&h);
4608}
4609
4611typedef struct _utility_dinv_data {
4612 int maxN;
4613 veclib_int *IPIV;
4614 double *WORK, *tmp;
4616
4617void utility_dinv_create(void ** const phWork, int maxN)
4618{
4619 *phWork = malloc1d(sizeof(utility_dinv_data));
4620 utility_dinv_data *h = (utility_dinv_data*)(*phWork);
4621
4622 h->maxN = maxN;
4623 h->IPIV = malloc1d(maxN*sizeof(veclib_int));
4624 h->tmp = malloc1d(maxN*maxN*sizeof(double));
4625 h->WORK = malloc1d(maxN*maxN*sizeof(double));
4626}
4627
4628void utility_dinv_destroy(void ** const phWork)
4629{
4630 utility_dinv_data *h = (utility_dinv_data*)(*phWork);
4631
4632 if(h!=NULL){
4633 free(h->IPIV);
4634 free(h->tmp);
4635 free(h->WORK);
4636
4637 free(h);
4638 h=NULL;
4639 *phWork = NULL;
4640 }
4641}
4642
4644(
4645 void* const hWork,
4646 double* A,
4647 double* B,
4648 const int N
4649)
4650{
4652 veclib_int i, j, N_;
4653 veclib_int LWORK;
4654 veclib_int INFO;
4655
4656 LWORK = N*N;
4657 N_ = (veclib_int)N;
4658
4659 /* Work struct */
4660 if(hWork==NULL)
4661 utility_dinv_create((void**)&h, N);
4662 else{
4663 h = (utility_dinv_data*)(hWork);
4664#ifndef NDEBUG
4665 saf_assert(N<=h->maxN, "N exceeds the maximum length specified");
4666#endif
4667 }
4668
4669 /* Store in column major order */
4670 for(i=0; i<N; i++)
4671 for(j=0; j<N; j++)
4672 h->tmp[j*N+i] = A[i*N+j];
4673
4674#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4675 dgetrf_((veclib_int*)&N_, (veclib_int*)&N_, h->tmp, (veclib_int*)&N_, h->IPIV, &INFO);
4676 dgetri_((veclib_int*)&N_, h->tmp, (veclib_int*)&N_, h->IPIV, h->WORK, &LWORK, &INFO);
4677#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4678 INFO = clapack_dgetrf(CblasColMajor, N_, N_, h->tmp, N_, h->IPIV);
4679 INFO = clapack_dgetri(CblasColMajor, N_, h->tmp, N_, h->IPIV);
4680#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4681 INFO = LAPACKE_dgetrf_work(CblasColMajor, N_, N_, h->tmp, N_, h->IPIV);
4682 INFO = LAPACKE_dgetri_work(CblasColMajor, N_, h->tmp, N_, h->IPIV, h->WORK, LWORK);
4683#endif
4684
4685 if(INFO!=0) {
4686 memset(B, 0, N*N*sizeof(double));
4687#ifndef NDEBUG
4688 /* Input matrix was singular or contained illegal values so no inversion
4689 * was attempted. In these cases the function will zero the output
4690 * matrix. */
4691 saf_print_warning("Unable to compute the inverse of input matrix. The function utility_dinv() returned a matrix of zeros. ");
4692#endif
4693 }
4694 else {
4695 /* Output in row major order */
4696 for(i=0; i<N; i++)
4697 for(j=0; j<N; j++)
4698 B[j*N+i] = h->tmp[i*N+j];
4699 }
4700
4701 /* clean-up */
4702 if(hWork == NULL)
4703 utility_dinv_destroy((void**)&h);
4704}
4705
4707typedef struct _utility_cinv_data {
4708 int maxN;
4709 veclib_int *IPIV;
4710 float_complex *WORK, *tmp;
4712
4713void utility_cinv_create(void ** const phWork, int maxN)
4714{
4715 *phWork = malloc1d(sizeof(utility_cinv_data));
4716 utility_cinv_data *h = (utility_cinv_data*)(*phWork);
4717
4718 h->maxN = maxN;
4719 h->IPIV = malloc1d(maxN*sizeof(veclib_int));
4720 h->tmp = malloc1d(maxN*maxN*sizeof(float_complex));
4721 h->WORK = malloc1d(maxN*maxN*sizeof(float_complex));
4722}
4723
4724void utility_cinv_destroy(void ** const phWork)
4725{
4726 utility_cinv_data *h = (utility_cinv_data*)(*phWork);
4727
4728 if(h!=NULL){
4729 free(h->IPIV);
4730 free(h->tmp);
4731 free(h->WORK);
4732
4733 free(h);
4734 h=NULL;
4735 *phWork = NULL;
4736 }
4737}
4738
4740(
4741 void* const hWork,
4742 float_complex* A,
4743 float_complex* B,
4744 const int N
4745)
4746{
4748 veclib_int i, j, N_;
4749 veclib_int LWORK;
4750 veclib_int INFO;
4751
4752 N_ = (veclib_int)N;
4753 LWORK = N*N;
4754
4755 /* Work struct */
4756 if(hWork==NULL)
4757 utility_cinv_create((void**)&h, N);
4758 else{
4759 h = (utility_cinv_data*)(hWork);
4760#ifndef NDEBUG
4761 saf_assert(N<=h->maxN, "N exceeds the maximum length specified");
4762#endif
4763 }
4764
4765 /* Store in column major order */
4766 for(i=0; i<N; i++)
4767 for(j=0; j<N; j++)
4768 h->tmp[j*N+i] = A[i*N+j];
4769
4770#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4771 cgetrf_((veclib_int*)&N_, (veclib_int*)&N_, (veclib_float_complex*)h->tmp, (veclib_int*)&N_, h->IPIV, &INFO);
4772 cgetri_((veclib_int*)&N_, (veclib_float_complex*)h->tmp, (veclib_int*)&N_, h->IPIV, (veclib_float_complex*)h->WORK, &LWORK, &INFO);
4773#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4774 INFO = clapack_cgetrf(CblasColMajor, N_, N_, (veclib_float_complex*)h->tmp, N_, h->IPIV);
4775 INFO = clapack_cgetri(CblasColMajor, N_, (veclib_float_complex*)h->tmp, N_, h->IPIV);
4776#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4777 INFO = LAPACKE_cgetrf_work(CblasColMajor, N_, N_, (veclib_float_complex*)h->tmp, N_, h->IPIV);
4778 INFO = LAPACKE_cgetri_work(CblasColMajor, N_, (veclib_float_complex*)h->tmp, N_, h->IPIV, (veclib_float_complex*)h->WORK, LWORK);
4779#endif
4780
4781 if(INFO!=0) {
4782 memset(B, 0, N*N*sizeof(float_complex));
4783#ifndef NDEBUG
4784 /* Input matrix was singular or contained illegal values so no inversion
4785 * was attempted. In these cases the function will zero the output
4786 * matrix. */
4787 saf_print_warning("Unable to compute the inverse of input matrix. The function utility_cinv() returned a matrix of zeros. ");
4788#endif
4789 }
4790 else {
4791 /* Output in row major order */
4792 for(i=0; i<N; i++)
4793 for(j=0; j<N; j++)
4794 B[j*N+i] = h->tmp[i*N+j];
4795 }
4796
4797 /* clean-up */
4798 if(hWork == NULL)
4799 utility_cinv_destroy((void**)&h);
4800}
void utility_dimaxv(const double *a, const int len, int *index)
Double-precision, index of maximum absolute value in a vector, i.e.
#define saf_print_error(message)
Macro to print a error message along with the filename and line number.
void utility_spinv_create(void **const phWork, int maxDim1, int maxDim2)
(Optional) Pre-allocate the working struct used by utility_spinv()
void utility_dinv(void *const hWork, double *A, double *B, const int N)
Matrix inversion: double precision, i.e.
double utility_ddet(void *const hWork, double *A, int N)
Determinant of a Matrix, double precision, i,e.
void utility_dinv_create(void **const phWork, int maxN)
(Optional) Pre-allocate the working struct used by utility_dinv()
void utility_zvvsub(const double_complex *a, const double_complex *b, const int len, double_complex *c)
Double-precision, complex, vector-vector subtraction, i.e.
void utility_ceigmp(void *const hWork, const float_complex *A, const float_complex *B, const int dim, float_complex *VL, float_complex *VR, float_complex *D)
Computes eigenvalues of a matrix pair using the QZ method, single precision complex,...
#define saf_assert(x, message)
Macro to make an assertion, along with a string explaining its purpose.
void utility_cpinv_create(void **const phWork, int maxDim1, int maxDim2)
(Optional) Pre-allocate the working struct used by utility_cpinv()
void utility_zsv2cv_inds(const double_complex *sv, const int *inds, const int len, double_complex *cv)
Double-precision complex, sparse-vector to compressed vector given known indices.
void utility_cglslv_destroy(void **const phWork)
De-allocate the working struct used by utility_cglslv()
void utility_cglslv(void *const hWork, const float_complex *A, const int dim, float_complex *B, int nCol, float_complex *X)
General linear solver: single precision complex, i.e.
void utility_ddet_destroy(void **const phWork)
De-allocate the working struct used by utility_ddet()
void utility_zeig_destroy(void **const phWork)
De-allocate the working struct used by utility_zeig()
void utility_cvvmul(const float_complex *a, const float_complex *b, const int len, float_complex *c)
Single-precision, complex, element-wise vector-vector multiplication i.e.
void utility_svvmul(const float *a, const float *b, const int len, float *c)
Single-precision, element-wise vector-vector multiplication i.e.
void utility_ceig_destroy(void **const phWork)
De-allocate the working struct used by utility_ceig()
void utility_cinv(void *const hWork, float_complex *A, float_complex *B, const int N)
Matrix inversion: double precision complex, i.e.
void utility_dvsmul(double *a, const double *s, const int len, double *c)
Double-precision, multiplies each element in vector 'a' with a scalar 's', i.e.
void utility_csv2cv_inds(const float_complex *sv, const int *inds, const int len, float_complex *cv)
Single-precision complex, sparse-vector to compressed vector given known indices.
void utility_svmod(const float *a, const float *b, const int len, float *c)
Single-precision, modulus of vector elements, i.e.
void utility_ziminv(const double_complex *a, const int len, int *index)
Double-precision, complex, index of maximum absolute value in a vector, i.e.
void utility_dvvsub(const double *a, const double *b, const int len, double *c)
Double-precision, vector-vector subtraction, i.e.
void utility_dsv2cv_inds(const double *sv, const int *inds, const int len, double *cv)
Double-precision, sparse-vector to compressed vector given known indices i.e.
#define SAF_FALSE
Boolean false.
void utility_csvd_destroy(void **const phWork)
De-allocate the working struct used by utility_csvd()
#define SAF_MAX(a, b)
Returns the maximum of the two values.
void utility_ceigmp_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_ceigmp()
void utility_dglslv(void *const hWork, const double *A, const int dim, double *B, int nCol, double *X)
General linear solver: double precision, i.e.
void utility_zvsmul(double_complex *a, const double_complex *s, const int len, double_complex *c)
Double-precision, complex, multiplies each element in vector 'a' with a scalar 's',...
void utility_dglslv_destroy(void **const phWork)
De-allocate the working struct used by utility_dglslv()
void utility_schol_destroy(void **const phWork)
De-allocate the working struct used by utility_schol()
void utility_zeig_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_zeig()
void utility_svsadd(float *a, const float *s, const int len, float *c)
Single-precision, adds each element in vector 'a' with a scalar 's', i.e.
void utility_cseig_destroy(void **const phWork)
De-allocate the working struct used by utility_cseig()
void utility_ciminv(const float_complex *a, const int len, int *index)
Single-precision, complex, index of maximum absolute value in a vector, i.e.
void utility_dvvadd(const double *a, const double *b, const int len, double *c)
Double-precision, vector-vector addition, i.e.
void utility_svssub(float *a, const float *s, const int len, float *c)
Single-precision, subtracts each element in vector 'a' with a scalar 's', i.e.
void utility_svvsub(const float *a, const float *b, const int len, float *c)
Single-precision, vector-vector subtraction, i.e.
void utility_ssvd_create(void **const phWork, int maxDim1, int maxDim2)
(Optional) Pre-allocate the working struct used by utility_ssvd()
void utility_sglslvt(void *const hWork, const float *A, const int dim, float *B, int nCol, float *X)
General linear solver (the other way): single precision, i.e.
void utility_ssvd(void *const hWork, const float *A, const int dim1, const int dim2, float *U, float *S, float *V, float *sing)
Singular value decomposition: single precision, i.e.
void utility_zimaxv(const double_complex *a, const int len, int *index)
Double-precision, complex, index of maximum absolute value in a vector, i.e.
void utility_zeigmp(void *const hWork, const double_complex *A, const double_complex *B, const int dim, double_complex *VL, double_complex *VR, double_complex *D)
Computes eigenvalues of a matrix pair using the QZ method, double precision complex,...
void utility_cinv_destroy(void **const phWork)
De-allocate the working struct used by utility_cinv()
void utility_ceig(void *const hWork, const float_complex *A, const int dim, float_complex *VL, float_complex *VR, float_complex *D, float_complex *eig)
Eigenvalue decomposition of a NON-SYMMETRIC matrix: single precision complex, i.e.
void utility_cvvadd(const float_complex *a, const float_complex *b, const int len, float_complex *c)
Single-precision, complex, vector-vector addition, i.e.
void utility_sinv_create(void **const phWork, int maxN)
(Optional) Pre-allocate the working struct used by utility_sinv()
void utility_zglslv(void *const hWork, const double_complex *A, const int dim, double_complex *B, int nCol, double_complex *X)
General linear solver: double precision complex, i.e.
void utility_sglslv(void *const hWork, const float *A, const int dim, float *B, int nCol, float *X)
General linear solver: single precision, i.e.
CONJ_FLAG
Whether a vector should be conjugated or not (e.g.
void utility_svvcopy(const float *a, const int len, float *c)
Single-precision, vector-vector copy, i.e.
void utility_cimaxv(const float_complex *a, const int len, int *index)
Single-precision, complex, index of maximum absolute value in a vector, i.e.
#define SAF_TRUE
Boolean true.
void utility_spinv(void *const hWork, const float *inM, const int dim1, const int dim2, float *outM)
General matrix pseudo-inverse (the svd way): single precision, i.e.
void utility_dpinv_create(void **const phWork, int maxDim1, int maxDim2)
(Optional) Pre-allocate the working struct used by utility_dpinv()
void utility_cpinv(void *const hWork, const float_complex *inM, const int dim1, const int dim2, float_complex *outM)
General matrix pseudo-inverse (the svd way): single precision complex, i.e.
void utility_cslslv_create(void **const phWork, int maxDim, int maxNCol)
(Optional) Pre-allocate the working struct used by utility_cslslv()
void utility_sseig_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_sseig()
void utility_sinv(void *const hWork, float *A, float *B, const int N)
Matrix inversion: single precision, i.e.
void utility_zglslv_destroy(void **const phWork)
De-allocate the working struct used by utility_zglslv()
void utility_cpinv_destroy(void **const phWork)
De-allocate the working struct used by utility_cpinv()
void utility_sglslvt_destroy(void **const phWork)
De-allocate the working struct used by utility_sglslvt()
void utility_dpinv_destroy(void **const phWork)
De-allocate the working struct used by utility_dpinv()
void utility_cseig(void *const hWork, const float_complex *A, const int dim, int sortDecFLAG, float_complex *V, float_complex *D, float *eig)
Eigenvalue decomposition of a SYMMETRIC/HERMITION matrix: single precision complex,...
void utility_csvd(void *const hWork, const float_complex *A, const int dim1, const int dim2, float_complex *U, float_complex *S, float_complex *V, float *sing)
Singular value decomposition: single precision complex, i.e.
void utility_dpinv(void *const hWork, const double *inM, const int dim1, const int dim2, double *outM)
General matrix pseudo-inverse (the svd way): double precision, i.e.
void utility_cseig_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_cseig()
void utility_diminv(const double *a, const int len, int *index)
Double-precision, index of minimum absolute value in a vector, i.e.
void utility_sslslv_destroy(void **const phWork)
De-allocate the working struct used by utility_sslslv()
void utility_schol(void *const hWork, const float *A, const int dim, float *X)
Cholesky factorisation of a symmetric matrix positive-definate matrix: single precision,...
#define SAF_MIN(a, b)
Returns the minimum of the two values.
void utility_zeig(void *const hWork, const double_complex *A, const int dim, double_complex *VL, double_complex *VR, double_complex *D, double_complex *eig)
Eigenvalue decomposition of a NON-SYMMETRIC matrix: double precision complex, i.e.
void utility_cvabs(const float_complex *a, const int len, float *c)
Single-precision, complex, absolute values of vector elements, i.e.
void utility_cvvdot(const float_complex *a, const float_complex *b, const int len, CONJ_FLAG flag, float_complex *c)
Single-precision, complex, vector-vector dot product, i.e.
void utility_sglslv_create(void **const phWork, int maxDim, int maxNCol)
(Optional) Pre-allocate the working struct used by utility_sglslv()
void utility_cinv_create(void **const phWork, int maxN)
(Optional) Pre-allocate the working struct used by utility_cinv()
void utility_cvvsub(const float_complex *a, const float_complex *b, const int len, float_complex *c)
Single-precision, complex, vector-vector subtraction, i.e.
void utility_zpinv(void *const hWork, const double_complex *inM, const int dim1, const int dim2, double_complex *outM)
General matrix pseudo-inverse (the svd way): double precision complex, i.e.
void utility_zpinv_create(void **const phWork, int maxDim1, int maxDim2)
(Optional) Pre-allocate the working struct used by utility_zpinv()
void utility_sseig(void *const hWork, const float *A, const int dim, int sortDecFLAG, float *V, float *D, float *eig)
Eigenvalue decomposition of a SYMMETRIC matrix: single precision, i.e.
void utility_dinv_destroy(void **const phWork)
De-allocate the working struct used by utility_dinv()
void utility_cchol(void *const hWork, const float_complex *A, const int dim, float_complex *X)
Cholesky factorisation of a hermitian positive-definate matrix: single precision complex,...
#define saf_print_warning(message)
Macro to print a warning message along with the filename and line number.
void utility_cchol_destroy(void **const phWork)
De-allocate the working struct used by utility_cchol()
void utility_zvvcopy(const double_complex *a, const int len, double_complex *c)
double-precision, complex, vector-vector copy, i.e.
float utility_sdet(void *const hWork, float *A, int N)
Determinant of a Matrix, single precision, i,e.
void utility_zeigmp_destroy(void **const phWork)
De-allocate the working struct used by utility_zeigmp()
void utility_schol_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_schol()
void utility_sslslv_create(void **const phWork, int maxDim, int maxNCol)
(Optional) Pre-allocate the working struct used by utility_sslslv()
void utility_zvvadd(const double_complex *a, const double_complex *b, const int len, double_complex *c)
Double-precision, complex, vector-vector addition, i.e.
void utility_ceigmp_destroy(void **const phWork)
De-allocate the working struct used by utility_ceigmp()
void utility_csvd_create(void **const phWork, int maxDim1, int maxDim2)
(Optional) Pre-allocate the working struct used by utility_csvd()
void utility_cvvcopy(const float_complex *a, const int len, float_complex *c)
Single-precision, complex, vector-vector copy, i.e.
void utility_svvadd(const float *a, const float *b, const int len, float *c)
Single-precision, vector-vector addition, i.e.
void utility_svrecip(const float *a, const int len, float *c)
Single-precision, vector-reciprocal/inversion, i.e.
void utility_sglslvt_create(void **const phWork, int maxDim, int maxNCol)
(Optional) Pre-allocate the working struct used by utility_sglslvt()
void utility_spinv_destroy(void **const phWork)
De-allocate the working struct used by utility_spinv()
void utility_sslslv(void *const hWork, const float *A, const int dim, float *B, int nCol, float *X)
Linear solver for SYMMETRIC positive-definate 'A': single precision, i.e.
void utility_ddet_create(void **const phWork, int maxN)
(Optional) Pre-allocate the working struct used by utility_ddet()
void utility_sdet_create(void **const phWork, int maxN)
(Optional) Pre-allocate the working struct used by utility_sdet()
void utility_cvsmul(float_complex *a, const float_complex *s, const int len, float_complex *c)
Single-precision, complex, multiplies each element in vector 'a' with a scalar 's',...
void utility_cglslv_create(void **const phWork, int maxDim, int maxNCol)
(Optional) Pre-allocate the working struct used by utility_cglslv()
void utility_ceig_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_ceig()
void utility_dglslv_create(void **const phWork, int maxDim, int maxNCol)
(Optional) Pre-allocate the working struct used by utility_dglslv()
void utility_svvdot(const float *a, const float *b, const int len, float *c)
Single-precision, vector-vector dot product, i.e.
void utility_svsdiv(const float *a, const float *s, const int len, float *c)
Single-precision, divides each element in vector 'a' with a scalar 's', i.e.
void utility_cslslv_destroy(void **const phWork)
De-allocate the working struct used by utility_cslslv()
void utility_zglslv_create(void **const phWork, int maxDim, int maxNCol)
(Optional) Pre-allocate the working struct used by utility_zglslv()
void utility_cvconj(const float_complex *a, const int len, float_complex *c)
Single-precision, complex, vector-conjugate, i.e.
void utility_dvvcopy(const double *a, const int len, double *c)
double-precision, vector-vector copy, i.e.
void utility_sseig_destroy(void **const phWork)
De-allocate the working struct used by utility_sseig()
void utility_svabs(const float *a, const int len, float *c)
Single-precision, absolute values of vector elements, i.e.
void utility_svsmul(float *a, const float *s, const int len, float *c)
Single-precision, multiplies each element in vector 'a' with a scalar 's', i.e.
void utility_zvconj(const double_complex *a, const int len, double_complex *c)
Double-precision, complex, vector-conjugate, i.e.
void utility_sinv_destroy(void **const phWork)
De-allocate the working struct used by utility_sinv()
void utility_cchol_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_cchol()
void utility_sdet_destroy(void **const phWork)
De-allocate the working struct used by utility_sdet()
void utility_siminv(const float *a, const int len, int *index)
Single-precision, index of minimum absolute value in a vector, i.e.
void utility_zeigmp_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_zeigmp()
void utility_ssv2cv_inds(const float *sv, const int *inds, const int len, float *cv)
Single-precision, sparse-vector to compressed vector given known indices i.e.
void utility_sglslv_destroy(void **const phWork)
De-allocate the working struct used by utility_sglslv()
void utility_simaxv(const float *a, const int len, int *index)
Single-precision, index of maximum absolute value in a vector, i.e.
void utility_ssvd_destroy(void **const phWork)
De-allocate the working struct used by utility_ssvd()
void utility_zpinv_destroy(void **const phWork)
De-allocate the working struct used by utility_zpinv()
void utility_cslslv(void *const hWork, const float_complex *A, const int dim, float_complex *B, int nCol, float_complex *X)
Linear solver for HERMITIAN positive-definate 'A': single precision complex, i.e.
@ NO_CONJ
Do not take the conjugate.
@ CONJ
Take the conjugate.
void * malloc1d(size_t dim1_data_size)
1-D malloc (same as malloc, but with error checking)
Definition md_malloc.c:59
void * realloc1d(void *ptr, size_t dim1_data_size)
1-D realloc (same as realloc, but with error checking)
Definition md_malloc.c:79
Include header for SAF externals.
Main header for the utilities module (SAF_UTILITIES_MODULE)
double veclib_double
real: 8-bytes
MKL_INT veclib_int
integer: 4-bytes
MKL_Complex16 veclib_double_complex
complex: 16-bytes
MKL_Complex8 veclib_float_complex
complex: 8-bytes
#define SAF_INTEL_MKL_VML_MODE
Taken from the Intel MKL VML developers reference documentation: The flag(s) that can be passed to In...
float veclib_float
real: 4-bytes
Data structure for utility_cchol()
Data structure for utility_ceig()
Data structure for utility_ceigmp()
Data structure for utility_cglslv()
Data structure for utility_cinv()
Data structure for utility_cpinv()
Data structure for utility_cseig()
Data structure for utility_cslslv()
Data structure for utility_csvd()
Data structure for utility_ddet()
Data structure for utility_dglslv()
Data structure for utility_dinv()
Data structure for utility_dpinv()
Data structure for utility_schol()
Data structure for utility_sdet()
Data structure for utility_sglslv()
Data structure for utility_sglslvt()
Data structure for utility_sinv()
Data structure for utility_spinv()
Data structure for utility_sseig()
Data structure for utility_sslslv()
Data structure for utility_ssvd()
Data structure for utility_zeig()
Data structure for utility_zeigmp()
Data structure for utility_zglslv()
Data structure for utility_zpinv()