52#if defined(SAF_USE_INTEL_MKL_LP64)
54# define SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE
55#elif defined(SAF_USE_INTEL_MKL_ILP64)
57# define SAF_VECLIB_USE_LAPACKE_INTERFACE
58#elif defined(SAF_USE_OPEN_BLAS_AND_LAPACKE)
59# define SAF_VECLIB_USE_LAPACKE_INTERFACE
60#elif defined(SAF_USE_ATLAS)
61# define SAF_VECLIB_USE_CLAPACK_INTERFACE
62#elif defined(__APPLE__) && defined(SAF_USE_APPLE_ACCELERATE)
63# define SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE
64#elif defined(SAF_USE_GSL)
65# define SAF_VECLIB_USE_GSL_LINALG
67# error No LAPACK interface was specified!
71#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
72# ifndef SAF_INTEL_MKL_VML_MODE
92# define SAF_INTEL_MKL_VML_MODE ( VML_LA | VML_FTZDAZ_ON )
97#if defined(__APPLE__) && defined(SAF_USE_APPLE_ACCELERATE)
103#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
104# ifdef SAF_USE_INTEL_MKL_LP64
113#elif defined(SAF_USE_OPEN_BLAS_AND_LAPACKE)
132#ifdef SAF_USE_BUILT_IN_NAIVE_CBLAS
133void cblas_scopy(
const int N,
const float *X,
const int incX,
float *Y,
const int incY){
135#if defined(SAF_ENABLE_SIMD)
136 if(incX==1 && incY==1){
137 for(i=0; i<(N-3); i+=4)
138 _mm_storeu_ps(Y+i, _mm_loadu_ps(X+i));
144 Y[i*incY] = X[i*incX];
147 Y[i*incY] = X[i*incX];
151void cblas_dcopy(
const int N,
const double *X,
const int incX,
double *Y,
const int incY){
153 for(i=j=0; i<N; i+=incX, j+=incY)
157void cblas_ccopy(
const int N,
const void *X,
const int incX,
void *Y,
const int incY){
159 float_complex *cX, *cY;
160 cX = (float_complex*)X;
161 cY = (float_complex*)Y;
162 for(i=j=0; i<N; i+=incX, j+=incY)
166void cblas_zcopy(
const int N,
const void *X,
const int incX,
void *Y,
const int incY){
168 double_complex *cX, *cY;
169 cX = (double_complex*)X;
170 cY = (double_complex*)Y;
171 for(i=j=0; i<N; i+=incX, j+=incY)
175void cblas_saxpy(
const int N,
const float alpha,
const float* X,
const int incX,
float* Y,
const int incY) {
177 for (i=j=0; i<N; i+=incX, j+=incY)
178 Y[i] = alpha * X[i] + Y[i];
181void cblas_daxpy(
const int N,
const double alpha,
const double* X,
const int incX,
double* Y,
const int incY) {
183 for (i=j=0; i<N; i+=incX, j+=incY)
184 Y[i] = alpha * X[i] + Y[i];
187float cblas_sdot(
const int N,
const float* X,
const int incX,
const float* Y,
const int incY){
190#if defined(SAF_ENABLE_SIMD)
192 if(incX==1 && incY==1){
194 __m128 sum = _mm_setzero_ps();
196 for(i=0; i<(N-3); i+=4)
197 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(X+i), _mm_loadu_ps(Y+i)));
199 sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
201 sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 2, 0, 1)));
202 _mm_store_ss(&ret, sum);
209 ret += X[i*incX] * Y[i*incY];
213 ret += X[i*incX] * Y[i*incY];
225#ifdef SAF_USE_BUILT_IN_NAIVE_CBLAS
226void cblas_sgemm(
const CBLAS_LAYOUT Layout,
const CBLAS_TRANSPOSE TransA,
227 const CBLAS_TRANSPOSE TransB,
const int M,
const int N,
228 const int K,
const float alpha,
const float *A,
229 const int lda,
const float *B,
const int ldb,
230 const float beta,
float *C,
const int ldc)
248 if(!_transA && !_transB){
253 C[i*ldc+j] += A[i*lda+k] * B[k*ldb+j];
257 else if(_transA && !_transB){
259 else if(_transA && _transB){
276#if defined(SAF_USE_APPLE_ACCELERATE)
279 vDSP_minmgvi(a, 1, &minVal, &ind_tmp, (vDSP_Length)len);
280 *index = (int)ind_tmp;
281#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
282 *index = (int)cblas_isamin(len, a, 1);
286 float minVal=FLT_MAX;
287 for(i=0; i<len; i++){
288 if(fabsf(a[i])<minVal){
289 minVal = fabsf(a[i]);
298 const float_complex* a,
303#if defined(SAF_USE_APPLE_ACCELERATE)
306 abs_a =
malloc1d(len*
sizeof(
float));
307 vDSP_vdist((
float*)a, 2, (
float*)a+1, 2, abs_a, 1, (vDSP_Length)len);
309 vDSP_minmgvi(abs_a, 1, &minVal, &ind_tmp, (vDSP_Length)len);
310 *index = (int)ind_tmp;
312#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
313 *index = (int)cblas_icamin(len, a, 1);
317 float minVal=FLT_MAX;
318 for(i=0; i<len; i++){
319 if(cabsf(a[i])<minVal){
320 minVal = cabsf(a[i]);
334#if defined(SAF_USE_APPLE_ACCELERATE)
337 vDSP_minmgviD(a, 1, &minVal, &ind_tmp, (vDSP_Length)len);
338 *index = (int)ind_tmp;
339#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
340 *index = (int)cblas_idamin(len, a, 1);
344 double minVal=DBL_MAX;
345 for(i=0; i<len; i++){
346 if(fabs(a[i])<minVal){
356 const double_complex* a,
361#if defined(SAF_USE_APPLE_ACCELERATE)
364 abs_a =
malloc1d(len*
sizeof(
double));
365 vDSP_vdistD((
double*)a, 2, (
double*)a+1, 2, abs_a, 1, (vDSP_Length)len);
367 vDSP_minmgviD(abs_a, 1, &minVal, &ind_tmp, (vDSP_Length)len);
368 *index = (int)ind_tmp;
370#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
371 *index = (int)cblas_izamin(len, a, 1);
375 double minVal=DBL_MAX;
376 for(i=0; i<len; i++){
377 if(cabs(a[i])<minVal){
397 *index = (int)cblas_isamax(len, a, 1);
402 const float_complex* a,
407 *index = (int)cblas_icamax(len, a, 1);
417 *index = (int)cblas_idamax(len, a, 1);
422 const double_complex* a,
427 *index = (int)cblas_izamax(len, a, 1);
442#if defined(SAF_USE_INTEL_IPP)
443 ippsAbs_32f((Ipp32f*)a, (Ipp32f*)c, len);
444#elif defined(SAF_USE_APPLE_ACCELERATE)
445 vDSP_vabs(a, 1, c, 1, (vDSP_Length)len);
446#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
457 const float_complex* a,
462#if defined(SAF_USE_INTEL_IPP)
463 ippsMagnitude_32fc((Ipp32fc*)a, (Ipp32f*)c, len);
464#elif defined(SAF_USE_APPLE_ACCELERATE)
465 vDSP_vdist((
float*)a, 2, (
float*)a+1, 2, c, 1, (vDSP_Length)len);
466#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
488#if defined(SAF_USE_APPLE_ACCELERATE)
489 vvfmodf(c, a, b, &len);
490#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
495 c[i] = fmodf(a[i], b[i]);
511#if defined(SAF_USE_INTEL_IPP)
512 ippsDivCRev_32f((Ipp32f*)a, 1.0f, (Ipp32f*)c, len);
513#elif defined(SAF_USE_APPLE_ACCELERATE)
516 vDSP_svdiv(&one, a, 1, c, 1, (vDSP_Length)len);
517#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
519#elif defined(SAF_ENABLE_SIMD)
522# if defined(__AVX512F__)
523 for(; i<(len-15); i+=16)
524 _mm512_storeu_ps(c+i, _mm512_rcp14_ps(_mm512_loadu_ps(a+i)));
526# if defined(__AVX__) && defined(__AVX2__)
527 for(; i<(len-7); i+=8)
528 _mm256_storeu_ps(c+i, _mm256_rcp_ps(_mm256_loadu_ps(a+i)));
530# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
531 for(; i<(len-3); i+=4)
532 _mm_storeu_ps(c+i, _mm_rcp_ps(_mm_loadu_ps(a+i)));
550 const float_complex* a,
555#if defined(SAF_USE_INTEL_IPP)
556 ippsConj_32fc((Ipp32fc*)a, (Ipp32fc*)c, len);
558 cblas_ccopy(len, a, 1, c, 1);
559 cblas_sscal(len, -1.0f, ((
float*)c)+1, 2);
565 const double_complex* a,
570#if defined(SAF_USE_INTEL_IPP)
571 ippsConj_64fc((Ipp64fc*)a, (Ipp64fc*)c, len);
573 cblas_zcopy(len, a, 1, c, 1);
574 cblas_dscal(len, -1.0, ((
double*)c)+1, 2);
590 cblas_scopy(len, a, 1, c, 1);
595 const float_complex* a,
600 cblas_ccopy(len, a, 1, c, 1);
610 cblas_dcopy(len, a, 1, c, 1);
615 const double_complex* a,
620 cblas_zcopy(len, a, 1, c, 1);
638 saf_assert(c!=NULL,
"'c' can no longer be NULL");
639 saf_assert(a!=c && b!=c,
"In-place operation is no longer supported, use e.g. cblas_saxby() instead.");
643#if defined(SAF_USE_INTEL_IPP)
644 ippsAdd_32f((Ipp32f*)a, (Ipp32f*)b, (Ipp32f*)c, len);
645#elif defined(SAF_USE_APPLE_ACCELERATE)
646 vDSP_vadd(a, 1, b, 1, c, 1, (vDSP_Length)len);
647#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
649#elif defined(SAF_ENABLE_SIMD)
652# if defined(__AVX512F__)
653 for(; i<(len-15); i+=16)
654 _mm512_storeu_ps(c+i, _mm512_add_ps(_mm512_loadu_ps(a+i), _mm512_loadu_ps(b+i)));
656# if defined(__AVX__) && defined(__AVX2__)
657 for(; i<(len-7); i+=8)
658 _mm256_storeu_ps(c+i, _mm256_add_ps(_mm256_loadu_ps(a+i), _mm256_loadu_ps(b+i)));
660# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
661 for(; i<(len-3); i+=4)
662 _mm_storeu_ps(c+i, _mm_add_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
669 for(i=0; i<len-3; i+=4){
671 c[i+1] = a[i+1] + b[i+1];
672 c[i+2] = a[i+2] + b[i+2];
673 c[i+3] = a[i+3] + b[i+3];
679 for (j = 0; j < len; j++)
686 const float_complex* a,
687 const float_complex* b,
694 saf_assert(c!=NULL,
"'c' can no longer be NULL");
695 saf_assert(a!=c && b!=c,
"In-place operation is no longer supported, use e.g. cblas_caxby() instead.");
699#if defined(SAF_USE_INTEL_IPP)
700 ippsAdd_32fc((Ipp32fc*)a, (Ipp32fc*)b, (Ipp32fc*)c, len);
701#elif defined(SAF_USE_APPLE_ACCELERATE)
702 vDSP_vadd((
float*)a, 1, (
float*)b, 1, (
float*)c, 1, 2*(vDSP_Length)len);
703#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
705#elif defined(SAF_ENABLE_SIMD)
709 sa = (
float*)a; sb = (
float*)b; sc = (
float*)c;
711# if defined(__AVX512F__)
712 for(; i<(len2-15); i+=16)
713 _mm512_storeu_ps(sc+i, _mm512_add_ps(_mm512_loadu_ps(sa+i), _mm512_loadu_ps(sb+i)));
715# if defined(__AVX__) && defined(__AVX2__)
716 for(; i<(len2-7); i+=8)
717 _mm256_storeu_ps(sc+i, _mm256_add_ps(_mm256_loadu_ps(sa+i), _mm256_loadu_ps(sb+i)));
719# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
720 for(; i<(len2-3); i+=4)
721 _mm_storeu_ps(sc+i, _mm_add_ps(_mm_loadu_ps(sa+i), _mm_loadu_ps(sb+i)));
724 sc[i] = sa[i] + sb[i];
725#elif __STDC_VERSION__ >= 199901L && defined(NDEBUG)
728 for(i=0; i<len-3; i+=4){
730 c[i+1] = a[i+1] + b[i+1];
731 c[i+2] = a[i+2] + b[i+2];
732 c[i+3] = a[i+3] + b[i+3];
738 for (j = 0; j < len; j++)
739 c[j] = ccaddf(a[j], b[j]);
753 saf_assert(c!=NULL,
"'c' can no longer be NULL");
754 saf_assert(a!=c && b!=c,
"In-place operation is no longer supported, use e.g. cblas_daxby() instead.");
758#if defined(SAF_USE_INTEL_IPP)
759 ippsAdd_64f((Ipp64f*)a, (Ipp64f*)b, (Ipp64f*)c, len);
760#elif defined(SAF_USE_APPLE_ACCELERATE)
761 vDSP_vaddD(a, 1, b, 1, c, 1, (vDSP_Length)len);
762#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
764#elif defined(SAF_ENABLE_SIMD)
767# if defined(__AVX512F__)
768 for(; i<(len-7); i+=8)
769 _mm512_storeu_pd(c+i, _mm512_add_pd(_mm512_loadu_pd(a+i), _mm512_loadu_pd(b+i)));
771# if defined(__AVX__) && defined(__AVX2__)
772 for(; i<(len-3); i+=4)
773 _mm256_storeu_pd(c+i, _mm256_add_pd(_mm256_loadu_pd(a+i), _mm256_loadu_pd(b+i)));
775# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
776 for(; i<(len-1); i+=2)
777 _mm_storeu_pd(c+i, _mm_add_pd(_mm_loadu_pd(a+i), _mm_loadu_pd(b+i)));
783 for (j = 0; j < len; j++)
790 const double_complex* a,
791 const double_complex* b,
798 saf_assert(c!=NULL,
"'c' can no longer be NULL");
799 saf_assert(a!=c && b!=c,
"In-place operation is no longer supported, use e.g. cblas_zaxby() instead.");
803#if defined(SAF_USE_INTEL_IPP)
804 ippsAdd_64fc((Ipp64fc*)a, (Ipp64fc*)b, (Ipp64fc*)c, len);
805#elif defined(SAF_USE_APPLE_ACCELERATE)
806 vDSP_vaddD((
double*)a, 1, (
double*)b, 1, (
double*)c, 1, 2*(vDSP_Length)len);
807#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
809#elif defined(SAF_ENABLE_SIMD)
811 double* sa, *sb, *sc;
813 sa = (
double*)a; sb = (
double*)b; sc = (
double*)c;
815# if defined(__AVX512F__)
816 for(; i<(len2-7); i+=8)
817 _mm512_storeu_pd(sc+i, _mm512_add_pd(_mm512_loadu_pd(sa+i), _mm512_loadu_pd(sb+i)));
819# if defined(__AVX__) && defined(__AVX2__)
820 for(; i<(len2-3); i+=4)
821 _mm256_storeu_pd(sc+i, _mm256_add_pd(_mm256_loadu_pd(sa+i), _mm256_loadu_pd(sb+i)));
823# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
824 for(; i<(len2-1); i+=2)
825 _mm_storeu_pd(sc+i, _mm_add_pd(_mm_loadu_pd(sa+i), _mm_loadu_pd(sb+i)));
828 sc[i] = sa[i] + sb[i];
831 for (j = 0; j < len; j++)
832 c[j] = ccadd(a[j], b[j]);
852 saf_assert(a!=c && b!=c,
"In-place operation is not supported.");
856#if defined(SAF_USE_INTEL_IPP)
857 ippsSub_32f((Ipp32f*)b, (Ipp32f*)a, (Ipp32f*)c, len);
858#elif defined(SAF_USE_APPLE_ACCELERATE)
859 vDSP_vsub(b, 1, a, 1, c, 1, (vDSP_Length)len);
860#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
862#elif defined(SAF_ENABLE_SIMD)
865# if defined(__AVX512F__)
866 for(; i<(len-15); i+=16)
867 _mm512_storeu_ps(c+i, _mm512_sub_ps(_mm512_loadu_ps(a+i), _mm512_loadu_ps(b+i)));
869# if defined(__AVX__) && defined(__AVX2__)
870 for(; i<(len-7); i+=8)
871 _mm256_storeu_ps(c+i, _mm256_sub_ps(_mm256_loadu_ps(a+i), _mm256_loadu_ps(b+i)));
873# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
874 for(; i<(len-3); i+=4)
875 _mm_storeu_ps(c+i, _mm_sub_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
882 for(i=0; i<len-3; i+=4){
884 c[i+1] = a[i+1] - b[i+1];
885 c[i+2] = a[i+2] - b[i+2];
886 c[i+3] = a[i+3] - b[i+3];
892 for (j = 0; j < len; j++)
899 const float_complex* a,
900 const float_complex* b,
908 saf_assert(a!=c && b!=c,
"In-place operation is not supported.");
912#if defined(SAF_USE_INTEL_IPP)
913 ippsSub_32fc((Ipp32fc*)b, (Ipp32fc*)a, (Ipp32fc*)c, len);
914#elif defined(SAF_USE_APPLE_ACCELERATE)
915 vDSP_vsub((
float*)b, 1, (
float*)a, 1, (
float*)c, 1, 2*(vDSP_Length)len);
916#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
918#elif defined(SAF_ENABLE_SIMD)
922 sa = (
float*)a; sb = (
float*)b; sc = (
float*)c;
924# if defined(__AVX512F__)
925 for(; i<(len2-15); i+=16)
926 _mm512_storeu_ps(sc+i, _mm512_sub_ps(_mm512_loadu_ps(sa+i), _mm512_loadu_ps(sb+i)));
928# if defined(__AVX__) && defined(__AVX2__)
929 for(; i<(len2-7); i+=8)
930 _mm256_storeu_ps(sc+i, _mm256_sub_ps(_mm256_loadu_ps(sa+i), _mm256_loadu_ps(sb+i)));
932# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
933 for(; i<(len2-3); i+=4)
934 _mm_storeu_ps(sc+i, _mm_sub_ps(_mm_loadu_ps(sa+i), _mm_loadu_ps(sb+i)));
937 sc[i] = sa[i] - sb[i];
938#elif __STDC_VERSION__ >= 199901L && defined(NDEBUG)
941 for(i=0; i<len-3; i+=4){
943 c[i+1] = a[i+1] - b[i+1];
944 c[i+2] = a[i+2] - b[i+2];
945 c[i+3] = a[i+3] - b[i+3];
951 for (j = 0; j < len; j++)
952 c[j] = ccsubf(a[j], b[j]);
967 saf_assert(a!=c && b!=c,
"In-place operation is not supported.");
971#if defined(SAF_USE_INTEL_IPP)
972 ippsSub_64f((Ipp64f*)b, (Ipp64f*)a, (Ipp64f*)c, len);
973#elif defined(SAF_USE_APPLE_ACCELERATE)
974 vDSP_vsubD(b, 1, a, 1, c, 1, (vDSP_Length)len);
975#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
977#elif defined(SAF_ENABLE_SIMD)
980# if defined(__AVX512F__)
981 for(; i<(len-7); i+=8)
982 _mm512_storeu_pd(c+i, _mm512_sub_pd(_mm512_loadu_pd(a+i), _mm512_loadu_pd(b+i)));
984# if defined(__AVX__) && defined(__AVX2__)
985 for(; i<(len-3); i+=4)
986 _mm256_storeu_pd(c+i, _mm256_sub_pd(_mm256_loadu_pd(a+i), _mm256_loadu_pd(b+i)));
988# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
989 for(; i<(len-1); i+=2)
990 _mm_storeu_pd(c+i, _mm_sub_pd(_mm_loadu_pd(a+i), _mm_loadu_pd(b+i)));
996 for (j = 0; j < len; j++)
1003 const double_complex* a,
1004 const double_complex* b,
1012 saf_assert(a!=c && b!=c,
"In-place operation is not supported.");
1016#if defined(SAF_USE_INTEL_IPP)
1017 ippsSub_64fc((Ipp64fc*)b, (Ipp64fc*)a, (Ipp64fc*)c, len);
1018#elif defined(SAF_USE_APPLE_ACCELERATE)
1019 vDSP_vsubD((
double*)b, 1, (
double*)a, 1, (
double*)c, 1, 2*(vDSP_Length)len);
1020#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1022#elif defined(SAF_ENABLE_SIMD)
1024 double* sa, *sb, *sc;
1026 sa = (
double*)a; sb = (
double*)b; sc = (
double*)c;
1028# if defined(__AVX512F__)
1029 for(; i<(len2-7); i+=8)
1030 _mm512_storeu_pd(sc+i, _mm512_sub_pd(_mm512_loadu_pd(sa+i), _mm512_loadu_pd(sb+i)));
1032# if defined(__AVX__) && defined(__AVX2__)
1033 for(; i<(len2-3); i+=4)
1034 _mm256_storeu_pd(sc+i, _mm256_sub_pd(_mm256_loadu_pd(sa+i), _mm256_loadu_pd(sb+i)));
1036# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
1037 for(; i<(len2-1); i+=2)
1038 _mm_storeu_pd(sc+i, _mm_sub_pd(_mm_loadu_pd(sa+i), _mm_loadu_pd(sb+i)));
1041 sc[i] = sa[i] - sb[i];
1044 for (j = 0; j < len; j++)
1045 c[j] = ccsub(a[j], b[j]);
1064 saf_assert(c!=NULL,
"'c' can no longer be NULL");
1065 saf_assert(a!=c && b!=c,
"In-place operation is no longer supported.");
1069#if defined(SAF_USE_INTEL_IPP)
1070 ippsMul_32f((Ipp32f*)a, (Ipp32f*)b, (Ipp32f*)c, len);
1071#elif defined(SAF_USE_APPLE_ACCELERATE)
1072 vDSP_vmul(a, 1, b, 1, c, 1, (vDSP_Length)len);
1073#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1075#elif defined(SAF_ENABLE_SIMD)
1078# if defined(__AVX512F__)
1079 for(; i<(len-15); i+=16)
1080 _mm512_storeu_ps(c+i, _mm512_mul_ps(_mm512_loadu_ps(a+i), _mm512_loadu_ps(b+i)));
1082# if defined(__AVX__) && defined(__AVX2__)
1083 for(; i<(len-7); i+=8)
1084 _mm256_storeu_ps(c+i, _mm256_mul_ps(_mm256_loadu_ps(a+i), _mm256_loadu_ps(b+i)));
1086# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
1087 for(; i<(len-3); i+=4)
1088 _mm_storeu_ps(c+i, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
1092#elif defined(NDEBUG)
1095 for(i=0; i<len-3; i+=4){
1097 c[i+1] = a[i+1] * b[i+1];
1098 c[i+2] = a[i+2] * b[i+2];
1099 c[i+3] = a[i+3] * b[i+3];
1105 for (j = 0; j < len; j++)
1112 const float_complex* a,
1113 const float_complex* b,
1120 saf_assert(c!=NULL,
"'c' can no longer be NULL");
1121 saf_assert(a!=c && b!=c,
"In-place operation is no longer supported.");
1125#if defined(SAF_USE_INTEL_IPP)
1126 ippsMul_32fc((Ipp32fc*)a, (Ipp32fc*)b, (Ipp32fc*)c, len);
1127#elif defined(SAF_USE_APPLE_ACCELERATE)
1129 vDSP_vmul((
float*)a, 2, (
float*)b+1, 2, (
float*)c+1, 2, (vDSP_Length)len);
1130 vDSP_vma((
float*)a+1, 2, (
float*)b, 2, (
float*)c+1, 2, (
float*)c, 2, (vDSP_Length)len);
1131 cblas_scopy(len, (
float*)c, 2, (
float*)c+1, 2);
1133 vDSP_vmmsb((
float*)a, 2, (
float*)b, 2, (
float*)a+1, 2, (
float*)b+1, 2, (
float*)c, 2, (vDSP_Length)len);
1134#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1136#elif defined(SAF_ENABLE_SIMD)
1138 float* sa, *sb, *sc;
1139 sa = (
float*)a; sb = (
float*)b; sc = (
float*)c;
1140# if (defined(__AVX__) && defined(__AVX2__)) || defined(__AVX512F__)
1141 __m256i permute_ri = _mm256_set_epi32(6, 7, 4, 5, 2, 3, 0, 1);
1142 for(i=0; i<(len-3); i+=4){
1144 __m256 src1 = _mm256_moveldup_ps(_mm256_loadu_ps(sa+2*i));
1146 __m256 src2 = _mm256_loadu_ps(sb+2*i);
1148 __m256 tmp1 = _mm256_mul_ps(src1, src2);
1150 __m256 b1 = _mm256_permutevar8x32_ps(src2, permute_ri);
1152 src1 = _mm256_movehdup_ps(_mm256_loadu_ps(sa+2*i));
1154 __m256 tmp2 = _mm256_mul_ps(src1, b1);
1156 _mm256_storeu_ps(sc+2*i, _mm256_addsub_ps(tmp1, tmp2));
1158# elif defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
1159 for(i=0; i<(len-1); i+=2){
1161 __m128 src1 = _mm_moveldup_ps(_mm_loadu_ps(sa+2*i));
1163 __m128 src2 = _mm_loadu_ps(sb+2*i);
1165 __m128 tmp1 = _mm_mul_ps(src1, src2);
1167 __m128 b1 = _mm_shuffle_ps(src2, src2, _MM_SHUFFLE(2, 3, 0, 1));
1169 src1 = _mm_movehdup_ps(_mm_loadu_ps(sa+2*i));
1171 __m128 tmp2 = _mm_mul_ps(src1, b1);
1173 _mm_storeu_ps(sc+2*i, _mm_addsub_ps(tmp1, tmp2));
1177 sc[2*i] = sa[2*i] * sb[2*i] - sa[2*i+1] * sb[2*i+1];
1178 sc[2*i+1] = sa[2*i] * sb[2*i+1] + sa[2*i+1] * sb[2*i];
1180#elif __STDC_VERSION__ >= 199901L && defined(NDEBUG)
1183 for(i=0; i<len-3; i+=4){
1185 c[i+1] = a[i+1] * b[i+1];
1186 c[i+2] = a[i+2] * b[i+2];
1187 c[i+3] = a[i+3] * b[i+3];
1191#elif __STDC_VERSION__ >= 199901L
1193 for (i = 0; i < len; i++)
1197 for (i = 0; i < len; i++)
1198 c[i] = ccmulf(a[i], b[i]);
1215 c[0] = cblas_sdot (len, a, 1, b, 1);
1220 const float_complex* a,
1221 const float_complex* b,
1230 cblas_cdotu_sub(len, a, 1, b, 1, c);
1233 cblas_cdotc_sub(len, a, 1, b, 1, c);
1251#if defined(SAF_USE_INTEL_IPP)
1253 ippsMulC_32f_I((Ipp32f)s[0], (Ipp32f*)a, len);
1255 ippsMulC_32f((Ipp32f*)a, (Ipp32f)s[0], (Ipp32f*)c, len);
1256#elif defined(SAF_USE_APPLE_ACCELERATE)
1258 cblas_sscal(len, s[0], a, 1);
1260 vDSP_vsmul(a, 1, s, c, 1, (vDSP_Length)len);
1263 cblas_sscal(len, s[0], a, 1);
1266 cblas_sscal(len, s[0], c, 1);
1274 const float_complex* s,
1280 cblas_cscal(len, s, a, 1);
1282 cblas_ccopy(len, a, 1, c, 1);
1283 cblas_cscal(len, s, c, 1);
1295#if defined(SAF_USE_INTEL_IPP)
1297 ippsMulC_64f_I((Ipp64f)s[0], (Ipp64f*)a, len);
1299 ippsMulC_64f((Ipp64f*)a, (Ipp64f)s[0], (Ipp64f*)c, len);
1300#elif defined(SAF_USE_APPLE_ACCELERATE)
1302 cblas_dscal(len, s[0], a, 1);
1304 vDSP_vsmulD(a, 1, s, c, 1, (vDSP_Length)len);
1307 cblas_dscal(len, s[0], a, 1);
1310 cblas_dscal(len, s[0], c, 1);
1318 const double_complex* s,
1324 cblas_zscal(len, s, a, 1);
1326 cblas_zcopy(len, a, 1, c, 1);
1327 cblas_zscal(len, s, c, 1);
1345 memset(c, 0, len*
sizeof(
float));
1348#if defined(SAF_USE_INTEL_IPP)
1349 ippsDivC_32f((Ipp32f*)a, (Ipp32f)s[0], (Ipp32f*)c, len);
1350#elif defined(SAF_USE_APPLE_ACCELERATE)
1351 vDSP_vsdiv(a, 1, s, c, 1, (vDSP_Length)len);
1353 cblas_scopy(len, a, 1, c, 1);
1354 cblas_sscal(len, 1.0f/s[0], c, 1);
1371#if defined(SAF_USE_INTEL_IPP)
1372 ippsAddC_32f((Ipp32f*)a, (Ipp32f)s[0], (Ipp32f*)c, len);
1373#elif defined(SAF_USE_APPLE_ACCELERATE)
1374 vDSP_vsadd(a, 1, s, c, 1, (vDSP_Length)len);
1375#elif defined(SAF_ENABLE_SIMD)
1377# if defined(__AVX512F__)
1378 __m512 s16 = _mm512_set1_ps(s[0]);
1379 for(i=0; i<(len-15); i+=16)
1380 _mm512_storeu_ps(c+i, _mm512_add_ps(_mm512_loadu_ps(a+i), s16));
1381# elif defined(__AVX__) && defined(__AVX2__)
1382 __m256 s8 = _mm256_set1_ps(s[0]);
1383 for(i=0; i<(len-7); i+=8)
1384 _mm256_storeu_ps(c+i, _mm256_add_ps(_mm256_loadu_ps(a+i), s8));
1385# elif defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
1386 __m128 s4 = _mm_load_ps1(s);
1387 for(i=0; i<(len-3); i+=4)
1388 _mm_storeu_ps(c+i, _mm_add_ps(_mm_loadu_ps(a+i), s4));
1394 for(i=0; i<len; i++)
1412#if defined(SAF_USE_INTEL_IPP)
1413 ippsSubC_32f((Ipp32f*)a, (Ipp32f)s[0], (Ipp32f*)c, len);
1414#elif defined(SAF_USE_APPLE_ACCELERATE)
1417 vDSP_vsadd(a, 1, &inv_s, c, 1, (vDSP_Length)len);
1418#elif defined(SAF_ENABLE_SIMD)
1420# if defined(__AVX512F__)
1421 __m512 s16 = _mm512_set1_ps(s[0]);
1422 for(i=0; i<(len-15); i+=16)
1423 _mm512_storeu_ps(c+i, _mm512_sub_ps(_mm512_loadu_ps(a+i), s16));
1424# elif defined(__AVX__) && defined(__AVX2__)
1425 __m256 s8 = _mm256_set1_ps(s[0]);
1426 for(i=0; i<(len-7); i+=8)
1427 _mm256_storeu_ps(c+i, _mm256_sub_ps(_mm256_loadu_ps(a+i), s8));
1428# elif defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
1429 __m128 s4 = _mm_load_ps1(s);
1430 for(i=0; i<(len-3); i+=4)
1431 _mm_storeu_ps(c+i, _mm_sub_ps(_mm_loadu_ps(a+i), s4));
1437 for(i=0; i<len; i++)
1456#ifdef SAF_USE_APPLE_ACCELERATE
1458 vDSP_Length* inds_vDSP;
1459 inds_vDSP =
malloc1d(len*
sizeof(vDSP_Length));
1460 for(i=0; i<len; i++)
1461 inds_vDSP[i] = (vDSP_Length)(inds[i] + 1);
1462 vDSP_vgathr(sv, inds_vDSP, 1, cv, 1, (vDSP_Length)len);
1464#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1470 for(i=0; i<len; i++)
1472 cblas_sgthr(len, sv, cv, (
veclib_int*)inds_tmp);
1475#elif defined(NDEBUG)
1477 for(i=0; i<len-3; i+=4){
1478 cv[i] = sv[inds[i]];
1479 cv[i+1] = sv[inds[i+1]];
1480 cv[i+2] = sv[inds[i+2]];
1481 cv[i+3] = sv[inds[i+3]];
1484 cv[i] = sv[inds[i]];
1486 for(i=0; i<len; i++)
1487 cv[i] = sv[inds[i]];
1493 const float_complex* sv,
1500#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1506 for(i=0; i<len; i++)
1508 cblas_cgthr(len, sv, cv, (
veclib_int*)inds_tmp);
1511#elif defined(NDEBUG)
1513 for(i=0; i<len-3; i+=4){
1514 cv[i] = sv[inds[i]];
1515 cv[i+1] = sv[inds[i+1]];
1516 cv[i+2] = sv[inds[i+2]];
1517 cv[i+3] = sv[inds[i+3]];
1520 cv[i] = sv[inds[i]];
1522 for(i=0; i<len; i++)
1523 cv[i] = sv[inds[i]];
1536#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1542 for(i=0; i<len; i++)
1544 cblas_dgthr(len, sv, cv, (
veclib_int*)inds_tmp);
1547#elif defined(NDEBUG)
1549 for(i=0; i<len-3; i+=4){
1550 cv[i] = sv[inds[i]];
1551 cv[i+1] = sv[inds[i+1]];
1552 cv[i+2] = sv[inds[i+2]];
1553 cv[i+3] = sv[inds[i+3]];
1556 cv[i] = sv[inds[i]];
1558 for(i=0; i<len; i++)
1559 cv[i] = sv[inds[i]];
1565 const double_complex* sv,
1572#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1578 for(i=0; i<len; i++)
1580 cblas_zgthr(len, sv, cv, (
veclib_int*)inds_tmp);
1583#elif defined(NDEBUG)
1585 for(i=0; i<len-3; i+=4){
1586 cv[i] = sv[inds[i]];
1587 cv[i+1] = sv[inds[i+1]];
1588 cv[i+2] = sv[inds[i+2]];
1589 cv[i+3] = sv[inds[i+3]];
1592 cv[i] = sv[inds[i]];
1594 for(i=0; i<len; i++)
1595 cv[i] = sv[inds[i]];
1605typedef struct _utility_ssvd_data {
1606 int maxDim1, maxDim2;
1608 float* a, *s, *u, *vt;
1617 h->maxDim1 = maxDim1;
1618 h->maxDim2 = maxDim2;
1619 h->currentWorkSize = 0;
1620 h->a =
malloc1d(maxDim1*maxDim2*
sizeof(
float));
1622 h->u =
malloc1d(maxDim1*maxDim1*
sizeof(
float));
1623 h->vt =
malloc1d(maxDim2*maxDim2*
sizeof(
float));
1661 m = dim1; n = dim2; lda = dim1; ldu = dim1; ldvt = dim2;
1669 saf_assert(dim1<=h->maxDim1,
"dim1 exceeds the maximum length specified");
1670 saf_assert(dim2<=h->maxDim2,
"dim2 exceeds the maximum length specified");
1675 for(i=0; i<dim1; i++)
1676 for(j=0; j<dim2; j++)
1677 h->a[j*dim1+i] = A[i*dim2 +j];
1681#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
1682 sgesvd_(
"A",
"A", &m, &n, h->a, &lda, h->s, h->u, &ldu, h->vt, &ldvt, &wkopt, &lwork, &info );
1683#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
1685#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
1686 info = LAPACKE_sgesvd_work(CblasColMajor,
'A',
'A', m, n, h->a, lda, h->s, h->u, ldu, h->vt, ldvt, &wkopt, lwork);
1689 if(lwork>h->currentWorkSize){
1690 h->currentWorkSize = lwork;
1691 h->work =
realloc1d(h->work, h->currentWorkSize*
sizeof(
float));
1695#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
1696 sgesvd_(
"A",
"A", &m, &n, h->a, &lda, h->s, h->u, &ldu, h->vt, &ldvt, h->work, &lwork, &info );
1697#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
1699#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
1700 info = LAPACKE_sgesvd_work(CblasColMajor,
'A',
'A', m, n, h->a, lda, h->s, h->u, ldu, h->vt, ldvt, h->work, lwork);
1706 memset(S, 0, dim1*dim2*
sizeof(
float));
1708 memset(U, 0, dim1*dim1*
sizeof(
float));
1710 memset(V, 0, dim2*dim2*
sizeof(
float));
1712 memset(sing, 0,
SAF_MIN(dim1, dim2)*
sizeof(
float));
1717 saf_print_warning(
"Could not compute SVD in utility_ssvd(). Output matrices/vectors have been zeroed.");
1723 memset(S, 0, dim1*dim2*
sizeof(
float));
1725 for(i=0; i<
SAF_MIN(dim1, dim2); i++)
1726 S[i*dim2+i] = h->s[i];
1731 for(i=0; i<dim1; i++)
1732 for(j=0; j<dim1; j++)
1733 U[i*dim1+j] = h->u[j*dim1+i];
1737 for(i=0; i<dim2; i++)
1738 for(j=0; j<dim2; j++)
1739 V[i*dim2+j] = h->vt[i*dim2+j];
1741 for(i=0; i<
SAF_MIN(dim1, dim2); i++)
1750typedef struct _utility_csvd_data {
1751 int maxDim1, maxDim2;
1753 float_complex* a, *u, *vt, *work;
1762 h->maxDim1 = maxDim1;
1763 h->maxDim2 = maxDim2;
1764 h->currentWorkSize = 0;
1765 h->a =
malloc1d(maxDim1*maxDim2*
sizeof(float_complex));
1767 h->u =
malloc1d(maxDim1*maxDim1*
sizeof(float_complex));
1768 h->vt =
malloc1d(maxDim2*maxDim2*
sizeof(float_complex));
1794 const float_complex* A,
1806 float_complex wkopt;
1807#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1808 const MKL_Complex8 calpha = {1.0f, 0.0f};
1813 m = dim1; n = dim2; lda = dim1; ldu = dim1; ldvt = dim2;
1821 saf_assert(dim1<=h->maxDim1,
"dim1 exceeds the maximum length specified");
1822 saf_assert(dim2<=h->maxDim2,
"dim2 exceeds the maximum length specified");
1827#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1830 for(i=0; i<dim1; i++)
1831 for(j=0; j<dim2; j++)
1832 h->a[j*dim1+i] = A[i*dim2+j];
1837#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
1840#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
1842#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
1843 info = LAPACKE_cgesvd_work(CblasColMajor,
'A',
'A', m, n, (
veclib_float_complex*)h->a, lda, h->s, (
veclib_float_complex*)h->u, ldu,
1847 if(lwork>h->currentWorkSize){
1848 h->currentWorkSize = lwork;
1849 h->work =
realloc1d(h->work, h->currentWorkSize*
sizeof(float_complex));
1853#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
1854 cgesvd_(
"A",
"A", &m, &n, (
veclib_float_complex*)h->a, &lda, h->s, (
veclib_float_complex*)h->u, &ldu, (
veclib_float_complex*)h->vt, &ldvt,
1856#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
1858#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
1859 info = LAPACKE_cgesvd_work(CblasColMajor,
'A',
'A', m, n, (
veclib_float_complex*)h->a, lda, h->s, (
veclib_float_complex*)h->u, ldu, (
veclib_float_complex*)h->vt, ldvt,
1866 memset(S, 0, dim1*dim2*
sizeof(float_complex));
1868 memset(U, 0, dim1*dim1*
sizeof(float_complex));
1870 memset(V, 0, dim2*dim2*
sizeof(float_complex));
1872 memset(sing, 0,
SAF_MIN(dim1, dim2)*
sizeof(float_complex));
1877 saf_print_warning(
"Could not compute SVD in utility_csvd(). Output matrices/vectors have been zeroed.");
1884 memset(S, 0, dim1*dim2*
sizeof(float_complex));
1885 cblas_scopy(
SAF_MIN(dim1, dim2), h->s, 1, (
float*)S, 2*(dim2+1));
1890#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1893 for(i=0; i<dim1; i++)
1894 for(j=0; j<dim1; j++)
1895 U[i*dim1+j] = h->u[j*dim1+i];
1901 cblas_ccopy(dim2*dim2, h->vt, 1, V, 1);
1902 cblas_sscal(dim2*dim2, -1.0f, ((
float*)V)+1, 2);
1906 cblas_scopy(
SAF_MIN(dim1, dim2), h->s, 1, sing, 1);
1919typedef struct _utility_sseig_data {
1933 h->currentWorkSize = 0;
1934 h->w =
malloc1d(maxDim*
sizeof(
float));
1935 h->a =
malloc1d(maxDim*maxDim*
sizeof(
float));
1979 saf_assert(dim<=h->maxDim,
"dim exceeds the maximum length specified");
1984 for(i=0; i<dim; i++)
1985 for(j=0; j<dim; j++)
1986 h->a[i*dim+j] = A[j*dim+i];
1990#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
1991 ssyev_(
"Vectors",
"Upper", &n, h->a, &lda, h->w, &wkopt, &lwork, &info );
1992#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
1994#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
1995 info = LAPACKE_ssyev_work(CblasColMajor,
'V',
'U', n, h->a, lda, h->w, &wkopt, lwork);
1998 if(lwork>h->currentWorkSize){
1999 h->currentWorkSize = lwork;
2000 h->work =
realloc1d(h->work, h->currentWorkSize*
sizeof(
float));
2004#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2005 ssyev_(
"Vectors",
"Upper", &n, h->a, &lda, h->w, h->work, &lwork, &info );
2006#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2008#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2009 info = LAPACKE_ssyev_work(CblasColMajor,
'V',
'U', n, h->a, lda, h->w, h->work, lwork);
2014 memset(D, 0, dim*dim*
sizeof(
float));
2018 memset(V, 0, dim*dim*
sizeof(
float));
2024 saf_print_warning(
"Could not compute EVD in utility_sseig(). Output matrices/vectors have been zeroed.");
2029 for(i=0; i<dim; i++){
2031 for(j=0; j<dim; j++)
2032 V[i*dim+j] = h->a[(dim-j-1)*dim+i];
2034 D[i*dim+i] = h->w[dim-i-1];
2036 eig[i] = h->w[dim-i-1];
2040 for(i=0; i<dim; i++){
2042 for(j=0; j<dim; j++)
2043 V[i*dim+j] = h->a[j*dim+i];
2045 D[i*dim+i] = h->w[i];
2057typedef struct _utility_cseig_data {
2063 float_complex* work;
2072 h->currentWorkSize =
SAF_MAX(1, 2*maxDim-1);
2073 h->rwork =
malloc1d((3*maxDim-2)*
sizeof(
float));
2074 h->w =
malloc1d(maxDim*
sizeof(
float));
2075 h->a =
malloc1d(maxDim*maxDim*
sizeof(float_complex));
2076 h->work =
malloc1d(h->currentWorkSize*
sizeof(float_complex));
2098 const float_complex* A,
2109 float_complex wkopt;
2110#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
2111 const MKL_Complex8 calpha = {1.0f, 0.0f};
2125 saf_assert(dim<=h->maxDim,
"dim exceeds the maximum length specified");
2130#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
2133 for(i=0; i<dim; i++)
2134 for(j=0; j<dim; j++)
2135 h->a[i*dim+j] = A[j*dim+i];
2140#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2142#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2144#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2148 if(lwork>h->currentWorkSize){
2149 h->currentWorkSize = lwork;
2150 h->work =
realloc1d(h->work, h->currentWorkSize*
sizeof(float_complex));
2154#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2156#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2158#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2164 memset(D, 0, dim*dim*
sizeof(float_complex));
2168 memset(V, 0, dim*dim*
sizeof(float_complex));
2174 saf_print_warning(
"Could not compute EVD in utility_cseig(). Output matrices/vectors have been zeroed.");
2180 if(sortDecFLAG && V!=NULL)
2181 for(i=0; i<(int)((
float)dim/2.0f); i++)
2182 cblas_cswap(dim, &h->a[i*dim], 1, &h->a[(dim-i-1)*dim], 1);
2185#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
2188 for(i=0; i<dim; i++)
2189 for(j=0; j<dim; j++)
2190 V[i*dim+j] = h->a[j*dim+i];
2195 for(i=0; i<dim; i++) {
2197 D[i*dim+i] = cmplxf(h->w[dim-i-1], 0.0f);
2199 eig[i] = h->w[dim-i-1];
2203 for(i=0; i<dim; i++){
2205 D[i*dim+i] = cmplxf(h->w[i], 0.0f);
2222typedef struct _utility_ceigmp_data {
2225 float_complex* a, *b, *vl, *vr, *alpha, *beta;
2227 float_complex* work;
2236 h->currentWorkSize = 4*maxDim;
2237 h->rwork =
malloc1d(4*(h->currentWorkSize)*
sizeof(
float));
2238 h->a =
malloc1d(maxDim*maxDim*
sizeof(float_complex));
2239 h->b =
malloc1d(maxDim*maxDim*
sizeof(float_complex));
2240 h->vl =
malloc1d(maxDim*maxDim*
sizeof(float_complex));
2241 h->vr =
malloc1d(maxDim*maxDim*
sizeof(float_complex));
2242 h->alpha =
malloc1d(maxDim*
sizeof(float_complex));
2243 h->beta =
malloc1d(maxDim*
sizeof(float_complex));
2244 h->work =
malloc1d(h->currentWorkSize*
sizeof(float_complex));
2270 const float_complex* A,
2271 const float_complex* B,
2283 n = lda = ldb = ldvl = ldvr = dim;
2291 saf_assert(dim<=h->maxDim,
"dim exceeds the maximum length specified");
2296 for(i=0; i<dim; i++)
2297 for(j=0; j<dim; j++)
2298 h->a[j*dim+i] = A[i*dim+j];
2299 for(i=0; i<dim; i++)
2300 for(j=0; j<dim; j++)
2301 h->b[j*dim+i] = B[i*dim+j];
2304 lwork = h->currentWorkSize;
2305#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2308#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2310#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2311 info = LAPACKE_cggev_work(CblasColMajor,
'V',
'V', n, (
veclib_float_complex*)h->a, lda, (
veclib_float_complex*)h->b, ldb, (
veclib_float_complex*)h->alpha, (
veclib_float_complex*)h->beta,
2316 memset(D, 0, dim*dim*
sizeof(float_complex));
2321 memset(VL, 0, dim*dim*
sizeof(float_complex));
2323 memset(VR, 0, dim*dim*
sizeof(float_complex));
2329 saf_print_warning(
"Could not compute EVD in utility_ceigmp(). Output matrices/vectors have been zeroed.");
2335 for(i=0; i<dim; i++)
2336 D[i*dim+i] = ccdivf(h->alpha[i],h->beta[i]);
2339 for(i=0; i<dim; i++)
2340 for(j=0; j<dim; j++)
2341 VL[i*dim+j] = h->vl[j*dim+i];
2343 for(i=0; i<dim; i++)
2344 for(j=0; j<dim; j++)
2345 VR[i*dim+j] = h->vr[j*dim+i];
2353typedef struct _utility_zeigmp_data {
2356 double_complex* a, *b, *vl, *vr, *alpha, *beta;
2358 double_complex* work;
2367 h->currentWorkSize = 4*maxDim;
2368 h->rwork =
malloc1d(4*(h->currentWorkSize)*
sizeof(
double));
2369 h->a =
malloc1d(maxDim*maxDim*
sizeof(double_complex));
2370 h->b =
malloc1d(maxDim*maxDim*
sizeof(double_complex));
2371 h->vl =
malloc1d(maxDim*maxDim*
sizeof(double_complex));
2372 h->vr =
malloc1d(maxDim*maxDim*
sizeof(double_complex));
2373 h->alpha =
malloc1d(maxDim*
sizeof(double_complex));
2374 h->beta =
malloc1d(maxDim*
sizeof(double_complex));
2375 h->work =
malloc1d(h->currentWorkSize*
sizeof(double_complex));
2401 const double_complex* A,
2402 const double_complex* B,
2414 n = lda = ldb = ldvl = ldvr = dim;
2422 saf_assert(dim<=h->maxDim,
"dim exceeds the maximum length specified");
2427 for(i=0; i<dim; i++)
2428 for(j=0; j<dim; j++)
2429 h->a[j*dim+i] = A[i*dim+j];
2430 for(i=0; i<dim; i++)
2431 for(j=0; j<dim; j++)
2432 h->b[j*dim+i] = B[i*dim+j];
2435 lwork = h->currentWorkSize;
2436#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2439#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2441#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2442 info = LAPACKE_zggev_work(CblasColMajor,
'V',
'V', n, (
veclib_double_complex*)h->a, lda, (
veclib_double_complex*)h->b, ldb, (
veclib_double_complex*)h->alpha, (
veclib_double_complex*)h->beta,
2447 memset(D, 0, dim*dim*
sizeof(double_complex));
2452 memset(VL, 0, dim*dim*
sizeof(double_complex));
2454 memset(VR, 0, dim*dim*
sizeof(double_complex));
2460 saf_print_warning(
"Could not compute EVD in utility_zeigmp(). Output matrices/vectors have been zeroed.");
2466 for(i=0; i<dim; i++)
2467 D[i*dim+i] = ccdiv(h->alpha[i],h->beta[i]);
2470 for(i=0; i<dim; i++)
2471 for(j=0; j<dim; j++)
2472 VL[i*dim+j] = h->vl[j*dim+i];
2474 for(i=0; i<dim; i++)
2475 for(j=0; j<dim; j++)
2476 VR[i*dim+j] = h->vr[j*dim+i];
2489typedef struct _utility_ceig_data {
2492 float_complex *w, *vl, *vr, *a;
2494 float_complex* work;
2503 h->currentWorkSize = 0;
2504 h->rwork =
malloc1d(4*maxDim*
sizeof(
float));
2505 h->w =
malloc1d(maxDim*
sizeof(float_complex));
2506 h->vl =
malloc1d(maxDim*maxDim*
sizeof(float_complex));
2507 h->vr =
malloc1d(maxDim*maxDim*
sizeof(float_complex));
2508 h->a =
malloc1d(maxDim*maxDim*
sizeof(float_complex));
2533 const float_complex* A,
2544 float_complex wkopt;
2546 n = lda = ldvl = ldvr = dim;
2554 saf_assert(dim<=h->maxDim,
"dim exceeds the maximum length specified");
2559 for(i=0; i<dim; i++)
2560 for(j=0; j<dim; j++)
2561 h->a[i*dim+j] = A[j*dim+i];
2565#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2568#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2570#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2575 if(lwork>h->currentWorkSize){
2576 h->currentWorkSize = lwork;
2577 h->work =
realloc1d(h->work, h->currentWorkSize*
sizeof(float_complex));
2581#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2584#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2586#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2593 memset(D, 0, dim*dim*
sizeof(float_complex));
2598 memset(VL, 0, dim*dim*
sizeof(float_complex));
2600 memset(VR, 0, dim*dim*
sizeof(float_complex));
2602 memset(eig, 0, dim*
sizeof(float_complex));
2608 saf_print_warning(
"Could not compute EVD in utility_ceig(). Output matrices/vectors have been zeroed.");
2613 for(i=0; i<dim; i++){
2615 for(j=0; j<dim; j++)
2616 VL[i*dim+j] = h->vl[j*dim+i];
2618 for(j=0; j<dim; j++)
2619 VR[i*dim+j] = h->vr[j*dim+i];
2621 D[i*dim+i] = h->w[i];
2632typedef struct _utility_zeig_data {
2635 double_complex *w, *vl, *vr, *a;
2637 double_complex* work;
2646 h->currentWorkSize = 0;
2647 h->rwork =
malloc1d(4*maxDim*
sizeof(
double));
2648 h->w =
malloc1d(maxDim*
sizeof(double_complex));
2649 h->vl =
malloc1d(maxDim*maxDim*
sizeof(double_complex));
2650 h->vr =
malloc1d(maxDim*maxDim*
sizeof(double_complex));
2651 h->a =
malloc1d(maxDim*maxDim*
sizeof(double_complex));
2676 const double_complex* A,
2687 double_complex wkopt;
2689 n = lda = ldvl = ldvr = dim;
2697 saf_assert(dim<=h->maxDim,
"dim exceeds the maximum length specified");
2702 for(i=0; i<dim; i++)
2703 for(j=0; j<dim; j++)
2704 h->a[i*dim+j] = A[j*dim+i];
2708#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2711#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2713#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2718 if(lwork>h->currentWorkSize){
2719 h->currentWorkSize = lwork;
2720 h->work =
realloc1d(h->work, h->currentWorkSize*
sizeof(double_complex));
2724#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2727#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2729#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2736 memset(D, 0, dim*dim*
sizeof(double_complex));
2741 memset(VL, 0, dim*dim*
sizeof(double_complex));
2743 memset(VR, 0, dim*dim*
sizeof(double_complex));
2745 memset(eig, 0, dim*
sizeof(double_complex));
2751 saf_print_warning(
"Could not compute EVD in utility_zeig(). Output matrices/vectors have been zeroed.");
2756 for(i=0; i<dim; i++){
2758 for(j=0; j<dim; j++)
2759 VL[i*dim+j] = h->vl[j*dim+i];
2761 for(j=0; j<dim; j++)
2762 VR[i*dim+j] = h->vr[j*dim+i];
2764 D[i*dim+i] = h->w[i];
2780typedef struct _utility_sglslv_data {
2794 h->maxNCol = maxNCol;
2797 h->a =
malloc1d(maxDim*maxDim*
sizeof(
float));
2798 h->b =
malloc1d(maxDim*maxNCol*
sizeof(
float));
2827 veclib_int n = dim, nrhs = nCol, lda = dim, ldb = dim, info;
2828#if !defined(SAF_VECLIB_USE_LAPACKE_INTERFACE) && !(defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64))
2838 saf_assert(dim<=h->maxDim,
"dim exceeds the maximum length specified");
2839 saf_assert(nCol<=h->maxNCol,
"nCol exceeds the maximum length specified");
2843#if defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2845 cblas_scopy(dim*dim, A, 1, h->a, 1);
2846 cblas_scopy(dim*nCol, B, 1, h->b, 1);
2849# if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
2850 MKL_Somatcopy(
'R',
'T', dim, dim, 1.0f, A, dim, h->a, dim);
2851 MKL_Somatcopy(
'R',
'T', dim, nCol, 1.0f, B, nCol, h->b, dim);
2853 for(i=0; i<dim; i++)
2854 for(j=0; j<dim; j++)
2855 h->a[i*dim+j] = A[j*dim+i];
2856 for(i=0; i<dim; i++)
2857 for(j=0; j<nCol; j++)
2858 h->b[j*dim+i] = B[i*nCol+j];
2863#ifdef SAF_VECLIB_USE_CLAPACK_INTERFACE
2864 info = clapack_sgesv(CblasColMajor, n, nrhs, h->a, lda, h->IPIV, h->b, ldb);
2865#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2866 info = LAPACKE_sgesv_work(CblasRowMajor, n, nrhs, h->a, lda, h->IPIV, h->b, ldb);
2867#elif defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2868 sgesv_( &n, &nrhs, h->a, &lda, h->IPIV, h->b, &ldb, &info );
2873 memset(X, 0, dim*nCol*
sizeof(
float));
2878 saf_print_warning(
"Could not solve the linear equation in utility_sglslv(). Output matrices/vectors have been zeroed.");
2882#if defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2883 cblas_scopy(dim*nCol, h->b, 1, X, 1);
2886# if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
2887 MKL_Somatcopy(
'R',
'T', nCol, dim, 1.0f, h->b, dim, X, nCol);
2889 for(i=0; i<dim; i++)
2890 for(j=0; j<nCol; j++)
2891 X[i*nCol+j] = h->b[j*dim+i];
2901typedef struct _utility_cglslv_data {
2915 h->maxNCol = maxNCol;
2918 h->a =
malloc1d(maxDim*maxDim*
sizeof(float_complex));
2919 h->b =
malloc1d(maxDim*maxNCol*
sizeof(float_complex));
2940 const float_complex* A,
2961 saf_assert(dim<=h->maxDim,
"dim exceeds the maximum length specified");
2962 saf_assert(nCol<=h->maxNCol,
"nCol exceeds the maximum length specified");
2967 for(i=0; i<dim; i++)
2968 for(j=0; j<dim; j++)
2969 h->a[j*dim+i] = A[i*dim+j];
2970 for(i=0; i<dim; i++)
2971 for(j=0; j<nCol; j++)
2972 h->b[j*dim+i] = B[i*nCol+j];
2975#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2977#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2979#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2985 memset(X, 0, dim*nCol*
sizeof(float_complex));
2990 saf_print_warning(
"Could not solve the linear equation in utility_cglslv(). Output matrices/vectors have been zeroed.");
2995 for(i=0; i<dim; i++)
2996 for(j=0; j<nCol; j++)
2997 X[i*nCol+j] = h->b[j*dim+i];
3005typedef struct _utility_dglslv_data {
3019 h->maxNCol = maxNCol;
3021 h->a =
malloc1d(maxDim*maxDim*
sizeof(
double));
3022 h->b =
malloc1d(maxDim*maxNCol*
sizeof(
double));
3064 saf_assert(dim<=h->maxDim,
"dim exceeds the maximum length specified");
3065 saf_assert(nCol<=h->maxNCol,
"nCol exceeds the maximum length specified");
3070 for(i=0; i<dim; i++)
3071 for(j=0; j<dim; j++)
3072 h->a[j*dim+i] = A[i*dim+j];
3073 for(i=0; i<dim; i++)
3074 for(j=0; j<nCol; j++)
3075 h->b[j*dim+i] = B[i*nCol+j];
3078#ifdef SAF_VECLIB_USE_CLAPACK_INTERFACE
3079 info = clapack_dgesv(CblasColMajor, n, nrhs, h->a, lda, h->IPIV, h->b, ldb);
3080#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3081 info = LAPACKE_dgesv_work(CblasColMajor, n, nrhs, h->a, lda, h->IPIV, h->b, ldb);
3082#elif defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3083 dgesv_( &n, &nrhs, h->a, &lda, h->IPIV, h->b, &ldb, &info );
3088 memset(X, 0, dim*nCol*
sizeof(
double));
3093 saf_print_warning(
"Could not solve the linear equation in utility_dglslv(). Output matrices/vectors have been zeroed.");
3098 for(i=0; i<dim; i++)
3099 for(j=0; j<nCol; j++)
3100 X[i*nCol+j] = h->b[j*dim+i];
3108typedef struct _utility_zglslv_data {
3122 h->maxNCol = maxNCol;
3124 h->a =
malloc1d(maxDim*maxDim*
sizeof(double_complex));
3125 h->b =
malloc1d(maxDim*maxNCol*
sizeof(double_complex));
3146 const double_complex* A,
3154 veclib_int i, j, n = dim, nrhs = nCol, lda = dim, ldb = dim, info;
3167 saf_assert(dim<=h->maxDim,
"dim exceeds the maximum length specified");
3168 saf_assert(nCol<=h->maxNCol,
"nCol exceeds the maximum length specified");
3173 for(i=0; i<dim; i++)
3174 for(j=0; j<dim; j++)
3175 h->a[j*dim+i] = A[i*dim+j];
3176 for(i=0; i<dim; i++)
3177 for(j=0; j<nCol; j++)
3178 h->b[j*dim+i] = B[i*nCol+j];
3181#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3183#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3185#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3191 memset(X, 0, dim*nCol*
sizeof(double_complex));
3196 saf_print_warning(
"Could not solve the linear equation in utility_zglslv(). Output matrices/vectors have been zeroed.");
3201 for(i=0; i<dim; i++)
3202 for(j=0; j<nCol; j++)
3203 X[i*nCol+j] = h->b[j*dim+i];
3216typedef struct _utility_sglslvt_data {
3230 h->maxNCol = maxNCol;
3232 h->a =
malloc1d(maxDim*maxDim*
sizeof(
float));
3233 h->b =
malloc1d(maxDim*maxNCol*
sizeof(
float));
3262 veclib_int n = nCol, nrhs = dim, lda = nCol, ldb = nCol, info;
3270 saf_assert(dim<=h->maxDim,
"dim exceeds the maximum length specified");
3271 saf_assert(nCol<=h->maxNCol,
"nCol exceeds the maximum length specified");
3276 cblas_scopy(dim*dim, A, 1, h->a, 1);
3277 cblas_scopy(dim*nCol, B, 1, h->b, 1);
3280#ifdef SAF_VECLIB_USE_CLAPACK_INTERFACE
3281 info = clapack_sgesv(CblasColMajor, n, nrhs, h->b, ldb, h->IPIV, h->a, lda);
3282#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3283 info = LAPACKE_sgesv_work(CblasColMajor, n, nrhs, h->b, ldb, h->IPIV, h->a, lda);
3284#elif defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3286 sgesv_( &n, &nrhs, h->b, &ldb, h->IPIV, h->a, &lda, &info );
3291 memset(X, 0, dim*nCol*
sizeof(
float));
3296 saf_print_warning(
"Could not solve the linear equation in utility_sglslvt(). Output matrices/vectors have been zeroed.");
3300 cblas_scopy(dim*nCol, h->a, 1, X, 1);
3312typedef struct _utility_sslslv_data {
3325 h->maxNCol = maxNCol;
3326 h->a =
malloc1d(maxDim*maxDim*
sizeof(
float));
3327 h->b =
malloc1d(maxDim*maxNCol*
sizeof(
float));
3368 saf_assert(dim<=h->maxDim,
"dim exceeds the maximum length specified");
3369 saf_assert(nCol<=h->maxNCol,
"nCol exceeds the maximum length specified");
3374 for(i=0; i<dim; i++)
3375 for(j=0; j<dim; j++)
3376 h->a[j*dim+i] = A[i*dim+j];
3377 for(i=0; i<dim; i++)
3378 for(j=0; j<nCol; j++)
3379 h->b[j*dim+i] = B[i*nCol+j];
3382#ifdef SAF_VECLIB_USE_CLAPACK_INTERFACE
3383 info = clapack_sposv(CblasColMajor, CblasUpper, n, nrhs, h->a, lda, h->b, ldb);
3384#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3385 info = LAPACKE_sposv_work(CblasColMajor, CblasUpper, n, nrhs, h->a, lda, h->b, ldb);
3386#elif defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3387 sposv_(
"U", &n, &nrhs, h->a, &lda, h->b, &ldb, &info );
3392 memset(X, 0, dim*nCol*
sizeof(
float));
3397 saf_print_warning(
"Could not solve the linear equation in utility_sslslv(). Output matrices/vectors have been zeroed.");
3402 for(i=0; i<dim; i++)
3403 for(j=0; j<nCol; j++)
3404 X[i*nCol+j] = h->b[j*dim+i];
3412typedef struct _utility_cslslv_data {
3425 h->maxNCol = maxNCol;
3426 h->a =
malloc1d(maxDim*maxDim*
sizeof(float_complex));
3427 h->b =
malloc1d(maxDim*maxNCol*
sizeof(float_complex));
3447 const float_complex* A,
3468 saf_assert(dim<=h->maxDim,
"dim exceeds the maximum length specified");
3469 saf_assert(nCol<=h->maxNCol,
"nCol exceeds the maximum length specified");
3474 for(i=0; i<dim; i++)
3475 for(j=0; j<dim; j++)
3476 h->a[j*dim+i] = A[i*dim+j];
3477 for(i=0; i<dim; i++)
3478 for(j=0; j<nCol; j++)
3479 h->b[j*dim+i] = B[i*nCol+j];
3482#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3484#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3486#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3492 memset(X, 0, dim*nCol*
sizeof(float_complex));
3497 saf_print_warning(
"Could not solve the linear equation in utility_cslslv(). Output matrices/vectors have been zeroed.");
3502 for(i=0; i<dim; i++)
3503 for(j=0; j<nCol; j++)
3504 X[i*nCol+j] = h->b[j*dim+i];
3517typedef struct _utility_spinv_data {
3518 int maxDim1, maxDim2;
3520 float* a, *s, *u, *vt, *inva;
3529 h->maxDim1 = maxDim1;
3530 h->maxDim2 = maxDim2;
3531 h->currentWorkSize = 0;
3532 h->a =
malloc1d(maxDim1*maxDim2*
sizeof(
float));
3534 h->u =
malloc1d(maxDim1*maxDim1*
sizeof(
float));
3535 h->vt =
malloc1d(maxDim2*maxDim2*
sizeof(
float));
3536 h->inva =
malloc1d(maxDim1*maxDim2*
sizeof(
float));
3568 veclib_int i, j, m, n, k, lda, ldu, ldvt, ld_inva, info;
3573 m = lda = ldu = dim1;
3583 saf_assert(dim1<=h->maxDim1,
"dim1 exceeds the maximum length specified");
3584 saf_assert(dim2<=h->maxDim2,
"dim2 exceeds the maximum length specified");
3591 h->a[j*m+i] = inM[i*n+j];
3595#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3596 sgesvd_(
"S",
"S", &m, &n, h->a, &lda, h->s, h->u, &ldu, h->vt, &ldvt, &wkopt, &lwork, &info);
3597#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3599#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3600 info = LAPACKE_sgesvd_work(CblasColMajor,
'S',
'S', m, n, h->a, lda, h->s, h->u, ldu, h->vt, ldvt, &wkopt, lwork);
3603 if(lwork>h->currentWorkSize){
3604 h->currentWorkSize = lwork;
3605 h->work =
realloc1d(h->work, h->currentWorkSize*
sizeof(
float));
3609#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3610 sgesvd_(
"S",
"S", &m, &n, h->a, &lda, h->s, h->u, &ldu, h->vt, &ldvt, h->work, &lwork, &info );
3611#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3613#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3614 info = LAPACKE_sgesvd_work(CblasColMajor,
'S',
'S', m, n, h->a, lda, h->s, h->u, ldu, h->vt, ldvt, h->work, lwork);
3618 memset(outM, 0, dim1*dim2*
sizeof(
float));
3623 saf_print_warning(
"Could not compute SVD in utility_spinv(). Output matrices/vectors have been zeroed.");
3628 if(h->s[i] > 1.0e-5f)
3632 cblas_sscal(m, ss, &h->u[i*m], 1);
3635 cblas_sgemm(CblasColMajor, CblasTrans, CblasTrans, n, m, k, 1.0f,
3643 outM[j*m+i] = h->inva[i*n+j];
3652typedef struct _utility_cpinv_data {
3653 int maxDim1, maxDim2;
3655 float_complex* a, *u, *vt, *inva;
3657 float_complex* work;
3665 h->maxDim1 = maxDim1;
3666 h->maxDim2 = maxDim2;
3667 h->currentWorkSize = 0;
3668 h->a =
malloc1d(maxDim1*maxDim2*
sizeof(float_complex));
3670 h->u =
malloc1d(maxDim1*maxDim1*
sizeof(float_complex));
3671 h->vt =
malloc1d(maxDim2*maxDim2*
sizeof(float_complex));
3672 h->inva =
malloc1d(maxDim1*maxDim2*
sizeof(float_complex));
3698 const float_complex* inM,
3705 veclib_int i, j, m, n, k, lda, ldu, ldvt, ld_inva, info;
3706 float_complex ss_cmplx;
3707 const float_complex calpha = cmplxf(1.0f, 0.0f);
const float_complex cbeta = cmplxf(0.0f, 0.0f);
3710 float_complex wkopt;
3712 m = lda = ldu = dim1;
3722 saf_assert(dim1<=h->maxDim1,
"dim1 exceeds the maximum length specified");
3723 saf_assert(dim2<=h->maxDim2,
"dim2 exceeds the maximum length specified");
3728 for(i=0; i<dim1; i++)
3729 for(j=0; j<dim2; j++)
3730 h->a[j*dim1+i] = inM[i*dim2 +j];
3734#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3735 cgesvd_(
"A",
"A", &m, &n, (
veclib_float_complex*)h->a, &lda, h->s, (
veclib_float_complex*)h->u, &ldu, (
veclib_float_complex*)h->vt, &ldvt,
3737#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3739#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3740 info = LAPACKE_cgesvd_work(CblasColMajor,
'S',
'S', m, n, (
veclib_float_complex*)h->a, lda, h->s, (
veclib_float_complex*)h->u, ldu, (
veclib_float_complex*)h->vt, ldvt,
3744 if(lwork>h->currentWorkSize){
3745 h->currentWorkSize = lwork;
3746 h->work =
realloc1d(h->work, h->currentWorkSize*
sizeof(float_complex));
3750#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3751 cgesvd_(
"A",
"A", &m, &n, (
veclib_float_complex*)h->a, &lda, h->s, (
veclib_float_complex*)h->u, &ldu, (
veclib_float_complex*)h->vt, &ldvt,
3753#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3755#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3756 info = LAPACKE_cgesvd_work(CblasColMajor,
'S',
'S', m, n, (
veclib_float_complex*)h->a, lda, h->s, (
veclib_float_complex*)h->u, ldu, (
veclib_float_complex*)h->vt, ldvt,
3761 memset(outM, 0, dim1*dim2*
sizeof(float_complex));
3766 saf_print_warning(
"Could not compute SVD in utility_cpinv(). Output matrices/vectors have been zeroed.");
3771 if(h->s[i] > 1.0e-5f)
3775 ss_cmplx = cmplxf(ss, 0.0f);
3776 cblas_cscal(m, &ss_cmplx, &h->u[i*m], 1);
3779 cblas_cgemm(CblasColMajor, CblasConjTrans, CblasConjTrans, n, m, k, &calpha,
3787 outM[j*m+i] = h->inva[i*n+j];
3796typedef struct _utility_dpinv_data {
3797 int maxDim1, maxDim2;
3799 double* a, *s, *u, *vt, *inva;
3808 h->maxDim1 = maxDim1;
3809 h->maxDim2 = maxDim2;
3810 h->currentWorkSize = 0;
3811 h->a =
malloc1d(maxDim1*maxDim2*
sizeof(
double));
3813 h->u =
malloc1d(maxDim1*maxDim1*
sizeof(
double));
3814 h->vt =
malloc1d(maxDim2*maxDim2*
sizeof(
double));
3815 h->inva =
malloc1d(maxDim1*maxDim2*
sizeof(
double));
3847 veclib_int i, j, m, n, k, lda, ldu, ldvt, ld_inva, info;
3852 m = lda = ldu = dim1;
3862 saf_assert(dim1<=h->maxDim1,
"dim1 exceeds the maximum length specified");
3863 saf_assert(dim2<=h->maxDim2,
"dim2 exceeds the maximum length specified");
3870 h->a[j*m+i] = inM[i*n+j];
3874#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3875 dgesvd_(
"S",
"S", &m, &n, h->a, &lda, h->s, h->u, &ldu, h->vt, &ldvt, &wkopt, &lwork, &info);
3876#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3878#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3879 info = LAPACKE_dgesvd_work(CblasColMajor,
'S',
'S', m, n, h->a, lda, h->s, h->u, ldu, h->vt, ldvt, &wkopt, lwork);
3882 if(lwork>h->currentWorkSize){
3883 h->currentWorkSize = lwork;
3884 h->work =
realloc1d(h->work, h->currentWorkSize*
sizeof(
double));
3888#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3889 dgesvd_(
"S",
"S", &m, &n, h->a, &lda, h->s, h->u, &ldu, h->vt, &ldvt, h->work, &lwork, &info );
3890#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3892#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3893 info = LAPACKE_dgesvd_work(CblasColMajor,
'S',
'S', m, n, h->a, lda, h->s, h->u, ldu, h->vt, ldvt, h->work, lwork);
3897 memset(outM, 0, dim1*dim2*
sizeof(
double));
3902 saf_print_warning(
"Could not compute SVD in utility_dpinv(). Output matrices/vectors have been zeroed.");
3907 if(h->s[i] > 1.0e-9)
3911 cblas_dscal(m, ss, &h->u[i*m], 1);
3914 cblas_dgemm(CblasColMajor, CblasTrans, CblasTrans, n, m, k, 1.0,
3922 outM[j*m+i] = h->inva[i*n+j];
3931typedef struct _utility_zpinv_data {
3932 int maxDim1, maxDim2;
3934 double_complex* a, *u, *vt, *inva;
3936 double_complex* work;
3944 h->maxDim1 = maxDim1;
3945 h->maxDim2 = maxDim2;
3946 h->currentWorkSize = 0;
3947 h->a =
malloc1d(maxDim1*maxDim2*
sizeof(double_complex));
3949 h->u =
malloc1d(maxDim1*maxDim1*
sizeof(double_complex));
3950 h->vt =
malloc1d(maxDim2*maxDim2*
sizeof(double_complex));
3951 h->inva =
malloc1d(maxDim1*maxDim2*
sizeof(double_complex));
3977 const double_complex* inM,
3980 double_complex* outM
3984 veclib_int i, j, m, n, k, lda, ldu, ldvt, info, ld_inva;
3985 double_complex ss_cmplx;
3986 const double_complex calpha = cmplx(1.0, 0.0);
const double_complex cbeta = cmplx(0.0, 0.0);
3989 double_complex wkopt;
3991 m = lda = ldu = dim1;
4001 saf_assert(dim1<=h->maxDim1,
"dim1 exceeds the maximum length specified");
4002 saf_assert(dim2<=h->maxDim2,
"dim2 exceeds the maximum length specified");
4007 for(i=0; i<dim1; i++)
4008 for(j=0; j<dim2; j++)
4009 h->a[j*dim1+i] = inM[i*dim2 +j];
4013#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4014 zgesvd_(
"A",
"A", &m, &n, (
veclib_double_complex*)h->a, &lda, h->s, (
veclib_double_complex*)h->u, &ldu, (
veclib_double_complex*)h->vt, &ldvt,
4016#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4018#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4019 info = LAPACKE_zgesvd_work(CblasColMajor,
'A',
'A', m, n, (
veclib_double_complex*)h->a, lda, h->s, (
veclib_double_complex*)h->u, ldu, (
veclib_double_complex*)h->vt, ldvt,
4023 if(lwork>h->currentWorkSize){
4024 h->currentWorkSize = lwork;
4025 h->work =
realloc1d(h->work, h->currentWorkSize*
sizeof(double_complex));
4029#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4030 zgesvd_(
"A",
"A", &m, &n, (
veclib_double_complex*)h->a, &lda, h->s, (
veclib_double_complex*)h->u, &ldu, (
veclib_double_complex*)h->vt, &ldvt,
4032#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4034#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4035 info = LAPACKE_zgesvd_work(CblasColMajor,
'A',
'A', m, n, (
veclib_double_complex*)h->a, lda, h->s, (
veclib_double_complex*)h->u, ldu, (
veclib_double_complex*)h->vt, ldvt,
4040 memset(outM, 0, dim1*dim2*
sizeof(double_complex));
4045 saf_print_warning(
"Could not compute SVD in utility_zpinv(). Output matrices/vectors have been zeroed.");
4050 if(h->s[i] > 1.0e-5)
4054 ss_cmplx = cmplx(ss, 0.0);
4055 cblas_zscal(m, &ss_cmplx, &h->u[i*m], 1);
4058 cblas_zgemm(CblasColMajor, CblasConjTrans, CblasConjTrans, n, m, k, &calpha,
4066 outM[j*m+i] = h->inva[i*n+j];
4080typedef struct _utility_schol_data {
4091 h->a =
malloc1d(maxDim*maxDim*
sizeof(
float));
4126 saf_assert(dim<=h->maxDim,
"dim exceeds the maximum length specified");
4131 for(i=0; i<dim; i++)
4132 for(j=0; j<dim; j++)
4133 h->a[j*dim+i] = A[i*dim+j];
4136#ifdef SAF_VECLIB_USE_CLAPACK_INTERFACE
4137 info = clapack_spotrf(CblasColMajor, CblasUpper, n, h->a, lda);
4138#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4139 info = LAPACKE_spotrf_work(CblasColMajor, CblasUpper, n, h->a, lda);
4140#elif defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4141 spotrf_(
"U", &n, h->a, &lda, &info );
4146 memset(X, 0, dim*dim*
sizeof(
float));
4152 saf_print_warning(
"Could not compute the Cholesky factorisation in utility_schol(). Output matrices/vectors have been zeroed.");
4157 for(i=0; i<dim; i++)
4158 for(j=0; j<dim; j++)
4159 X[i*dim+j] = j>=i ? h->a[j*dim+i] : 0.0f;
4168typedef struct _utility_cchol_data {
4179 h->a =
malloc1d(maxDim*maxDim*
sizeof(float_complex));
4198 const float_complex* A,
4214 saf_assert(dim<=h->maxDim,
"dim exceeds the maximum length specified");
4219 for(i=0; i<dim; i++)
4220 for(j=0; j<dim; j++)
4221 h->a[j*dim+i] = A[i*dim+j];
4224#if defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4226#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4228#elif defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4234 memset(X, 0, dim*dim*
sizeof(float_complex));
4240 saf_print_warning(
"Could not compute the Cholesky factorisation in utility_cchol(). Output matrices/vectors have been zeroed.");
4245 for(i=0; i<dim; i++)
4246 for(j=0; j<dim; j++)
4247 X[i*dim+j] = j>=i ? h->a[j*dim+i] : cmplxf(0.0f, 0.0f);
4260typedef struct _utility_sdet_data {
4273 h->tmp =
malloc1d(maxN*maxN*
sizeof(
float));
4303 return A[0]*A[3] - A[2]*A[1];
4306 return A[0] * ((A[4]*A[8]) - (A[7]*A[5])) -A[1] * (A[3]
4307 * A[8] - A[6] * A[5]) + A[2] * (A[3] * A[7] - A[6] * A[4]);
4311 A[3] * A[6] * A[9] * A[12] - A[2] * A[7] * A[9] * A[12] -
4312 A[3] * A[5] * A[10] * A[12] + A[1] * A[7] * A[10] * A[12] +
4313 A[2] * A[5] * A[11] * A[12] - A[1] * A[6] * A[11] * A[12] -
4314 A[3] * A[6] * A[8] * A[13] + A[2] * A[7] * A[8] * A[13] +
4315 A[3] * A[4] * A[10] * A[13] - A[0] * A[7] * A[10] * A[13] -
4316 A[2] * A[4] * A[11] * A[13] + A[0] * A[6] * A[11] * A[13] +
4317 A[3] * A[5] * A[8] * A[14] - A[1] * A[7] * A[8] * A[14] -
4318 A[3] * A[4] * A[9] * A[14] + A[0] * A[7] * A[9] * A[14] +
4319 A[1] * A[4] * A[11] * A[14] - A[0] * A[5] * A[11] * A[14] -
4320 A[2] * A[5] * A[8] * A[15] + A[1] * A[6] * A[8] * A[15] +
4321 A[2] * A[4] * A[9] * A[15] - A[0] * A[6] * A[9] * A[15] -
4322 A[1] * A[4] * A[10] * A[15] + A[0] * A[5] * A[10] * A[15];
4332 saf_assert(N<=h->maxN,
"N exceeds the maximum length specified");
4339 h->tmp[j*N+i] = A[i*N+j];
4341#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4343#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4344 INFO = clapack_sgetrf(CblasColMajor, N, N, h->tmp, N, h->IPIV);
4345#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4346 INFO = LAPACKE_sgetrf_work(CblasColMajor, N, N, h->tmp, N, h->IPIV);
4354 saf_print_warning(
"Unable to compute determinant of input matrix. The function utility_sdet() returned 0. ");
4359 for( i = 0; i < N; i++ ) {
4361 if(h->IPIV[i] != i+1)
4374typedef struct _utility_ddet_data {
4375 int currentWorkSize;
4378 double *tmp, *TAU, *WORK;
4387 h->currentWorkSize = 0;
4389 h->tmp =
malloc1d(maxN*maxN*
sizeof(
double));
4390 h->TAU =
malloc1d(maxN*
sizeof(
double));
4423 return A[0]*A[3] - A[2]*A[1];
4426 return A[0] * ((A[4]*A[8]) - (A[7]*A[5])) -A[1] * (A[3]
4427 * A[8] - A[6] * A[5]) + A[2] * (A[3] * A[7] - A[6] * A[4]);
4431 A[3] * A[6] * A[9] * A[12] - A[2] * A[7] * A[9] * A[12] -
4432 A[3] * A[5] * A[10] * A[12] + A[1] * A[7] * A[10] * A[12] +
4433 A[2] * A[5] * A[11] * A[12] - A[1] * A[6] * A[11] * A[12] -
4434 A[3] * A[6] * A[8] * A[13] + A[2] * A[7] * A[8] * A[13] +
4435 A[3] * A[4] * A[10] * A[13] - A[0] * A[7] * A[10] * A[13] -
4436 A[2] * A[4] * A[11] * A[13] + A[0] * A[6] * A[11] * A[13] +
4437 A[3] * A[5] * A[8] * A[14] - A[1] * A[7] * A[8] * A[14] -
4438 A[3] * A[4] * A[9] * A[14] + A[0] * A[7] * A[9] * A[14] +
4439 A[1] * A[4] * A[11] * A[14] - A[0] * A[5] * A[11] * A[14] -
4440 A[2] * A[5] * A[8] * A[15] + A[1] * A[6] * A[8] * A[15] +
4441 A[2] * A[4] * A[9] * A[15] - A[0] * A[6] * A[9] * A[15] -
4442 A[1] * A[4] * A[10] * A[15] + A[0] * A[5] * A[10] * A[15];
4452 saf_assert(N<=h->maxN,
"N exceeds the maximum length specified");
4459 h->tmp[j*N+i] = A[i*N+j];
4463#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4464 dgeqrf_(&N, &N, h->tmp, &N, h->TAU, &lwork2, &LWORK, &INFO);
4465#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4467#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4468 INFO = LAPACKE_dgeqrf_work(CblasColMajor, N, N, h->tmp, N, h->TAU, &lwork2, LWORK);
4471 if(lwork3>h->currentWorkSize){
4472 h->currentWorkSize = lwork3;
4473 h->WORK =
realloc1d(h->WORK, h->currentWorkSize*
sizeof(
double));
4477#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4478 dgeqrf_(&N, &N, h->tmp, &N, h->TAU, h->WORK, &lwork3, &INFO);
4479#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4481#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4482 INFO = LAPACKE_dgeqrf_work(CblasColMajor, N, N, h->tmp, N, h->TAU, h->WORK, lwork3);
4490 saf_print_warning(
"Unable to compute determinant of input matrix. The function utility_ddet() returned 0. ");
4515typedef struct _utility_sinv_data {
4528 h->tmp =
malloc1d(maxN*maxN*
sizeof(
float));
4529 h->WORK =
malloc1d(maxN*maxN*
sizeof(
float));
4569 saf_assert(N<=h->maxN,
"N exceeds the maximum length specified");
4576 h->tmp[j*N+i] = A[i*N+j];
4578#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4581#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4582 INFO = clapack_sgetrf(CblasColMajor, N_, N_, h->tmp, N_, h->IPIV);
4583 INFO = clapack_sgetri(CblasColMajor, N_, h->tmp, N_, h->IPIV);
4584#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4585 INFO = LAPACKE_sgetrf_work(CblasColMajor, N_, N_, h->tmp, N_, h->IPIV);
4586 INFO = LAPACKE_sgetri_work(CblasColMajor, N_, h->tmp, N_, h->IPIV, h->WORK, LWORK);
4590 memset(B, 0, N*N*
sizeof(
float));
4595 saf_print_warning(
"Unable to compute the inverse of input matrix. The function utility_sinv() returned a matrix of zeros. ");
4602 B[j*N+i] = h->tmp[i*N+j];
4611typedef struct _utility_dinv_data {
4624 h->tmp =
malloc1d(maxN*maxN*
sizeof(
double));
4625 h->WORK =
malloc1d(maxN*maxN*
sizeof(
double));
4665 saf_assert(N<=h->maxN,
"N exceeds the maximum length specified");
4672 h->tmp[j*N+i] = A[i*N+j];
4674#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4677#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4678 INFO = clapack_dgetrf(CblasColMajor, N_, N_, h->tmp, N_, h->IPIV);
4679 INFO = clapack_dgetri(CblasColMajor, N_, h->tmp, N_, h->IPIV);
4680#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4681 INFO = LAPACKE_dgetrf_work(CblasColMajor, N_, N_, h->tmp, N_, h->IPIV);
4682 INFO = LAPACKE_dgetri_work(CblasColMajor, N_, h->tmp, N_, h->IPIV, h->WORK, LWORK);
4686 memset(B, 0, N*N*
sizeof(
double));
4691 saf_print_warning(
"Unable to compute the inverse of input matrix. The function utility_dinv() returned a matrix of zeros. ");
4698 B[j*N+i] = h->tmp[i*N+j];
4707typedef struct _utility_cinv_data {
4710 float_complex *WORK, *tmp;
4720 h->tmp =
malloc1d(maxN*maxN*
sizeof(float_complex));
4721 h->WORK =
malloc1d(maxN*maxN*
sizeof(float_complex));
4761 saf_assert(N<=h->maxN,
"N exceeds the maximum length specified");
4768 h->tmp[j*N+i] = A[i*N+j];
4770#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4773#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4776#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4782 memset(B, 0, N*N*
sizeof(float_complex));
4787 saf_print_warning(
"Unable to compute the inverse of input matrix. The function utility_cinv() returned a matrix of zeros. ");
4794 B[j*N+i] = h->tmp[i*N+j];
void utility_dimaxv(const double *a, const int len, int *index)
Double-precision, index of maximum absolute value in a vector, i.e.
#define saf_print_error(message)
Macro to print a error message along with the filename and line number.
void utility_spinv_create(void **const phWork, int maxDim1, int maxDim2)
(Optional) Pre-allocate the working struct used by utility_spinv()
void utility_dinv(void *const hWork, double *A, double *B, const int N)
Matrix inversion: double precision, i.e.
double utility_ddet(void *const hWork, double *A, int N)
Determinant of a Matrix, double precision, i,e.
void utility_dinv_create(void **const phWork, int maxN)
(Optional) Pre-allocate the working struct used by utility_dinv()
void utility_zvvsub(const double_complex *a, const double_complex *b, const int len, double_complex *c)
Double-precision, complex, vector-vector subtraction, i.e.
void utility_ceigmp(void *const hWork, const float_complex *A, const float_complex *B, const int dim, float_complex *VL, float_complex *VR, float_complex *D)
Computes eigenvalues of a matrix pair using the QZ method, single precision complex,...
#define saf_assert(x, message)
Macro to make an assertion, along with a string explaining its purpose.
void utility_cpinv_create(void **const phWork, int maxDim1, int maxDim2)
(Optional) Pre-allocate the working struct used by utility_cpinv()
void utility_zsv2cv_inds(const double_complex *sv, const int *inds, const int len, double_complex *cv)
Double-precision complex, sparse-vector to compressed vector given known indices.
void utility_cglslv_destroy(void **const phWork)
De-allocate the working struct used by utility_cglslv()
void utility_cglslv(void *const hWork, const float_complex *A, const int dim, float_complex *B, int nCol, float_complex *X)
General linear solver: single precision complex, i.e.
void utility_ddet_destroy(void **const phWork)
De-allocate the working struct used by utility_ddet()
void utility_zeig_destroy(void **const phWork)
De-allocate the working struct used by utility_zeig()
void utility_cvvmul(const float_complex *a, const float_complex *b, const int len, float_complex *c)
Single-precision, complex, element-wise vector-vector multiplication i.e.
void utility_svvmul(const float *a, const float *b, const int len, float *c)
Single-precision, element-wise vector-vector multiplication i.e.
void utility_ceig_destroy(void **const phWork)
De-allocate the working struct used by utility_ceig()
void utility_cinv(void *const hWork, float_complex *A, float_complex *B, const int N)
Matrix inversion: double precision complex, i.e.
void utility_dvsmul(double *a, const double *s, const int len, double *c)
Double-precision, multiplies each element in vector 'a' with a scalar 's', i.e.
void utility_csv2cv_inds(const float_complex *sv, const int *inds, const int len, float_complex *cv)
Single-precision complex, sparse-vector to compressed vector given known indices.
void utility_svmod(const float *a, const float *b, const int len, float *c)
Single-precision, modulus of vector elements, i.e.
void utility_ziminv(const double_complex *a, const int len, int *index)
Double-precision, complex, index of maximum absolute value in a vector, i.e.
void utility_dvvsub(const double *a, const double *b, const int len, double *c)
Double-precision, vector-vector subtraction, i.e.
void utility_dsv2cv_inds(const double *sv, const int *inds, const int len, double *cv)
Double-precision, sparse-vector to compressed vector given known indices i.e.
#define SAF_FALSE
Boolean false.
void utility_csvd_destroy(void **const phWork)
De-allocate the working struct used by utility_csvd()
#define SAF_MAX(a, b)
Returns the maximum of the two values.
void utility_ceigmp_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_ceigmp()
void utility_dglslv(void *const hWork, const double *A, const int dim, double *B, int nCol, double *X)
General linear solver: double precision, i.e.
void utility_zvsmul(double_complex *a, const double_complex *s, const int len, double_complex *c)
Double-precision, complex, multiplies each element in vector 'a' with a scalar 's',...
void utility_dglslv_destroy(void **const phWork)
De-allocate the working struct used by utility_dglslv()
void utility_schol_destroy(void **const phWork)
De-allocate the working struct used by utility_schol()
void utility_zeig_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_zeig()
void utility_svsadd(float *a, const float *s, const int len, float *c)
Single-precision, adds each element in vector 'a' with a scalar 's', i.e.
void utility_cseig_destroy(void **const phWork)
De-allocate the working struct used by utility_cseig()
void utility_ciminv(const float_complex *a, const int len, int *index)
Single-precision, complex, index of maximum absolute value in a vector, i.e.
void utility_dvvadd(const double *a, const double *b, const int len, double *c)
Double-precision, vector-vector addition, i.e.
void utility_svssub(float *a, const float *s, const int len, float *c)
Single-precision, subtracts each element in vector 'a' with a scalar 's', i.e.
void utility_svvsub(const float *a, const float *b, const int len, float *c)
Single-precision, vector-vector subtraction, i.e.
void utility_ssvd_create(void **const phWork, int maxDim1, int maxDim2)
(Optional) Pre-allocate the working struct used by utility_ssvd()
void utility_sglslvt(void *const hWork, const float *A, const int dim, float *B, int nCol, float *X)
General linear solver (the other way): single precision, i.e.
void utility_ssvd(void *const hWork, const float *A, const int dim1, const int dim2, float *U, float *S, float *V, float *sing)
Singular value decomposition: single precision, i.e.
void utility_zimaxv(const double_complex *a, const int len, int *index)
Double-precision, complex, index of maximum absolute value in a vector, i.e.
void utility_zeigmp(void *const hWork, const double_complex *A, const double_complex *B, const int dim, double_complex *VL, double_complex *VR, double_complex *D)
Computes eigenvalues of a matrix pair using the QZ method, double precision complex,...
void utility_cinv_destroy(void **const phWork)
De-allocate the working struct used by utility_cinv()
void utility_ceig(void *const hWork, const float_complex *A, const int dim, float_complex *VL, float_complex *VR, float_complex *D, float_complex *eig)
Eigenvalue decomposition of a NON-SYMMETRIC matrix: single precision complex, i.e.
void utility_cvvadd(const float_complex *a, const float_complex *b, const int len, float_complex *c)
Single-precision, complex, vector-vector addition, i.e.
void utility_sinv_create(void **const phWork, int maxN)
(Optional) Pre-allocate the working struct used by utility_sinv()
void utility_zglslv(void *const hWork, const double_complex *A, const int dim, double_complex *B, int nCol, double_complex *X)
General linear solver: double precision complex, i.e.
void utility_sglslv(void *const hWork, const float *A, const int dim, float *B, int nCol, float *X)
General linear solver: single precision, i.e.
CONJ_FLAG
Whether a vector should be conjugated or not (e.g.
void utility_svvcopy(const float *a, const int len, float *c)
Single-precision, vector-vector copy, i.e.
void utility_cimaxv(const float_complex *a, const int len, int *index)
Single-precision, complex, index of maximum absolute value in a vector, i.e.
#define SAF_TRUE
Boolean true.
void utility_spinv(void *const hWork, const float *inM, const int dim1, const int dim2, float *outM)
General matrix pseudo-inverse (the svd way): single precision, i.e.
void utility_dpinv_create(void **const phWork, int maxDim1, int maxDim2)
(Optional) Pre-allocate the working struct used by utility_dpinv()
void utility_cpinv(void *const hWork, const float_complex *inM, const int dim1, const int dim2, float_complex *outM)
General matrix pseudo-inverse (the svd way): single precision complex, i.e.
void utility_cslslv_create(void **const phWork, int maxDim, int maxNCol)
(Optional) Pre-allocate the working struct used by utility_cslslv()
void utility_sseig_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_sseig()
void utility_sinv(void *const hWork, float *A, float *B, const int N)
Matrix inversion: single precision, i.e.
void utility_zglslv_destroy(void **const phWork)
De-allocate the working struct used by utility_zglslv()
void utility_cpinv_destroy(void **const phWork)
De-allocate the working struct used by utility_cpinv()
void utility_sglslvt_destroy(void **const phWork)
De-allocate the working struct used by utility_sglslvt()
void utility_dpinv_destroy(void **const phWork)
De-allocate the working struct used by utility_dpinv()
void utility_cseig(void *const hWork, const float_complex *A, const int dim, int sortDecFLAG, float_complex *V, float_complex *D, float *eig)
Eigenvalue decomposition of a SYMMETRIC/HERMITION matrix: single precision complex,...
void utility_csvd(void *const hWork, const float_complex *A, const int dim1, const int dim2, float_complex *U, float_complex *S, float_complex *V, float *sing)
Singular value decomposition: single precision complex, i.e.
void utility_dpinv(void *const hWork, const double *inM, const int dim1, const int dim2, double *outM)
General matrix pseudo-inverse (the svd way): double precision, i.e.
void utility_cseig_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_cseig()
void utility_diminv(const double *a, const int len, int *index)
Double-precision, index of minimum absolute value in a vector, i.e.
void utility_sslslv_destroy(void **const phWork)
De-allocate the working struct used by utility_sslslv()
void utility_schol(void *const hWork, const float *A, const int dim, float *X)
Cholesky factorisation of a symmetric matrix positive-definate matrix: single precision,...
#define SAF_MIN(a, b)
Returns the minimum of the two values.
void utility_zeig(void *const hWork, const double_complex *A, const int dim, double_complex *VL, double_complex *VR, double_complex *D, double_complex *eig)
Eigenvalue decomposition of a NON-SYMMETRIC matrix: double precision complex, i.e.
void utility_cvabs(const float_complex *a, const int len, float *c)
Single-precision, complex, absolute values of vector elements, i.e.
void utility_cvvdot(const float_complex *a, const float_complex *b, const int len, CONJ_FLAG flag, float_complex *c)
Single-precision, complex, vector-vector dot product, i.e.
void utility_sglslv_create(void **const phWork, int maxDim, int maxNCol)
(Optional) Pre-allocate the working struct used by utility_sglslv()
void utility_cinv_create(void **const phWork, int maxN)
(Optional) Pre-allocate the working struct used by utility_cinv()
void utility_cvvsub(const float_complex *a, const float_complex *b, const int len, float_complex *c)
Single-precision, complex, vector-vector subtraction, i.e.
void utility_zpinv(void *const hWork, const double_complex *inM, const int dim1, const int dim2, double_complex *outM)
General matrix pseudo-inverse (the svd way): double precision complex, i.e.
void utility_zpinv_create(void **const phWork, int maxDim1, int maxDim2)
(Optional) Pre-allocate the working struct used by utility_zpinv()
void utility_sseig(void *const hWork, const float *A, const int dim, int sortDecFLAG, float *V, float *D, float *eig)
Eigenvalue decomposition of a SYMMETRIC matrix: single precision, i.e.
void utility_dinv_destroy(void **const phWork)
De-allocate the working struct used by utility_dinv()
void utility_cchol(void *const hWork, const float_complex *A, const int dim, float_complex *X)
Cholesky factorisation of a hermitian positive-definate matrix: single precision complex,...
#define saf_print_warning(message)
Macro to print a warning message along with the filename and line number.
void utility_cchol_destroy(void **const phWork)
De-allocate the working struct used by utility_cchol()
void utility_zvvcopy(const double_complex *a, const int len, double_complex *c)
double-precision, complex, vector-vector copy, i.e.
float utility_sdet(void *const hWork, float *A, int N)
Determinant of a Matrix, single precision, i,e.
void utility_zeigmp_destroy(void **const phWork)
De-allocate the working struct used by utility_zeigmp()
void utility_schol_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_schol()
void utility_sslslv_create(void **const phWork, int maxDim, int maxNCol)
(Optional) Pre-allocate the working struct used by utility_sslslv()
void utility_zvvadd(const double_complex *a, const double_complex *b, const int len, double_complex *c)
Double-precision, complex, vector-vector addition, i.e.
void utility_ceigmp_destroy(void **const phWork)
De-allocate the working struct used by utility_ceigmp()
void utility_csvd_create(void **const phWork, int maxDim1, int maxDim2)
(Optional) Pre-allocate the working struct used by utility_csvd()
void utility_cvvcopy(const float_complex *a, const int len, float_complex *c)
Single-precision, complex, vector-vector copy, i.e.
void utility_svvadd(const float *a, const float *b, const int len, float *c)
Single-precision, vector-vector addition, i.e.
void utility_svrecip(const float *a, const int len, float *c)
Single-precision, vector-reciprocal/inversion, i.e.
void utility_sglslvt_create(void **const phWork, int maxDim, int maxNCol)
(Optional) Pre-allocate the working struct used by utility_sglslvt()
void utility_spinv_destroy(void **const phWork)
De-allocate the working struct used by utility_spinv()
void utility_sslslv(void *const hWork, const float *A, const int dim, float *B, int nCol, float *X)
Linear solver for SYMMETRIC positive-definate 'A': single precision, i.e.
void utility_ddet_create(void **const phWork, int maxN)
(Optional) Pre-allocate the working struct used by utility_ddet()
void utility_sdet_create(void **const phWork, int maxN)
(Optional) Pre-allocate the working struct used by utility_sdet()
void utility_cvsmul(float_complex *a, const float_complex *s, const int len, float_complex *c)
Single-precision, complex, multiplies each element in vector 'a' with a scalar 's',...
void utility_cglslv_create(void **const phWork, int maxDim, int maxNCol)
(Optional) Pre-allocate the working struct used by utility_cglslv()
void utility_ceig_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_ceig()
void utility_dglslv_create(void **const phWork, int maxDim, int maxNCol)
(Optional) Pre-allocate the working struct used by utility_dglslv()
void utility_svvdot(const float *a, const float *b, const int len, float *c)
Single-precision, vector-vector dot product, i.e.
void utility_svsdiv(const float *a, const float *s, const int len, float *c)
Single-precision, divides each element in vector 'a' with a scalar 's', i.e.
void utility_cslslv_destroy(void **const phWork)
De-allocate the working struct used by utility_cslslv()
void utility_zglslv_create(void **const phWork, int maxDim, int maxNCol)
(Optional) Pre-allocate the working struct used by utility_zglslv()
void utility_cvconj(const float_complex *a, const int len, float_complex *c)
Single-precision, complex, vector-conjugate, i.e.
void utility_dvvcopy(const double *a, const int len, double *c)
double-precision, vector-vector copy, i.e.
void utility_sseig_destroy(void **const phWork)
De-allocate the working struct used by utility_sseig()
void utility_svabs(const float *a, const int len, float *c)
Single-precision, absolute values of vector elements, i.e.
void utility_svsmul(float *a, const float *s, const int len, float *c)
Single-precision, multiplies each element in vector 'a' with a scalar 's', i.e.
void utility_zvconj(const double_complex *a, const int len, double_complex *c)
Double-precision, complex, vector-conjugate, i.e.
void utility_sinv_destroy(void **const phWork)
De-allocate the working struct used by utility_sinv()
void utility_cchol_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_cchol()
void utility_sdet_destroy(void **const phWork)
De-allocate the working struct used by utility_sdet()
void utility_siminv(const float *a, const int len, int *index)
Single-precision, index of minimum absolute value in a vector, i.e.
void utility_zeigmp_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_zeigmp()
void utility_ssv2cv_inds(const float *sv, const int *inds, const int len, float *cv)
Single-precision, sparse-vector to compressed vector given known indices i.e.
void utility_sglslv_destroy(void **const phWork)
De-allocate the working struct used by utility_sglslv()
void utility_simaxv(const float *a, const int len, int *index)
Single-precision, index of maximum absolute value in a vector, i.e.
void utility_ssvd_destroy(void **const phWork)
De-allocate the working struct used by utility_ssvd()
void utility_zpinv_destroy(void **const phWork)
De-allocate the working struct used by utility_zpinv()
void utility_cslslv(void *const hWork, const float_complex *A, const int dim, float_complex *B, int nCol, float_complex *X)
Linear solver for HERMITIAN positive-definate 'A': single precision complex, i.e.
@ NO_CONJ
Do not take the conjugate.
@ CONJ
Take the conjugate.
void * malloc1d(size_t dim1_data_size)
1-D malloc (same as malloc, but with error checking)
void * realloc1d(void *ptr, size_t dim1_data_size)
1-D realloc (same as realloc, but with error checking)
Include header for SAF externals.
Main header for the utilities module (SAF_UTILITIES_MODULE)
double veclib_double
real: 8-bytes
MKL_INT veclib_int
integer: 4-bytes
MKL_Complex16 veclib_double_complex
complex: 16-bytes
MKL_Complex8 veclib_float_complex
complex: 8-bytes
#define SAF_INTEL_MKL_VML_MODE
Taken from the Intel MKL VML developers reference documentation: The flag(s) that can be passed to In...
float veclib_float
real: 4-bytes
Data structure for utility_cchol()
Data structure for utility_ceig()
Data structure for utility_ceigmp()
Data structure for utility_cglslv()
Data structure for utility_cinv()
Data structure for utility_cpinv()
Data structure for utility_cseig()
Data structure for utility_cslslv()
Data structure for utility_csvd()
Data structure for utility_ddet()
Data structure for utility_dglslv()
Data structure for utility_dinv()
Data structure for utility_dpinv()
Data structure for utility_schol()
Data structure for utility_sdet()
Data structure for utility_sglslv()
Data structure for utility_sglslvt()
Data structure for utility_sinv()
Data structure for utility_spinv()
Data structure for utility_sseig()
Data structure for utility_sslslv()
Data structure for utility_ssvd()
Data structure for utility_zeig()
Data structure for utility_zeigmp()
Data structure for utility_zglslv()
Data structure for utility_zpinv()