54#if defined(SAF_USE_INTEL_MKL_LP64)
56# define SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE
57#elif defined(SAF_USE_INTEL_MKL_ILP64)
59# define SAF_VECLIB_USE_LAPACKE_INTERFACE
60#elif defined(SAF_USE_OPEN_BLAS_AND_LAPACKE)
61# define SAF_VECLIB_USE_LAPACKE_INTERFACE
62#elif defined(SAF_USE_ATLAS)
63# define SAF_VECLIB_USE_CLAPACK_INTERFACE
64#elif defined(__APPLE__) && (defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64))
65# define SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE
66#elif defined(SAF_USE_GSL)
67# define SAF_VECLIB_USE_GSL_LINALG
69# error No LAPACK interface was specified!
73#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
74# ifndef SAF_INTEL_MKL_VML_MODE
94# define SAF_INTEL_MKL_VML_MODE ( VML_LA | VML_FTZDAZ_ON )
99#if defined(__APPLE__) && defined(SAF_USE_APPLE_ACCELERATE_LP64) && !defined(ACCELERATE_NEW_LAPACK)
105#elif defined(__APPLE__) && (defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64))
107# ifdef SAF_USE_APPLE_ACCELERATE_LP64
116#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
117# ifdef SAF_USE_INTEL_MKL_LP64
126#elif defined(SAF_USE_OPEN_BLAS_AND_LAPACKE)
145#ifdef SAF_USE_BUILT_IN_NAIVE_CBLAS
146void cblas_scopy(
const int N,
const float *X,
const int incX,
float *Y,
const int incY){
148#if defined(SAF_ENABLE_SIMD)
149 if(incX==1 && incY==1){
150 for(i=0; i<(N-3); i+=4)
151 _mm_storeu_ps(Y+i, _mm_loadu_ps(X+i));
157 Y[i*incY] = X[i*incX];
160 Y[i*incY] = X[i*incX];
164void cblas_dcopy(
const int N,
const double *X,
const int incX,
double *Y,
const int incY){
166 for(i=j=0; i<N; i+=incX, j+=incY)
170void cblas_ccopy(
const int N,
const void *X,
const int incX,
void *Y,
const int incY){
172 float_complex *cX, *cY;
173 cX = (float_complex*)X;
174 cY = (float_complex*)Y;
175 for(i=j=0; i<N; i+=incX, j+=incY)
179void cblas_zcopy(
const int N,
const void *X,
const int incX,
void *Y,
const int incY){
181 double_complex *cX, *cY;
182 cX = (double_complex*)X;
183 cY = (double_complex*)Y;
184 for(i=j=0; i<N; i+=incX, j+=incY)
188void cblas_saxpy(
const int N,
const float alpha,
const float* X,
const int incX,
float* Y,
const int incY) {
190 for (i=j=0; i<N; i+=incX, j+=incY)
191 Y[i] = alpha * X[i] + Y[i];
194void cblas_daxpy(
const int N,
const double alpha,
const double* X,
const int incX,
double* Y,
const int incY) {
196 for (i=j=0; i<N; i+=incX, j+=incY)
197 Y[i] = alpha * X[i] + Y[i];
200float cblas_sdot(
const int N,
const float* X,
const int incX,
const float* Y,
const int incY){
203#if defined(SAF_ENABLE_SIMD)
205 if(incX==1 && incY==1){
207 __m128 sum = _mm_setzero_ps();
209 for(i=0; i<(N-3); i+=4)
210 sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(X+i), _mm_loadu_ps(Y+i)));
212 sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
214 sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 2, 0, 1)));
215 _mm_store_ss(&ret, sum);
222 ret += X[i*incX] * Y[i*incY];
226 ret += X[i*incX] * Y[i*incY];
238#ifdef SAF_USE_BUILT_IN_NAIVE_CBLAS
239void cblas_sgemm(
const CBLAS_LAYOUT Layout,
const CBLAS_TRANSPOSE TransA,
240 const CBLAS_TRANSPOSE TransB,
const int M,
const int N,
241 const int K,
const float alpha,
const float *A,
242 const int lda,
const float *B,
const int ldb,
243 const float beta,
float *C,
const int ldc)
261 if(!_transA && !_transB){
266 C[i*ldc+j] += A[i*lda+k] * B[k*ldb+j];
270 else if(_transA && !_transB){
272 else if(_transA && _transB){
289#if defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
292 vDSP_minmgvi(a, 1, &minVal, &ind_tmp, (vDSP_Length)len);
293 *index = (int)ind_tmp;
294#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
295 *index = (int)cblas_isamin(len, a, 1);
299 float minVal=FLT_MAX;
300 for(i=0; i<len; i++){
301 if(fabsf(a[i])<minVal){
302 minVal = fabsf(a[i]);
311 const float_complex* a,
316#if defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
319 abs_a =
malloc1d(len*
sizeof(
float));
320 vDSP_vdist((
float*)a, 2, (
float*)a+1, 2, abs_a, 1, (vDSP_Length)len);
322 vDSP_minmgvi(abs_a, 1, &minVal, &ind_tmp, (vDSP_Length)len);
323 *index = (int)ind_tmp;
325#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
326 *index = (int)cblas_icamin(len, a, 1);
330 float minVal=FLT_MAX;
331 for(i=0; i<len; i++){
332 if(cabsf(a[i])<minVal){
333 minVal = cabsf(a[i]);
347#if defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
350 vDSP_minmgviD(a, 1, &minVal, &ind_tmp, (vDSP_Length)len);
351 *index = (int)ind_tmp;
352#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
353 *index = (int)cblas_idamin(len, a, 1);
357 double minVal=DBL_MAX;
358 for(i=0; i<len; i++){
359 if(fabs(a[i])<minVal){
369 const double_complex* a,
374#if defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
377 abs_a =
malloc1d(len*
sizeof(
double));
378 vDSP_vdistD((
double*)a, 2, (
double*)a+1, 2, abs_a, 1, (vDSP_Length)len);
380 vDSP_minmgviD(abs_a, 1, &minVal, &ind_tmp, (vDSP_Length)len);
381 *index = (int)ind_tmp;
383#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
384 *index = (int)cblas_izamin(len, a, 1);
388 double minVal=DBL_MAX;
389 for(i=0; i<len; i++){
390 if(cabs(a[i])<minVal){
410 *index = (int)cblas_isamax(len, a, 1);
415 const float_complex* a,
420 *index = (int)cblas_icamax(len, a, 1);
430 *index = (int)cblas_idamax(len, a, 1);
435 const double_complex* a,
440 *index = (int)cblas_izamax(len, a, 1);
455#if defined(SAF_USE_INTEL_IPP)
456 ippsAbs_32f((Ipp32f*)a, (Ipp32f*)c, len);
457#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
458 vDSP_vabs(a, 1, c, 1, (vDSP_Length)len);
459#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
470 const float_complex* a,
475#if defined(SAF_USE_INTEL_IPP)
476 ippsMagnitude_32fc((Ipp32fc*)a, (Ipp32f*)c, len);
477#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
478 vDSP_vdist((
float*)a, 2, (
float*)a+1, 2, c, 1, (vDSP_Length)len);
479#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
501#if defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
502 vvfmodf(c, a, b, &len);
503#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
508 c[i] = fmodf(a[i], b[i]);
524#if defined(SAF_USE_INTEL_IPP)
525 ippsDivCRev_32f((Ipp32f*)a, 1.0f, (Ipp32f*)c, len);
526#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
529 vDSP_svdiv(&one, a, 1, c, 1, (vDSP_Length)len);
530#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
532#elif defined(SAF_ENABLE_SIMD)
535# if defined(__AVX512F__)
536 for(; i<(len-15); i+=16)
537 _mm512_storeu_ps(c+i, _mm512_rcp14_ps(_mm512_loadu_ps(a+i)));
539# if defined(__AVX__) && defined(__AVX2__)
540 for(; i<(len-7); i+=8)
541 _mm256_storeu_ps(c+i, _mm256_rcp_ps(_mm256_loadu_ps(a+i)));
543# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
544 for(; i<(len-3); i+=4)
545 _mm_storeu_ps(c+i, _mm_rcp_ps(_mm_loadu_ps(a+i)));
563 const float_complex* a,
568#if defined(SAF_USE_INTEL_IPP)
569 ippsConj_32fc((Ipp32fc*)a, (Ipp32fc*)c, len);
571 cblas_ccopy(len, a, 1, c, 1);
572 cblas_sscal(len, -1.0f, ((
float*)c)+1, 2);
578 const double_complex* a,
583#if defined(SAF_USE_INTEL_IPP)
584 ippsConj_64fc((Ipp64fc*)a, (Ipp64fc*)c, len);
586 cblas_zcopy(len, a, 1, c, 1);
587 cblas_dscal(len, -1.0, ((
double*)c)+1, 2);
603 cblas_scopy(len, a, 1, c, 1);
608 const float_complex* a,
613 cblas_ccopy(len, a, 1, c, 1);
623 cblas_dcopy(len, a, 1, c, 1);
628 const double_complex* a,
633 cblas_zcopy(len, a, 1, c, 1);
651 saf_assert(c!=NULL,
"'c' can no longer be NULL");
652 saf_assert(a!=c && b!=c,
"In-place operation is no longer supported, use e.g. cblas_saxby() instead.");
656#if defined(SAF_USE_INTEL_IPP)
657 ippsAdd_32f((Ipp32f*)a, (Ipp32f*)b, (Ipp32f*)c, len);
658#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
659 vDSP_vadd(a, 1, b, 1, c, 1, (vDSP_Length)len);
660#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
662#elif defined(SAF_ENABLE_SIMD)
665# if defined(__AVX512F__)
666 for(; i<(len-15); i+=16)
667 _mm512_storeu_ps(c+i, _mm512_add_ps(_mm512_loadu_ps(a+i), _mm512_loadu_ps(b+i)));
669# if defined(__AVX__) && defined(__AVX2__)
670 for(; i<(len-7); i+=8)
671 _mm256_storeu_ps(c+i, _mm256_add_ps(_mm256_loadu_ps(a+i), _mm256_loadu_ps(b+i)));
673# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
674 for(; i<(len-3); i+=4)
675 _mm_storeu_ps(c+i, _mm_add_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
682 for(i=0; i<len-3; i+=4){
684 c[i+1] = a[i+1] + b[i+1];
685 c[i+2] = a[i+2] + b[i+2];
686 c[i+3] = a[i+3] + b[i+3];
692 for (j = 0; j < len; j++)
699 const float_complex* a,
700 const float_complex* b,
707 saf_assert(c!=NULL,
"'c' can no longer be NULL");
708 saf_assert(a!=c && b!=c,
"In-place operation is no longer supported, use e.g. cblas_caxby() instead.");
712#if defined(SAF_USE_INTEL_IPP)
713 ippsAdd_32fc((Ipp32fc*)a, (Ipp32fc*)b, (Ipp32fc*)c, len);
714#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
715 vDSP_vadd((
float*)a, 1, (
float*)b, 1, (
float*)c, 1, 2*(vDSP_Length)len);
716#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
718#elif defined(SAF_ENABLE_SIMD)
722 sa = (
float*)a; sb = (
float*)b; sc = (
float*)c;
724# if defined(__AVX512F__)
725 for(; i<(len2-15); i+=16)
726 _mm512_storeu_ps(sc+i, _mm512_add_ps(_mm512_loadu_ps(sa+i), _mm512_loadu_ps(sb+i)));
728# if defined(__AVX__) && defined(__AVX2__)
729 for(; i<(len2-7); i+=8)
730 _mm256_storeu_ps(sc+i, _mm256_add_ps(_mm256_loadu_ps(sa+i), _mm256_loadu_ps(sb+i)));
732# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
733 for(; i<(len2-3); i+=4)
734 _mm_storeu_ps(sc+i, _mm_add_ps(_mm_loadu_ps(sa+i), _mm_loadu_ps(sb+i)));
737 sc[i] = sa[i] + sb[i];
738#elif __STDC_VERSION__ >= 199901L && defined(NDEBUG)
741 for(i=0; i<len-3; i+=4){
743 c[i+1] = a[i+1] + b[i+1];
744 c[i+2] = a[i+2] + b[i+2];
745 c[i+3] = a[i+3] + b[i+3];
751 for (j = 0; j < len; j++)
752 c[j] = ccaddf(a[j], b[j]);
766 saf_assert(c!=NULL,
"'c' can no longer be NULL");
767 saf_assert(a!=c && b!=c,
"In-place operation is no longer supported, use e.g. cblas_daxby() instead.");
771#if defined(SAF_USE_INTEL_IPP)
772 ippsAdd_64f((Ipp64f*)a, (Ipp64f*)b, (Ipp64f*)c, len);
773#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
774 vDSP_vaddD(a, 1, b, 1, c, 1, (vDSP_Length)len);
775#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
777#elif defined(SAF_ENABLE_SIMD)
780# if defined(__AVX512F__)
781 for(; i<(len-7); i+=8)
782 _mm512_storeu_pd(c+i, _mm512_add_pd(_mm512_loadu_pd(a+i), _mm512_loadu_pd(b+i)));
784# if defined(__AVX__) && defined(__AVX2__)
785 for(; i<(len-3); i+=4)
786 _mm256_storeu_pd(c+i, _mm256_add_pd(_mm256_loadu_pd(a+i), _mm256_loadu_pd(b+i)));
788# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
789 for(; i<(len-1); i+=2)
790 _mm_storeu_pd(c+i, _mm_add_pd(_mm_loadu_pd(a+i), _mm_loadu_pd(b+i)));
796 for (j = 0; j < len; j++)
803 const double_complex* a,
804 const double_complex* b,
811 saf_assert(c!=NULL,
"'c' can no longer be NULL");
812 saf_assert(a!=c && b!=c,
"In-place operation is no longer supported, use e.g. cblas_zaxby() instead.");
816#if defined(SAF_USE_INTEL_IPP)
817 ippsAdd_64fc((Ipp64fc*)a, (Ipp64fc*)b, (Ipp64fc*)c, len);
818#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
819 vDSP_vaddD((
double*)a, 1, (
double*)b, 1, (
double*)c, 1, 2*(vDSP_Length)len);
820#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
822#elif defined(SAF_ENABLE_SIMD)
824 double* sa, *sb, *sc;
826 sa = (
double*)a; sb = (
double*)b; sc = (
double*)c;
828# if defined(__AVX512F__)
829 for(; i<(len2-7); i+=8)
830 _mm512_storeu_pd(sc+i, _mm512_add_pd(_mm512_loadu_pd(sa+i), _mm512_loadu_pd(sb+i)));
832# if defined(__AVX__) && defined(__AVX2__)
833 for(; i<(len2-3); i+=4)
834 _mm256_storeu_pd(sc+i, _mm256_add_pd(_mm256_loadu_pd(sa+i), _mm256_loadu_pd(sb+i)));
836# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
837 for(; i<(len2-1); i+=2)
838 _mm_storeu_pd(sc+i, _mm_add_pd(_mm_loadu_pd(sa+i), _mm_loadu_pd(sb+i)));
841 sc[i] = sa[i] + sb[i];
844 for (j = 0; j < len; j++)
845 c[j] = ccadd(a[j], b[j]);
865 saf_assert(a!=c && b!=c,
"In-place operation is not supported.");
869#if defined(SAF_USE_INTEL_IPP)
870 ippsSub_32f((Ipp32f*)b, (Ipp32f*)a, (Ipp32f*)c, len);
871#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
872 vDSP_vsub(b, 1, a, 1, c, 1, (vDSP_Length)len);
873#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
875#elif defined(SAF_ENABLE_SIMD)
878# if defined(__AVX512F__)
879 for(; i<(len-15); i+=16)
880 _mm512_storeu_ps(c+i, _mm512_sub_ps(_mm512_loadu_ps(a+i), _mm512_loadu_ps(b+i)));
882# if defined(__AVX__) && defined(__AVX2__)
883 for(; i<(len-7); i+=8)
884 _mm256_storeu_ps(c+i, _mm256_sub_ps(_mm256_loadu_ps(a+i), _mm256_loadu_ps(b+i)));
886# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
887 for(; i<(len-3); i+=4)
888 _mm_storeu_ps(c+i, _mm_sub_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
895 for(i=0; i<len-3; i+=4){
897 c[i+1] = a[i+1] - b[i+1];
898 c[i+2] = a[i+2] - b[i+2];
899 c[i+3] = a[i+3] - b[i+3];
905 for (j = 0; j < len; j++)
912 const float_complex* a,
913 const float_complex* b,
921 saf_assert(a!=c && b!=c,
"In-place operation is not supported.");
925#if defined(SAF_USE_INTEL_IPP)
926 ippsSub_32fc((Ipp32fc*)b, (Ipp32fc*)a, (Ipp32fc*)c, len);
927#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
928 vDSP_vsub((
float*)b, 1, (
float*)a, 1, (
float*)c, 1, 2*(vDSP_Length)len);
929#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
931#elif defined(SAF_ENABLE_SIMD)
935 sa = (
float*)a; sb = (
float*)b; sc = (
float*)c;
937# if defined(__AVX512F__)
938 for(; i<(len2-15); i+=16)
939 _mm512_storeu_ps(sc+i, _mm512_sub_ps(_mm512_loadu_ps(sa+i), _mm512_loadu_ps(sb+i)));
941# if defined(__AVX__) && defined(__AVX2__)
942 for(; i<(len2-7); i+=8)
943 _mm256_storeu_ps(sc+i, _mm256_sub_ps(_mm256_loadu_ps(sa+i), _mm256_loadu_ps(sb+i)));
945# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
946 for(; i<(len2-3); i+=4)
947 _mm_storeu_ps(sc+i, _mm_sub_ps(_mm_loadu_ps(sa+i), _mm_loadu_ps(sb+i)));
950 sc[i] = sa[i] - sb[i];
951#elif __STDC_VERSION__ >= 199901L && defined(NDEBUG)
954 for(i=0; i<len-3; i+=4){
956 c[i+1] = a[i+1] - b[i+1];
957 c[i+2] = a[i+2] - b[i+2];
958 c[i+3] = a[i+3] - b[i+3];
964 for (j = 0; j < len; j++)
965 c[j] = ccsubf(a[j], b[j]);
980 saf_assert(a!=c && b!=c,
"In-place operation is not supported.");
984#if defined(SAF_USE_INTEL_IPP)
985 ippsSub_64f((Ipp64f*)b, (Ipp64f*)a, (Ipp64f*)c, len);
986#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
987 vDSP_vsubD(b, 1, a, 1, c, 1, (vDSP_Length)len);
988#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
990#elif defined(SAF_ENABLE_SIMD)
993# if defined(__AVX512F__)
994 for(; i<(len-7); i+=8)
995 _mm512_storeu_pd(c+i, _mm512_sub_pd(_mm512_loadu_pd(a+i), _mm512_loadu_pd(b+i)));
997# if defined(__AVX__) && defined(__AVX2__)
998 for(; i<(len-3); i+=4)
999 _mm256_storeu_pd(c+i, _mm256_sub_pd(_mm256_loadu_pd(a+i), _mm256_loadu_pd(b+i)));
1001# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
1002 for(; i<(len-1); i+=2)
1003 _mm_storeu_pd(c+i, _mm_sub_pd(_mm_loadu_pd(a+i), _mm_loadu_pd(b+i)));
1009 for (j = 0; j < len; j++)
1016 const double_complex* a,
1017 const double_complex* b,
1025 saf_assert(a!=c && b!=c,
"In-place operation is not supported.");
1029#if defined(SAF_USE_INTEL_IPP)
1030 ippsSub_64fc((Ipp64fc*)b, (Ipp64fc*)a, (Ipp64fc*)c, len);
1031#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
1032 vDSP_vsubD((
double*)b, 1, (
double*)a, 1, (
double*)c, 1, 2*(vDSP_Length)len);
1033#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1035#elif defined(SAF_ENABLE_SIMD)
1037 double* sa, *sb, *sc;
1039 sa = (
double*)a; sb = (
double*)b; sc = (
double*)c;
1041# if defined(__AVX512F__)
1042 for(; i<(len2-7); i+=8)
1043 _mm512_storeu_pd(sc+i, _mm512_sub_pd(_mm512_loadu_pd(sa+i), _mm512_loadu_pd(sb+i)));
1045# if defined(__AVX__) && defined(__AVX2__)
1046 for(; i<(len2-3); i+=4)
1047 _mm256_storeu_pd(sc+i, _mm256_sub_pd(_mm256_loadu_pd(sa+i), _mm256_loadu_pd(sb+i)));
1049# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
1050 for(; i<(len2-1); i+=2)
1051 _mm_storeu_pd(sc+i, _mm_sub_pd(_mm_loadu_pd(sa+i), _mm_loadu_pd(sb+i)));
1054 sc[i] = sa[i] - sb[i];
1057 for (j = 0; j < len; j++)
1058 c[j] = ccsub(a[j], b[j]);
1077 saf_assert(c!=NULL,
"'c' can no longer be NULL");
1078 saf_assert(a!=c && b!=c,
"In-place operation is no longer supported.");
1082#if defined(SAF_USE_INTEL_IPP)
1083 ippsMul_32f((Ipp32f*)a, (Ipp32f*)b, (Ipp32f*)c, len);
1084#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
1085 vDSP_vmul(a, 1, b, 1, c, 1, (vDSP_Length)len);
1086#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1088#elif defined(SAF_ENABLE_SIMD)
1091# if defined(__AVX512F__)
1092 for(; i<(len-15); i+=16)
1093 _mm512_storeu_ps(c+i, _mm512_mul_ps(_mm512_loadu_ps(a+i), _mm512_loadu_ps(b+i)));
1095# if defined(__AVX__) && defined(__AVX2__)
1096 for(; i<(len-7); i+=8)
1097 _mm256_storeu_ps(c+i, _mm256_mul_ps(_mm256_loadu_ps(a+i), _mm256_loadu_ps(b+i)));
1099# if defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
1100 for(; i<(len-3); i+=4)
1101 _mm_storeu_ps(c+i, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
1105#elif defined(NDEBUG)
1108 for(i=0; i<len-3; i+=4){
1110 c[i+1] = a[i+1] * b[i+1];
1111 c[i+2] = a[i+2] * b[i+2];
1112 c[i+3] = a[i+3] * b[i+3];
1118 for (j = 0; j < len; j++)
1125 const float_complex* a,
1126 const float_complex* b,
1133 saf_assert(c!=NULL,
"'c' can no longer be NULL");
1134 saf_assert(a!=c && b!=c,
"In-place operation is no longer supported.");
1138#if defined(SAF_USE_INTEL_IPP)
1139 ippsMul_32fc((Ipp32fc*)a, (Ipp32fc*)b, (Ipp32fc*)c, len);
1140#elif (defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)) && 0
1142 vDSP_vmul((
float*)a, 2, (
float*)b+1, 2, (
float*)c+1, 2, (vDSP_Length)len);
1143 vDSP_vma((
float*)a+1, 2, (
float*)b, 2, (
float*)c+1, 2, (
float*)c, 2, (vDSP_Length)len);
1144 cblas_scopy(len, (
float*)c, 2, (
float*)c+1, 2);
1146 vDSP_vmmsb((
float*)a, 2, (
float*)b, 2, (
float*)a+1, 2, (
float*)b+1, 2, (
float*)c, 2, (vDSP_Length)len);
1147#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1149#elif defined(SAF_ENABLE_SIMD)
1151 float* sa, *sb, *sc;
1152 sa = (
float*)a; sb = (
float*)b; sc = (
float*)c;
1153# if (defined(__AVX__) && defined(__AVX2__)) || defined(__AVX512F__)
1154 __m256i permute_ri = _mm256_set_epi32(6, 7, 4, 5, 2, 3, 0, 1);
1155 for(i=0; i<(len-3); i+=4){
1157 __m256 src1 = _mm256_moveldup_ps(_mm256_loadu_ps(sa+2*i));
1159 __m256 src2 = _mm256_loadu_ps(sb+2*i);
1161 __m256 tmp1 = _mm256_mul_ps(src1, src2);
1163 __m256 b1 = _mm256_permutevar8x32_ps(src2, permute_ri);
1165 src1 = _mm256_movehdup_ps(_mm256_loadu_ps(sa+2*i));
1167 __m256 tmp2 = _mm256_mul_ps(src1, b1);
1169 _mm256_storeu_ps(sc+2*i, _mm256_addsub_ps(tmp1, tmp2));
1171# elif defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
1172 for(i=0; i<(len-1); i+=2){
1174 __m128 src1 = _mm_moveldup_ps(_mm_loadu_ps(sa+2*i));
1176 __m128 src2 = _mm_loadu_ps(sb+2*i);
1178 __m128 tmp1 = _mm_mul_ps(src1, src2);
1180 __m128 b1 = _mm_shuffle_ps(src2, src2, _MM_SHUFFLE(2, 3, 0, 1));
1182 src1 = _mm_movehdup_ps(_mm_loadu_ps(sa+2*i));
1184 __m128 tmp2 = _mm_mul_ps(src1, b1);
1186 _mm_storeu_ps(sc+2*i, _mm_addsub_ps(tmp1, tmp2));
1190 sc[2*i] = sa[2*i] * sb[2*i] - sa[2*i+1] * sb[2*i+1];
1191 sc[2*i+1] = sa[2*i] * sb[2*i+1] + sa[2*i+1] * sb[2*i];
1193#elif __STDC_VERSION__ >= 199901L && defined(NDEBUG)
1196 for(i=0; i<len-3; i+=4){
1198 c[i+1] = a[i+1] * b[i+1];
1199 c[i+2] = a[i+2] * b[i+2];
1200 c[i+3] = a[i+3] * b[i+3];
1204#elif __STDC_VERSION__ >= 199901L
1206 for (i = 0; i < len; i++)
1210 for (i = 0; i < len; i++)
1211 c[i] = ccmulf(a[i], b[i]);
1228 c[0] = cblas_sdot (len, a, 1, b, 1);
1233 const float_complex* a,
1234 const float_complex* b,
1243 cblas_cdotu_sub(len, a, 1, b, 1, c);
1246 cblas_cdotc_sub(len, a, 1, b, 1, c);
1264#if defined(SAF_USE_INTEL_IPP)
1266 ippsMulC_32f_I((Ipp32f)s[0], (Ipp32f*)a, len);
1268 ippsMulC_32f((Ipp32f*)a, (Ipp32f)s[0], (Ipp32f*)c, len);
1269#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
1271 cblas_sscal(len, s[0], a, 1);
1273 vDSP_vsmul(a, 1, s, c, 1, (vDSP_Length)len);
1276 cblas_sscal(len, s[0], a, 1);
1279 cblas_sscal(len, s[0], c, 1);
1287 const float_complex* s,
1293 cblas_cscal(len, s, a, 1);
1295 cblas_ccopy(len, a, 1, c, 1);
1296 cblas_cscal(len, s, c, 1);
1308#if defined(SAF_USE_INTEL_IPP)
1310 ippsMulC_64f_I((Ipp64f)s[0], (Ipp64f*)a, len);
1312 ippsMulC_64f((Ipp64f*)a, (Ipp64f)s[0], (Ipp64f*)c, len);
1313#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
1315 cblas_dscal(len, s[0], a, 1);
1317 vDSP_vsmulD(a, 1, s, c, 1, (vDSP_Length)len);
1320 cblas_dscal(len, s[0], a, 1);
1323 cblas_dscal(len, s[0], c, 1);
1331 const double_complex* s,
1337 cblas_zscal(len, s, a, 1);
1339 cblas_zcopy(len, a, 1, c, 1);
1340 cblas_zscal(len, s, c, 1);
1358 memset(c, 0, len*
sizeof(
float));
1361#if defined(SAF_USE_INTEL_IPP)
1362 ippsDivC_32f((Ipp32f*)a, (Ipp32f)s[0], (Ipp32f*)c, len);
1363#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
1364 vDSP_vsdiv(a, 1, s, c, 1, (vDSP_Length)len);
1366 cblas_scopy(len, a, 1, c, 1);
1367 cblas_sscal(len, 1.0f/s[0], c, 1);
1384#if defined(SAF_USE_INTEL_IPP)
1385 ippsAddC_32f((Ipp32f*)a, (Ipp32f)s[0], (Ipp32f*)c, len);
1386#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
1387 vDSP_vsadd(a, 1, s, c, 1, (vDSP_Length)len);
1388#elif defined(SAF_ENABLE_SIMD)
1390# if defined(__AVX512F__)
1391 __m512 s16 = _mm512_set1_ps(s[0]);
1392 for(i=0; i<(len-15); i+=16)
1393 _mm512_storeu_ps(c+i, _mm512_add_ps(_mm512_loadu_ps(a+i), s16));
1394# elif defined(__AVX__) && defined(__AVX2__)
1395 __m256 s8 = _mm256_set1_ps(s[0]);
1396 for(i=0; i<(len-7); i+=8)
1397 _mm256_storeu_ps(c+i, _mm256_add_ps(_mm256_loadu_ps(a+i), s8));
1398# elif defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
1399 __m128 s4 = _mm_load_ps1(s);
1400 for(i=0; i<(len-3); i+=4)
1401 _mm_storeu_ps(c+i, _mm_add_ps(_mm_loadu_ps(a+i), s4));
1407 for(i=0; i<len; i++)
1425#if defined(SAF_USE_INTEL_IPP)
1426 ippsSubC_32f((Ipp32f*)a, (Ipp32f)s[0], (Ipp32f*)c, len);
1427#elif defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
1430 vDSP_vsadd(a, 1, &inv_s, c, 1, (vDSP_Length)len);
1431#elif defined(SAF_ENABLE_SIMD)
1433# if defined(__AVX512F__)
1434 __m512 s16 = _mm512_set1_ps(s[0]);
1435 for(i=0; i<(len-15); i+=16)
1436 _mm512_storeu_ps(c+i, _mm512_sub_ps(_mm512_loadu_ps(a+i), s16));
1437# elif defined(__AVX__) && defined(__AVX2__)
1438 __m256 s8 = _mm256_set1_ps(s[0]);
1439 for(i=0; i<(len-7); i+=8)
1440 _mm256_storeu_ps(c+i, _mm256_sub_ps(_mm256_loadu_ps(a+i), s8));
1441# elif defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
1442 __m128 s4 = _mm_load_ps1(s);
1443 for(i=0; i<(len-3); i+=4)
1444 _mm_storeu_ps(c+i, _mm_sub_ps(_mm_loadu_ps(a+i), s4));
1450 for(i=0; i<len; i++)
1469#if defined(SAF_USE_APPLE_ACCELERATE_LP64) || defined(SAF_USE_APPLE_ACCELERATE_ILP64)
1471 vDSP_Length* inds_vDSP;
1472 inds_vDSP =
malloc1d(len*
sizeof(vDSP_Length));
1473 for(i=0; i<len; i++)
1474 inds_vDSP[i] = (vDSP_Length)(inds[i] + 1);
1475 vDSP_vgathr(sv, inds_vDSP, 1, cv, 1, (vDSP_Length)len);
1477#elif defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1483 for(i=0; i<len; i++)
1485 cblas_sgthr(len, sv, cv, (
veclib_int*)inds_tmp);
1488#elif defined(NDEBUG)
1490 for(i=0; i<len-3; i+=4){
1491 cv[i] = sv[inds[i]];
1492 cv[i+1] = sv[inds[i+1]];
1493 cv[i+2] = sv[inds[i+2]];
1494 cv[i+3] = sv[inds[i+3]];
1497 cv[i] = sv[inds[i]];
1499 for(i=0; i<len; i++)
1500 cv[i] = sv[inds[i]];
1506 const float_complex* sv,
1513#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1519 for(i=0; i<len; i++)
1521 cblas_cgthr(len, sv, cv, (
veclib_int*)inds_tmp);
1524#elif defined(NDEBUG)
1526 for(i=0; i<len-3; i+=4){
1527 cv[i] = sv[inds[i]];
1528 cv[i+1] = sv[inds[i+1]];
1529 cv[i+2] = sv[inds[i+2]];
1530 cv[i+3] = sv[inds[i+3]];
1533 cv[i] = sv[inds[i]];
1535 for(i=0; i<len; i++)
1536 cv[i] = sv[inds[i]];
1549#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1555 for(i=0; i<len; i++)
1557 cblas_dgthr(len, sv, cv, (
veclib_int*)inds_tmp);
1560#elif defined(NDEBUG)
1562 for(i=0; i<len-3; i+=4){
1563 cv[i] = sv[inds[i]];
1564 cv[i+1] = sv[inds[i+1]];
1565 cv[i+2] = sv[inds[i+2]];
1566 cv[i+3] = sv[inds[i+3]];
1569 cv[i] = sv[inds[i]];
1571 for(i=0; i<len; i++)
1572 cv[i] = sv[inds[i]];
1578 const double_complex* sv,
1585#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1591 for(i=0; i<len; i++)
1593 cblas_zgthr(len, sv, cv, (
veclib_int*)inds_tmp);
1596#elif defined(NDEBUG)
1598 for(i=0; i<len-3; i+=4){
1599 cv[i] = sv[inds[i]];
1600 cv[i+1] = sv[inds[i+1]];
1601 cv[i+2] = sv[inds[i+2]];
1602 cv[i+3] = sv[inds[i+3]];
1605 cv[i] = sv[inds[i]];
1607 for(i=0; i<len; i++)
1608 cv[i] = sv[inds[i]];
1618typedef struct _utility_ssvd_data {
1619 int maxDim1, maxDim2;
1621 float* a, *s, *u, *vt;
1630 h->maxDim1 = maxDim1;
1631 h->maxDim2 = maxDim2;
1632 h->currentWorkSize = 0;
1633 h->a =
malloc1d(maxDim1*maxDim2*
sizeof(
float));
1635 h->u =
malloc1d(maxDim1*maxDim1*
sizeof(
float));
1636 h->vt =
malloc1d(maxDim2*maxDim2*
sizeof(
float));
1674 m = dim1; n = dim2; lda = dim1; ldu = dim1; ldvt = dim2;
1682 saf_assert(dim1<=h->maxDim1,
"dim1 exceeds the maximum length specified");
1683 saf_assert(dim2<=h->maxDim2,
"dim2 exceeds the maximum length specified");
1688 for(i=0; i<dim1; i++)
1689 for(j=0; j<dim2; j++)
1690 h->a[j*dim1+i] = A[i*dim2 +j];
1694#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
1695 sgesvd_(
"A",
"A", &m, &n, h->a, &lda, h->s, h->u, &ldu, h->vt, &ldvt, &wkopt, &lwork, &info );
1696#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
1698#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
1699 info = LAPACKE_sgesvd_work(CblasColMajor,
'A',
'A', m, n, h->a, lda, h->s, h->u, ldu, h->vt, ldvt, &wkopt, lwork);
1702 if(lwork>h->currentWorkSize){
1703 h->currentWorkSize = lwork;
1704 h->work =
realloc1d(h->work, h->currentWorkSize*
sizeof(
float));
1708#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
1709 sgesvd_(
"A",
"A", &m, &n, h->a, &lda, h->s, h->u, &ldu, h->vt, &ldvt, h->work, &lwork, &info );
1710#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
1712#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
1713 info = LAPACKE_sgesvd_work(CblasColMajor,
'A',
'A', m, n, h->a, lda, h->s, h->u, ldu, h->vt, ldvt, h->work, lwork);
1719 memset(S, 0, dim1*dim2*
sizeof(
float));
1721 memset(U, 0, dim1*dim1*
sizeof(
float));
1723 memset(V, 0, dim2*dim2*
sizeof(
float));
1725 memset(sing, 0,
SAF_MIN(dim1, dim2)*
sizeof(
float));
1730 saf_print_warning(
"Could not compute SVD in utility_ssvd(). Output matrices/vectors have been zeroed.");
1736 memset(S, 0, dim1*dim2*
sizeof(
float));
1738 for(i=0; i<
SAF_MIN(dim1, dim2); i++)
1739 S[i*dim2+i] = h->s[i];
1744 for(i=0; i<dim1; i++)
1745 for(j=0; j<dim1; j++)
1746 U[i*dim1+j] = h->u[j*dim1+i];
1750 for(i=0; i<dim2; i++)
1751 for(j=0; j<dim2; j++)
1752 V[i*dim2+j] = h->vt[i*dim2+j];
1754 for(i=0; i<
SAF_MIN(dim1, dim2); i++)
1763typedef struct _utility_csvd_data {
1764 int maxDim1, maxDim2;
1766 float_complex* a, *u, *vt, *work;
1775 h->maxDim1 = maxDim1;
1776 h->maxDim2 = maxDim2;
1777 h->currentWorkSize = 0;
1778 h->a =
malloc1d(maxDim1*maxDim2*
sizeof(float_complex));
1780 h->u =
malloc1d(maxDim1*maxDim1*
sizeof(float_complex));
1781 h->vt =
malloc1d(maxDim2*maxDim2*
sizeof(float_complex));
1807 const float_complex* A,
1819 float_complex wkopt;
1820#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1821 const MKL_Complex8 calpha = {1.0f, 0.0f};
1826 m = dim1; n = dim2; lda = dim1; ldu = dim1; ldvt = dim2;
1834 saf_assert(dim1<=h->maxDim1,
"dim1 exceeds the maximum length specified");
1835 saf_assert(dim2<=h->maxDim2,
"dim2 exceeds the maximum length specified");
1840#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1843 for(i=0; i<dim1; i++)
1844 for(j=0; j<dim2; j++)
1845 h->a[j*dim1+i] = A[i*dim2+j];
1850#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
1853#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
1855#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
1856 info = LAPACKE_cgesvd_work(CblasColMajor,
'A',
'A', m, n, (
veclib_float_complex*)h->a, lda, h->s, (
veclib_float_complex*)h->u, ldu,
1860 if(lwork>h->currentWorkSize){
1861 h->currentWorkSize = lwork;
1862 h->work =
realloc1d(h->work, h->currentWorkSize*
sizeof(float_complex));
1866#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
1867 cgesvd_(
"A",
"A", &m, &n, (
veclib_float_complex*)h->a, &lda, h->s, (
veclib_float_complex*)h->u, &ldu, (
veclib_float_complex*)h->vt, &ldvt,
1869#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
1871#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
1872 info = LAPACKE_cgesvd_work(CblasColMajor,
'A',
'A', m, n, (
veclib_float_complex*)h->a, lda, h->s, (
veclib_float_complex*)h->u, ldu, (
veclib_float_complex*)h->vt, ldvt,
1879 memset(S, 0, dim1*dim2*
sizeof(float_complex));
1881 memset(U, 0, dim1*dim1*
sizeof(float_complex));
1883 memset(V, 0, dim2*dim2*
sizeof(float_complex));
1885 memset(sing, 0,
SAF_MIN(dim1, dim2)*
sizeof(float_complex));
1890 saf_print_warning(
"Could not compute SVD in utility_csvd(). Output matrices/vectors have been zeroed.");
1897 memset(S, 0, dim1*dim2*
sizeof(float_complex));
1898 cblas_scopy(
SAF_MIN(dim1, dim2), h->s, 1, (
float*)S, 2*(dim2+1));
1903#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
1906 for(i=0; i<dim1; i++)
1907 for(j=0; j<dim1; j++)
1908 U[i*dim1+j] = h->u[j*dim1+i];
1914 cblas_ccopy(dim2*dim2, h->vt, 1, V, 1);
1915 cblas_sscal(dim2*dim2, -1.0f, ((
float*)V)+1, 2);
1919 cblas_scopy(
SAF_MIN(dim1, dim2), h->s, 1, sing, 1);
1932typedef struct _utility_sseig_data {
1946 h->currentWorkSize = 0;
1947 h->w =
malloc1d(maxDim*
sizeof(
float));
1948 h->a =
malloc1d(maxDim*maxDim*
sizeof(
float));
1992 saf_assert(dim<=h->maxDim,
"dim exceeds the maximum length specified");
1997 for(i=0; i<dim; i++)
1998 for(j=0; j<dim; j++)
1999 h->a[i*dim+j] = A[j*dim+i];
2003#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2004 ssyev_(
"Vectors",
"Upper", &n, h->a, &lda, h->w, &wkopt, &lwork, &info );
2005#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2007#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2008 info = LAPACKE_ssyev_work(CblasColMajor,
'V',
'U', n, h->a, lda, h->w, &wkopt, lwork);
2011 if(lwork>h->currentWorkSize){
2012 h->currentWorkSize = lwork;
2013 h->work =
realloc1d(h->work, h->currentWorkSize*
sizeof(
float));
2017#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2018 ssyev_(
"Vectors",
"Upper", &n, h->a, &lda, h->w, h->work, &lwork, &info );
2019#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2021#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2022 info = LAPACKE_ssyev_work(CblasColMajor,
'V',
'U', n, h->a, lda, h->w, h->work, lwork);
2027 memset(D, 0, dim*dim*
sizeof(
float));
2031 memset(V, 0, dim*dim*
sizeof(
float));
2037 saf_print_warning(
"Could not compute EVD in utility_sseig(). Output matrices/vectors have been zeroed.");
2042 for(i=0; i<dim; i++){
2044 for(j=0; j<dim; j++)
2045 V[i*dim+j] = h->a[(dim-j-1)*dim+i];
2047 D[i*dim+i] = h->w[dim-i-1];
2049 eig[i] = h->w[dim-i-1];
2053 for(i=0; i<dim; i++){
2055 for(j=0; j<dim; j++)
2056 V[i*dim+j] = h->a[j*dim+i];
2058 D[i*dim+i] = h->w[i];
2070typedef struct _utility_cseig_data {
2076 float_complex* work;
2085 h->currentWorkSize =
SAF_MAX(1, 2*maxDim-1);
2086 h->rwork =
malloc1d((3*maxDim-2)*
sizeof(
float));
2087 h->w =
malloc1d(maxDim*
sizeof(
float));
2088 h->a =
malloc1d(maxDim*maxDim*
sizeof(float_complex));
2089 h->work =
malloc1d(h->currentWorkSize*
sizeof(float_complex));
2111 const float_complex* A,
2122 float_complex wkopt;
2123#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
2124 const MKL_Complex8 calpha = {1.0f, 0.0f};
2138 saf_assert(dim<=h->maxDim,
"dim exceeds the maximum length specified");
2143#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
2146 for(i=0; i<dim; i++)
2147 for(j=0; j<dim; j++)
2148 h->a[i*dim+j] = A[j*dim+i];
2153#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2155#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2157#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2161 if(lwork>h->currentWorkSize){
2162 h->currentWorkSize = lwork;
2163 h->work =
realloc1d(h->work, h->currentWorkSize*
sizeof(float_complex));
2167#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2169#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2171#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2177 memset(D, 0, dim*dim*
sizeof(float_complex));
2181 memset(V, 0, dim*dim*
sizeof(float_complex));
2187 saf_print_warning(
"Could not compute EVD in utility_cseig(). Output matrices/vectors have been zeroed.");
2193 if(sortDecFLAG && V!=NULL)
2194 for(i=0; i<(int)((float)dim/2.0f); i++)
2195 cblas_cswap(dim, &h->a[i*dim], 1, &h->a[(dim-i-1)*dim], 1);
2198#if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
2201 for(i=0; i<dim; i++)
2202 for(j=0; j<dim; j++)
2203 V[i*dim+j] = h->a[j*dim+i];
2208 for(i=0; i<dim; i++) {
2210 D[i*dim+i] = cmplxf(h->w[dim-i-1], 0.0f);
2212 eig[i] = h->w[dim-i-1];
2216 for(i=0; i<dim; i++){
2218 D[i*dim+i] = cmplxf(h->w[i], 0.0f);
2235typedef struct _utility_ceigmp_data {
2238 float_complex* a, *b, *vl, *vr, *alpha, *beta;
2240 float_complex* work;
2249 h->currentWorkSize = 4*maxDim;
2250 h->rwork =
malloc1d(4*(h->currentWorkSize)*
sizeof(
float));
2251 h->a =
malloc1d(maxDim*maxDim*
sizeof(float_complex));
2252 h->b =
malloc1d(maxDim*maxDim*
sizeof(float_complex));
2253 h->vl =
malloc1d(maxDim*maxDim*
sizeof(float_complex));
2254 h->vr =
malloc1d(maxDim*maxDim*
sizeof(float_complex));
2255 h->alpha =
malloc1d(maxDim*
sizeof(float_complex));
2256 h->beta =
malloc1d(maxDim*
sizeof(float_complex));
2257 h->work =
malloc1d(h->currentWorkSize*
sizeof(float_complex));
2283 const float_complex* A,
2284 const float_complex* B,
2296 n = lda = ldb = ldvl = ldvr = dim;
2304 saf_assert(dim<=h->maxDim,
"dim exceeds the maximum length specified");
2309 for(i=0; i<dim; i++)
2310 for(j=0; j<dim; j++)
2311 h->a[j*dim+i] = A[i*dim+j];
2312 for(i=0; i<dim; i++)
2313 for(j=0; j<dim; j++)
2314 h->b[j*dim+i] = B[i*dim+j];
2317 lwork = h->currentWorkSize;
2318#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2321#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2323#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2324 info = LAPACKE_cggev_work(CblasColMajor,
'V',
'V', n, (
veclib_float_complex*)h->a, lda, (
veclib_float_complex*)h->b, ldb, (
veclib_float_complex*)h->alpha, (
veclib_float_complex*)h->beta,
2329 memset(D, 0, dim*dim*
sizeof(float_complex));
2334 memset(VL, 0, dim*dim*
sizeof(float_complex));
2336 memset(VR, 0, dim*dim*
sizeof(float_complex));
2342 saf_print_warning(
"Could not compute EVD in utility_ceigmp(). Output matrices/vectors have been zeroed.");
2348 for(i=0; i<dim; i++)
2349 D[i*dim+i] = ccdivf(h->alpha[i],h->beta[i]);
2352 for(i=0; i<dim; i++)
2353 for(j=0; j<dim; j++)
2354 VL[i*dim+j] = h->vl[j*dim+i];
2356 for(i=0; i<dim; i++)
2357 for(j=0; j<dim; j++)
2358 VR[i*dim+j] = h->vr[j*dim+i];
2366typedef struct _utility_zeigmp_data {
2369 double_complex* a, *b, *vl, *vr, *alpha, *beta;
2371 double_complex* work;
2380 h->currentWorkSize = 4*maxDim;
2381 h->rwork =
malloc1d(4*(h->currentWorkSize)*
sizeof(
double));
2382 h->a =
malloc1d(maxDim*maxDim*
sizeof(double_complex));
2383 h->b =
malloc1d(maxDim*maxDim*
sizeof(double_complex));
2384 h->vl =
malloc1d(maxDim*maxDim*
sizeof(double_complex));
2385 h->vr =
malloc1d(maxDim*maxDim*
sizeof(double_complex));
2386 h->alpha =
malloc1d(maxDim*
sizeof(double_complex));
2387 h->beta =
malloc1d(maxDim*
sizeof(double_complex));
2388 h->work =
malloc1d(h->currentWorkSize*
sizeof(double_complex));
2414 const double_complex* A,
2415 const double_complex* B,
2427 n = lda = ldb = ldvl = ldvr = dim;
2435 saf_assert(dim<=h->maxDim,
"dim exceeds the maximum length specified");
2440 for(i=0; i<dim; i++)
2441 for(j=0; j<dim; j++)
2442 h->a[j*dim+i] = A[i*dim+j];
2443 for(i=0; i<dim; i++)
2444 for(j=0; j<dim; j++)
2445 h->b[j*dim+i] = B[i*dim+j];
2448 lwork = h->currentWorkSize;
2449#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2452#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2454#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2455 info = LAPACKE_zggev_work(CblasColMajor,
'V',
'V', n, (
veclib_double_complex*)h->a, lda, (
veclib_double_complex*)h->b, ldb, (
veclib_double_complex*)h->alpha, (
veclib_double_complex*)h->beta,
2460 memset(D, 0, dim*dim*
sizeof(double_complex));
2465 memset(VL, 0, dim*dim*
sizeof(double_complex));
2467 memset(VR, 0, dim*dim*
sizeof(double_complex));
2473 saf_print_warning(
"Could not compute EVD in utility_zeigmp(). Output matrices/vectors have been zeroed.");
2479 for(i=0; i<dim; i++)
2480 D[i*dim+i] = ccdiv(h->alpha[i],h->beta[i]);
2483 for(i=0; i<dim; i++)
2484 for(j=0; j<dim; j++)
2485 VL[i*dim+j] = h->vl[j*dim+i];
2487 for(i=0; i<dim; i++)
2488 for(j=0; j<dim; j++)
2489 VR[i*dim+j] = h->vr[j*dim+i];
2502typedef struct _utility_ceig_data {
2505 float_complex *w, *vl, *vr, *a;
2507 float_complex* work;
2516 h->currentWorkSize = 0;
2517 h->rwork =
malloc1d(4*maxDim*
sizeof(
float));
2518 h->w =
malloc1d(maxDim*
sizeof(float_complex));
2519 h->vl =
malloc1d(maxDim*maxDim*
sizeof(float_complex));
2520 h->vr =
malloc1d(maxDim*maxDim*
sizeof(float_complex));
2521 h->a =
malloc1d(maxDim*maxDim*
sizeof(float_complex));
2546 const float_complex* A,
2557 float_complex wkopt;
2559 n = lda = ldvl = ldvr = dim;
2567 saf_assert(dim<=h->maxDim,
"dim exceeds the maximum length specified");
2572 for(i=0; i<dim; i++)
2573 for(j=0; j<dim; j++)
2574 h->a[i*dim+j] = A[j*dim+i];
2578#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2581#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2583#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2588 if(lwork>h->currentWorkSize){
2589 h->currentWorkSize = lwork;
2590 h->work =
realloc1d(h->work, h->currentWorkSize*
sizeof(float_complex));
2594#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2597#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2599#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2606 memset(D, 0, dim*dim*
sizeof(float_complex));
2611 memset(VL, 0, dim*dim*
sizeof(float_complex));
2613 memset(VR, 0, dim*dim*
sizeof(float_complex));
2615 memset(eig, 0, dim*
sizeof(float_complex));
2621 saf_print_warning(
"Could not compute EVD in utility_ceig(). Output matrices/vectors have been zeroed.");
2626 for(i=0; i<dim; i++){
2628 for(j=0; j<dim; j++)
2629 VL[i*dim+j] = h->vl[j*dim+i];
2631 for(j=0; j<dim; j++)
2632 VR[i*dim+j] = h->vr[j*dim+i];
2634 D[i*dim+i] = h->w[i];
2645typedef struct _utility_zeig_data {
2648 double_complex *w, *vl, *vr, *a;
2650 double_complex* work;
2659 h->currentWorkSize = 0;
2660 h->rwork =
malloc1d(4*maxDim*
sizeof(
double));
2661 h->w =
malloc1d(maxDim*
sizeof(double_complex));
2662 h->vl =
malloc1d(maxDim*maxDim*
sizeof(double_complex));
2663 h->vr =
malloc1d(maxDim*maxDim*
sizeof(double_complex));
2664 h->a =
malloc1d(maxDim*maxDim*
sizeof(double_complex));
2689 const double_complex* A,
2700 double_complex wkopt;
2702 n = lda = ldvl = ldvr = dim;
2710 saf_assert(dim<=h->maxDim,
"dim exceeds the maximum length specified");
2715 for(i=0; i<dim; i++)
2716 for(j=0; j<dim; j++)
2717 h->a[i*dim+j] = A[j*dim+i];
2721#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2724#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2726#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2731 if(lwork>h->currentWorkSize){
2732 h->currentWorkSize = lwork;
2733 h->work =
realloc1d(h->work, h->currentWorkSize*
sizeof(double_complex));
2737#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2740#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2742#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2749 memset(D, 0, dim*dim*
sizeof(double_complex));
2754 memset(VL, 0, dim*dim*
sizeof(double_complex));
2756 memset(VR, 0, dim*dim*
sizeof(double_complex));
2758 memset(eig, 0, dim*
sizeof(double_complex));
2764 saf_print_warning(
"Could not compute EVD in utility_zeig(). Output matrices/vectors have been zeroed.");
2769 for(i=0; i<dim; i++){
2771 for(j=0; j<dim; j++)
2772 VL[i*dim+j] = h->vl[j*dim+i];
2774 for(j=0; j<dim; j++)
2775 VR[i*dim+j] = h->vr[j*dim+i];
2777 D[i*dim+i] = h->w[i];
2793typedef struct _utility_sglslv_data {
2807 h->maxNCol = maxNCol;
2810 h->a =
malloc1d(maxDim*maxDim*
sizeof(
float));
2811 h->b =
malloc1d(maxDim*maxNCol*
sizeof(
float));
2840 veclib_int n = dim, nrhs = nCol, lda = dim, ldb = dim, info;
2841#if !defined(SAF_VECLIB_USE_LAPACKE_INTERFACE) && !(defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64))
2851 saf_assert(dim<=h->maxDim,
"dim exceeds the maximum length specified");
2852 saf_assert(nCol<=h->maxNCol,
"nCol exceeds the maximum length specified");
2856#if defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2858 cblas_scopy(dim*dim, A, 1, h->a, 1);
2859 cblas_scopy(dim*nCol, B, 1, h->b, 1);
2862# if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
2863 MKL_Somatcopy(
'R',
'T', dim, dim, 1.0f, A, dim, h->a, dim);
2864 MKL_Somatcopy(
'R',
'T', dim, nCol, 1.0f, B, nCol, h->b, dim);
2866 for(i=0; i<dim; i++)
2867 for(j=0; j<dim; j++)
2868 h->a[i*dim+j] = A[j*dim+i];
2869 for(i=0; i<dim; i++)
2870 for(j=0; j<nCol; j++)
2871 h->b[j*dim+i] = B[i*nCol+j];
2876#ifdef SAF_VECLIB_USE_CLAPACK_INTERFACE
2877 info = clapack_sgesv(CblasColMajor, n, nrhs, h->a, lda, h->IPIV, h->b, ldb);
2878#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2879 info = LAPACKE_sgesv_work(CblasRowMajor, n, nrhs, h->a, lda, h->IPIV, h->b, ldb);
2880#elif defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2881 sgesv_( &n, &nrhs, h->a, &lda, h->IPIV, h->b, &ldb, &info );
2886 memset(X, 0, dim*nCol*
sizeof(
float));
2891 saf_print_warning(
"Could not solve the linear equation in utility_sglslv(). Output matrices/vectors have been zeroed.");
2895#if defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2896 cblas_scopy(dim*nCol, h->b, 1, X, 1);
2899# if defined(SAF_USE_INTEL_MKL_LP64) || defined(SAF_USE_INTEL_MKL_ILP64)
2900 MKL_Somatcopy(
'R',
'T', nCol, dim, 1.0f, h->b, dim, X, nCol);
2902 for(i=0; i<dim; i++)
2903 for(j=0; j<nCol; j++)
2904 X[i*nCol+j] = h->b[j*dim+i];
2914typedef struct _utility_cglslv_data {
2928 h->maxNCol = maxNCol;
2931 h->a =
malloc1d(maxDim*maxDim*
sizeof(float_complex));
2932 h->b =
malloc1d(maxDim*maxNCol*
sizeof(float_complex));
2953 const float_complex* A,
2974 saf_assert(dim<=h->maxDim,
"dim exceeds the maximum length specified");
2975 saf_assert(nCol<=h->maxNCol,
"nCol exceeds the maximum length specified");
2980 for(i=0; i<dim; i++)
2981 for(j=0; j<dim; j++)
2982 h->a[j*dim+i] = A[i*dim+j];
2983 for(i=0; i<dim; i++)
2984 for(j=0; j<nCol; j++)
2985 h->b[j*dim+i] = B[i*nCol+j];
2988#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
2990#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
2992#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
2998 memset(X, 0, dim*nCol*
sizeof(float_complex));
3003 saf_print_warning(
"Could not solve the linear equation in utility_cglslv(). Output matrices/vectors have been zeroed.");
3008 for(i=0; i<dim; i++)
3009 for(j=0; j<nCol; j++)
3010 X[i*nCol+j] = h->b[j*dim+i];
3018typedef struct _utility_dglslv_data {
3032 h->maxNCol = maxNCol;
3034 h->a =
malloc1d(maxDim*maxDim*
sizeof(
double));
3035 h->b =
malloc1d(maxDim*maxNCol*
sizeof(
double));
3077 saf_assert(dim<=h->maxDim,
"dim exceeds the maximum length specified");
3078 saf_assert(nCol<=h->maxNCol,
"nCol exceeds the maximum length specified");
3083 for(i=0; i<dim; i++)
3084 for(j=0; j<dim; j++)
3085 h->a[j*dim+i] = A[i*dim+j];
3086 for(i=0; i<dim; i++)
3087 for(j=0; j<nCol; j++)
3088 h->b[j*dim+i] = B[i*nCol+j];
3091#ifdef SAF_VECLIB_USE_CLAPACK_INTERFACE
3092 info = clapack_dgesv(CblasColMajor, n, nrhs, h->a, lda, h->IPIV, h->b, ldb);
3093#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3094 info = LAPACKE_dgesv_work(CblasColMajor, n, nrhs, h->a, lda, h->IPIV, h->b, ldb);
3095#elif defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3096 dgesv_( &n, &nrhs, h->a, &lda, h->IPIV, h->b, &ldb, &info );
3101 memset(X, 0, dim*nCol*
sizeof(
double));
3106 saf_print_warning(
"Could not solve the linear equation in utility_dglslv(). Output matrices/vectors have been zeroed.");
3111 for(i=0; i<dim; i++)
3112 for(j=0; j<nCol; j++)
3113 X[i*nCol+j] = h->b[j*dim+i];
3121typedef struct _utility_zglslv_data {
3135 h->maxNCol = maxNCol;
3137 h->a =
malloc1d(maxDim*maxDim*
sizeof(double_complex));
3138 h->b =
malloc1d(maxDim*maxNCol*
sizeof(double_complex));
3159 const double_complex* A,
3167 veclib_int i, j, n = dim, nrhs = nCol, lda = dim, ldb = dim, info;
3180 saf_assert(dim<=h->maxDim,
"dim exceeds the maximum length specified");
3181 saf_assert(nCol<=h->maxNCol,
"nCol exceeds the maximum length specified");
3186 for(i=0; i<dim; i++)
3187 for(j=0; j<dim; j++)
3188 h->a[j*dim+i] = A[i*dim+j];
3189 for(i=0; i<dim; i++)
3190 for(j=0; j<nCol; j++)
3191 h->b[j*dim+i] = B[i*nCol+j];
3194#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3196#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3198#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3204 memset(X, 0, dim*nCol*
sizeof(double_complex));
3209 saf_print_warning(
"Could not solve the linear equation in utility_zglslv(). Output matrices/vectors have been zeroed.");
3214 for(i=0; i<dim; i++)
3215 for(j=0; j<nCol; j++)
3216 X[i*nCol+j] = h->b[j*dim+i];
3229typedef struct _utility_sglslvt_data {
3243 h->maxNCol = maxNCol;
3245 h->a =
malloc1d(maxDim*maxDim*
sizeof(
float));
3246 h->b =
malloc1d(maxDim*maxNCol*
sizeof(
float));
3275 veclib_int n = nCol, nrhs = dim, lda = nCol, ldb = nCol, info;
3283 saf_assert(dim<=h->maxDim,
"dim exceeds the maximum length specified");
3284 saf_assert(nCol<=h->maxNCol,
"nCol exceeds the maximum length specified");
3289 cblas_scopy(dim*dim, A, 1, h->a, 1);
3290 cblas_scopy(dim*nCol, B, 1, h->b, 1);
3293#ifdef SAF_VECLIB_USE_CLAPACK_INTERFACE
3294 info = clapack_sgesv(CblasColMajor, n, nrhs, h->b, ldb, h->IPIV, h->a, lda);
3295#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3296 info = LAPACKE_sgesv_work(CblasColMajor, n, nrhs, h->b, ldb, h->IPIV, h->a, lda);
3297#elif defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3299 sgesv_( &n, &nrhs, h->b, &ldb, h->IPIV, h->a, &lda, &info );
3304 memset(X, 0, dim*nCol*
sizeof(
float));
3309 saf_print_warning(
"Could not solve the linear equation in utility_sglslvt(). Output matrices/vectors have been zeroed.");
3313 cblas_scopy(dim*nCol, h->a, 1, X, 1);
3325typedef struct _utility_sslslv_data {
3338 h->maxNCol = maxNCol;
3339 h->a =
malloc1d(maxDim*maxDim*
sizeof(
float));
3340 h->b =
malloc1d(maxDim*maxNCol*
sizeof(
float));
3381 saf_assert(dim<=h->maxDim,
"dim exceeds the maximum length specified");
3382 saf_assert(nCol<=h->maxNCol,
"nCol exceeds the maximum length specified");
3387 for(i=0; i<dim; i++)
3388 for(j=0; j<dim; j++)
3389 h->a[j*dim+i] = A[i*dim+j];
3390 for(i=0; i<dim; i++)
3391 for(j=0; j<nCol; j++)
3392 h->b[j*dim+i] = B[i*nCol+j];
3395#ifdef SAF_VECLIB_USE_CLAPACK_INTERFACE
3396 info = clapack_sposv(CblasColMajor, CblasUpper, n, nrhs, h->a, lda, h->b, ldb);
3397#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3398 info = LAPACKE_sposv_work(CblasColMajor, CblasUpper, n, nrhs, h->a, lda, h->b, ldb);
3399#elif defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3400 sposv_(
"U", &n, &nrhs, h->a, &lda, h->b, &ldb, &info );
3405 memset(X, 0, dim*nCol*
sizeof(
float));
3410 saf_print_warning(
"Could not solve the linear equation in utility_sslslv(). Output matrices/vectors have been zeroed.");
3415 for(i=0; i<dim; i++)
3416 for(j=0; j<nCol; j++)
3417 X[i*nCol+j] = h->b[j*dim+i];
3425typedef struct _utility_cslslv_data {
3438 h->maxNCol = maxNCol;
3439 h->a =
malloc1d(maxDim*maxDim*
sizeof(float_complex));
3440 h->b =
malloc1d(maxDim*maxNCol*
sizeof(float_complex));
3460 const float_complex* A,
3481 saf_assert(dim<=h->maxDim,
"dim exceeds the maximum length specified");
3482 saf_assert(nCol<=h->maxNCol,
"nCol exceeds the maximum length specified");
3487 for(i=0; i<dim; i++)
3488 for(j=0; j<dim; j++)
3489 h->a[j*dim+i] = A[i*dim+j];
3490 for(i=0; i<dim; i++)
3491 for(j=0; j<nCol; j++)
3492 h->b[j*dim+i] = B[i*nCol+j];
3495#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3497#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3499#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3505 memset(X, 0, dim*nCol*
sizeof(float_complex));
3510 saf_print_warning(
"Could not solve the linear equation in utility_cslslv(). Output matrices/vectors have been zeroed.");
3515 for(i=0; i<dim; i++)
3516 for(j=0; j<nCol; j++)
3517 X[i*nCol+j] = h->b[j*dim+i];
3530typedef struct _utility_spinv_data {
3531 int maxDim1, maxDim2;
3533 float* a, *s, *u, *vt, *inva;
3542 h->maxDim1 = maxDim1;
3543 h->maxDim2 = maxDim2;
3544 h->currentWorkSize = 0;
3545 h->a =
malloc1d(maxDim1*maxDim2*
sizeof(
float));
3547 h->u =
malloc1d(maxDim1*maxDim1*
sizeof(
float));
3548 h->vt =
malloc1d(maxDim2*maxDim2*
sizeof(
float));
3549 h->inva =
malloc1d(maxDim1*maxDim2*
sizeof(
float));
3581 veclib_int i, j, m, n, k, lda, ldu, ldvt, ld_inva, info;
3586 m = lda = ldu = dim1;
3596 saf_assert(dim1<=h->maxDim1,
"dim1 exceeds the maximum length specified");
3597 saf_assert(dim2<=h->maxDim2,
"dim2 exceeds the maximum length specified");
3604 h->a[j*m+i] = inM[i*n+j];
3608#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3609 sgesvd_(
"S",
"S", &m, &n, h->a, &lda, h->s, h->u, &ldu, h->vt, &ldvt, &wkopt, &lwork, &info);
3610#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3612#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3613 info = LAPACKE_sgesvd_work(CblasColMajor,
'S',
'S', m, n, h->a, lda, h->s, h->u, ldu, h->vt, ldvt, &wkopt, lwork);
3616 if(lwork>h->currentWorkSize){
3617 h->currentWorkSize = lwork;
3618 h->work =
realloc1d(h->work, h->currentWorkSize*
sizeof(
float));
3622#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3623 sgesvd_(
"S",
"S", &m, &n, h->a, &lda, h->s, h->u, &ldu, h->vt, &ldvt, h->work, &lwork, &info );
3624#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3626#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3627 info = LAPACKE_sgesvd_work(CblasColMajor,
'S',
'S', m, n, h->a, lda, h->s, h->u, ldu, h->vt, ldvt, h->work, lwork);
3631 memset(outM, 0, dim1*dim2*
sizeof(
float));
3636 saf_print_warning(
"Could not compute SVD in utility_spinv(). Output matrices/vectors have been zeroed.");
3641 if(h->s[i] > 1.0e-5f)
3645 cblas_sscal(m, ss, &h->u[i*m], 1);
3648 cblas_sgemm(CblasColMajor, CblasTrans, CblasTrans, n, m, k, 1.0f,
3656 outM[j*m+i] = h->inva[i*n+j];
3665typedef struct _utility_cpinv_data {
3666 int maxDim1, maxDim2;
3668 float_complex* a, *u, *vt, *inva;
3670 float_complex* work;
3678 h->maxDim1 = maxDim1;
3679 h->maxDim2 = maxDim2;
3680 h->currentWorkSize = 0;
3681 h->a =
malloc1d(maxDim1*maxDim2*
sizeof(float_complex));
3683 h->u =
malloc1d(maxDim1*maxDim1*
sizeof(float_complex));
3684 h->vt =
malloc1d(maxDim2*maxDim2*
sizeof(float_complex));
3685 h->inva =
malloc1d(maxDim1*maxDim2*
sizeof(float_complex));
3711 const float_complex* inM,
3718 veclib_int i, j, m, n, k, lda, ldu, ldvt, ld_inva, info;
3719 float_complex ss_cmplx;
3720 const float_complex calpha = cmplxf(1.0f, 0.0f);
const float_complex cbeta = cmplxf(0.0f, 0.0f);
3723 float_complex wkopt;
3725 m = lda = ldu = dim1;
3735 saf_assert(dim1<=h->maxDim1,
"dim1 exceeds the maximum length specified");
3736 saf_assert(dim2<=h->maxDim2,
"dim2 exceeds the maximum length specified");
3741 for(i=0; i<dim1; i++)
3742 for(j=0; j<dim2; j++)
3743 h->a[j*dim1+i] = inM[i*dim2 +j];
3747#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3748 cgesvd_(
"A",
"A", &m, &n, (
veclib_float_complex*)h->a, &lda, h->s, (
veclib_float_complex*)h->u, &ldu, (
veclib_float_complex*)h->vt, &ldvt,
3750#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3752#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3753 info = LAPACKE_cgesvd_work(CblasColMajor,
'S',
'S', m, n, (
veclib_float_complex*)h->a, lda, h->s, (
veclib_float_complex*)h->u, ldu, (
veclib_float_complex*)h->vt, ldvt,
3757 if(lwork>h->currentWorkSize){
3758 h->currentWorkSize = lwork;
3759 h->work =
realloc1d(h->work, h->currentWorkSize*
sizeof(float_complex));
3763#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3764 cgesvd_(
"A",
"A", &m, &n, (
veclib_float_complex*)h->a, &lda, h->s, (
veclib_float_complex*)h->u, &ldu, (
veclib_float_complex*)h->vt, &ldvt,
3766#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3768#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3769 info = LAPACKE_cgesvd_work(CblasColMajor,
'S',
'S', m, n, (
veclib_float_complex*)h->a, lda, h->s, (
veclib_float_complex*)h->u, ldu, (
veclib_float_complex*)h->vt, ldvt,
3774 memset(outM, 0, dim1*dim2*
sizeof(float_complex));
3779 saf_print_warning(
"Could not compute SVD in utility_cpinv(). Output matrices/vectors have been zeroed.");
3784 if(h->s[i] > 1.0e-5f)
3788 ss_cmplx = cmplxf(ss, 0.0f);
3789 cblas_cscal(m, &ss_cmplx, &h->u[i*m], 1);
3792 cblas_cgemm(CblasColMajor, CblasConjTrans, CblasConjTrans, n, m, k, &calpha,
3800 outM[j*m+i] = h->inva[i*n+j];
3809typedef struct _utility_dpinv_data {
3810 int maxDim1, maxDim2;
3812 double* a, *s, *u, *vt, *inva;
3821 h->maxDim1 = maxDim1;
3822 h->maxDim2 = maxDim2;
3823 h->currentWorkSize = 0;
3824 h->a =
malloc1d(maxDim1*maxDim2*
sizeof(
double));
3826 h->u =
malloc1d(maxDim1*maxDim1*
sizeof(
double));
3827 h->vt =
malloc1d(maxDim2*maxDim2*
sizeof(
double));
3828 h->inva =
malloc1d(maxDim1*maxDim2*
sizeof(
double));
3860 veclib_int i, j, m, n, k, lda, ldu, ldvt, ld_inva, info;
3865 m = lda = ldu = dim1;
3875 saf_assert(dim1<=h->maxDim1,
"dim1 exceeds the maximum length specified");
3876 saf_assert(dim2<=h->maxDim2,
"dim2 exceeds the maximum length specified");
3883 h->a[j*m+i] = inM[i*n+j];
3887#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3888 dgesvd_(
"S",
"S", &m, &n, h->a, &lda, h->s, h->u, &ldu, h->vt, &ldvt, &wkopt, &lwork, &info);
3889#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3891#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3892 info = LAPACKE_dgesvd_work(CblasColMajor,
'S',
'S', m, n, h->a, lda, h->s, h->u, ldu, h->vt, ldvt, &wkopt, lwork);
3895 if(lwork>h->currentWorkSize){
3896 h->currentWorkSize = lwork;
3897 h->work =
realloc1d(h->work, h->currentWorkSize*
sizeof(
double));
3901#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
3902 dgesvd_(
"S",
"S", &m, &n, h->a, &lda, h->s, h->u, &ldu, h->vt, &ldvt, h->work, &lwork, &info );
3903#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
3905#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
3906 info = LAPACKE_dgesvd_work(CblasColMajor,
'S',
'S', m, n, h->a, lda, h->s, h->u, ldu, h->vt, ldvt, h->work, lwork);
3910 memset(outM, 0, dim1*dim2*
sizeof(
double));
3915 saf_print_warning(
"Could not compute SVD in utility_dpinv(). Output matrices/vectors have been zeroed.");
3920 if(h->s[i] > 1.0e-9)
3924 cblas_dscal(m, ss, &h->u[i*m], 1);
3927 cblas_dgemm(CblasColMajor, CblasTrans, CblasTrans, n, m, k, 1.0,
3935 outM[j*m+i] = h->inva[i*n+j];
3944typedef struct _utility_zpinv_data {
3945 int maxDim1, maxDim2;
3947 double_complex* a, *u, *vt, *inva;
3949 double_complex* work;
3957 h->maxDim1 = maxDim1;
3958 h->maxDim2 = maxDim2;
3959 h->currentWorkSize = 0;
3960 h->a =
malloc1d(maxDim1*maxDim2*
sizeof(double_complex));
3962 h->u =
malloc1d(maxDim1*maxDim1*
sizeof(double_complex));
3963 h->vt =
malloc1d(maxDim2*maxDim2*
sizeof(double_complex));
3964 h->inva =
malloc1d(maxDim1*maxDim2*
sizeof(double_complex));
3990 const double_complex* inM,
3993 double_complex* outM
3997 veclib_int i, j, m, n, k, lda, ldu, ldvt, info, ld_inva;
3998 double_complex ss_cmplx;
3999 const double_complex calpha = cmplx(1.0, 0.0);
const double_complex cbeta = cmplx(0.0, 0.0);
4002 double_complex wkopt;
4004 m = lda = ldu = dim1;
4014 saf_assert(dim1<=h->maxDim1,
"dim1 exceeds the maximum length specified");
4015 saf_assert(dim2<=h->maxDim2,
"dim2 exceeds the maximum length specified");
4020 for(i=0; i<dim1; i++)
4021 for(j=0; j<dim2; j++)
4022 h->a[j*dim1+i] = inM[i*dim2 +j];
4026#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4027 zgesvd_(
"A",
"A", &m, &n, (
veclib_double_complex*)h->a, &lda, h->s, (
veclib_double_complex*)h->u, &ldu, (
veclib_double_complex*)h->vt, &ldvt,
4029#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4031#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4032 info = LAPACKE_zgesvd_work(CblasColMajor,
'A',
'A', m, n, (
veclib_double_complex*)h->a, lda, h->s, (
veclib_double_complex*)h->u, ldu, (
veclib_double_complex*)h->vt, ldvt,
4036 if(lwork>h->currentWorkSize){
4037 h->currentWorkSize = lwork;
4038 h->work =
realloc1d(h->work, h->currentWorkSize*
sizeof(double_complex));
4042#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4043 zgesvd_(
"A",
"A", &m, &n, (
veclib_double_complex*)h->a, &lda, h->s, (
veclib_double_complex*)h->u, &ldu, (
veclib_double_complex*)h->vt, &ldvt,
4045#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4047#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4048 info = LAPACKE_zgesvd_work(CblasColMajor,
'A',
'A', m, n, (
veclib_double_complex*)h->a, lda, h->s, (
veclib_double_complex*)h->u, ldu, (
veclib_double_complex*)h->vt, ldvt,
4053 memset(outM, 0, dim1*dim2*
sizeof(double_complex));
4058 saf_print_warning(
"Could not compute SVD in utility_zpinv(). Output matrices/vectors have been zeroed.");
4063 if(h->s[i] > 1.0e-5)
4067 ss_cmplx = cmplx(ss, 0.0);
4068 cblas_zscal(m, &ss_cmplx, &h->u[i*m], 1);
4071 cblas_zgemm(CblasColMajor, CblasConjTrans, CblasConjTrans, n, m, k, &calpha,
4079 outM[j*m+i] = h->inva[i*n+j];
4093typedef struct _utility_schol_data {
4104 h->a =
malloc1d(maxDim*maxDim*
sizeof(
float));
4139 saf_assert(dim<=h->maxDim,
"dim exceeds the maximum length specified");
4144 for(i=0; i<dim; i++)
4145 for(j=0; j<dim; j++)
4146 h->a[j*dim+i] = A[i*dim+j];
4149#ifdef SAF_VECLIB_USE_CLAPACK_INTERFACE
4150 info = clapack_spotrf(CblasColMajor, CblasUpper, n, h->a, lda);
4151#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4152 info = LAPACKE_spotrf_work(CblasColMajor, CblasUpper, n, h->a, lda);
4153#elif defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4154 spotrf_(
"U", &n, h->a, &lda, &info );
4159 memset(X, 0, dim*dim*
sizeof(
float));
4165 saf_print_warning(
"Could not compute the Cholesky factorisation in utility_schol(). Output matrices/vectors have been zeroed.");
4170 for(i=0; i<dim; i++)
4171 for(j=0; j<dim; j++)
4172 X[i*dim+j] = j>=i ? h->a[j*dim+i] : 0.0f;
4181typedef struct _utility_cchol_data {
4192 h->a =
malloc1d(maxDim*maxDim*
sizeof(float_complex));
4211 const float_complex* A,
4227 saf_assert(dim<=h->maxDim,
"dim exceeds the maximum length specified");
4232 for(i=0; i<dim; i++)
4233 for(j=0; j<dim; j++)
4234 h->a[j*dim+i] = A[i*dim+j];
4237#if defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4239#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4241#elif defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4247 memset(X, 0, dim*dim*
sizeof(float_complex));
4253 saf_print_warning(
"Could not compute the Cholesky factorisation in utility_cchol(). Output matrices/vectors have been zeroed.");
4258 for(i=0; i<dim; i++)
4259 for(j=0; j<dim; j++)
4260 X[i*dim+j] = j>=i ? h->a[j*dim+i] : cmplxf(0.0f, 0.0f);
4273typedef struct _utility_sdet_data {
4286 h->tmp =
malloc1d(maxN*maxN*
sizeof(
float));
4316 return A[0]*A[3] - A[2]*A[1];
4319 return A[0] * ((A[4]*A[8]) - (A[7]*A[5])) -A[1] * (A[3]
4320 * A[8] - A[6] * A[5]) + A[2] * (A[3] * A[7] - A[6] * A[4]);
4324 A[3] * A[6] * A[9] * A[12] - A[2] * A[7] * A[9] * A[12] -
4325 A[3] * A[5] * A[10] * A[12] + A[1] * A[7] * A[10] * A[12] +
4326 A[2] * A[5] * A[11] * A[12] - A[1] * A[6] * A[11] * A[12] -
4327 A[3] * A[6] * A[8] * A[13] + A[2] * A[7] * A[8] * A[13] +
4328 A[3] * A[4] * A[10] * A[13] - A[0] * A[7] * A[10] * A[13] -
4329 A[2] * A[4] * A[11] * A[13] + A[0] * A[6] * A[11] * A[13] +
4330 A[3] * A[5] * A[8] * A[14] - A[1] * A[7] * A[8] * A[14] -
4331 A[3] * A[4] * A[9] * A[14] + A[0] * A[7] * A[9] * A[14] +
4332 A[1] * A[4] * A[11] * A[14] - A[0] * A[5] * A[11] * A[14] -
4333 A[2] * A[5] * A[8] * A[15] + A[1] * A[6] * A[8] * A[15] +
4334 A[2] * A[4] * A[9] * A[15] - A[0] * A[6] * A[9] * A[15] -
4335 A[1] * A[4] * A[10] * A[15] + A[0] * A[5] * A[10] * A[15];
4345 saf_assert(N<=h->maxN,
"N exceeds the maximum length specified");
4352 h->tmp[j*N+i] = A[i*N+j];
4354#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4356#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4357 INFO = clapack_sgetrf(CblasColMajor, N, N, h->tmp, N, h->IPIV);
4358#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4359 INFO = LAPACKE_sgetrf_work(CblasColMajor, N, N, h->tmp, N, h->IPIV);
4367 saf_print_warning(
"Unable to compute determinant of input matrix. The function utility_sdet() returned 0. ");
4372 for( i = 0; i < N; i++ ) {
4374 if(h->IPIV[i] != i+1)
4387typedef struct _utility_ddet_data {
4391 double *tmp, *TAU, *WORK;
4400 h->currentWorkSize = 0;
4402 h->tmp =
malloc1d(maxN*maxN*
sizeof(
double));
4403 h->TAU =
malloc1d(maxN*
sizeof(
double));
4436 return A[0]*A[3] - A[2]*A[1];
4439 return A[0] * ((A[4]*A[8]) - (A[7]*A[5])) -A[1] * (A[3]
4440 * A[8] - A[6] * A[5]) + A[2] * (A[3] * A[7] - A[6] * A[4]);
4444 A[3] * A[6] * A[9] * A[12] - A[2] * A[7] * A[9] * A[12] -
4445 A[3] * A[5] * A[10] * A[12] + A[1] * A[7] * A[10] * A[12] +
4446 A[2] * A[5] * A[11] * A[12] - A[1] * A[6] * A[11] * A[12] -
4447 A[3] * A[6] * A[8] * A[13] + A[2] * A[7] * A[8] * A[13] +
4448 A[3] * A[4] * A[10] * A[13] - A[0] * A[7] * A[10] * A[13] -
4449 A[2] * A[4] * A[11] * A[13] + A[0] * A[6] * A[11] * A[13] +
4450 A[3] * A[5] * A[8] * A[14] - A[1] * A[7] * A[8] * A[14] -
4451 A[3] * A[4] * A[9] * A[14] + A[0] * A[7] * A[9] * A[14] +
4452 A[1] * A[4] * A[11] * A[14] - A[0] * A[5] * A[11] * A[14] -
4453 A[2] * A[5] * A[8] * A[15] + A[1] * A[6] * A[8] * A[15] +
4454 A[2] * A[4] * A[9] * A[15] - A[0] * A[6] * A[9] * A[15] -
4455 A[1] * A[4] * A[10] * A[15] + A[0] * A[5] * A[10] * A[15];
4465 saf_assert(N<=h->maxN,
"N exceeds the maximum length specified");
4472 h->tmp[j*N+i] = A[i*N+j];
4476#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4478 dgeqrf_(&_N, &_N, h->tmp, &_N, h->TAU, &lwork2, &LWORK, &INFO);
4479#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4481#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4482 INFO = LAPACKE_dgeqrf_work(CblasColMajor, N, N, h->tmp, N, h->TAU, &lwork2, LWORK);
4485 if(lwork3>h->currentWorkSize){
4486 h->currentWorkSize = lwork3;
4487 h->WORK =
realloc1d(h->WORK, h->currentWorkSize*
sizeof(
double));
4491#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4492 dgeqrf_(&_N, &_N, h->tmp, &_N, h->TAU, h->WORK, &lwork3, &INFO);
4493#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4495#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4496 INFO = LAPACKE_dgeqrf_work(CblasColMajor, N, N, h->tmp, N, h->TAU, h->WORK, lwork3);
4504 saf_print_warning(
"Unable to compute determinant of input matrix. The function utility_ddet() returned 0. ");
4529typedef struct _utility_sinv_data {
4542 h->tmp =
malloc1d(maxN*maxN*
sizeof(
float));
4543 h->WORK =
malloc1d(maxN*maxN*
sizeof(
float));
4583 saf_assert(N<=h->maxN,
"N exceeds the maximum length specified");
4590 h->tmp[j*N+i] = A[i*N+j];
4592#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4595#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4596 INFO = clapack_sgetrf(CblasColMajor, N_, N_, h->tmp, N_, h->IPIV);
4597 INFO = clapack_sgetri(CblasColMajor, N_, h->tmp, N_, h->IPIV);
4598#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4599 INFO = LAPACKE_sgetrf_work(CblasColMajor, N_, N_, h->tmp, N_, h->IPIV);
4600 INFO = LAPACKE_sgetri_work(CblasColMajor, N_, h->tmp, N_, h->IPIV, h->WORK, LWORK);
4604 memset(B, 0, N*N*
sizeof(
float));
4609 saf_print_warning(
"Unable to compute the inverse of input matrix. The function utility_sinv() returned a matrix of zeros. ");
4616 B[j*N+i] = h->tmp[i*N+j];
4625typedef struct _utility_dinv_data {
4638 h->tmp =
malloc1d(maxN*maxN*
sizeof(
double));
4639 h->WORK =
malloc1d(maxN*maxN*
sizeof(
double));
4679 saf_assert(N<=h->maxN,
"N exceeds the maximum length specified");
4686 h->tmp[j*N+i] = A[i*N+j];
4688#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4691#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4692 INFO = clapack_dgetrf(CblasColMajor, N_, N_, h->tmp, N_, h->IPIV);
4693 INFO = clapack_dgetri(CblasColMajor, N_, h->tmp, N_, h->IPIV);
4694#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4695 INFO = LAPACKE_dgetrf_work(CblasColMajor, N_, N_, h->tmp, N_, h->IPIV);
4696 INFO = LAPACKE_dgetri_work(CblasColMajor, N_, h->tmp, N_, h->IPIV, h->WORK, LWORK);
4700 memset(B, 0, N*N*
sizeof(
double));
4705 saf_print_warning(
"Unable to compute the inverse of input matrix. The function utility_dinv() returned a matrix of zeros. ");
4712 B[j*N+i] = h->tmp[i*N+j];
4721typedef struct _utility_cinv_data {
4724 float_complex *WORK, *tmp;
4734 h->tmp =
malloc1d(maxN*maxN*
sizeof(float_complex));
4735 h->WORK =
malloc1d(maxN*maxN*
sizeof(float_complex));
4775 saf_assert(N<=h->maxN,
"N exceeds the maximum length specified");
4782 h->tmp[j*N+i] = A[i*N+j];
4784#if defined(SAF_VECLIB_USE_LAPACK_FORTRAN_INTERFACE)
4787#elif defined(SAF_VECLIB_USE_CLAPACK_INTERFACE)
4790#elif defined(SAF_VECLIB_USE_LAPACKE_INTERFACE)
4796 memset(B, 0, N*N*
sizeof(float_complex));
4801 saf_print_warning(
"Unable to compute the inverse of input matrix. The function utility_cinv() returned a matrix of zeros. ");
4808 B[j*N+i] = h->tmp[i*N+j];
void utility_dimaxv(const double *a, const int len, int *index)
Double-precision, index of maximum absolute value in a vector, i.e.
#define saf_print_error(message)
Macro to print a error message along with the filename and line number.
void utility_spinv_create(void **const phWork, int maxDim1, int maxDim2)
(Optional) Pre-allocate the working struct used by utility_spinv()
void utility_dinv(void *const hWork, double *A, double *B, const int N)
Matrix inversion: double precision, i.e.
double utility_ddet(void *const hWork, double *A, int N)
Determinant of a Matrix, double precision, i,e.
void utility_dinv_create(void **const phWork, int maxN)
(Optional) Pre-allocate the working struct used by utility_dinv()
void utility_zvvsub(const double_complex *a, const double_complex *b, const int len, double_complex *c)
Double-precision, complex, vector-vector subtraction, i.e.
void utility_ceigmp(void *const hWork, const float_complex *A, const float_complex *B, const int dim, float_complex *VL, float_complex *VR, float_complex *D)
Computes eigenvalues of a matrix pair using the QZ method, single precision complex,...
#define saf_assert(x, message)
Macro to make an assertion, along with a string explaining its purpose.
void utility_cpinv_create(void **const phWork, int maxDim1, int maxDim2)
(Optional) Pre-allocate the working struct used by utility_cpinv()
void utility_zsv2cv_inds(const double_complex *sv, const int *inds, const int len, double_complex *cv)
Double-precision complex, sparse-vector to compressed vector given known indices.
void utility_cglslv_destroy(void **const phWork)
De-allocate the working struct used by utility_cglslv()
void utility_cglslv(void *const hWork, const float_complex *A, const int dim, float_complex *B, int nCol, float_complex *X)
General linear solver: single precision complex, i.e.
void utility_ddet_destroy(void **const phWork)
De-allocate the working struct used by utility_ddet()
void utility_zeig_destroy(void **const phWork)
De-allocate the working struct used by utility_zeig()
void utility_cvvmul(const float_complex *a, const float_complex *b, const int len, float_complex *c)
Single-precision, complex, element-wise vector-vector multiplication i.e.
void utility_svvmul(const float *a, const float *b, const int len, float *c)
Single-precision, element-wise vector-vector multiplication i.e.
void utility_ceig_destroy(void **const phWork)
De-allocate the working struct used by utility_ceig()
void utility_cinv(void *const hWork, float_complex *A, float_complex *B, const int N)
Matrix inversion: double precision complex, i.e.
void utility_dvsmul(double *a, const double *s, const int len, double *c)
Double-precision, multiplies each element in vector 'a' with a scalar 's', i.e.
void utility_csv2cv_inds(const float_complex *sv, const int *inds, const int len, float_complex *cv)
Single-precision complex, sparse-vector to compressed vector given known indices.
void utility_svmod(const float *a, const float *b, const int len, float *c)
Single-precision, modulus of vector elements, i.e.
void utility_ziminv(const double_complex *a, const int len, int *index)
Double-precision, complex, index of maximum absolute value in a vector, i.e.
void utility_dvvsub(const double *a, const double *b, const int len, double *c)
Double-precision, vector-vector subtraction, i.e.
void utility_dsv2cv_inds(const double *sv, const int *inds, const int len, double *cv)
Double-precision, sparse-vector to compressed vector given known indices i.e.
#define SAF_FALSE
Boolean false.
void utility_csvd_destroy(void **const phWork)
De-allocate the working struct used by utility_csvd()
#define SAF_MAX(a, b)
Returns the maximum of the two values.
void utility_ceigmp_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_ceigmp()
void utility_dglslv(void *const hWork, const double *A, const int dim, double *B, int nCol, double *X)
General linear solver: double precision, i.e.
void utility_zvsmul(double_complex *a, const double_complex *s, const int len, double_complex *c)
Double-precision, complex, multiplies each element in vector 'a' with a scalar 's',...
void utility_dglslv_destroy(void **const phWork)
De-allocate the working struct used by utility_dglslv()
void utility_schol_destroy(void **const phWork)
De-allocate the working struct used by utility_schol()
void utility_zeig_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_zeig()
void utility_svsadd(float *a, const float *s, const int len, float *c)
Single-precision, adds each element in vector 'a' with a scalar 's', i.e.
void utility_cseig_destroy(void **const phWork)
De-allocate the working struct used by utility_cseig()
void utility_ciminv(const float_complex *a, const int len, int *index)
Single-precision, complex, index of maximum absolute value in a vector, i.e.
void utility_dvvadd(const double *a, const double *b, const int len, double *c)
Double-precision, vector-vector addition, i.e.
void utility_svssub(float *a, const float *s, const int len, float *c)
Single-precision, subtracts each element in vector 'a' with a scalar 's', i.e.
void utility_svvsub(const float *a, const float *b, const int len, float *c)
Single-precision, vector-vector subtraction, i.e.
void utility_ssvd_create(void **const phWork, int maxDim1, int maxDim2)
(Optional) Pre-allocate the working struct used by utility_ssvd()
void utility_sglslvt(void *const hWork, const float *A, const int dim, float *B, int nCol, float *X)
General linear solver (the other way): single precision, i.e.
void utility_ssvd(void *const hWork, const float *A, const int dim1, const int dim2, float *U, float *S, float *V, float *sing)
Singular value decomposition: single precision, i.e.
void utility_zimaxv(const double_complex *a, const int len, int *index)
Double-precision, complex, index of maximum absolute value in a vector, i.e.
void utility_zeigmp(void *const hWork, const double_complex *A, const double_complex *B, const int dim, double_complex *VL, double_complex *VR, double_complex *D)
Computes eigenvalues of a matrix pair using the QZ method, double precision complex,...
void utility_cinv_destroy(void **const phWork)
De-allocate the working struct used by utility_cinv()
void utility_ceig(void *const hWork, const float_complex *A, const int dim, float_complex *VL, float_complex *VR, float_complex *D, float_complex *eig)
Eigenvalue decomposition of a NON-SYMMETRIC matrix: single precision complex, i.e.
void utility_cvvadd(const float_complex *a, const float_complex *b, const int len, float_complex *c)
Single-precision, complex, vector-vector addition, i.e.
void utility_sinv_create(void **const phWork, int maxN)
(Optional) Pre-allocate the working struct used by utility_sinv()
void utility_zglslv(void *const hWork, const double_complex *A, const int dim, double_complex *B, int nCol, double_complex *X)
General linear solver: double precision complex, i.e.
void utility_sglslv(void *const hWork, const float *A, const int dim, float *B, int nCol, float *X)
General linear solver: single precision, i.e.
CONJ_FLAG
Whether a vector should be conjugated or not (e.g.
void utility_svvcopy(const float *a, const int len, float *c)
Single-precision, vector-vector copy, i.e.
void utility_cimaxv(const float_complex *a, const int len, int *index)
Single-precision, complex, index of maximum absolute value in a vector, i.e.
#define SAF_TRUE
Boolean true.
void utility_spinv(void *const hWork, const float *inM, const int dim1, const int dim2, float *outM)
General matrix pseudo-inverse (the svd way): single precision, i.e.
void utility_dpinv_create(void **const phWork, int maxDim1, int maxDim2)
(Optional) Pre-allocate the working struct used by utility_dpinv()
void utility_cpinv(void *const hWork, const float_complex *inM, const int dim1, const int dim2, float_complex *outM)
General matrix pseudo-inverse (the svd way): single precision complex, i.e.
void utility_cslslv_create(void **const phWork, int maxDim, int maxNCol)
(Optional) Pre-allocate the working struct used by utility_cslslv()
void utility_sseig_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_sseig()
void utility_sinv(void *const hWork, float *A, float *B, const int N)
Matrix inversion: single precision, i.e.
void utility_zglslv_destroy(void **const phWork)
De-allocate the working struct used by utility_zglslv()
void utility_cpinv_destroy(void **const phWork)
De-allocate the working struct used by utility_cpinv()
void utility_sglslvt_destroy(void **const phWork)
De-allocate the working struct used by utility_sglslvt()
void utility_dpinv_destroy(void **const phWork)
De-allocate the working struct used by utility_dpinv()
void utility_cseig(void *const hWork, const float_complex *A, const int dim, int sortDecFLAG, float_complex *V, float_complex *D, float *eig)
Eigenvalue decomposition of a SYMMETRIC/HERMITION matrix: single precision complex,...
void utility_csvd(void *const hWork, const float_complex *A, const int dim1, const int dim2, float_complex *U, float_complex *S, float_complex *V, float *sing)
Singular value decomposition: single precision complex, i.e.
void utility_dpinv(void *const hWork, const double *inM, const int dim1, const int dim2, double *outM)
General matrix pseudo-inverse (the svd way): double precision, i.e.
void utility_cseig_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_cseig()
void utility_diminv(const double *a, const int len, int *index)
Double-precision, index of minimum absolute value in a vector, i.e.
void utility_sslslv_destroy(void **const phWork)
De-allocate the working struct used by utility_sslslv()
void utility_schol(void *const hWork, const float *A, const int dim, float *X)
Cholesky factorisation of a symmetric matrix positive-definate matrix: single precision,...
#define SAF_MIN(a, b)
Returns the minimum of the two values.
void utility_zeig(void *const hWork, const double_complex *A, const int dim, double_complex *VL, double_complex *VR, double_complex *D, double_complex *eig)
Eigenvalue decomposition of a NON-SYMMETRIC matrix: double precision complex, i.e.
void utility_cvabs(const float_complex *a, const int len, float *c)
Single-precision, complex, absolute values of vector elements, i.e.
void utility_cvvdot(const float_complex *a, const float_complex *b, const int len, CONJ_FLAG flag, float_complex *c)
Single-precision, complex, vector-vector dot product, i.e.
void utility_sglslv_create(void **const phWork, int maxDim, int maxNCol)
(Optional) Pre-allocate the working struct used by utility_sglslv()
void utility_cinv_create(void **const phWork, int maxN)
(Optional) Pre-allocate the working struct used by utility_cinv()
void utility_cvvsub(const float_complex *a, const float_complex *b, const int len, float_complex *c)
Single-precision, complex, vector-vector subtraction, i.e.
void utility_zpinv(void *const hWork, const double_complex *inM, const int dim1, const int dim2, double_complex *outM)
General matrix pseudo-inverse (the svd way): double precision complex, i.e.
void utility_zpinv_create(void **const phWork, int maxDim1, int maxDim2)
(Optional) Pre-allocate the working struct used by utility_zpinv()
void utility_sseig(void *const hWork, const float *A, const int dim, int sortDecFLAG, float *V, float *D, float *eig)
Eigenvalue decomposition of a SYMMETRIC matrix: single precision, i.e.
void utility_dinv_destroy(void **const phWork)
De-allocate the working struct used by utility_dinv()
void utility_cchol(void *const hWork, const float_complex *A, const int dim, float_complex *X)
Cholesky factorisation of a hermitian positive-definate matrix: single precision complex,...
#define saf_print_warning(message)
Macro to print a warning message along with the filename and line number.
void utility_cchol_destroy(void **const phWork)
De-allocate the working struct used by utility_cchol()
void utility_zvvcopy(const double_complex *a, const int len, double_complex *c)
double-precision, complex, vector-vector copy, i.e.
float utility_sdet(void *const hWork, float *A, int N)
Determinant of a Matrix, single precision, i,e.
void utility_zeigmp_destroy(void **const phWork)
De-allocate the working struct used by utility_zeigmp()
void utility_schol_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_schol()
void utility_sslslv_create(void **const phWork, int maxDim, int maxNCol)
(Optional) Pre-allocate the working struct used by utility_sslslv()
void utility_zvvadd(const double_complex *a, const double_complex *b, const int len, double_complex *c)
Double-precision, complex, vector-vector addition, i.e.
void utility_ceigmp_destroy(void **const phWork)
De-allocate the working struct used by utility_ceigmp()
void utility_csvd_create(void **const phWork, int maxDim1, int maxDim2)
(Optional) Pre-allocate the working struct used by utility_csvd()
void utility_cvvcopy(const float_complex *a, const int len, float_complex *c)
Single-precision, complex, vector-vector copy, i.e.
void utility_svvadd(const float *a, const float *b, const int len, float *c)
Single-precision, vector-vector addition, i.e.
void utility_svrecip(const float *a, const int len, float *c)
Single-precision, vector-reciprocal/inversion, i.e.
void utility_sglslvt_create(void **const phWork, int maxDim, int maxNCol)
(Optional) Pre-allocate the working struct used by utility_sglslvt()
void utility_spinv_destroy(void **const phWork)
De-allocate the working struct used by utility_spinv()
void utility_sslslv(void *const hWork, const float *A, const int dim, float *B, int nCol, float *X)
Linear solver for SYMMETRIC positive-definate 'A': single precision, i.e.
void utility_ddet_create(void **const phWork, int maxN)
(Optional) Pre-allocate the working struct used by utility_ddet()
void utility_sdet_create(void **const phWork, int maxN)
(Optional) Pre-allocate the working struct used by utility_sdet()
void utility_cvsmul(float_complex *a, const float_complex *s, const int len, float_complex *c)
Single-precision, complex, multiplies each element in vector 'a' with a scalar 's',...
void utility_cglslv_create(void **const phWork, int maxDim, int maxNCol)
(Optional) Pre-allocate the working struct used by utility_cglslv()
void utility_ceig_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_ceig()
void utility_dglslv_create(void **const phWork, int maxDim, int maxNCol)
(Optional) Pre-allocate the working struct used by utility_dglslv()
void utility_svvdot(const float *a, const float *b, const int len, float *c)
Single-precision, vector-vector dot product, i.e.
void utility_svsdiv(const float *a, const float *s, const int len, float *c)
Single-precision, divides each element in vector 'a' with a scalar 's', i.e.
void utility_cslslv_destroy(void **const phWork)
De-allocate the working struct used by utility_cslslv()
void utility_zglslv_create(void **const phWork, int maxDim, int maxNCol)
(Optional) Pre-allocate the working struct used by utility_zglslv()
void utility_cvconj(const float_complex *a, const int len, float_complex *c)
Single-precision, complex, vector-conjugate, i.e.
void utility_dvvcopy(const double *a, const int len, double *c)
double-precision, vector-vector copy, i.e.
void utility_sseig_destroy(void **const phWork)
De-allocate the working struct used by utility_sseig()
void utility_svabs(const float *a, const int len, float *c)
Single-precision, absolute values of vector elements, i.e.
void utility_svsmul(float *a, const float *s, const int len, float *c)
Single-precision, multiplies each element in vector 'a' with a scalar 's', i.e.
void utility_zvconj(const double_complex *a, const int len, double_complex *c)
Double-precision, complex, vector-conjugate, i.e.
void utility_sinv_destroy(void **const phWork)
De-allocate the working struct used by utility_sinv()
void utility_cchol_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_cchol()
void utility_sdet_destroy(void **const phWork)
De-allocate the working struct used by utility_sdet()
void utility_siminv(const float *a, const int len, int *index)
Single-precision, index of minimum absolute value in a vector, i.e.
void utility_zeigmp_create(void **const phWork, int maxDim)
(Optional) Pre-allocate the working struct used by utility_zeigmp()
void utility_ssv2cv_inds(const float *sv, const int *inds, const int len, float *cv)
Single-precision, sparse-vector to compressed vector given known indices i.e.
void utility_sglslv_destroy(void **const phWork)
De-allocate the working struct used by utility_sglslv()
void utility_simaxv(const float *a, const int len, int *index)
Single-precision, index of maximum absolute value in a vector, i.e.
void utility_ssvd_destroy(void **const phWork)
De-allocate the working struct used by utility_ssvd()
void utility_zpinv_destroy(void **const phWork)
De-allocate the working struct used by utility_zpinv()
void utility_cslslv(void *const hWork, const float_complex *A, const int dim, float_complex *B, int nCol, float_complex *X)
Linear solver for HERMITIAN positive-definate 'A': single precision complex, i.e.
@ NO_CONJ
Do not take the conjugate.
@ CONJ
Take the conjugate.
void * malloc1d(size_t dim1_data_size)
1-D malloc (same as malloc, but with error checking)
void * realloc1d(void *ptr, size_t dim1_data_size)
1-D realloc (same as realloc, but with error checking)
Include header for SAF externals.
Main header for the utilities module (SAF_UTILITIES_MODULE)
double veclib_double
real: 8-bytes
MKL_INT veclib_int
integer: 4-bytes
MKL_Complex16 veclib_double_complex
complex: 16-bytes
MKL_Complex8 veclib_float_complex
complex: 8-bytes
#define SAF_INTEL_MKL_VML_MODE
Taken from the Intel MKL VML developers reference documentation: The flag(s) that can be passed to In...
float veclib_float
real: 4-bytes
Data structure for utility_cchol()
Data structure for utility_ceig()
Data structure for utility_ceigmp()
Data structure for utility_cglslv()
Data structure for utility_cinv()
Data structure for utility_cpinv()
Data structure for utility_cseig()
Data structure for utility_cslslv()
Data structure for utility_csvd()
Data structure for utility_ddet()
Data structure for utility_dglslv()
Data structure for utility_dinv()
Data structure for utility_dpinv()
Data structure for utility_schol()
Data structure for utility_sdet()
Data structure for utility_sglslv()
Data structure for utility_sglslvt()
Data structure for utility_sinv()
Data structure for utility_spinv()
Data structure for utility_sseig()
Data structure for utility_sslslv()
Data structure for utility_ssvd()
Data structure for utility_zeig()
Data structure for utility_zeigmp()
Data structure for utility_zglslv()
Data structure for utility_zpinv()