44 #define X(name) NFFT(name)
47 static inline INT intprod(
const INT *vec,
const INT a,
const INT d)
52 for (t = 0; t < d; t++)
59 #define BASE(x) CEXP(x)
75 static inline void sort0(
const INT d,
const INT *n,
const INT m,
76 const INT local_x_num,
const R *local_x, INT *ar_x)
78 INT u_j[d], i, j, help, rhigh;
82 for (i = 0; i < local_x_num; i++)
86 for (j = 0; j < d; j++)
88 help = (INT) LRINT(FLOOR((R)(n[j]) * local_x[d * i + j] - (R)(m)));
89 u_j[j] = (help % n[j] + n[j]) % n[j];
91 ar_x[2 * i] += u_j[j];
93 ar_x[2 * i] *= n[j + 1];
97 for (j = 0, nprod = 1; j < d; j++)
100 rhigh = (INT) LRINT(CEIL(LOG2((R)nprod))) - 1;
102 ar_x_temp = (INT*) Y(malloc)(2 * (size_t)(local_x_num) *
sizeof(INT));
103 Y(sort_node_indices_radix_lsdf)(local_x_num, ar_x, ar_x_temp, rhigh);
105 for (i = 1; i < local_x_num; i++)
106 assert(ar_x[2 * (i - 1)] <= ar_x[2 * i]);
119 static inline void sort(
const X(plan) *ths)
121 if (ths->flags & NFFT_SORT_NODES)
122 sort0(ths->d, ths->n, ths->m, ths->M_total, ths->x, ths->index_x);
145 void X(trafo_direct)(
const X(plan) *ths)
147 C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f;
149 memset(f, 0, (
size_t)(ths->M_total) *
sizeof(C));
156 #pragma omp parallel for default(shared) private(j)
158 for (j = 0; j < ths->M_total; j++)
161 for (k_L = 0; k_L < ths->N_total; k_L++)
163 R omega = K2PI * ((R)(k_L - ths->N_total/2)) * ths->x[j];
164 f[j] += f_hat[k_L] * BASE(-II * omega);
173 #pragma omp parallel for default(shared) private(j)
175 for (j = 0; j < ths->M_total; j++)
177 R x[ths->d], omega, Omega[ths->d + 1];
178 INT t, t2, k_L, k[ths->d];
180 for (t = 0; t < ths->d; t++)
183 x[t] = K2PI * ths->x[j * ths->d + t];
184 Omega[t+1] = ((R)k[t]) * x[t] + Omega[t];
186 omega = Omega[ths->d];
188 for (k_L = 0; k_L < ths->N_total; k_L++)
190 f[j] += f_hat[k_L] * BASE(-II * omega);
192 for (t = ths->d - 1; (t >= 1) && (k[t] == ths->N[t]/2 - 1); t--)
197 for (t2 = t; t2 < ths->d; t2++)
198 Omega[t2+1] = ((R)k[t2]) * x[t2] + Omega[t2];
200 omega = Omega[ths->d];
207 void X(adjoint_direct)(
const X(plan) *ths)
209 C *f_hat = (C*)ths->f_hat, *f = (C*)ths->f;
211 memset(f_hat, 0, (
size_t)(ths->N_total) *
sizeof(C));
218 #pragma omp parallel for default(shared) private(k_L)
219 for (k_L = 0; k_L < ths->N_total; k_L++)
222 for (j = 0; j < ths->M_total; j++)
224 R omega = K2PI * ((R)(k_L - (ths->N_total/2))) * ths->x[j];
225 f_hat[k_L] += f[j] * BASE(II * omega);
230 for (j = 0; j < ths->M_total; j++)
233 for (k_L = 0; k_L < ths->N_total; k_L++)
235 R omega = K2PI * ((R)(k_L - ths->N_total / 2)) * ths->x[j];
236 f_hat[k_L] += f[j] * BASE(II * omega);
246 #pragma omp parallel for default(shared) private(j, k_L)
247 for (k_L = 0; k_L < ths->N_total; k_L++)
249 INT k[ths->d], k_temp, t;
253 for (t = ths->d - 1; t >= 0; t--)
255 k[t] = k_temp % ths->N[t] - ths->N[t]/2;
259 for (j = 0; j < ths->M_total; j++)
262 for (t = 0; t < ths->d; t++)
263 omega += k[t] * K2PI * ths->x[j * ths->d + t];
264 f_hat[k_L] += f[j] * BASE(II * omega);
268 for (j = 0; j < ths->M_total; j++)
270 R x[ths->d], omega, Omega[ths->d+1];
271 INT t, t2, k[ths->d];
273 for (t = 0; t < ths->d; t++)
276 x[t] = K2PI * ths->x[j * ths->d + t];
277 Omega[t+1] = ((R)k[t]) * x[t] + Omega[t];
279 omega = Omega[ths->d];
280 for (k_L = 0; k_L < ths->N_total; k_L++)
282 f_hat[k_L] += f[j] * BASE(II * omega);
284 for (t = ths->d-1; (t >= 1) && (k[t] == ths->N[t]/2-1); t--)
289 for (t2 = t; t2 < ths->d; t2++)
290 Omega[t2+1] = ((R)k[t2]) * x[t2] + Omega[t2];
292 omega = Omega[ths->d];
324 static inline void uo(
const X(plan) *ths,
const INT j, INT *up, INT *op,
327 const R xj = ths->x[j * ths->d + act_dim];
328 INT c = LRINT(FLOOR(xj * (R)(ths->n[act_dim])));
330 (*up) = c - (ths->m);
331 (*op) = c + 1 + (ths->m);
334 static inline void uo2(INT *u, INT *o,
const R x,
const INT n,
const INT m)
336 INT c = LRINT(FLOOR(x * (R)(n)));
338 *u = (c - m + n) % n;
339 *o = (c + 1 + m + n) % n;
342 #define MACRO_D_compute_A \
344 g_hat[k_plain[ths->d]] = f_hat[ks_plain[ths->d]] * c_phi_inv_k[ths->d]; \
347 #define MACRO_D_compute_T \
349 f_hat[ks_plain[ths->d]] = g_hat[k_plain[ths->d]] * c_phi_inv_k[ths->d]; \
352 #define MACRO_D_init_result_A memset(g_hat, 0, (size_t)(ths->n_total) * sizeof(C));
354 #define MACRO_D_init_result_T memset(f_hat, 0, (size_t)(ths->N_total) * sizeof(C));
356 #define MACRO_with_PRE_PHI_HUT * ths->c_phi_inv[t2][ks[t2]];
358 #define MACRO_without_PRE_PHI_HUT / (PHI_HUT(ths->n[t2],ks[t2]-(ths->N[t2]/2),t2));
360 #define MACRO_init_k_ks \
362 for (t = ths->d-1; 0 <= t; t--) \
365 ks[t] = ths->N[t]/2; \
370 #define MACRO_update_c_phi_inv_k(which_one) \
372 for (t2 = t; t2 < ths->d; t2++) \
374 c_phi_inv_k[t2+1] = c_phi_inv_k[t2] MACRO_ ##which_one; \
375 ks_plain[t2+1] = ks_plain[t2]*ths->N[t2] + ks[t2]; \
376 k_plain[t2+1] = k_plain[t2]*ths->n[t2] + k[t2]; \
380 #define MACRO_count_k_ks \
382 for (t = ths->d-1; (t > 0) && (kp[t] == ths->N[t]-1); t--) \
385 ks[t]= ths->N[t]/2; \
388 kp[t]++; k[t]++; ks[t]++; \
389 if(kp[t] == ths->N[t]/2) \
391 k[t] = ths->n[t] - ths->N[t]/2; \
397 #define MACRO_D(which_one) \
398 static inline void D_serial_ ## which_one (X(plan) *ths) \
401 R c_phi_inv_k[ths->d+1]; \
407 INT k_plain[ths->d+1]; \
408 INT ks_plain[ths->d+1]; \
410 f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat; \
411 MACRO_D_init_result_ ## which_one; \
413 c_phi_inv_k[0] = K(1.0); \
419 if (ths->flags & PRE_PHI_HUT) \
421 for (k_L = 0; k_L < ths->N_total; k_L++) \
423 MACRO_update_c_phi_inv_k(with_PRE_PHI_HUT); \
424 MACRO_D_compute_ ## which_one; \
430 for (k_L = 0; k_L < ths->N_total; k_L++) \
432 MACRO_update_c_phi_inv_k(without_PRE_PHI_HUT); \
433 MACRO_D_compute_ ## which_one; \
440 static inline void D_openmp_A(
X(plan) *ths)
445 f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat;
446 memset(g_hat, 0, ths->n_total *
sizeof(C));
448 if (ths->flags & PRE_PHI_HUT)
450 #pragma omp parallel for default(shared) private(k_L)
451 for (k_L = 0; k_L < ths->N_total; k_L++)
456 R c_phi_inv_k_val = K(1.0);
458 INT ks_plain_val = 0;
462 for (t = ths->d-1; t >= 0; t--)
464 kp[t] = k_temp % ths->N[t];
465 if (kp[t] >= ths->N[t]/2)
466 k[t] = ths->n[t] - ths->N[t] + kp[t];
469 ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
473 for (t = 0; t < ths->d; t++)
475 c_phi_inv_k_val *= ths->c_phi_inv[t][ks[t]];
476 ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
477 k_plain_val = k_plain_val*ths->n[t] + k[t];
480 g_hat[k_plain_val] = f_hat[ks_plain_val] * c_phi_inv_k_val;
485 #pragma omp parallel for default(shared) private(k_L)
486 for (k_L = 0; k_L < ths->N_total; k_L++)
491 R c_phi_inv_k_val = K(1.0);
493 INT ks_plain_val = 0;
497 for (t = ths->d-1; t >= 0; t--)
499 kp[t] = k_temp % ths->N[t];
500 if (kp[t] >= ths->N[t]/2)
501 k[t] = ths->n[t] - ths->N[t] + kp[t];
504 ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
508 for (t = 0; t < ths->d; t++)
510 c_phi_inv_k_val /= (PHI_HUT(ths->n[t],ks[t]-(ths->N[t]/2),t));
511 ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
512 k_plain_val = k_plain_val*ths->n[t] + k[t];
515 g_hat[k_plain_val] = f_hat[ks_plain_val] * c_phi_inv_k_val;
525 static inline void D_A(
X(plan) *ths)
535 static void D_openmp_T(
X(plan) *ths)
540 f_hat = (C*)ths->f_hat; g_hat = (C*)ths->g_hat;
541 memset(f_hat, 0, ths->N_total *
sizeof(C));
543 if (ths->flags & PRE_PHI_HUT)
545 #pragma omp parallel for default(shared) private(k_L)
546 for (k_L = 0; k_L < ths->N_total; k_L++)
551 R c_phi_inv_k_val = K(1.0);
553 INT ks_plain_val = 0;
557 for (t = ths->d - 1; t >= 0; t--)
559 kp[t] = k_temp % ths->N[t];
560 if (kp[t] >= ths->N[t]/2)
561 k[t] = ths->n[t] - ths->N[t] + kp[t];
564 ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
568 for (t = 0; t < ths->d; t++)
570 c_phi_inv_k_val *= ths->c_phi_inv[t][ks[t]];
571 ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
572 k_plain_val = k_plain_val*ths->n[t] + k[t];
575 f_hat[ks_plain_val] = g_hat[k_plain_val] * c_phi_inv_k_val;
580 #pragma omp parallel for default(shared) private(k_L)
581 for (k_L = 0; k_L < ths->N_total; k_L++)
586 R c_phi_inv_k_val = K(1.0);
588 INT ks_plain_val = 0;
592 for (t = ths->d-1; t >= 0; t--)
594 kp[t] = k_temp % ths->N[t];
595 if (kp[t] >= ths->N[t]/2)
596 k[t] = ths->n[t] - ths->N[t] + kp[t];
599 ks[t] = (kp[t] + ths->N[t]/2) % ths->N[t];
603 for (t = 0; t < ths->d; t++)
605 c_phi_inv_k_val /= (PHI_HUT(ths->n[t],ks[t]-(ths->N[t]/2),t));
606 ks_plain_val = ks_plain_val*ths->N[t] + ks[t];
607 k_plain_val = k_plain_val*ths->n[t] + k[t];
610 f_hat[ks_plain_val] = g_hat[k_plain_val] * c_phi_inv_k_val;
620 static void D_T(
X(plan) *ths)
630 #define MACRO_B_init_result_A memset(f, 0, (size_t)(ths->M_total) * sizeof(C));
631 #define MACRO_B_init_result_T memset(g, 0, (size_t)(ths->n_total) * sizeof(C));
633 #define MACRO_B_PRE_FULL_PSI_compute_A \
635 (*fj) += ths->psi[ix] * g[ths->psi_index_g[ix]]; \
638 #define MACRO_B_PRE_FULL_PSI_compute_T \
640 g[ths->psi_index_g[ix]] += ths->psi[ix] * (*fj); \
643 #define MACRO_B_compute_A \
645 (*fj) += phi_prod[ths->d] * g[ll_plain[ths->d]]; \
648 #define MACRO_B_compute_T \
650 g[ll_plain[ths->d]] += phi_prod[ths->d] * (*fj); \
653 #define MACRO_with_FG_PSI fg_psi[t2][lj[t2]]
655 #define MACRO_with_PRE_PSI ths->psi[(j*ths->d+t2) * (2*ths->m+2)+lj[t2]]
657 #define MACRO_without_PRE_PSI PHI(ths->n[t2], ths->x[j*ths->d+t2] \
658 - ((R)l[t2])/((R)ths->n[t2]), t2)
660 #define MACRO_init_uo_l_lj_t \
662 for (t = ths->d-1; t >= 0; t--) \
664 uo(ths,j,&u[t],&o[t],t); \
671 #define MACRO_update_phi_prod_ll_plain(which_one) { \
672 for (t2 = t; t2 < ths->d; t2++) \
674 phi_prod[t2+1] = phi_prod[t2] * MACRO_ ## which_one; \
675 ll_plain[t2+1] = ll_plain[t2] * ths->n[t2] + (l[t2] + ths->n[t2]) % ths->n[t2]; \
679 #define MACRO_count_uo_l_lj_t \
681 for (t = ths->d-1; (t > 0) && (l[t] == o[t]); t--) \
691 #define MACRO_B(which_one) \
692 static inline void B_serial_ ## which_one (X(plan) *ths) \
695 INT u[ths->d], o[ths->d]; \
701 INT ll_plain[ths->d+1]; \
702 R phi_prod[ths->d+1]; \
706 R fg_psi[ths->d][2*ths->m+2]; \
707 R fg_exp_l[ths->d][2*ths->m+2]; \
709 R tmpEXP1, tmpEXP2, tmpEXP2sq, tmp1, tmp2, tmp3; \
712 INT ip_s = ths->K/(ths->m+2); \
714 f = (C*)ths->f; g = (C*)ths->g; \
716 MACRO_B_init_result_ ## which_one; \
718 if (ths->flags & PRE_FULL_PSI) \
720 for (ix = 0, j = 0, fj = f; j < ths->M_total; j++, fj++) \
722 for (l_L = 0; l_L < ths->psi_index_f[j]; l_L++, ix++) \
724 MACRO_B_PRE_FULL_PSI_compute_ ## which_one; \
730 phi_prod[0] = K(1.0); \
733 for (t = 0, lprod = 1; t < ths->d; t++) \
734 lprod *= (2 * ths->m + 2); \
736 if (ths->flags & PRE_PSI) \
738 for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
740 MACRO_init_uo_l_lj_t; \
742 for (l_L = 0; l_L < lprod; l_L++) \
744 MACRO_update_phi_prod_ll_plain(with_PRE_PSI); \
746 MACRO_B_compute_ ## which_one; \
748 MACRO_count_uo_l_lj_t; \
754 if (ths->flags & PRE_FG_PSI) \
756 for(t2 = 0; t2 < ths->d; t2++) \
758 tmpEXP2 = EXP(K(-1.0) / ths->b[t2]); \
759 tmpEXP2sq = tmpEXP2*tmpEXP2; \
762 fg_exp_l[t2][0] = K(1.0); \
763 for (lj_fg = 1; lj_fg <= (2 * ths->m + 2); lj_fg++) \
765 tmp3 = tmp2*tmpEXP2; \
767 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1] * tmp3; \
770 for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
772 MACRO_init_uo_l_lj_t; \
774 for (t2 = 0; t2 < ths->d; t2++) \
776 fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)]; \
777 tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1]; \
779 for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
782 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
786 for (l_L= 0; l_L < lprod; l_L++) \
788 MACRO_update_phi_prod_ll_plain(with_FG_PSI); \
790 MACRO_B_compute_ ## which_one; \
792 MACRO_count_uo_l_lj_t; \
798 if (ths->flags & FG_PSI) \
800 for (t2 = 0; t2 < ths->d; t2++) \
802 tmpEXP2 = EXP(K(-1.0)/ths->b[t2]); \
803 tmpEXP2sq = tmpEXP2*tmpEXP2; \
806 fg_exp_l[t2][0] = K(1.0); \
807 for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++) \
809 tmp3 = tmp2*tmpEXP2; \
811 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3; \
814 for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
816 MACRO_init_uo_l_lj_t; \
818 for (t2 = 0; t2 < ths->d; t2++) \
820 fg_psi[t2][0] = (PHI(ths->n[t2], (ths->x[j*ths->d+t2] - ((R)u[t2])/((R)(ths->n[t2]))), t2));\
822 tmpEXP1 = EXP(K(2.0) * ((R)(ths->n[t2]) * ths->x[j * ths->d + t2] - (R)(u[t2])) \
825 for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++) \
828 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg]; \
832 for (l_L = 0; l_L < lprod; l_L++) \
834 MACRO_update_phi_prod_ll_plain(with_FG_PSI); \
836 MACRO_B_compute_ ## which_one; \
838 MACRO_count_uo_l_lj_t; \
844 if (ths->flags & PRE_LIN_PSI) \
846 for (j = 0, fj=f; j<ths->M_total; j++, fj++) \
848 MACRO_init_uo_l_lj_t; \
850 for (t2 = 0; t2 < ths->d; t2++) \
852 y[t2] = (((R)(ths->n[t2]) * ths->x[j * ths->d + t2] - (R)(u[t2])) \
853 * ((R)(ths->K))) / (R)(ths->m + 2); \
854 ip_u = LRINT(FLOOR(y[t2])); \
856 for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++) \
858 fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)] \
859 * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)] \
864 for (l_L = 0; l_L < lprod; l_L++) \
866 MACRO_update_phi_prod_ll_plain(with_FG_PSI); \
868 MACRO_B_compute_ ## which_one; \
870 MACRO_count_uo_l_lj_t; \
877 for (j = 0, fj = f; j < ths->M_total; j++, fj++) \
879 MACRO_init_uo_l_lj_t; \
881 for (l_L = 0; l_L < lprod; l_L++) \
883 MACRO_update_phi_prod_ll_plain(without_PRE_PSI); \
885 MACRO_B_compute_ ## which_one; \
887 MACRO_count_uo_l_lj_t; \
897 static inline void B_openmp_A (
X(plan) *ths)
902 memset(ths->f, 0, ths->M_total *
sizeof(C));
904 for (k = 0, lprod = 1; k < ths->d; k++)
905 lprod *= (2*ths->m+2);
907 if (ths->flags & PRE_FULL_PSI)
909 #pragma omp parallel for default(shared) private(k)
910 for (k = 0; k < ths->M_total; k++)
913 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
915 for (l = 0; l < lprod; l++)
916 ths->f[j] += ths->psi[j*lprod+l] * ths->g[ths->psi_index_g[j*lprod+l]];
921 if (ths->flags & PRE_PSI)
923 #pragma omp parallel for default(shared) private(k)
924 for (k = 0; k < ths->M_total; k++)
926 INT u[ths->d], o[ths->d];
931 INT ll_plain[ths->d+1];
932 R phi_prod[ths->d+1];
933 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
935 phi_prod[0] = K(1.0);
938 MACRO_init_uo_l_lj_t;
940 for (l_L = 0; l_L < lprod; l_L++)
942 MACRO_update_phi_prod_ll_plain(with_PRE_PSI);
944 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
946 MACRO_count_uo_l_lj_t;
952 if (ths->flags & PRE_FG_PSI)
955 R fg_exp_l[ths->d][2*ths->m+2];
957 for (t2 = 0; t2 < ths->d; t2++)
960 R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
961 R tmpEXP2sq = tmpEXP2*tmpEXP2;
964 fg_exp_l[t2][0] = K(1.0);
965 for(lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
969 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
973 #pragma omp parallel for default(shared) private(k,t,t2)
974 for (k = 0; k < ths->M_total; k++)
976 INT ll_plain[ths->d+1];
977 R phi_prod[ths->d+1];
978 INT u[ths->d], o[ths->d];
981 R fg_psi[ths->d][2*ths->m+2];
985 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
987 phi_prod[0] = K(1.0);
990 MACRO_init_uo_l_lj_t;
992 for (t2 = 0; t2 < ths->d; t2++)
994 fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)];
995 tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1];
997 for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
1000 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1004 for (l_L= 0; l_L < lprod; l_L++)
1006 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1008 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
1010 MACRO_count_uo_l_lj_t;
1016 if (ths->flags & FG_PSI)
1019 R fg_exp_l[ths->d][2*ths->m+2];
1023 for (t2 = 0; t2 < ths->d; t2++)
1026 R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
1027 R tmpEXP2sq = tmpEXP2*tmpEXP2;
1030 fg_exp_l[t2][0] = K(1.0);
1031 for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
1033 tmp3 = tmp2*tmpEXP2;
1035 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
1039 #pragma omp parallel for default(shared) private(k,t,t2)
1040 for (k = 0; k < ths->M_total; k++)
1042 INT ll_plain[ths->d+1];
1043 R phi_prod[ths->d+1];
1044 INT u[ths->d], o[ths->d];
1047 R fg_psi[ths->d][2*ths->m+2];
1051 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1053 phi_prod[0] = K(1.0);
1056 MACRO_init_uo_l_lj_t;
1058 for (t2 = 0; t2 < ths->d; t2++)
1060 fg_psi[t2][0] = (PHI(ths->n[t2],(ths->x[j*ths->d+t2]-((R)u[t2])/ths->n[t2]),t2));
1062 tmpEXP1 = EXP(K(2.0)*(ths->n[t2]*ths->x[j*ths->d+t2] - u[t2])
1065 for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
1068 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1072 for (l_L = 0; l_L < lprod; l_L++)
1074 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1076 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
1078 MACRO_count_uo_l_lj_t;
1084 if (ths->flags & PRE_LIN_PSI)
1088 #pragma omp parallel for default(shared) private(k)
1089 for (k = 0; k<ths->M_total; k++)
1091 INT u[ths->d], o[ths->d];
1096 INT ll_plain[ths->d+1];
1097 R phi_prod[ths->d+1];
1099 R fg_psi[ths->d][2*ths->m+2];
1103 INT ip_s = ths->K/(ths->m+2);
1104 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1106 phi_prod[0] = K(1.0);
1109 MACRO_init_uo_l_lj_t;
1111 for (t2 = 0; t2 < ths->d; t2++)
1113 y[t2] = ((ths->n[t2]*ths->x[j*ths->d+t2]-(R)u[t2])
1114 * ((R)ths->K))/(ths->m+2);
1115 ip_u = LRINT(FLOOR(y[t2]));
1117 for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++)
1119 fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)]
1120 * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)]
1125 for (l_L = 0; l_L < lprod; l_L++)
1127 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1129 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
1131 MACRO_count_uo_l_lj_t;
1140 #pragma omp parallel for default(shared) private(k)
1141 for (k = 0; k < ths->M_total; k++)
1143 INT u[ths->d], o[ths->d];
1148 INT ll_plain[ths->d+1];
1149 R phi_prod[ths->d+1];
1150 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1152 phi_prod[0] = K(1.0);
1155 MACRO_init_uo_l_lj_t;
1157 for (l_L = 0; l_L < lprod; l_L++)
1159 MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
1161 ths->f[j] += phi_prod[ths->d] * ths->g[ll_plain[ths->d]];
1163 MACRO_count_uo_l_lj_t;
1169 static void B_A(
X(plan) *ths)
1194 static inline INT index_x_binary_search(
const INT *ar_x,
const INT len,
const INT key)
1196 INT left = 0, right = len - 1;
1201 while (left < right - 1)
1203 INT i = (left + right) / 2;
1204 if (ar_x[2*i] >= key)
1206 else if (ar_x[2*i] < key)
1210 if (ar_x[2*left] < key && left != len-1)
1233 static void nfft_adjoint_B_omp_blockwise_init(INT *my_u0, INT *my_o0,
1234 INT *min_u_a, INT *max_u_a, INT *min_u_b, INT *max_u_b,
const INT d,
1235 const INT *n,
const INT m)
1237 const INT n0 = n[0];
1239 INT nthreads = omp_get_num_threads();
1240 INT nthreads_used = MIN(nthreads, n0);
1241 INT size_per_thread = n0 / nthreads_used;
1242 INT size_left = n0 - size_per_thread * nthreads_used;
1243 INT size_g[nthreads_used];
1244 INT offset_g[nthreads_used];
1245 INT my_id = omp_get_thread_num();
1246 INT n_prod_rest = 1;
1248 for (k = 1; k < d; k++)
1249 n_prod_rest *= n[k];
1258 if (my_id < nthreads_used)
1260 const INT m22 = 2 * m + 2;
1263 for (k = 0; k < nthreads_used; k++)
1266 offset_g[k] = offset_g[k-1] + size_g[k-1];
1267 size_g[k] = size_per_thread;
1275 *my_u0 = offset_g[my_id];
1276 *my_o0 = offset_g[my_id] + size_g[my_id] - 1;
1278 if (nthreads_used > 1)
1280 *max_u_a = n_prod_rest*(offset_g[my_id] + size_g[my_id]) - 1;
1281 *min_u_a = n_prod_rest*(offset_g[my_id] - m22 + 1);
1286 *max_u_a = n_prod_rest * n0 - 1;
1291 *min_u_b = n_prod_rest * (offset_g[my_id] - m22 + 1 + n0);
1292 *max_u_b = n_prod_rest * n0 - 1;
1296 if (*min_u_b != -1 && *min_u_b <= *max_u_a)
1298 *max_u_a = *max_u_b;
1303 assert(*min_u_a <= *max_u_a);
1304 assert(*min_u_b <= *max_u_b);
1305 assert(*min_u_b == -1 || *max_u_a < *min_u_b);
1319 static void nfft_adjoint_B_compute_full_psi(C *g,
const INT *psi_index_g,
1320 const R *psi,
const C *f,
const INT M,
const INT d,
const INT *n,
1321 const INT m,
const unsigned flags,
const INT *index_x)
1333 for(t = 0, lprod = 1; t < d; t++)
1337 lprod_m1 = lprod / (2 * m + 2);
1341 if (flags & NFFT_OMP_BLOCKWISE_ADJOINT)
1343 #pragma omp parallel private(k)
1345 INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b;
1346 const INT *ar_x = index_x;
1347 INT n_prod_rest = 1;
1349 for (k = 1; k < d; k++)
1350 n_prod_rest *= n[k];
1352 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, &min_u_b, &max_u_b, d, n, m);
1356 k = index_x_binary_search(ar_x, M, min_u_a);
1358 assert(ar_x[2*k] >= min_u_a || k == M-1);
1360 assert(ar_x[2*k-2] < min_u_a);
1365 INT u_prod = ar_x[2*k];
1366 INT j = ar_x[2*k+1];
1368 if (u_prod < min_u_a || u_prod > max_u_a)
1371 for (l0 = 0; l0 < 2 * m + 2; l0++)
1373 const INT start_index = psi_index_g[j * lprod + l0 * lprod_m1];
1375 if (start_index < my_u0 * n_prod_rest || start_index > (my_o0+1) * n_prod_rest - 1)
1378 for (lrest = 0; lrest < lprod_m1; lrest++)
1380 const INT l = l0 * lprod_m1 + lrest;
1381 g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1391 k = index_x_binary_search(ar_x, M, min_u_b);
1393 assert(ar_x[2*k] >= min_u_b || k == M-1);
1395 assert(ar_x[2*k-2] < min_u_b);
1400 INT u_prod = ar_x[2*k];
1401 INT j = ar_x[2*k+1];
1403 if (u_prod < min_u_b || u_prod > max_u_b)
1406 for (l0 = 0; l0 < 2 * m + 2; l0++)
1408 const INT start_index = psi_index_g[j * lprod + l0 * lprod_m1];
1410 if (start_index < my_u0 * n_prod_rest || start_index > (my_o0+1) * n_prod_rest - 1)
1412 for (lrest = 0; lrest < lprod_m1; lrest++)
1414 const INT l = l0 * lprod_m1 + lrest;
1415 g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1428 #pragma omp parallel for default(shared) private(k)
1430 for (k = 0; k < M; k++)
1433 INT j = (flags & NFFT_SORT_NODES) ? index_x[2*k+1] : k;
1435 for (l = 0; l < lprod; l++)
1438 C val = psi[j * lprod + l] * f[j];
1439 C *gref = g + psi_index_g[j * lprod + l];
1440 R *gref_real = (R*) gref;
1443 gref_real[0] += CREAL(val);
1446 gref_real[1] += CIMAG(val);
1448 g[psi_index_g[j * lprod + l]] += psi[j * lprod + l] * f[j];
1459 static inline void B_openmp_T(
X(plan) *ths)
1464 memset(ths->g, 0, (
size_t)(ths->n_total) *
sizeof(C));
1466 for (k = 0, lprod = 1; k < ths->d; k++)
1467 lprod *= (2*ths->m+2);
1469 if (ths->flags & PRE_FULL_PSI)
1471 nfft_adjoint_B_compute_full_psi(ths->g, ths->psi_index_g, ths->psi, ths->f,
1472 ths->M_total, ths->d, ths->n, ths->m, ths->flags, ths->index_x);
1476 if (ths->flags & PRE_PSI)
1478 #pragma omp parallel for default(shared) private(k)
1479 for (k = 0; k < ths->M_total; k++)
1481 INT u[ths->d], o[ths->d];
1486 INT ll_plain[ths->d+1];
1487 R phi_prod[ths->d+1];
1488 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1490 phi_prod[0] = K(1.0);
1493 MACRO_init_uo_l_lj_t;
1495 for (l_L = 0; l_L < lprod; l_L++)
1501 MACRO_update_phi_prod_ll_plain(with_PRE_PSI);
1503 lhs = ths->g + ll_plain[ths->d];
1505 val = phi_prod[ths->d] * ths->f[j];
1508 lhs_real[0] += CREAL(val);
1511 lhs_real[1] += CIMAG(val);
1513 MACRO_count_uo_l_lj_t;
1519 if (ths->flags & PRE_FG_PSI)
1522 R fg_exp_l[ths->d][2*ths->m+2];
1523 for(t2 = 0; t2 < ths->d; t2++)
1526 R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
1527 R tmpEXP2sq = tmpEXP2*tmpEXP2;
1530 fg_exp_l[t2][0] = K(1.0);
1531 for(lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
1533 tmp3 = tmp2*tmpEXP2;
1535 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
1539 #pragma omp parallel for default(shared) private(k,t,t2)
1540 for (k = 0; k < ths->M_total; k++)
1542 INT ll_plain[ths->d+1];
1543 R phi_prod[ths->d+1];
1544 INT u[ths->d], o[ths->d];
1547 R fg_psi[ths->d][2*ths->m+2];
1551 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1553 phi_prod[0] = K(1.0);
1556 MACRO_init_uo_l_lj_t;
1558 for (t2 = 0; t2 < ths->d; t2++)
1560 fg_psi[t2][0] = ths->psi[2*(j*ths->d+t2)];
1561 tmpEXP1 = ths->psi[2*(j*ths->d+t2)+1];
1563 for (l_fg = u[t2]+1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
1566 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1570 for (l_L= 0; l_L < lprod; l_L++)
1576 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1578 lhs = ths->g + ll_plain[ths->d];
1580 val = phi_prod[ths->d] * ths->f[j];
1583 lhs_real[0] += CREAL(val);
1586 lhs_real[1] += CIMAG(val);
1588 MACRO_count_uo_l_lj_t;
1594 if (ths->flags & FG_PSI)
1597 R fg_exp_l[ths->d][2*ths->m+2];
1601 for (t2 = 0; t2 < ths->d; t2++)
1604 R tmpEXP2 = EXP(K(-1.0)/ths->b[t2]);
1605 R tmpEXP2sq = tmpEXP2*tmpEXP2;
1608 fg_exp_l[t2][0] = K(1.0);
1609 for (lj_fg = 1; lj_fg <= (2*ths->m+2); lj_fg++)
1611 tmp3 = tmp2*tmpEXP2;
1613 fg_exp_l[t2][lj_fg] = fg_exp_l[t2][lj_fg-1]*tmp3;
1617 #pragma omp parallel for default(shared) private(k,t,t2)
1618 for (k = 0; k < ths->M_total; k++)
1620 INT ll_plain[ths->d+1];
1621 R phi_prod[ths->d+1];
1622 INT u[ths->d], o[ths->d];
1625 R fg_psi[ths->d][2*ths->m+2];
1629 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1631 phi_prod[0] = K(1.0);
1634 MACRO_init_uo_l_lj_t;
1636 for (t2 = 0; t2 < ths->d; t2++)
1638 fg_psi[t2][0] = (PHI(ths->n[t2],(ths->x[j*ths->d+t2]-((R)u[t2])/ths->n[t2]),t2));
1640 tmpEXP1 = EXP(K(2.0)*(ths->n[t2]*ths->x[j*ths->d+t2] - u[t2])
1643 for (l_fg = u[t2] + 1, lj_fg = 1; l_fg <= o[t2]; l_fg++, lj_fg++)
1646 fg_psi[t2][lj_fg] = fg_psi[t2][0]*tmp1*fg_exp_l[t2][lj_fg];
1650 for (l_L = 0; l_L < lprod; l_L++)
1656 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1658 lhs = ths->g + ll_plain[ths->d];
1660 val = phi_prod[ths->d] * ths->f[j];
1663 lhs_real[0] += CREAL(val);
1666 lhs_real[1] += CIMAG(val);
1668 MACRO_count_uo_l_lj_t;
1674 if (ths->flags & PRE_LIN_PSI)
1678 #pragma omp parallel for default(shared) private(k)
1679 for (k = 0; k<ths->M_total; k++)
1681 INT u[ths->d], o[ths->d];
1686 INT ll_plain[ths->d+1];
1687 R phi_prod[ths->d+1];
1689 R fg_psi[ths->d][2*ths->m+2];
1693 INT ip_s = ths->K/(ths->m+2);
1694 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1696 phi_prod[0] = K(1.0);
1699 MACRO_init_uo_l_lj_t;
1701 for (t2 = 0; t2 < ths->d; t2++)
1703 y[t2] = ((ths->n[t2]*ths->x[j*ths->d+t2]-(R)u[t2])
1704 * ((R)ths->K))/(ths->m+2);
1705 ip_u = LRINT(FLOOR(y[t2]));
1707 for (l_fg = u[t2], lj_fg = 0; l_fg <= o[t2]; l_fg++, lj_fg++)
1709 fg_psi[t2][lj_fg] = ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s)]
1710 * (1-ip_w) + ths->psi[(ths->K+1)*t2 + ABS(ip_u-lj_fg*ip_s+1)]
1715 for (l_L = 0; l_L < lprod; l_L++)
1721 MACRO_update_phi_prod_ll_plain(with_FG_PSI);
1723 lhs = ths->g + ll_plain[ths->d];
1725 val = phi_prod[ths->d] * ths->f[j];
1728 lhs_real[0] += CREAL(val);
1731 lhs_real[1] += CIMAG(val);
1733 MACRO_count_uo_l_lj_t;
1742 #pragma omp parallel for default(shared) private(k)
1743 for (k = 0; k < ths->M_total; k++)
1745 INT u[ths->d], o[ths->d];
1750 INT ll_plain[ths->d+1];
1751 R phi_prod[ths->d+1];
1752 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1754 phi_prod[0] = K(1.0);
1757 MACRO_init_uo_l_lj_t;
1759 for (l_L = 0; l_L < lprod; l_L++)
1765 MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
1767 lhs = ths->g + ll_plain[ths->d];
1769 val = phi_prod[ths->d] * ths->f[j];
1772 lhs_real[0] += CREAL(val);
1775 lhs_real[1] += CIMAG(val);
1777 MACRO_count_uo_l_lj_t;
1783 static void B_T(
X(plan) *ths)
1794 static void nfft_1d_init_fg_exp_l(R *fg_exp_l,
const INT m,
const R b)
1796 const INT tmp2 = 2*m+2;
1798 R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
1800 fg_exp_b0 = EXP(K(-1.0)/b);
1801 fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
1802 fg_exp_b1 = fg_exp_b2 =fg_exp_l[0] = K(1.0);
1804 for (l = 1; l < tmp2; l++)
1806 fg_exp_b2 = fg_exp_b1*fg_exp_b0;
1807 fg_exp_b1 *= fg_exp_b0_sq;
1808 fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
1813 static void nfft_trafo_1d_compute(C *fj,
const C *g,
const R *psij_const,
1814 const R *xj,
const INT n,
const INT m)
1821 uo2(&u, &o, *xj, n, m);
1825 for (l = 1, gj = g + u, (*fj) = (*psij++) * (*gj++); l <= 2*m+1; l++)
1826 (*fj) += (*psij++) * (*gj++);
1830 for (l = 1, gj = g + u, (*fj) = (*psij++) * (*gj++); l < 2*m+1 - o; l++)
1831 (*fj) += (*psij++) * (*gj++);
1832 for (l = 0, gj = g; l <= o; l++)
1833 (*fj) += (*psij++) * (*gj++);
1838 static void nfft_adjoint_1d_compute_serial(
const C *fj, C *g,
1839 const R *psij_const,
const R *xj,
const INT n,
const INT m)
1846 uo2(&u,&o,*xj, n, m);
1850 for (l = 0, gj = g+u; l <= 2*m+1; l++)
1851 (*gj++) += (*psij++) * (*fj);
1855 for (l = 0, gj = g+u; l < 2*m+1-o; l++)
1856 (*gj++) += (*psij++) * (*fj);
1857 for (l = 0, gj = g; l <= o; l++)
1858 (*gj++) += (*psij++) * (*fj);
1865 static void nfft_adjoint_1d_compute_omp_atomic(
const C f, C *g,
1866 const R *psij_const,
const R *xj,
const INT n,
const INT m)
1870 INT index_temp[2*m+2];
1872 uo2(&u,&o,*xj, n, m);
1874 for (l=0; l<=2*m+1; l++)
1875 index_temp[l] = (l+u)%n;
1877 for (l = 0, gj = g+u; l <= 2*m+1; l++)
1879 INT i = index_temp[l];
1881 R *lhs_real = (R*)lhs;
1882 C val = psij_const[l] * f;
1884 lhs_real[0] += CREAL(val);
1887 lhs_real[1] += CIMAG(val);
1908 static void nfft_adjoint_1d_compute_omp_blockwise(
const C f, C *g,
1909 const R *psij_const,
const R *xj,
const INT n,
const INT m,
1910 const INT my_u0,
const INT my_o0)
1914 uo2(&ar_u,&ar_o,*xj, n, m);
1918 INT u = MAX(my_u0,ar_u);
1919 INT o = MIN(my_o0,ar_o);
1920 INT offset_psij = u-ar_u;
1922 assert(offset_psij >= 0);
1923 assert(o-u <= 2*m+1);
1924 assert(offset_psij+o-u <= 2*m+1);
1927 for (l = 0; l <= o-u; l++)
1928 g[u+l] += psij_const[offset_psij+l] * f;
1932 INT u = MAX(my_u0,ar_u);
1934 INT offset_psij = u-ar_u;
1936 assert(offset_psij >= 0);
1937 assert(o-u <= 2*m+1);
1938 assert(offset_psij+o-u <= 2*m+1);
1941 for (l = 0; l <= o-u; l++)
1942 g[u+l] += psij_const[offset_psij+l] * f;
1945 o = MIN(my_o0,ar_o);
1946 offset_psij += my_u0-ar_u+n;
1951 assert(o-u <= 2*m+1);
1952 if (offset_psij+o-u > 2*m+1)
1954 fprintf(stderr,
"ERR: %d %d %d %d %d %d %d\n", ar_u, ar_o, my_u0, my_o0, u, o, offset_psij);
1956 assert(offset_psij+o-u <= 2*m+1);
1959 for (l = 0; l <= o-u; l++)
1960 g[u+l] += psij_const[offset_psij+l] * f;
1965 static void nfft_trafo_1d_B(
X(plan) *ths)
1967 const INT n = ths->n[0], M = ths->M_total, m = ths->m, m2p2 = 2*m+2;
1968 const C *g = (C*)ths->g;
1970 if (ths->flags & PRE_FULL_PSI)
1974 #pragma omp parallel for default(shared) private(k)
1976 for (k = 0; k < M; k++)
1979 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1981 for (l = 0; l < m2p2; l++)
1982 ths->f[j] += ths->psi[j*m2p2+l] * g[ths->psi_index_g[j*m2p2+l]];
1987 if (ths->flags & PRE_PSI)
1991 #pragma omp parallel for default(shared) private(k)
1993 for (k = 0; k < M; k++)
1995 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
1996 nfft_trafo_1d_compute(&ths->f[j], g, ths->psi + j * (2 * m + 2),
2002 if (ths->flags & PRE_FG_PSI)
2007 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2010 #pragma omp parallel for default(shared) private(k)
2012 for (k = 0; k < M; k++)
2014 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2015 const R fg_psij0 = ths->psi[2 * j], fg_psij1 = ths->psi[2 * j + 1];
2016 R fg_psij2 = K(1.0);
2020 psij_const[0] = fg_psij0;
2022 for (l = 1; l < m2p2; l++)
2024 fg_psij2 *= fg_psij1;
2025 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2028 nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2034 if (ths->flags & FG_PSI)
2041 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2044 #pragma omp parallel for default(shared) private(k)
2046 for (k = 0; k < M; k++)
2048 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2050 R fg_psij0, fg_psij1, fg_psij2;
2053 uo(ths, (INT)j, &u, &o, (INT)0);
2054 fg_psij0 = (PHI(ths->n[0], ths->x[j] - ((R)(u))/(R)(n), 0));
2055 fg_psij1 = EXP(K(2.0) * ((R)(n) * ths->x[j] - (R)(u)) / ths->b[0]);
2058 psij_const[0] = fg_psij0;
2060 for (l = 1; l < m2p2; l++)
2062 fg_psij2 *= fg_psij1;
2063 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2066 nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2071 if (ths->flags & PRE_LIN_PSI)
2073 const INT K = ths->K, ip_s = K / (m + 2);
2079 #pragma omp parallel for default(shared) private(k)
2081 for (k = 0; k < M; k++)
2087 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2089 uo(ths, (INT)j, &u, &o, (INT)0);
2091 ip_y = FABS((R)(n) * ths->x[j] - (R)(u)) * ((R)ip_s);
2092 ip_u = (INT)(LRINT(FLOOR(ip_y)));
2093 ip_w = ip_y - (R)(ip_u);
2095 for (l = 0; l < m2p2; l++)
2096 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w)
2097 + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w);
2099 nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2111 #pragma omp parallel for default(shared) private(k)
2113 for (k = 0; k < M; k++)
2117 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2119 uo(ths, (INT)j, &u, &o, (INT)0);
2121 for (l = 0; l < m2p2; l++)
2122 psij_const[l] = (PHI(ths->n[0], ths->x[j] - ((R)((u+l))) / (R)(n), 0));
2124 nfft_trafo_1d_compute(&ths->f[j], g, psij_const, &ths->x[j], n, m);
2131 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A \
2133 assert(ar_x[2*k] >= min_u_a || k == M-1); \
2135 assert(ar_x[2*k-2] < min_u_a); \
2138 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A
2142 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B \
2144 assert(ar_x[2*k] >= min_u_b || k == M-1); \
2146 assert(ar_x[2*k-2] < min_u_b); \
2149 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B
2152 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
2154 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, \
2155 ths->psi + j * (2 * m + 2), ths->x + j, n, m, my_u0, my_o0); \
2158 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
2160 R psij_const[2 * m + 2]; \
2162 R fg_psij0 = ths->psi[2 * j]; \
2163 R fg_psij1 = ths->psi[2 * j + 1]; \
2164 R fg_psij2 = K(1.0); \
2166 psij_const[0] = fg_psij0; \
2167 for (l = 1; l <= 2 * m + 1; l++) \
2169 fg_psij2 *= fg_psij1; \
2170 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l]; \
2173 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2174 ths->x + j, n, m, my_u0, my_o0); \
2177 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
2179 R psij_const[2 * m + 2]; \
2180 R fg_psij0, fg_psij1, fg_psij2; \
2183 uo(ths, j, &u, &o, (INT)0); \
2184 fg_psij0 = (PHI(ths->n[0],ths->x[j]-((R)u)/n,0)); \
2185 fg_psij1 = EXP(K(2.0) * (n * (ths->x[j]) - u) / ths->b[0]); \
2186 fg_psij2 = K(1.0); \
2187 psij_const[0] = fg_psij0; \
2188 for (l = 1; l <= 2 * m + 1; l++) \
2190 fg_psij2 *= fg_psij1; \
2191 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l]; \
2194 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2195 ths->x + j, n, m, my_u0, my_o0); \
2198 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
2200 R psij_const[2 * m + 2]; \
2205 uo(ths, j, &u, &o, (INT)0); \
2207 ip_y = FABS(n * ths->x[j] - u) * ((R)ip_s); \
2208 ip_u = LRINT(FLOOR(ip_y)); \
2209 ip_w = ip_y - ip_u; \
2210 for (l = 0; l < 2 * m + 2; l++) \
2212 = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w) \
2213 + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w); \
2215 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2216 ths->x + j, n, m, my_u0, my_o0); \
2219 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
2221 R psij_const[2 * m + 2]; \
2224 uo(ths, j, &u, &o, (INT)0); \
2226 for (l = 0; l <= 2 * m + 1; l++) \
2227 psij_const[l] = (PHI(ths->n[0],ths->x[j]-((R)((u+l)))/n,0)); \
2229 nfft_adjoint_1d_compute_omp_blockwise(ths->f[j], g, psij_const, \
2230 ths->x + j, n, m, my_u0, my_o0); \
2233 #define MACRO_adjoint_1d_B_OMP_BLOCKWISE(whichone) \
2235 if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
2237 _Pragma("omp parallel private(k)") \
2239 INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
2240 INT *ar_x = ths->index_x; \
2242 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
2243 &min_u_b, &max_u_b, 1, &n, m); \
2245 if (min_u_a != -1) \
2247 k = index_x_binary_search(ar_x, M, min_u_a); \
2249 MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_A \
2253 INT u_prod = ar_x[2*k]; \
2254 INT j = ar_x[2*k+1]; \
2256 if (u_prod < min_u_a || u_prod > max_u_a) \
2259 MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
2265 if (min_u_b != -1) \
2267 k = index_x_binary_search(ar_x, M, min_u_b); \
2269 MACRO_adjoint_1d_B_OMP_BLOCKWISE_ASSERT_B \
2273 INT u_prod = ar_x[2*k]; \
2274 INT j = ar_x[2*k+1]; \
2276 if (u_prod < min_u_b || u_prod > max_u_b) \
2279 MACRO_adjoint_1d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
2289 static void nfft_adjoint_1d_B(
X(plan) *ths)
2291 const INT n = ths->n[0], M = ths->M_total, m = ths->m;
2295 memset(g, 0, (
size_t)(ths->n_total) *
sizeof(C));
2297 if (ths->flags & PRE_FULL_PSI)
2299 nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
2300 (INT)1, ths->n, m, ths->flags, ths->index_x);
2304 if (ths->flags & PRE_PSI)
2307 MACRO_adjoint_1d_B_OMP_BLOCKWISE(PRE_PSI)
2311 #pragma omp parallel for default(shared) private(k)
2313 for (k = 0; k < M; k++)
2315 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2317 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, ths->psi + j * (2 * m + 2), ths->x + j, n, m);
2319 nfft_adjoint_1d_compute_serial(ths->f + j, g, ths->psi + j * (2 * m + 2), ths->x + j, n, m);
2326 if (ths->flags & PRE_FG_PSI)
2328 R fg_exp_l[2 * m + 2];
2330 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2333 MACRO_adjoint_1d_B_OMP_BLOCKWISE(PRE_FG_PSI)
2338 #pragma omp parallel for default(shared) private(k)
2340 for (k = 0; k < M; k++)
2342 R psij_const[2 * m + 2];
2343 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2345 R fg_psij0 = ths->psi[2 * j];
2346 R fg_psij1 = ths->psi[2 * j + 1];
2347 R fg_psij2 = K(1.0);
2349 psij_const[0] = fg_psij0;
2350 for (l = 1; l <= 2 * m + 1; l++)
2352 fg_psij2 *= fg_psij1;
2353 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2357 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2359 nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2366 if (ths->flags & FG_PSI)
2368 R fg_exp_l[2 * m + 2];
2370 nfft_1d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2375 MACRO_adjoint_1d_B_OMP_BLOCKWISE(FG_PSI)
2379 #pragma omp parallel for default(shared) private(k)
2381 for (k = 0; k < M; k++)
2384 R psij_const[2 * m + 2];
2385 R fg_psij0, fg_psij1, fg_psij2;
2386 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2388 uo(ths, j, &u, &o, (INT)0);
2389 fg_psij0 = (PHI(ths->n[0], ths->x[j] - ((R)u) / (R)(n),0));
2390 fg_psij1 = EXP(K(2.0) * ((R)(n) * (ths->x[j]) - (R)(u)) / ths->b[0]);
2392 psij_const[0] = fg_psij0;
2393 for (l = 1; l <= 2 * m + 1; l++)
2395 fg_psij2 *= fg_psij1;
2396 psij_const[l] = fg_psij0 * fg_psij2 * fg_exp_l[l];
2400 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2402 nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2409 if (ths->flags & PRE_LIN_PSI)
2411 const INT K = ths->K;
2412 const INT ip_s = K / (m + 2);
2417 MACRO_adjoint_1d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
2421 #pragma omp parallel for default(shared) private(k)
2423 for (k = 0; k < M; k++)
2428 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2429 R psij_const[2 * m + 2];
2431 uo(ths, j, &u, &o, (INT)0);
2433 ip_y = FABS((R)(n) * ths->x[j] - (R)(u)) * ((R)ip_s);
2434 ip_u = (INT)(LRINT(FLOOR(ip_y)));
2435 ip_w = ip_y - (R)(ip_u);
2436 for (l = 0; l < 2 * m + 2; l++)
2438 = ths->psi[ABS(ip_u-l*ip_s)] * (K(1.0) - ip_w)
2439 + ths->psi[ABS(ip_u-l*ip_s+1)] * (ip_w);
2442 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2444 nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2454 MACRO_adjoint_1d_B_OMP_BLOCKWISE(NO_PSI)
2458 #pragma omp parallel for default(shared) private(k)
2460 for (k = 0; k < M; k++)
2463 R psij_const[2 * m + 2];
2464 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2466 uo(ths, j, &u, &o, (INT)0);
2468 for (l = 0; l <= 2 * m + 1; l++)
2469 psij_const[l] = (PHI(ths->n[0], ths->x[j] - ((R)((u+l))) / (R)(n),0));
2472 nfft_adjoint_1d_compute_omp_atomic(ths->f[j], g, psij_const, ths->x + j, n, m);
2474 nfft_adjoint_1d_compute_serial(ths->f + j, g, psij_const, ths->x + j, n, m);
2479 void X(trafo_1d)(
X(plan) *ths)
2481 const INT N = ths->N[0], N2 = N/2, n = ths->n[0];
2482 C *f_hat1 = (C*)ths->f_hat, *f_hat2 = (C*)&ths->f_hat[N2];
2484 ths->g_hat = ths->g1;
2488 C *g_hat1 = (C*)&ths->g_hat[n-N/2], *g_hat2 = (C*)ths->g_hat;
2489 R *c_phi_inv1, *c_phi_inv2;
2495 #pragma omp parallel for default(shared) private(k)
2496 for (k = 0; k < ths->n_total; k++)
2497 ths->g_hat[k] = 0.0;
2500 memset(ths->g_hat, 0, (
size_t)(ths->n_total) *
sizeof(C));
2502 if(ths->flags & PRE_PHI_HUT)
2505 c_phi_inv1 = ths->c_phi_inv[0];
2506 c_phi_inv2 = &ths->c_phi_inv[0][N2];
2509 #pragma omp parallel for default(shared) private(k)
2511 for (k = 0; k < N2; k++)
2513 g_hat1[k] = f_hat1[k] * c_phi_inv1[k];
2514 g_hat2[k] = f_hat2[k] * c_phi_inv2[k];
2521 #pragma omp parallel for default(shared) private(k)
2523 for (k = 0; k < N2; k++)
2525 g_hat1[k] = f_hat1[k] / (PHI_HUT(ths->n[0],k-N2,0));
2526 g_hat2[k] = f_hat2[k] / (PHI_HUT(ths->n[0],k,0));
2532 FFTW(execute)(ths->my_fftw_plan1);
2536 nfft_trafo_1d_B(ths);
2541 void X(adjoint_1d)(
X(plan) *ths)
2544 C *g_hat1,*g_hat2,*f_hat1,*f_hat2;
2545 R *c_phi_inv1, *c_phi_inv2;
2553 f_hat1=(C*)ths->f_hat;
2554 f_hat2=(C*)&ths->f_hat[N/2];
2555 g_hat1=(C*)&ths->g_hat[n-N/2];
2556 g_hat2=(C*)ths->g_hat;
2559 nfft_adjoint_1d_B(ths);
2563 FFTW(execute)(ths->my_fftw_plan2);
2567 if(ths->flags & PRE_PHI_HUT)
2570 c_phi_inv1=ths->c_phi_inv[0];
2571 c_phi_inv2=&ths->c_phi_inv[0][N/2];
2574 #pragma omp parallel for default(shared) private(k)
2576 for (k = 0; k < N/2; k++)
2578 f_hat1[k] = g_hat1[k] * c_phi_inv1[k];
2579 f_hat2[k] = g_hat2[k] * c_phi_inv2[k];
2587 #pragma omp parallel for default(shared) private(k)
2589 for (k = 0; k < N/2; k++)
2591 f_hat1[k] = g_hat1[k] / (PHI_HUT(ths->n[0],k-N/2,0));
2592 f_hat2[k] = g_hat2[k] / (PHI_HUT(ths->n[0],k,0));
2601 static void nfft_2d_init_fg_exp_l(R *fg_exp_l,
const INT m,
const R b)
2604 R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
2606 fg_exp_b0 = EXP(K(-1.0)/b);
2607 fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
2610 fg_exp_l[0] = K(1.0);
2611 for(l=1; l <= 2*m+1; l++)
2613 fg_exp_b2 = fg_exp_b1*fg_exp_b0;
2614 fg_exp_b1 *= fg_exp_b0_sq;
2615 fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
2619 static void nfft_trafo_2d_compute(C *fj,
const C *g,
const R *psij_const0,
2620 const R *psij_const1,
const R *xj0,
const R *xj1,
const INT n0,
2621 const INT n1,
const INT m)
2623 INT u0,o0,l0,u1,o1,l1;
2625 const R *psij0,*psij1;
2630 uo2(&u0,&o0,*xj0, n0, m);
2631 uo2(&u1,&o1,*xj1, n1, m);
2637 for(l0=0; l0<=2*m+1; l0++,psij0++)
2641 for(l1=0; l1<=2*m+1; l1++)
2642 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2645 for(l0=0; l0<=2*m+1; l0++,psij0++)
2649 for(l1=0; l1<2*m+1-o1; l1++)
2650 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2652 for(l1=0; l1<=o1; l1++)
2653 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2658 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2662 for(l1=0; l1<=2*m+1; l1++)
2663 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2665 for(l0=0; l0<=o0; l0++,psij0++)
2669 for(l1=0; l1<=2*m+1; l1++)
2670 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2675 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2679 for(l1=0; l1<2*m+1-o1; l1++)
2680 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2682 for(l1=0; l1<=o1; l1++)
2683 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2685 for(l0=0; l0<=o0; l0++,psij0++)
2689 for(l1=0; l1<2*m+1-o1; l1++)
2690 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2692 for(l1=0; l1<=o1; l1++)
2693 (*fj) += (*psij0) * (*psij1++) * (*gj++);
2700 static void nfft_adjoint_2d_compute_omp_atomic(
const C f, C *g,
2701 const R *psij_const0,
const R *psij_const1,
const R *xj0,
2702 const R *xj1,
const INT n0,
const INT n1,
const INT m)
2704 INT u0,o0,l0,u1,o1,l1;
2705 const INT lprod = (2*m+2) * (2*m+2);
2707 INT index_temp0[2*m+2];
2708 INT index_temp1[2*m+2];
2710 uo2(&u0,&o0,*xj0, n0, m);
2711 uo2(&u1,&o1,*xj1, n1, m);
2713 for (l0=0; l0<=2*m+1; l0++)
2714 index_temp0[l0] = (u0+l0)%n0;
2716 for (l1=0; l1<=2*m+1; l1++)
2717 index_temp1[l1] = (u1+l1)%n1;
2719 for(l0=0; l0<=2*m+1; l0++)
2721 for(l1=0; l1<=2*m+1; l1++)
2723 INT i = index_temp0[l0] * n1 + index_temp1[l1];
2725 R *lhs_real = (R*)lhs;
2726 C val = psij_const0[l0] * psij_const1[l1] * f;
2729 lhs_real[0] += CREAL(val);
2732 lhs_real[1] += CIMAG(val);
2757 static void nfft_adjoint_2d_compute_omp_blockwise(
const C f, C *g,
2758 const R *psij_const0,
const R *psij_const1,
const R *xj0,
2759 const R *xj1,
const INT n0,
const INT n1,
const INT m,
2760 const INT my_u0,
const INT my_o0)
2762 INT ar_u0,ar_o0,l0,u1,o1,l1;
2763 const INT lprod = (2*m+2) * (2*m+2);
2764 INT index_temp1[2*m+2];
2766 uo2(&ar_u0,&ar_o0,*xj0, n0, m);
2767 uo2(&u1,&o1,*xj1, n1, m);
2769 for (l1 = 0; l1 <= 2*m+1; l1++)
2770 index_temp1[l1] = (u1+l1)%n1;
2774 INT u0 = MAX(my_u0,ar_u0);
2775 INT o0 = MIN(my_o0,ar_o0);
2776 INT offset_psij = u0-ar_u0;
2778 assert(offset_psij >= 0);
2779 assert(o0-u0 <= 2*m+1);
2780 assert(offset_psij+o0-u0 <= 2*m+1);
2783 for (l0 = 0; l0 <= o0-u0; l0++)
2785 INT i0 = (u0+l0) * n1;
2786 const C val0 = psij_const0[offset_psij+l0];
2788 for(l1=0; l1<=2*m+1; l1++)
2789 g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
2794 INT u0 = MAX(my_u0,ar_u0);
2796 INT offset_psij = u0-ar_u0;
2798 assert(offset_psij >= 0);
2799 assert(o0-u0 <= 2*m+1);
2800 assert(offset_psij+o0-u0 <= 2*m+1);
2803 for (l0 = 0; l0 <= o0-u0; l0++)
2805 INT i0 = (u0+l0) * n1;
2806 const C val0 = psij_const0[offset_psij+l0];
2808 for(l1=0; l1<=2*m+1; l1++)
2809 g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
2813 o0 = MIN(my_o0,ar_o0);
2814 offset_psij += my_u0-ar_u0+n0;
2819 assert(o0-u0 <= 2*m+1);
2820 assert(offset_psij+o0-u0 <= 2*m+1);
2824 for (l0 = 0; l0 <= o0-u0; l0++)
2826 INT i0 = (u0+l0) * n1;
2827 const C val0 = psij_const0[offset_psij+l0];
2829 for(l1=0; l1<=2*m+1; l1++)
2830 g[i0 + index_temp1[l1]] += val0 * psij_const1[l1] * f;
2837 static void nfft_adjoint_2d_compute_serial(
const C *fj, C *g,
2838 const R *psij_const0,
const R *psij_const1,
const R *xj0,
2839 const R *xj1,
const INT n0,
const INT n1,
const INT m)
2841 INT u0,o0,l0,u1,o1,l1;
2843 const R *psij0,*psij1;
2848 uo2(&u0,&o0,*xj0, n0, m);
2849 uo2(&u1,&o1,*xj1, n1, m);
2853 for(l0=0; l0<=2*m+1; l0++,psij0++)
2857 for(l1=0; l1<=2*m+1; l1++)
2858 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2861 for(l0=0; l0<=2*m+1; l0++,psij0++)
2865 for(l1=0; l1<2*m+1-o1; l1++)
2866 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2868 for(l1=0; l1<=o1; l1++)
2869 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2874 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2878 for(l1=0; l1<=2*m+1; l1++)
2879 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2881 for(l0=0; l0<=o0; l0++,psij0++)
2885 for(l1=0; l1<=2*m+1; l1++)
2886 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2891 for(l0=0; l0<2*m+1-o0; l0++,psij0++)
2895 for(l1=0; l1<2*m+1-o1; l1++)
2896 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2898 for(l1=0; l1<=o1; l1++)
2899 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2901 for(l0=0; l0<=o0; l0++,psij0++)
2905 for(l1=0; l1<2*m+1-o1; l1++)
2906 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2908 for(l1=0; l1<=o1; l1++)
2909 (*gj++) += (*psij0) * (*psij1++) * (*fj);
2915 static void nfft_trafo_2d_B(
X(plan) *ths)
2917 const C *g = (C*)ths->g;
2918 const INT n0 = ths->n[0];
2919 const INT n1 = ths->n[1];
2920 const INT M = ths->M_total;
2921 const INT m = ths->m;
2925 if(ths->flags & PRE_FULL_PSI)
2927 const INT lprod = (2*m+2) * (2*m+2);
2929 #pragma omp parallel for default(shared) private(k)
2931 for (k = 0; k < M; k++)
2934 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2936 for (l = 0; l < lprod; l++)
2937 ths->f[j] += ths->psi[j*lprod+l] * g[ths->psi_index_g[j*lprod+l]];
2942 if(ths->flags & PRE_PSI)
2945 #pragma omp parallel for default(shared) private(k)
2947 for (k = 0; k < M; k++)
2949 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2950 nfft_trafo_2d_compute(ths->f+j, g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
2956 if(ths->flags & PRE_FG_PSI)
2958 R fg_exp_l[2*(2*m+2)];
2960 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
2961 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
2964 #pragma omp parallel for default(shared) private(k)
2966 for (k = 0; k < M; k++)
2968 R psij_const[2*(2*m+2)];
2969 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
2971 R fg_psij0 = ths->psi[2*j*2];
2972 R fg_psij1 = ths->psi[2*j*2+1];
2973 R fg_psij2 = K(1.0);
2975 psij_const[0] = fg_psij0;
2976 for (l = 1; l <= 2*m+1; l++)
2978 fg_psij2 *= fg_psij1;
2979 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
2982 fg_psij0 = ths->psi[2*(j*2+1)];
2983 fg_psij1 = ths->psi[2*(j*2+1)+1];
2985 psij_const[2*m+2] = fg_psij0;
2986 for (l = 1; l <= 2*m+1; l++)
2988 fg_psij2 *= fg_psij1;
2989 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
2992 nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
2998 if(ths->flags & FG_PSI)
3000 R fg_exp_l[2*(2*m+2)];
3002 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3003 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3008 #pragma omp parallel for default(shared) private(k)
3010 for (k = 0; k < M; k++)
3013 R fg_psij0, fg_psij1, fg_psij2;
3014 R psij_const[2*(2*m+2)];
3015 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3017 uo(ths, j, &u, &o, (INT)0);
3018 fg_psij0 = (PHI(ths->n[0], ths->x[2*j] - ((R)u) / (R)(n0),0));
3019 fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[2*j]) - (R)(u)) / ths->b[0]);
3021 psij_const[0] = fg_psij0;
3022 for (l = 1; l <= 2*m+1; l++)
3024 fg_psij2 *= fg_psij1;
3025 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3028 uo(ths,j,&u,&o, (INT)1);
3029 fg_psij0 = (PHI(ths->n[1], ths->x[2*j+1] - ((R)u) / (R)(n1),1));
3030 fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[2*j+1]) - (R)(u)) / ths->b[1]);
3032 psij_const[2*m+2] = fg_psij0;
3033 for(l=1; l<=2*m+1; l++)
3035 fg_psij2 *= fg_psij1;
3036 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3039 nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3045 if(ths->flags & PRE_LIN_PSI)
3047 const INT K = ths->K, ip_s = K / (m + 2);
3052 #pragma omp parallel for default(shared) private(k)
3054 for (k = 0; k < M; k++)
3059 R psij_const[2*(2*m+2)];
3060 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3062 uo(ths,j,&u,&o,(INT)0);
3063 ip_y = FABS((R)(n0) * ths->x[2*j] - (R)(u)) * ((R)ip_s);
3064 ip_u = (INT)LRINT(FLOOR(ip_y));
3065 ip_w = ip_y - (R)(ip_u);
3066 for (l = 0; l < 2*m+2; l++)
3067 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
3069 uo(ths,j,&u,&o,(INT)1);
3070 ip_y = FABS((R)(n1) * ths->x[2*j+1] - (R)(u)) * ((R)ip_s);
3071 ip_u = (INT)(LRINT(FLOOR(ip_y)));
3072 ip_w = ip_y - (R)(ip_u);
3073 for (l = 0; l < 2*m+2; l++)
3074 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
3076 nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3086 #pragma omp parallel for default(shared) private(k)
3088 for (k = 0; k < M; k++)
3090 R psij_const[2*(2*m+2)];
3092 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3094 uo(ths,j,&u,&o,(INT)0);
3095 for (l = 0; l <= 2*m+1; l++)
3096 psij_const[l]=(PHI(ths->n[0], ths->x[2*j] - ((R)((u+l))) / (R)(n0),0));
3098 uo(ths,j,&u,&o,(INT)1);
3099 for (l = 0; l <= 2*m+1; l++)
3100 psij_const[2*m+2+l] = (PHI(ths->n[1], ths->x[2*j+1] - ((R)((u+l)))/(R)(n1),1));
3102 nfft_trafo_2d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3107 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A \
3109 assert(ar_x[2*k] >= min_u_a || k == M-1); \
3111 assert(ar_x[2*k-2] < min_u_a); \
3114 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A
3118 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B \
3120 assert(ar_x[2*k] >= min_u_b || k == M-1); \
3122 assert(ar_x[2*k-2] < min_u_b); \
3125 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B
3128 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
3129 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3130 ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), \
3131 ths->x+2*j, ths->x+2*j+1, n0, n1, m, my_u0, my_o0);
3133 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
3135 R psij_const[2*(2*m+2)]; \
3137 R fg_psij0 = ths->psi[2*j*2]; \
3138 R fg_psij1 = ths->psi[2*j*2+1]; \
3139 R fg_psij2 = K(1.0); \
3141 psij_const[0] = fg_psij0; \
3142 for(l=1; l<=2*m+1; l++) \
3144 fg_psij2 *= fg_psij1; \
3145 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
3148 fg_psij0 = ths->psi[2*(j*2+1)]; \
3149 fg_psij1 = ths->psi[2*(j*2+1)+1]; \
3150 fg_psij2 = K(1.0); \
3151 psij_const[2*m+2] = fg_psij0; \
3152 for(l=1; l<=2*m+1; l++) \
3154 fg_psij2 *= fg_psij1; \
3155 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
3158 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3159 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3160 n0, n1, m, my_u0, my_o0); \
3163 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
3165 R psij_const[2*(2*m+2)]; \
3166 R fg_psij0, fg_psij1, fg_psij2; \
3169 uo(ths,j,&u,&o,(INT)0); \
3170 fg_psij0 = (PHI(ths->n[0],ths->x[2*j]-((R)u)/n0,0)); \
3171 fg_psij1 = EXP(K(2.0)*(n0*(ths->x[2*j]) - u)/ths->b[0]); \
3172 fg_psij2 = K(1.0); \
3173 psij_const[0] = fg_psij0; \
3174 for(l=1; l<=2*m+1; l++) \
3176 fg_psij2 *= fg_psij1; \
3177 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
3180 uo(ths,j,&u,&o,(INT)1); \
3181 fg_psij0 = (PHI(ths->n[1],ths->x[2*j+1]-((R)u)/n1,1)); \
3182 fg_psij1 = EXP(K(2.0)*(n1*(ths->x[2*j+1]) - u)/ths->b[1]); \
3183 fg_psij2 = K(1.0); \
3184 psij_const[2*m+2] = fg_psij0; \
3185 for(l=1; l<=2*m+1; l++) \
3187 fg_psij2 *= fg_psij1; \
3188 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
3191 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3192 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3193 n0, n1, m, my_u0, my_o0); \
3196 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
3198 R psij_const[2*(2*m+2)]; \
3203 uo(ths,j,&u,&o,(INT)0); \
3204 ip_y = FABS(n0*(ths->x[2*j]) - u)*((R)ip_s); \
3205 ip_u = LRINT(FLOOR(ip_y)); \
3207 for(l=0; l < 2*m+2; l++) \
3208 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
3209 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w); \
3211 uo(ths,j,&u,&o,(INT)1); \
3212 ip_y = FABS(n1*(ths->x[2*j+1]) - u)*((R)ip_s); \
3213 ip_u = LRINT(FLOOR(ip_y)); \
3215 for(l=0; l < 2*m+2; l++) \
3216 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
3217 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
3219 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3220 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3221 n0, n1, m, my_u0, my_o0); \
3224 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
3226 R psij_const[2*(2*m+2)]; \
3229 uo(ths,j,&u,&o,(INT)0); \
3230 for(l=0;l<=2*m+1;l++) \
3231 psij_const[l]=(PHI(ths->n[0],ths->x[2*j]-((R)((u+l)))/n0,0)); \
3233 uo(ths,j,&u,&o,(INT)1); \
3234 for(l=0;l<=2*m+1;l++) \
3235 psij_const[2*m+2+l]=(PHI(ths->n[1],ths->x[2*j+1]-((R)((u+l)))/n1,1)); \
3237 nfft_adjoint_2d_compute_omp_blockwise(ths->f[j], g, \
3238 psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, \
3239 n0, n1, m, my_u0, my_o0); \
3242 #define MACRO_adjoint_2d_B_OMP_BLOCKWISE(whichone) \
3244 if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
3246 _Pragma("omp parallel private(k)") \
3248 INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
3249 INT *ar_x = ths->index_x; \
3251 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
3252 &min_u_b, &max_u_b, 2, ths->n, m); \
3254 if (min_u_a != -1) \
3256 k = index_x_binary_search(ar_x, M, min_u_a); \
3258 MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_A \
3262 INT u_prod = ar_x[2*k]; \
3263 INT j = ar_x[2*k+1]; \
3265 if (u_prod < min_u_a || u_prod > max_u_a) \
3268 MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
3274 if (min_u_b != -1) \
3276 INT k = index_x_binary_search(ar_x, M, min_u_b); \
3278 MACRO_adjoint_2d_B_OMP_BLOCKWISE_ASSERT_B \
3282 INT u_prod = ar_x[2*k]; \
3283 INT j = ar_x[2*k+1]; \
3285 if (u_prod < min_u_b || u_prod > max_u_b) \
3288 MACRO_adjoint_2d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
3299 static void nfft_adjoint_2d_B(
X(plan) *ths)
3301 const INT n0 = ths->n[0];
3302 const INT n1 = ths->n[1];
3303 const INT M = ths->M_total;
3304 const INT m = ths->m;
3308 memset(g, 0, (
size_t)(ths->n_total) *
sizeof(C));
3310 if(ths->flags & PRE_FULL_PSI)
3312 nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
3313 (INT)2, ths->n, m, ths->flags, ths->index_x);
3317 if(ths->flags & PRE_PSI)
3320 MACRO_adjoint_2d_B_OMP_BLOCKWISE(PRE_PSI)
3324 #pragma omp parallel for default(shared) private(k)
3326 for (k = 0; k < M; k++)
3328 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3330 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3332 nfft_adjoint_2d_compute_serial(ths->f+j, g, ths->psi+j*2*(2*m+2), ths->psi+(j*2+1)*(2*m+2), ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3338 if(ths->flags & PRE_FG_PSI)
3340 R fg_exp_l[2*(2*m+2)];
3342 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3343 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3346 MACRO_adjoint_2d_B_OMP_BLOCKWISE(PRE_FG_PSI)
3351 #pragma omp parallel for default(shared) private(k)
3353 for (k = 0; k < M; k++)
3355 R psij_const[2*(2*m+2)];
3356 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3358 R fg_psij0 = ths->psi[2*j*2];
3359 R fg_psij1 = ths->psi[2*j*2+1];
3360 R fg_psij2 = K(1.0);
3362 psij_const[0] = fg_psij0;
3363 for(l=1; l<=2*m+1; l++)
3365 fg_psij2 *= fg_psij1;
3366 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3369 fg_psij0 = ths->psi[2*(j*2+1)];
3370 fg_psij1 = ths->psi[2*(j*2+1)+1];
3372 psij_const[2*m+2] = fg_psij0;
3373 for(l=1; l<=2*m+1; l++)
3375 fg_psij2 *= fg_psij1;
3376 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3380 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3382 nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3389 if(ths->flags & FG_PSI)
3391 R fg_exp_l[2*(2*m+2)];
3393 nfft_2d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
3394 nfft_2d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
3399 MACRO_adjoint_2d_B_OMP_BLOCKWISE(FG_PSI)
3403 #pragma omp parallel for default(shared) private(k)
3405 for (k = 0; k < M; k++)
3408 R fg_psij0, fg_psij1, fg_psij2;
3409 R psij_const[2*(2*m+2)];
3410 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3412 uo(ths,j,&u,&o,(INT)0);
3413 fg_psij0 = (PHI(ths->n[0], ths->x[2*j] - ((R)u)/(R)(n0),0));
3414 fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[2*j]) - (R)(u)) / ths->b[0]);
3416 psij_const[0] = fg_psij0;
3417 for(l=1; l<=2*m+1; l++)
3419 fg_psij2 *= fg_psij1;
3420 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
3423 uo(ths,j,&u,&o,(INT)1);
3424 fg_psij0 = (PHI(ths->n[1], ths->x[2*j+1] - ((R)u) / (R)(n1),1));
3425 fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[2*j+1]) - (R)(u)) / ths->b[1]);
3427 psij_const[2*m+2] = fg_psij0;
3428 for(l=1; l<=2*m+1; l++)
3430 fg_psij2 *= fg_psij1;
3431 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
3435 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3437 nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3444 if(ths->flags & PRE_LIN_PSI)
3446 const INT K = ths->K;
3447 const INT ip_s = K / (m + 2);
3452 MACRO_adjoint_2d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
3456 #pragma omp parallel for default(shared) private(k)
3458 for (k = 0; k < M; k++)
3463 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3464 R psij_const[2*(2*m+2)];
3466 uo(ths,j,&u,&o,(INT)0);
3467 ip_y = FABS((R)(n0) * (ths->x[2*j]) - (R)(u)) * ((R)ip_s);
3468 ip_u = (INT)(LRINT(FLOOR(ip_y)));
3469 ip_w = ip_y - (R)(ip_u);
3470 for(l=0; l < 2*m+2; l++)
3471 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
3472 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
3474 uo(ths,j,&u,&o,(INT)1);
3475 ip_y = FABS((R)(n1) * (ths->x[2*j+1]) - (R)(u)) * ((R)ip_s);
3476 ip_u = (INT)(LRINT(FLOOR(ip_y)));
3477 ip_w = ip_y - (R)(ip_u);
3478 for(l=0; l < 2*m+2; l++)
3479 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
3480 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
3483 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3485 nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3495 MACRO_adjoint_2d_B_OMP_BLOCKWISE(NO_PSI)
3499 #pragma omp parallel for default(shared) private(k)
3501 for (k = 0; k < M; k++)
3504 R psij_const[2*(2*m+2)];
3505 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
3507 uo(ths,j,&u,&o,(INT)0);
3508 for(l=0;l<=2*m+1;l++)
3509 psij_const[l]=(PHI(ths->n[0], ths->x[2*j] - ((R)((u+l))) / (R)(n0),0));
3511 uo(ths,j,&u,&o,(INT)1);
3512 for(l=0;l<=2*m+1;l++)
3513 psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[2*j+1] - ((R)((u+l))) / (R)(n1),1));
3516 nfft_adjoint_2d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3518 nfft_adjoint_2d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, ths->x+2*j, ths->x+2*j+1, n0, n1, m);
3524 void X(trafo_2d)(
X(plan) *ths)
3526 INT k0,k1,n0,n1,N0,N1;
3528 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12;
3529 R ck01, ck02, ck11, ck12;
3530 C *g_hat11,*f_hat11,*g_hat21,*f_hat21,*g_hat12,*f_hat12,*g_hat22,*f_hat22;
3540 f_hat=(C*)ths->f_hat;
3541 g_hat=(C*)ths->g_hat;
3545 #pragma omp parallel for default(shared) private(k0)
3546 for (k0 = 0; k0 < ths->n_total; k0++)
3547 ths->g_hat[k0] = 0.0;
3549 memset(ths->g_hat, 0, (
size_t)(ths->n_total) *
sizeof(C));
3551 if(ths->flags & PRE_PHI_HUT)
3553 c_phi_inv01=ths->c_phi_inv[0];
3554 c_phi_inv02=&ths->c_phi_inv[0][N0/2];
3557 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,c_phi_inv11,c_phi_inv12,g_hat11,f_hat11,g_hat21,f_hat21,g_hat12,f_hat12,g_hat22,f_hat22,ck11,ck12)
3559 for(k0=0;k0<N0/2;k0++)
3561 ck01=c_phi_inv01[k0];
3562 ck02=c_phi_inv02[k0];
3564 c_phi_inv11=ths->c_phi_inv[1];
3565 c_phi_inv12=&ths->c_phi_inv[1][N1/2];
3567 g_hat11=g_hat + (n0-(N0/2)+k0)*n1+n1-(N1/2);
3568 f_hat11=f_hat + k0*N1;
3569 g_hat21=g_hat + k0*n1+n1-(N1/2);
3570 f_hat21=f_hat + ((N0/2)+k0)*N1;
3571 g_hat12=g_hat + (n0-(N0/2)+k0)*n1;
3572 f_hat12=f_hat + k0*N1+(N1/2);
3573 g_hat22=g_hat + k0*n1;
3574 f_hat22=f_hat + ((N0/2)+k0)*N1+(N1/2);
3576 for(k1=0;k1<N1/2;k1++)
3578 ck11=c_phi_inv11[k1];
3579 ck12=c_phi_inv12[k1];
3581 g_hat11[k1] = f_hat11[k1] * ck01 * ck11;
3582 g_hat21[k1] = f_hat21[k1] * ck02 * ck11;
3583 g_hat12[k1] = f_hat12[k1] * ck01 * ck12;
3584 g_hat22[k1] = f_hat22[k1] * ck02 * ck12;
3590 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,ck11,ck12)
3592 for(k0=0;k0<N0/2;k0++)
3594 ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
3595 ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
3596 for(k1=0;k1<N1/2;k1++)
3598 ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
3599 ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
3600 g_hat[(n0-N0/2+k0)*n1+n1-N1/2+k1] = f_hat[k0*N1+k1] * ck01 * ck11;
3601 g_hat[k0*n1+n1-N1/2+k1] = f_hat[(N0/2+k0)*N1+k1] * ck02 * ck11;
3602 g_hat[(n0-N0/2+k0)*n1+k1] = f_hat[k0*N1+N1/2+k1] * ck01 * ck12;
3603 g_hat[k0*n1+k1] = f_hat[(N0/2+k0)*N1+N1/2+k1] * ck02 * ck12;
3610 FFTW(execute)(ths->my_fftw_plan1);
3614 nfft_trafo_2d_B(ths);
3618 void X(adjoint_2d)(
X(plan) *ths)
3620 INT k0,k1,n0,n1,N0,N1;
3622 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12;
3623 R ck01, ck02, ck11, ck12;
3624 C *g_hat11,*f_hat11,*g_hat21,*f_hat21,*g_hat12,*f_hat12,*g_hat22,*f_hat22;
3634 f_hat=(C*)ths->f_hat;
3635 g_hat=(C*)ths->g_hat;
3638 nfft_adjoint_2d_B(ths);
3642 FFTW(execute)(ths->my_fftw_plan2);
3646 if(ths->flags & PRE_PHI_HUT)
3648 c_phi_inv01=ths->c_phi_inv[0];
3649 c_phi_inv02=&ths->c_phi_inv[0][N0/2];
3652 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,c_phi_inv11,c_phi_inv12,g_hat11,f_hat11,g_hat21,f_hat21,g_hat12,f_hat12,g_hat22,f_hat22,ck11,ck12)
3654 for(k0=0;k0<N0/2;k0++)
3656 ck01=c_phi_inv01[k0];
3657 ck02=c_phi_inv02[k0];
3659 c_phi_inv11=ths->c_phi_inv[1];
3660 c_phi_inv12=&ths->c_phi_inv[1][N1/2];
3662 g_hat11=g_hat + (n0-(N0/2)+k0)*n1+n1-(N1/2);
3663 f_hat11=f_hat + k0*N1;
3664 g_hat21=g_hat + k0*n1+n1-(N1/2);
3665 f_hat21=f_hat + ((N0/2)+k0)*N1;
3666 g_hat12=g_hat + (n0-(N0/2)+k0)*n1;
3667 f_hat12=f_hat + k0*N1+(N1/2);
3668 g_hat22=g_hat + k0*n1;
3669 f_hat22=f_hat + ((N0/2)+k0)*N1+(N1/2);
3671 for(k1=0;k1<N1/2;k1++)
3673 ck11=c_phi_inv11[k1];
3674 ck12=c_phi_inv12[k1];
3676 f_hat11[k1] = g_hat11[k1] * ck01 * ck11;
3677 f_hat21[k1] = g_hat21[k1] * ck02 * ck11;
3678 f_hat12[k1] = g_hat12[k1] * ck01 * ck12;
3679 f_hat22[k1] = g_hat22[k1] * ck02 * ck12;
3685 #pragma omp parallel for default(shared) private(k0,k1,ck01,ck02,ck11,ck12)
3687 for(k0=0;k0<N0/2;k0++)
3689 ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
3690 ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
3691 for(k1=0;k1<N1/2;k1++)
3693 ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
3694 ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
3695 f_hat[k0*N1+k1] = g_hat[(n0-N0/2+k0)*n1+n1-N1/2+k1] * ck01 * ck11;
3696 f_hat[(N0/2+k0)*N1+k1] = g_hat[k0*n1+n1-N1/2+k1] * ck02 * ck11;
3697 f_hat[k0*N1+N1/2+k1] = g_hat[(n0-N0/2+k0)*n1+k1] * ck01 * ck12;
3698 f_hat[(N0/2+k0)*N1+N1/2+k1] = g_hat[k0*n1+k1] * ck02 * ck12;
3706 static void nfft_3d_init_fg_exp_l(R *fg_exp_l,
const INT m,
const R b)
3709 R fg_exp_b0, fg_exp_b1, fg_exp_b2, fg_exp_b0_sq;
3711 fg_exp_b0 = EXP(-K(1.0) / b);
3712 fg_exp_b0_sq = fg_exp_b0*fg_exp_b0;
3715 fg_exp_l[0] = K(1.0);
3716 for(l=1; l <= 2*m+1; l++)
3718 fg_exp_b2 = fg_exp_b1*fg_exp_b0;
3719 fg_exp_b1 *= fg_exp_b0_sq;
3720 fg_exp_l[l] = fg_exp_l[l-1]*fg_exp_b2;
3724 static void nfft_trafo_3d_compute(C *fj,
const C *g,
const R *psij_const0,
3725 const R *psij_const1,
const R *psij_const2,
const R *xj0,
const R *xj1,
3726 const R *xj2,
const INT n0,
const INT n1,
const INT n2,
const INT m)
3728 INT u0, o0, l0, u1, o1, l1, u2, o2, l2;
3730 const R *psij0, *psij1, *psij2;
3732 psij0 = psij_const0;
3733 psij1 = psij_const1;
3734 psij2 = psij_const2;
3736 uo2(&u0, &o0, *xj0, n0, m);
3737 uo2(&u1, &o1, *xj1, n1, m);
3738 uo2(&u2, &o2, *xj2, n2, m);
3745 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
3747 psij1 = psij_const1;
3748 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3750 psij2 = psij_const2;
3751 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3752 for (l2 = 0; l2 <= 2 * m + 1; l2++)
3753 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3758 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
3760 psij1 = psij_const1;
3761 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3763 psij2 = psij_const2;
3764 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3765 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3766 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3767 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
3768 for (l2 = 0; l2 <= o2; l2++)
3769 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3774 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
3776 psij1 = psij_const1;
3777 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3779 psij2 = psij_const2;
3780 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3781 for (l2 = 0; l2 <= 2 * m + 1; l2++)
3782 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3784 for (l1 = 0; l1 <= o1; l1++, psij1++)
3786 psij2 = psij_const2;
3787 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
3788 for (l2 = 0; l2 <= 2 * m + 1; l2++)
3789 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3794 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
3796 psij1 = psij_const1;
3797 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3799 psij2 = psij_const2;
3800 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3801 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3802 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3803 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
3804 for (l2 = 0; l2 <= o2; l2++)
3805 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3807 for (l1 = 0; l1 <= o1; l1++, psij1++)
3809 psij2 = psij_const2;
3810 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
3811 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3812 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3813 gj = g + ((u0 + l0) * n1 + l1) * n2;
3814 for (l2 = 0; l2 <= o2; l2++)
3815 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3823 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
3825 psij1 = psij_const1;
3826 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3828 psij2 = psij_const2;
3829 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3830 for (l2 = 0; l2 <= 2 * m + 1; l2++)
3831 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3835 for (l0 = 0; l0 <= o0; l0++, psij0++)
3837 psij1 = psij_const1;
3838 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3840 psij2 = psij_const2;
3841 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
3842 for (l2 = 0; l2 <= 2 * m + 1; l2++)
3843 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3848 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
3850 psij1 = psij_const1;
3851 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3853 psij2 = psij_const2;
3854 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3855 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3856 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3857 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
3858 for (l2 = 0; l2 <= o2; l2++)
3859 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3863 for (l0 = 0; l0 <= o0; l0++, psij0++)
3865 psij1 = psij_const1;
3866 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
3868 psij2 = psij_const2;
3869 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
3870 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3871 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3872 gj = g + (l0 * n1 + (u1 + l1)) * n2;
3873 for (l2 = 0; l2 <= o2; l2++)
3874 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3881 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
3883 psij1 = psij_const1;
3884 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3886 psij2 = psij_const2;
3887 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3888 for (l2 = 0; l2 <= 2 * m + 1; l2++)
3889 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3891 for (l1 = 0; l1 <= o1; l1++, psij1++)
3893 psij2 = psij_const2;
3894 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
3895 for (l2 = 0; l2 <= 2 * m + 1; l2++)
3896 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3899 for (l0 = 0; l0 <= o0; l0++, psij0++)
3901 psij1 = psij_const1;
3902 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3904 psij2 = psij_const2;
3905 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
3906 for (l2 = 0; l2 <= 2 * m + 1; l2++)
3907 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3909 for (l1 = 0; l1 <= o1; l1++, psij1++)
3911 psij2 = psij_const2;
3912 gj = g + (l0 * n1 + l1) * n2 + u2;
3913 for (l2 = 0; l2 <= 2 * m + 1; l2++)
3914 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3919 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
3921 psij1 = psij_const1;
3922 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3924 psij2 = psij_const2;
3925 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
3926 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3927 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3928 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
3929 for (l2 = 0; l2 <= o2; l2++)
3930 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3932 for (l1 = 0; l1 <= o1; l1++, psij1++)
3934 psij2 = psij_const2;
3935 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
3936 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3937 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3938 gj = g + ((u0 + l0) * n1 + l1) * n2;
3939 for (l2 = 0; l2 <= o2; l2++)
3940 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3944 for (l0 = 0; l0 <= o0; l0++, psij0++)
3946 psij1 = psij_const1;
3947 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
3949 psij2 = psij_const2;
3950 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
3951 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3952 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3953 gj = g + (l0 * n1 + (u1 + l1)) * n2;
3954 for (l2 = 0; l2 <= o2; l2++)
3955 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3957 for (l1 = 0; l1 <= o1; l1++, psij1++)
3959 psij2 = psij_const2;
3960 gj = g + (l0 * n1 + l1) * n2 + u2;
3961 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
3962 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3963 gj = g + (l0 * n1 + l1) * n2;
3964 for (l2 = 0; l2 <= o2; l2++)
3965 (*fj) += (*psij0) * (*psij1) * (*psij2++) * (*gj++);
3993 static void nfft_adjoint_3d_compute_omp_blockwise(
const C f, C *g,
3994 const R *psij_const0,
const R *psij_const1,
const R *psij_const2,
3995 const R *xj0,
const R *xj1,
const R *xj2,
3996 const INT n0,
const INT n1,
const INT n2,
const INT m,
3997 const INT my_u0,
const INT my_o0)
3999 INT ar_u0,ar_o0,l0,u1,o1,l1,u2,o2,l2;
4000 const INT lprod = (2*m+2) * (2*m+2) * (2*m+2);
4002 INT index_temp1[2*m+2];
4003 INT index_temp2[2*m+2];
4005 uo2(&ar_u0,&ar_o0,*xj0, n0, m);
4006 uo2(&u1,&o1,*xj1, n1, m);
4007 uo2(&u2,&o2,*xj2, n2, m);
4009 for (l1=0; l1<=2*m+1; l1++)
4010 index_temp1[l1] = (u1+l1)%n1;
4012 for (l2=0; l2<=2*m+1; l2++)
4013 index_temp2[l2] = (u2+l2)%n2;
4017 INT u0 = MAX(my_u0,ar_u0);
4018 INT o0 = MIN(my_o0,ar_o0);
4019 INT offset_psij = u0-ar_u0;
4021 assert(offset_psij >= 0);
4022 assert(o0-u0 <= 2*m+1);
4023 assert(offset_psij+o0-u0 <= 2*m+1);
4026 for (l0 = 0; l0 <= o0-u0; l0++)
4028 const INT i0 = (u0+l0) * n1;
4029 const C val0 = psij_const0[offset_psij+l0];
4031 for(l1=0; l1<=2*m+1; l1++)
4033 const INT i1 = (i0 + index_temp1[l1]) * n2;
4034 const C val1 = psij_const1[l1];
4036 for(l2=0; l2<=2*m+1; l2++)
4037 g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
4043 INT u0 = MAX(my_u0,ar_u0);
4045 INT offset_psij = u0-ar_u0;
4047 assert(offset_psij >= 0);
4048 assert(o0-u0 <= 2*m+1);
4049 assert(offset_psij+o0-u0 <= 2*m+1);
4052 for (l0 = 0; l0 <= o0-u0; l0++)
4054 INT i0 = (u0+l0) * n1;
4055 const C val0 = psij_const0[offset_psij+l0];
4057 for(l1=0; l1<=2*m+1; l1++)
4059 const INT i1 = (i0 + index_temp1[l1]) * n2;
4060 const C val1 = psij_const1[l1];
4062 for(l2=0; l2<=2*m+1; l2++)
4063 g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
4068 o0 = MIN(my_o0,ar_o0);
4069 offset_psij += my_u0-ar_u0+n0;
4074 assert(o0-u0 <= 2*m+1);
4075 assert(offset_psij+o0-u0 <= 2*m+1);
4078 for (l0 = 0; l0 <= o0-u0; l0++)
4080 INT i0 = (u0+l0) * n1;
4081 const C val0 = psij_const0[offset_psij+l0];
4083 for(l1=0; l1<=2*m+1; l1++)
4085 const INT i1 = (i0 + index_temp1[l1]) * n2;
4086 const C val1 = psij_const1[l1];
4088 for(l2=0; l2<=2*m+1; l2++)
4089 g[i1 + index_temp2[l2]] += val0 * val1 * psij_const2[l2] * f;
4098 static void nfft_adjoint_3d_compute_omp_atomic(
const C f, C *g,
4099 const R *psij_const0,
const R *psij_const1,
const R *psij_const2,
4100 const R *xj0,
const R *xj1,
const R *xj2,
4101 const INT n0,
const INT n1,
const INT n2,
const INT m)
4103 INT u0,o0,l0,u1,o1,l1,u2,o2,l2;
4104 const INT lprod = (2*m+2) * (2*m+2) * (2*m+2);
4106 INT index_temp0[2*m+2];
4107 INT index_temp1[2*m+2];
4108 INT index_temp2[2*m+2];
4110 uo2(&u0,&o0,*xj0, n0, m);
4111 uo2(&u1,&o1,*xj1, n1, m);
4112 uo2(&u2,&o2,*xj2, n2, m);
4114 for (l0=0; l0<=2*m+1; l0++)
4115 index_temp0[l0] = (u0+l0)%n0;
4117 for (l1=0; l1<=2*m+1; l1++)
4118 index_temp1[l1] = (u1+l1)%n1;
4120 for (l2=0; l2<=2*m+1; l2++)
4121 index_temp2[l2] = (u2+l2)%n2;
4123 for(l0=0; l0<=2*m+1; l0++)
4125 for(l1=0; l1<=2*m+1; l1++)
4127 for(l2=0; l2<=2*m+1; l2++)
4129 INT i = (index_temp0[l0] * n1 + index_temp1[l1]) * n2 + index_temp2[l2];
4131 R *lhs_real = (R*)lhs;
4132 C val = psij_const0[l0] * psij_const1[l1] * psij_const2[l2] * f;
4135 lhs_real[0] += CREAL(val);
4138 lhs_real[1] += CIMAG(val);
4146 static void nfft_adjoint_3d_compute_serial(
const C *fj, C *g,
4147 const R *psij_const0,
const R *psij_const1,
const R *psij_const2,
const R *xj0,
4148 const R *xj1,
const R *xj2,
const INT n0,
const INT n1,
const INT n2,
4151 INT u0, o0, l0, u1, o1, l1, u2, o2, l2;
4153 const R *psij0, *psij1, *psij2;
4155 psij0 = psij_const0;
4156 psij1 = psij_const1;
4157 psij2 = psij_const2;
4159 uo2(&u0, &o0, *xj0, n0, m);
4160 uo2(&u1, &o1, *xj1, n1, m);
4161 uo2(&u2, &o2, *xj2, n2, m);
4166 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4168 psij1 = psij_const1;
4169 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4171 psij2 = psij_const2;
4172 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4173 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4174 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4179 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4181 psij1 = psij_const1;
4182 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4184 psij2 = psij_const2;
4185 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4186 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4187 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4188 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4189 for (l2 = 0; l2 <= o2; l2++)
4190 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4195 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4197 psij1 = psij_const1;
4198 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4200 psij2 = psij_const2;
4201 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4202 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4203 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4205 for (l1 = 0; l1 <= o1; l1++, psij1++)
4207 psij2 = psij_const2;
4208 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4209 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4210 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4215 for (l0 = 0; l0 <= 2 * m + 1; l0++, psij0++)
4217 psij1 = psij_const1;
4218 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4220 psij2 = psij_const2;
4221 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4222 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4223 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4224 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4225 for (l2 = 0; l2 <= o2; l2++)
4226 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4228 for (l1 = 0; l1 <= o1; l1++, psij1++)
4230 psij2 = psij_const2;
4231 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4232 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4233 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4234 gj = g + ((u0 + l0) * n1 + l1) * n2;
4235 for (l2 = 0; l2 <= o2; l2++)
4236 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4244 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4246 psij1 = psij_const1;
4247 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4249 psij2 = psij_const2;
4250 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4251 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4252 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4256 for (l0 = 0; l0 <= o0; l0++, psij0++)
4258 psij1 = psij_const1;
4259 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4261 psij2 = psij_const2;
4262 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4263 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4264 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4269 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4271 psij1 = psij_const1;
4272 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4274 psij2 = psij_const2;
4275 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4276 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4277 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4278 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4279 for (l2 = 0; l2 <= o2; l2++)
4280 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4284 for (l0 = 0; l0 <= o0; l0++, psij0++)
4286 psij1 = psij_const1;
4287 for (l1 = 0; l1 <= 2 * m + 1; l1++, psij1++)
4289 psij2 = psij_const2;
4290 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4291 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4292 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4293 gj = g + (l0 * n1 + (u1 + l1)) * n2;
4294 for (l2 = 0; l2 <= o2; l2++)
4295 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4302 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4304 psij1 = psij_const1;
4305 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4307 psij2 = psij_const2;
4308 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4309 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4310 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4312 for (l1 = 0; l1 <= o1; l1++, psij1++)
4314 psij2 = psij_const2;
4315 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4316 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4317 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4320 for (l0 = 0; l0 <= o0; l0++, psij0++)
4322 psij1 = psij_const1;
4323 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4325 psij2 = psij_const2;
4326 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4327 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4328 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4330 for (l1 = 0; l1 <= o1; l1++, psij1++)
4332 psij2 = psij_const2;
4333 gj = g + (l0 * n1 + l1) * n2 + u2;
4334 for (l2 = 0; l2 <= 2 * m + 1; l2++)
4335 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4340 for (l0 = 0; l0 < 2 * m + 1 - o0; l0++, psij0++)
4342 psij1 = psij_const1;
4343 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4345 psij2 = psij_const2;
4346 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2 + u2;
4347 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4348 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4349 gj = g + ((u0 + l0) * n1 + (u1 + l1)) * n2;
4350 for (l2 = 0; l2 <= o2; l2++)
4351 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4353 for (l1 = 0; l1 <= o1; l1++, psij1++)
4355 psij2 = psij_const2;
4356 gj = g + ((u0 + l0) * n1 + l1) * n2 + u2;
4357 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4358 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4359 gj = g + ((u0 + l0) * n1 + l1) * n2;
4360 for (l2 = 0; l2 <= o2; l2++)
4361 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4365 for (l0 = 0; l0 <= o0; l0++, psij0++)
4367 psij1 = psij_const1;
4368 for (l1 = 0; l1 < 2 * m + 1 - o1; l1++, psij1++)
4370 psij2 = psij_const2;
4371 gj = g + (l0 * n1 + (u1 + l1)) * n2 + u2;
4372 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4373 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4374 gj = g + (l0 * n1 + (u1 + l1)) * n2;
4375 for (l2 = 0; l2 <= o2; l2++)
4376 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4378 for (l1 = 0; l1 <= o1; l1++, psij1++)
4380 psij2 = psij_const2;
4381 gj = g + (l0 * n1 + l1) * n2 + u2;
4382 for (l2 = 0; l2 < 2 * m + 1 - o2; l2++)
4383 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4384 gj = g + (l0 * n1 + l1) * n2;
4385 for (l2 = 0; l2 <= o2; l2++)
4386 (*gj++) += (*psij0) * (*psij1) * (*psij2++) * (*fj);
4393 static void nfft_trafo_3d_B(
X(plan) *ths)
4395 const INT n0 = ths->n[0];
4396 const INT n1 = ths->n[1];
4397 const INT n2 = ths->n[2];
4398 const INT M = ths->M_total;
4399 const INT m = ths->m;
4401 const C* g = (C*) ths->g;
4405 if(ths->flags & PRE_FULL_PSI)
4407 const INT lprod = (2*m+2) * (2*m+2) * (2*m+2);
4409 #pragma omp parallel for default(shared) private(k)
4411 for (k = 0; k < M; k++)
4414 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4416 for (l = 0; l < lprod; l++)
4417 ths->f[j] += ths->psi[j*lprod+l] * g[ths->psi_index_g[j*lprod+l]];
4422 if(ths->flags & PRE_PSI)
4425 #pragma omp parallel for default(shared) private(k)
4427 for (k = 0; k < M; k++)
4429 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4430 nfft_trafo_3d_compute(ths->f+j, g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4435 if(ths->flags & PRE_FG_PSI)
4437 R fg_exp_l[3*(2*m+2)];
4439 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4440 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4441 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4444 #pragma omp parallel for default(shared) private(k)
4446 for (k = 0; k < M; k++)
4448 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4450 R psij_const[3*(2*m+2)];
4451 R fg_psij0 = ths->psi[2*j*3];
4452 R fg_psij1 = ths->psi[2*j*3+1];
4453 R fg_psij2 = K(1.0);
4455 psij_const[0] = fg_psij0;
4456 for(l=1; l<=2*m+1; l++)
4458 fg_psij2 *= fg_psij1;
4459 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4462 fg_psij0 = ths->psi[2*(j*3+1)];
4463 fg_psij1 = ths->psi[2*(j*3+1)+1];
4465 psij_const[2*m+2] = fg_psij0;
4466 for(l=1; l<=2*m+1; l++)
4468 fg_psij2 *= fg_psij1;
4469 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4472 fg_psij0 = ths->psi[2*(j*3+2)];
4473 fg_psij1 = ths->psi[2*(j*3+2)+1];
4475 psij_const[2*(2*m+2)] = fg_psij0;
4476 for(l=1; l<=2*m+1; l++)
4478 fg_psij2 *= fg_psij1;
4479 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
4482 nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4488 if(ths->flags & FG_PSI)
4490 R fg_exp_l[3*(2*m+2)];
4492 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4493 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4494 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4499 #pragma omp parallel for default(shared) private(k)
4501 for (k = 0; k < M; k++)
4503 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4505 R psij_const[3*(2*m+2)];
4506 R fg_psij0, fg_psij1, fg_psij2;
4508 uo(ths,j,&u,&o,(INT)0);
4509 fg_psij0 = (PHI(ths->n[0], ths->x[3*j] - ((R)u) / (R)(n0),0));
4510 fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[3*j]) - (R)(u)) / ths->b[0]);
4512 psij_const[0] = fg_psij0;
4513 for(l=1; l<=2*m+1; l++)
4515 fg_psij2 *= fg_psij1;
4516 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4519 uo(ths,j,&u,&o,(INT)1);
4520 fg_psij0 = (PHI(ths->n[1], ths->x[3*j+1] - ((R)u) / (R)(n1),1));
4521 fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[3*j+1]) - (R)(u)) / ths->b[1]);
4523 psij_const[2*m+2] = fg_psij0;
4524 for(l=1; l<=2*m+1; l++)
4526 fg_psij2 *= fg_psij1;
4527 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4530 uo(ths,j,&u,&o,(INT)2);
4531 fg_psij0 = (PHI(ths->n[2], ths->x[3*j+2] - ((R)u) / (R)(n2),2));
4532 fg_psij1 = EXP(K(2.0) * ((R)(n2) * (ths->x[3*j+2]) - (R)(u)) / ths->b[2]);
4534 psij_const[2*(2*m+2)] = fg_psij0;
4535 for(l=1; l<=2*m+1; l++)
4537 fg_psij2 *= fg_psij1;
4538 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
4541 nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4547 if(ths->flags & PRE_LIN_PSI)
4549 const INT K = ths->K, ip_s = K / (m + 2);
4554 #pragma omp parallel for default(shared) private(k)
4556 for (k = 0; k < M; k++)
4561 R psij_const[3*(2*m+2)];
4562 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4564 uo(ths,j,&u,&o,(INT)0);
4565 ip_y = FABS((R)(n0) * ths->x[3*j+0] - (R)(u)) * ((R)ip_s);
4566 ip_u = (INT)(LRINT(FLOOR(ip_y)));
4567 ip_w = ip_y - (R)(ip_u);
4568 for(l=0; l < 2*m+2; l++)
4569 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4570 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
4572 uo(ths,j,&u,&o,(INT)1);
4573 ip_y = FABS((R)(n1) * ths->x[3*j+1] - (R)(u)) * ((R)ip_s);
4574 ip_u = (INT)(LRINT(FLOOR(ip_y)));
4575 ip_w = ip_y - (R)(ip_u);
4576 for(l=0; l < 2*m+2; l++)
4577 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4578 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
4580 uo(ths,j,&u,&o,(INT)2);
4581 ip_y = FABS((R)(n2) * ths->x[3*j+2] - (R)(u)) * ((R)ip_s);
4582 ip_u = (INT)(LRINT(FLOOR(ip_y)));
4583 ip_w = ip_y - (R)(ip_u);
4584 for(l=0; l < 2*m+2; l++)
4585 psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
4586 ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
4588 nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4598 #pragma omp parallel for default(shared) private(k)
4600 for (k = 0; k < M; k++)
4602 R psij_const[3*(2*m+2)];
4604 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4606 uo(ths,j,&u,&o,(INT)0);
4607 for(l=0;l<=2*m+1;l++)
4608 psij_const[l]=(PHI(ths->n[0], ths->x[3*j] - ((R)((u+l))) / (R)(n0),0));
4610 uo(ths,j,&u,&o,(INT)1);
4611 for(l=0;l<=2*m+1;l++)
4612 psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[3*j+1] - ((R)((u+l))) / (R)(n1),1));
4614 uo(ths,j,&u,&o,(INT)2);
4615 for(l=0;l<=2*m+1;l++)
4616 psij_const[2*(2*m+2)+l]=(PHI(ths->n[2], ths->x[3*j+2] - ((R)((u+l))) / (R)(n2),2));
4618 nfft_trafo_3d_compute(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4623 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A \
4625 assert(ar_x[2*k] >= min_u_a || k == M-1); \
4627 assert(ar_x[2*k-2] < min_u_a); \
4630 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A
4634 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B \
4636 assert(ar_x[2*k] >= min_u_b || k == M-1); \
4638 assert(ar_x[2*k-2] < min_u_b); \
4641 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B
4644 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_PSI \
4645 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4646 ths->psi+j*3*(2*m+2), \
4647 ths->psi+(j*3+1)*(2*m+2), \
4648 ths->psi+(j*3+2)*(2*m+2), \
4649 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4650 n0, n1, n2, m, my_u0, my_o0);
4652 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_FG_PSI \
4655 R psij_const[3*(2*m+2)]; \
4656 R fg_psij0 = ths->psi[2*j*3]; \
4657 R fg_psij1 = ths->psi[2*j*3+1]; \
4658 R fg_psij2 = K(1.0); \
4660 psij_const[0] = fg_psij0; \
4661 for(l=1; l<=2*m+1; l++) \
4663 fg_psij2 *= fg_psij1; \
4664 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
4667 fg_psij0 = ths->psi[2*(j*3+1)]; \
4668 fg_psij1 = ths->psi[2*(j*3+1)+1]; \
4669 fg_psij2 = K(1.0); \
4670 psij_const[2*m+2] = fg_psij0; \
4671 for(l=1; l<=2*m+1; l++) \
4673 fg_psij2 *= fg_psij1; \
4674 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
4677 fg_psij0 = ths->psi[2*(j*3+2)]; \
4678 fg_psij1 = ths->psi[2*(j*3+2)+1]; \
4679 fg_psij2 = K(1.0); \
4680 psij_const[2*(2*m+2)] = fg_psij0; \
4681 for(l=1; l<=2*m+1; l++) \
4683 fg_psij2 *= fg_psij1; \
4684 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; \
4687 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4688 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4689 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4690 n0, n1, n2, m, my_u0, my_o0); \
4693 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_FG_PSI \
4696 R psij_const[3*(2*m+2)]; \
4697 R fg_psij0, fg_psij1, fg_psij2; \
4699 uo(ths,j,&u,&o,(INT)0); \
4700 fg_psij0 = (PHI(ths->n[0],ths->x[3*j]-((R)u)/n0,0)); \
4701 fg_psij1 = EXP(K(2.0)*(n0*(ths->x[3*j]) - u)/ths->b[0]); \
4702 fg_psij2 = K(1.0); \
4703 psij_const[0] = fg_psij0; \
4704 for(l=1; l<=2*m+1; l++) \
4706 fg_psij2 *= fg_psij1; \
4707 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l]; \
4710 uo(ths,j,&u,&o,(INT)1); \
4711 fg_psij0 = (PHI(ths->n[1],ths->x[3*j+1]-((R)u)/n1,1)); \
4712 fg_psij1 = EXP(K(2.0)*(n1*(ths->x[3*j+1]) - u)/ths->b[1]); \
4713 fg_psij2 = K(1.0); \
4714 psij_const[2*m+2] = fg_psij0; \
4715 for(l=1; l<=2*m+1; l++) \
4717 fg_psij2 *= fg_psij1; \
4718 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l]; \
4721 uo(ths,j,&u,&o,(INT)2); \
4722 fg_psij0 = (PHI(ths->n[2],ths->x[3*j+2]-((R)u)/n2,2)); \
4723 fg_psij1 = EXP(K(2.0)*(n2*(ths->x[3*j+2]) - u)/ths->b[2]); \
4724 fg_psij2 = K(1.0); \
4725 psij_const[2*(2*m+2)] = fg_psij0; \
4726 for(l=1; l<=2*m+1; l++) \
4728 fg_psij2 *= fg_psij1; \
4729 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l]; \
4732 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4733 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4734 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4735 n0, n1, n2, m, my_u0, my_o0); \
4738 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_PRE_LIN_PSI \
4741 R psij_const[3*(2*m+2)]; \
4745 uo(ths,j,&u,&o,(INT)0); \
4746 ip_y = FABS(n0*ths->x[3*j+0] - u)*((R)ip_s); \
4747 ip_u = LRINT(FLOOR(ip_y)); \
4749 for(l=0; l < 2*m+2; l++) \
4750 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
4751 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w); \
4753 uo(ths,j,&u,&o,(INT)1); \
4754 ip_y = FABS(n1*ths->x[3*j+1] - u)*((R)ip_s); \
4755 ip_u = LRINT(FLOOR(ip_y)); \
4757 for(l=0; l < 2*m+2; l++) \
4758 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
4759 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
4761 uo(ths,j,&u,&o,(INT)2); \
4762 ip_y = FABS(n2*ths->x[3*j+2] - u)*((R)ip_s); \
4763 ip_u = LRINT(FLOOR(ip_y)); \
4765 for(l=0; l < 2*m+2; l++) \
4766 psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) + \
4767 ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w); \
4769 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4770 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4771 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4772 n0, n1, n2, m, my_u0, my_o0); \
4775 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_NO_PSI \
4778 R psij_const[3*(2*m+2)]; \
4780 uo(ths,j,&u,&o,(INT)0); \
4781 for(l=0;l<=2*m+1;l++) \
4782 psij_const[l]=(PHI(ths->n[0],ths->x[3*j]-((R)((u+l)))/n0,0)); \
4784 uo(ths,j,&u,&o,(INT)1); \
4785 for(l=0;l<=2*m+1;l++) \
4786 psij_const[2*m+2+l]=(PHI(ths->n[1],ths->x[3*j+1]-((R)((u+l)))/n1,1)); \
4788 uo(ths,j,&u,&o,(INT)2); \
4789 for(l=0;l<=2*m+1;l++) \
4790 psij_const[2*(2*m+2)+l]=(PHI(ths->n[2],ths->x[3*j+2]-((R)((u+l)))/n2,2)); \
4792 nfft_adjoint_3d_compute_omp_blockwise(ths->f[j], g, \
4793 psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, \
4794 ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, \
4795 n0, n1, n2, m, my_u0, my_o0); \
4798 #define MACRO_adjoint_3d_B_OMP_BLOCKWISE(whichone) \
4800 if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT) \
4802 _Pragma("omp parallel private(k)") \
4804 INT my_u0, my_o0, min_u_a, max_u_a, min_u_b, max_u_b; \
4805 INT *ar_x = ths->index_x; \
4807 nfft_adjoint_B_omp_blockwise_init(&my_u0, &my_o0, &min_u_a, &max_u_a, \
4808 &min_u_b, &max_u_b, 3, ths->n, m); \
4810 if (min_u_a != -1) \
4812 k = index_x_binary_search(ar_x, M, min_u_a); \
4814 MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_A \
4818 INT u_prod = ar_x[2*k]; \
4819 INT j = ar_x[2*k+1]; \
4821 if (u_prod < min_u_a || u_prod > max_u_a) \
4824 MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
4830 if (min_u_b != -1) \
4832 INT k = index_x_binary_search(ar_x, M, min_u_b); \
4834 MACRO_adjoint_3d_B_OMP_BLOCKWISE_ASSERT_B \
4838 INT u_prod = ar_x[2*k]; \
4839 INT j = ar_x[2*k+1]; \
4841 if (u_prod < min_u_b || u_prod > max_u_b) \
4844 MACRO_adjoint_3d_B_OMP_BLOCKWISE_COMPUTE_ ##whichone \
4854 static void nfft_adjoint_3d_B(
X(plan) *ths)
4857 const INT n0 = ths->n[0];
4858 const INT n1 = ths->n[1];
4859 const INT n2 = ths->n[2];
4860 const INT M = ths->M_total;
4861 const INT m = ths->m;
4865 memset(g, 0, (
size_t)(ths->n_total) *
sizeof(C));
4867 if(ths->flags & PRE_FULL_PSI)
4869 nfft_adjoint_B_compute_full_psi(g, ths->psi_index_g, ths->psi, ths->f, M,
4870 (INT)3, ths->n, m, ths->flags, ths->index_x);
4874 if(ths->flags & PRE_PSI)
4877 MACRO_adjoint_3d_B_OMP_BLOCKWISE(PRE_PSI)
4881 #pragma omp parallel for default(shared) private(k)
4883 for (k = 0; k < M; k++)
4885 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4887 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4889 nfft_adjoint_3d_compute_serial(ths->f+j, g, ths->psi+j*3*(2*m+2), ths->psi+(j*3+1)*(2*m+2), ths->psi+(j*3+2)*(2*m+2), ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4895 if(ths->flags & PRE_FG_PSI)
4897 R fg_exp_l[3*(2*m+2)];
4899 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4900 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4901 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4904 MACRO_adjoint_3d_B_OMP_BLOCKWISE(PRE_FG_PSI)
4908 #pragma omp parallel for default(shared) private(k)
4910 for (k = 0; k < M; k++)
4912 R psij_const[3*(2*m+2)];
4913 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4915 R fg_psij0 = ths->psi[2*j*3];
4916 R fg_psij1 = ths->psi[2*j*3+1];
4917 R fg_psij2 = K(1.0);
4919 psij_const[0] = fg_psij0;
4920 for(l=1; l<=2*m+1; l++)
4922 fg_psij2 *= fg_psij1;
4923 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4926 fg_psij0 = ths->psi[2*(j*3+1)];
4927 fg_psij1 = ths->psi[2*(j*3+1)+1];
4929 psij_const[2*m+2] = fg_psij0;
4930 for(l=1; l<=2*m+1; l++)
4932 fg_psij2 *= fg_psij1;
4933 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
4936 fg_psij0 = ths->psi[2*(j*3+2)];
4937 fg_psij1 = ths->psi[2*(j*3+2)+1];
4939 psij_const[2*(2*m+2)] = fg_psij0;
4940 for(l=1; l<=2*m+1; l++)
4942 fg_psij2 *= fg_psij1;
4943 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
4947 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4949 nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
4956 if(ths->flags & FG_PSI)
4958 R fg_exp_l[3*(2*m+2)];
4960 nfft_3d_init_fg_exp_l(fg_exp_l, m, ths->b[0]);
4961 nfft_3d_init_fg_exp_l(fg_exp_l+2*m+2, m, ths->b[1]);
4962 nfft_3d_init_fg_exp_l(fg_exp_l+2*(2*m+2), m, ths->b[2]);
4967 MACRO_adjoint_3d_B_OMP_BLOCKWISE(FG_PSI)
4971 #pragma omp parallel for default(shared) private(k)
4973 for (k = 0; k < M; k++)
4976 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
4977 R psij_const[3*(2*m+2)];
4978 R fg_psij0, fg_psij1, fg_psij2;
4980 uo(ths,j,&u,&o,(INT)0);
4981 fg_psij0 = (PHI(ths->n[0], ths->x[3*j] - ((R)u) / (R)(n0),0));
4982 fg_psij1 = EXP(K(2.0) * ((R)(n0) * (ths->x[3*j]) - (R)(u))/ths->b[0]);
4984 psij_const[0] = fg_psij0;
4985 for(l=1; l<=2*m+1; l++)
4987 fg_psij2 *= fg_psij1;
4988 psij_const[l] = fg_psij0*fg_psij2*fg_exp_l[l];
4991 uo(ths,j,&u,&o,(INT)1);
4992 fg_psij0 = (PHI(ths->n[1], ths->x[3*j+1] - ((R)u) / (R)(n1),1));
4993 fg_psij1 = EXP(K(2.0) * ((R)(n1) * (ths->x[3*j+1]) - (R)(u))/ths->b[1]);
4995 psij_const[2*m+2] = fg_psij0;
4996 for(l=1; l<=2*m+1; l++)
4998 fg_psij2 *= fg_psij1;
4999 psij_const[2*m+2+l] = fg_psij0*fg_psij2*fg_exp_l[2*m+2+l];
5002 uo(ths,j,&u,&o,(INT)2);
5003 fg_psij0 = (PHI(ths->n[2], ths->x[3*j+2] - ((R)u) / (R)(n2),2));
5004 fg_psij1 = EXP(K(2.0) * ((R)(n2) * (ths->x[3*j+2]) - (R)(u))/ths->b[2]);
5006 psij_const[2*(2*m+2)] = fg_psij0;
5007 for(l=1; l<=2*m+1; l++)
5009 fg_psij2 *= fg_psij1;
5010 psij_const[2*(2*m+2)+l] = fg_psij0*fg_psij2*fg_exp_l[2*(2*m+2)+l];
5014 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5016 nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5023 if(ths->flags & PRE_LIN_PSI)
5025 const INT K = ths->K;
5026 const INT ip_s = K / (m + 2);
5031 MACRO_adjoint_3d_B_OMP_BLOCKWISE(PRE_LIN_PSI)
5035 #pragma omp parallel for default(shared) private(k)
5037 for (k = 0; k < M; k++)
5042 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
5043 R psij_const[3*(2*m+2)];
5045 uo(ths,j,&u,&o,(INT)0);
5046 ip_y = FABS((R)(n0) * ths->x[3*j+0] - (R)(u)) * ((R)ip_s);
5047 ip_u = (INT)(LRINT(FLOOR(ip_y)));
5048 ip_w = ip_y - (R)(ip_u);
5049 for(l=0; l < 2*m+2; l++)
5050 psij_const[l] = ths->psi[ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5051 ths->psi[ABS(ip_u-l*ip_s+1)]*(ip_w);
5053 uo(ths,j,&u,&o,(INT)1);
5054 ip_y = FABS((R)(n1) * ths->x[3*j+1] - (R)(u)) * ((R)ip_s);
5055 ip_u = (INT)(LRINT(FLOOR(ip_y)));
5056 ip_w = ip_y - (R)(ip_u);
5057 for(l=0; l < 2*m+2; l++)
5058 psij_const[2*m+2+l] = ths->psi[(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5059 ths->psi[(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
5061 uo(ths,j,&u,&o,(INT)2);
5062 ip_y = FABS((R)(n2) * ths->x[3*j+2] - (R)(u))*((R)ip_s);
5063 ip_u = (INT)(LRINT(FLOOR(ip_y)));
5064 ip_w = ip_y - (R)(ip_u);
5065 for(l=0; l < 2*m+2; l++)
5066 psij_const[2*(2*m+2)+l] = ths->psi[2*(K+1)+ABS(ip_u-l*ip_s)]*(K(1.0)-ip_w) +
5067 ths->psi[2*(K+1)+ABS(ip_u-l*ip_s+1)]*(ip_w);
5070 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5072 nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5082 MACRO_adjoint_3d_B_OMP_BLOCKWISE(NO_PSI)
5086 #pragma omp parallel for default(shared) private(k)
5088 for (k = 0; k < M; k++)
5091 R psij_const[3*(2*m+2)];
5092 INT j = (ths->flags & NFFT_SORT_NODES) ? ths->index_x[2*k+1] : k;
5094 uo(ths,j,&u,&o,(INT)0);
5095 for(l=0;l<=2*m+1;l++)
5096 psij_const[l]=(PHI(ths->n[0], ths->x[3*j] - ((R)((u+l))) / (R)(n0),0));
5098 uo(ths,j,&u,&o,(INT)1);
5099 for(l=0;l<=2*m+1;l++)
5100 psij_const[2*m+2+l]=(PHI(ths->n[1], ths->x[3*j+1] - ((R)((u+l))) / (R)(n1),1));
5102 uo(ths,j,&u,&o,(INT)2);
5103 for(l=0;l<=2*m+1;l++)
5104 psij_const[2*(2*m+2)+l]=(PHI(ths->n[2], ths->x[3*j+2] - ((R)((u+l))) / (R)(n2),2));
5107 nfft_adjoint_3d_compute_omp_atomic(ths->f[j], g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5109 nfft_adjoint_3d_compute_serial(ths->f+j, g, psij_const, psij_const+2*m+2, psij_const+(2*m+2)*2, ths->x+3*j, ths->x+3*j+1, ths->x+3*j+2, n0, n1, n2, m);
5115 void X(trafo_3d)(
X(plan) *ths)
5117 INT k0,k1,k2,n0,n1,n2,N0,N1,N2;
5119 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12, *c_phi_inv21, *c_phi_inv22;
5120 R ck01, ck02, ck11, ck12, ck21, ck22;
5121 C *g_hat111,*f_hat111,*g_hat211,*f_hat211,*g_hat121,*f_hat121,*g_hat221,*f_hat221;
5122 C *g_hat112,*f_hat112,*g_hat212,*f_hat212,*g_hat122,*f_hat122,*g_hat222,*f_hat222;
5134 f_hat=(C*)ths->f_hat;
5135 g_hat=(C*)ths->g_hat;
5139 #pragma omp parallel for default(shared) private(k0)
5140 for (k0 = 0; k0 < ths->n_total; k0++)
5141 ths->g_hat[k0] = 0.0;
5143 memset(ths->g_hat, 0, (
size_t)(ths->n_total) *
sizeof(C));
5146 if(ths->flags & PRE_PHI_HUT)
5148 c_phi_inv01=ths->c_phi_inv[0];
5149 c_phi_inv02=&ths->c_phi_inv[0][N0/2];
5152 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,c_phi_inv11,c_phi_inv12,ck11,ck12,c_phi_inv21,c_phi_inv22,g_hat111,f_hat111,g_hat211,f_hat211,g_hat121,f_hat121,g_hat221,f_hat221,g_hat112,f_hat112,g_hat212,f_hat212,g_hat122,f_hat122,g_hat222,f_hat222,ck21,ck22)
5154 for(k0=0;k0<N0/2;k0++)
5156 ck01=c_phi_inv01[k0];
5157 ck02=c_phi_inv02[k0];
5158 c_phi_inv11=ths->c_phi_inv[1];
5159 c_phi_inv12=&ths->c_phi_inv[1][N1/2];
5161 for(k1=0;k1<N1/2;k1++)
5163 ck11=c_phi_inv11[k1];
5164 ck12=c_phi_inv12[k1];
5165 c_phi_inv21=ths->c_phi_inv[2];
5166 c_phi_inv22=&ths->c_phi_inv[2][N2/2];
5168 g_hat111=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5169 f_hat111=f_hat + (k0*N1+k1)*N2;
5170 g_hat211=g_hat + (k0*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5171 f_hat211=f_hat + (((N0/2)+k0)*N1+k1)*N2;
5172 g_hat121=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2+n2-(N2/2);
5173 f_hat121=f_hat + (k0*N1+(N1/2)+k1)*N2;
5174 g_hat221=g_hat + (k0*n1+k1)*n2+n2-(N2/2);
5175 f_hat221=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2;
5177 g_hat112=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2;
5178 f_hat112=f_hat + (k0*N1+k1)*N2+(N2/2);
5179 g_hat212=g_hat + (k0*n1+n1-(N1/2)+k1)*n2;
5180 f_hat212=f_hat + (((N0/2)+k0)*N1+k1)*N2+(N2/2);
5181 g_hat122=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2;
5182 f_hat122=f_hat + (k0*N1+N1/2+k1)*N2+(N2/2);
5183 g_hat222=g_hat + (k0*n1+k1)*n2;
5184 f_hat222=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2+(N2/2);
5186 for(k2=0;k2<N2/2;k2++)
5188 ck21=c_phi_inv21[k2];
5189 ck22=c_phi_inv22[k2];
5191 g_hat111[k2] = f_hat111[k2] * ck01 * ck11 * ck21;
5192 g_hat211[k2] = f_hat211[k2] * ck02 * ck11 * ck21;
5193 g_hat121[k2] = f_hat121[k2] * ck01 * ck12 * ck21;
5194 g_hat221[k2] = f_hat221[k2] * ck02 * ck12 * ck21;
5196 g_hat112[k2] = f_hat112[k2] * ck01 * ck11 * ck22;
5197 g_hat212[k2] = f_hat212[k2] * ck02 * ck11 * ck22;
5198 g_hat122[k2] = f_hat122[k2] * ck01 * ck12 * ck22;
5199 g_hat222[k2] = f_hat222[k2] * ck02 * ck12 * ck22;
5206 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,ck11,ck12,ck21,ck22)
5208 for(k0=0;k0<N0/2;k0++)
5210 ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
5211 ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
5212 for(k1=0;k1<N1/2;k1++)
5214 ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
5215 ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
5217 for(k2=0;k2<N2/2;k2++)
5219 ck21=K(1.0)/(PHI_HUT(ths->n[2],k2-N2/2,2));
5220 ck22=K(1.0)/(PHI_HUT(ths->n[2],k2,2));
5222 g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] = f_hat[(k0*N1+k1)*N2+k2] * ck01 * ck11 * ck21;
5223 g_hat[(k0*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] = f_hat[((N0/2+k0)*N1+k1)*N2+k2] * ck02 * ck11 * ck21;
5224 g_hat[((n0-N0/2+k0)*n1+k1)*n2+n2-N2/2+k2] = f_hat[(k0*N1+N1/2+k1)*N2+k2] * ck01 * ck12 * ck21;
5225 g_hat[(k0*n1+k1)*n2+n2-N2/2+k2] = f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+k2] * ck02 * ck12 * ck21;
5227 g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+k2] = f_hat[(k0*N1+k1)*N2+N2/2+k2] * ck01 * ck11 * ck22;
5228 g_hat[(k0*n1+n1-N1/2+k1)*n2+k2] = f_hat[((N0/2+k0)*N1+k1)*N2+N2/2+k2] * ck02 * ck11 * ck22;
5229 g_hat[((n0-N0/2+k0)*n1+k1)*n2+k2] = f_hat[(k0*N1+N1/2+k1)*N2+N2/2+k2] * ck01 * ck12 * ck22;
5230 g_hat[(k0*n1+k1)*n2+k2] = f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+N2/2+k2] * ck02 * ck12 * ck22;
5238 FFTW(execute)(ths->my_fftw_plan1);
5242 nfft_trafo_3d_B(ths);
5246 void X(adjoint_3d)(
X(plan) *ths)
5248 INT k0,k1,k2,n0,n1,n2,N0,N1,N2;
5250 R *c_phi_inv01, *c_phi_inv02, *c_phi_inv11, *c_phi_inv12, *c_phi_inv21, *c_phi_inv22;
5251 R ck01, ck02, ck11, ck12, ck21, ck22;
5252 C *g_hat111,*f_hat111,*g_hat211,*f_hat211,*g_hat121,*f_hat121,*g_hat221,*f_hat221;
5253 C *g_hat112,*f_hat112,*g_hat212,*f_hat212,*g_hat122,*f_hat122,*g_hat222,*f_hat222;
5265 f_hat=(C*)ths->f_hat;
5266 g_hat=(C*)ths->g_hat;
5269 nfft_adjoint_3d_B(ths);
5273 FFTW(execute)(ths->my_fftw_plan2);
5277 if(ths->flags & PRE_PHI_HUT)
5279 c_phi_inv01=ths->c_phi_inv[0];
5280 c_phi_inv02=&ths->c_phi_inv[0][N0/2];
5283 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,c_phi_inv11,c_phi_inv12,ck11,ck12,c_phi_inv21,c_phi_inv22,g_hat111,f_hat111,g_hat211,f_hat211,g_hat121,f_hat121,g_hat221,f_hat221,g_hat112,f_hat112,g_hat212,f_hat212,g_hat122,f_hat122,g_hat222,f_hat222,ck21,ck22)
5285 for(k0=0;k0<N0/2;k0++)
5287 ck01=c_phi_inv01[k0];
5288 ck02=c_phi_inv02[k0];
5289 c_phi_inv11=ths->c_phi_inv[1];
5290 c_phi_inv12=&ths->c_phi_inv[1][N1/2];
5292 for(k1=0;k1<N1/2;k1++)
5294 ck11=c_phi_inv11[k1];
5295 ck12=c_phi_inv12[k1];
5296 c_phi_inv21=ths->c_phi_inv[2];
5297 c_phi_inv22=&ths->c_phi_inv[2][N2/2];
5299 g_hat111=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5300 f_hat111=f_hat + (k0*N1+k1)*N2;
5301 g_hat211=g_hat + (k0*n1+n1-(N1/2)+k1)*n2+n2-(N2/2);
5302 f_hat211=f_hat + (((N0/2)+k0)*N1+k1)*N2;
5303 g_hat121=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2+n2-(N2/2);
5304 f_hat121=f_hat + (k0*N1+(N1/2)+k1)*N2;
5305 g_hat221=g_hat + (k0*n1+k1)*n2+n2-(N2/2);
5306 f_hat221=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2;
5308 g_hat112=g_hat + ((n0-(N0/2)+k0)*n1+n1-(N1/2)+k1)*n2;
5309 f_hat112=f_hat + (k0*N1+k1)*N2+(N2/2);
5310 g_hat212=g_hat + (k0*n1+n1-(N1/2)+k1)*n2;
5311 f_hat212=f_hat + (((N0/2)+k0)*N1+k1)*N2+(N2/2);
5312 g_hat122=g_hat + ((n0-(N0/2)+k0)*n1+k1)*n2;
5313 f_hat122=f_hat + (k0*N1+(N1/2)+k1)*N2+(N2/2);
5314 g_hat222=g_hat + (k0*n1+k1)*n2;
5315 f_hat222=f_hat + (((N0/2)+k0)*N1+(N1/2)+k1)*N2+(N2/2);
5317 for(k2=0;k2<N2/2;k2++)
5319 ck21=c_phi_inv21[k2];
5320 ck22=c_phi_inv22[k2];
5322 f_hat111[k2] = g_hat111[k2] * ck01 * ck11 * ck21;
5323 f_hat211[k2] = g_hat211[k2] * ck02 * ck11 * ck21;
5324 f_hat121[k2] = g_hat121[k2] * ck01 * ck12 * ck21;
5325 f_hat221[k2] = g_hat221[k2] * ck02 * ck12 * ck21;
5327 f_hat112[k2] = g_hat112[k2] * ck01 * ck11 * ck22;
5328 f_hat212[k2] = g_hat212[k2] * ck02 * ck11 * ck22;
5329 f_hat122[k2] = g_hat122[k2] * ck01 * ck12 * ck22;
5330 f_hat222[k2] = g_hat222[k2] * ck02 * ck12 * ck22;
5337 #pragma omp parallel for default(shared) private(k0,k1,k2,ck01,ck02,ck11,ck12,ck21,ck22)
5339 for(k0=0;k0<N0/2;k0++)
5341 ck01=K(1.0)/(PHI_HUT(ths->n[0],k0-N0/2,0));
5342 ck02=K(1.0)/(PHI_HUT(ths->n[0],k0,0));
5343 for(k1=0;k1<N1/2;k1++)
5345 ck11=K(1.0)/(PHI_HUT(ths->n[1],k1-N1/2,1));
5346 ck12=K(1.0)/(PHI_HUT(ths->n[1],k1,1));
5348 for(k2=0;k2<N2/2;k2++)
5350 ck21=K(1.0)/(PHI_HUT(ths->n[2],k2-N2/2,2));
5351 ck22=K(1.0)/(PHI_HUT(ths->n[2],k2,2));
5353 f_hat[(k0*N1+k1)*N2+k2] = g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] * ck01 * ck11 * ck21;
5354 f_hat[((N0/2+k0)*N1+k1)*N2+k2] = g_hat[(k0*n1+n1-N1/2+k1)*n2+n2-N2/2+k2] * ck02 * ck11 * ck21;
5355 f_hat[(k0*N1+N1/2+k1)*N2+k2] = g_hat[((n0-N0/2+k0)*n1+k1)*n2+n2-N2/2+k2] * ck01 * ck12 * ck21;
5356 f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+k2] = g_hat[(k0*n1+k1)*n2+n2-N2/2+k2] * ck02 * ck12 * ck21;
5358 f_hat[(k0*N1+k1)*N2+N2/2+k2] = g_hat[((n0-N0/2+k0)*n1+n1-N1/2+k1)*n2+k2] * ck01 * ck11 * ck22;
5359 f_hat[((N0/2+k0)*N1+k1)*N2+N2/2+k2] = g_hat[(k0*n1+n1-N1/2+k1)*n2+k2] * ck02 * ck11 * ck22;
5360 f_hat[(k0*N1+N1/2+k1)*N2+N2/2+k2] = g_hat[((n0-N0/2+k0)*n1+k1)*n2+k2] * ck01 * ck12 * ck22;
5361 f_hat[((N0/2+k0)*N1+N1/2+k1)*N2+N2/2+k2] = g_hat[(k0*n1+k1)*n2+k2] * ck02 * ck12 * ck22;
5371 void X(trafo)(
X(plan) *ths)
5375 case 1:
X(trafo_1d)(ths);
break;
5376 case 2:
X(trafo_2d)(ths);
break;
5377 case 3:
X(trafo_3d)(ths);
break;
5381 ths->g_hat = ths->g1;
5396 FFTW(execute)(ths->my_fftw_plan1);
5409 void X(adjoint)(
X(plan) *ths)
5413 case 1:
X(adjoint_1d)(ths);
break;
5414 case 2:
X(adjoint_2d)(ths);
break;
5415 case 3:
X(adjoint_3d)(ths);
break;
5434 FFTW(execute)(ths->my_fftw_plan2);
5450 static
void precompute_phi_hut(
X(plan) *ths)
5455 ths->c_phi_inv = (R**) Y(malloc)((size_t)(ths->d) *
sizeof(R*));
5457 for (t = 0; t < ths->d; t++)
5459 ths->c_phi_inv[t] = (R*)Y(malloc)((size_t)(ths->N[t]) *
sizeof(R));
5461 for (ks[t] = 0; ks[t] < ths->N[t]; ks[t]++)
5463 ths->c_phi_inv[t][ks[t]]= K(1.0) / (PHI_HUT(ths->n[t], ks[t] - ths->N[t] / 2,t));
5472 void X(precompute_lin_psi)(
X(plan) *ths)
5478 for (t=0; t<ths->d; t++)
5480 step = ((R)(ths->m+2)) / ((R)(ths->K * ths->n[t]));
5481 for(j = 0;j <= ths->K; j++)
5483 ths->psi[(ths->K+1)*t + j] = PHI(ths->n[t], (R)(j) * step,t);
5488 void X(precompute_fg_psi)(
X(plan) *ths)
5495 for (t=0; t<ths->d; t++)
5499 #pragma omp parallel for default(shared) private(j,u,o)
5501 for (j = 0; j < ths->M_total; j++)
5505 ths->psi[2*(j*ths->d+t)]=
5506 (PHI(ths->n[t] ,(ths->x[j*ths->d+t] - ((R)u) / (R)(ths->n[t])),t));
5508 ths->psi[2*(j*ths->d+t)+1]=
5509 EXP(K(2.0) * ((R)(ths->n[t]) * ths->x[j*ths->d+t] - (R)(u)) / ths->b[t]);
5515 void X(precompute_psi)(
X(plan) *ths)
5524 for (t=0; t<ths->d; t++)
5528 #pragma omp parallel for default(shared) private(j,l,lj,u,o)
5530 for (j = 0; j < ths->M_total; j++)
5534 for(l = u, lj = 0; l <= o; l++, lj++)
5535 ths->psi[(j * ths->d + t) * (2 * ths->m + 2) + lj] =
5536 (PHI(ths->n[t], (ths->x[j*ths->d+t] - ((R)l) / (R)(ths->n[t])), t));
5543 static void nfft_precompute_full_psi_omp(
X(plan) *ths)
5550 for(t=0,lprod = 1; t<ths->d; t++)
5551 lprod *= 2*ths->m+2;
5554 #pragma omp parallel for default(shared) private(j)
5555 for(j=0; j<ths->M_total; j++)
5561 INT ll_plain[ths->d+1];
5563 INT u[ths->d], o[ths->d];
5565 R phi_prod[ths->d+1];
5571 MACRO_init_uo_l_lj_t;
5573 for(l_L=0; l_L<lprod; l_L++, ix++)
5575 MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
5577 ths->psi_index_g[ix]=ll_plain[ths->d];
5578 ths->psi[ix]=phi_prod[ths->d];
5580 MACRO_count_uo_l_lj_t;
5583 ths->psi_index_f[j]=lprod;
5588 void X(precompute_full_psi)(
X(plan) *ths)
5593 nfft_precompute_full_psi_omp(ths);
5600 INT ll_plain[ths->d+1];
5602 INT u[ths->d], o[ths->d];
5604 R phi_prod[ths->d+1];
5610 phi_prod[0] = K(1.0);
5613 for (t = 0, lprod = 1; t < ths->d; t++)
5614 lprod *= 2 * ths->m + 2;
5616 for (j = 0, ix = 0, ix_old = 0; j < ths->M_total; j++)
5618 MACRO_init_uo_l_lj_t;
5620 for (l_L = 0; l_L < lprod; l_L++, ix++)
5622 MACRO_update_phi_prod_ll_plain(without_PRE_PSI);
5624 ths->psi_index_g[ix] = ll_plain[ths->d];
5625 ths->psi[ix] = phi_prod[ths->d];
5627 MACRO_count_uo_l_lj_t;
5630 ths->psi_index_f[j] = ix - ix_old;
5636 void X(precompute_one_psi)(
X(plan) *ths)
5638 if(ths->flags & PRE_LIN_PSI)
5639 X(precompute_lin_psi)(ths);
5640 if(ths->flags & PRE_FG_PSI)
5641 X(precompute_fg_psi)(ths);
5642 if(ths->flags & PRE_PSI)
5643 X(precompute_psi)(ths);
5644 if(ths->flags & PRE_FULL_PSI)
5645 X(precompute_full_psi)(ths);
5648 static void init_help(
X(plan) *ths)
5653 if (ths->flags & NFFT_OMP_BLOCKWISE_ADJOINT)
5654 ths->flags |= NFFT_SORT_NODES;
5656 ths->N_total = intprod(ths->N, 0, ths->d);
5657 ths->n_total = intprod(ths->n, 0, ths->d);
5659 ths->sigma = (R*) Y(malloc)((size_t)(ths->d) *
sizeof(R));
5661 for(t = 0;t < ths->d; t++)
5662 ths->sigma[t] = ((R)ths->n[t]) / (R)(ths->N[t]);
5666 if(ths->flags & MALLOC_X)
5667 ths->x = (R*)Y(malloc)((size_t)(ths->d * ths->M_total) *
sizeof(R));
5669 if(ths->flags & MALLOC_F_HAT)
5670 ths->f_hat = (C*)Y(malloc)((size_t)(ths->N_total) *
sizeof(C));
5672 if(ths->flags & MALLOC_F)
5673 ths->f = (C*)Y(malloc)((size_t)(ths->M_total) *
sizeof(C));
5675 if(ths->flags & PRE_PHI_HUT)
5676 precompute_phi_hut(ths);
5678 if (ths->flags & PRE_LIN_PSI)
5682 ths->K = Y(m2K)(ths->m);
5684 ths->psi = (R*) Y(malloc)((size_t)((ths->K+1) * ths->d) *
sizeof(R));
5687 if(ths->flags & PRE_FG_PSI)
5688 ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * ths->d * 2) *
sizeof(R));
5690 if(ths->flags & PRE_PSI)
5691 ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * ths->d * (2 * ths->m + 2)) *
sizeof(R));
5693 if(ths->flags & PRE_FULL_PSI)
5695 for (t = 0, lprod = 1; t < ths->d; t++)
5696 lprod *= 2 * ths->m + 2;
5698 ths->psi = (R*) Y(malloc)((size_t)(ths->M_total * lprod) *
sizeof(R));
5700 ths->psi_index_f = (INT*) Y(malloc)((size_t)(ths->M_total) *
sizeof(INT));
5701 ths->psi_index_g = (INT*) Y(malloc)((size_t)(ths->M_total * lprod) *
sizeof(INT));
5704 if(ths->flags & FFTW_INIT)
5707 INT nthreads = Y(get_num_threads)();
5710 ths->g1 = (C*)Y(malloc)((size_t)(ths->n_total) *
sizeof(C));
5712 if(ths->flags & FFT_OUT_OF_PLACE)
5713 ths->g2 = (C*) Y(malloc)((size_t)(ths->n_total) *
sizeof(C));
5718 #pragma omp critical (nfft_omp_critical_fftw_plan)
5720 FFTW(plan_with_nthreads)(nthreads);
5723 int *_n = Y(malloc)((size_t)(ths->d) *
sizeof(int));
5725 for (t = 0; t < ths->d; t++)
5726 _n[t] = (
int)(ths->n[t]);
5728 ths->my_fftw_plan1 = FFTW(plan_dft)((int)ths->d, _n, ths->g1, ths->g2, FFTW_FORWARD, ths->fftw_flags);
5729 ths->my_fftw_plan2 = FFTW(plan_dft)((int)ths->d, _n, ths->g2, ths->g1, FFTW_BACKWARD, ths->fftw_flags);
5737 if(ths->flags & NFFT_SORT_NODES)
5738 ths->index_x = (INT*) Y(malloc)(
sizeof(INT) * 2U * (
size_t)(ths->M_total));
5740 ths->index_x = NULL;
5742 ths->mv_trafo = (void (*) (
void* ))
X(trafo);
5743 ths->mv_adjoint = (void (*) (
void* ))
X(adjoint);
5746 void X(init)(
X(plan) *ths,
int d,
int *N,
int M_total)
5752 ths->N = (INT*) Y(malloc)((size_t)(d) *
sizeof(INT));
5754 for (t = 0; t < d; t++)
5755 ths->N[t] = (INT)N[t];
5757 ths->M_total = (INT)M_total;
5759 ths->n = (INT*) Y(malloc)((size_t)(d) *
sizeof(INT));
5761 for (t = 0; t < d; t++)
5762 ths->n[t] = 2 * (Y(next_power_of_2)(ths->N[t]));
5764 ths->m = WINDOW_HELP_ESTIMATE_m;
5769 ths->flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
5770 FFTW_INIT | FFT_OUT_OF_PLACE | NFFT_SORT_NODES |
5771 NFFT_OMP_BLOCKWISE_ADJOINT;
5773 ths->flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
5774 FFTW_INIT | FFT_OUT_OF_PLACE | NFFT_SORT_NODES;
5778 ths->flags = PRE_PHI_HUT | PRE_PSI | MALLOC_X| MALLOC_F_HAT | MALLOC_F |
5779 FFTW_INIT | FFT_OUT_OF_PLACE;
5781 ths->fftw_flags= FFTW_ESTIMATE| FFTW_DESTROY_INPUT;
5787 void X(init_guru)(
X(plan) *ths,
int d,
int *N,
int M_total,
int *n,
int m,
5788 unsigned flags,
unsigned fftw_flags)
5793 ths->M_total = (INT)M_total;
5794 ths->N = (INT*)Y(malloc)((size_t)(ths->d) *
sizeof(INT));
5796 for (t = 0; t < d; t++)
5797 ths->N[t] = (INT)N[t];
5799 ths->n = (INT*)Y(malloc)((size_t)(ths->d) *
sizeof(INT));
5801 for (t = 0; t < d; t++)
5802 ths->n[t] = (INT)n[t];
5807 ths->fftw_flags = fftw_flags;
5813 void X(init_lin)(
X(plan) *ths,
int d,
int *N,
int M_total,
int *n,
int m,
int K,
5814 unsigned flags,
unsigned fftw_flags)
5819 ths->M_total = (INT)M_total;
5820 ths->N = (INT*)Y(malloc)((size_t)(ths->d) *
sizeof(INT));
5822 for (t = 0; t < d; t++)
5823 ths->N[t] = (INT)N[t];
5825 ths->n = (INT*)Y(malloc)((size_t)(ths->d) *
sizeof(INT));
5827 for (t = 0; t < d; t++)
5828 ths->n[t] = (INT)n[t];
5833 ths->fftw_flags = fftw_flags;
5839 void X(init_1d)(
X(plan) *ths,
int N1,
int M_total)
5845 X(init)(ths, 1, N, M_total);
5848 void X(init_2d)(
X(plan) *ths,
int N1,
int N2,
int M_total)
5854 X(init)(ths, 2, N, M_total);
5857 void X(init_3d)(
X(plan) *ths,
int N1,
int N2,
int N3,
int M_total)
5864 X(init)(ths, 3, N, M_total);
5867 const char*
X(check)(
X(plan) *ths)
5872 return "Member f not initialized.";
5875 return "Member x not initialized.";
5878 return "Member f_hat not initialized.";
5880 if ((ths->flags & PRE_LIN_PSI) && ths->K < ths->M_total)
5881 return "Number of nodes too small to use PRE_LIN_PSI.";
5883 for (j = 0; j < ths->M_total * ths->d; j++)
5885 if ((ths->x[j]<-K(0.5)) || (ths->x[j]>= K(0.5)))
5887 return "ths->x out of range [-0.5,0.5)";
5891 for (j = 0; j < ths->d; j++)
5893 if (ths->sigma[j] <= 1)
5894 return "Oversampling factor too small";
5896 if(ths->N[j] <= ths->m)
5897 return "Polynomial degree N is smaller than cut-off m";
5899 if(ths->N[j]%2 == 1)
5900 return "polynomial degree N has to be even";
5905 void X(finalize)(
X(plan) *ths)
5909 if(ths->flags & NFFT_SORT_NODES)
5910 Y(free)(ths->index_x);
5912 if(ths->flags & FFTW_INIT)
5915 #pragma omp critical (nfft_omp_critical_fftw_plan)
5917 FFTW(destroy_plan)(ths->my_fftw_plan2);
5919 #pragma omp critical (nfft_omp_critical_fftw_plan)
5921 FFTW(destroy_plan)(ths->my_fftw_plan1);
5923 if(ths->flags & FFT_OUT_OF_PLACE)
5929 if(ths->flags & PRE_FULL_PSI)
5931 Y(free)(ths->psi_index_g);
5932 Y(free)(ths->psi_index_f);
5936 if(ths->flags & PRE_PSI)
5939 if(ths->flags & PRE_FG_PSI)
5942 if(ths->flags & PRE_LIN_PSI)
5945 if(ths->flags & PRE_PHI_HUT)
5947 for (t = 0; t < ths->d; t++)
5948 Y(free)(ths->c_phi_inv[t]);
5949 Y(free)(ths->c_phi_inv);
5952 if(ths->flags & MALLOC_F)
5955 if(ths->flags & MALLOC_F_HAT)
5956 Y(free)(ths->f_hat);
5958 if(ths->flags & MALLOC_X)
5961 WINDOW_HELP_FINALIZE;
5963 Y(free)(ths->sigma);
#define TIC(a)
Timing, method works since the inaccurate timer is updated mostly in the measured function...
#define X(name)
Include header for C99 complex datatype.
#define UNUSED(x)
Dummy use of unused parameters to silence compiler warnings.