1 //          Copyright Jernej Krempuš 2012
2 //          Copyright Guillaume Piolat 2016-2023
3 // Distributed under the Boost Software License, Version 1.0.
4 //    (See accompanying file LICENSE_1_0.txt or copy at
5 //          http://www.boost.org/LICENSE_1_0.txt)
6 module dplug.fft.sse_double;
7 
8 import inteli.emmintrin;
9 import dplug.fft.fft_impl;
10 
// SIMD primitives for double precision, built on the intrinsics imported
// from inteli.emmintrin. One vector holds two doubles. Presumably consumed
// as a compile-time policy type by dplug.fft.fft_impl -- confirm there.
struct Vector
{
nothrow:
@nogc:
    alias vec = double2;
    alias T = double;

    enum vec_size = 2;
    enum log2_bitreverse_chunk_size = 2;

    // Returns a vector whose two lanes both hold `a`.
    static vec scalar_to_vector(T a)
    {
        return _mm_set1_pd(a);
    }

    // Writes (a0[0], a1[0]) to r0 and (a0[1], a1[1]) to r1.
    // The inputs are taken by value, so r0/r1 may refer to the same
    // variables that were passed as a0/a1.
    static void interleave(vec a0,  vec a1, ref vec r0, ref vec r1)
    {
        vec low_lanes  = _mm_unpacklo_pd(a0, a1);
        vec high_lanes = _mm_unpackhi_pd(a0, a1);
        r0 = low_lanes;
        r1 = high_lanes;
    }

    // Loads two doubles starting at `p` using the unaligned-load intrinsic.
    static vec unaligned_load(T* p)
    {
        return _mm_loadu_pd(p);
    }

    // Stores both lanes of `v` starting at `p` using the unaligned-store
    // intrinsic.
    static void unaligned_store(T* p, vec v)
    {
        _mm_storeu_pd(p, v);
    }

    // Returns `v` with its two lanes exchanged.
    static vec reverse(vec v)
    {
        return _mm_shuffle_pd!1(v, v);
    }

    // Reinterprets a pointer to doubles as a pointer to vectors.
    // NOTE(review): dereferencing the result presumably requires `a` to be
    // aligned for `vec` -- confirm against callers.
    private static vec * v(T * a)
    {
        return cast(vec*)a;
    }

    // Reads the four doubles arr[0 .. 4] as two vectors and splits them:
    // rr receives (arr[0], arr[2]) and ri receives (arr[1], arr[3]).
    // The `len` template argument is not used by this implementation.
    static void complex_array_to_real_imag_vec(int len)(
        T * arr, ref vec rr, ref vec ri)
    {
        vec * pairs = v(arr);
        interleave(pairs[0], pairs[1], rr, ri);
    }

    // With two lanes per vector, de-interleaving is the same operation as
    // interleaving.
    alias deinterleave = interleave;

    // Transposes the 2x2 matrix whose rows are a0 and a1 into r0 and r1.
    // Only instantiable with elements_per_vector == 2.
    static void  transpose(int elements_per_vector)(
            vec a0,  vec a1, ref vec r0, ref vec r1)
    {
        static if(elements_per_vector != 2)
            static assert(0);
        else
            interleave(a0, a1, r0, r1);
    }

    // Applies the same 4x4 tile permutation as bit_reverse to the tile at p0
    // and to the tile at p1, and exchanges them: the permuted p0 tile is
    // written to p1 and the permuted p1 tile is written to p0.
    // Row k of a tile is the four doubles starting m * k elements after the
    // tile's base pointer.
    static void bit_reverse_swap(T * p0, T * p1, size_t m)
    {
        vec * row_a0 = v(p0);
        vec * row_a1 = v(p0 + m);
        vec * row_a2 = v(p0 + m * 2);
        vec * row_a3 = v(p0 + m * 3);

        vec * row_b0 = v(p1);
        vec * row_b1 = v(p1 + m);
        vec * row_b2 = v(p1 + m * 2);
        vec * row_b3 = v(p1 + m * 3);

        // Phase 1: first vector of rows 0 and 2.
        {
            vec from_a0 = row_a0[0];
            vec from_a1 = row_a2[0];
            vec from_b0 = row_b0[0];
            vec from_b1 = row_b2[0];
            interleave(from_a0, from_a1, from_a0, from_a1);
            interleave(from_b0, from_b1, from_b0, from_b1);
            row_b0[0] = from_a0;
            row_b2[0] = from_a1;
            row_a0[0] = from_b0;
            row_a2[0] = from_b1;
        }

        // Phase 2: second vector of rows 1 and 3.
        {
            vec from_a0 = row_a1[1];
            vec from_a1 = row_a3[1];
            vec from_b0 = row_b1[1];
            vec from_b1 = row_b3[1];
            interleave(from_a0, from_a1, from_a0, from_a1);
            interleave(from_b0, from_b1, from_b0, from_b1);
            row_b1[1] = from_a0;
            row_b3[1] = from_a1;
            row_a1[1] = from_b0;
            row_a3[1] = from_b1;
        }

        // Phase 3: the remaining vectors. These also trade places between
        // the even rows (0, 2) and the odd rows (1, 3). All loads from a
        // tile happen before any store into it.
        {
            vec even_a0 = row_a0[1];
            vec even_a1 = row_a2[1];
            vec odd_a0  = row_a1[0];
            vec odd_a1  = row_a3[0];
            interleave(even_a0, even_a1, even_a0, even_a1);
            interleave(odd_a0, odd_a1, odd_a0, odd_a1);

            vec even_b0 = row_b0[1];
            vec even_b1 = row_b2[1];
            vec odd_b0  = row_b1[0];
            vec odd_b1  = row_b3[0];

            row_b0[1] = odd_a0;
            row_b2[1] = odd_a1;
            row_b1[0] = even_a0;
            row_b3[0] = even_a1;

            interleave(even_b0, even_b1, even_b0, even_b1);
            interleave(odd_b0, odd_b1, odd_b0, odd_b1);

            row_a0[1] = odd_b0;
            row_a2[1] = odd_b1;
            row_a1[0] = even_b0;
            row_a3[0] = even_b1;
        }
    }

    // Permutes, in place, a 4x4 tile of doubles whose row k is the four
    // doubles starting at p + m * k. Writing e(r, c) for the original element
    // at row r, column c, the rows afterwards are:
    //   row 0: e(0,0) e(2,0) e(1,0) e(3,0)
    //   row 1: e(0,2) e(2,2) e(1,2) e(3,2)
    //   row 2: e(0,1) e(2,1) e(1,1) e(3,1)
    //   row 3: e(0,3) e(2,3) e(1,3) e(3,3)
    // i.e. a transpose in which both indices are 2-bit reversed.
    static void bit_reverse(T * p, size_t m)
    {
        vec * row0 = v(p);
        vec * row1 = v(p + m);
        vec * row2 = v(p + m * 2);
        vec * row3 = v(p + m * 3);

        // Phase 1: vectors that stay in their own slot.
        {
            vec even_lo = row0[0];
            vec even_hi = row2[0];
            vec odd_lo  = row1[1];
            vec odd_hi  = row3[1];
            interleave(even_lo, even_hi, even_lo, even_hi);
            interleave(odd_lo, odd_hi, odd_lo, odd_hi);
            row0[0] = even_lo;
            row2[0] = even_hi;
            row1[1] = odd_lo;
            row3[1] = odd_hi;
        }

        // Phase 2: vectors that trade places between even and odd rows.
        // All four loads happen before the first store.
        {
            vec even_lo = row0[1];
            vec even_hi = row2[1];
            vec odd_lo  = row1[0];
            vec odd_hi  = row3[0];
            interleave(even_lo, even_hi, even_lo, even_hi);
            interleave(odd_lo, odd_hi, odd_lo, odd_hi);
            row0[1] = odd_lo;
            row2[1] = odd_hi;
            row1[0] = even_lo;
            row3[0] = even_hi;
        }
    }
}
143 
// Compile-time tuning constants for this double-precision backend.
// NOTE(review): the meaning of each value is defined by whatever code reads
// them (presumably dplug.fft.fft_impl) -- confirm there before changing any.
struct Options
{
    // Bit-reversal settings.
    enum log2_bitreverse_large_chunk_size = 5;
    enum large_limit = 13;

    // Recursion / pass scheduling settings.
    enum log2_optimal_n = 10;
    enum passes_per_recursive_call = 4;
    enum log2_recursive_passes_chunk_size = 5;

    // 4096; the name suggests a byte alignment -- TODO confirm the unit.
    enum prefered_alignment = 4 * 1024;

    // Anonymous enum: declares the member `fast_init` (value 0).
    enum { fast_init };
}
154