1 //          Copyright Jernej Krempuš 2012
2 // Distributed under the Boost Software License, Version 1.0.
3 //    (See accompanying file LICENSE_1_0.txt or copy at
4 //          http://www.boost.org/LICENSE_1_0.txt)
5 
6 module dplug.fft.sse_double;
7 
8 import core.simd;
9 
10 import dplug.fft.fft_impl;
11 
/// Builds an 8-bit shuffle immediate from four selectors: `a0` lands in
/// bits 0-1, `a1` in bits 2-3, `a2` in bits 4-5 and `a3` in bits 6-7.
/// The arguments are not range-checked; a value outside 0..3 spills into
/// the neighbouring fields.
enum shuf_mask(int a3, int a2, int a1, int a0) =
    (a3 << 6) | (a2 << 4) | (a1 << 2) | a0;
16 
17 import dplug.fft.ldc_compat;
18 import dplug.fft.dmd32_compat;
19 
/// Vector backend working on `double2` (two `double` lanes per vector).
///
/// The lane primitives are selected per compiler with `version` blocks
/// (GDC builtins, LDC `shufflevector`/unaligned helpers, DMD `__simd`, or a
/// field-by-field fallback when `D_SIMD` is not defined). The remaining
/// members are written only in terms of `interleave` and of reinterpreting
/// `T*` as `vec*`.
struct Vector
{
nothrow:
@nogc:
    alias vec = double2;
    alias T = double;

    enum vec_size = 2;
    enum log2_bitreverse_chunk_size = 2;

    version(GNU)
    {
        import gcc.builtins;

        /// Returns a vector built from the scalar `a` through the implicit
        /// scalar-to-vector conversion.
        static vec scalar_to_vector(T a)
        {
            vec broadcast = a;
            return broadcast;
        }

        /// r0 = (a0[0], a1[0]), r1 = (a0[1], a1[1]).
        /// The inputs are taken by value, so passing the same variables as
        /// inputs and outputs is fine.
        static void interleave(vec a0, vec a1, ref vec r0, ref vec r1)
        {
            r0 = __builtin_ia32_unpcklpd(a0, a1);
            r1 = __builtin_ia32_unpckhpd(a0, a1);
        }

        /// Loads one vector from `p` with the unaligned-load builtin.
        static vec unaligned_load(T* p)
        {
            return __builtin_ia32_loadupd(p);
        }

        /// Stores `v` to `p` with the unaligned-store builtin.
        static void unaligned_store(T* p, vec v)
        {
            __builtin_ia32_storeupd(p, v);
        }

        /// Returns `v` with its two lanes exchanged.
        static vec reverse(vec v)
        {
            return __builtin_ia32_shufpd(v, v, 1);
        }
    }
    else version(LDC)
    {
        /// Returns a vector built from the scalar `a` through the implicit
        /// scalar-to-vector conversion.
        static vec scalar_to_vector(T a)
        {
            vec broadcast = a;
            return broadcast;
        }

        /// r0 = (a0[0], a1[0]), r1 = (a0[1], a1[1]).
        /// The inputs are taken by value, so passing the same variables as
        /// inputs and outputs is fine.
        static void interleave(vec a0, vec a1, ref vec r0, ref vec r1)
        {
            // Indices 0..1 select from a0, 2..3 select from a1.
            r0 = shufflevector!(vec, 0, 2)(a0, a1);
            r1 = shufflevector!(vec, 1, 3)(a0, a1);
        }

        /// Loads one vector from `p` via `loadUnaligned`.
        static vec unaligned_load(T* p)
        {
            return loadUnaligned!vec(p);
        }

        /// Stores `v` to `p` via `storeUnaligned`.
        static void unaligned_store(T* p, vec v)
        {
            storeUnaligned!vec(v, p);
        }

        /// Returns `v` with its two lanes exchanged.
        static vec reverse(vec v)
        {
            return shufflevector!(vec, 1, 0)(v, v);
        }
    }
    else version(DigitalMars)
    {
        version(D_SIMD)
        {
            /// Returns a vector with every lane set to `a`.
            static vec scalar_to_vector(T a)
            {
                vec r;
                foreach (lane; 0 .. vec_size)
                    r.ptr[lane] = a;
                return r;
            }

            /// r0 = (a0[0], a1[0]), r1 = (a0[1], a1[1]).
            /// The inputs are taken by value, so passing the same variables
            /// as inputs and outputs is fine.
            static void interleave(vec a0, vec a1, ref vec r0, ref vec r1)
            {
                r0 = cast(vec) __simd(XMM.UNPCKLPD, a0, a1);
                r1 = cast(vec) __simd(XMM.UNPCKHPD, a0, a1);
            }
        }
        else
        {
            // Without D_SIMD, `vec` is used as an aggregate with `x` and `y`
            // fields (presumably supplied by dplug.fft.dmd32_compat -- confirm).

            /// Returns a vector with both fields set to `a`.
            static vec scalar_to_vector(T a)
            {
                return vec(a, a);
            }

            /// r0 = (a0.x, a1.x), r1 = (a0.y, a1.y).
            static void interleave(vec a0, vec a1, ref vec r0, ref vec r1)
            {
                r1.y = a1.y;
                r1.x = a0.y;
                r0.y = a1.x;
                r0.x = a0.x;
            }
        }
    }
    else
        static assert(false, "Unsupported compiler");

    /// Reinterprets a scalar pointer as a pointer to vectors.
    private static vec* v(T* a)
    {
        return cast(vec*) a;
    }

    /// Reads the two vectors at `arr` and interleaves them into `rr` and
    /// `ri`. The `len` template argument is not used by this backend.
    static void complex_array_to_real_imag_vec(int len)(
        T* arr, ref vec rr, ref vec ri)
    {
        vec* pairs = v(arr);
        interleave(pairs[0], pairs[1], rr, ri);
    }

    alias deinterleave = interleave;

    /// Only `elements_per_vector == 2` is supported; there it is the same
    /// operation as `interleave`.
    static void transpose(int elements_per_vector)(
        vec a0, vec a1, ref vec r0, ref vec r1)
    {
        static assert(elements_per_vector == 2);
        interleave(a0, a1, r0, r1);
    }

    /// Works on two 4x4 blocks of scalars whose rows start at `p0 + m * k`
    /// and `p1 + m * k` (k = 0..3). Afterwards
    ///   p1(r, c) = old p0(rev2(c), rev2(r)) and
    ///   p0(r, c) = old p1(rev2(c), rev2(r)),
    /// where rev2 exchanges indices 1 and 2. Rows are accessed through
    /// `vec*`, so the row addresses presumably need vector alignment.
    static void bit_reverse_swap(T* p0, T* p1, size_t m)
    {
        vec* x0 = v(p0 + m * 0);
        vec* x1 = v(p0 + m * 1);
        vec* x2 = v(p0 + m * 2);
        vec* x3 = v(p0 + m * 3);

        vec* y0 = v(p1 + m * 0);
        vec* y1 = v(p1 + m * 1);
        vec* y2 = v(p1 + m * 2);
        vec* y3 = v(p1 + m * 3);

        vec xa, xb, xc, xd;
        vec ya, yb, yc, yd;

        // Left halves of rows 0 and 2: interleave and exchange between buffers.
        interleave(x0[0], x2[0], xa, xb);
        interleave(y0[0], y2[0], ya, yb);
        y0[0] = xa;
        y2[0] = xb;
        x0[0] = ya;
        x2[0] = yb;

        // Right halves of rows 1 and 3: same treatment.
        interleave(x1[1], x3[1], xc, xd);
        interleave(y1[1], y3[1], yc, yd);
        y1[1] = xc;
        y3[1] = xd;
        x1[1] = yc;
        x3[1] = yd;

        // Remaining halves: everything is loaded before anything is stored,
        // and the destinations are crossed (right half of rows 0/2 <-> left
        // half of rows 1/3) in addition to the buffer exchange.
        interleave(x0[1], x2[1], xa, xb);
        interleave(x1[0], x3[0], xc, xd);
        interleave(y0[1], y2[1], ya, yb);
        interleave(y1[0], y3[0], yc, yd);
        y0[1] = xc;
        y2[1] = xd;
        y1[0] = xa;
        y3[0] = xb;
        x0[1] = yc;
        x2[1] = yd;
        x1[0] = ya;
        x3[0] = yb;
    }

    /// In-place variant for a single 4x4 block whose rows start at
    /// `p + m * k` (k = 0..3): afterwards
    ///   p(r, c) = old p(rev2(c), rev2(r)),
    /// where rev2 exchanges indices 1 and 2. Rows are accessed through
    /// `vec*`, so the row addresses presumably need vector alignment.
    static void bit_reverse(T* p, size_t m)
    {
        vec* row0 = v(p + m * 0);
        vec* row1 = v(p + m * 1);
        vec* row2 = v(p + m * 2);
        vec* row3 = v(p + m * 3);

        vec ea, eb, oa, ob;

        // Left halves of rows 0/2 and right halves of rows 1/3 stay in place.
        interleave(row0[0], row2[0], ea, eb);
        interleave(row1[1], row3[1], oa, ob);
        row0[0] = ea;
        row2[0] = eb;
        row1[1] = oa;
        row3[1] = ob;

        // The other halves are interleaved and stored crosswise.
        interleave(row0[1], row2[1], ea, eb);
        interleave(row1[0], row3[0], oa, ob);
        row0[1] = oa;
        row2[1] = ob;
        row1[0] = ea;
        row3[0] = eb;
    }
}
225 
/// Compile-time tuning constants that accompany `Vector`.
/// NOTE(review): presumably consumed by dplug.fft.fft_impl; the meaning of
/// each value is defined there and cannot be confirmed from this file.
struct Options
{
    enum : int
    {
        log2_bitreverse_large_chunk_size = 5,
        large_limit                      = 13,
        log2_optimal_n                   = 10,
        passes_per_recursive_call        = 4,
        log2_recursive_passes_chunk_size = 5,
        prefered_alignment               = 4 * (1 << 10), // 4096
    }

    // Member of an anonymous enum; its value is 0.
    enum { fast_init }
}
236