//          Copyright Jernej Krempuš 2012
// Distributed under the Boost Software License, Version 1.0.
//    (See accompanying file LICENSE_1_0.txt or copy at
//          http://www.boost.org/LICENSE_1_0.txt)

module dplug.fft.sse_float;

import core.simd;

import dplug.fft.fft_impl;

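// Builds the 8-bit immediate operand used by SSE shuffle instructions: each
// argument selects a 2-bit source lane, written high lane first, so that
// shuf_mask!(a3, a2, a1, a0) puts lane a0 in bits 0-1, a1 in bits 2-3, etc.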
template shuf_mask(int a3, int a2, int a1, int a0)
{
    enum shuf_mask = a0 | (a1<<2) | (a2<<4) | (a3<<6);
}

version(X86_64)
    version(linux)
        version = linux_x86_64;


import dplug.fft.ldc_compat;
import dplug.fft.dmd32_compat;

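// Single-precision SSE primitives consumed by the generic FFT kernels in
// dplug.fft.fft_impl: a vector holds four floats, and bit reversal operates
// on 4x4 blocks (log2_bitreverse_chunk_size == 2).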
struct Vector
{
nothrow:
@nogc:
    alias float4 vec;
    alias float T;

    enum vec_size = 4;
    enum log2_bitreverse_chunk_size = 2;

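    // GDC: implemented directly on top of GCC's x86 builtins.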
    version(GNU)
    {
        import gcc.builtins;

        static vec scalar_to_vector()(T a)
        {
            return a;
        }

        private static shufps(int m0, int m1, int m2, int m3)(float4 a, float4 b)
        {
            return __builtin_ia32_shufps(a, b, shuf_mask!(m0, m1, m2, m3));
        }

        alias __builtin_ia32_unpcklps unpcklps;
        alias __builtin_ia32_unpckhps unpckhps;

        static vec unaligned_load(T* p)
        {
            return __builtin_ia32_loadups(p);
        }

        static void unaligned_store(T* p, vec v)
        {
            return __builtin_ia32_storeups(p, v);
        }

        static vec reverse(vec v)
        {
            return shufps!(0, 1, 2, 3)(v, v);
        }
    }

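    // DMD: only scalar_to_vector is provided. No shufps is defined here, so
    // the `static if(is(typeof(shufps)))` below fails, and DMD uses the
    // hand-written asm / scalar fallbacks instead of the generic SIMD path.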
    version(DigitalMars)
    {
        static vec scalar_to_vector()(float a)
        {
            version(linux_x86_64)
                asm nothrow @nogc
                {
                    naked;
                    // The float argument and the float4 result both live in
                    // XMM0; broadcast lane 0 to all four lanes.
                    shufps XMM0, XMM0, 0;
                    ret;
                }
            else
            {
                static struct quad
                {
                    align(16) float a;
                    float b;
                    float c;
                    float d;
                }
                auto q = quad(a,a,a,a);
                return *cast(vec*)& q;
            }
        }
    }

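    // LDC: implemented with the shufflevector/loadUnaligned/storeUnaligned
    // helpers pulled in through dplug.fft.ldc_compat. The shufflevector
    // indices run low element first while shufps masks are written high lane
    // first, hence the reversed template arguments.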
    version(LDC)
    {
        static vec scalar_to_vector()(float a)
        {
            return a;
        }

        static auto shufps(int m0, int m1, int m2, int m3)(float4 a, float4 b)
        {
            return shufflevector!(float4, m3, m2, m1+4, m0+4)(a, b);
        }

        static vec unpcklps(vec a, vec b)
        {
            return shufflevector!(float4, 0, 4, 1, 5)(a, b);
        }

        static vec unpckhps(vec a, vec b)
        {
            return shufflevector!(float4, 2, 6, 3, 7)(a, b);
        }

        static vec unaligned_load(T* p)
        {
            return loadUnaligned!vec(cast(float*)p);
        }

        static void unaligned_store(T* p, vec v)
        {
            storeUnaligned!vec(v, cast(float*)p);
        }

        static vec reverse(vec v)
        {
            return shufflevector!(float4, 3, 2, 1, 0)(v, v);
        }
    }

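    // Generic SIMD path, compiled only when a shufps implementation exists
    // above (GDC and LDC). On DMD the else branch near the bottom is used.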
    static if(is(typeof(shufps)))
    {
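        // Splits `len` complex numbers (interleaved re,im float pairs) at
        // arr into a vector of real parts and a vector of imaginary parts;
        // for len == 2 each part appears twice to fill the four lanes.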
        static void complex_array_to_real_imag_vec(int len)(
            float * arr, ref vec rr, ref vec ri)
        {
            static if(len==2)
            {
                rr = ri = (cast(vec*)arr)[0];
                rr = shufps!(2,2,0,0)(rr, rr);    // I could use __builtin_ia32_movsldup here but it doesn't seem to increase performance
                ri = shufps!(3,3,1,1)(ri, ri);
            }
            else static if(len==4)
            {
                vec tmp = (cast(vec*)arr)[0];
                ri = (cast(vec*)arr)[1];
                rr = shufps!(2,0,2,0)(tmp, ri);
                ri = shufps!(3,1,3,1)(tmp, ri);
            }
        }

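        // Transposes 2x2 blocks spread across the two input vectors: for
        // elements_per_vector == 2 the block elements are complex numbers
        // (float pairs), for 4 they are single floats.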
        static void transpose(int elements_per_vector)(
            vec a0,  vec a1, ref vec r0, ref vec r1)
        {
            static if(elements_per_vector==4)
            {
                r0 = shufps!(2,0,2,0)(a0,a1);
                r1 = shufps!(3,1,3,1)(a0,a1);
                r0 = shufps!(3,1,2,0)(r0,r0);
                r1 = shufps!(3,1,2,0)(r1,r1);
            }
            else static if(elements_per_vector==2)
            {
                r0 = shufps!(1,0,1,0)(a0,a1);
                r1 = shufps!(3,2,3,2)(a0,a1);
            }
        }

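        // interleave/deinterleave convert between split and interleaved
        // storage: interleave merges the two vectors element-wise, while
        // deinterleave gathers the even and odd elements into r0 and r1.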
        static void interleave(
            vec a0,  vec a1, ref vec r0, ref vec r1)
        {
            r0 = unpcklps(a0,a1);
            r1 = unpckhps(a0,a1);
        }

        static void deinterleave(
            vec a0,  vec a1, ref vec r0, ref vec r1)
        {
            r0 = shufps!(2,0,2,0)(a0,a1);
            r1 = shufps!(3,1,3,1)(a0,a1);
        }

        // Views a float pointer as a float4 pointer.
        private static float4 * v()(float * a)
        {
            return cast(float4*)a;
        }

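        // Bit-reverses a 4x4 block of 16 floats held in four vectors:
        // element r[j][k] of the output is element a[bitrev(k)][bitrev(j)]
        // of the input, where bitrev reverses a 2-bit index.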
        private static void br16()(
            float4 a0, float4 a1, float4 a2, float4 a3,
            ref float4 r0, ref float4 r1, ref float4 r2, ref float4 r3)
        {
            float4 b0 = shufps!(1,0,1,0)(a0, a2);
            float4 b1 = shufps!(1,0,1,0)(a1, a3);
            float4 b2 = shufps!(3,2,3,2)(a0, a2);
            float4 b3 = shufps!(3,2,3,2)(a1, a3);
            r0 = shufps!(2,0,2,0)(b0, b1);
            r1 = shufps!(2,0,2,0)(b2, b3);
            r2 = shufps!(3,1,3,1)(b0, b1);
            r3 = shufps!(3,1,3,1)(b2, b3);
        }

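        // Bit-reverses the two 4x4 blocks at p0 and p1 (rows m floats
        // apart) and swaps them in place.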
        static void bit_reverse_swap()(float * p0, float * p1, size_t m)
        {
            float4 b0 = *v(p1 + 0 * m);
            float4 b1 = *v(p1 + 1 * m);
            float4 b2 = *v(p1 + 2 * m);
            float4 b3 = *v(p1 + 3 * m);

            br16(*v(p0 + 0 * m), *v(p0 + 1 * m), *v(p0 + 2 * m), *v(p0 + 3 * m),
                 *v(p1 + 0 * m), *v(p1 + 1 * m), *v(p1 + 2 * m), *v(p1 + 3 * m));

            br16(b0, b1, b2, b3,
                 *v(p0 + 0 * m), *v(p0 + 1 * m), *v(p0 + 2 * m), *v(p0 + 3 * m));
        }

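        // Bit-reverses a single 4x4 block in place.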
        static void bit_reverse()(float * p, size_t m)
        {
            br16(*v(p + 0 * m), *v(p + 1 * m), *v(p + 2 * m), *v(p + 3 * m),
                 *v(p + 0 * m), *v(p + 1 * m), *v(p + 2 * m), *v(p + 3 * m));
        }
    }
    else
    {
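        // DMD path: hand-written SSE assembly for Linux x86-64. Note that
        // DMD's extern(D) convention passes arguments in reverse order here,
        // so m arrives in RDI and the pointer(s) in RSI (and RDX). Other
        // targets fall back to the scalar code from dplug.fft.fft_impl.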
        static void bit_reverse()(T * p0, size_t m)
        {
            version(linux_x86_64)
                asm nothrow @nogc
                {
                    naked;
                    lea     RAX,[RDI+RDI*1];
                    lea     RCX,[RSI+RDI*4];
                    lea     RDI,[RDI+RDI*2];
                    movaps  XMM1,[RSI];
                    lea     RDX,[RSI+RAX*4];
                    lea     R8,[RSI+RDI*4];
                    movaps  XMM0,[RCX];
                    movaps  XMM3,XMM1;
                    movaps  XMM5,[RDX];
                    movaps  XMM2,XMM0;
                    movaps  XMM4,[R8];
                    shufps  XMM1,XMM5,0xEE;
                    movlhps XMM3,XMM5;
                    shufps  XMM0,XMM4,0xEE;
                    movlhps XMM2,XMM4;
                    movaps  XMM6,XMM3;
                    movaps  XMM7,XMM1;
                    shufps  XMM3,XMM2,0xDD;
                    shufps  XMM6,XMM2,0x88;
                    shufps  XMM7,XMM0,0x88;
                    shufps  XMM1,XMM0,0xDD;
                    movaps  [RSI],XMM6;
                    movaps  [RCX],XMM7;
                    movaps  [RDX],XMM3;
                    movaps  [R8],XMM1;
                    ret;
                }
            else
                Scalar!T.bit_reverse(p0, m);
        }

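        // Same operation as the generic bit_reverse_swap above: bit-reverses
        // the two 4x4 blocks at p0 and p1 and swaps them.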
        static void bit_reverse_swap()(T * p0, T * p1, size_t m)
        {
            version(linux_x86_64)
                asm nothrow @nogc
                {
                    naked;
                    lea     RAX,[RDI+RDI*1];
                    lea     RCX,[RDI*4+0x0];
                    lea     RDI,[RDI+RDI*2];
                    movaps  XMM1,[RSI];
                    shl     RAX,0x2;
                    lea     R10,[RSI+RCX*1];
                    shl     RDI,0x2;
                    lea     R9,[RSI+RAX*1];
                    movaps  XMM3,[RDX];
                    add     RAX,RDX;
                    lea     R8,[RSI+RDI*1];
                    add     RCX,RDX;
                    movaps  XMM5,[R9];
                    add     RDI,RDX;
                    movaps  XMM7,XMM3;
                    movaps  XMM9,[RAX];
                    movaps  XMM12,XMM1;
                    shufps  XMM1,XMM5,0xEE;
                    movaps  XMM0,[R10];
                    shufps  XMM3,XMM9,0xEE;
                    movlhps XMM7,XMM9;
                    movaps  XMM2,[RCX];
                    movlhps XMM12,XMM5;
                    movaps  XMM13,XMM0;
                    movaps  XMM4,[R8];
                    movaps  XMM6,XMM2;
                    movaps  XMM10,XMM7;
                    movaps  XMM8,[RDI];
                    shufps  XMM0,XMM4,0xEE;
                    movaps  XMM11,XMM3;
                    shufps  XMM2,XMM8,0xEE;
                    movlhps XMM6,XMM8;
                    movlhps XMM13,XMM4;
                    movaps  XMM14,XMM12;
                    movaps  XMM15,XMM1;
                    shufps  XMM10,XMM6,0x88;
                    shufps  XMM11,XMM2,0x88;
                    shufps  XMM7,XMM6,0xDD;
                    shufps  XMM3,XMM2,0xDD;
                    shufps  XMM14,XMM13,0x88;
                    shufps  XMM15,XMM0,0x88;
                    shufps  XMM12,XMM13,0xDD;
                    shufps  XMM1,XMM0,0xDD;
                    movaps  [RSI],XMM10;
                    movaps  [R10],XMM11;
                    movaps  [R9],XMM7;
                    movaps  [R8],XMM3;
                    movaps  [RDX],XMM14;
                    movaps  [RCX],XMM15;
                    movaps  [RAX],XMM12;
                    movaps  [RDI],XMM1;
                    ret;
                }
            else
                Scalar!T.bit_reverse_swap(p0, p1, m);
        }
    }
}

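// Tuning constants read by the generic FFT machinery in dplug.fft.fft_impl,
// which defines what each knob means; prefered_alignment is 4 KiB, i.e. one
// memory page.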
struct Options
{
    enum log2_bitreverse_large_chunk_size = 5;
    enum large_limit = 14;
    enum log2_optimal_n = 10;
    enum passes_per_recursive_call = 4;
    enum log2_recursive_passes_chunk_size = 5;
    enum prefered_alignment = 4 * (1 << 10);
    //enum { fast_init };
}