1 //          Copyright Jernej Krempuš 2012
2 //          Copyright Guillaume Piolat 2016-2023
3 // Distributed under the Boost Software License, Version 1.0.
4 //    (See accompanying file LICENSE_1_0.txt or copy at
5 //          http://www.boost.org/LICENSE_1_0.txt)
6 module dplug.fft.sse_float;
7 
8 import inteli.emmintrin;
9 import dplug.fft.fft_impl;
10 
/// SSE single-precision back-end: a bundle of static vector primitives built
/// on the `inteli` intrinsics.
///
/// NOTE(review): presumably used as a policy type by dplug.fft.fft_impl
/// (imported by this module); the consumer is not visible from this file.
///
/// Lane notation used in the comments below: `x[0]` is the lowest lane, i.e.
/// the first float in memory order, `x[3]` the highest.
struct Vector
{
nothrow:
@nogc:
    /// SIMD register type handled by this back-end.
    alias vec = float4;
    /// Scalar element type.
    alias T = float;

    /// Number of `T` lanes held in one `vec`.
    enum vec_size = 4;
    /// NOTE(review): looks like log2 of the number of vectors processed per
    /// bit-reversal chunk (br16 below works on 4 vectors) -- confirm in fft_impl.
    enum log2_bitreverse_chunk_size = 2;

    /// Returns a vector with `a` broadcast to all four lanes.
    static vec scalar_to_vector()(float a)
    {
        return _mm_set1_ps(a);
    }

    /// Compile-time parameterised `_mm_shuffle_ps`.
    /// Result lanes: [a[m3], a[m2], b[m1], b[m0]] -- the template arguments
    /// are listed from the highest result lane down to the lowest.
    static auto shufps(int m0, int m1, int m2, int m3)(float4 a, float4 b)
    {
        enum shufmask = _MM_SHUFFLE(m0, m1, m2, m3);
        return _mm_shuffle_ps!shufmask(a, b);
    }

    /// Interleaves the low halves: [a[0], b[0], a[1], b[1]].
    static vec unpcklps(vec a, vec b)
    {
        return _mm_unpacklo_ps(a, b);
    }

    /// Interleaves the high halves: [a[2], b[2], a[3], b[3]].
    static vec unpckhps(vec a, vec b)
    {
        return _mm_unpackhi_ps(a, b);
    }

    /// Loads four floats starting at `p`; no alignment requirement.
    static vec unaligned_load(T* p)
    {
        return _mm_loadu_ps(p);
    }

    /// Stores the four lanes of `v` starting at `p`; no alignment requirement.
    static void unaligned_store(T* p, vec v)
    {
        _mm_storeu_ps(p, v);
    }

    /// Returns `v` with its lanes in reverse order: [v[3], v[2], v[1], v[0]].
    static vec reverse(vec v)
    {
        return _mm_shuffle_ps!(_MM_SHUFFLE(0, 1, 2, 3))(v, v);
    }


    /// Splits interleaved (re, im) pairs read from `arr` into a vector of
    /// real parts `rr` and a vector of imaginary parts `ri`.
    ///
    /// - `len == 2`: reads arr[0 .. 4]; each of the two complex values is
    ///   duplicated: rr = [arr[0], arr[0], arr[2], arr[2]],
    ///   ri = [arr[1], arr[1], arr[3], arr[3]].
    /// - `len == 4`: reads arr[0 .. 8]; rr = [arr[0], arr[2], arr[4], arr[6]],
    ///   ri = [arr[1], arr[3], arr[5], arr[7]].
    /// - any other `len`: the function body is empty, `rr`/`ri` are untouched.
    ///
    /// `arr` is dereferenced through a `vec*`, so it presumably has to be
    /// suitably aligned for `float4` -- TODO confirm against callers.
    static void complex_array_to_real_imag_vec(int len)(
        float * arr, ref vec rr, ref vec ri)
    {
        static if(len==2)
        {
            rr = ri = (cast(vec*)arr)[0];
            // __builtin_ia32_movsldup could be used here, but it did not seem
            // to increase performance.
            rr = shufps!(2,2,0,0)(rr, rr);
            ri = shufps!(3,3,1,1)(ri, ri);
        }
        else static if(len==4)
        {
            vec tmp = (cast(vec*)arr)[0];
            ri = (cast(vec*)arr)[1];
            rr = shufps!(2,0,2,0)(tmp, ri);
            ri = shufps!(3,1,3,1)(tmp, ri);
        }
    }

    /// Re-arranges two input vectors into two output vectors.
    ///
    /// - `elements_per_vector == 4`:
    ///   r0 = [a0[0], a1[0], a0[2], a1[2]], r1 = [a0[1], a1[1], a0[3], a1[3]].
    /// - `elements_per_vector == 2` (lanes moved in pairs):
    ///   r0 = [a0[0], a0[1], a1[0], a1[1]], r1 = [a0[2], a0[3], a1[2], a1[3]].
    /// - any other value: `r0`/`r1` are left untouched.
    ///
    /// The selection is made with `static if` because `elements_per_vector`
    /// is a compile-time template argument (same approach as
    /// complex_array_to_real_imag_vec); only the chosen branch is instantiated.
    static void transpose(int elements_per_vector)(
        vec a0,  vec a1, ref vec r0, ref vec r1)
    {
        static if(elements_per_vector==4)
        {
            // Split even/odd lanes first...
            r0 = shufps!(2,0,2,0)(a0,a1);
            r1 = shufps!(3,1,3,1)(a0,a1);
            // ...then swap the two middle lanes of each result.
            r0 = shufps!(3,1,2,0)(r0,r0);
            r1 = shufps!(3,1,2,0)(r1,r1);
        }
        else static if(elements_per_vector==2)
        {
            r0 = shufps!(1,0,1,0)(a0,a1);
            r1 = shufps!(3,2,3,2)(a0,a1);
        }
    }

    /// Interleaves the lanes of `a0` and `a1`:
    /// r0 = [a0[0], a1[0], a0[1], a1[1]], r1 = [a0[2], a1[2], a0[3], a1[3]].
    static void interleave(
        vec a0,  vec a1, ref vec r0, ref vec r1)
    {
        r0 = unpcklps(a0,a1);
        r1 = unpckhps(a0,a1);
    }

    /// Inverse of `interleave`: even lanes go to `r0`, odd lanes to `r1`.
    /// r0 = [a0[0], a0[2], a1[0], a1[2]], r1 = [a0[1], a0[3], a1[1], a1[3]].
    static void deinterleave(
        vec a0,  vec a1, ref vec r0, ref vec r1)
    {
        r0 = shufps!(2,0,2,0)(a0,a1);
        r1 = shufps!(3,1,3,1)(a0,a1);
    }

    /// Reinterprets a float pointer as a pointer to `float4`.
    private static float4 * v()(float * a)
    {
        return cast(float4*)a;
    }

    /// Permutes 16 floats held in four vectors:
    ///   r0 = [a0[0], a2[0], a1[0], a3[0]]
    ///   r1 = [a0[2], a2[2], a1[2], a3[2]]
    ///   r2 = [a0[1], a2[1], a1[1], a3[1]]
    ///   r3 = [a0[3], a2[3], a1[3], a3[3]]
    /// i.e. element (row i, lane j) moves to (row rev2(j), lane rev2(i)),
    /// where rev2 swaps 1 and 2 (2-bit bit reversal).
    ///
    /// Inputs are taken by value, so the outputs may refer to the same
    /// storage the inputs were read from.
    private static void br16()(
        float4 a0, float4 a1, float4 a2, float4 a3,
        ref float4 r0, ref float4 r1, ref float4 r2, ref float4 r3)
    {
        // Stage 1: gather the low / high 64-bit halves of rows (0,2) and (1,3).
        float4 b0 = shufps!(1,0,1,0)(a0, a2);
        float4 b1 = shufps!(1,0,1,0)(a1, a3);
        float4 b2 = shufps!(3,2,3,2)(a0, a2);
        float4 b3 = shufps!(3,2,3,2)(a1, a3);
        // Stage 2: pick even lanes, then odd lanes, of the gathered pairs.
        r0 = shufps!(2,0,2,0)(b0, b1);
        r1 = shufps!(2,0,2,0)(b2, b3);
        r2 = shufps!(3,1,3,1)(b0, b1);
        r3 = shufps!(3,1,3,1)(b2, b3);
    }

    /// Applies `br16` to the four vectors at `p0 + k*m` and to the four
    /// vectors at `p1 + k*m` (k = 0 .. 3, `m` counted in floats), storing
    /// each permuted group at the other group's location.
    ///
    /// Memory is accessed through `float4*`, so the addresses presumably
    /// have to be suitably aligned -- TODO confirm against callers.
    static void bit_reverse_swap()(float * p0, float * p1, size_t m)
    {
        // Save the p1 group first: the next call overwrites it.
        float4 b0 = *v(p1 + 0 * m);
        float4 b1 = *v(p1 + 1 * m);
        float4 b2 = *v(p1 + 2 * m);
        float4 b3 = *v(p1 + 3 * m);

        // Permuted p0 group -> p1 locations.
        br16(*v(p0 + 0 * m), *v(p0 + 1 * m), *v(p0 + 2 * m), *v(p0 + 3 * m),
             *v(p1 + 0 * m), *v(p1 + 1 * m), *v(p1 + 2 * m), *v(p1 + 3 * m));

        // Permuted (saved) p1 group -> p0 locations.
        br16(b0, b1, b2, b3,
             *v(p0 + 0 * m), *v(p0 + 1 * m), *v(p0 + 2 * m), *v(p0 + 3 * m));
    }

    /// Applies `br16` in place to the four vectors at `p + k*m`
    /// (k = 0 .. 3, `m` counted in floats). This is safe because `br16`
    /// receives its inputs by value before writing any output.
    static void bit_reverse()(float * p, size_t m)
    {
        br16(*v(p + 0 * m), *v(p + 1 * m), *v(p + 2 * m), *v(p + 3 * m),
             *v(p + 0 * m), *v(p + 1 * m), *v(p + 2 * m), *v(p + 3 * m));
    }
}
146 
/// Compile-time tuning constants for this back-end.
///
/// NOTE(review): these are presumably read by dplug.fft.fft_impl; their exact
/// meaning is not visible from this file, so the names are the only guide.
struct Options
{
    enum int log2_bitreverse_large_chunk_size = 5;
    enum int large_limit                      = 14;
    enum int log2_optimal_n                   = 10;
    enum int passes_per_recursive_call        = 4;
    enum int log2_recursive_passes_chunk_size = 5;
    enum int prefered_alignment               = 4 << 10; // == 4 * 1024 == 4096
    //enum { fast_init };
}
157 
// scalar_to_vector followed by unaligned_store must yield the scalar in
// every one of the four lanes.
unittest
{
    alias V = Vector;
    float[4] expected = [4.0f, 4.0f, 4.0f, 4.0f];
    float[4] stored;
    V.vec broadcast = V.scalar_to_vector(4.0f);
    V.unaligned_store(stored.ptr, broadcast);
    assert(stored == expected);
}
167 
// reverse, unpcklps and unpckhps checked on a vector with four distinct lanes.
unittest
{
    alias V = Vector;

    // Dumps the four lanes of a vector into a plain array for comparison.
    float[4] lanesOf(V.vec x)
    {
        float[4] dst;
        V.unaligned_store(dst.ptr, x);
        return dst;
    }

    float[4] input = [2.0f, 3.0f, 4.0f, 5.0f];
    float[4] expected;

    // reverse: lanes come back in the opposite order.
    V.vec reversed = V.reverse(V.unaligned_load(input.ptr));
    expected = [5.0f, 4.0f, 3.0f, 2.0f];
    assert(lanesOf(reversed) == expected);

    // unpcklps: low two lanes, each duplicated.
    V.vec unpacked = V.unpcklps(reversed, reversed);
    expected = [5.0f, 5.0f, 4.0f, 4.0f];
    assert(lanesOf(unpacked) == expected);

    // unpckhps: high two lanes, each duplicated.
    unpacked = V.unpckhps(reversed, reversed);
    expected = [3.0f, 3.0f, 2.0f, 2.0f];
    assert(lanesOf(unpacked) == expected);
}
191 
// shufps!(3,1,2,1)(x, x) must produce [x[1], x[2], x[1], x[3]].
unittest
{
    alias V = Vector;
    float[4] input    = [-1.0f, 2.0f, 3.0f, 4.0f];
    float[4] expected = [2.0f, 3.0f, 2.0f, 4.0f];
    float[4] actual;

    V.vec loaded = V.unaligned_load(input.ptr);
    V.vec shuffled = V.shufps!(3,1,2,1)(loaded, loaded);
    V.unaligned_store(actual.ptr, shuffled);

    assert(actual == expected);
}