1 // Copyright Jernej Krempuš 2012
2 // Copyright Guillaume Piolat 2016-2023
3 // Distributed under the Boost Software License, Version 1.0.
4 // (See accompanying file LICENSE_1_0.txt or copy at
5 // http://www.boost.org/LICENSE_1_0.txt)
6 module dplug.fft.sse_float;
7
8 import inteli.emmintrin;
9 import dplug.fft.fft_impl;
10
// SSE backend operating on four packed floats (float4).
// All members are static; the struct is only used as a namespace of
// vector primitives (shuffles, loads/stores, bit-reversal helpers).
struct Vector
{
nothrow:
@nogc:
    // SIMD vector type and the scalar element type it carries.
    alias float4 vec;
    alias float T;

    // Number of T elements in one vec.
    enum vec_size = 4;
    // NOTE(review): presumably log2 of the 4 vectors handled at once by
    // bit_reverse / bit_reverse_swap below -- confirm against the consumer.
    enum log2_bitreverse_chunk_size = 2;

    // Returns a vector with all four lanes set to `a`.
    static vec scalar_to_vector()(float a)
    {
        return _mm_set1_ps(a);
    }

    // Two-operand shuffle. With the _MM_SHUFFLE argument order used here the
    // result is [a[m3], a[m2], b[m1], b[m0]] (lane 0 first).
    static auto shufps(int m0, int m1, int m2, int m3)(float4 a, float4 b)
    {
        enum shufmask = _MM_SHUFFLE(m0, m1, m2, m3);
        return _mm_shuffle_ps!shufmask(a, b);
    }

    // Returns [a[0], b[0], a[1], b[1]].
    static vec unpcklps(vec a, vec b)
    {
        return _mm_unpacklo_ps(a, b);
    }

    // Returns [a[2], b[2], a[3], b[3]].
    static vec unpckhps(vec a, vec b)
    {
        return _mm_unpackhi_ps(a, b);
    }

    // Loads four consecutive floats starting at `p`; no alignment required.
    static vec unaligned_load(T* p)
    {
        return _mm_loadu_ps(p);
    }

    // Stores the four lanes of `v` to `p`; no alignment required.
    static void unaligned_store(T* p, vec v)
    {
        _mm_storeu_ps(p, v);
    }

    // Returns the lanes of `v` in reverse order: [v[3], v[2], v[1], v[0]].
    static vec reverse(vec v)
    {
        return _mm_shuffle_ps!(_MM_SHUFFLE(0, 1, 2, 3))(v, v);
    }


    // Splits `arr` into even-indexed elements (rr) and odd-indexed elements
    // (ri); presumably these are the real and imaginary parts of interleaved
    // complex numbers.
    //   len == 2: reads arr[0..4];
    //             rr = [arr[0], arr[0], arr[2], arr[2]],
    //             ri = [arr[1], arr[1], arr[3], arr[3]].
    //   len == 4: reads arr[0..8];
    //             rr = [arr[0], arr[2], arr[4], arr[6]],
    //             ri = [arr[1], arr[3], arr[5], arr[7]].
    // For any other `len` the outputs are left untouched.
    // NOTE(review): `arr` is dereferenced through a vec pointer, so it
    // presumably has to satisfy the alignment of vec -- confirm with callers.
    static void complex_array_to_real_imag_vec(int len)(
        float * arr, ref vec rr, ref vec ri)
    {
        static if(len==2)
        {
            rr = ri = (cast(vec*)arr)[0];
            rr = shufps!(2,2,0,0)(rr, rr);    // I could use __builtin_ia32_movsldup here but it doesn't seem to increase performance
            ri = shufps!(3,3,1,1)(ri, ri);
        }
        else static if(len==4)
        {
            vec tmp = (cast(vec*)arr)[0];
            ri = (cast(vec*)arr)[1];
            // ri still holds arr[4..8] in both shuffles: the second one reads
            // it before overwriting it.
            rr = shufps!(2,0,2,0)(tmp, ri);
            ri = shufps!(3,1,3,1)(tmp, ri);
        }
    }

    // Transposes a0/a1 viewed as rows of groups of lanes.
    //   elements_per_vector == 4:
    //       r0 = [a0[0], a1[0], a0[2], a1[2]],
    //       r1 = [a0[1], a1[1], a0[3], a1[3]].
    //   elements_per_vector == 2:
    //       r0 = [a0[0], a0[1], a1[0], a1[1]],
    //       r1 = [a0[2], a0[3], a1[2], a1[3]].
    // For any other value the outputs are left untouched.
    // The branch is chosen with `static if` because elements_per_vector is a
    // compile-time template argument (same dispatch style as
    // complex_array_to_real_imag_vec).
    static void transpose(int elements_per_vector)(
        vec a0, vec a1, ref vec r0, ref vec r1)
    {
        static if(elements_per_vector==4)
        {
            r0 = shufps!(2,0,2,0)(a0,a1);
            r1 = shufps!(3,1,3,1)(a0,a1);
            r0 = shufps!(3,1,2,0)(r0,r0);
            r1 = shufps!(3,1,2,0)(r1,r1);
        }
        else static if(elements_per_vector==2)
        {
            r0 = shufps!(1,0,1,0)(a0,a1);
            r1 = shufps!(3,2,3,2)(a0,a1);
        }
    }

    // r0 = [a0[0], a1[0], a0[1], a1[1]], r1 = [a0[2], a1[2], a0[3], a1[3]].
    static void interleave(
        vec a0, vec a1, ref vec r0, ref vec r1)
    {
        r0 = unpcklps(a0,a1);
        r1 = unpckhps(a0,a1);
    }

    // Inverse of interleave:
    // r0 = [a0[0], a0[2], a1[0], a1[2]], r1 = [a0[1], a0[3], a1[1], a1[3]].
    static void deinterleave(
        vec a0, vec a1, ref vec r0, ref vec r1)
    {
        r0 = shufps!(2,0,2,0)(a0,a1);
        r1 = shufps!(3,1,3,1)(a0,a1);
    }

    // Reinterprets a float pointer as a pointer to float4.
    private static float4 * v()(float * a)
    {
        return cast(float4*)a;
    }

    // Bit-reversal permutation of the 16 floats held in a0..a3.
    // Output lane j of r_i receives lane rev2(i) of a_rev2(j), where rev2
    // swaps the two bits of a 2-bit index (0,1,2,3 -> 0,2,1,3).
    // Inputs are taken by value, so the outputs may alias the storage the
    // inputs were read from.
    private static void br16()(
        float4 a0, float4 a1, float4 a2, float4 a3,
        ref float4 r0, ref float4 r1, ref float4 r2, ref float4 r3)
    {
        // First stage: gather the low / high halves of the input rows.
        float4 b0 = shufps!(1,0,1,0)(a0, a2);
        float4 b1 = shufps!(1,0,1,0)(a1, a3);
        float4 b2 = shufps!(3,2,3,2)(a0, a2);
        float4 b3 = shufps!(3,2,3,2)(a1, a3);
        // Second stage: pick even / odd lanes of the intermediate rows.
        r0 = shufps!(2,0,2,0)(b0, b1);
        r1 = shufps!(2,0,2,0)(b2, b3);
        r2 = shufps!(3,1,3,1)(b0, b1);
        r3 = shufps!(3,1,3,1)(b2, b3);
    }

    // Applies br16 to the four vectors at p0 + k*m and the four vectors at
    // p1 + k*m (k = 0..3, m in floats) and exchanges the results: the
    // permuted p0 block is written to p1 and the permuted p1 block to p0.
    // NOTE(review): accesses go through float4 pointers, so the addresses
    // presumably need float4 alignment -- confirm with callers.
    static void bit_reverse_swap()(float * p0, float * p1, size_t m)
    {
        // Save the p1 block before it is overwritten by the first br16.
        float4 b0 = *v(p1 + 0 * m);
        float4 b1 = *v(p1 + 1 * m);
        float4 b2 = *v(p1 + 2 * m);
        float4 b3 = *v(p1 + 3 * m);

        br16(*v(p0 + 0 * m), *v(p0 + 1 * m), *v(p0 + 2 * m), *v(p0 + 3 * m),
             *v(p1 + 0 * m), *v(p1 + 1 * m), *v(p1 + 2 * m), *v(p1 + 3 * m));

        br16(b0, b1, b2, b3,
             *v(p0 + 0 * m), *v(p0 + 1 * m), *v(p0 + 2 * m), *v(p0 + 3 * m));
    }

    // Applies br16 in place to the four vectors at p + k*m (k = 0..3).
    // Safe despite reading and writing the same memory because br16 takes
    // its inputs by value.
    static void bit_reverse()(float * p, size_t m)
    {
        br16(*v(p + 0 * m), *v(p + 1 * m), *v(p + 2 * m), *v(p + 3 * m),
             *v(p + 0 * m), *v(p + 1 * m), *v(p + 2 * m), *v(p + 3 * m));
    }
}
146
// Compile-time tuning constants for this backend.
// NOTE(review): the code that reads these is not in this file (presumably
// dplug.fft.fft_impl) -- the meaning of each value should be confirmed there.
struct Options
{
    enum int log2_bitreverse_large_chunk_size = 5;
    enum int large_limit = 14;
    enum int log2_optimal_n = 10;
    enum int passes_per_recursive_call = 4;
    enum int log2_recursive_passes_chunk_size = 5;
    enum int prefered_alignment = 4 * 1024; // 4096
    // Optional flag, intentionally left disabled:
    //enum { fast_init };
}
157
// scalar_to_vector followed by unaligned_store must write the same value
// into all four lanes.
unittest
{
    float[4] stored;
    Vector.vec broadcast = Vector.scalar_to_vector(4.0f);
    Vector.unaligned_store(stored.ptr, broadcast);

    float[4] expected = [4.0f, 4.0f, 4.0f, 4.0f];
    assert(stored == expected);
}
167
// Checks reverse, unpcklps and unpckhps through a load/store round trip.
unittest
{
    float[4] input = [2.0f, 3.0f, 4.0f, 5.0f];
    float[4] result;
    float[4] expected;

    // reverse: lane order is flipped.
    Vector.vec reversed = Vector.reverse(Vector.unaligned_load(input.ptr));
    Vector.unaligned_store(result.ptr, reversed);
    expected = [5.0f, 4.0f, 3.0f, 2.0f];
    assert(result == expected);

    // unpcklps: the two low lanes of each operand, interleaved.
    Vector.vec unpacked = Vector.unpcklps(reversed, reversed);
    Vector.unaligned_store(result.ptr, unpacked);
    expected = [5.0f, 5.0f, 4.0f, 4.0f];
    assert(result == expected);

    // unpckhps: the two high lanes of each operand, interleaved.
    unpacked = Vector.unpckhps(reversed, reversed);
    Vector.unaligned_store(result.ptr, unpacked);
    expected = [3.0f, 3.0f, 2.0f, 2.0f];
    assert(result == expected);
}
191
// shufps!(3,1,2,1)(x, x) must yield [x[1], x[2], x[1], x[3]].
unittest
{
    float[4] input = [-1.0f, 2.0f, 3.0f, 4.0f];
    float[4] result;

    Vector.vec loaded = Vector.unaligned_load(input.ptr);
    Vector.unaligned_store(result.ptr, Vector.shufps!(3,1,2,1)(loaded, loaded));

    float[4] expected = [2.0f, 3.0f, 2.0f, 4.0f];
    assert(result == expected);
}