// Copyright Jernej Krempuš 2012
// Copyright Guillaume Piolat 2016-2023
// Distributed under the Boost Software License, Version 1.0.
//    (See accompanying file LICENSE_1_0.txt or copy at
//          http://www.boost.org/LICENSE_1_0.txt)
module dplug.fft.sse_float;

import inteli.emmintrin;
import dplug.fft.fft_impl;

/// Four-lane single-precision vector primitives expressed with the SSE
/// intrinsics from `inteli.emmintrin`.
/// NOTE(review): presumably consumed as a policy type by `dplug.fft.fft_impl`
/// - confirm against that module.
struct Vector
{
nothrow:
@nogc:
    /// SIMD register type.
    alias vec = float4;

    /// Scalar element type.
    alias T = float;

    /// Number of `T` lanes held by one `vec`.
    enum vec_size = 4;

    enum log2_bitreverse_chunk_size = 2;

    /// Returns a vector whose four lanes all hold `a`.
    static vec scalar_to_vector()(float a)
    {
        return _mm_set1_ps(a);
    }

    /// Shuffles lanes of `a` and `b` using the mask `_MM_SHUFFLE(m0, m1, m2, m3)`.
    ///
    /// Per the `_mm_shuffle_ps` contract the result is
    /// `[a[m3], a[m2], b[m1], b[m0]]`; e.g. `shufps!(3,1,2,1)(x, x)` yields
    /// `[x[1], x[2], x[1], x[3]]` (see the unittest at the end of this module).
    static auto shufps(int m0, int m1, int m2, int m3)(float4 a, float4 b)
    {
        enum shufmask = _MM_SHUFFLE(m0, m1, m2, m3);
        return _mm_shuffle_ps!shufmask(a, b);
    }

    /// Interleaves the two low lanes of `a` and `b` (`_mm_unpacklo_ps`).
    static vec unpcklps(vec a, vec b)
    {
        return _mm_unpacklo_ps(a, b);
    }

    /// Interleaves the two high lanes of `a` and `b` (`_mm_unpackhi_ps`).
    static vec unpckhps(vec a, vec b)
    {
        return _mm_unpackhi_ps(a, b);
    }

    /// Loads four floats starting at `p` with `_mm_loadu_ps`.
    static vec unaligned_load(T* p)
    {
        return _mm_loadu_ps(p);
    }

    /// Stores the four lanes of `v` starting at `p` with `_mm_storeu_ps`.
    static void unaligned_store(T* p, vec v)
    {
        _mm_storeu_ps(p, v);
    }

    /// Returns `v` with its lane order reversed: `[v[3], v[2], v[1], v[0]]`.
    static vec reverse(vec v)
    {
        return _mm_shuffle_ps!(_MM_SHUFFLE(0, 1, 2, 3))(v, v);
    }

    /// Splits interleaved (even, odd) float pairs read from `arr` into `rr`
    /// (even-indexed elements) and `ri` (odd-indexed elements).
    ///
    /// - `len == 2`: reads 4 floats `[x0, x1, x2, x3]` and produces
    ///   `rr = [x0, x0, x2, x2]`, `ri = [x1, x1, x3, x3]`.
    /// - `len == 4`: reads 8 floats and produces
    ///   `rr = [x0, x2, x4, x6]`, `ri = [x1, x3, x5, x7]`.
    /// - any other `len`: compiles to a no-op and leaves `rr`/`ri` untouched.
    ///
    /// `arr` is dereferenced through a `vec*` cast.
    /// NOTE(review): this presumably requires `arr` to be aligned for `vec`
    /// - confirm against callers.
    static void complex_array_to_real_imag_vec(int len)(
        float* arr, ref vec rr, ref vec ri)
    {
        static if (len == 2)
        {
            rr = ri = (cast(vec*) arr)[0];
            // I could use __builtin_ia32_movsldup here but it doesn't seem
            // to increase performance
            rr = shufps!(2, 2, 0, 0)(rr, rr);
            ri = shufps!(3, 3, 1, 1)(ri, ri);
        }
        else static if (len == 4)
        {
            vec tmp = (cast(vec*) arr)[0];
            ri = (cast(vec*) arr)[1];
            rr = shufps!(2, 0, 2, 0)(tmp, ri);
            ri = shufps!(3, 1, 3, 1)(tmp, ri);
        }
    }

    /// Rearranges the lanes of `a0` and `a1` into `r0` and `r1`.
    ///
    /// - `elements_per_vector == 4`:
    ///   `r0 = [a0[0], a1[0], a0[2], a1[2]]`, `r1 = [a0[1], a1[1], a0[3], a1[3]]`.
    /// - `elements_per_vector == 2` (lanes moved in pairs):
    ///   `r0 = [a0[0], a0[1], a1[0], a1[1]]`, `r1 = [a0[2], a0[3], a1[2], a1[3]]`.
    /// - any other value: compiles to a no-op and leaves `r0`/`r1` untouched.
    ///
    /// The selection is made at compile time with `static if`, matching
    /// `complex_array_to_real_imag_vec`, since `elements_per_vector` is a
    /// template parameter.
    static void transpose(int elements_per_vector)(
        vec a0, vec a1, ref vec r0, ref vec r1)
    {
        static if (elements_per_vector == 4)
        {
            // a0/a1 are by-value copies, so writing r0 first cannot disturb
            // the inputs of the second shuffle even if the caller aliases them.
            r0 = shufps!(2, 0, 2, 0)(a0, a1);
            r1 = shufps!(3, 1, 3, 1)(a0, a1);
            r0 = shufps!(3, 1, 2, 0)(r0, r0);
            r1 = shufps!(3, 1, 2, 0)(r1, r1);
        }
        else static if (elements_per_vector == 2)
        {
            r0 = shufps!(1, 0, 1, 0)(a0, a1);
            r1 = shufps!(3, 2, 3, 2)(a0, a1);
        }
    }

    /// Interleaves lanes: `r0 = [a0[0], a1[0], a0[1], a1[1]]`,
    /// `r1 = [a0[2], a1[2], a0[3], a1[3]]`.
    static void interleave(
        vec a0, vec a1, ref vec r0, ref vec r1)
    {
        r0 = unpcklps(a0, a1);
        r1 = unpckhps(a0, a1);
    }

    /// Inverse of `interleave`: `r0 = [a0[0], a0[2], a1[0], a1[2]]`,
    /// `r1 = [a0[1], a0[3], a1[1], a1[3]]`.
    static void deinterleave(
        vec a0, vec a1, ref vec r0, ref vec r1)
    {
        r0 = shufps!(2, 0, 2, 0)(a0, a1);
        r1 = shufps!(3, 1, 3, 1)(a0, a1);
    }

    /// Reinterprets a float pointer as a pointer to `float4`.
    private static float4* v()(float* a)
    {
        return cast(float4*) a;
    }

    /// Permutes the 16 floats held in `a0..a3` into `r0..r3`:
    ///
    ///   r0 = [a0[0], a2[0], a1[0], a3[0]]
    ///   r1 = [a0[2], a2[2], a1[2], a3[2]]
    ///   r2 = [a0[1], a2[1], a1[1], a3[1]]
    ///   r3 = [a0[3], a2[3], a1[3], a3[3]]
    ///
    /// i.e. a 4x4 transpose in which both the row order and the lane order
    /// are 2-bit bit-reversed (0, 2, 1, 3). Inputs are taken by value, so the
    /// outputs may refer to the same storage the inputs were read from.
    private static void br16()(
        float4 a0, float4 a1, float4 a2, float4 a3,
        ref float4 r0, ref float4 r1, ref float4 r2, ref float4 r3)
    {
        // Stage 1: gather low halves (b0, b1) and high halves (b2, b3).
        float4 b0 = shufps!(1, 0, 1, 0)(a0, a2);
        float4 b1 = shufps!(1, 0, 1, 0)(a1, a3);
        float4 b2 = shufps!(3, 2, 3, 2)(a0, a2);
        float4 b3 = shufps!(3, 2, 3, 2)(a1, a3);

        // Stage 2: pick even lanes, then odd lanes, of each pair.
        r0 = shufps!(2, 0, 2, 0)(b0, b1);
        r1 = shufps!(2, 0, 2, 0)(b2, b3);
        r2 = shufps!(3, 1, 3, 1)(b0, b1);
        r3 = shufps!(3, 1, 3, 1)(b2, b3);
    }

    /// Applies `br16` to the four vectors at `p0 + k*m` (k = 0..3) and to the
    /// four vectors at `p1 + k*m`, storing each result in the *other* group.
    ///
    /// `m` is the distance, in floats, between consecutive vectors of a group.
    /// Memory is accessed through `float4*` casts.
    /// NOTE(review): presumably requires `float4`-aligned addresses - confirm.
    static void bit_reverse_swap()(float* p0, float* p1, size_t m)
    {
        // Keep a copy of the p1 group: it is overwritten by the first br16.
        float4 b0 = *v(p1 + 0 * m);
        float4 b1 = *v(p1 + 1 * m);
        float4 b2 = *v(p1 + 2 * m);
        float4 b3 = *v(p1 + 3 * m);

        br16(*v(p0 + 0 * m), *v(p0 + 1 * m), *v(p0 + 2 * m), *v(p0 + 3 * m),
             *v(p1 + 0 * m), *v(p1 + 1 * m), *v(p1 + 2 * m), *v(p1 + 3 * m));

        br16(b0, b1, b2, b3,
             *v(p0 + 0 * m), *v(p0 + 1 * m), *v(p0 + 2 * m), *v(p0 + 3 * m));
    }

    /// Applies `br16` in place to the four vectors at `p + k*m` (k = 0..3).
    ///
    /// Safe in place because `br16` receives its inputs by value before
    /// writing any output.
    static void bit_reverse()(float* p, size_t m)
    {
        br16(*v(p + 0 * m), *v(p + 1 * m), *v(p + 2 * m), *v(p + 3 * m),
             *v(p + 0 * m), *v(p + 1 * m), *v(p + 2 * m), *v(p + 3 * m));
    }
}

/// Compile-time tuning constants.
/// NOTE(review): presumably read by `dplug.fft.fft_impl`; their exact meaning
/// is not visible from this module - confirm there.
struct Options
{
    enum log2_bitreverse_large_chunk_size = 5;
    enum large_limit = 14;
    enum log2_optimal_n = 10;
    enum passes_per_recursive_call = 4;
    enum log2_recursive_passes_chunk_size = 5;
    enum prefered_alignment = 4 * (1 << 10); // 4096
    //enum { fast_init };
}

// scalar_to_vector broadcasts to every lane.
unittest
{
    alias V = Vector;
    float[4] m;
    float[4] correct = [4.0f, 4.0f, 4.0f, 4.0f];
    V.vec A = V.scalar_to_vector(4.0f);
    V.unaligned_store(m.ptr, A);
    assert(m == correct);
}

// reverse, unpcklps, unpckhps.
unittest
{
    alias V = Vector;
    float[4] m = [2.0f, 3.0f, 4.0f, 5.0f];
    float[4] r;
    V.vec A = V.unaligned_load(m.ptr);
    A = V.reverse(A);
    float[4] correct = [5.0f, 4.0f, 3.0f, 2.0f];
    V.unaligned_store(r.ptr, A);
    assert(r == correct);

    // unpcklps
    V.vec B = V.unpcklps(A, A);
    correct = [5.0f, 5.0f, 4.0f, 4.0f];
    V.unaligned_store(r.ptr, B);
    assert(r == correct);

    // unpckhps
    B = V.unpckhps(A, A);
    correct = [3.0f, 3.0f, 2.0f, 2.0f];
    V.unaligned_store(r.ptr, B);
    assert(r == correct);
}

// shufps lane selection.
unittest
{
    alias V = Vector;
    float[4] A = [-1.0f, 2.0f, 3.0f, 4.0f];
    V.vec B = V.unaligned_load(A.ptr);
    V.vec C = V.shufps!(3, 1, 2, 1)(B, B);
    float[4] correct = [2.0f, 3.0f, 2.0f, 4.0f];
    float[4] r;
    V.unaligned_store(r.ptr, C);
    assert(r == correct);
}

// transpose, both supported instantiations.
unittest
{
    alias V = Vector;
    float[4] lo = [0.0f, 1.0f, 2.0f, 3.0f];
    float[4] hi = [4.0f, 5.0f, 6.0f, 7.0f];
    V.vec a0 = V.unaligned_load(lo.ptr);
    V.vec a1 = V.unaligned_load(hi.ptr);
    V.vec r0, r1;
    float[4] r;
    float[4] correct;

    V.transpose!4(a0, a1, r0, r1);
    correct = [0.0f, 4.0f, 2.0f, 6.0f];
    V.unaligned_store(r.ptr, r0);
    assert(r == correct);
    correct = [1.0f, 5.0f, 3.0f, 7.0f];
    V.unaligned_store(r.ptr, r1);
    assert(r == correct);

    V.transpose!2(a0, a1, r0, r1);
    correct = [0.0f, 1.0f, 4.0f, 5.0f];
    V.unaligned_store(r.ptr, r0);
    assert(r == correct);
    correct = [2.0f, 3.0f, 6.0f, 7.0f];
    V.unaligned_store(r.ptr, r1);
    assert(r == correct);
}