// Copyright Jernej Krempuš 2012
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)

module dplug.fft.sse_double;

import core.simd;

import dplug.fft.fft_impl;

// Builds an SSE shuffle immediate from four 2-bit source-lane selectors:
// a0 occupies bits 0-1, a1 bits 2-3, a2 bits 4-5, a3 bits 6-7.
template shuf_mask(int a3, int a2, int a1, int a0)
{
    enum shuf_mask = a0 | (a1<<2) | (a2<<4) | (a3<<6);
}

import dplug.fft.ldc_compat;
import dplug.fft.dmd32_compat;

// SIMD vector abstraction for double precision on SSE (128-bit double2,
// i.e. two doubles per vector). Provides the per-compiler intrinsic
// wrappers (GDC, LDC, DMD) that the generic FFT code in fft_impl uses.
struct Vector
{
nothrow:
@nogc:
    alias double2 vec;
    alias double T;

    // Two doubles fit in one 128-bit SSE register.
    enum vec_size = 2;
    enum log2_bitreverse_chunk_size = 2;

    version(GNU)
    {
        import gcc.builtins;

        // Broadcast a scalar to both lanes (implicit scalar->vector conversion).
        static vec scalar_to_vector(T a)
        {
            return a;
        }

        // r0 = (a0[0], a1[0]), r1 = (a0[1], a1[1]) — low lanes paired in r0,
        // high lanes paired in r1 (UNPCKLPD / UNPCKHPD).
        static void interleave(
            vec a0, vec a1, ref vec r0, ref vec r1)
        {
            r0 = __builtin_ia32_unpcklpd(a0, a1);
            r1 = __builtin_ia32_unpckhpd(a0, a1);
        }

        static vec unaligned_load(T* p)
        {
            return __builtin_ia32_loadupd(p);
        }

        static void unaligned_store(T* p, vec v)
        {
            // Returning a void expression from a void function is valid D.
            return __builtin_ia32_storeupd(p, v);
        }

        // Swap the two lanes: (x, y) -> (y, x). SHUFPD immediate 0x1 selects
        // lane 1 of the first operand, then lane 0 of the second.
        static vec reverse(vec v)
        {
            return __builtin_ia32_shufpd(v, v, 0x1);
        }
    }
    else version(LDC)
    {
        // Broadcast a scalar to both lanes (implicit scalar->vector conversion).
        static vec scalar_to_vector(T a)
        {
            return a;
        }

        // r0 = (a0[0], a1[0]), r1 = (a0[1], a1[1]); indices 0-1 address a0's
        // lanes and 2-3 address a1's lanes (shufflevector convention).
        static void interleave(
            vec a0, vec a1, ref vec r0, ref vec r1)
        {
            r0 = shufflevector!(double2, 0, 2)(a0, a1);
            r1 = shufflevector!(double2, 1, 3)(a0, a1);
        }

        static vec unaligned_load(T* p)
        {
            return loadUnaligned!vec(cast(double*)p);
        }

        static void unaligned_store(T* p, vec v)
        {
            storeUnaligned!vec(v, cast(double*)p);
        }

        // Swap the two lanes: (x, y) -> (y, x).
        static vec reverse(vec v)
        {
            return shufflevector!(vec, 1, 0)(v, v);
        }
    }
    else version(DigitalMars)
    {
        version(D_SIMD)
        {
            // Broadcast a scalar to both lanes via element-wise stores.
            static vec scalar_to_vector(T a)
            {
                vec r;
                r.ptr[0] = a;
                r.ptr[1] = a;
                return r;
            }

            // Same contract as the GNU/LDC versions:
            // r0 = (a0[0], a1[0]), r1 = (a0[1], a1[1]).
            static void interleave(
                vec a0, vec a1, ref vec r0, ref vec r1)
            {
                r0 = cast(double2) __simd(XMM.UNPCKLPD, a0, a1);
                r1 = cast(double2) __simd(XMM.UNPCKHPD, a0, a1);
            }
        }
        else
        {
            // Scalar fallback for DMD without D_SIMD (e.g. 32-bit targets,
            // via dmd32_compat — presumably; confirm against that module).
            static vec scalar_to_vector(T a)
            {
                return vec(a, a);
            }

            // Reference semantics for interleave, spelled out lane by lane:
            // r0 = (a0.x, a1.x), r1 = (a0.y, a1.y).
            static void interleave(vec a0, vec a1, ref vec r0, ref vec r1)
            {
                r0.x = a0.x;
                r0.y = a1.x;
                r1.x = a0.y;
                r1.y = a1.y;
            }

        }
    }
    else
        static assert(false, "Unsupported compiler");

    // Reinterpret a double* as a vector pointer (no copy, no alignment fixup;
    // callers are responsible for alignment).
    private static vec * v(T * a)
    {
        return cast(vec*)a;
    }

    // Split the first two vectors at arr into one vector of first-lane values
    // (rr) and one of second-lane values (ri). Assumes arr holds interleaved
    // (re, im) pairs, as the name suggests — NOTE(review): confirm against
    // callers in fft_impl. `len` is unused here since vec_size == 2.
    static void complex_array_to_real_imag_vec(int len)(
        T * arr, ref vec rr, ref vec ri)
    {
        interleave(v(arr)[0], v(arr)[1], rr, ri);
    }

    // For 2-lane vectors interleave is its own inverse:
    // applying it twice restores (a0, a1), so the same shuffle deinterleaves.
    alias interleave deinterleave;

    // 2x2 transpose of two vectors; only elements_per_vector == 2 is
    // meaningful for double2, anything else is a compile-time error.
    static void transpose(int elements_per_vector)(
        vec a0, vec a1, ref vec r0, ref vec r1)
    {
        static if(elements_per_vector == 2)
            interleave(a0, a1, r0, r1);
        else
            static assert(0);
    }

    // Exchange two 4x4 bit-reverse chunks (log2_bitreverse_chunk_size == 2)
    // between the regions at p0 and p1, transposing each via interleave as it
    // moves. `m` is the row stride in doubles. The a* values travel p0 -> p1
    // and the b* values p1 -> p0; each vector slot [0]/[1] addresses the
    // low/high half of a 4-double row.
    static void bit_reverse_swap(T * p0, T * p1, size_t m)
    {
        vec a0, a1, a2, a3, b0, b1, b2, b3;

        // Low halves of rows 0 and 2: interleave pairs them across rows,
        // then the results are written to the opposite region.
        a0 = v(p0 + m * 0)[0];
        a1 = v(p0 + m * 2)[0];
        b0 = v(p1 + m * 0)[0];
        b1 = v(p1 + m * 2)[0];
        interleave(a0, a1, a0, a1);
        interleave(b0, b1, b0, b1);
        v(p1 + m * 0)[0] = a0;
        v(p1 + m * 2)[0] = a1;
        v(p0 + m * 0)[0] = b0;
        v(p0 + m * 2)[0] = b1;

        // High halves of rows 1 and 3 — same pattern, diagonal-symmetric slots.
        a2 = v(p0 + m * 1)[1];
        a3 = v(p0 + m * 3)[1];
        b2 = v(p1 + m * 1)[1];
        b3 = v(p1 + m * 3)[1];
        interleave(a2, a3, a2, a3);
        interleave(b2, b3, b2, b3);
        v(p1 + m * 1)[1] = a2;
        v(p1 + m * 3)[1] = a3;
        v(p0 + m * 1)[1] = b2;
        v(p0 + m * 3)[1] = b3;

        // Off-diagonal slots: high halves of rows 0/2 swap places with the
        // low halves of rows 1/3 (note the crossed destination indices).
        // All reads from p1 happen before any a* write lands there, so the
        // statement order below must be preserved.
        a0 = v(p0 + m * 0)[1];
        a1 = v(p0 + m * 2)[1];
        a2 = v(p0 + m * 1)[0];
        a3 = v(p0 + m * 3)[0];
        interleave(a0, a1, a0, a1);
        interleave(a2, a3, a2, a3);
        b0 = v(p1 + m * 0)[1];
        b1 = v(p1 + m * 2)[1];
        b2 = v(p1 + m * 1)[0];
        b3 = v(p1 + m * 3)[0];
        v(p1 + m * 0)[1] = a2;
        v(p1 + m * 2)[1] = a3;
        v(p1 + m * 1)[0] = a0;
        v(p1 + m * 3)[0] = a1;
        interleave(b0, b1, b0, b1);
        interleave(b2, b3, b2, b3);
        v(p0 + m * 0)[1] = b2;
        v(p0 + m * 2)[1] = b3;
        v(p0 + m * 1)[0] = b0;
        v(p0 + m * 3)[0] = b1;
    }

    // In-place bit-reverse of a single 4x4 chunk at p (the self-swap case of
    // bit_reverse_swap): diagonal slots are transposed in place, off-diagonal
    // slots are transposed and exchanged with their mirror slot.
    static void bit_reverse(T * p, size_t m)
    {
        vec a0, a1, a2, a3;
        // Diagonal slots: low halves of rows 0/2 and high halves of rows 1/3
        // interleave pairwise and are written straight back.
        a0 = v(p + m * 0)[0];
        a1 = v(p + m * 2)[0];
        a2 = v(p + m * 1)[1];
        a3 = v(p + m * 3)[1];
        interleave(a0, a1, a0, a1);
        interleave(a2, a3, a2, a3);
        v(p + m * 0)[0] = a0;
        v(p + m * 2)[0] = a1;
        v(p + m * 1)[1] = a2;
        v(p + m * 3)[1] = a3;

        // Off-diagonal slots: high halves of rows 0/2 trade places with the
        // low halves of rows 1/3 after interleaving (crossed destinations).
        a0 = v(p + m * 0)[1];
        a1 = v(p + m * 2)[1];
        a2 = v(p + m * 1)[0];
        a3 = v(p + m * 3)[0];
        interleave(a0, a1, a0, a1);
        interleave(a2, a3, a2, a3);
        v(p + m * 0)[1] = a2;
        v(p + m * 2)[1] = a3;
        v(p + m * 1)[0] = a0;
        v(p + m * 3)[0] = a1;
    }
}

// Tuning constants consumed by the generic FFT machinery — presumably by
// dplug.fft.fft_impl (imported above); verify meanings against that module.
struct Options
{
    enum log2_bitreverse_large_chunk_size = 5;
    enum large_limit = 13;
    enum log2_optimal_n = 10;
    enum passes_per_recursive_call = 4;
    enum log2_recursive_passes_chunk_size = 5;
    // 4 KiB preferred buffer alignment (spelling "prefered" kept: it is part
    // of the public interface used by other modules).
    enum prefered_alignment = 4 * (1 << 10);
    enum { fast_init };
}