// Copyright Jernej Krempuš 2012
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)

module dplug.fft.sse_float;

import core.simd;

import dplug.fft.fft_impl;

// Builds the 8-bit immediate for SHUFPS, lanes listed from high to low.
template shuf_mask(int a3, int a2, int a1, int a0)
{
    enum shuf_mask = a0 | (a1<<2) | (a2<<4) | (a3<<6);
}
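// Added illustration (not in the original source): because the parameters are
// given from the high lane down, shuf_mask!(3, 2, 1, 0) is the identity
// shuffle 0xE4, and shuf_mask!(0, 1, 2, 3) is the lane-reversal mask 0x1B
// used by reverse() below.
unittest
{
    static assert(shuf_mask!(3, 2, 1, 0) == 0xE4);
    static assert(shuf_mask!(0, 1, 2, 3) == 0x1B);
}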
version(X86_64)
    version(linux)
        version = linux_x86_64;

import dplug.fft.ldc_compat;
import dplug.fft.dmd32_compat;

struct Vector
{
nothrow:
@nogc:
    alias float4 vec;
    alias float T;

    enum vec_size = 4;
    enum log2_bitreverse_chunk_size = 2;

    version(GNU)
    {
        import gcc.builtins;

        static vec scalar_to_vector()(T a)
        {
            // Assigning a scalar to a vector broadcasts it across all lanes.
            return a;
        }

        private static shufps(int m0, int m1, int m2, int m3)(float4 a, float4 b)
        {
            return __builtin_ia32_shufps(a, b, shuf_mask!(m0, m1, m2, m3));
        }

        alias __builtin_ia32_unpcklps unpcklps;
        alias __builtin_ia32_unpckhps unpckhps;

        static vec unaligned_load(T* p)
        {
            return __builtin_ia32_loadups(p);
        }

        static void unaligned_store(T* p, vec v)
        {
            return __builtin_ia32_storeups(p, v);
        }

        static vec reverse(vec v)
        {
            return shufps!(0, 1, 2, 3)(v, v);
        }
    }

    version(DigitalMars)
    {
        static vec scalar_to_vector()(float a)
        {
            version(linux_x86_64)
                // The float argument arrives in XMM0; SHUFPS with immediate 0
                // copies lane 0 to every lane.
                asm nothrow @nogc
                {
                    naked;
                    shufps XMM0, XMM0, 0;
                    ret;
                }
            else
            {
                static struct quad
                {
                    align(16) float a;
                    float b;
                    float c;
                    float d;
                }
                auto q = quad(a, a, a, a);
                return *cast(vec*)&q;
            }
        }
    }

    version(LDC)
    {
        static vec scalar_to_vector()(float a)
        {
            return a;
        }

        static auto shufps(int m0, int m1, int m2, int m3)(float4 a, float4 b)
        {
            return shufflevector!(float4, m3, m2, m1+4, m0+4)(a, b);
        }

        static vec unpcklps(vec a, vec b)
        {
            return shufflevector!(float4, 0, 4, 1, 5)(a, b);
        }

        static vec unpckhps(vec a, vec b)
        {
            return shufflevector!(float4, 2, 6, 3, 7)(a, b);
        }

        static vec unaligned_load(T* p)
        {
            return loadUnaligned!vec(cast(float*)p);
        }

        static void unaligned_store(T* p, vec v)
        {
            storeUnaligned!vec(v, cast(float*)p);
        }

        static vec reverse(vec v)
        {
            return shufflevector!(float4, 3, 2, 1, 0)(v, v);
        }
    }

    static if(is(typeof(shufps)))
    {
        static void complex_array_to_real_imag_vec(int len)(
            float * arr, ref vec rr, ref vec ri)
        {
            static if(len == 2)
            {
                rr = ri = (cast(vec*)arr)[0];
                rr = shufps!(2,2,0,0)(rr, rr); // I could use __builtin_ia32_movsldup here but it doesn't seem to increase performance
                ri = shufps!(3,3,1,1)(ri, ri);
            }
            else static if(len == 4)
            {
                vec tmp = (cast(vec*)arr)[0];
                ri = (cast(vec*)arr)[1];
                rr = shufps!(2,0,2,0)(tmp, ri);
                ri = shufps!(3,1,3,1)(tmp, ri);
            }
        }

        static void transpose(int elements_per_vector)(
            vec a0, vec a1, ref vec r0, ref vec r1)
        {
            if(elements_per_vector == 4)
            {
                r0 = shufps!(2,0,2,0)(a0, a1);
                r1 = shufps!(3,1,3,1)(a0, a1);
                r0 = shufps!(3,1,2,0)(r0, r0);
                r1 = shufps!(3,1,2,0)(r1, r1);
            }
            else if(elements_per_vector == 2)
            {
                r0 = shufps!(1,0,1,0)(a0, a1);
                r1 = shufps!(3,2,3,2)(a0, a1);
            }
        }

        static void interleave(
            vec a0, vec a1, ref vec r0, ref vec r1)
        {
            r0 = unpcklps(a0, a1);
            r1 = unpckhps(a0, a1);
        }

        static void deinterleave(
            vec a0, vec a1, ref vec r0, ref vec r1)
        {
            r0 = shufps!(2,0,2,0)(a0, a1);
            r1 = shufps!(3,1,3,1)(a0, a1);
        }

        // Reinterprets a float pointer as a float4 pointer (must be 16-byte
        // aligned).
        private static float4 * v()(float * a)
        {
            return cast(float4*)a;
        }

        // Applies the 16-element bit reversal permutation to the 4x4 block of
        // floats held in rows a0..a3, writing the result to r0..r3.
        private static void br16()(
            float4 a0, float4 a1, float4 a2, float4 a3,
            ref float4 r0, ref float4 r1, ref float4 r2, ref float4 r3)
        {
            float4 b0 = shufps!(1,0,1,0)(a0, a2);
            float4 b1 = shufps!(1,0,1,0)(a1, a3);
            float4 b2 = shufps!(3,2,3,2)(a0, a2);
            float4 b3 = shufps!(3,2,3,2)(a1, a3);
            r0 = shufps!(2,0,2,0)(b0, b1);
            r1 = shufps!(2,0,2,0)(b2, b3);
            r2 = shufps!(3,1,3,1)(b0, b1);
            r3 = shufps!(3,1,3,1)(b2, b3);
        }

        // Bit-reverses the block at p0 into p1 and vice versa.
        static void bit_reverse_swap()(float * p0, float * p1, size_t m)
        {
            float4 b0 = *v(p1 + 0 * m);
            float4 b1 = *v(p1 + 1 * m);
            float4 b2 = *v(p1 + 2 * m);
            float4 b3 = *v(p1 + 3 * m);

            br16(*v(p0 + 0 * m), *v(p0 + 1 * m), *v(p0 + 2 * m), *v(p0 + 3 * m),
                 *v(p1 + 0 * m), *v(p1 + 1 * m), *v(p1 + 2 * m), *v(p1 + 3 * m));

            br16(b0, b1, b2, b3,
                 *v(p0 + 0 * m), *v(p0 + 1 * m), *v(p0 + 2 * m), *v(p0 + 3 * m));
        }

        // Bit-reverses the 4x4 block at p in place.
        static void bit_reverse()(float * p, size_t m)
        {
            br16(*v(p + 0 * m), *v(p + 1 * m), *v(p + 2 * m), *v(p + 3 * m),
                 *v(p + 0 * m), *v(p + 1 * m), *v(p + 2 * m), *v(p + 3 * m));
        }
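        // Added check (not in the original source): with the shufps semantics
        // above, bit_reverse applies the 16-element bit reversal permutation
        // to an aligned 4x4 block, so the element at index b3b2b1b0 swaps
        // with the element at index b0b1b2b3.
        unittest
        {
            float4[4] buf;                  // 16-byte aligned backing store
            auto p = cast(float*) buf.ptr;
            foreach (i; 0 .. 16)
                p[i] = i;
            bit_reverse(p, 4);
            static immutable int[16] expected =
                [0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15];
            foreach (i; 0 .. 16)
                assert(p[i] == expected[i]);
        }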
    }
    else
    {
        // DMD does not define shufps above, so it gets hand-scheduled asm on
        // linux x86_64 and the scalar fallback elsewhere. Note the operand
        // registers: with DMD's extern(D) convention the arguments arrive
        // reversed, so m is in RDI and the pointers in RSI (and RDX below).
        static void bit_reverse()(T * p0, size_t m)
        {
            version(linux_x86_64)
                asm nothrow @nogc
                {
                    naked;
                    lea RAX,[RDI+RDI*1];
                    lea RCX,[RSI+RDI*4];
                    lea RDI,[RDI+RDI*2];
                    movaps XMM1,[RSI];
                    lea RDX,[RSI+RAX*4];
                    lea R8,[RSI+RDI*4];
                    movaps XMM0,[RCX];
                    movaps XMM3,XMM1;
                    movaps XMM5,[RDX];
                    movaps XMM2,XMM0;
                    movaps XMM4,[R8];
                    shufps XMM1,XMM5,0xEE;
                    movlhps XMM3,XMM5;
                    shufps XMM0,XMM4,0xEE;
                    movlhps XMM2,XMM4;
                    movaps XMM6,XMM3;
                    movaps XMM7,XMM1;
                    shufps XMM3,XMM2,0xDD;
                    shufps XMM6,XMM2,0x88;
                    shufps XMM7,XMM0,0x88;
                    shufps XMM1,XMM0,0xDD;
                    movaps [RSI],XMM6;
                    movaps [RCX],XMM7;
                    movaps [RDX],XMM3;
                    movaps [R8],XMM1;
                    ret;
                }
            else
                Scalar!T.bit_reverse(p0, m);
        }

        static void bit_reverse_swap()(T * p0, T * p1, size_t m)
        {
            version(linux_x86_64)
                asm nothrow @nogc
                {
                    naked;
                    lea RAX,[RDI+RDI*1];
                    lea RCX,[RDI*4+0x0];
                    lea RDI,[RDI+RDI*2];
                    movaps XMM1,[RSI];
                    shl RAX,0x2;
                    lea R10,[RSI+RCX*1];
                    shl RDI,0x2;
                    lea R9,[RSI+RAX*1];
                    movaps XMM3,[RDX];
                    add RAX,RDX;
                    lea R8,[RSI+RDI*1];
                    add RCX,RDX;
                    movaps XMM5,[R9];
                    add RDI,RDX;
                    movaps XMM7,XMM3;
                    movaps XMM9,[RAX];
                    movaps XMM12,XMM1;
                    shufps XMM1,XMM5,0xEE;
                    movaps XMM0,[R10];
                    shufps XMM3,XMM9,0xEE;
                    movlhps XMM7,XMM9;
                    movaps XMM2,[RCX];
                    movlhps XMM12,XMM5;
                    movaps XMM13,XMM0;
                    movaps XMM4,[R8];
                    movaps XMM6,XMM2;
                    movaps XMM10,XMM7;
                    movaps XMM8,[RDI];
                    shufps XMM0,XMM4,0xEE;
                    movaps XMM11,XMM3;
                    shufps XMM2,XMM8,0xEE;
                    movlhps XMM6,XMM8;
                    movlhps XMM13,XMM4;
                    movaps XMM14,XMM12;
                    movaps XMM15,XMM1;
                    shufps XMM10,XMM6,0x88;
                    shufps XMM11,XMM2,0x88;
                    shufps XMM7,XMM6,0xDD;
                    shufps XMM3,XMM2,0xDD;
                    shufps XMM14,XMM13,0x88;
                    shufps XMM15,XMM0,0x88;
                    shufps XMM12,XMM13,0xDD;
                    shufps XMM1,XMM0,0xDD;
                    movaps [RSI],XMM10;
                    movaps [R10],XMM11;
                    movaps [R9],XMM7;
                    movaps [R8],XMM3;
                    movaps [RDX],XMM14;
                    movaps [RCX],XMM15;
                    movaps [RAX],XMM12;
                    movaps [RDI],XMM1;
                    ret;
                }
            else
                Scalar!T.bit_reverse_swap(p0, p1, m);
        }
    }
}

struct Options
{
    enum log2_bitreverse_large_chunk_size = 5;
    enum large_limit = 14;
    enum log2_optimal_n = 10;
    enum passes_per_recursive_call = 4;
    enum log2_recursive_passes_chunk_size = 5;
    enum prefered_alignment = 4 * (1 << 10);
    //enum { fast_init };
}
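
// Added sanity check, not part of the original library: interleave followed
// by deinterleave should round-trip a pair of vectors. Guarded so it only
// compiles when the shufps-based helpers exist (GNU and LDC; the DigitalMars
// branch defines only the bit reversal routines).
unittest
{
    static if (__traits(hasMember, Vector, "interleave"))
    {
        Vector.vec a = [0.0f, 1.0f, 2.0f, 3.0f];
        Vector.vec b = [4.0f, 5.0f, 6.0f, 7.0f];
        Vector.vec i0, i1, d0, d1;
        Vector.interleave(a, b, i0, i1);    // i0 = [a0,b0,a1,b1], i1 = [a2,b2,a3,b3]
        Vector.deinterleave(i0, i1, d0, d1);
        assert(d0.array == a.array && d1.array == b.array);
    }
}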