// Copyright Jernej Krempuš 2012
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)

// Bit-reversal and (de)interleaving shuffles used by the FFT implementation.
module dplug.fft.shuffle;

import core.bitop;

nothrow:
@nogc:

/// Casts a compile-time value to size_t (shorthand used throughout this module).
template st(alias a){ enum st = cast(size_t) a; }

/// Swaps the contents of `a` and `b`.
void _swap(T)(ref T a, ref T b)
{
    auto aa = a;
    auto bb = b;
    b = aa;
    a = bb;
}

/// Compile-time sequence 0, 1, ..., n-1.
template ints_up_to(int n, T...)
{
    static if(n)
    {
        alias ints_up_to!(n-1, n-1, T) ints_up_to;
    }
    else
        alias T ints_up_to;
}

/// Compile-time sequence of the powers of two 1, 2, ..., n/2 (n assumed a power of two).
template powers_up_to(int n, T...)
{
    static if(n > 1)
    {
        alias powers_up_to!(n / 2, n / 2, T) powers_up_to;
    }
    else
        alias T powers_up_to;
}

/// Compile-time sequence of n copies of T; declaring a variable of this type
/// yields n separate locals usable like a fixed-size "array" of registers.
template RepeatType(T, int n, R...)
{
    static if(n == 0)
        alias R RepeatType;
    else
        alias RepeatType!(T, n - 1, T, R) RepeatType;
}

/// Calls dg(i1, i2, args) for every index i1 in [0, 2^log2n), where i2 is
/// updated incrementally so that it tracks the bit-reversed counterpart of i1
/// (the bsf-based update flips the correct high bits on each decrement).
/// Iteration runs from i1 = 2^log2n - 1 down to 0.
void iter_bit_reversed_pairs(alias dg, A...)(int log2n, A args)
{
    int mask = (0xffffffff<<(log2n));
    uint i2 = ~mask;
    uint i1 = i2;

    while(i1 != (0U - 1U))
    {
        dg(i1, i2, args);
        i2 = mask ^ (i2 ^ (mask>>(bsf(i1)+1)));
        --i1;
    }
}

/// In-place bit-reversal permutation of the 2^log2n elements at p,
/// swapping each pair (i, bit_reverse(i)) exactly once.
void bit_reverse_simple(T)(T* p, int log2n)
{
    static void loopBody(int i0, int i1, T* p)
    {
        // i1 > i0 ensures each pair is swapped only once and fixed points are skipped.
        if(i1 > i0)
            _swap(p[i0],p[i1]);
    }

    iter_bit_reversed_pairs!loopBody(log2n, p);
}

/// Compile-time reversal of the low `bits_left` bits of i.
template reverse_bits(int i, int bits_left, int r = 0)
{
    static if(bits_left == 0)
        enum reverse_bits = r;
    else
        enum reverse_bits = reverse_bits!(
            i >> 1, bits_left - 1, (r << 1) | (i & 1));
}

/// Bit-reversal permutation of 2^log2n elements with the element count known
/// at compile time; all loads happen before all stores, through a tuple of
/// locals. The optional stride argument maps logical index i to
/// i / l * stride + i % l (used to reverse a strided sub-matrix in place).
auto bit_reverse_static_size(int log2n, T, S...)(T* p, S stride)
    if(S.length <= 1)
{
    enum n = 1 << log2n;
    enum l = 1 << (log2n / 2);

    static if(stride.length == 1)
        auto index(size_t i)(S s){ return i / l * s[0] + i % l; }
    else
        auto index(size_t i)(S s){ return i; }

    RepeatType!(T, n) a;

    // Load every element that is not a fixed point of the permutation...
    foreach(i, _; a)
        static if(i != reverse_bits!(i, log2n))
            a[i] = p[index!i(stride)];

    // ...then store each one at its bit-reversed position.
    foreach(i, _; a)
        static if(i != reverse_bits!(i, log2n))
            p[index!i(stride)] = a[reverse_bits!(i, log2n)];
}

/// Runtime dispatch to the compile-time-sized bit reversal for
/// log2n in [0, max_log2n); other sizes are a no-op.
auto bit_reverse_tiny(int max_log2n, T)(T* p, int log2n)
{
    switch(log2n)
    {
        foreach(i; ints_up_to!max_log2n)
        {
            case i:
                bit_reverse_static_size!i(p);
                break;
        }

        default:
    }
}

/// Swaps chunk i with chunk (nchunks/2 + i) for every odd-position pair in the
/// second half — one butterfly step of a chunked bit reversal.
void bit_reverse_step(size_t chunk_size, T)(T* p, size_t nchunks)
{
    for(size_t i = chunk_size, j = (nchunks >> 1) * chunk_size;
        j < nchunks * chunk_size;
        j += chunk_size*2, i += chunk_size*2)
    {
        foreach(k; ints_up_to!chunk_size)
            _swap(p[i + k], p[j + k]);
    }
}

/// Rounds ptr up to the next multiple of alignment (alignment must be a power of two).
auto aligned_ptr(T, U)(U * ptr, size_t alignment)
{
    return cast(T*)(((cast(size_t)ptr) + alignment) & ~(alignment - 1UL));
}

/// Number of bytes needed to hold `size` T's at any alignment up to `alignment`.
auto aligned_size(T)(size_t size, size_t alignment)
{
    return size * T.sizeof + alignment;
}

/// Bit-reversal routines parameterized on a SIMD backend V (which supplies
/// T, vec, log2_bitreverse_chunk_size, bit_reverse and bit_reverse_swap)
/// and an Options type carrying tuning constants.
struct BitReverse(alias V, Options)
{
    alias V.T T;
    alias V.vec vec;

    /// Number of uint entries needed for the table used by bit_reverse_small.
    static size_t br_table_size()(int log2n)
    {
        enum log2l = V.log2_bitreverse_chunk_size;

        return log2n < log2l * 2 ? 0 : (1 << (log2n - 2 * log2l)) + 2 * log2l;
    }

    /// Fills `table` with chunk offsets: first the fixed points of the
    /// bit-reversal permutation, then the swapped pairs (each pair as two
    /// consecutive entries).
    static void init_br_table()(uint* table, int log2n)
    {
        enum log2l = V.log2_bitreverse_chunk_size;

        static void loopBody0(int i0, int i1, uint** p)
        {
            if(i1 == i0)
                (**p = i0 << log2l), (*p)++;
        }
        iter_bit_reversed_pairs!loopBody0(log2n - 2 * log2l, &table);

        static void loopBody1(int i0, int i1, uint** p)
        {
            if(i1 < i0)
            {
                **p = i0 << log2l;
                (*p)++;
                **p = i1 << log2l;
                (*p)++;
            }
        }
        iter_bit_reversed_pairs!loopBody1(log2n - 2 * log2l, &table);
    }

    /// Bit-reverses 2^log2n elements at p using a precomputed offset table
    /// (see init_br_table): V.bit_reverse on fixed-point chunks, then
    /// V.bit_reverse_swap on the chunk pairs.
    static void bit_reverse_small()(T* p, uint log2n, uint* table)
    {
        enum log2l = V.log2_bitreverse_chunk_size;

        uint tmp = log2n - log2l - log2l;
        uint n1 = 1u << ((tmp + 1) >> 1);
        uint n2 = 1u << tmp;
        uint m = 1u << (log2n - log2l);

        uint* t1 = table + n1, t2 = table + n2;

        for(; table < t1; table++)
            V.bit_reverse( p + table[0], m);
        for(; table < t2; table += 2)
            V.bit_reverse_swap( p + table[0], p + table[1], m);
    }

    /// Largest divisor of `a` among maxpower, maxpower/2, ..., 1
    /// (maxpower assumed a power of two).
    private static auto highest_power_2(int a, int maxpower)
    {
        while(a % maxpower)
            maxpower /= 2;

        return maxpower;
    }

    /// Swaps n elements between a and b, staging both sides in locals
    /// before writing back.
    static void swap_some(int n, TT)(TT* a, TT* b)
    {
        RepeatType!(TT, 2 * n) tmp;

        foreach(i; ints_up_to!n)
            tmp[i] = a[i];
        foreach(i; ints_up_to!n)
            tmp[i + n] = b[i];

        foreach(i; ints_up_to!n)
            b[i] = tmp[i];
        foreach(i; ints_up_to!n)
            a[i] = tmp[i + n];
    }

    /// Swaps len TT's between a and b, vec-at-a-time, in groups of up to 4 vecs.
    static void swap_array(int len, TT)(TT * a, TT * b)
    {
        static assert(len*TT.sizeof % vec.sizeof == 0);

        enum n = highest_power_2( len * TT.sizeof / vec.sizeof, 4);

        foreach(i; 0 .. len * TT.sizeof / n / vec.sizeof)
            swap_some!n((cast(vec*)a) + n * i, (cast(vec*)b) + n * i);
    }

    /// Copies n elements from src to dst through locals (reads all, then writes all).
    static void copy_some(int n, TT)(TT* dst, TT* src)
    {
        RepeatType!(TT, n) a;

        foreach(i, _; a)
            a[i] = src[i];
        foreach(i, _; a)
            dst[i] = a[i];
    }

    /// Copies len TT's from b to a, vec-at-a-time, in groups of up to 8 vecs.
    static void copy_array(int len, TT)(TT * a, TT * b)
    {
        static assert((len * TT.sizeof % vec.sizeof == 0));

        enum n = highest_power_2( len * TT.sizeof / vec.sizeof, 8);

        foreach(i; 0 .. len * TT.sizeof / n / vec.sizeof)
            copy_some!n((cast(vec*)a) + n * i, (cast(vec*)b) + n * i);
    }

    /// Copies nchunks chunks of chunk_size elements, advancing src and dst
    /// by their respective strides between chunks.
    static void strided_copy(size_t chunk_size, TT)(
        TT* dst, TT* src, size_t dst_stride, size_t src_stride, size_t nchunks)
    {
        for(
            TT* s = src, d = dst;
            s < src + nchunks * src_stride;
            s += src_stride, d += dst_stride)
        {
            copy_array!chunk_size(d, s);
        }
    }

    /// Cache-friendly bit reversal for large arrays: works on l*l tiles
    /// (l = 2^Options.log2_bitreverse_large_chunk_size) copied into an
    /// aligned stack buffer, bit-reversed there with bit_reverse_small,
    /// then copied (or swapped with the mirror tile) back.
    static void bit_reverse_large()(T* p, int log2n, uint * table)
    {
        enum log2l = Options.log2_bitreverse_large_chunk_size;
        enum l = 1<<log2l;

        ubyte[aligned_size!T(l * l, 64)] mem = void;
        auto buffer = aligned_ptr!T(mem.ptr, 64);

        int log2m = log2n - log2l;
        size_t m = 1<<log2m, n = 1<<log2n;
        T * pend = p + n;

        iter_bit_reversed_pairs!(function (size_t i0, size_t i1,
            T* p, T* pend, size_t m, uint* table, T* buffer)
        {
            if(i1 >= i0)
            {
                strided_copy!l(buffer, p + i0 * l, l, m, l);

                bit_reverse_small(buffer,log2l+log2l, table);

                if(i1 != i0)
                {
                    // Exchange the reversed tile with its mirror tile at i1,
                    // then reverse the mirror's contents in the buffer too.
                    for(
                        T* pp = p + i1 * l, pb = buffer;
                        pp < pend;
                        pb += l, pp += m)
                    {
                        swap_array!l(pp, pb);
                    }

                    bit_reverse_small(buffer,log2l+log2l, table);
                }

                strided_copy!l(p + i0 * l, buffer, m, l, l);
            }
        })(log2m-log2l, p, pend, m, table, buffer);
    }
}

/// Fallback scalar "vector" backend: one element per vec, so interleave and
/// deinterleave are both the identity on a two-element group.
private struct Scalar(TT)
{
public:

    alias TT T;
    alias TT vec;
    enum vec_size = 1;

    static void interleave(vec a0, vec a1, ref vec r0, ref vec r1)
    {
        r0 = a0;
        r1 = a1;
    }

    static void deinterleave(vec a0, vec a1, ref vec r0, ref vec r1)
    {
        r0 = a0;
        r1 = a1;
    }
}

/// True when the backend V provides both interleave and deinterleave.
template hasInterleaving(V)
{
    enum hasInterleaving =
        is(typeof(V.interleave)) &&
        is(typeof(V.deinterleave));
}

/// In-place (de)interleaving of 2^log2n elements, processed in chunks of
/// chunk_size vectors. isInverse selects deinterleave (the inverse
/// permutation) instead of interleave.
struct InterleaveImpl(V, int chunk_size, bool isInverse)
{
    /// Bytes needed for the table produced by interleave_table.
    static size_t itable_size_bytes()(int log2n)
    {
        return (bool.sizeof << log2n) / V.vec_size / chunk_size;
    }

    /// Builds a table marking, for each chunk index, whether it is the
    /// minimum of its permutation cycle (cycle leaders of the perfect-shuffle
    /// permutation j -> 2j mod (n_chunks - 1)). Returns null when there are
    /// fewer than 4 chunks (nothing to permute at chunk granularity).
    static bool* interleave_table()(int log2n, void* p)
    {
        auto n = st!1 << log2n;
        auto is_cycle_minimum = cast(bool*) p;
        size_t n_chunks = n / V.vec_size / chunk_size;

        if(n_chunks < 4)
            return null;

        is_cycle_minimum[0 .. n_chunks] = true;

        for(size_t i = 1;;)
        {
            size_t j = i;
            while(true)
            {
                j = j < n_chunks / 2 ? 2 * j : 2 * (j - n_chunks / 2) + 1;
                if(j == i)
                    break;

                is_cycle_minimum[j] = false;
            }

            // The last cycle minimum is at n / 2 - 1
            if(i == n_chunks / 2 - 1)
                break;

            do i++; while(!is_cycle_minimum[i]);
        }

        return is_cycle_minimum;
    }

    /// Applies the chunk-level permutation in place by walking each cycle
    /// starting from its cycle minimum (as recorded in is_cycle_minimum),
    /// carrying one chunk of elements around the cycle.
    static void interleave_chunks()(
        V.vec* a, size_t n_chunks, bool* is_cycle_minimum)
    {
        alias RepeatType!(V.vec, chunk_size) RT;
        alias ints_up_to!chunk_size indices;

        for(size_t i = 1;;)
        {
            size_t j = i;

            RT element;
            auto p = &a[i * chunk_size];
            foreach(k; indices)
                element[k] = p[k];

            while(true)
            {
                // Forward permutation or its inverse, depending on direction.
                static if(isInverse)
                    j = j & 1 ? j / 2 + n_chunks / 2 : j / 2;
                else
                    j = j < n_chunks / 2 ? 2 * j : 2 * (j - n_chunks / 2) + 1;

                if(j == i)
                    break;

                RT tmp;
                p = &a[j * chunk_size];
                foreach(k; indices)
                    tmp[k] = p[k];

                foreach(k; indices)
                    p[k] = element[k];

                foreach(k; indices)
                    element[k] = tmp[k];
            }

            p = &a[i * chunk_size];
            foreach(k; indices)
                p[k] = element[k];

            if(i == n_chunks / 2 - 1)
                break;

            do i++; while(!is_cycle_minimum[i]);
        }
    }

    /// (De)interleaves len vectors entirely through locals; len must be
    /// a power of two no greater than 4 * chunk_size, otherwise a no-op.
    static void interleave_tiny()(V.vec* p, size_t len)
    {
        switch(len)
        {
            foreach(n; powers_up_to!(2 * chunk_size))
            {
                case 2 * n:

                    RepeatType!(V.vec, 2 * n) tmp;

                    static if(isInverse)
                        foreach(j; ints_up_to!n)
                            V.deinterleave(
                                p[2 * j], p[2 * j + 1], tmp[j],
                                tmp[n + j]);
                    else
                        foreach(j; ints_up_to!n)
                            V.interleave(
                                p[j], p[n + j],
                                tmp[2 * j], tmp[2 * j + 1]);

                    foreach(j; ints_up_to!(2 * n))
                        p[j] = tmp[j];

                    break;
            }

            default: {}
        }
    }

    /// (De)interleaves within each pair of adjacent chunks (2 * chunk_size
    /// vectors at a time), using V's element-level shuffle.
    static void interleave_chunk_elements()(V.vec* a, size_t n_chunks)
    {
        for(auto p = a; p < a + n_chunks * chunk_size; p += 2 * chunk_size)
        {
            RepeatType!(V.vec, 2 * chunk_size) tmp;

            static if(isInverse)
                foreach(j; ints_up_to!chunk_size)
                    V.deinterleave(
                        p[2 * j], p[2 * j + 1], tmp[j], tmp[chunk_size + j]);
            else
                foreach(j; ints_up_to!chunk_size)
                    V.interleave(
                        p[j], p[chunk_size + j], tmp[2 * j], tmp[2 * j + 1]);

            foreach(j; ints_up_to!(2 * chunk_size))
                p[j] = tmp[j];
        }
    }

    /// Entry point: (de)interleaves 2^log2n scalars at p, picking the
    /// scalar fallback, the tiny all-in-registers path, or the table-driven
    /// chunked path depending on n. `table` comes from interleave_table.
    static void interleave()(V.T* p, int log2n, bool* table)
    {
        auto n = st!1 << log2n;

        if(n < 4)
            return;
        else if(n < 2 * V.vec_size)
            return
                InterleaveImpl!(Scalar!(V.T), V.vec_size / 2, isInverse)
                    .interleave_tiny(p, n);

        assert(n >= 2 * V.vec_size);

        auto vp = cast(V.vec*) p;
        auto vn = n / V.vec_size;

        if(n < 4 * V.vec_size * chunk_size)
            interleave_tiny(vp, vn);
        else
        {
            auto n_chunks = vn / chunk_size;
            // The inverse must undo the forward steps in reverse order.
            static if(isInverse)
            {
                interleave_chunk_elements(vp, n_chunks);
                interleave_chunks(vp, n_chunks, table);
            }
            else
            {
                interleave_chunks(vp, n_chunks, table);
                interleave_chunk_elements(vp, n_chunks);
            }
        }
    }
}

/// Selects InterleaveImpl over V when V supports interleaving natively,
/// otherwise falls back to the Scalar backend.
template Interleave(V, int chunk_size, bool isInverse)
{
    static if(hasInterleaving!V)
        alias InterleaveImpl!(V, chunk_size, isInverse) Interleave;
    else
        alias
            InterleaveImpl!(Scalar!(V.T), chunk_size, isInverse) Interleave;
}