1 // Copyright Jernej Krempuš 2012 2 // Distributed under the Boost Software License, Version 1.0. 3 // (See accompanying file LICENSE_1_0.txt or copy at 4 // http://www.boost.org/LICENSE_1_0.txt) 5 6 module dplug.fft.fft_impl; 7 8 import dplug.fft.shuffle; 9 10 nothrow: 11 @nogc: 12 13 struct Scalar(_T) 14 { 15 nothrow: 16 @nogc: 17 alias _T vec; 18 alias _T T; 19 20 enum vec_size = 1; 21 enum log2_bitreverse_chunk_size = 2; 22 23 static vec scalar_to_vector(T a) 24 { 25 return a; 26 } 27 28 private static void load4br(T* p, size_t m, ref T a0, ref T a1, ref T a2, ref T a3) 29 { 30 a0 = p[0]; 31 a1 = p[m]; 32 a2 = p[1]; 33 a3 = p[m + 1]; 34 } 35 36 private static void store4(T* p, size_t m, T a0, T a1, T a2, T a3) 37 { 38 p[0] = a0; 39 p[1] = a1; 40 p[m] = a2; 41 p[m + 1] = a3; 42 } 43 44 static void bit_reverse_swap(T * p0, T * p1, size_t m) 45 { 46 RepeatType!(T, 4) a, b; 47 48 auto s = 2 * m; 49 auto i0 = 0, i1 = 2, i2 = m, i3 = m + 2; 50 51 load4br(p0, s, a); 52 load4br(p1, s, b); 53 store4(p1, s, a); 54 store4(p0, s, b); 55 56 load4br(p0 + i3, s, a); 57 load4br(p1 + i3, s, b); 58 store4(p1 + i3, s, a); 59 store4(p0 + i3, s, b); 60 61 load4br(p0 + i1, s, a); 62 load4br(p1 + i2, s, b); 63 store4(p1 + i2, s, a); 64 store4(p0 + i1, s, b); 65 66 load4br(p1 + i1, s, a); 67 load4br(p0 + i2, s, b); 68 store4(p0 + i2, s, a); 69 store4(p1 + i1, s, b); 70 } 71 72 static void bit_reverse(T * p, size_t m) 73 { 74 //bit_reverse_static_size!4(p, m); 75 T a0, a1, a2, b0, b1, b2; 76 77 a0 = p[1 + 0 * m]; 78 a1 = p[2 + 0 * m]; 79 a2 = p[3 + 0 * m]; 80 b0 = p[0 + 2 * m]; 81 b1 = p[0 + 1 * m]; 82 b2 = p[0 + 3 * m]; 83 84 p[0 + 2 * m] = a0; 85 p[0 + 1 * m] = a1; 86 p[0 + 3 * m] = a2; 87 p[1 + 0 * m] = b0; 88 p[2 + 0 * m] = b1; 89 p[3 + 0 * m] = b2; 90 91 a0 = p[1 + 1 * m]; 92 a1 = p[3 + 1 * m]; 93 a2 = p[3 + 2 * m]; 94 b0 = p[2 + 2 * m]; 95 b1 = p[2 + 3 * m]; 96 b2 = p[1 + 3 * m]; 97 98 p[2 + 2 * m] = a0; 99 p[2 + 3 * m] = a1; 100 p[1 + 3 * m] = a2; 101 p[1 + 1 * m] = b0; 102 p[3 + 1 * m] = b1; 103 p[3 + 2 * m] = b2; 104 } 105 106 static T unaligned_load(T* p){ return *p; } 107 static void unaligned_store(T* p, T a){ *p = a; } 108 static T reverse(T a){ return a; } 109 } 110 111 version(DisableLarge) 112 enum disable_large = true; 113 else 114 enum disable_large = false; 115 116 // reinventing some Phobos stuff... 117 118 struct Tuple(A...) 119 { 120 A a; 121 alias a this; 122 } 123 124 template TypeTuple(A...) 125 { 126 alias A TypeTuple; 127 } 128 129 template ParamTypeTuple(alias f) 130 { 131 auto params_struct(Ret, Params...)(Ret function(Params) f) 132 { 133 struct R 134 { 135 Params p; 136 } 137 return R.init; 138 } 139 140 static if(is(typeof(params_struct(&f)))) 141 alias f f_instance; 142 else 143 alias f!() f_instance; 144 145 alias typeof(params_struct(&f_instance).tupleof) type; 146 } 147 148 void static_size_fft(int log2n, T)(T *pr, T *pi, T *table) 149 { 150 enum n = 1 << log2n; 151 RepeatType!(T, n) ar, ai; 152 153 foreach(i; ints_up_to!n) 154 ar[i] = pr[i]; 155 156 foreach(i; ints_up_to!n) 157 ai[i] = pi[i]; 158 159 foreach(i; powers_up_to!n) 160 { 161 enum m = n / i; 162 163 auto tp = table; 164 foreach(j; ints_up_to!(n / m)) 165 { 166 enum offset = m * j; 167 168 T wr = tp[0]; 169 T wi = tp[1]; 170 tp += 2; 171 foreach(k1; ints_up_to!(m / 2)) 172 { 173 enum k2 = k1 + m / 2; 174 static if(j == 0) 175 { 176 T tr = ar[offset + k2], ti = ai[offset + k2]; 177 T ur = ar[offset + k1], ui = ai[offset + k1]; 178 } 179 else static if(j == 1) 180 { 181 T tr = ai[offset + k2], ti = -ar[offset + k2]; 182 T ur = ar[offset + k1], ui = ai[offset + k1]; 183 } 184 else 185 { 186 T tmpr = ar[offset + k2], ti = ai[offset + k2]; 187 T ur = ar[offset + k1], ui = ai[offset + k1]; 188 T tr = tmpr*wr - ti*wi; 189 ti = tmpr*wi + ti*wr; 190 } 191 ar[offset + k2] = ur - tr; 192 ar[offset + k1] = ur + tr; 193 ai[offset + k2] = ui - ti; 194 ai[offset + k1] = ui + ti; 195 } 196 } 197 } 198 199 foreach(i; ints_up_to!n) 200 pr[i] = ar[reverse_bits!(i, log2n)]; 201 202 foreach(i; ints_up_to!n) 203 pi[i] = ai[reverse_bits!(i, log2n)]; 204 } 205 206 struct FFT(V, Options) 207 { 208 nothrow: 209 @nogc: 210 import core.bitop, core.stdc.stdlib; 211 212 alias BitReverse!(V, Options) BR; 213 214 alias V.vec_size vec_size; 215 alias V.T T; 216 alias V.vec vec; 217 alias FFT!(Scalar!T, Options) SFFT; 218 219 import cmath = core.stdc.math; 220 221 static if(is(T == float)) 222 { 223 alias cmath.sinf _sin; 224 alias cmath.cosf _cos; 225 alias cmath.asinf _asin; 226 } 227 else static if(is(T == double)) 228 { 229 alias cmath.sin _sin; 230 alias cmath.cos _cos; 231 alias cmath.asin _asin; 232 } 233 else static if(is(T == real)) 234 { 235 alias cmath.sinl _sin; 236 alias cmath.cosl _cos; 237 alias cmath.asinl _asin; 238 } 239 else 240 static assert(0); 241 242 template st(alias a){ enum st = cast(size_t) a; } 243 244 alias Tuple!(T,T) Pair; 245 246 static void complex_array_to_vector()(Pair * pairs, size_t n) 247 { 248 for(size_t i=0; i<n; i += vec_size) 249 { 250 T[vec_size*2] buffer = void; 251 for(size_t j = 0; j < vec_size; j++) 252 { 253 buffer[j] = pairs[i+j][0]; 254 buffer[j + vec_size] = pairs[i+j][1]; 255 } 256 for(size_t j = 0; j < vec_size; j++) 257 { 258 pairs[i+j][0] = buffer[2*j]; 259 pairs[i+j][1] = buffer[2*j+1]; 260 } 261 } 262 } 263 264 static int log2()(int a) 265 { 266 int r = 0; 267 while(a) 268 { 269 a >>= 1; 270 r++; 271 } 272 return r - 1; 273 } 274 275 static void sines_cosines_refine(bool computeEven)( 276 Pair* src, Pair* dest, size_t n_from, T dphi) 277 { 278 T cdphi = _cos(dphi); 279 T sdphi = _sin(dphi); 280 281 enum compute = computeEven ? 0 : 1; 282 enum copy = compute ^ 1; 283 284 for(auto src_end = src + n_from; src < src_end; src++, dest += 2) 285 { 286 auto c = src[0][0]; 287 auto s = src[0][1]; 288 dest[copy][0] = c; 289 dest[copy][1] = s; 290 dest[compute][0] = c * cdphi - s * sdphi; 291 dest[compute][1] = c * sdphi + s * cdphi; 292 } 293 } 294 295 static void sines_cosines(bool phi0_is_last)( 296 Pair* r, size_t n, T phi0, T deltaphi, bool bit_reversed) 297 { 298 r[n - 1][0] = _cos(phi0); 299 r[n - 1][1] = _sin(phi0); 300 for(size_t len = 1; len < n; len *= 2) 301 { 302 auto denom = bit_reversed ? n / 2 / len : len; 303 sines_cosines_refine!phi0_is_last( 304 r + n - len, r + n - 2 * len, len, deltaphi / 2 / denom); 305 } 306 } 307 308 static void twiddle_table()(int log2n, Pair * r) 309 { 310 if(log2n >= Options.large_limit || log2n < 2 * log2(vec_size)) 311 { 312 return sines_cosines!false( 313 r, st!1 << (log2n - 1), 0.0, -2 * _asin(1), true); 314 } 315 316 r++; 317 318 auto p = r; 319 for (int s = 0; s < log2n; ++s) 320 { 321 size_t m2 = 1 << s; 322 323 if(s < log2n - log2(vec_size)) 324 sines_cosines!false(p, m2, 0.0, -2 * _asin(1), true); 325 else 326 { 327 sines_cosines!false(p, m2, 0.0, -2 * _asin(1), false); 328 complex_array_to_vector(p, m2); 329 } 330 331 p += m2; 332 } 333 334 p = r; 335 for (int s = 0; s + 1 < log2n - log2(vec_size); s += 2) 336 { 337 size_t m2 = 1 << s; 338 339 foreach(i; 0 .. m2) 340 // p[i] is p[m2 + 2 * i] ^^ 2. We store it here so that we 341 // don't need to recompute it below, which improves precision 342 // slightly. 343 p[m2 + 2 * i + 1] = p[i]; 344 345 foreach(i; 0 .. m2) 346 { 347 Pair a1 = p[m2 + 2 * i]; 348 Pair a2 = p[m2 + 2 * i + 1]; 349 Pair a3; 350 351 a3[0] = a2[0] * a1[0] - a2[1] * a1[1]; 352 a3[1] = a2[0] * a1[1] + a2[1] * a1[0]; 353 354 p[3 * i] = a1; 355 p[3 * i + 1] = a2; 356 p[3 * i + 2] = a3; 357 } 358 359 p += 3 * m2; 360 } 361 } 362 363 alias void* Table; 364 365 static size_t twiddle_table_size_bytes(int log2n) 366 { 367 auto compact = log2n >= Options.large_limit || 368 log2n < 2 * log2(vec_size); 369 370 return Pair.sizeof << (compact ? log2n - 1 : log2n); 371 } 372 373 static T* twiddle_table_ptr(void* p, int log2n) 374 { 375 return cast(T*)p; 376 } 377 378 static uint* br_table_ptr(void* p, int log2n) 379 { 380 return cast(uint*)(p + twiddle_table_size_bytes(log2n)); 381 } 382 383 static size_t table_size_bytes()(uint log2n) 384 { 385 uint log2nbr = log2n < Options.large_limit ? 386 log2n : 2 * Options.log2_bitreverse_large_chunk_size; 387 388 return 389 twiddle_table_size_bytes(log2n) + 390 BR.br_table_size(log2nbr) * uint.sizeof; 391 } 392 393 static Table fft_table()(int log2n, void * p) 394 { 395 if(log2n == 0) 396 return p; 397 //else if(log2n <= log2(vec_size)) 398 // return SFFT.fft_table(log2n, p); 399 400 Table tables = p; 401 402 twiddle_table(log2n, cast(Pair *)(twiddle_table_ptr(tables, log2n))); 403 404 if(log2n < V.log2_bitreverse_chunk_size * 2) 405 { 406 } 407 else if(log2n < Options.large_limit) 408 { 409 BR.init_br_table(br_table_ptr(tables, log2n), log2n); 410 } 411 else 412 { 413 enum log2size = 2*Options.log2_bitreverse_large_chunk_size; 414 BR.init_br_table(br_table_ptr(tables, log2n), log2size); 415 } 416 return tables; 417 } 418 419 static void fft_passes_bit_reversed()(vec* re, vec* im, size_t N , 420 vec* table, size_t start_stride = 1) 421 { 422 table += start_stride + start_stride; 423 vec* pend = re + N; 424 for (size_t m2 = start_stride; m2 < N ; m2 <<= 1) 425 { 426 size_t m = m2 + m2; 427 for( 428 vec* pr = re, pi = im; 429 pr < pend ; 430 pr += m, pi += m) 431 { 432 for (size_t k1 = 0, k2 = m2; k1<m2; k1++, k2 ++) 433 { 434 vec wr = table[2*k1], wi = table[2*k1+1]; 435 436 vec tmpr = pr[k2], ti = pi[k2]; 437 vec ur = pr[k1], ui = pi[k1]; 438 vec tr = tmpr*wr - ti*wi; 439 ti = tmpr*wi + ti*wr; 440 pr[k2] = ur - tr; 441 pr[k1] = ur + tr; 442 pi[k2] = ui - ti; 443 pi[k1] = ui + ti; 444 } 445 } 446 table += m; 447 } 448 } 449 450 static void first_fft_passes()(vec* pr, vec* pi, size_t n) 451 { 452 size_t i0 = 0, i1 = i0 + n/4, i2 = i1 + n/4, i3 = i2 + n/4, iend = i1; 453 454 for(; i0 < iend; i0++, i1++, i2++, i3++) 455 { 456 vec tr = pr[i2], ti = pi[i2]; 457 vec ur = pr[i0], ui = pi[i0]; 458 vec ar0 = ur + tr; 459 vec ar2 = ur - tr; 460 vec ai0 = ui + ti; 461 vec ai2 = ui - ti; 462 463 tr = pr[i3], ti = pi[i3]; 464 ur = pr[i1], ui = pi[i1]; 465 vec ar1 = ur + tr; 466 vec ar3 = ur - tr; 467 vec ai1 = ui + ti; 468 vec ai3 = ui - ti; 469 470 pr[i0] = ar0 + ar1; 471 pr[i1] = ar0 - ar1; 472 pi[i0] = ai0 + ai1; 473 pi[i1] = ai0 - ai1; 474 475 pr[i2] = ar2 + ai3; 476 pr[i3] = ar2 - ai3; 477 pi[i2] = ai2 - ar3; 478 pi[i3] = ai2 + ar3; 479 } 480 } 481 482 static void fft_pass()(vec *pr, vec *pi, vec *pend, T *table, size_t m2) 483 { 484 size_t m = m2 + m2; 485 for(; pr < pend ; pr += m, pi += m) 486 { 487 vec wr = V.scalar_to_vector(table[0]); 488 vec wi = V.scalar_to_vector(table[1]); 489 table += 2; 490 for (size_t k1 = 0, k2 = m2; k1<m2; k1++, k2 ++) 491 { 492 vec tmpr = pr[k2], ti = pi[k2]; 493 vec ur = pr[k1], ui = pi[k1]; 494 vec tr = tmpr*wr - ti*wi; 495 ti = tmpr*wi + ti*wr; 496 pr[k2] = ur - tr; 497 pr[k1] = ur + tr; 498 pi[k2] = ui - ti; 499 pi[k1] = ui + ti; 500 } 501 } 502 } 503 504 static void fft_two_passes(Tab...)( 505 vec *pr, vec *pi, vec *pend, size_t m2, Tab tab) 506 { 507 // When this function is called with tab.length == 2 on DMD, it 508 // sometimes gives an incorrect result (for example when building with 509 // SSE on 64 bit Linux and runnitg test_float pfft "14".), so lets's 510 // use fft_pass instead. 511 512 // Disabled work-around 513 514 version(DigitalMars) 515 { 516 static if (tab.length == 2) 517 enum workaroundTabLength = true; 518 else 519 enum workaroundTabLength = false; 520 } 521 else 522 enum workaroundTabLength = false; 523 524 static if (workaroundTabLength) 525 { 526 fft_pass(pr, pi, pend, tab[0], m2); 527 fft_pass(pr, pi, pend, tab[1], m2 / 2); 528 } 529 else 530 { 531 size_t m = m2 + m2; 532 size_t m4 = m2 / 2; 533 for(; pr < pend ; pr += m, pi += m) 534 { 535 static if(tab.length == 2) 536 { 537 vec w1r = V.scalar_to_vector(tab[1][0]); 538 vec w1i = V.scalar_to_vector(tab[1][1]); 539 540 vec w2r = V.scalar_to_vector(tab[0][0]); 541 vec w2i = V.scalar_to_vector(tab[0][1]); 542 543 vec w3r = w1r * w2r - w1i * w2i; 544 vec w3i = w1r * w2i + w1i * w2r; 545 546 tab[0] += 2; 547 tab[1] += 4; 548 } 549 else 550 { 551 vec w1r = V.scalar_to_vector(tab[0][0]); 552 vec w1i = V.scalar_to_vector(tab[0][1]); 553 554 vec w2r = V.scalar_to_vector(tab[0][2]); 555 vec w2i = V.scalar_to_vector(tab[0][3]); 556 557 vec w3r = V.scalar_to_vector(tab[0][4]); 558 vec w3i = V.scalar_to_vector(tab[0][5]); 559 560 tab[0] += 6; 561 } 562 563 for ( 564 size_t k0 = 0, k1 = m4, k2 = m2, k3 = m2 + m4; 565 k0<m4; k0++, 566 k1++, k2++, k3++) 567 { 568 vec tr, ur, ti, ui; 569 570 vec r0 = pr[k0]; 571 vec r1 = pr[k1]; 572 vec r2 = pr[k2]; 573 vec r3 = pr[k3]; 574 575 vec i0 = pi[k0]; 576 vec i1 = pi[k1]; 577 vec i2 = pi[k2]; 578 vec i3 = pi[k3]; 579 580 tr = r2 * w2r - i2 * w2i; 581 ti = r2 * w2i + i2 * w2r; 582 r2 = r0 - tr; 583 i2 = i0 - ti; 584 r0 = r0 + tr; 585 i0 = i0 + ti; 586 587 tr = r3 * w3r - i3 * w3i; 588 ti = r3 * w3i + i3 * w3r; 589 ur = r1 * w1r - i1 * w1i; 590 ui = r1 * w1i + i1 * w1r; 591 r3 = ur - tr; 592 i3 = ui - ti; 593 r1 = ur + tr; 594 i1 = ui + ti; 595 596 tr = r1; 597 ti = i1; 598 r1 = r0 - tr; 599 i1 = i0 - ti; 600 r0 = r0 + tr; 601 i0 = i0 + ti; 602 603 tr = i3; 604 ti = r3; // take minus into account later 605 r3 = r2 - tr; 606 i3 = i2 + ti; 607 r2 = r2 + tr; 608 i2 = i2 - ti; 609 610 pr[k0] = r0; 611 pr[k1] = r1; 612 pr[k2] = r2; 613 pr[k3] = r3; 614 615 pi[k0] = i0; 616 pi[k1] = i1; 617 pi[k2] = i2; 618 pi[k3] = i3; 619 } 620 } 621 } 622 } 623 624 static void fft_passes(bool compact_table)( 625 vec* re, vec* im, size_t N , T* table) 626 { 627 vec * pend = re + N; 628 629 size_t tableRowLen = 2; 630 size_t m2 = N/2; 631 632 static nextRow(ref T* table, ref size_t len) 633 { 634 static if(!compact_table) 635 { 636 table += len; 637 len += len; 638 } 639 } 640 641 if(m2 > 1) 642 { 643 first_fft_passes(re, im, N); 644 645 m2 >>= 2; 646 647 nextRow(table, tableRowLen); 648 nextRow(table, tableRowLen); 649 } 650 651 for (; m2 > 1 ; m2 >>= 2) 652 { 653 static if(compact_table) 654 fft_two_passes(re, im, pend, m2, table, table); 655 else 656 fft_two_passes(re, im, pend, m2, table); 657 658 nextRow(table, tableRowLen); 659 nextRow(table, tableRowLen); 660 } 661 662 if(m2 != 0) 663 fft_pass(re, im, pend, table, m2); 664 } 665 666 static void fft_passes_fractional()( 667 vec * pr, vec * pi, vec * pend, 668 T * table, size_t tableI) 669 { 670 static if(is(typeof(V.transpose!2))) 671 { 672 for(; pr < pend; pr += 2, pi += 2, tableI += 4) 673 { 674 auto ar = pr[0]; 675 auto ai = pi[0]; 676 auto br = pr[1]; 677 auto bi = pi[1]; 678 679 foreach(i; ints_up_to!(log2(vec_size))) 680 { 681 vec wr, wi, ur, ui; 682 683 V.complex_array_to_real_imag_vec!(2 << i)( 684 table + (tableI << i), wr, wi); 685 686 V.transpose!(2 << i)(ar, br, ur, br); 687 V.transpose!(2 << i)(ai, bi, ui, bi); 688 689 auto tr = br * wr - bi * wi; 690 auto ti = bi * wr + br * wi; 691 692 ar = ur + tr; 693 br = ur - tr; 694 ai = ui + ti; 695 bi = ui - ti; 696 } 697 698 V.interleave(ar, br, pr[0], pr[1]); 699 V.interleave(ai, bi, pi[0], pi[1]); 700 } 701 } 702 else 703 for (size_t m2 = vec_size >> 1; m2 > 0 ; m2 >>= 1) 704 { 705 SFFT.fft_pass( 706 cast(T*) pr, cast(T*) pi, cast(T*)pend, table + tableI, m2); 707 708 tableI *= 2; 709 } 710 } 711 712 // bug_killer below is a dummy parameter which apparently causes the DMD 713 // stack alignment bug to go away. 714 static void fft_passes_strided(int l, int chunk_size)( 715 vec * pr, vec * pi, size_t N , 716 ref T * table, ref size_t tableI, void* bug_killer, 717 size_t stride, int nPasses) 718 { 719 ubyte[aligned_size!vec(l * chunk_size, 64)] rmem = void; 720 ubyte[aligned_size!vec(l * chunk_size, 64)] imem = void; 721 722 auto rbuf = aligned_ptr!vec(rmem.ptr, 64); 723 auto ibuf = aligned_ptr!vec(imem.ptr, 64); 724 725 BR.strided_copy!(chunk_size)(rbuf, pr, chunk_size, stride, l); 726 BR.strided_copy!(chunk_size)(ibuf, pi, chunk_size, stride, l); 727 728 size_t m2 = l*chunk_size/2; 729 size_t m2_limit = m2>>nPasses; 730 731 if(tableI == 0 && nPasses >= 2) 732 { 733 first_fft_passes(rbuf, ibuf, l*chunk_size); 734 m2 >>= 1; 735 tableI *= 2; 736 m2 >>= 1; 737 tableI *= 2; 738 } 739 740 for(; m2 > 2 * m2_limit; m2 >>= 2) 741 { 742 fft_two_passes(rbuf, ibuf, rbuf + l*chunk_size, m2, 743 table + tableI, table + 2 * tableI); 744 745 tableI *= 4; 746 } 747 748 if(m2 != m2_limit) 749 { 750 fft_pass(rbuf, ibuf, rbuf + l*chunk_size, table + tableI, m2); 751 tableI *= 2; 752 } 753 754 BR.strided_copy!(chunk_size)(pr, rbuf, stride, chunk_size, l); 755 BR.strided_copy!(chunk_size)(pi, ibuf, stride, chunk_size, l); 756 } 757 758 static void fft_passes_recursive()( 759 vec * pr, vec * pi, size_t N , 760 T * table, size_t tableI) 761 { 762 if(N <= (1<<Options.log2_optimal_n)) 763 { 764 size_t m2 = N >> 1; 765 766 for (; m2 > 1 ; m2 >>= 2) 767 { 768 fft_two_passes(pr, pi, pr + N, m2, table + tableI, 769 table + 2 * tableI); 770 771 tableI *= 4; 772 } 773 774 if(m2 != 0) 775 { 776 fft_pass(pr, pi, pr + N, table + tableI, m2); 777 tableI *= 2; 778 } 779 780 fft_passes_fractional(pr, pi, pr + N, table, tableI); 781 782 return; 783 } 784 785 enum log2l = Options.passes_per_recursive_call, l = 1 << log2l; 786 enum chunk_size = 1UL << Options.log2_recursive_passes_chunk_size; 787 788 int log2n = bsf(N); 789 790 int nPasses = log2n > log2l + Options.log2_optimal_n ? 791 log2l : log2n - Options.log2_optimal_n; 792 793 nPasses = (nPasses & 1) && !(log2l & 1) ? nPasses + 1 : nPasses; 794 795 int log2m = log2n - log2l; 796 size_t m = st!1 << log2m; 797 798 size_t tableIOld = tableI; 799 800 for(size_t i=0; i < m; i += chunk_size) 801 { 802 tableI = tableIOld; 803 804 fft_passes_strided!(l, chunk_size)( 805 pr + i, pi + i, N, table, tableI, null, m, nPasses); 806 } 807 808 { 809 size_t nextN = (N>>nPasses); 810 811 for(int i = 0; i<(1<<nPasses); i++) 812 fft_passes_recursive( 813 pr + nextN*i, pi + nextN*i, nextN, 814 table, tableI + 2*i); 815 } 816 } 817 818 static void bit_reverse_small_two(int minLog2n)( 819 T* re, T* im, int log2n, uint* brTable) 820 { 821 enum l = V.log2_bitreverse_chunk_size; 822 823 static if(minLog2n < 2 * l) 824 { 825 if(log2n < 2 * l) 826 { 827 // only works for log2n < 2 * l 828 bit_reverse_tiny!(2 * l)(re, log2n); 829 bit_reverse_tiny!(2 * l)(im, log2n); 830 } 831 else 832 { 833 BR.bit_reverse_small(re, log2n, brTable); 834 BR.bit_reverse_small(im, log2n, brTable); 835 } 836 } 837 else 838 { 839 //we already know that log2n >= 2 * l here. 840 BR.bit_reverse_small(re, log2n, brTable); 841 BR.bit_reverse_small(im, log2n, brTable); 842 } 843 } 844 845 static auto v(T* p){ return cast(vec*) p; } 846 847 static void fft_tiny()(T * re, T * im, int log2n, Table tables) 848 { 849 // assert(log2n > log2(vec_size)); 850 851 auto N = st!1 << log2n; 852 fft_passes!(true)(v(re), v(im), N / vec_size, 853 twiddle_table_ptr(tables, log2n)); 854 855 fft_passes_fractional( 856 v(re), v(im), v(re) + N / vec_size, 857 twiddle_table_ptr(tables, log2n), 0); 858 859 bit_reverse_small_two!(log2(vec_size) + 1)( 860 re, im, log2n, br_table_ptr(tables, log2n)); 861 } 862 863 static void fft_small()(T * re, T * im, int log2n, Table tables) 864 { 865 // assert(log2n >= 2*log2(vec_size)); 866 867 size_t N = (1<<log2n); 868 869 fft_passes!false( 870 v(re), v(im), N / vec_size, 871 twiddle_table_ptr(tables, log2n) + 2); 872 873 bit_reverse_small_two!(2 * log2(vec_size))( 874 re, im, log2n, br_table_ptr(tables, log2n)); 875 876 static if(vec_size > 1) 877 fft_passes_bit_reversed( 878 v(re), v(im) , N / vec_size, 879 cast(vec*) twiddle_table_ptr(tables, log2n), 880 N / vec_size/vec_size); 881 } 882 883 static void fft_large()(T * re, T * im, int log2n, Table tables) 884 { 885 size_t N = (1<<log2n); 886 887 fft_passes_recursive( 888 v(re), v(im), N / vec_size, 889 twiddle_table_ptr(tables, log2n), 0); 890 891 BR.bit_reverse_large(re, log2n, br_table_ptr(tables, log2n)); 892 BR.bit_reverse_large(im, log2n, br_table_ptr(tables, log2n)); 893 } 894 895 static void fft()(T * re, T * im, int log2n, Table tables) 896 { 897 foreach(i; ints_up_to!(log2(vec_size) + 1)) 898 if(i == log2n) 899 return static_size_fft!i(re, im, twiddle_table_ptr(tables, i)); 900 901 if(log2n < 2 * log2(vec_size)) 902 return fft_tiny(re, im, log2n, tables); 903 else if( log2n < Options.large_limit || disable_large) 904 return fft_small(re, im, log2n, tables); 905 else 906 static if(!disable_large) 907 fft_large(re, im, log2n, tables); 908 } 909 910 alias T* RTable; 911 912 static auto rtable_size_bytes()(int log2n) 913 { 914 return T.sizeof << (log2n - 1); 915 } 916 917 enum supports_real = is(typeof( 918 { 919 T a; 920 vec v = V.unaligned_load(&a); 921 v = V.reverse(v); 922 V.unaligned_store(&a, v); 923 })); 924 925 static RTable rfft_table()(int log2n, void *p) if(supports_real) 926 { 927 if(log2n < 2) 928 return cast(RTable) p; 929 else if(st!1 << log2n < 4 * vec_size) 930 return SFFT.rfft_table(log2n, p); 931 932 auto r = (cast(Pair*) p)[0 .. (st!1 << (log2n - 2))]; 933 934 auto phi = _asin(1); 935 sines_cosines!true(r.ptr, r.length, -phi, phi, false); 936 937 /*foreach(size_t i, ref e; r) 938 { 939 T phi = - (_asin(1.0) * (i + 1)) / r.length; 940 941 e[0] = _cos(phi); 942 e[1] = _sin(phi); 943 }*/ 944 945 complex_array_to_vector(r.ptr, r.length); 946 947 return cast(RTable) r.ptr; 948 } 949 950 static auto rfft_table()(int log2n, void *p) if(!supports_real) 951 { 952 return SFFT.rfft_table(log2n, p); 953 } 954 955 static void rfft()( 956 T* rr, T* ri, int log2n, Table table, RTable rtable) 957 { 958 if(log2n == 0) 959 return; 960 else if(log2n == 1) 961 { 962 auto rr0 = rr[0], ri0 = ri[0]; 963 rr[0] = rr0 + ri0; 964 ri[0] = rr0 - ri0; 965 return; 966 } 967 968 fft(rr, ri, log2n - 1, table); 969 rfft_last_pass!false(rr, ri, log2n, rtable); 970 } 971 972 static void irfft()( 973 T* rr, T* ri, int log2n, Table table, RTable rtable) 974 { 975 if(log2n == 0) 976 return; 977 else if(log2n == 1) 978 { 979 // we don't multiply with 0.5 here because we want the inverse to 980 // be scaled by 2. 981 982 auto rr0 = rr[0], ri0 = ri[0]; 983 rr[0] = (rr0 + ri0); 984 ri[0] = (rr0 - ri0); 985 return; 986 } 987 988 rfft_last_pass!true(rr, ri, log2n, rtable); 989 fft(ri, rr, log2n - 1, table); 990 } 991 992 static void rfft_last_pass(bool inverse)(T* rr, T* ri, int log2n, RTable rtable) 993 if(supports_real) 994 { 995 if(st!1 << log2n < 4 * vec_size) 996 return SFFT.rfft_last_pass!inverse(rr, ri, log2n, rtable); 997 998 static vec* v(T* a){ return cast(vec*) a; } 999 1000 auto n = st!1 << log2n; 1001 1002 vec half = V.scalar_to_vector(cast(T) 0.5); 1003 1004 T middle_r = rr[n / 4]; 1005 T middle_i = ri[n / 4]; 1006 1007 for( 1008 size_t i0 = 1, i1 = n / 2 - vec_size, iw = 0; 1009 i0 <= i1; 1010 i0 += vec_size, i1 -= vec_size, iw += 2*vec_size) 1011 { 1012 vec wr = *v(rtable + iw); 1013 vec wi = *v(rtable + iw + vec_size); 1014 1015 vec r0r = V.unaligned_load(&rr[i0]); 1016 vec r0i = V.unaligned_load(&ri[i0]); 1017 vec r1r = V.reverse(*v(rr + i1)); 1018 vec r1i = V.reverse(*v(ri + i1)); 1019 1020 vec ar = r0r + r1r; 1021 vec ai = r1i - r0i; 1022 vec br = r0r - r1r; 1023 vec bi = r0i + r1i; 1024 1025 static if(inverse) 1026 { 1027 // we use -w* instead of w in this case and we do not divide by 2. 1028 // The reason for that is that we want the inverse to be scaled 1029 // by n as it is in the complex case and not just by n / 2. 1030 1031 vec tmp = br * wi - bi * wr; 1032 br = bi * wi + br * wr; 1033 bi = tmp; 1034 } 1035 else 1036 { 1037 ar *= half; 1038 ai *= half; 1039 br *= half; 1040 bi *= half; 1041 vec tmp = br * wi + bi * wr; 1042 br = bi * wi - br * wr; 1043 bi = tmp; 1044 } 1045 1046 V.unaligned_store(rr + i0, ar + bi); 1047 V.unaligned_store(ri + i0, br - ai); 1048 1049 *v(rr + i1) = V.reverse(ar - bi); 1050 *v(ri + i1) = V.reverse(ai + br); 1051 } 1052 1053 // fixes the aliasing bug: 1054 rr[n / 4] = inverse ? middle_r + middle_r : middle_r; 1055 ri[n / 4] = -(inverse ? middle_i + middle_i : middle_i); 1056 1057 1058 { 1059 // When calculating inverse we would need to multiply with 0.5 here 1060 // to get an exact inverse. We don't do that because we actually 1061 // want the inverse to be scaled by 2. 1062 1063 auto r0r = rr[0]; 1064 auto r0i = ri[0]; 1065 1066 rr[0] = r0r + r0i; 1067 ri[0] = r0r - r0i; 1068 } 1069 } 1070 1071 static void rfft_last_pass(bool inverse)(T* rr, T* ri, int log2n, RTable rtable) 1072 if(!supports_real) 1073 { 1074 SFFT.rfft_last_pass!inverse(rr, ri, log2n, rtable); 1075 } 1076 1077 static void interleave_array()(T* even, T* odd, T* interleaved, size_t n) 1078 { 1079 static if(is(typeof(V.interleave))) 1080 { 1081 if(n < vec_size) 1082 SFFT.interleave_array(even, odd, interleaved, n); 1083 else 1084 foreach(i; 0 .. n / vec_size) 1085 V.interleave( 1086 (cast(vec*)even)[i], 1087 (cast(vec*)odd)[i], 1088 (cast(vec*)interleaved)[i * 2], 1089 (cast(vec*)interleaved)[i * 2 + 1]); 1090 } 1091 else 1092 foreach(i; 0 .. n) 1093 { 1094 interleaved[i * 2] = even[i]; 1095 interleaved[i * 2 + 1] = odd[i]; 1096 } 1097 } 1098 1099 static void deinterleave_array()(T* even, T* odd, T* interleaved, size_t n) 1100 { 1101 static if(is(typeof(V.deinterleave))) 1102 { 1103 if(n < vec_size) 1104 SFFT.deinterleave_array(even, odd, interleaved, n); 1105 else 1106 foreach(i; 0 .. n / vec_size) 1107 V.deinterleave( 1108 (cast(vec*)interleaved)[i * 2], 1109 (cast(vec*)interleaved)[i * 2 + 1], 1110 (cast(vec*)even)[i], 1111 (cast(vec*)odd)[i]); 1112 } 1113 else 1114 foreach(i; 0 .. n) 1115 { 1116 even[i] = interleaved[i * 2]; 1117 odd[i] = interleaved[i * 2 + 1]; 1118 } 1119 } 1120 1121 alias bool* ITable; 1122 1123 alias Interleave!(V, 8, false).itable_size_bytes itable_size_bytes; 1124 alias Interleave!(V, 8, false).interleave_table interleave_table; 1125 alias Interleave!(V, 8, false).interleave interleave; 1126 alias Interleave!(V, 8, true).interleave deinterleave; 1127 1128 static void scale(T* data, size_t n, T factor) 1129 { 1130 auto k = V.scalar_to_vector(factor); 1131 1132 foreach(ref e; (cast(vec*) data)[0 .. n / vec_size]) 1133 e = e * k; 1134 1135 foreach(ref e; data[ n & ~(vec_size - 1) .. n]) 1136 e = e * factor; 1137 } 1138 1139 static size_t alignment(size_t n) 1140 { 1141 static if(is(typeof(Options.prefered_alignment)) && 1142 Options.prefered_alignment > vec.sizeof) 1143 { 1144 enum a = Options.prefered_alignment; 1145 } 1146 else 1147 enum a = vec.sizeof; 1148 1149 auto bytes = T.sizeof << bsr(n); 1150 1151 bytes = bytes < a ? bytes : a; 1152 return bytes > (void*).sizeof && (bytes & (bytes - 1)) == 0 ? 1153 bytes : (void*).sizeof; 1154 } 1155 } 1156 1157 mixin template Instantiate() 1158 { 1159 nothrow: 1160 @nogc: 1161 struct TableValue{}; 1162 alias TableValue* Table; 1163 1164 struct RTableValue{}; 1165 alias RTableValue* RTable; 1166 1167 struct ITableValue{}; 1168 alias ITableValue* ITable; 1169 1170 template selected(string func_name, Ret...) 1171 { 1172 auto selected(A...)(A args) 1173 { 1174 auto impl = implementation; 1175 foreach(i, F; FFTs) 1176 if(i == impl) 1177 { 1178 mixin("alias F." ~ func_name ~ " func;"); 1179 1180 ParamTypeTuple!(func).type fargs; 1181 1182 foreach(j, _; fargs) 1183 fargs[j] = cast(typeof(fargs[j])) args[j]; 1184 1185 static if(Ret.length == 0) 1186 return func(fargs); 1187 else 1188 return cast(Ret[0]) func(fargs); 1189 } 1190 1191 assert(false); 1192 } 1193 } 1194 1195 alias FFTs[0] FFT0; 1196 alias FFT0.T T; 1197 1198 void fft(T* re, T* im, uint log2n, Table t) 1199 { 1200 selected!"fft"(re, im, log2n, cast(FFT0.Table) t); 1201 } 1202 1203 Table fft_table(uint log2n, void* p = null) 1204 { 1205 return selected!("fft_table", Table)(log2n, p); 1206 } 1207 1208 size_t table_size_bytes(uint log2n) 1209 { 1210 return selected!"table_size_bytes"(log2n); 1211 } 1212 1213 void scale(T* data, size_t n, T factor) 1214 { 1215 selected!"scale"(data, n, factor); 1216 } 1217 1218 size_t alignment(size_t n) 1219 { 1220 return selected!"alignment"(n); 1221 } 1222 1223 void rfft(T* re, T* im, uint log2n, Table t, RTable rt) 1224 { 1225 selected!"rfft"(re, im, log2n, cast(FFT0.Table) t, cast(FFT0.RTable) rt); 1226 } 1227 1228 void irfft(T* re, T* im, uint log2n, Table t, RTable rt) 1229 { 1230 selected!"irfft"(re, im, log2n, cast(FFT0.Table) t, cast(FFT0.RTable) rt); 1231 } 1232 1233 RTable rfft_table(uint log2n, void* p = null) 1234 { 1235 return selected!("rfft_table", RTable)(log2n, p); 1236 } 1237 1238 size_t rtable_size_bytes(int log2n) 1239 { 1240 return selected!"rtable_size_bytes"(log2n); 1241 } 1242 1243 void deinterleave_array(T* even, T* odd, T* interleaved, size_t n) 1244 { 1245 selected!"deinterleave_array"(even, odd, interleaved, n); 1246 } 1247 1248 void interleave_array(T* even, T* odd, T* interleaved, size_t n) 1249 { 1250 selected!"interleave_array"(even, odd, interleaved, n); 1251 } 1252 1253 size_t itable_size_bytes(uint log2n) 1254 { 1255 return selected!"itable_size_bytes"(log2n); 1256 } 1257 1258 ITable interleave_table(uint log2n, void* p) 1259 { 1260 return selected!("interleave_table", ITable)(log2n, p); 1261 } 1262 1263 void interleave(T* p, uint log2n, ITable table) 1264 { 1265 selected!"interleave"(p, log2n, cast(FFT0.ITable) table); 1266 } 1267 1268 void deinterleave(T* p, uint log2n, ITable table) 1269 { 1270 selected!"deinterleave"(p, log2n, cast(FFT0.ITable) table); 1271 } 1272 }