1 //          Copyright Jernej Krempuš 2012
2 // Distributed under the Boost Software License, Version 1.0.
3 //    (See accompanying file LICENSE_1_0.txt or copy at
4 //          http://www.boost.org/LICENSE_1_0.txt)
5 
6 module dplug.fft.fft_impl;
7 
8 import dplug.fft.shuffle;
9 
10 nothrow:
11 @nogc:
12 
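// A scalar (non-SIMD) "vector" type: vec_size is 1 and vec is just the scalar
// type T. It exposes the same interface as the SIMD vector types (loads,
// stores, bit-reverse helpers), so the generic FFT code below can also be
// instantiated with it (see the SFFT alias inside FFT).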
13 struct Scalar(_T)
14 {
15 nothrow:
16 @nogc:
17     alias _T vec;
18     alias _T T;
19     
20     enum vec_size = 1;
21     enum log2_bitreverse_chunk_size = 2;
22     
23     static vec scalar_to_vector(T a)
24     {
25         return a;
26     }
27    
28     private static void load4br(T* p, size_t m, ref T a0, ref T a1, ref T a2, ref T a3)
29     {
30         a0 = p[0];
31         a1 = p[m];
32         a2 = p[1];
33         a3 = p[m + 1];
34     } 
35  
36     private static void store4(T* p, size_t m, T a0, T a1, T a2, T a3)
37     {
38         p[0] = a0;
39         p[1] = a1;
40         p[m] = a2;
41         p[m + 1] = a3;
42     }
43  
44     static void bit_reverse_swap(T * p0, T * p1, size_t m)
45     {
46         RepeatType!(T, 4) a, b;
47        
48         auto s = 2 * m;
49         auto i0 = 0, i1 = 2, i2 = m, i3 = m + 2;
50  
51         load4br(p0, s, a); 
52         load4br(p1, s, b); 
53         store4(p1, s, a);     
54         store4(p0, s, b);     
55         
56         load4br(p0 + i3, s, a); 
57         load4br(p1 + i3, s, b); 
58         store4(p1 + i3, s, a);     
59         store4(p0 + i3, s, b);
60 
61         load4br(p0 + i1, s, a);
62         load4br(p1 + i2, s, b);
63         store4(p1 + i2, s, a);
64         store4(p0 + i1, s, b);
65 
66         load4br(p1 + i1, s, a);
67         load4br(p0 + i2, s, b);
68         store4(p0 + i2, s, a);
69         store4(p1 + i1, s, b);
70     }
71 
72     static void bit_reverse(T * p,  size_t m)
73     {
74         //bit_reverse_static_size!4(p, m);
75         T a0, a1, a2, b0, b1, b2;       
76 
77         a0 = p[1 + 0 * m];
78         a1 = p[2 + 0 * m];
79         a2 = p[3 + 0 * m];
80         b0 = p[0 + 2 * m];
81         b1 = p[0 + 1 * m];
82         b2 = p[0 + 3 * m];
83 
84         p[0 + 2 * m] = a0;
85         p[0 + 1 * m] = a1;
86         p[0 + 3 * m] = a2;
87         p[1 + 0 * m] = b0; 
88         p[2 + 0 * m] = b1; 
89         p[3 + 0 * m] = b2;
90 
91         a0 = p[1 + 1 * m];
92         a1 = p[3 + 1 * m];
93         a2 = p[3 + 2 * m];
94         b0 = p[2 + 2 * m];
95         b1 = p[2 + 3 * m];
96         b2 = p[1 + 3 * m];
97 
98         p[2 + 2 * m] = a0;
99         p[2 + 3 * m] = a1;
100         p[1 + 3 * m] = a2;
101         p[1 + 1 * m] = b0; 
102         p[3 + 1 * m] = b1; 
103         p[3 + 2 * m] = b2;
104     }
105 
106     static T unaligned_load(T* p){ return *p; }
107     static void unaligned_store(T* p, T a){ *p = a; }
108     static T reverse(T a){ return a; }           
109 }
110 
111 version(DisableLarge)
112     enum disable_large = true;
113 else 
114     enum disable_large = false;
115 
116 // reinventing some Phobos stuff...
117 
118 struct Tuple(A...)
119 {
120     A a;
121     alias a this;
122 }
123 
124 template TypeTuple(A...)
125 {
126     alias A TypeTuple;
127 }
128 
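// Evaluates to the parameter types of the function f (or of f!() when f is a
// template) as a type tuple.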
129 template ParamTypeTuple(alias f)
130 {
131     auto params_struct(Ret, Params...)(Ret function(Params) f) 
132     {
133         struct R
134         {
135             Params p;
136         }
137         return R.init;
138     }
139 
140     static if(is(typeof(params_struct(&f))))
141         alias f f_instance;
142     else
143         alias f!() f_instance;
144 
145     alias typeof(params_struct(&f_instance).tupleof) type;
146 }
147 
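// Fully unrolled FFT for a small size known at compile time (n = 2 ^^ log2n).
// The whole transform is kept in local variables and all loops are unrolled,
// so this is only meant for very small n. The bit-reversal permutation is
// folded into the final copy back to pr and pi.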
148 void static_size_fft(int log2n, T)(T *pr, T *pi, T *table)
149 { 
150     enum n = 1 << log2n;
151     RepeatType!(T, n) ar, ai;
152 
153     foreach(i; ints_up_to!n)
154         ar[i] = pr[i];
155 
156     foreach(i; ints_up_to!n)
157         ai[i] = pi[i];
158     
159     foreach(i; powers_up_to!n)
160     {
161         enum m = n / i;
162         
163         auto tp = table;
164         foreach(j; ints_up_to!(n / m))
165         {
166             enum offset = m * j;
167             
168             T wr = tp[0];
169             T wi = tp[1];
170             tp += 2;
171             foreach(k1; ints_up_to!(m / 2))
172             {
173                 enum k2 = k1 + m / 2;
174                 static if(j == 0)
175                 {
176                     T tr = ar[offset + k2], ti = ai[offset + k2];
177                     T ur = ar[offset + k1], ui = ai[offset + k1];
178                 }
179                 else static if(j == 1)
180                 {
181                     T tr = ai[offset + k2], ti = -ar[offset + k2];
182                     T ur = ar[offset + k1], ui = ai[offset + k1];
183                 }
184                 else
185                 {
186                     T tmpr = ar[offset + k2], ti = ai[offset + k2];
187                     T ur = ar[offset + k1], ui = ai[offset + k1];
188                     T tr = tmpr*wr - ti*wi;
189                     ti = tmpr*wi + ti*wr;
190                 }
191                 ar[offset + k2] = ur - tr;
192                 ar[offset + k1] = ur + tr;                                                    
193                 ai[offset + k2] = ui - ti;                                                    
194                 ai[offset + k1] = ui + ti;
195             }
196         }
197     }
198 
199     foreach(i; ints_up_to!n)
200         pr[i] = ar[reverse_bits!(i, log2n)];
201 
202     foreach(i; ints_up_to!n)
203         pi[i] = ai[reverse_bits!(i, log2n)];
204 }
205 
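// The generic FFT implementation. V supplies the vector type and element-wise
// primitives (either a SIMD wrapper or the Scalar fallback above) and Options
// supplies tuning constants such as large_limit, log2_optimal_n and
// passes_per_recursive_call.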
206 struct FFT(V, Options)
207 {    
208 nothrow:
209 @nogc:
210     import core.bitop, core.stdc.stdlib;
211    
212     alias BitReverse!(V, Options) BR;
213     
214     alias V.vec_size vec_size;
215     alias V.T T;
216     alias V.vec vec;
217     alias FFT!(Scalar!T, Options) SFFT;
218   
219     import cmath = core.stdc.math;
220 
221     static if(is(T == float))
222     {
223         alias cmath.sinf _sin;
224         alias cmath.cosf _cos;
225         alias cmath.asinf _asin;
226     }
227     else static if(is(T == double))
228     {
229         alias cmath.sin _sin;
230         alias cmath.cos _cos;
231         alias cmath.asin _asin;
232     }
233     else static if(is(T == real))
234     {
235         alias cmath.sinl _sin;
236         alias cmath.cosl _cos;
237         alias cmath.asinl _asin;
238     }
239     else
240         static assert(0);
241 
242     template st(alias a){ enum st = cast(size_t) a; }
243 
244     alias Tuple!(T,T) Pair;
245     
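    // Repacks groups of vec_size (re, im) pairs so that, within each group,
    // all vec_size real parts are stored first, followed by the vec_size
    // imaginary parts.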
246     static void complex_array_to_vector()(Pair * pairs, size_t n)
247     {
248         for(size_t i=0; i<n; i += vec_size)
249         {
250           T[vec_size*2] buffer = void;
251           for(size_t j = 0; j < vec_size; j++)
252           {
253             buffer[j] = pairs[i+j][0];
254             buffer[j + vec_size] = pairs[i+j][1];
255           }
256           for(size_t j = 0; j < vec_size; j++)
257           {
258             pairs[i+j][0] = buffer[2*j];
259             pairs[i+j][1] = buffer[2*j+1];
260           }
261         }
262     }
263 
264     static int log2()(int a)
265     {
266         int r = 0;
267         while(a)
268         {
269             a >>= 1;
270             r++;
271         }
272         return r - 1;
273     }
274 
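    // Doubles a table of (cos, sin) pairs: each source pair is written to one
    // of two adjacent destination slots unchanged and, rotated by dphi, to the
    // other. computeEven selects which of the two slots receives the rotated
    // value.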
275     static void sines_cosines_refine(bool computeEven)(
276         Pair* src, Pair* dest, size_t n_from, T dphi)
277     {
278         T cdphi = _cos(dphi);
279         T sdphi = _sin(dphi);
280        
281         enum compute = computeEven ? 0 : 1;
282         enum copy = compute ^ 1;
283  
284         for(auto src_end = src + n_from; src < src_end; src++, dest += 2)
285         {
286             auto c = src[0][0];
287             auto s = src[0][1];
288             dest[copy][0] = c;
289             dest[copy][1] = s;
290             dest[compute][0] = c * cdphi - s * sdphi;   
291             dest[compute][1] = c * sdphi + s * cdphi;
292         }
293     }
294 
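    // Builds a table of n (cos, sin) pairs starting from the angle phi0,
    // repeatedly doubling the filled tail of r with sines_cosines_refine.
    // bit_reversed controls the refinement step and thus the order in which
    // the angles end up in the table.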
295     static void sines_cosines(bool phi0_is_last)(
296         Pair* r, size_t n, T phi0, T deltaphi, bool bit_reversed)
297     {
298         r[n - 1][0] = _cos(phi0);
299         r[n - 1][1] = _sin(phi0);
300         for(size_t len = 1; len < n; len *= 2)
301         {
302             auto denom = bit_reversed ? n / 2 / len : len;
303             sines_cosines_refine!phi0_is_last(
304                 r + n - len, r + n - 2 * len, len, deltaphi / 2 / denom);
305         }
306     } 
307     
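    // Computes the twiddle factor table for a transform of size 2 ^^ log2n.
    // Very small and very large transforms use a single bit-reversed table of
    // n / 2 twiddles. Otherwise one table per pass is stored: the last
    // log2(vec_size) passes get their twiddles repacked for vector access
    // (complex_array_to_vector), and pairs of the earlier passes are then
    // fused into (w, w^2, w^3) triplets so that fft_two_passes can read both
    // passes from a single table pointer.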
308     static void twiddle_table()(int log2n, Pair * r)
309     {
310         if(log2n >= Options.large_limit || log2n < 2 * log2(vec_size))
311         {
312             return sines_cosines!false(
313                 r, st!1 << (log2n - 1), 0.0, -2 * _asin(1), true);
314         }
315 
316         r++;
317 
318         auto p = r;
319         for (int s = 0; s < log2n; ++s)
320         {
321             size_t m2 = 1 << s;
322             
323             if(s < log2n - log2(vec_size))
324                 sines_cosines!false(p, m2, 0.0, -2 * _asin(1), true);
325             else
326             {
327                 sines_cosines!false(p, m2, 0.0, -2 * _asin(1), false);
328                 complex_array_to_vector(p, m2);
329             }
330             
331             p += m2;
332         }
333        
334         p = r;
335         for (int s = 0; s + 1 < log2n - log2(vec_size);  s += 2)
336         {
337             size_t m2 = 1 << s;
338             
339             foreach(i; 0 .. m2)
340                 // p[i] is p[m2 + 2 * i] ^^ 2. We store it here so that we 
341                 // don't need to recompute it below, which improves precision 
342                 // slightly.
343                 p[m2 + 2 * i + 1] = p[i]; 
344 
345             foreach(i; 0 .. m2)
346             {
347                 Pair a1 = p[m2 + 2 * i];
348                 Pair a2 = p[m2 + 2 * i + 1];
349                 Pair a3;
350                 
351                 a3[0] = a2[0] * a1[0] - a2[1] * a1[1];
352                 a3[1] = a2[0] * a1[1] + a2[1] * a1[0];
353                 
354                 p[3 * i] = a1;
355                 p[3 * i + 1] = a2;
356                 p[3 * i + 2] = a3;
357             }
358             
359             p += 3 * m2;
360         }
361     }
362     
363     alias void* Table;
364    
365     static size_t twiddle_table_size_bytes(int log2n)
366     {
367         auto compact = log2n >= Options.large_limit || 
368             log2n < 2 * log2(vec_size);
369 
370         return Pair.sizeof << (compact ? log2n - 1 : log2n); 
371     }
372  
373     static T* twiddle_table_ptr(void* p, int log2n)
374     { 
375         return cast(T*)p;
376     }
377     
378     static uint* br_table_ptr(void* p, int log2n)
379     {
380         return cast(uint*)(p + twiddle_table_size_bytes(log2n));
381     }
382     
383     static size_t table_size_bytes()(uint log2n)
384     {
385         uint log2nbr = log2n < Options.large_limit ? 
386             log2n : 2 * Options.log2_bitreverse_large_chunk_size;
387         
388         return 
389             twiddle_table_size_bytes(log2n) + 
390             BR.br_table_size(log2nbr) * uint.sizeof;
391     }
392     
393     static Table fft_table()(int log2n, void * p)
394     {   
395         if(log2n == 0)
396             return p;
397         //else if(log2n <= log2(vec_size))
398          //   return SFFT.fft_table(log2n, p);
399         
400         Table tables = p;
401         
402         twiddle_table(log2n, cast(Pair *)(twiddle_table_ptr(tables, log2n)));
403         
404         if(log2n < V.log2_bitreverse_chunk_size * 2)
405         {
406         }
407         else if(log2n < Options.large_limit)
408         {
409             BR.init_br_table(br_table_ptr(tables, log2n), log2n);
410         }
411         else
412         {
413             enum log2size = 2*Options.log2_bitreverse_large_chunk_size;
414             BR.init_br_table(br_table_ptr(tables, log2n), log2size);
415         }
416         return tables;
417     }
418     
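    // Radix-2 passes over data that has already been bit reversed. A full
    // vector of twiddles is loaded per butterfly; used by fft_small for the
    // last few passes after the bit reversal.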
419     static void fft_passes_bit_reversed()(vec* re, vec* im, size_t N , 
420         vec* table, size_t start_stride = 1)
421     {
422         table += start_stride + start_stride;
423         vec* pend = re + N;
424         for (size_t m2 = start_stride; m2 < N ; m2 <<= 1)
425         {      
426             size_t m = m2 + m2;
427             for(
428                 vec* pr = re, pi = im; 
429                 pr < pend ;
430                 pr += m, pi += m)
431             {
432                 for (size_t k1 = 0, k2 = m2; k1<m2; k1++, k2 ++) 
433                 {  
434                     vec wr = table[2*k1], wi = table[2*k1+1];                       
435 
436                     vec tmpr = pr[k2], ti = pi[k2];
437                     vec ur = pr[k1], ui = pi[k1];
438                     vec tr = tmpr*wr - ti*wi;
439                     ti = tmpr*wi + ti*wr;
440                     pr[k2] = ur - tr;
441                     pr[k1] = ur + tr;                                                    
442                     pi[k2] = ui - ti;                                                    
443                     pi[k1] = ui + ti;
444                 }
445             }
446             table += m;
447         }
448     }
449     
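    // The first two radix-2 passes fused together. All twiddle factors in
    // these passes are 1 or -i, so no table lookups are needed.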
450     static void first_fft_passes()(vec* pr, vec* pi, size_t n)
451     {
452         size_t i0 = 0, i1 = i0 + n/4, i2 = i1 + n/4, i3 = i2 + n/4, iend = i1;
453 
454         for(; i0 < iend; i0++, i1++, i2++, i3++)
455         {
456             vec tr = pr[i2], ti = pi[i2];
457             vec ur = pr[i0], ui = pi[i0];
458             vec ar0 = ur + tr;
459             vec ar2 = ur - tr;
460             vec ai0 = ui + ti;
461             vec ai2 = ui - ti;
462 
463             tr = pr[i3], ti = pi[i3];
464             ur = pr[i1], ui = pi[i1];
465             vec ar1 = ur + tr;
466             vec ar3 = ur - tr;
467             vec ai1 = ui + ti;
468             vec ai3 = ui - ti;
469 
470             pr[i0] = ar0 + ar1;
471             pr[i1] = ar0 - ar1;
472             pi[i0] = ai0 + ai1;
473             pi[i1] = ai0 - ai1;
474 
475             pr[i2] = ar2 + ai3;
476             pr[i3] = ar2 - ai3;
477             pi[i2] = ai2 - ar3;
478             pi[i3] = ai2 + ar3;      
479         }
480     }
481         
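    // A single radix-2 pass: each block of 2 * m2 elements uses one twiddle
    // factor, read from the table and broadcast across the vector lanes.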
482     static void fft_pass()(vec *pr, vec *pi, vec *pend, T *table, size_t m2)
483     {
484         size_t m = m2 + m2;
485         for(; pr < pend ; pr += m, pi += m)
486         {
487             vec wr = V.scalar_to_vector(table[0]);
488             vec wi = V.scalar_to_vector(table[1]);
489             table += 2;
490             for (size_t k1 = 0, k2 = m2; k1<m2; k1++, k2 ++) 
491             { 
492                 vec tmpr = pr[k2], ti = pi[k2];
493                 vec ur = pr[k1], ui = pi[k1];
494                 vec tr = tmpr*wr - ti*wi;
495                 ti = tmpr*wi + ti*wr;
496                 pr[k2] = ur - tr;
497                 pr[k1] = ur + tr;                                                    
498                 pi[k2] = ui - ti;                                                    
499                 pi[k1] = ui + ti;
500             }
501         }
502     }
503     
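    // Two radix-2 passes fused into one radix-4 pass over blocks of 2 * m2
    // elements. The three twiddles w1, w2 and w3 = w1 * w2 are either read as
    // triplets from a single table (see twiddle_table) or derived from two
    // separate table pointers.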
504     static void fft_two_passes(Tab...)(
505         vec *pr, vec *pi, vec *pend, size_t m2, Tab tab)
506     {
        // When this function is called with tab.length == 2 on DMD, it
        // sometimes gives an incorrect result (for example when building with
        // SSE on 64 bit Linux and running test_float pfft "14"), so let's
        // use fft_pass instead.
511     
512         // Disabled work-around
513 
514         version(DigitalMars)
515         {
516             static if (tab.length == 2)
517                 enum workaroundTabLength = true;
518             else
519                 enum workaroundTabLength = false;
520         }
521         else
522             enum workaroundTabLength = false;
523         
524         static if (workaroundTabLength)
525         {
526             fft_pass(pr, pi, pend, tab[0], m2);
527             fft_pass(pr, pi, pend, tab[1], m2 / 2);
528         }
529         else
530         {
531             size_t m = m2 + m2;
532             size_t m4 = m2 / 2;
533             for(; pr < pend ; pr += m, pi += m)
534             {
535                 static if(tab.length == 2)
536                 {
537                     vec w1r = V.scalar_to_vector(tab[1][0]);
538                     vec w1i = V.scalar_to_vector(tab[1][1]);
539 
540                     vec w2r = V.scalar_to_vector(tab[0][0]);
541                     vec w2i = V.scalar_to_vector(tab[0][1]);
542 
543                     vec w3r = w1r * w2r - w1i * w2i;
544                     vec w3i = w1r * w2i + w1i * w2r;
545 
546                     tab[0] += 2;
547                     tab[1] += 4;
548                 }
549                 else
550                 {
551                     vec w1r = V.scalar_to_vector(tab[0][0]);
552                     vec w1i = V.scalar_to_vector(tab[0][1]);
553 
554                     vec w2r = V.scalar_to_vector(tab[0][2]);
555                     vec w2i = V.scalar_to_vector(tab[0][3]);
556 
557                     vec w3r = V.scalar_to_vector(tab[0][4]);
558                     vec w3i = V.scalar_to_vector(tab[0][5]);
559             
560                     tab[0] += 6;
561                 }
562             
563                 for (
564                     size_t k0 = 0, k1 = m4, k2 = m2, k3 = m2 + m4; 
565                     k0<m4; k0++, 
566                     k1++, k2++, k3++) 
567                 {                 
568                     vec tr, ur, ti, ui;
569                 
570                     vec r0 = pr[k0];
571                     vec r1 = pr[k1];
572                     vec r2 = pr[k2];
573                     vec r3 = pr[k3];
574                 
575                     vec i0 = pi[k0];
576                     vec i1 = pi[k1];
577                     vec i2 = pi[k2];
578                     vec i3 = pi[k3];
579                 
580                     tr = r2 * w2r - i2 * w2i;
581                     ti = r2 * w2i + i2 * w2r;
582                     r2 = r0 - tr;
583                     i2 = i0 - ti;
584                     r0 = r0 + tr;
585                     i0 = i0 + ti;
586                 
587                     tr = r3 * w3r - i3 * w3i;
588                     ti = r3 * w3i + i3 * w3r;
589                     ur = r1 * w1r - i1 * w1i;
590                     ui = r1 * w1i + i1 * w1r;
591                     r3 = ur - tr;
592                     i3 = ui - ti;
593                     r1 = ur + tr;
594                     i1 = ui + ti;
595                 
596                     tr = r1;
597                     ti = i1;
598                     r1 = r0 - tr;
599                     i1 = i0 - ti;
600                     r0 = r0 + tr;
601                     i0 = i0 + ti;
602                 
603                     tr = i3;
604                     ti = r3;                // take minus into account later
605                     r3 = r2 - tr;
606                     i3 = i2 + ti;
607                     r2 = r2 + tr;
608                     i2 = i2 - ti;
609                 
610                     pr[k0] = r0;
611                     pr[k1] = r1;
612                     pr[k2] = r2;
613                     pr[k3] = r3;
614                 
615                     pi[k0] = i0;
616                     pi[k1] = i1;
617                     pi[k2] = i2;
618                     pi[k3] = i3;
619                 }
620             }
621         }
622     }
623     
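    // Runs all passes that operate on whole vectors: the table-free first two
    // passes, then fused double passes, and a final single pass when the
    // remaining pass count is odd. compact_table selects whether the table
    // pointer advances from one pass to the next.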
624     static void fft_passes(bool compact_table)(
625         vec* re, vec* im, size_t N , T* table)
626     {
627         vec * pend = re + N;
628 
629         size_t tableRowLen = 2;
630         size_t m2 = N/2;
631 
632         static nextRow(ref T* table, ref size_t len)
633         {
634             static if(!compact_table)
635             {
636                 table += len;
637                 len += len;
638             }
639         }
640 
641         if(m2 > 1)
642         {
643             first_fft_passes(re, im, N);
644             
645             m2 >>= 2;
646 
647             nextRow(table, tableRowLen);
648             nextRow(table, tableRowLen);
649         }
650        	
651         for (; m2 > 1 ; m2 >>= 2)
652         {
653             static if(compact_table)
654                 fft_two_passes(re, im, pend, m2, table, table);
655             else
656                 fft_two_passes(re, im, pend, m2, table);
657 
658             nextRow(table, tableRowLen);
659             nextRow(table, tableRowLen);
660         }
661         
662         if(m2 != 0)
663             fft_pass(re, im, pend, table, m2);
664     }
665     
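    // Handles the last log2(vec_size) passes, in which butterflies pair up
    // elements that live within the same vector. If V provides the transpose
    // and interleave primitives, the elements are shuffled so the work stays
    // vectorized; otherwise these passes fall back to the scalar
    // implementation (SFFT).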
666     static void fft_passes_fractional()(
667         vec * pr, vec * pi, vec * pend, 
668         T * table, size_t tableI)
669     {
670         static if(is(typeof(V.transpose!2)))
671         {
672             for(; pr < pend; pr += 2, pi += 2, tableI += 4)
673             {
674                 auto ar = pr[0];
675                 auto ai = pi[0];
676                 auto br = pr[1];
677                 auto bi = pi[1];
678                 
679                 foreach(i; ints_up_to!(log2(vec_size)))
680                 {
681                     vec wr, wi, ur, ui;
682 
683                     V.complex_array_to_real_imag_vec!(2 << i)(
684                         table + (tableI << i), wr, wi);
685                     
686                     V.transpose!(2 << i)(ar, br, ur, br);
687                     V.transpose!(2 << i)(ai, bi, ui, bi);
688 
689                     auto tr = br * wr - bi * wi;
690                     auto ti = bi * wr + br * wi;
691 
692                     ar = ur + tr;
693                     br = ur - tr;
694                     ai = ui + ti;
695                     bi = ui - ti;
696                 }  
697                 
698                 V.interleave(ar, br, pr[0], pr[1]); 
699                 V.interleave(ai, bi, pi[0], pi[1]); 
700             }
701         }
702         else
703             for (size_t m2 = vec_size >> 1; m2 > 0 ; m2 >>= 1)
704             {
705                 SFFT.fft_pass(
706                     cast(T*) pr, cast(T*) pi, cast(T*)pend, table + tableI, m2);
707                 
708                 tableI *= 2;
709             }
710     }
711   
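    // Copies l chunks of chunk_size vectors, spaced stride apart, into a
    // contiguous buffer, runs nPasses passes on that buffer and copies the
    // results back. Working on a small contiguous buffer keeps the working
    // set cache friendly for large transforms.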
712     // bug_killer below is a dummy parameter which apparently causes the DMD 
713     // stack alignment bug to go away. 
714     static void fft_passes_strided(int l, int chunk_size)(
715         vec * pr, vec * pi, size_t N , 
716         ref T * table, ref size_t tableI, void* bug_killer, 
717         size_t stride, int nPasses)
718     {
719         ubyte[aligned_size!vec(l * chunk_size, 64)] rmem = void;
720         ubyte[aligned_size!vec(l * chunk_size, 64)] imem = void;
721         
722         auto rbuf = aligned_ptr!vec(rmem.ptr, 64);
723         auto ibuf = aligned_ptr!vec(imem.ptr, 64);
724       
725         BR.strided_copy!(chunk_size)(rbuf, pr, chunk_size, stride, l);
726         BR.strided_copy!(chunk_size)(ibuf, pi, chunk_size, stride, l);
727         
728         size_t m2 = l*chunk_size/2;
729         size_t m2_limit = m2>>nPasses;
730 
731         if(tableI  == 0 && nPasses >= 2)
732         {
733             first_fft_passes(rbuf, ibuf, l*chunk_size);
734             m2 >>= 1;
735             tableI *= 2;
736             m2 >>= 1;
737             tableI *= 2;
738         }
739        
740         for(; m2 > 2 * m2_limit; m2 >>= 2)
741         {
742             fft_two_passes(rbuf, ibuf, rbuf + l*chunk_size, m2, 
743                 table + tableI, table + 2 * tableI);
744 
745             tableI *= 4;
746         }
747         
748         if(m2 != m2_limit)
749         {
750             fft_pass(rbuf, ibuf, rbuf + l*chunk_size, table + tableI, m2);
751             tableI *= 2;
752         }
753       
754         BR.strided_copy!(chunk_size)(pr, rbuf, stride, chunk_size, l);
755         BR.strided_copy!(chunk_size)(pi, ibuf, stride, chunk_size, l);
756     }
757     
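    // Recursive driver for large transforms: sizes that fit the optimal work
    // size (Options.log2_optimal_n) are processed directly; larger ones first
    // get a few strided passes over column chunks (fft_passes_strided) and the
    // resulting sub-transforms are then handled recursively.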
758     static void fft_passes_recursive()(
759         vec * pr, vec *  pi, size_t N , 
760         T * table, size_t tableI)
761     {
762         if(N <= (1<<Options.log2_optimal_n))
763         {
764             size_t m2 = N >> 1;
765             
766             for (; m2 > 1 ; m2 >>= 2)
767             {
768                 fft_two_passes(pr, pi, pr + N, m2, table + tableI, 
769                     table + 2 * tableI);
770                 
771                 tableI *= 4;
772             }
773 
774             if(m2 != 0)
775             {
776                 fft_pass(pr, pi, pr + N, table + tableI, m2);
777                 tableI *= 2;
778             }
779             
780             fft_passes_fractional(pr, pi, pr + N, table, tableI);
781 
782             return;
783         }
784    
785         enum log2l =  Options.passes_per_recursive_call, l = 1 << log2l;
786         enum chunk_size = 1UL << Options.log2_recursive_passes_chunk_size;
787 
788         int log2n = bsf(N);
789 
790         int nPasses = log2n > log2l + Options.log2_optimal_n ?
791             log2l : log2n - Options.log2_optimal_n;
792 
793         nPasses = (nPasses & 1) && !(log2l & 1)  ? nPasses + 1 : nPasses;
794 
795         int log2m = log2n - log2l;
796         size_t m = st!1 << log2m;
797         
798         size_t tableIOld = tableI;
799 
800         for(size_t i=0; i < m; i += chunk_size)
801         {
802             tableI = tableIOld;
803 
804             fft_passes_strided!(l, chunk_size)(
805                 pr + i, pi + i, N, table, tableI, null, m, nPasses);
806         }
807 
808         {
809             size_t nextN = (N>>nPasses);
810 
811             for(int i = 0; i<(1<<nPasses); i++)
812                 fft_passes_recursive(
813                     pr + nextN*i, pi  + nextN*i, nextN, 
814                     table, tableI + 2*i);
815         }
816     }
817    
818     static void bit_reverse_small_two(int minLog2n)(
819         T* re, T* im, int log2n, uint* brTable)
820     {
821         enum l = V.log2_bitreverse_chunk_size;
822         
823         static if(minLog2n < 2 * l)
824         {
825             if(log2n < 2 * l)
826             {
827                 // only works for log2n < 2 * l
828                 bit_reverse_tiny!(2 * l)(re, log2n);
829                 bit_reverse_tiny!(2 * l)(im, log2n);
830             }
831             else
832             {
833                 BR.bit_reverse_small(re, log2n, brTable); 
834                 BR.bit_reverse_small(im, log2n, brTable);
835             }
836         }
837         else                                                            
838         {
            // we already know that log2n >= 2 * l here.
840             BR.bit_reverse_small(re, log2n, brTable); 
841             BR.bit_reverse_small(im, log2n, brTable);
842         }   
843     }
844 
845     static auto v(T* p){ return cast(vec*) p; }
846 
847     static void fft_tiny()(T * re, T * im, int log2n, Table tables)
848     {
849         // assert(log2n > log2(vec_size));
850         
851         auto N = st!1 << log2n;
852         fft_passes!(true)(v(re), v(im), N / vec_size, 
853             twiddle_table_ptr(tables, log2n));
854         
855         fft_passes_fractional(
856             v(re), v(im), v(re) + N / vec_size,
857             twiddle_table_ptr(tables, log2n), 0);
858 
859         bit_reverse_small_two!(log2(vec_size) + 1)(
860             re, im, log2n, br_table_ptr(tables, log2n));
861     }
862 
863     static void fft_small()(T * re, T * im, int log2n, Table tables)
864     {
865         // assert(log2n >= 2*log2(vec_size));
866         
867         size_t N = (1<<log2n);
868         
869         fft_passes!false(
870             v(re), v(im), N / vec_size, 
871             twiddle_table_ptr(tables, log2n) + 2);
872         
873         bit_reverse_small_two!(2 * log2(vec_size))(
874             re, im, log2n, br_table_ptr(tables, log2n));
875 
876         static if(vec_size > 1) 
877             fft_passes_bit_reversed(
878                 v(re), v(im) , N / vec_size, 
879                 cast(vec*) twiddle_table_ptr(tables, log2n), 
880                 N / vec_size/vec_size);
881     }
882     
883     static void fft_large()(T * re, T * im, int log2n, Table tables)
884     {
885         size_t N = (1<<log2n);
886         
887         fft_passes_recursive(
888             v(re), v(im), N / vec_size, 
889             twiddle_table_ptr(tables, log2n), 0);
890         
891         BR.bit_reverse_large(re, log2n, br_table_ptr(tables, log2n)); 
892         BR.bit_reverse_large(im, log2n, br_table_ptr(tables, log2n));
893     }
894     
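    // Top-level complex FFT dispatch: fully unrolled code for the smallest
    // sizes, fft_tiny / fft_small for medium sizes and fft_large above
    // Options.large_limit.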
895     static void fft()(T * re, T * im, int log2n, Table tables)
896     {
897         foreach(i; ints_up_to!(log2(vec_size) + 1))
898             if(i == log2n)
899                 return static_size_fft!i(re, im, twiddle_table_ptr(tables, i));
900 
901         if(log2n < 2 * log2(vec_size))
902             return fft_tiny(re, im, log2n, tables);
903         else if( log2n < Options.large_limit || disable_large)
904             return fft_small(re, im, log2n, tables);
905         else 
906             static if(!disable_large)
907                 fft_large(re, im, log2n, tables);
908     }
909   
910     alias T* RTable;
911  
912     static auto rtable_size_bytes()(int log2n)
913     {
914         return T.sizeof << (log2n - 1);
915     }
916 
917     enum supports_real = is(typeof(
918     {
919         T a;
920         vec v = V.unaligned_load(&a);
921         v = V.reverse(v);
922         V.unaligned_store(&a, v);
923     }));
924 
925     static RTable rfft_table()(int log2n, void *p) if(supports_real)
926     {
927         if(log2n < 2)
928             return cast(RTable) p;
929         else if(st!1 << log2n < 4 * vec_size)
930             return SFFT.rfft_table(log2n, p);
931 
932         auto r = (cast(Pair*) p)[0 .. (st!1 << (log2n - 2))];
933 
934         auto phi = _asin(1);
935         sines_cosines!true(r.ptr, r.length, -phi, phi, false);
936 
937         /*foreach(size_t i, ref e; r)
938         {
939             T phi = - (_asin(1.0) * (i + 1)) / r.length;
940  
941             e[0] = _cos(phi);
942             e[1] = _sin(phi);
943         }*/
944         
945         complex_array_to_vector(r.ptr, r.length);
946 
947         return cast(RTable) r.ptr;
948     }
949 
950     static auto rfft_table()(int log2n, void *p) if(!supports_real)
951     {
952         return SFFT.rfft_table(log2n, p);
953     }
954 
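    // FFT of a real sequence, computed as a complex FFT of half the length
    // followed by an untangling pass (rfft_last_pass). The real input is
    // passed split across rr and ri (see deinterleave_array) and the result
    // is returned in place in the same two arrays.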
955     static void rfft()(
956         T* rr, T* ri, int log2n, Table table, RTable rtable) 
957     {
958         if(log2n == 0)
959             return;
960         else if(log2n == 1)
961         {
962             auto rr0 = rr[0], ri0 = ri[0];
963             rr[0] = rr0 + ri0;
964             ri[0] = rr0 - ri0;
965             return;
966         }
967 
968         fft(rr, ri, log2n - 1, table);
969         rfft_last_pass!false(rr, ri, log2n, rtable);
970     }
971 
972     static void irfft()(
973         T* rr, T* ri, int log2n, Table table, RTable rtable) 
974     {
975         if(log2n == 0)
976             return;
977         else if(log2n == 1)
978         {
            // We don't multiply by 0.5 here because we want the inverse to
            // be scaled by 2.
981 
982             auto rr0 = rr[0], ri0 = ri[0];
983             rr[0] = (rr0 + ri0);
984             ri[0] = (rr0 - ri0);
985             return;
986         }
987 
988         rfft_last_pass!true(rr, ri, log2n, rtable);
989         fft(ri, rr, log2n - 1, table);
990     }
991 
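    // Converts the result of the half-length complex FFT into the spectrum of
    // the real signal (or performs the reverse step when inverse is true) by
    // combining each bin with its mirror bin using the twiddles in rtable.
    // The first and the middle bins are handled separately after the loop.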
992     static void rfft_last_pass(bool inverse)(T* rr, T* ri, int log2n, RTable rtable) 
993     if(supports_real)
994     {
995         if(st!1 << log2n < 4 * vec_size)
996             return SFFT.rfft_last_pass!inverse(rr, ri, log2n, rtable);       
997  
998         static vec* v(T* a){ return cast(vec*) a; }
999 
1000         auto n = st!1 << log2n;
1001 
1002         vec half = V.scalar_to_vector(cast(T) 0.5);
1003 
1004         T middle_r = rr[n / 4];        
1005         T middle_i = ri[n / 4];        
1006 
1007         for(
1008             size_t i0 = 1, i1 = n / 2 - vec_size, iw = 0; 
1009             i0 <= i1; 
1010             i0 += vec_size, i1 -= vec_size, iw += 2*vec_size)
1011         {
1012             vec wr = *v(rtable + iw);
1013             vec wi = *v(rtable + iw + vec_size);
1014 
1015             vec r0r = V.unaligned_load(&rr[i0]);
1016             vec r0i = V.unaligned_load(&ri[i0]);
1017             vec r1r = V.reverse(*v(rr + i1));
1018             vec r1i = V.reverse(*v(ri + i1));
1019 
1020             vec ar = r0r + r1r;
1021             vec ai = r1i - r0i;
1022             vec br = r0r - r1r;
1023             vec bi = r0i + r1i;
1024 
1025             static if(inverse) 
1026             {
                // We use -w* instead of w in this case and we do not divide by 2.
                // That is because we want the inverse to be scaled by n, as it
                // is in the complex case, and not just by n / 2.
1030 
1031                 vec tmp = br * wi - bi * wr;
1032                 br = bi * wi + br * wr;
1033                 bi = tmp;
1034             }
1035             else
1036             {
1037                 ar *= half;
1038                 ai *= half;
1039                 br *= half;
1040                 bi *= half;
1041                 vec tmp = br * wi + bi * wr;
1042                 br = bi * wi - br * wr;
1043                 bi = tmp;
1044             }
1045 
1046             V.unaligned_store(rr + i0, ar + bi);
1047             V.unaligned_store(ri + i0, br - ai);
1048 
1049             *v(rr + i1) = V.reverse(ar - bi);
1050             *v(ri + i1) = V.reverse(ai + br);
1051         }
1052         
1053         // fixes the aliasing bug:
1054         rr[n / 4] = inverse ? middle_r + middle_r : middle_r; 
1055         ri[n / 4] = -(inverse ? middle_i + middle_i : middle_i);
1056         
1057 
1058         {
            // When calculating the inverse we would need to multiply by 0.5
            // here to get an exact inverse. We don't do that because we
            // actually want the inverse to be scaled by 2.
1062     
1063             auto r0r = rr[0];
1064             auto r0i = ri[0];
1065             
1066             rr[0] = r0r + r0i;
1067             ri[0] = r0r - r0i;
1068         }
1069     }
1070     
1071     static void rfft_last_pass(bool inverse)(T* rr, T* ri, int log2n, RTable rtable) 
1072     if(!supports_real)
1073     {
1074         SFFT.rfft_last_pass!inverse(rr, ri, log2n, rtable); 
1075     }
1076 
1077     static void interleave_array()(T* even, T* odd, T* interleaved, size_t n)
1078     {
1079         static if(is(typeof(V.interleave)))
1080         {
1081             if(n < vec_size)
1082                 SFFT.interleave_array(even, odd, interleaved, n);
1083             else
1084                 foreach(i; 0 .. n / vec_size)
1085                     V.interleave(
1086                         (cast(vec*)even)[i], 
1087                         (cast(vec*)odd)[i], 
1088                         (cast(vec*)interleaved)[i * 2], 
1089                         (cast(vec*)interleaved)[i * 2 + 1]);
1090         }
1091         else
1092             foreach(i; 0 .. n)
1093             {
1094                 interleaved[i * 2] = even[i];
1095                 interleaved[i * 2 + 1] = odd[i];
1096             }
1097     }
1098     
1099     static void deinterleave_array()(T* even, T* odd, T* interleaved, size_t n)
1100     {
1101         static if(is(typeof(V.deinterleave)))
1102         {
1103             if(n < vec_size)
1104                 SFFT.deinterleave_array(even, odd, interleaved, n);
1105             else
1106                 foreach(i; 0 .. n / vec_size)
1107                     V.deinterleave(
1108                         (cast(vec*)interleaved)[i * 2], 
1109                         (cast(vec*)interleaved)[i * 2 + 1], 
1110                         (cast(vec*)even)[i], 
1111                         (cast(vec*)odd)[i]);
1112         }
1113         else
1114             foreach(i; 0 .. n)
1115             {
1116                 even[i] = interleaved[i * 2];
1117                 odd[i] = interleaved[i * 2 + 1];
1118             }
1119     }
1120 
1121     alias bool* ITable;
1122     
1123     alias Interleave!(V, 8, false).itable_size_bytes itable_size_bytes;
1124     alias Interleave!(V, 8, false).interleave_table interleave_table;
1125     alias Interleave!(V, 8, false).interleave interleave;
1126     alias Interleave!(V, 8, true).interleave deinterleave;
1127 
1128     static void scale(T* data, size_t n, T factor)
1129     {
1130         auto k  = V.scalar_to_vector(factor);
1131         
1132         foreach(ref e; (cast(vec*) data)[0 .. n / vec_size])
1133             e = e * k;
1134 
1135         foreach(ref e;  data[ n & ~(vec_size - 1) .. n])
1136             e = e * factor;
1137     }
1138 
1139     static size_t alignment(size_t n)
1140     {
1141         static if(is(typeof(Options.prefered_alignment)) && 
1142             Options.prefered_alignment > vec.sizeof)
1143         {
1144             enum a = Options.prefered_alignment;
1145         } 
1146         else
1147             enum a = vec.sizeof; 
1148         
1149         auto bytes = T.sizeof << bsr(n);
1150         
1151         bytes = bytes < a ? bytes : a;
1152         return bytes > (void*).sizeof && (bytes & (bytes - 1)) == 0 ? 
1153             bytes : (void*).sizeof;
1154     }
1155 }
1156 
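// Intended to be mixed into a module that defines FFTs (a tuple of FFT
// instantiations) and implementation (the index of the one to use at run
// time). It provides the public API; selected!"name" forwards each call to
// the chosen instantiation, casting the arguments to its parameter types.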
1157 mixin template Instantiate()
1158 {
1159 nothrow:
1160 @nogc:
1161     struct TableValue{};
1162     alias TableValue* Table;
1163 
1164     struct RTableValue{};
1165     alias RTableValue* RTable;
1166     
1167     struct ITableValue{};
1168     alias ITableValue* ITable;
1169 
1170     template selected(string func_name, Ret...)
1171     {
1172         auto selected(A...)(A args)
1173         {
1174             auto impl = implementation;
1175             foreach(i, F; FFTs)
1176                 if(i == impl)
1177                 {
1178                     mixin("alias F." ~ func_name ~ " func;");
1179 
1180                     ParamTypeTuple!(func).type fargs;
1181 
1182                     foreach(j, _; fargs)
1183                         fargs[j] = cast(typeof(fargs[j])) args[j];
1184 
1185                     static if(Ret.length == 0)
1186                         return func(fargs);
1187                     else
1188                         return cast(Ret[0]) func(fargs);
1189                 }
1190           
1191             assert(false);
1192         }
1193     }
1194 
1195     alias FFTs[0] FFT0;
1196     alias FFT0.T T;
1197 
1198     void fft(T* re, T* im, uint log2n, Table t)
1199     {
1200         selected!"fft"(re, im, log2n, cast(FFT0.Table) t);
1201     }
1202 
1203     Table fft_table(uint log2n, void* p = null)
1204     {
1205         return selected!("fft_table", Table)(log2n, p);
1206     }
1207 
1208     size_t table_size_bytes(uint log2n)
1209     {
1210         return selected!"table_size_bytes"(log2n);
1211     }
1212 
1213     void scale(T* data, size_t n, T factor)
1214     {
1215         selected!"scale"(data, n, factor); 
1216     }
1217 
1218     size_t alignment(size_t n)
1219     {
1220         return selected!"alignment"(n);
1221     }
1222 
1223     void rfft(T* re, T* im, uint log2n, Table t, RTable rt)
1224     {
1225         selected!"rfft"(re, im, log2n, cast(FFT0.Table) t, cast(FFT0.RTable) rt);
1226     }
1227 
1228     void irfft(T* re, T* im, uint log2n, Table t, RTable rt)
1229     {
1230         selected!"irfft"(re, im, log2n, cast(FFT0.Table) t, cast(FFT0.RTable) rt);
1231     }
1232 
1233     RTable rfft_table(uint log2n, void* p = null)
1234     {
1235         return selected!("rfft_table", RTable)(log2n, p);
1236     }
1237 
1238     size_t rtable_size_bytes(int log2n)
1239     {
1240         return selected!"rtable_size_bytes"(log2n);
1241     }
1242 
1243     void deinterleave_array(T* even, T* odd, T* interleaved, size_t n)
1244     {
1245         selected!"deinterleave_array"(even, odd, interleaved, n);
1246     }
1247 
1248     void interleave_array(T* even, T* odd, T* interleaved, size_t n)
1249     {
1250         selected!"interleave_array"(even, odd, interleaved, n);
1251     }
1252 
1253     size_t itable_size_bytes(uint log2n)
1254     {
1255         return selected!"itable_size_bytes"(log2n);
1256     }
1257 
1258     ITable interleave_table(uint log2n, void* p)
1259     {
1260         return selected!("interleave_table", ITable)(log2n, p);
1261     }
1262 
1263     void interleave(T* p, uint log2n, ITable table)
1264     {
1265         selected!"interleave"(p, log2n, cast(FFT0.ITable) table);  
1266     }
1267 
1268     void deinterleave(T* p, uint log2n, ITable table)
1269     {
1270         selected!"deinterleave"(p, log2n, cast(FFT0.ITable) table);  
1271     }
1272 }