1 //          Copyright Jernej Krempuš 2012
2 // Distributed under the Boost Software License, Version 1.0.
3 //    (See accompanying file LICENSE_1_0.txt or copy at
4 //          http://www.boost.org/LICENSE_1_0.txt)
5 
6 module dplug.fft.shuffle;
7 
8 import core.bitop;
9 
10 nothrow:
11 @nogc:
12 
13 template st(alias a){ enum st = cast(size_t) a; }
14 
15 void _swap(T)(ref T a, ref T b)
16 {
17     auto aa = a;
18     auto bb = b;
19     b = aa;
20     a = bb;
21 }
22 
23 template ints_up_to(int n, T...)
24 {
25     static if(n)
26     {
27         alias ints_up_to!(n-1, n-1, T) ints_up_to;
28     }
29     else
30         alias T ints_up_to;
31 }
32 
33 template powers_up_to(int n, T...)
34 {
35     static if(n > 1)
36     {
37         alias powers_up_to!(n / 2, n / 2, T) powers_up_to;
38     }
39     else
40         alias T powers_up_to;
41 }
42 
43 template RepeatType(T, int n, R...)
44 {
45     static if(n == 0)
46         alias R RepeatType;
47     else
48         alias RepeatType!(T, n - 1, T, R) RepeatType;
49 }
50 
51 void iter_bit_reversed_pairs(alias dg, A...)(int log2n, A args)
52 {
53     int mask = (0xffffffff<<(log2n));
54     uint i2 = ~mask; 
55     uint i1 = i2;
56 
57     while(i1 != (0U - 1U))
58     {
59         dg(i1, i2, args);
60         i2 = mask ^ (i2 ^ (mask>>(bsf(i1)+1)));
61         --i1;
62     }
63 }
64 
65 void bit_reverse_simple(T)(T* p, int log2n)
66 {
67     static void loopBody(int i0, int i1, T* p)
68     {
69         if(i1 > i0)
70             _swap(p[i0],p[i1]);
71     }
72 
73     iter_bit_reversed_pairs!loopBody(log2n, p);
74 }
75 
76 template reverse_bits(int i, int bits_left, int r = 0)
77 {
78     static if(bits_left == 0)
79         enum reverse_bits = r;
80     else
81         enum reverse_bits = reverse_bits!(
82             i >> 1, bits_left - 1, (r << 1) | (i & 1));
83 }
84 
85 auto bit_reverse_static_size(int log2n, T, S...)(T* p, S stride) 
86     if(S.length <= 1) 
87 {
88     enum n = 1 << log2n;
89     enum l = 1 << (log2n / 2);   
90     
91     static if(stride.length == 1)
92         auto index(size_t i)(S s){ return i / l * s[0] + i % l; } 
93     else
94         auto index(size_t i)(S s){ return i; }
95 
96     RepeatType!(T, n) a;
97     
98     foreach(i, _; a)
99         static if(i != reverse_bits!(i, log2n)) 
100             a[i] = p[index!i(stride)];
101     
102     foreach(i, _; a)
103         static if(i != reverse_bits!(i, log2n)) 
104             p[index!i(stride)] = a[reverse_bits!(i, log2n)];
105 }
106 
107 auto bit_reverse_tiny(int max_log2n, T)(T* p, int log2n)
108 {
109     switch(log2n)
110     {
111         foreach(i; ints_up_to!max_log2n)
112         {
113             case i:
114                 bit_reverse_static_size!i(p);
115                 break;
116         }
117         
118         default:
119     }
120 }
121 
122 void bit_reverse_step(size_t chunk_size, T)(T* p, size_t nchunks)
123 {
124     for(size_t i = chunk_size, j = (nchunks >> 1) * chunk_size; 
125         j < nchunks * chunk_size; 
126         j += chunk_size*2, i += chunk_size*2)
127     {        
128         foreach(k; ints_up_to!chunk_size)
129             _swap(p[i + k], p[j + k]);
130     }
131 }
132 
133 auto aligned_ptr(T, U)(U * ptr, size_t alignment)
134 {
135     return cast(T*)(((cast(size_t)ptr) + alignment) & ~(alignment - 1UL));
136 }
137 
138 auto aligned_size(T)(size_t size, size_t alignment)
139 {
140     return size * T.sizeof + alignment;
141 }
142 
143 struct BitReverse(alias V, Options)
144 {
145     alias V.T T;
146     alias V.vec vec;
147     
148     static size_t br_table_size()(int log2n)
149     {
150         enum log2l = V.log2_bitreverse_chunk_size;
151  
152         return log2n < log2l * 2 ? 0 : (1 << (log2n - 2 * log2l)) + 2 * log2l;
153     }
154     
155     static void init_br_table()(uint* table, int log2n)
156     {
157         enum log2l = V.log2_bitreverse_chunk_size;
158 
159         static void loopBody0(int i0, int i1, uint** p)
160         {
161             if(i1 == i0)
162                 (**p = i0 << log2l), (*p)++;
163         }
164         iter_bit_reversed_pairs!loopBody0(log2n - 2 * log2l, &table);
165 
166         static void loopBody1(int i0, int i1, uint** p)
167         {
168             if(i1 < i0)
169             {
170                 **p = i0 << log2l;
171                 (*p)++;
172                 **p = i1 << log2l;
173                 (*p)++;
174             }
175         }
176         iter_bit_reversed_pairs!loopBody1(log2n - 2 * log2l, &table);
177     }
178        
179     static void bit_reverse_small()(T*  p, uint log2n, uint*  table)
180     {
181         enum log2l = V.log2_bitreverse_chunk_size;
182         
183         uint tmp = log2n - log2l - log2l;
184         uint n1 = 1u << ((tmp + 1) >> 1);
185         uint n2 = 1u << tmp;
186         uint m = 1u << (log2n - log2l);
187       
188         uint* t1 = table + n1, t2 = table + n2;
189       
190         for(; table < t1; table++)
191             V.bit_reverse( p + table[0], m);
192         for(; table < t2; table += 2)
193             V.bit_reverse_swap( p + table[0], p + table[1], m);
194     }
195 
196     private static auto highest_power_2(int a, int maxpower)
197     {
198         while(a % maxpower)
199             maxpower /= 2;
200 
201         return maxpower;     
202     }
203 
204     static void swap_some(int n, TT)(TT* a, TT* b)
205     {
206         RepeatType!(TT, 2 * n) tmp;
207         
208         foreach(i; ints_up_to!n)
209             tmp[i] = a[i];
210         foreach(i; ints_up_to!n)
211             tmp[i + n] = b[i];
212         
213         foreach(i; ints_up_to!n)
214             b[i] = tmp[i];
215         foreach(i; ints_up_to!n)
216             a[i] = tmp[i + n];
217     }
218 
219     static void swap_array(int len, TT)(TT *  a, TT *  b)
220     {
221         static assert(len*TT.sizeof % vec.sizeof == 0);
222         
223         enum n = highest_power_2( len * TT.sizeof / vec.sizeof, 4);
224         
225         foreach(i; 0 .. len * TT.sizeof / n / vec.sizeof)
226             swap_some!n((cast(vec*)a) + n * i, (cast(vec*)b) + n * i);
227     }
228     
229     static void copy_some(int n, TT)(TT* dst, TT* src)
230     {
231         RepeatType!(TT, n) a;
232         
233         foreach(i, _; a)
234             a[i] = src[i];
235         foreach(i, _; a)
236             dst[i] = a[i];
237     }
238     
239     static void copy_array(int len, TT)(TT *  a, TT *  b)
240     {
241         static assert((len * TT.sizeof % vec.sizeof == 0));
242         
243         enum n = highest_power_2( len * TT.sizeof / vec.sizeof, 8);
244 
245         foreach(i; 0 .. len * TT.sizeof / n / vec.sizeof)
246             copy_some!n((cast(vec*)a) + n * i, (cast(vec*)b) + n * i);
247     }
248     
249     static void strided_copy(size_t chunk_size, TT)(
250         TT* dst, TT* src, size_t dst_stride, size_t src_stride, size_t nchunks)
251     {
252         for(
253             TT* s = src, d = dst; 
254             s < src + nchunks * src_stride; 
255             s += src_stride, d += dst_stride)
256         {
257             copy_array!chunk_size(d, s);
258         }
259     } 
260 
261     static void bit_reverse_large()(T* p, int log2n, uint * table)
262     {
263         enum log2l = Options.log2_bitreverse_large_chunk_size;
264         enum l = 1<<log2l;
265         
266         ubyte[aligned_size!T(l * l, 64)] mem = void;
267         auto buffer = aligned_ptr!T(mem.ptr, 64);
268         
269         int log2m = log2n - log2l;
270         size_t m = 1<<log2m, n = 1<<log2n;
271         T * pend = p + n;
272        
273         iter_bit_reversed_pairs!(function (size_t i0, size_t i1, 
274 	    T* p, T* pend, size_t m, uint* table, T* buffer)
275         {
276             if(i1 >= i0)
277             {
278                 strided_copy!l(buffer, p + i0 * l, l, m, l);
279           
280                 bit_reverse_small(buffer,log2l+log2l, table);
281 
282                 if(i1 != i0)
283                 {
284                     for(
285                         T* pp = p + i1 * l, pb = buffer;
286                         pp < pend; 
287                         pb += l, pp += m)
288                     {
289                         swap_array!l(pp, pb);
290                     }
291                 
292                     bit_reverse_small(buffer,log2l+log2l, table);
293                 }
294 
295                 strided_copy!l(p + i0 * l, buffer, m, l, l);
296             }
297         })(log2m-log2l, p, pend, m, table, buffer);
298     }
299 }
300 
301 private struct Scalar(TT)
302 {
303     public:
304 
305     alias TT T;
306     alias TT vec;
307     enum vec_size = 1;
308     
309     static void interleave(vec a0, vec a1, ref vec r0, ref vec r1)
310     {
311         r0 = a0;
312         r1 = a1; 
313     }
314     
315     static void deinterleave(vec a0, vec a1, ref vec r0, ref vec r1)
316     {
317         r0 = a0;
318         r1 = a1; 
319     }
320 }
321 
322 template hasInterleaving(V)
323 {
324     enum hasInterleaving =  
325         is(typeof(V.interleave)) && 
326         is(typeof(V.deinterleave));
327 }
328 
329 struct InterleaveImpl(V, int chunk_size, bool isInverse) 
330 {
331     static size_t itable_size_bytes()(int log2n)
332     {
333         return (bool.sizeof << log2n) / V.vec_size / chunk_size; 
334     }
335 
336     static bool* interleave_table()(int log2n, void* p)
337     {
338         auto n = st!1 << log2n;
339         auto is_cycle_minimum = cast(bool*) p;
340         size_t n_chunks = n / V.vec_size / chunk_size;
341 
342         if(n_chunks < 4)
343             return null;
344 
345         is_cycle_minimum[0 .. n_chunks] = true;    
346 
347         for(size_t i = 1;;)
348         {
349             size_t j = i;
350             while(true)
351             {
352                 j = j < n_chunks / 2 ? 2 * j : 2 * (j - n_chunks / 2) + 1;
353                 if(j == i)
354                     break;
355 
356                 is_cycle_minimum[j] = false;
357             }
358 
359             // The last cycle minimum is at n / 2 - 1
360             if(i == n_chunks / 2 - 1)
361                 break;           
362 
363             do i++; while(!is_cycle_minimum[i]);
364         }
365 
366         return is_cycle_minimum;
367     }
368 
369     static void interleave_chunks()(
370         V.vec* a, size_t n_chunks, bool* is_cycle_minimum)
371     {
372         alias RepeatType!(V.vec, chunk_size) RT;
373         alias ints_up_to!chunk_size indices;        
374 
375         for(size_t i = 1;;)
376         {
377             size_t j = i;
378 
379             RT element;
380             auto p = &a[i * chunk_size];
381             foreach(k; indices)
382                 element[k] = p[k];
383 
384             while(true)
385             {
386                 static if(isInverse)
387                     j = j & 1 ? j / 2 + n_chunks / 2 : j / 2;
388                 else
389                     j = j < n_chunks / 2 ? 2 * j : 2 * (j - n_chunks / 2) + 1;
390                 
391                 if(j == i)
392                     break;
393 
394                 RT tmp;
395                 p = &a[j * chunk_size];
396                 foreach(k; indices)
397                     tmp[k] = p[k];
398 
399                 foreach(k; indices)
400                     p[k] = element[k];
401 
402                 foreach(k; indices)
403                     element[k] = tmp[k];
404             }
405 
406             p = &a[i * chunk_size];
407             foreach(k; indices)
408                 p[k] = element[k];
409 
410             if(i == n_chunks / 2 - 1)
411                 break;           
412 
413             do i++; while(!is_cycle_minimum[i]);
414         }
415     }
416 
417     static void interleave_tiny()(V.vec* p, size_t len)
418     {
419         switch(len)
420         {
421             foreach(n; powers_up_to!(2 * chunk_size))
422             {
423                 case 2 * n:
424 
425                     RepeatType!(V.vec, 2 * n) tmp;
426 
427                     static if(isInverse)
428                         foreach(j; ints_up_to!n)
429                             V.deinterleave(
430                                 p[2 * j], p[2 * j + 1], tmp[j], 
431                                 tmp[n + j]);
432                     else
433                         foreach(j; ints_up_to!n)
434                             V.interleave(
435                                 p[j], p[n + j], 
436                                 tmp[2 * j], tmp[2 * j + 1]);
437 
438                     foreach(j; ints_up_to!(2 * n))
439                         p[j] = tmp[j];
440 
441                     break;
442             }
443 
444             default: {}
445         }
446     }
447 
448     static void interleave_chunk_elements()(V.vec* a, size_t n_chunks)
449     {
450         for(auto p = a; p < a + n_chunks * chunk_size; p += 2 * chunk_size)
451         {
452             RepeatType!(V.vec, 2 * chunk_size) tmp;
453 
454             static if(isInverse)
455                 foreach(j; ints_up_to!chunk_size)
456                     V.deinterleave(
457                         p[2 * j], p[2 * j + 1], tmp[j], tmp[chunk_size + j]);
458             else
459                 foreach(j; ints_up_to!chunk_size)
460                     V.interleave(
461                         p[j], p[chunk_size + j], tmp[2 * j], tmp[2 * j + 1]);
462 
463             foreach(j; ints_up_to!(2 * chunk_size))
464                 p[j] = tmp[j];
465         } 
466     }
467 
468     static void interleave()(V.T* p, int log2n, bool* table)
469     {
470         auto n = st!1 << log2n;
471 
472         if(n < 4)
473             return;
474         else if(n < 2 * V.vec_size)
475             return 
476                 InterleaveImpl!(Scalar!(V.T), V.vec_size / 2, isInverse)
477                     .interleave_tiny(p, n);
478 
479         assert(n >= 2 * V.vec_size);
480        
481         auto vp = cast(V.vec*) p;
482         auto vn = n / V.vec_size;
483  
484         if(n < 4 * V.vec_size * chunk_size)
485             interleave_tiny(vp, vn);
486         else
487         {
488             auto n_chunks = vn / chunk_size;
489             static if(isInverse)
490             {
491                 interleave_chunk_elements(vp, n_chunks);
492                 interleave_chunks(vp, n_chunks, table);
493             }
494             else
495             {
496                 interleave_chunks(vp, n_chunks, table);
497                 interleave_chunk_elements(vp, n_chunks);
498             }
499         }  
500     }
501 }
502 
503 template Interleave(V, int chunk_size, bool isInverse)
504 {
505     static if(hasInterleaving!V)
506         alias InterleaveImpl!(V, chunk_size, isInverse) Interleave;
507     else
508         alias 
509             InterleaveImpl!(Scalar!(V.T), chunk_size, isInverse) Interleave;
510 }