1 // Copyright Jernej Krempuš 2012
2 // Distributed under the Boost Software License, Version 1.0.
3 // (See accompanying file LICENSE_1_0.txt or copy at
4 // http://www.boost.org/LICENSE_1_0.txt)
5
6 module dplug.fft.shuffle;
7
8 import core.bitop;
9
10 nothrow:
11 @nogc:
12
/// Evaluates `a` as a compile-time `size_t` constant.
template st(alias a)
{
    enum st = cast(size_t) a;
}
14
/// Exchanges the values of `a` and `b`.
void _swap(T)(ref T a, ref T b)
{
    T tmp = a;
    a = b;
    b = tmp;
}
22
/// Compile-time sequence of the integers 0, 1, ..., n-1
/// (prepended to any extra items in T).
template ints_up_to(int n, T...)
{
    static if(n == 0)
        alias ints_up_to = T;
    else
        alias ints_up_to = ints_up_to!(n - 1, n - 1, T);
}
32
/// Compile-time sequence of the powers of two 1, 2, ..., n/2
/// (prepended to any extra items in T).
template powers_up_to(int n, T...)
{
    static if(n <= 1)
        alias powers_up_to = T;
    else
        alias powers_up_to = powers_up_to!(n / 2, n / 2, T);
}
42
/// Type sequence consisting of T repeated n times
/// (prepended to any extra types in R).
template RepeatType(T, int n, R...)
{
    static if(n != 0)
        alias RepeatType = RepeatType!(T, n - 1, T, R);
    else
        alias RepeatType = R;
}
50
/// Calls dg(i, bit_reverse(i), args...) for every index i in
/// [0 .. 2^log2n), iterating from the highest index down to zero.
/// The second argument passed to dg is the reversal of the low
/// log2n bits of the first.
void iter_bit_reversed_pairs(alias dg, A...)(int log2n, A args)
{
    int mask = (0xffffffff<<(log2n));
    uint i2 = ~mask;     // bit-reversal of i1, maintained incrementally
    uint i1 = i2;        // runs from 2^log2n - 1 down to 0

    while(i1 != (0U - 1U))
    {
        dg(i1, i2, args);

        // Bit-reversed decrement of i2: flipping the high bits that
        // mirror the trailing run of bits ending at bsf(i1).
        // Guard the final iteration: bsf(0) has an undefined result
        // (core.bitop), and the updated i2 would never be used anyway.
        if(i1 != 0)
            i2 = mask ^ (i2 ^ (mask>>(bsf(i1)+1)));
        --i1;
    }
}
64
/// Permutes the 2^log2n elements at p into bit-reversed order, in place.
void bit_reverse_simple(T)(T* p, int log2n)
{
    // Each (index, reversed-index) pair is visited twice during the
    // iteration; swapping only when the reversed index is the larger one
    // handles every pair exactly once and leaves fixed points alone.
    static void swapOnce(int idx, int ridx, T* data)
    {
        if(ridx > idx)
            _swap(data[idx], data[ridx]);
    }

    iter_bit_reversed_pairs!swapOnce(log2n, p);
}
75
/// Compile-time reversal of the low `bits_left` bits of i
/// (r is the accumulator used by the recursion).
template reverse_bits(int i, int bits_left, int r = 0)
{
    static if(bits_left != 0)
        // Shift the lowest bit of i into the bottom of the accumulator.
        enum reverse_bits =
            reverse_bits!(i >> 1, bits_left - 1, (r << 1) | (i & 1));
    else
        enum reverse_bits = r;
}
84
/// Bit-reverses a compile-time-sized array of n = 2^log2n elements in
/// place. With the optional stride argument the data is addressed as
/// rows of l elements spaced stride[0] apart; without it the data is
/// contiguous.
auto bit_reverse_static_size(int log2n, T, S...)(T* p, S stride)
if(S.length <= 1)
{
    enum n = 1 << log2n;
    enum l = 1 << (log2n / 2);

    // Map a logical index to a memory offset (strided or contiguous).
    static if(stride.length == 1)
        auto index(size_t i)(S s){ return i / l * s[0] + i % l; }
    else
        auto index(size_t i)(S s){ return i; }

    // Load every element that moves into compile-time-unrolled locals...
    RepeatType!(T, n) a;

    foreach(i, _; a)
        static if(i != reverse_bits!(i, log2n))
            a[i] = p[index!i(stride)];

    // ...then store each back at its bit-reversed position. Fixed
    // points (indices equal to their own reversal) are never touched,
    // and since reversal is an involution every loaded slot is written.
    foreach(i, _; a)
        static if(i != reverse_bits!(i, log2n))
            p[index!i(stride)] = a[reverse_bits!(i, log2n)];
}
106
/// Dispatches to bit_reverse_static_size for log2n in [0, max_log2n).
/// Values of log2n >= max_log2n fall through to the empty default and
/// are silently ignored.
auto bit_reverse_tiny(int max_log2n, T)(T* p, int log2n)
{
    switch(log2n)
    {
        // Compile-time foreach: unrolls into one `case` per candidate
        // value, each calling the statically-sized implementation.
        foreach(i; ints_up_to!max_log2n)
        {
            case i:
                bit_reverse_static_size!i(p);
                break;
        }

        default:
    }
}
121
/// One pass of a chunked bit reversal: swaps each odd-indexed chunk k
/// from the first half of p with chunk nchunks/2 + k - 1 from the
/// second half (chunks are chunk_size elements wide).
void bit_reverse_step(size_t chunk_size, T)(T* p, size_t nchunks)
{
    for(size_t i = chunk_size, j = (nchunks >> 1) * chunk_size;
        j < nchunks * chunk_size;
        j += chunk_size*2, i += chunk_size*2)
    {
        // chunk_size is a compile-time constant, so this inner loop
        // is fully unrolled.
        foreach(k; ints_up_to!chunk_size)
            _swap(p[i + k], p[j + k]);
    }
}
132
/// Rounds ptr up to a multiple of alignment (which must be a power of
/// two). Note that it always advances by at least one byte — an
/// already-aligned ptr comes back alignment bytes later — so it must be
/// paired with aligned_size, which reserves that much extra space.
auto aligned_ptr(T, U)(U * ptr, size_t alignment)
{
    return cast(T*)(((cast(size_t)ptr) + alignment) & ~(alignment - 1UL));
}
137
/// Buffer size in bytes guaranteed to hold `size` elements of T after
/// the start pointer has been adjusted with aligned_ptr (hence the
/// extra `alignment` bytes of slack).
auto aligned_size(T)(size_t size, size_t alignment)
{
    immutable payload = size * T.sizeof;
    return payload + alignment;
}
142
/// Cache-friendly in-place bit-reversal permutation, parameterized by a
/// vector implementation V (which must provide T, vec,
/// log2_bitreverse_chunk_size and the bit_reverse / bit_reverse_swap
/// primitives) and by Options (which provides
/// log2_bitreverse_large_chunk_size).
struct BitReverse(alias V, Options)
{
    alias V.T T;
    alias V.vec vec;

    /// Number of uint entries needed for the table filled by
    /// init_br_table for a transform of size 2^log2n (0 when the
    /// transform is too small to use a table).
    static size_t br_table_size()(int log2n)
    {
        enum log2l = V.log2_bitreverse_chunk_size;

        return log2n < log2l * 2 ? 0 : (1 << (log2n - 2 * log2l)) + 2 * log2l;
    }

    /// Precomputes the chunk offsets used by bit_reverse_small: first
    /// the "diagonal" entries (indices equal to their own bit reversal),
    /// then the off-diagonal pairs, stored as two consecutive entries.
    static void init_br_table()(uint* table, int log2n)
    {
        enum log2l = V.log2_bitreverse_chunk_size;

        // Diagonal chunks: i1 == i0 means the index is a fixed point
        // of the bit-reversal permutation.
        static void loopBody0(int i0, int i1, uint** p)
        {
            if(i1 == i0)
                (**p = i0 << log2l), (*p)++;
        }
        iter_bit_reversed_pairs!loopBody0(log2n - 2 * log2l, &table);

        // Off-diagonal pairs, each emitted once (i1 < i0).
        static void loopBody1(int i0, int i1, uint** p)
        {
            if(i1 < i0)
            {
                **p = i0 << log2l;
                (*p)++;
                **p = i1 << log2l;
                (*p)++;
            }
        }
        iter_bit_reversed_pairs!loopBody1(log2n - 2 * log2l, &table);
    }

    /// Bit-reverses 2^log2n elements at p, using a table previously
    /// filled by init_br_table for the same log2n.
    static void bit_reverse_small()(T* p, uint log2n, uint* table)
    {
        enum log2l = V.log2_bitreverse_chunk_size;

        uint tmp = log2n - log2l - log2l;
        // n1: end of the diagonal entries; n2: end of all entries.
        // NOTE(review): n1 = 2^((tmp+1)/2) should equal the number of
        // self-inverse indices among 2^tmp values — confirm against
        // init_br_table's output layout.
        uint n1 = 1u << ((tmp + 1) >> 1);
        uint n2 = 1u << tmp;
        uint m = 1u << (log2n - log2l);

        uint* t1 = table + n1, t2 = table + n2;

        // Diagonal chunks are reversed in place...
        for(; table < t1; table++)
            V.bit_reverse( p + table[0], m);
        // ...off-diagonal chunk pairs are reversed and exchanged.
        for(; table < t2; table += 2)
            V.bit_reverse_swap( p + table[0], p + table[1], m);
    }

    /// Largest power of two that divides a, capped at maxpower
    /// (maxpower must itself be a power of two).
    private static auto highest_power_2(int a, int maxpower)
    {
        while(a % maxpower)
            maxpower /= 2;

        return maxpower;
    }

    /// Swaps n elements of a with n elements of b through temporaries;
    /// n is a compile-time constant, so all loops unroll.
    static void swap_some(int n, TT)(TT* a, TT* b)
    {
        RepeatType!(TT, 2 * n) tmp;

        foreach(i; ints_up_to!n)
            tmp[i] = a[i];
        foreach(i; ints_up_to!n)
            tmp[i + n] = b[i];

        foreach(i; ints_up_to!n)
            b[i] = tmp[i];
        foreach(i; ints_up_to!n)
            a[i] = tmp[i + n];
    }

    /// Swaps len elements of type TT between a and b, processed as
    /// whole vectors in groups of up to 4.
    static void swap_array(int len, TT)(TT * a, TT * b)
    {
        static assert(len*TT.sizeof % vec.sizeof == 0);

        enum n = highest_power_2( len * TT.sizeof / vec.sizeof, 4);

        foreach(i; 0 .. len * TT.sizeof / n / vec.sizeof)
            swap_some!n((cast(vec*)a) + n * i, (cast(vec*)b) + n * i);
    }

    /// Copies n elements from src to dst through temporaries
    /// (fully unrolled; n is a compile-time constant).
    static void copy_some(int n, TT)(TT* dst, TT* src)
    {
        RepeatType!(TT, n) a;

        foreach(i, _; a)
            a[i] = src[i];
        foreach(i, _; a)
            dst[i] = a[i];
    }

    /// Copies len elements of type TT from b to a, processed as whole
    /// vectors in groups of up to 8.
    static void copy_array(int len, TT)(TT * a, TT * b)
    {
        static assert((len * TT.sizeof % vec.sizeof == 0));

        enum n = highest_power_2( len * TT.sizeof / vec.sizeof, 8);

        foreach(i; 0 .. len * TT.sizeof / n / vec.sizeof)
            copy_some!n((cast(vec*)a) + n * i, (cast(vec*)b) + n * i);
    }

    /// Copies nchunks runs of chunk_size elements from src to dst,
    /// advancing each pointer by its own stride after every run.
    static void strided_copy(size_t chunk_size, TT)(
        TT* dst, TT* src, size_t dst_stride, size_t src_stride, size_t nchunks)
    {
        for(
            TT* s = src, d = dst;
            s < src + nchunks * src_stride;
            s += src_stride, d += dst_stride)
        {
            copy_array!chunk_size(d, s);
        }
    }

    /// Bit-reverses 2^log2n elements at p using an l x l tile staged
    /// through a local buffer, so memory is touched in cache-friendly
    /// blocks. table must have been prepared by init_br_table for the
    /// tile size (log2n == 2 * log2l).
    static void bit_reverse_large()(T* p, int log2n, uint * table)
    {
        enum log2l = Options.log2_bitreverse_large_chunk_size;
        enum l = 1<<log2l;

        // Stack-allocated tile buffer, aligned to 64 bytes.
        ubyte[aligned_size!T(l * l, 64)] mem = void;
        auto buffer = aligned_ptr!T(mem.ptr, 64);

        int log2m = log2n - log2l;
        size_t m = 1<<log2m, n = 1<<log2n;
        T * pend = p + n;

        iter_bit_reversed_pairs!(function (size_t i0, size_t i1,
            T* p, T* pend, size_t m, uint* table, T* buffer)
        {
            if(i1 >= i0)
            {
                // Gather tile i0 into the buffer and bit-reverse it.
                strided_copy!l(buffer, p + i0 * l, l, m, l);

                bit_reverse_small(buffer,log2l+log2l, table);

                if(i1 != i0)
                {
                    // Off-diagonal tile: exchange the reversed buffer
                    // with tile i1, then reverse the swapped-in tile too.
                    for(
                        T* pp = p + i1 * l, pb = buffer;
                        pp < pend;
                        pb += l, pp += m)
                    {
                        swap_array!l(pp, pb);
                    }

                    bit_reverse_small(buffer,log2l+log2l, table);
                }

                // Scatter the buffer back to tile i0.
                strided_copy!l(p + i0 * l, buffer, m, l, l);
            }
        })(log2m-log2l, p, pend, m, table, buffer);
    }
}
300
/// Fallback "vector" implementation with exactly one element per
/// vector, used when the real vector type does not provide
/// (de)interleaving.
private struct Scalar(TT)
{
public:

    alias TT T;
    alias TT vec;
    enum vec_size = 1;

    // With a single element per vector, interleaving two vectors is
    // the identity: each input maps straight to the matching output.
    static void interleave(vec a0, vec a1, ref vec r0, ref vec r1)
    {
        r0 = a0;
        r1 = a1;
    }

    // Likewise, deinterleaving is the identity.
    static void deinterleave(vec a0, vec a1, ref vec r0, ref vec r1)
    {
        r0 = a0;
        r1 = a1;
    }
}
321
/// True when vector type V implements both interleave and deinterleave.
template hasInterleaving(V)
{
    enum hasInterleaving =
        is(typeof(V.interleave)) &&
        is(typeof(V.deinterleave));
}
328
/// Reorders an array so that its two halves a and b become interleaved
/// as a0, b0, a1, b1, ... (or applies the inverse permutation when
/// isInverse is true), working on chunks of chunk_size vectors.
struct InterleaveImpl(V, int chunk_size, bool isInverse)
{
    /// Size in bytes of the table built by interleave_table for a
    /// transform of 2^log2n scalars.
    static size_t itable_size_bytes()(int log2n)
    {
        return (bool.sizeof << log2n) / V.vec_size / chunk_size;
    }

    /// Builds a table flagging, for each chunk index, whether it is the
    /// minimum element of its cycle under the interleave permutation
    ///     j -> 2*j               (first half)
    ///     j -> 2*(j - n/2) + 1   (second half).
    /// Returns null when there are fewer than four chunks — interleave()
    /// takes a table-free path in that case.
    static bool* interleave_table()(int log2n, void* p)
    {
        auto n = st!1 << log2n;
        auto is_cycle_minimum = cast(bool*) p;
        size_t n_chunks = n / V.vec_size / chunk_size;

        if(n_chunks < 4)
            return null;

        is_cycle_minimum[0 .. n_chunks] = true;

        // Walk each cycle from its minimum element and clear the flag
        // on every other element of that cycle.
        for(size_t i = 1;;)
        {
            size_t j = i;
            while(true)
            {
                j = j < n_chunks / 2 ? 2 * j : 2 * (j - n_chunks / 2) + 1;
                if(j == i)
                    break;

                is_cycle_minimum[j] = false;
            }

            // The last cycle minimum is at n / 2 - 1
            if(i == n_chunks / 2 - 1)
                break;

            // Advance to the next index still flagged as a minimum.
            do i++; while(!is_cycle_minimum[i]);
        }

        return is_cycle_minimum;
    }

    /// Applies the chunk permutation in place by following each cycle
    /// from its minimum element, as flagged in is_cycle_minimum.
    static void interleave_chunks()(
        V.vec* a, size_t n_chunks, bool* is_cycle_minimum)
    {
        alias RepeatType!(V.vec, chunk_size) RT;
        alias ints_up_to!chunk_size indices;

        for(size_t i = 1;;)
        {
            size_t j = i;

            // Load the chunk that starts the cycle into locals...
            RT element;
            auto p = &a[i * chunk_size];
            foreach(k; indices)
                element[k] = p[k];

            while(true)
            {
                // Forward permutation when interleaving, its inverse
                // when deinterleaving.
                static if(isInverse)
                    j = j & 1 ? j / 2 + n_chunks / 2 : j / 2;
                else
                    j = j < n_chunks / 2 ? 2 * j : 2 * (j - n_chunks / 2) + 1;

                if(j == i)
                    break;

                // ...and rotate the chunks along the cycle.
                RT tmp;
                p = &a[j * chunk_size];
                foreach(k; indices)
                    tmp[k] = p[k];

                foreach(k; indices)
                    p[k] = element[k];

                foreach(k; indices)
                    element[k] = tmp[k];
            }

            // Close the cycle: the carried chunk lands at the start.
            p = &a[i * chunk_size];
            foreach(k; indices)
                p[k] = element[k];

            if(i == n_chunks / 2 - 1)
                break;

            do i++; while(!is_cycle_minimum[i]);
        }
    }

    /// Fully unrolled (de)interleave for small buffers. Handles len
    /// equal to a power of two from 2 up to 2 * chunk_size; any other
    /// value falls through to the empty default and is ignored.
    static void interleave_tiny()(V.vec* p, size_t len)
    {
        switch(len)
        {
            // Compile-time foreach: one case per supported power of two.
            foreach(n; powers_up_to!(2 * chunk_size))
            {
                case 2 * n:

                    RepeatType!(V.vec, 2 * n) tmp;

                    static if(isInverse)
                        foreach(j; ints_up_to!n)
                            V.deinterleave(
                                p[2 * j], p[2 * j + 1], tmp[j],
                                tmp[n + j]);
                    else
                        foreach(j; ints_up_to!n)
                            V.interleave(
                                p[j], p[n + j],
                                tmp[2 * j], tmp[2 * j + 1]);

                    foreach(j; ints_up_to!(2 * n))
                        p[j] = tmp[j];

                    break;
            }

            default: {}
        }
    }

    /// (De)interleaves the vectors within each adjacent pair of chunks.
    static void interleave_chunk_elements()(V.vec* a, size_t n_chunks)
    {
        for(auto p = a; p < a + n_chunks * chunk_size; p += 2 * chunk_size)
        {
            RepeatType!(V.vec, 2 * chunk_size) tmp;

            static if(isInverse)
                foreach(j; ints_up_to!chunk_size)
                    V.deinterleave(
                        p[2 * j], p[2 * j + 1], tmp[j], tmp[chunk_size + j]);
            else
                foreach(j; ints_up_to!chunk_size)
                    V.interleave(
                        p[j], p[chunk_size + j], tmp[2 * j], tmp[2 * j + 1]);

            foreach(j; ints_up_to!(2 * chunk_size))
                p[j] = tmp[j];
        }
    }

    /// Entry point: (de)interleaves 2^log2n scalars at p. table must
    /// come from interleave_table for the same log2n; it may be null
    /// when n is small enough to take one of the early paths.
    static void interleave()(V.T* p, int log2n, bool* table)
    {
        auto n = st!1 << log2n;

        if(n < 4)
            return;          // fewer than two pairs: nothing to reorder
        else if(n < 2 * V.vec_size)
            // Too small to fill whole vectors: fall back to scalar code.
            return
                InterleaveImpl!(Scalar!(V.T), V.vec_size / 2, isInverse)
                .interleave_tiny(p, n);

        assert(n >= 2 * V.vec_size);

        auto vp = cast(V.vec*) p;
        auto vn = n / V.vec_size;

        if(n < 4 * V.vec_size * chunk_size)
            interleave_tiny(vp, vn);
        else
        {
            auto n_chunks = vn / chunk_size;
            // The inverse transform runs the two passes in the opposite
            // order, exactly undoing the forward transform.
            static if(isInverse)
            {
                interleave_chunk_elements(vp, n_chunks);
                interleave_chunks(vp, n_chunks, table);
            }
            else
            {
                interleave_chunks(vp, n_chunks, table);
                interleave_chunk_elements(vp, n_chunks);
            }
        }
    }
}
502
/// Selects the interleaving implementation for V: the native one when
/// V supports (de)interleaving, otherwise a scalar fallback.
template Interleave(V, int chunk_size, bool isInverse)
{
    static if(hasInterleaving!V)
        alias Interleave = InterleaveImpl!(V, chunk_size, isInverse);
    else
        alias Interleave =
            InterleaveImpl!(Scalar!(V.T), chunk_size, isInverse);
}