1 /**
2 * Delay-line implementation.
3 * Copyright: Guillaume Piolat 2015-2024.
4 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
5 */
6 module dplug.dsp.delayline;
7 
8 import core.stdc.string;
9 
10 import dplug.core.nogc;
11 import dplug.core.math;
12 import dplug.core.vec;
13 
14 import inteli.emmintrin;
15 
16 /// Allow to sample signal back in time.
17 /// This delay-line has a twin write index, so that the read pointer 
18 /// can read a contiguous memory area.
19 /// ____________________________________________________________________________________
20 /// |     | _index |                  | readPointer = _index + half size |             |
21 /// ------------------------------------------------------------------------------------
22 ///
23 /// A Delayline is initialized with an internal length of N = numSamples,
24 /// in order to do a simple delay of N samples.
25 /// Internally, the delayline actually has 2 x nextPow2(N + 1) samples of storage.
26 /// So typically a delay line is initialized with maxDelaySamples + maxFrames (if buffering is used)..
27 ///
28 /// Example:
29 /// ---
30 /// import dplug.dsp.delayline;
31 ///
32 /// void delaySampleBySample() // slower method, but easier to be correct
33 /// {
34 ///     Delayline!float delayline;
35 ///     delayline.initialize(maxPossibleDelay);
36 ///     for (int n = 0; n < frames; ++n)
37 ///     {
38 ///         delayline.feedSample(input[n]);
39 ///
40 ///         // desiredDelay = 0 would be the sample we just fed
41 ///         // the delayline with.
42 ///         delayed[n] = delayline.fullSample(desiredDelay); 
43 ///     }
44 /// }
45 ///
46 /// void delayUsingReadPointer() // fastest method, but more confusing
47 /// {
48 ///     Delayline!float delayline;
49 ///     delayline.initialize(maxFrames + maxPossibleDelay);
50 ///     delayline.feedBuffer(input.ptr, frames);
51 ///     const(float)* readPtr = d.readPointer() - desiredDelay - frames + 1;
52 ///     delayed[0..frames] = readPtr[0..frames];
53 ///     // Caveats: frames <= maxFrames and desiredDelay <= maxPossibleDelay.
54 /// }
55 /// ---
56 struct Delayline(T)
57 {
58 public:
59 nothrow:
60 @nogc:
61 
62     /// Initialize the delay line. Can delay up to `numSamples` samples.
63     void initialize(int numSamples)
64     {
65         resize(numSamples);
66     }
67 
68     ~this()
69     {
70         _data.reallocBuffer(0);
71     }
72 
73     @disable this(this);
74 
75     /// Resize the delay line. Can delay up to count samples.
76     /// The state is cleared.
77     void resize(int numSamples)
78     {
79         if (numSamples < 0)
80             assert(false);
81 
82         // Over-allocate to support POW2 delaylines.
83         // This wastes memory but allows delay-line of length 0 without tests.
84         // The reason to add +1 here is that fundamentally in a delay line of length = 1
85         // we want to keep track of the current sample (at delay 0) and the former one 
86         // (at delay 1). That's two samples.
87 
88         int toAllocate = nextPow2HigherOrEqual(numSamples + 1);
89         _data.reallocBuffer(toAllocate * 2);
90         _half = toAllocate;
91         _indexMask = toAllocate - 1;
92         _numSamples = numSamples;
93         _index = _indexMask;
94 
95         _data[] = 0;
96     }
97 
98     /// Adds a new sample at end of delay.
99     void feedSample(T incoming) pure
100     {
101         _index = (_index + 1) & _indexMask;
102         _data.ptr[_index] = incoming;
103         _data.ptr[_index + _half] = incoming;
104     }
105 
106 
107 
108     /// Random access sampling of the delay-line at integer points.
109     /// Delay 0 = last entered sample with feed().
110     T sampleFull(int delay) pure
111     {
112         assert(delay >= 0);
113         return readPointer()[-delay];
114     }
115 
116     /// Random access sampling of the delay-line at integer points, extract a time slice.
117     /// Delay 0 = last entered sample with feed().
118     void sampleFullBuffer(int delayOfMostRecentSample, T* outBuffer, int frames) pure
119     {
120         assert(delayOfMostRecentSample >= 0);
121         const(T*) p = readPointer();
122         const(T*) source = &readPointer[-delayOfMostRecentSample - frames + 1];
123         size_t numBytes = frames * T.sizeof;
124         memcpy(outBuffer, source, numBytes);
125     }
126 
127     static if (is(T == float) || is(T == double))
128     {
129         /// Random access sampling of the delay-line with linear interpolation.
130         /// Note that will the HF rollout of linear interpolation, it can 
131         /// often sound quite good in 44.1 kHz
132         T sampleLinear(float delay) pure const
133         {
134             assert(delay > 0);
135             int iPart;
136             float fPart;
137             decomposeFractionalDelay(delay, iPart, fPart);
138             const(T)* pData = readPointer();
139             T x0  = pData[iPart];
140             T x1  = pData[iPart + 1];
141             return lerp(x0, x1, fPart);
142         }
143 
144         /// Random access sampling of the delay-line with a 3rd order polynomial.
145         T sampleHermite(float delay) pure const
146         {
147             assert(delay > 1);
148             int iPart;
149             float fPart;
150             decomposeFractionalDelay(delay, iPart, fPart);
151             const(T)* pData = readPointer();
152             T xm1 = pData[iPart-1];
153             T x0  = pData[iPart  ];
154             T x1  = pData[iPart+1];
155             T x2  = pData[iPart+2];
156             return hermiteInterp!T(fPart, xm1, x0, x1, x2);
157         }
158 
159         /// Third-order spline interpolation
160         /// http://musicdsp.org/showArchiveComment.php?ArchiveID=62
161         T sampleSpline3(float delay) pure const
162         {
163             assert(delay > 1);
164             int iPart;
165             float fPart;
166             decomposeFractionalDelay(delay, iPart, fPart);
167             assert(fPart >= 0.0f);
168             assert(fPart <= 1.0f);
169             const(T)* pData = readPointer();
170             T L1 = pData[iPart-1];
171             T L0  = pData[iPart  ];
172             T H0  = pData[iPart+1];
173             T H1  = pData[iPart+2];
174 
175             return L0 + 0.5f *
176                 fPart*(H0-L1 +
177                 fPart*(H0 + L0*(-2) + L1 +
178                 fPart*( (H0 - L0)*9 + (L1 - H1)*3 +
179                 fPart*((L0 - H0)*15 + (H1 - L1)*5 +
180                 fPart*((H0 - L0)*6 + (L1 - H1)*2 )))));
181         }
182 
183         /// 4th order spline interpolation
184         /// http://musicdsp.org/showArchiveComment.php?ArchiveID=60
185         T sampleSpline4(float delay) pure const
186         {
187             assert(delay > 2);
188             int iPart;
189             float fPart;
190             decomposeFractionalDelay(delay, iPart, fPart);
191 
192             align(16) __gshared static immutable float[8][5] MAT = 
193             [
194                 [  2.0f / 24, -16.0f / 24,   0.0f / 24,   16.0f / 24,  -2.0f / 24,   0.0f / 24, 0.0f, 0.0f ],
195                 [ -1.0f / 24,  16.0f / 24, -30.0f / 24,   16.0f / 24,  -1.0f / 24,   0.0f / 24, 0.0f, 0.0f ],
196                 [ -9.0f / 24,  39.0f / 24, -70.0f / 24,   66.0f / 24, -33.0f / 24,   7.0f / 24, 0.0f, 0.0f ],
197                 [ 13.0f / 24, -64.0f / 24, 126.0f / 24, -124.0f / 24,  61.0f / 24, -12.0f / 24, 0.0f, 0.0f ],
198                 [ -5.0f / 24,  25.0f / 24, -50.0f / 24,   50.0f / 24, -25.0f / 24,   5.0f / 24, 0.0f, 0.0f ]
199             ];
200 
201             __m128 pFactor0_3 = _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f);
202             __m128 pFactor4_7 = _mm_setzero_ps();
203 
204             __m128 XMM_fPart = _mm_set1_ps(fPart);
205             __m128 weight = XMM_fPart;
206             pFactor0_3 = _mm_add_ps(pFactor0_3, _mm_load_ps(&MAT[0][0]) * weight);
207             pFactor4_7 = _mm_add_ps(pFactor4_7, _mm_load_ps(&MAT[0][4]) * weight);
208             weight = _mm_mul_ps(weight, XMM_fPart);
209             pFactor0_3 = _mm_add_ps(pFactor0_3, _mm_load_ps(&MAT[1][0]) * weight);
210             pFactor4_7 = _mm_add_ps(pFactor4_7, _mm_load_ps(&MAT[1][4]) * weight);
211             weight = _mm_mul_ps(weight, XMM_fPart);
212             pFactor0_3 = _mm_add_ps(pFactor0_3, _mm_load_ps(&MAT[2][0]) * weight);
213             pFactor4_7 = _mm_add_ps(pFactor4_7, _mm_load_ps(&MAT[2][4]) * weight);
214             weight = _mm_mul_ps(weight, XMM_fPart);
215             pFactor0_3 = _mm_add_ps(pFactor0_3, _mm_load_ps(&MAT[3][0]) * weight);
216             pFactor4_7 = _mm_add_ps(pFactor4_7, _mm_load_ps(&MAT[3][4]) * weight);
217             weight = _mm_mul_ps(weight, XMM_fPart);
218             pFactor0_3 = _mm_add_ps(pFactor0_3, _mm_load_ps(&MAT[4][0]) * weight);
219             pFactor4_7 = _mm_add_ps(pFactor4_7, _mm_load_ps(&MAT[4][4]) * weight);
220 
221             float[8] pfactor = void;
222             _mm_storeu_ps(&pfactor[0], pFactor0_3); 
223             _mm_storeu_ps(&pfactor[4], pFactor4_7);
224 
225             T result = 0;
226             const(T)* pData = readPointer();
227             foreach(n; 0..6)
228                 result += pData[iPart-2 + n] * pfactor[n];
229             return result;
230         }
231     }
232 
233     /// Adds several samples at end of delay.
234     void feedBuffer(const(T)[] incoming) pure
235     {
236         int N = cast(int)(incoming.length);
237 
238         // Note: it is legal to overfeed the delayline, in case of large 
239         // maxFrames for example. Though this is normally unexpected, but it's
240         // useful when using silence detection.
241 
242         if (N > _numSamples)
243         {
244             N = _numSamples;
245             incoming = incoming[$-N .. $];
246         }
247 
248         // remaining samples before end of delayline
249         int remain = _indexMask - _index;
250 
251         if (N == 0)
252         {
253             return;
254         }
255         else if (N <= remain)
256         {
257             memcpy( &_data[_index + 1], incoming.ptr, N * T.sizeof );
258             memcpy( &_data[_index + 1 + _half], incoming.ptr, N * T.sizeof );
259             _index += N;
260         }
261         else
262         {
263             if (remain != 0)
264             {
265                 memcpy(_data.ptr + (_index+1), incoming.ptr, remain * T.sizeof );
266                 memcpy(_data.ptr + (_index+1) + _half, incoming.ptr, remain * T.sizeof);
267             }
268             size_t numBytes = (N - remain) * T.sizeof;
269             memcpy( _data.ptr, incoming.ptr + remain, numBytes);
270             memcpy( _data.ptr + _half, incoming.ptr + remain, numBytes);
271             _index = (_index + N) & _indexMask;
272         }
273     }
274     ///ditto
275     void feedBuffer(const(T)* incoming, size_t count) pure
276     {
277         feedBuffer(incoming[0..count]);
278     }
279 
280     /// Returns: A pointer which allow to get delayed values.
281     ///    readPointer()[0] is the last samples fed, 
282     ///    readPointer()[-1] is the penultimate.
283     /// Warning: it goes backwards, increasing delay => decreasing addressed.
284     const(T)* readPointer() pure const
285     {
286         return _data.ptr + _index + _half;
287     }
288 
289     /// Combined feed + sampleFull.
290     /// Uses the delay line as a fixed delay of count samples.
291     ///
292     /// This is normally very rare to need this vs separate `sampleXXX` and
293     /// `feedSample`.
294     T nextSample(T incoming) pure
295     {
296         feedSample(incoming);
297         return sampleFull(_numSamples);
298     }
299 
300     /// Combined feed + sampleFull.
301     /// Uses the delay line as a fixed delay of count samples.
302     ///
303     /// Note: input and output may overlap. 
304     ///       If this was ever optimized, this should preserve that property.
305     ///
306     /// This is normally very rare to need this vs separate `sampleXXX` and
307     /// `feedBuffer`.
308     void nextBuffer(const(T)* input, T* output, int frames) pure
309     {
310         for(int i = 0; i < frames; ++i)
311             output[i] = nextSample(input[i]);
312     }
313 
314 private:
315     T[] _data;
316     int _index;
317     int _half; // half the size of the data
318     int _indexMask;
319     int _numSamples;
320 
321     void decomposeFractionalDelay(float delay, 
322                                   out int outIntegerPart, 
323                                   out float outFloatPart) pure const
324     {
325         // Because float index can yield suprising low precision with interpolation
326         // So we up the precision to double in order to have a precise fractional part
327         int offset = cast(int)(_data.length);
328         double doubleDelayMinus = cast(double)(-delay);
329         int iPart = cast(int)(doubleDelayMinus + offset);
330         iPart -= offset;
331         float fPart = cast(float)(doubleDelayMinus - iPart);
332         assert(fPart >= 0.0f);
333         assert(fPart <= 1.0f);
334         outIntegerPart = iPart;
335         outFloatPart = fPart;
336     }
337 }
338 
339 unittest
340 {
341     Delayline!float line;
342     line.initialize(0); // should be possible
343     assert(line.nextSample(1) == 1);
344     Delayline!double line2;
345 
346     Delayline!float line3;
347     line3.initialize(2);
348     assert(line3.nextSample(1) == 0);
349     assert(line3.nextSample(2) == 0);
350     assert(line3.nextSample(3) == 1);
351     assert(line3.nextSample(42) == 2);
352     assert(line3.sampleFull(0) == 42);
353     assert(line3.sampleFull(1) == 3);
354     assert(line3.sampleLinear(0.5f) == (3.0f + 42.0f) * 0.5f);
355 }
356 
357 // See Issue #607, usability of feedBuffer.
358 unittest
359 {
360     float[256] zeroes;
361     float[256] data;
362     float[256] delayed;
363     foreach (n; 0..256)
364     {
365         data[n] = cast(float)n;
366         zeroes[n] = 0.0f;
367     }
368 
369     // Delay of 256 samples, using `nextBuffer`.
370     {
371         Delayline!float d;
372         d.initialize(256);
373         d.nextBuffer(data.ptr, delayed.ptr, 256);
374         assert(delayed == zeroes);
375         d.nextBuffer(zeroes.ptr, delayed.ptr, 256);
376         assert(delayed == data);
377     }
378 
379     // It should be possible to use feedBuffer to delay of 256 amount too.
380     {
381         int desiredDelay = 256;
382         Delayline!float d;
383         d.initialize(256);
384         int frames = 256;
385         d.feedBuffer(data.ptr, frames);
386         const(float)* readPtr = d.readPointer() - desiredDelay - frames + 1;
387         delayed[0..frames] = readPtr[0..frames];
388         assert(delayed == zeroes);
389 
390         d.feedBuffer(zeroes.ptr, frames);
391         readPtr = d.readPointer() - desiredDelay - frames + 1;
392         delayed[0..frames] = readPtr[0..frames];
393         assert(delayed == data);
394     }
395 }
396 
397 // Issue 846, feeding a buffer larger than the delay line length.
398 // It's useful for testing effects in isolation, in a way that may
399 // have large maxFrames.
400 unittest
401 {
402     int[256] data;
403     for (int n = 0; n < 256; ++n)
404         data[n] = n;
405     Delayline!int d;
406     d.initialize(128);
407     d.feedBuffer(data[0..256]); // now work, only data[128..256] considered
408     for (int n = 0; n < 128; ++n)
409         assert(d.sampleFull(n) == 255 - n);
410 }
411 
412 /// Simplified delay line, mostly there to compensate latency manually.
413 /// No interpolation and no delay change while playing.
414 struct SimpleDelay(T)
415 {
416 public:
417 nothrow:
418 @nogc:
419 
420     enum MAX_CHANNELS = 2; // current limitation
421 
422     void initialize(int numChans, int maxFrames, int delayInSamples)
423     {
424         assert(numChans <= MAX_CHANNELS);
425         assert(_delayInSamples >= 0);
426         _delayInSamples = delayInSamples;
427         _numChans = numChans;
428         if (_delayInSamples > 0)
429         {
430             for (int chan = 0; chan < _numChans; ++chan)
431             {
432                 _delay[chan].initialize(maxFrames + delayInSamples + 1); // not sure if the +1 is still needed, or why. It is part of culture now.
433             }
434         }
435     }
436 
437     /// Just a reminder, to compute this processor latency.
438     static int latencySamples(int delayInSamples) pure
439     {
440         return delayInSamples;
441     }   
442 
443     /// Process samples, single channel version.
444     void nextBufferMono(const(T)* input, T* output, int frames)
445     {
446         assert(_numChans == 1);
447         const(T)*[1] inputs;
448         T*[1] outputs;
449 
450         inputs[0] = input;
451         outputs[0] = output;
452         nextBuffer(inputs.ptr, outputs.ptr, frames);
453     }
454     ///ditto
455     void nextBufferMonoInPlace(T* inoutSamples, int frames)
456     {
457         assert(_numChans == 1);
458         if  (_delayInSamples == 0)
459             return;
460         const(T)*[1] inputs;
461         T*[1] outputs;
462         inputs[0] = inoutSamples;
463         outputs[0] = inoutSamples;
464         nextBuffer(inputs.ptr, outputs.ptr, frames);
465     }
466 
467     /// Process samples, multichannel version.
468     /// Note: input and output buffers can overlap, or even be the same.
469     void nextBuffer(const(T*)* inputs, T** output, int frames)
470     {
471         for (int chan = 0; chan < _numChans; ++chan)
472         {
473             if (_delayInSamples == 0)
474             {
475                 // Since the two can overlap, use memmove.
476                 memmove(output[chan], inputs[chan], frames * T.sizeof);
477             }
478             else
479             {
480                  _delay[chan].feedBuffer(inputs[chan], frames);
481                  const(T)* readPtr = _delay[chan].readPointer() - _delayInSamples - frames + 1;
482                  output[chan][0..frames] = readPtr[0..frames];
483             }
484         }
485     }
486     ///ditto
487     void nextBufferInPlace(T** inoutSamples, int frames)
488     {
489         if  (_delayInSamples == 0)
490             return;
491         nextBuffer(inoutSamples, inoutSamples, frames);
492     }
493 
494 private:
495     int _numChans;
496     int _delayInSamples;
497     Delayline!T[MAX_CHANNELS] _delay;
498 }
499 
500 /// A delay that resyncs latency of two signals when it's not clear which has 
501 /// more latency. This is a building block for internal latency compensation.
502 /// 
503 /// Input:                                              |         |
504 ///       A with latency LA, B with latency LB          | A       | B
505 ///                                                     V         V
506 ///                                        ____________LatencyResync___________
507 ///                                       |  Delayline of L1 = max(LB - LA, 0) |
508 ///                                       |  Delayline of L2 = max(LA - LB, 0) |
509 ///                                       |____________________________________|
510 ///                                                     |         |
511 /// Output:                                             |         |
512 ///        Two aligned signal, latency = max(LA, LB)    | A       | B
513 ///                                                     V         V
514 struct LatencyResync(T)
515 {
516 public:
517 nothrow:
518 @nogc:
519     void initialize(int numChans, int maxFrames, int latencySamplesA, int latencySamplesB)
520     {
521         int L1 = latencySamplesB - latencySamplesA;
522         int L2 = latencySamplesA - latencySamplesB;
523         if (L1 < 0) L1 = 0;        
524         if (L2 < 0) L2 = 0;
525         _delayA.initialize(numChans, maxFrames, L1);
526         _delayB.initialize(numChans, maxFrames, L2);
527     }
528 
529     /// Just a reminder, to compute this processor latency.
530     static int latencySamples(int latencySamplesA, int latencySamplesB)
531     {
532         return latencySamplesA > latencySamplesB ? latencySamplesA : latencySamplesB;
533     }
534 
535     /// Process mono inputs, help function.
536     void nextBufferMono(const(T)* inputA, const(T)* inputB, T* outputA, T* outputB, int frames)
537     {
538         _delayA.nextBufferMono(inputA, outputA, frames);
539         _delayB.nextBufferMono(inputB, outputB, frames);
540     }
541     ///ditto
542     void nextBufferMonoInPlace(T* inoutASamples, T* inoutBSamples, int frames)
543     {
544         _delayA.nextBufferMonoInPlace(inoutASamples, frames);
545         _delayB.nextBufferMonoInPlace(inoutBSamples, frames);
546     }
547 
548     /// Process buffers. A and B signal gets aligned with regards to their relative latency.
549     void nextBuffer(const(T)** inputsA, const(T)** inputsB, T** outputsA, T** outputsB, int frames)
550     {
551         _delayA.nextBuffer(inputsA, outputsA, frames);
552         _delayB.nextBuffer(inputsB, outputsB, frames);
553     }
554     ///ditto
555     void nextBufferInPlace(T** inoutASamples, T** inoutBSamples, int frames)
556     {
557         _delayA.nextBufferInPlace(inoutASamples, frames);
558         _delayB.nextBufferInPlace(inoutBSamples, frames);
559     }
560 
561 private:
562     SimpleDelay!T _delayA;
563     SimpleDelay!T _delayB;
564 }
565 
566 unittest
567 {
568     {
569         double[4] A = [0.0, 3, 0, 0];
570         double[4] B = [0.0, 0, 2, 0];
571         LatencyResync!double lr;
572         int numChans = 1;
573         int maxFrames = 4;
574         int latencyA = 1;
575         int latencyB = 2;
576         lr.initialize(numChans, maxFrames, latencyA, latencyB);
577         lr.nextBufferMono(A.ptr, B.ptr, A.ptr, B.ptr, 4);
578         assert(A == [0.0, 0, 3, 0]);
579         assert(B == [0.0, 0, 2, 0]);
580     }
581 
582     {
583         double[4] A = [0.0, 0, 3, 9];
584         double[4] B = [2.0, 0, 0, 8];
585         LatencyResync!double lr;
586         int numChans = 1;
587         int maxFrames = 4;
588         int latencyA = 2;
589         int latencyB = 0;
590         lr.initialize(numChans, maxFrames, latencyA, latencyB);
591         lr.nextBufferMonoInPlace(A.ptr, B.ptr, 3);
592         assert(A == [0.0, 0, 3, 9]);
593         assert(B == [0.0, 0, 2, 8]);
594     }
595 }