1 /**
2 * Delay-line implementation.
3 * Copyright: Guillaume Piolat 2015-2021.
4 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
5 */
6 module dplug.dsp.delayline;
7 
8 import core.stdc.string;
9 
10 import dplug.core.nogc;
11 import dplug.core.math;
12 import dplug.core.vec;
13 
14 /// Allow to sample signal back in time.
15 /// This delay-line has a twin write index, so that the read pointer 
16 /// can read a contiguous memory area.
17 /// ____________________________________________________________________________________
18 /// |     | _index |                  | readPointer = _index + half size |             |
19 /// ------------------------------------------------------------------------------------
20 ///
21 /// A Delayline is initialized with an internal length of N = numSamples,
22 /// in order to do a simple delay of N samples.
23 /// Internally, the delayline actually has 2 x nextPow2(N + 1) samples of storage.
24 /// So typically a delay line is initialized with maxDelaySamples + maxFrames (if buffering is used)..
25 ///
26 /// Example:
27 /// ---
28 /// import dplug.dsp.delayline;
29 ///
30 /// void delaySampleBySample() // slower method, but easier to be correct
31 /// {
32 ///     Delayline!float delayline;
33 ///     delayline.initialize(maxPossibleDelay);
34 ///     for (int n = 0; n < frames; ++n)
35 ///     {
36 ///         delayline.feedSample(input[n]);
37 ///
38 ///         // desiredDelay = 0 would be the sample we just fed
39 ///         // the delayline with.
40 ///         delayed[n] = delayline.fullSample(desiredDelay); 
41 ///     }
42 /// }
43 ///
44 /// void delayUsingReadPointer() // fastest method, but more confusing
45 /// {
46 ///     Delayline!float delayline;
47 ///     delayline.initialize(maxFrames + maxPossibleDelay);
48 ///     delayline.feedBuffer(input.ptr, frames);
49 ///     const(float)* readPtr = d.readPointer() - desiredDelay - frames + 1;
50 ///     delayed[0..frames] = readPtr[0..frames];
51 ///     // Caveats: frames <= maxFrames and desiredDelay <= maxPossibleDelay.
52 /// }
53 /// ---
54 struct Delayline(T)
55 {
56 public:
57 nothrow:
58 @nogc:
59 
60     /// Initialize the delay line. Can delay up to `numSamples` samples.
61     void initialize(int numSamples)
62     {
63         resize(numSamples);
64     }
65 
66     ~this()
67     {
68         _data.reallocBuffer(0);
69     }
70 
71     @disable this(this);
72 
73     /// Resize the delay line. Can delay up to count samples.
74     /// The state is cleared.
75     void resize(int numSamples)
76     {
77         if (numSamples < 0)
78             assert(false);
79 
80         // Over-allocate to support POW2 delaylines.
81         // This wastes memory but allows delay-line of length 0 without tests.
82         // The reason to add +1 here is that fundamentally in a delay line of length = 1
83         // we want to keep track of the current sample (at delay 0) and the former one (at delay 1).
84 
85         int toAllocate = nextPow2HigherOrEqual(numSamples + 1);
86         _data.reallocBuffer(toAllocate * 2);
87         _half = toAllocate;
88         _indexMask = toAllocate - 1;
89         _numSamples = numSamples;
90         _index = _indexMask;
91 
92         _data[] = 0;
93     }
94 
95     /// Combined feed + sampleFull.
96     /// Uses the delay line as a fixed delay of count samples.
97     T nextSample(T incoming) pure
98     {
99         feedSample(incoming);
100         return sampleFull(_numSamples);
101     }
102 
103     /// Combined feed + sampleFull.
104     /// Uses the delay line as a fixed delay of count samples.
105     ///
106     /// Note: input and output may overlap. 
107     ///       If this was ever optimized, this should preserve that property.
108     void nextBuffer(const(T)* input, T* output, int frames) pure
109     {
110         for(int i = 0; i < frames; ++i)
111             output[i] = nextSample(input[i]);
112     }
113 
114     /// Adds a new sample at end of delay.
115     void feedSample(T incoming) pure
116     {
117         _index = (_index + 1) & _indexMask;
118         _data.ptr[_index] = incoming;
119         _data.ptr[_index + _half] = incoming;
120     }
121 
122     /// Adds several samples at end of delay.
123     void feedBuffer(const(T)[] incoming) pure
124     {
125         int N = cast(int)(incoming.length);
126 
127         // this buffer must be smaller than the delay line, 
128         // else we may risk dropping samples immediately
129         assert(N <=_numSamples);
130 
131         // remaining samples before end of delayline
132         int remain = _indexMask - _index;
133 
134         if (N == 0)
135         {
136             return;
137         }
138         else if (N <= remain)
139         {
140             memcpy( &_data[_index + 1], incoming.ptr, N * T.sizeof );
141             memcpy( &_data[_index + 1 + _half], incoming.ptr, N * T.sizeof );
142             _index += N;
143         }
144         else
145         {
146             if (remain != 0)
147             {
148                 memcpy( _data.ptr + (_index + 1), incoming.ptr, remain * T.sizeof );
149                 memcpy( _data.ptr + (_index + 1) + _half, incoming.ptr, remain * T.sizeof );                
150             }
151             size_t numBytes = (N - remain) * T.sizeof;
152             memcpy( _data.ptr, incoming.ptr + remain, numBytes);
153             memcpy( _data.ptr + _half, incoming.ptr + remain, numBytes);
154             _index = (_index + N) & _indexMask;
155         }
156     }
157     ///ditto
158     void feedBuffer(const(T)* incoming, size_t count) pure
159     {
160         feedBuffer(incoming[0..count]);
161     }
162 
163     /// Returns: A pointer which allow to get delayed values.
164     ///    readPointer()[0] is the last samples fed,  readPointer()[-1] is the penultimate.
165     /// Warning: it goes backwards, increasing delay => decreasing addressed.
166     const(T)* readPointer() pure const
167     {
168         return _data.ptr + _index + _half;
169     }
170 
171     /// Random access sampling of the delay-line at integer points.
172     /// Delay 0 = last entered sample with feed().
173     T sampleFull(int delay) pure
174     {
175         assert(delay >= 0);
176         return readPointer()[-delay];
177     }
178 
179     /// Random access sampling of the delay-line at integer points, extract a time slice.
180     /// Delay 0 = last entered sample with feed().
181     void sampleFullBuffer(int delayOfMostRecentSample, float* outBuffer, int frames) pure
182     {
183         assert(delayOfMostRecentSample >= 0);
184         const(T*) p = readPointer();
185         const(T*) source = &readPointer[-delayOfMostRecentSample - frames + 1];
186         size_t numBytes = frames * T.sizeof;
187         memcpy(outBuffer, source, numBytes);
188     }
189 
190     static if (is(T == float) || is(T == double))
191     {
192         /// Random access sampling of the delay-line with linear interpolation.
193         T sampleLinear(float delay) pure const
194         {
195             assert(delay > 0);
196             int iPart;
197             float fPart;
198             decomposeFractionalDelay(delay, iPart, fPart);
199             const(T)* pData = readPointer();
200             T x0  = pData[iPart];
201             T x1  = pData[iPart + 1];
202             return lerp(x0, x1, fPart);
203         }
204 
205         /// Random access sampling of the delay-line with a 3rd order polynomial.
206         T sampleHermite(float delay) pure const
207         {
208             assert(delay > 1);
209             int iPart;
210             float fPart;
211             decomposeFractionalDelay(delay, iPart, fPart);
212             const(T)* pData = readPointer();
213             T xm1 = pData[iPart-1];
214             T x0  = pData[iPart  ];
215             T x1  = pData[iPart+1];
216             T x2  = pData[iPart+2];
217             return hermite!T(fPart, xm1, x0, x1, x2);
218         }
219 
220         /// Third-order spline interpolation
221         /// http://musicdsp.org/showArchiveComment.php?ArchiveID=62
222         T sampleSpline3(float delay) pure const
223         {
224             assert(delay > 1);
225             int iPart;
226             float fPart;
227             decomposeFractionalDelay(delay, iPart, fPart);
228             assert(fPart >= 0.0f);
229             assert(fPart <= 1.0f);
230             const(T)* pData = readPointer();
231             T L1 = pData[iPart-1];
232             T L0  = pData[iPart  ];
233             T H0  = pData[iPart+1];
234             T H1  = pData[iPart+2];
235 
236             return L0 + 0.5f *
237                 fPart*(H0-L1 +
238                 fPart*(H0 + L0*(-2) + L1 +
239                 fPart*( (H0 - L0)*9 + (L1 - H1)*3 +
240                 fPart*((L0 - H0)*15 + (H1 - L1)*5 +
241                 fPart*((H0 - L0)*6 + (L1 - H1)*2 )))));
242         }
243 
244         /// 4th order spline interpolation
245         /// http://musicdsp.org/showArchiveComment.php?ArchiveID=60
246         T sampleSpline4(float delay) pure const
247         {
248             assert(delay > 2);
249             int iPart;
250             float fPart;
251             decomposeFractionalDelay(delay, iPart, fPart);
252 
253             align(16) __gshared static immutable float[8][5] MAT = 
254             [
255                 [  2.0f / 24, -16.0f / 24,   0.0f / 24,   16.0f / 24,  -2.0f / 24,   0.0f / 24, 0.0f, 0.0f ],
256                 [ -1.0f / 24,  16.0f / 24, -30.0f / 24,   16.0f / 24,  -1.0f / 24,   0.0f / 24, 0.0f, 0.0f ],
257                 [ -9.0f / 24,  39.0f / 24, -70.0f / 24,   66.0f / 24, -33.0f / 24,   7.0f / 24, 0.0f, 0.0f ],
258                 [ 13.0f / 24, -64.0f / 24, 126.0f / 24, -124.0f / 24,  61.0f / 24, -12.0f / 24, 0.0f, 0.0f ],
259                 [ -5.0f / 24,  25.0f / 24, -50.0f / 24,   50.0f / 24, -25.0f / 24,   5.0f / 24, 0.0f, 0.0f ]
260             ];
261             import inteli.emmintrin;
262             __m128 pFactor0_3 = _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f);
263             __m128 pFactor4_7 = _mm_setzero_ps();
264 
265             __m128 XMM_fPart = _mm_set1_ps(fPart);
266             __m128 weight = XMM_fPart;
267             pFactor0_3 = _mm_add_ps(pFactor0_3, _mm_load_ps(&MAT[0][0]) * weight);
268             pFactor4_7 = _mm_add_ps(pFactor4_7, _mm_load_ps(&MAT[0][4]) * weight);
269             weight = _mm_mul_ps(weight, XMM_fPart);
270             pFactor0_3 = _mm_add_ps(pFactor0_3, _mm_load_ps(&MAT[1][0]) * weight);
271             pFactor4_7 = _mm_add_ps(pFactor4_7, _mm_load_ps(&MAT[1][4]) * weight);
272             weight = _mm_mul_ps(weight, XMM_fPart);
273             pFactor0_3 = _mm_add_ps(pFactor0_3, _mm_load_ps(&MAT[2][0]) * weight);
274             pFactor4_7 = _mm_add_ps(pFactor4_7, _mm_load_ps(&MAT[2][4]) * weight);
275             weight = _mm_mul_ps(weight, XMM_fPart);
276             pFactor0_3 = _mm_add_ps(pFactor0_3, _mm_load_ps(&MAT[3][0]) * weight);
277             pFactor4_7 = _mm_add_ps(pFactor4_7, _mm_load_ps(&MAT[3][4]) * weight);
278             weight = _mm_mul_ps(weight, XMM_fPart);
279             pFactor0_3 = _mm_add_ps(pFactor0_3, _mm_load_ps(&MAT[4][0]) * weight);
280             pFactor4_7 = _mm_add_ps(pFactor4_7, _mm_load_ps(&MAT[4][4]) * weight);
281 
282             float[8] pfactor = void;
283             _mm_storeu_ps(&pfactor[0], pFactor0_3); 
284             _mm_storeu_ps(&pfactor[4], pFactor4_7);
285 
286             T result = 0;
287             const(T)* pData = readPointer();
288             foreach(n; 0..6)
289                 result += pData[iPart-2 + n] * pfactor[n];
290             return result;
291         };
292     }
293 
294 private:
295     T[] _data;
296     int _index;
297     int _half; // half the size of the data
298     int _indexMask;
299     int _numSamples;
300 
301     void decomposeFractionalDelay(float delay, 
302                                   out int outIntegerPart, 
303                                   out float outFloatPart) pure const
304     {
305         // Because float index can yield suprising low precision with interpolation  
306         // So we up the precision to double in order to have a precise fractional part          
307         int offset = cast(int)(_data.length);
308         double doubleDelayMinus = cast(double)(-delay);
309         int iPart = cast(int)(doubleDelayMinus + offset);
310         iPart -= offset;
311         float fPart = cast(float)(doubleDelayMinus - iPart);
312         assert(fPart >= 0.0f);
313         assert(fPart <= 1.0f);
314         outIntegerPart = iPart;
315         outFloatPart = fPart;
316     }
317 }
318 
319 unittest
320 {
321     Delayline!float line;
322     line.initialize(0); // should be possible
323     assert(line.nextSample(1) == 1);
324 
325     Delayline!double line2;
326 
327     Delayline!float line3;
328     line3.initialize(2);
329     assert(line3.nextSample(1) == 0);
330     assert(line3.nextSample(2) == 0);
331     assert(line3.nextSample(3) == 1);
332     assert(line3.nextSample(42) == 2);
333 
334     assert(line3.sampleFull(0) == 42);
335     assert(line3.sampleFull(1) == 3);
336     assert(line3.sampleLinear(0.5f) == (3.0f + 42.0f) * 0.5f);
337 }
338 
339 // See Issue #607, usability of feedBuffer.
340 unittest
341 {
342     
343     float[256] zeroes;
344     float[256] data;
345     float[256] delayed;
346     foreach (n; 0..256)
347     {
348         data[n] = cast(float)n;
349         zeroes[n] = 0.0f;
350     }
351 
352     // Delay of 256 samples, using `nextBuffer`.
353     {
354         Delayline!float d;
355         d.initialize(256);
356         d.nextBuffer(data.ptr, delayed.ptr, 256);
357         assert(delayed == zeroes);
358         d.nextBuffer(zeroes.ptr, delayed.ptr, 256);
359         assert(delayed == data);
360     }
361 
362     // It should be possible to use feedBuffer to delay of 256 amount too.
363     {
364         int desiredDelay = 256;
365         Delayline!float d;
366         d.initialize(256);
367         int frames = 256;
368         d.feedBuffer(data.ptr, frames);
369         const(float)* readPtr = d.readPointer() - desiredDelay - frames + 1;
370         delayed[0..frames] = readPtr[0..frames];
371         assert(delayed == zeroes);
372 
373         d.feedBuffer(zeroes.ptr, frames);
374         readPtr = d.readPointer() - desiredDelay - frames + 1;
375         delayed[0..frames] = readPtr[0..frames];
376         assert(delayed == data);
377     }
378 }