1 /** 2 * Delay-line implementation. 3 * Copyright: Guillaume Piolat 2015-2021. 4 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 5 */ 6 module dplug.dsp.delayline; 7 8 import core.stdc.string; 9 10 import dplug.core.nogc; 11 import dplug.core.math; 12 import dplug.core.vec; 13 14 /// Allow to sample signal back in time. 15 /// This delay-line has a twin write index, so that the read pointer 16 /// can read a contiguous memory area. 17 /// ____________________________________________________________________________________ 18 /// | | _index | | readPointer = _index + half size | | 19 /// ------------------------------------------------------------------------------------ 20 /// 21 /// A Delayline is initialized with an internal length of N = numSamples, 22 /// in order to do a simple delay of N samples. 23 /// Internally, the delayline actually has 2 x nextPow2(N + 1) samples of storage. 24 /// So typically a delay line is initialized with maxDelaySamples + maxFrames (if buffering is used).. 25 /// 26 /// Example: 27 /// --- 28 /// import dplug.dsp.delayline; 29 /// 30 /// void delaySampleBySample() // slower method, but easier to be correct 31 /// { 32 /// Delayline!float delayline; 33 /// delayline.initialize(maxPossibleDelay); 34 /// for (int n = 0; n < frames; ++n) 35 /// { 36 /// delayline.feedSample(input[n]); 37 /// 38 /// // desiredDelay = 0 would be the sample we just fed 39 /// // the delayline with. 40 /// delayed[n] = delayline.fullSample(desiredDelay); 41 /// } 42 /// } 43 /// 44 /// void delayUsingReadPointer() // fastest method, but more confusing 45 /// { 46 /// Delayline!float delayline; 47 /// delayline.initialize(maxFrames + maxPossibleDelay); 48 /// delayline.feedBuffer(input.ptr, frames); 49 /// const(float)* readPtr = d.readPointer() - desiredDelay - frames + 1; 50 /// delayed[0..frames] = readPtr[0..frames]; 51 /// // Caveats: frames <= maxFrames and desiredDelay <= maxPossibleDelay. 52 /// } 53 /// --- 54 struct Delayline(T) 55 { 56 public: 57 nothrow: 58 @nogc: 59 60 /// Initialize the delay line. Can delay up to `numSamples` samples. 61 void initialize(int numSamples) 62 { 63 resize(numSamples); 64 } 65 66 ~this() 67 { 68 _data.reallocBuffer(0); 69 } 70 71 @disable this(this); 72 73 /// Resize the delay line. Can delay up to count samples. 74 /// The state is cleared. 75 void resize(int numSamples) 76 { 77 if (numSamples < 0) 78 assert(false); 79 80 // Over-allocate to support POW2 delaylines. 81 // This wastes memory but allows delay-line of length 0 without tests. 82 // The reason to add +1 here is that fundamentally in a delay line of length = 1 83 // we want to keep track of the current sample (at delay 0) and the former one (at delay 1). 84 85 int toAllocate = nextPow2HigherOrEqual(numSamples + 1); 86 _data.reallocBuffer(toAllocate * 2); 87 _half = toAllocate; 88 _indexMask = toAllocate - 1; 89 _numSamples = numSamples; 90 _index = _indexMask; 91 92 _data[] = 0; 93 } 94 95 /// Combined feed + sampleFull. 96 /// Uses the delay line as a fixed delay of count samples. 97 T nextSample(T incoming) pure 98 { 99 feedSample(incoming); 100 return sampleFull(_numSamples); 101 } 102 103 /// Combined feed + sampleFull. 104 /// Uses the delay line as a fixed delay of count samples. 105 /// 106 /// Note: input and output may overlap. 107 /// If this was ever optimized, this should preserve that property. 108 void nextBuffer(const(T)* input, T* output, int frames) pure 109 { 110 for(int i = 0; i < frames; ++i) 111 output[i] = nextSample(input[i]); 112 } 113 114 /// Adds a new sample at end of delay. 115 void feedSample(T incoming) pure 116 { 117 _index = (_index + 1) & _indexMask; 118 _data.ptr[_index] = incoming; 119 _data.ptr[_index + _half] = incoming; 120 } 121 122 /// Adds several samples at end of delay. 123 void feedBuffer(const(T)[] incoming) pure 124 { 125 int N = cast(int)(incoming.length); 126 127 // this buffer must be smaller than the delay line, 128 // else we may risk dropping samples immediately 129 assert(N <=_numSamples); 130 131 // remaining samples before end of delayline 132 int remain = _indexMask - _index; 133 134 if (N == 0) 135 { 136 return; 137 } 138 else if (N <= remain) 139 { 140 memcpy( &_data[_index + 1], incoming.ptr, N * T.sizeof ); 141 memcpy( &_data[_index + 1 + _half], incoming.ptr, N * T.sizeof ); 142 _index += N; 143 } 144 else 145 { 146 if (remain != 0) 147 { 148 memcpy( _data.ptr + (_index + 1), incoming.ptr, remain * T.sizeof ); 149 memcpy( _data.ptr + (_index + 1) + _half, incoming.ptr, remain * T.sizeof ); 150 } 151 size_t numBytes = (N - remain) * T.sizeof; 152 memcpy( _data.ptr, incoming.ptr + remain, numBytes); 153 memcpy( _data.ptr + _half, incoming.ptr + remain, numBytes); 154 _index = (_index + N) & _indexMask; 155 } 156 } 157 ///ditto 158 void feedBuffer(const(T)* incoming, size_t count) pure 159 { 160 feedBuffer(incoming[0..count]); 161 } 162 163 /// Returns: A pointer which allow to get delayed values. 164 /// readPointer()[0] is the last samples fed, readPointer()[-1] is the penultimate. 165 /// Warning: it goes backwards, increasing delay => decreasing addressed. 166 const(T)* readPointer() pure const 167 { 168 return _data.ptr + _index + _half; 169 } 170 171 /// Random access sampling of the delay-line at integer points. 172 /// Delay 0 = last entered sample with feed(). 173 T sampleFull(int delay) pure 174 { 175 assert(delay >= 0); 176 return readPointer()[-delay]; 177 } 178 179 /// Random access sampling of the delay-line at integer points, extract a time slice. 180 /// Delay 0 = last entered sample with feed(). 181 void sampleFullBuffer(int delayOfMostRecentSample, float* outBuffer, int frames) pure 182 { 183 assert(delayOfMostRecentSample >= 0); 184 const(T*) p = readPointer(); 185 const(T*) source = &readPointer[-delayOfMostRecentSample - frames + 1]; 186 size_t numBytes = frames * T.sizeof; 187 memcpy(outBuffer, source, numBytes); 188 } 189 190 static if (is(T == float) || is(T == double)) 191 { 192 /// Random access sampling of the delay-line with linear interpolation. 193 T sampleLinear(float delay) pure const 194 { 195 assert(delay > 0); 196 int iPart; 197 float fPart; 198 decomposeFractionalDelay(delay, iPart, fPart); 199 const(T)* pData = readPointer(); 200 T x0 = pData[iPart]; 201 T x1 = pData[iPart + 1]; 202 return lerp(x0, x1, fPart); 203 } 204 205 /// Random access sampling of the delay-line with a 3rd order polynomial. 206 T sampleHermite(float delay) pure const 207 { 208 assert(delay > 1); 209 int iPart; 210 float fPart; 211 decomposeFractionalDelay(delay, iPart, fPart); 212 const(T)* pData = readPointer(); 213 T xm1 = pData[iPart-1]; 214 T x0 = pData[iPart ]; 215 T x1 = pData[iPart+1]; 216 T x2 = pData[iPart+2]; 217 return hermite!T(fPart, xm1, x0, x1, x2); 218 } 219 220 /// Third-order spline interpolation 221 /// http://musicdsp.org/showArchiveComment.php?ArchiveID=62 222 T sampleSpline3(float delay) pure const 223 { 224 assert(delay > 1); 225 int iPart; 226 float fPart; 227 decomposeFractionalDelay(delay, iPart, fPart); 228 assert(fPart >= 0.0f); 229 assert(fPart <= 1.0f); 230 const(T)* pData = readPointer(); 231 T L1 = pData[iPart-1]; 232 T L0 = pData[iPart ]; 233 T H0 = pData[iPart+1]; 234 T H1 = pData[iPart+2]; 235 236 return L0 + 0.5f * 237 fPart*(H0-L1 + 238 fPart*(H0 + L0*(-2) + L1 + 239 fPart*( (H0 - L0)*9 + (L1 - H1)*3 + 240 fPart*((L0 - H0)*15 + (H1 - L1)*5 + 241 fPart*((H0 - L0)*6 + (L1 - H1)*2 ))))); 242 } 243 244 /// 4th order spline interpolation 245 /// http://musicdsp.org/showArchiveComment.php?ArchiveID=60 246 T sampleSpline4(float delay) pure const 247 { 248 assert(delay > 2); 249 int iPart; 250 float fPart; 251 decomposeFractionalDelay(delay, iPart, fPart); 252 253 align(16) __gshared static immutable float[8][5] MAT = 254 [ 255 [ 2.0f / 24, -16.0f / 24, 0.0f / 24, 16.0f / 24, -2.0f / 24, 0.0f / 24, 0.0f, 0.0f ], 256 [ -1.0f / 24, 16.0f / 24, -30.0f / 24, 16.0f / 24, -1.0f / 24, 0.0f / 24, 0.0f, 0.0f ], 257 [ -9.0f / 24, 39.0f / 24, -70.0f / 24, 66.0f / 24, -33.0f / 24, 7.0f / 24, 0.0f, 0.0f ], 258 [ 13.0f / 24, -64.0f / 24, 126.0f / 24, -124.0f / 24, 61.0f / 24, -12.0f / 24, 0.0f, 0.0f ], 259 [ -5.0f / 24, 25.0f / 24, -50.0f / 24, 50.0f / 24, -25.0f / 24, 5.0f / 24, 0.0f, 0.0f ] 260 ]; 261 import inteli.emmintrin; 262 __m128 pFactor0_3 = _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f); 263 __m128 pFactor4_7 = _mm_setzero_ps(); 264 265 __m128 XMM_fPart = _mm_set1_ps(fPart); 266 __m128 weight = XMM_fPart; 267 pFactor0_3 = _mm_add_ps(pFactor0_3, _mm_load_ps(&MAT[0][0]) * weight); 268 pFactor4_7 = _mm_add_ps(pFactor4_7, _mm_load_ps(&MAT[0][4]) * weight); 269 weight = _mm_mul_ps(weight, XMM_fPart); 270 pFactor0_3 = _mm_add_ps(pFactor0_3, _mm_load_ps(&MAT[1][0]) * weight); 271 pFactor4_7 = _mm_add_ps(pFactor4_7, _mm_load_ps(&MAT[1][4]) * weight); 272 weight = _mm_mul_ps(weight, XMM_fPart); 273 pFactor0_3 = _mm_add_ps(pFactor0_3, _mm_load_ps(&MAT[2][0]) * weight); 274 pFactor4_7 = _mm_add_ps(pFactor4_7, _mm_load_ps(&MAT[2][4]) * weight); 275 weight = _mm_mul_ps(weight, XMM_fPart); 276 pFactor0_3 = _mm_add_ps(pFactor0_3, _mm_load_ps(&MAT[3][0]) * weight); 277 pFactor4_7 = _mm_add_ps(pFactor4_7, _mm_load_ps(&MAT[3][4]) * weight); 278 weight = _mm_mul_ps(weight, XMM_fPart); 279 pFactor0_3 = _mm_add_ps(pFactor0_3, _mm_load_ps(&MAT[4][0]) * weight); 280 pFactor4_7 = _mm_add_ps(pFactor4_7, _mm_load_ps(&MAT[4][4]) * weight); 281 282 float[8] pfactor = void; 283 _mm_storeu_ps(&pfactor[0], pFactor0_3); 284 _mm_storeu_ps(&pfactor[4], pFactor4_7); 285 286 T result = 0; 287 const(T)* pData = readPointer(); 288 foreach(n; 0..6) 289 result += pData[iPart-2 + n] * pfactor[n]; 290 return result; 291 }; 292 } 293 294 private: 295 T[] _data; 296 int _index; 297 int _half; // half the size of the data 298 int _indexMask; 299 int _numSamples; 300 301 void decomposeFractionalDelay(float delay, 302 out int outIntegerPart, 303 out float outFloatPart) pure const 304 { 305 // Because float index can yield suprising low precision with interpolation 306 // So we up the precision to double in order to have a precise fractional part 307 int offset = cast(int)(_data.length); 308 double doubleDelayMinus = cast(double)(-delay); 309 int iPart = cast(int)(doubleDelayMinus + offset); 310 iPart -= offset; 311 float fPart = cast(float)(doubleDelayMinus - iPart); 312 assert(fPart >= 0.0f); 313 assert(fPart <= 1.0f); 314 outIntegerPart = iPart; 315 outFloatPart = fPart; 316 } 317 } 318 319 unittest 320 { 321 Delayline!float line; 322 line.initialize(0); // should be possible 323 assert(line.nextSample(1) == 1); 324 325 Delayline!double line2; 326 327 Delayline!float line3; 328 line3.initialize(2); 329 assert(line3.nextSample(1) == 0); 330 assert(line3.nextSample(2) == 0); 331 assert(line3.nextSample(3) == 1); 332 assert(line3.nextSample(42) == 2); 333 334 assert(line3.sampleFull(0) == 42); 335 assert(line3.sampleFull(1) == 3); 336 assert(line3.sampleLinear(0.5f) == (3.0f + 42.0f) * 0.5f); 337 } 338 339 // See Issue #607, usability of feedBuffer. 340 unittest 341 { 342 343 float[256] zeroes; 344 float[256] data; 345 float[256] delayed; 346 foreach (n; 0..256) 347 { 348 data[n] = cast(float)n; 349 zeroes[n] = 0.0f; 350 } 351 352 // Delay of 256 samples, using `nextBuffer`. 353 { 354 Delayline!float d; 355 d.initialize(256); 356 d.nextBuffer(data.ptr, delayed.ptr, 256); 357 assert(delayed == zeroes); 358 d.nextBuffer(zeroes.ptr, delayed.ptr, 256); 359 assert(delayed == data); 360 } 361 362 // It should be possible to use feedBuffer to delay of 256 amount too. 363 { 364 int desiredDelay = 256; 365 Delayline!float d; 366 d.initialize(256); 367 int frames = 256; 368 d.feedBuffer(data.ptr, frames); 369 const(float)* readPtr = d.readPointer() - desiredDelay - frames + 1; 370 delayed[0..frames] = readPtr[0..frames]; 371 assert(delayed == zeroes); 372 373 d.feedBuffer(zeroes.ptr, frames); 374 readPtr = d.readPointer() - desiredDelay - frames + 1; 375 delayed[0..frames] = readPtr[0..frames]; 376 assert(delayed == data); 377 } 378 }