1 /** 2 * Delay-line implementation. 3 * Copyright: Guillaume Piolat 2015-2024. 4 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 5 */ 6 module dplug.dsp.delayline; 7 8 import core.stdc.string; 9 10 import dplug.core.nogc; 11 import dplug.core.math; 12 import dplug.core.vec; 13 14 import inteli.emmintrin; 15 16 /// Allow to sample signal back in time. 17 /// This delay-line has a twin write index, so that the read pointer 18 /// can read a contiguous memory area. 19 /// ____________________________________________________________________________________ 20 /// | | _index | | readPointer = _index + half size | | 21 /// ------------------------------------------------------------------------------------ 22 /// 23 /// A Delayline is initialized with an internal length of N = numSamples, 24 /// in order to do a simple delay of N samples. 25 /// Internally, the delayline actually has 2 x nextPow2(N + 1) samples of storage. 26 /// So typically a delay line is initialized with maxDelaySamples + maxFrames (if buffering is used).. 27 /// 28 /// Example: 29 /// --- 30 /// import dplug.dsp.delayline; 31 /// 32 /// void delaySampleBySample() // slower method, but easier to be correct 33 /// { 34 /// Delayline!float delayline; 35 /// delayline.initialize(maxPossibleDelay); 36 /// for (int n = 0; n < frames; ++n) 37 /// { 38 /// delayline.feedSample(input[n]); 39 /// 40 /// // desiredDelay = 0 would be the sample we just fed 41 /// // the delayline with. 42 /// delayed[n] = delayline.fullSample(desiredDelay); 43 /// } 44 /// } 45 /// 46 /// void delayUsingReadPointer() // fastest method, but more confusing 47 /// { 48 /// Delayline!float delayline; 49 /// delayline.initialize(maxFrames + maxPossibleDelay); 50 /// delayline.feedBuffer(input.ptr, frames); 51 /// const(float)* readPtr = d.readPointer() - desiredDelay - frames + 1; 52 /// delayed[0..frames] = readPtr[0..frames]; 53 /// // Caveats: frames <= maxFrames and desiredDelay <= maxPossibleDelay. 54 /// } 55 /// --- 56 struct Delayline(T) 57 { 58 public: 59 nothrow: 60 @nogc: 61 62 /// Initialize the delay line. Can delay up to `numSamples` samples. 63 void initialize(int numSamples) 64 { 65 resize(numSamples); 66 } 67 68 ~this() 69 { 70 _data.reallocBuffer(0); 71 } 72 73 @disable this(this); 74 75 /// Resize the delay line. Can delay up to count samples. 76 /// The state is cleared. 77 void resize(int numSamples) 78 { 79 if (numSamples < 0) 80 assert(false); 81 82 // Over-allocate to support POW2 delaylines. 83 // This wastes memory but allows delay-line of length 0 without tests. 84 // The reason to add +1 here is that fundamentally in a delay line of length = 1 85 // we want to keep track of the current sample (at delay 0) and the former one 86 // (at delay 1). That's two samples. 87 88 int toAllocate = nextPow2HigherOrEqual(numSamples + 1); 89 _data.reallocBuffer(toAllocate * 2); 90 _half = toAllocate; 91 _indexMask = toAllocate - 1; 92 _numSamples = numSamples; 93 _index = _indexMask; 94 95 _data[] = 0; 96 } 97 98 /// Adds a new sample at end of delay. 99 void feedSample(T incoming) pure 100 { 101 _index = (_index + 1) & _indexMask; 102 _data.ptr[_index] = incoming; 103 _data.ptr[_index + _half] = incoming; 104 } 105 106 107 108 /// Random access sampling of the delay-line at integer points. 109 /// Delay 0 = last entered sample with feed(). 110 T sampleFull(int delay) pure 111 { 112 assert(delay >= 0); 113 return readPointer()[-delay]; 114 } 115 116 /// Random access sampling of the delay-line at integer points, extract a time slice. 117 /// Delay 0 = last entered sample with feed(). 118 void sampleFullBuffer(int delayOfMostRecentSample, T* outBuffer, int frames) pure 119 { 120 assert(delayOfMostRecentSample >= 0); 121 const(T*) p = readPointer(); 122 const(T*) source = &readPointer[-delayOfMostRecentSample - frames + 1]; 123 size_t numBytes = frames * T.sizeof; 124 memcpy(outBuffer, source, numBytes); 125 } 126 127 static if (is(T == float) || is(T == double)) 128 { 129 /// Random access sampling of the delay-line with linear interpolation. 130 /// Note that will the HF rollout of linear interpolation, it can 131 /// often sound quite good in 44.1 kHz 132 T sampleLinear(float delay) pure const 133 { 134 assert(delay > 0); 135 int iPart; 136 float fPart; 137 decomposeFractionalDelay(delay, iPart, fPart); 138 const(T)* pData = readPointer(); 139 T x0 = pData[iPart]; 140 T x1 = pData[iPart + 1]; 141 return lerp(x0, x1, fPart); 142 } 143 144 /// Random access sampling of the delay-line with a 3rd order polynomial. 145 T sampleHermite(float delay) pure const 146 { 147 assert(delay > 1); 148 int iPart; 149 float fPart; 150 decomposeFractionalDelay(delay, iPart, fPart); 151 const(T)* pData = readPointer(); 152 T xm1 = pData[iPart-1]; 153 T x0 = pData[iPart ]; 154 T x1 = pData[iPart+1]; 155 T x2 = pData[iPart+2]; 156 return hermite!T(fPart, xm1, x0, x1, x2); 157 } 158 159 /// Third-order spline interpolation 160 /// http://musicdsp.org/showArchiveComment.php?ArchiveID=62 161 T sampleSpline3(float delay) pure const 162 { 163 assert(delay > 1); 164 int iPart; 165 float fPart; 166 decomposeFractionalDelay(delay, iPart, fPart); 167 assert(fPart >= 0.0f); 168 assert(fPart <= 1.0f); 169 const(T)* pData = readPointer(); 170 T L1 = pData[iPart-1]; 171 T L0 = pData[iPart ]; 172 T H0 = pData[iPart+1]; 173 T H1 = pData[iPart+2]; 174 175 return L0 + 0.5f * 176 fPart*(H0-L1 + 177 fPart*(H0 + L0*(-2) + L1 + 178 fPart*( (H0 - L0)*9 + (L1 - H1)*3 + 179 fPart*((L0 - H0)*15 + (H1 - L1)*5 + 180 fPart*((H0 - L0)*6 + (L1 - H1)*2 ))))); 181 } 182 183 /// 4th order spline interpolation 184 /// http://musicdsp.org/showArchiveComment.php?ArchiveID=60 185 T sampleSpline4(float delay) pure const 186 { 187 assert(delay > 2); 188 int iPart; 189 float fPart; 190 decomposeFractionalDelay(delay, iPart, fPart); 191 192 align(16) __gshared static immutable float[8][5] MAT = 193 [ 194 [ 2.0f / 24, -16.0f / 24, 0.0f / 24, 16.0f / 24, -2.0f / 24, 0.0f / 24, 0.0f, 0.0f ], 195 [ -1.0f / 24, 16.0f / 24, -30.0f / 24, 16.0f / 24, -1.0f / 24, 0.0f / 24, 0.0f, 0.0f ], 196 [ -9.0f / 24, 39.0f / 24, -70.0f / 24, 66.0f / 24, -33.0f / 24, 7.0f / 24, 0.0f, 0.0f ], 197 [ 13.0f / 24, -64.0f / 24, 126.0f / 24, -124.0f / 24, 61.0f / 24, -12.0f / 24, 0.0f, 0.0f ], 198 [ -5.0f / 24, 25.0f / 24, -50.0f / 24, 50.0f / 24, -25.0f / 24, 5.0f / 24, 0.0f, 0.0f ] 199 ]; 200 201 __m128 pFactor0_3 = _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f); 202 __m128 pFactor4_7 = _mm_setzero_ps(); 203 204 __m128 XMM_fPart = _mm_set1_ps(fPart); 205 __m128 weight = XMM_fPart; 206 pFactor0_3 = _mm_add_ps(pFactor0_3, _mm_load_ps(&MAT[0][0]) * weight); 207 pFactor4_7 = _mm_add_ps(pFactor4_7, _mm_load_ps(&MAT[0][4]) * weight); 208 weight = _mm_mul_ps(weight, XMM_fPart); 209 pFactor0_3 = _mm_add_ps(pFactor0_3, _mm_load_ps(&MAT[1][0]) * weight); 210 pFactor4_7 = _mm_add_ps(pFactor4_7, _mm_load_ps(&MAT[1][4]) * weight); 211 weight = _mm_mul_ps(weight, XMM_fPart); 212 pFactor0_3 = _mm_add_ps(pFactor0_3, _mm_load_ps(&MAT[2][0]) * weight); 213 pFactor4_7 = _mm_add_ps(pFactor4_7, _mm_load_ps(&MAT[2][4]) * weight); 214 weight = _mm_mul_ps(weight, XMM_fPart); 215 pFactor0_3 = _mm_add_ps(pFactor0_3, _mm_load_ps(&MAT[3][0]) * weight); 216 pFactor4_7 = _mm_add_ps(pFactor4_7, _mm_load_ps(&MAT[3][4]) * weight); 217 weight = _mm_mul_ps(weight, XMM_fPart); 218 pFactor0_3 = _mm_add_ps(pFactor0_3, _mm_load_ps(&MAT[4][0]) * weight); 219 pFactor4_7 = _mm_add_ps(pFactor4_7, _mm_load_ps(&MAT[4][4]) * weight); 220 221 float[8] pfactor = void; 222 _mm_storeu_ps(&pfactor[0], pFactor0_3); 223 _mm_storeu_ps(&pfactor[4], pFactor4_7); 224 225 T result = 0; 226 const(T)* pData = readPointer(); 227 foreach(n; 0..6) 228 result += pData[iPart-2 + n] * pfactor[n]; 229 return result; 230 } 231 } 232 233 /// Adds several samples at end of delay. 234 void feedBuffer(const(T)[] incoming) pure 235 { 236 int N = cast(int)(incoming.length); 237 238 // Note: it is legal to overfeed the delayline, in case of large 239 // maxFrames for example. Though this is normally unexpected, but it's 240 // useful when using silence detection. 241 242 if (N > _numSamples) 243 { 244 N = _numSamples; 245 incoming = incoming[$-N .. $]; 246 } 247 248 // remaining samples before end of delayline 249 int remain = _indexMask - _index; 250 251 if (N == 0) 252 { 253 return; 254 } 255 else if (N <= remain) 256 { 257 memcpy( &_data[_index + 1], incoming.ptr, N * T.sizeof ); 258 memcpy( &_data[_index + 1 + _half], incoming.ptr, N * T.sizeof ); 259 _index += N; 260 } 261 else 262 { 263 if (remain != 0) 264 { 265 memcpy(_data.ptr + (_index+1), incoming.ptr, remain * T.sizeof ); 266 memcpy(_data.ptr + (_index+1) + _half, incoming.ptr, remain * T.sizeof); 267 } 268 size_t numBytes = (N - remain) * T.sizeof; 269 memcpy( _data.ptr, incoming.ptr + remain, numBytes); 270 memcpy( _data.ptr + _half, incoming.ptr + remain, numBytes); 271 _index = (_index + N) & _indexMask; 272 } 273 } 274 ///ditto 275 void feedBuffer(const(T)* incoming, size_t count) pure 276 { 277 feedBuffer(incoming[0..count]); 278 } 279 280 /// Returns: A pointer which allow to get delayed values. 281 /// readPointer()[0] is the last samples fed, 282 /// readPointer()[-1] is the penultimate. 283 /// Warning: it goes backwards, increasing delay => decreasing addressed. 284 const(T)* readPointer() pure const 285 { 286 return _data.ptr + _index + _half; 287 } 288 289 /// Combined feed + sampleFull. 290 /// Uses the delay line as a fixed delay of count samples. 291 /// 292 /// This is normally very rare to need this vs separate `sampleXXX` and 293 /// `feedSample`. 294 T nextSample(T incoming) pure 295 { 296 feedSample(incoming); 297 return sampleFull(_numSamples); 298 } 299 300 /// Combined feed + sampleFull. 301 /// Uses the delay line as a fixed delay of count samples. 302 /// 303 /// Note: input and output may overlap. 304 /// If this was ever optimized, this should preserve that property. 305 /// 306 /// This is normally very rare to need this vs separate `sampleXXX` and 307 /// `feedBuffer`. 308 void nextBuffer(const(T)* input, T* output, int frames) pure 309 { 310 for(int i = 0; i < frames; ++i) 311 output[i] = nextSample(input[i]); 312 } 313 314 private: 315 T[] _data; 316 int _index; 317 int _half; // half the size of the data 318 int _indexMask; 319 int _numSamples; 320 321 void decomposeFractionalDelay(float delay, 322 out int outIntegerPart, 323 out float outFloatPart) pure const 324 { 325 // Because float index can yield suprising low precision with interpolation 326 // So we up the precision to double in order to have a precise fractional part 327 int offset = cast(int)(_data.length); 328 double doubleDelayMinus = cast(double)(-delay); 329 int iPart = cast(int)(doubleDelayMinus + offset); 330 iPart -= offset; 331 float fPart = cast(float)(doubleDelayMinus - iPart); 332 assert(fPart >= 0.0f); 333 assert(fPart <= 1.0f); 334 outIntegerPart = iPart; 335 outFloatPart = fPart; 336 } 337 } 338 339 unittest 340 { 341 Delayline!float line; 342 line.initialize(0); // should be possible 343 assert(line.nextSample(1) == 1); 344 Delayline!double line2; 345 346 Delayline!float line3; 347 line3.initialize(2); 348 assert(line3.nextSample(1) == 0); 349 assert(line3.nextSample(2) == 0); 350 assert(line3.nextSample(3) == 1); 351 assert(line3.nextSample(42) == 2); 352 assert(line3.sampleFull(0) == 42); 353 assert(line3.sampleFull(1) == 3); 354 assert(line3.sampleLinear(0.5f) == (3.0f + 42.0f) * 0.5f); 355 } 356 357 // See Issue #607, usability of feedBuffer. 358 unittest 359 { 360 float[256] zeroes; 361 float[256] data; 362 float[256] delayed; 363 foreach (n; 0..256) 364 { 365 data[n] = cast(float)n; 366 zeroes[n] = 0.0f; 367 } 368 369 // Delay of 256 samples, using `nextBuffer`. 370 { 371 Delayline!float d; 372 d.initialize(256); 373 d.nextBuffer(data.ptr, delayed.ptr, 256); 374 assert(delayed == zeroes); 375 d.nextBuffer(zeroes.ptr, delayed.ptr, 256); 376 assert(delayed == data); 377 } 378 379 // It should be possible to use feedBuffer to delay of 256 amount too. 380 { 381 int desiredDelay = 256; 382 Delayline!float d; 383 d.initialize(256); 384 int frames = 256; 385 d.feedBuffer(data.ptr, frames); 386 const(float)* readPtr = d.readPointer() - desiredDelay - frames + 1; 387 delayed[0..frames] = readPtr[0..frames]; 388 assert(delayed == zeroes); 389 390 d.feedBuffer(zeroes.ptr, frames); 391 readPtr = d.readPointer() - desiredDelay - frames + 1; 392 delayed[0..frames] = readPtr[0..frames]; 393 assert(delayed == data); 394 } 395 } 396 397 // Issue 846, feeding a buffer larger than the delay line length. 398 // It's useful for testing effects in isolation, in a way that may 399 // have large maxFrames. 400 unittest 401 { 402 int[256] data; 403 for (int n = 0; n < 256; ++n) 404 data[n] = n; 405 Delayline!int d; 406 d.initialize(128); 407 d.feedBuffer(data[0..256]); // now work, only data[128..256] considered 408 for (int n = 0; n < 128; ++n) 409 assert(d.sampleFull(n) == 255 - n); 410 } 411 412 /// Simplified delay line, mostly there to compensate latency manually. 413 /// No interpolation and no delay change while playing. 414 struct SimpleDelay(T) 415 { 416 public: 417 nothrow: 418 @nogc: 419 420 enum MAX_CHANNELS = 2; // current limitation 421 422 void initialize(int numChans, int maxFrames, int delayInSamples) 423 { 424 assert(numChans <= MAX_CHANNELS); 425 assert(_delayInSamples >= 0); 426 _delayInSamples = delayInSamples; 427 _numChans = numChans; 428 if (_delayInSamples > 0) 429 { 430 for (int chan = 0; chan < _numChans; ++chan) 431 { 432 _delay[chan].initialize(maxFrames + delayInSamples + 1); // not sure if the +1 is still needed, or why. It is part of culture now. 433 } 434 } 435 } 436 437 /// Just a reminder, to compute this processor latency. 438 static int latencySamples(int delayInSamples) pure 439 { 440 return delayInSamples; 441 } 442 443 /// Process samples, single channel version. 444 void nextBufferMono(const(T)* input, T* output, int frames) 445 { 446 assert(_numChans == 1); 447 const(T)*[1] inputs; 448 T*[1] outputs; 449 450 inputs[0] = input; 451 outputs[0] = output; 452 nextBuffer(inputs.ptr, outputs.ptr, frames); 453 } 454 ///ditto 455 void nextBufferMonoInPlace(T* inoutSamples, int frames) 456 { 457 assert(_numChans == 1); 458 if (_delayInSamples == 0) 459 return; 460 const(T)*[1] inputs; 461 T*[1] outputs; 462 inputs[0] = inoutSamples; 463 outputs[0] = inoutSamples; 464 nextBuffer(inputs.ptr, outputs.ptr, frames); 465 } 466 467 /// Process samples, multichannel version. 468 /// Note: input and output buffers can overlap, or even be the same. 469 void nextBuffer(const(T*)* inputs, T** output, int frames) 470 { 471 for (int chan = 0; chan < _numChans; ++chan) 472 { 473 if (_delayInSamples == 0) 474 { 475 // Since the two can overlap, use memmove. 476 memmove(output[chan], inputs[chan], frames * T.sizeof); 477 } 478 else 479 { 480 _delay[chan].feedBuffer(inputs[chan], frames); 481 const(T)* readPtr = _delay[chan].readPointer() - _delayInSamples - frames + 1; 482 output[chan][0..frames] = readPtr[0..frames]; 483 } 484 } 485 } 486 ///ditto 487 void nextBufferInPlace(T** inoutSamples, int frames) 488 { 489 if (_delayInSamples == 0) 490 return; 491 nextBuffer(inoutSamples, inoutSamples, frames); 492 } 493 494 private: 495 int _numChans; 496 int _delayInSamples; 497 Delayline!T[MAX_CHANNELS] _delay; 498 } 499 500 /// A delay that resyncs latency of two signals when it's not clear which has 501 /// more latency. This is a building block for internal latency compensation. 502 /// 503 /// Input: | | 504 /// A with latency LA, B with latency LB | A | B 505 /// V V 506 /// ____________LatencyResync___________ 507 /// | Delayline of L1 = max(LB - LA, 0) | 508 /// | Delayline of L2 = max(LA - LB, 0) | 509 /// |____________________________________| 510 /// | | 511 /// Output: | | 512 /// Two aligned signal, latency = max(LA, LB) | A | B 513 /// V V 514 struct LatencyResync(T) 515 { 516 public: 517 nothrow: 518 @nogc: 519 void initialize(int numChans, int maxFrames, int latencySamplesA, int latencySamplesB) 520 { 521 int L1 = latencySamplesB - latencySamplesA; 522 int L2 = latencySamplesA - latencySamplesB; 523 if (L1 < 0) L1 = 0; 524 if (L2 < 0) L2 = 0; 525 _delayA.initialize(numChans, maxFrames, L1); 526 _delayB.initialize(numChans, maxFrames, L2); 527 } 528 529 /// Just a reminder, to compute this processor latency. 530 static int latencySamples(int latencySamplesA, int latencySamplesB) 531 { 532 return latencySamplesA > latencySamplesB ? latencySamplesA : latencySamplesB; 533 } 534 535 /// Process mono inputs, help function. 536 void nextBufferMono(const(T)* inputA, const(T)* inputB, T* outputA, T* outputB, int frames) 537 { 538 _delayA.nextBufferMono(inputA, outputA, frames); 539 _delayB.nextBufferMono(inputB, outputB, frames); 540 } 541 ///ditto 542 void nextBufferMonoInPlace(T* inoutASamples, T* inoutBSamples, int frames) 543 { 544 _delayA.nextBufferMonoInPlace(inoutASamples, frames); 545 _delayB.nextBufferMonoInPlace(inoutBSamples, frames); 546 } 547 548 /// Process buffers. A and B signal gets aligned with regards to their relative latency. 549 void nextBuffer(const(T)** inputsA, const(T)** inputsB, T** outputsA, T** outputsB, int frames) 550 { 551 _delayA.nextBuffer(inputsA, outputsA, frames); 552 _delayB.nextBuffer(inputsB, outputsB, frames); 553 } 554 ///ditto 555 void nextBufferInPlace(T** inoutASamples, T** inoutBSamples, int frames) 556 { 557 _delayA.nextBufferInPlace(inoutASamples, frames); 558 _delayB.nextBufferInPlace(inoutBSamples, frames); 559 } 560 561 private: 562 SimpleDelay!T _delayA; 563 SimpleDelay!T _delayB; 564 } 565 566 unittest 567 { 568 { 569 double[4] A = [0.0, 3, 0, 0]; 570 double[4] B = [0.0, 0, 2, 0]; 571 LatencyResync!double lr; 572 int numChans = 1; 573 int maxFrames = 4; 574 int latencyA = 1; 575 int latencyB = 2; 576 lr.initialize(numChans, maxFrames, latencyA, latencyB); 577 lr.nextBufferMono(A.ptr, B.ptr, A.ptr, B.ptr, 4); 578 assert(A == [0.0, 0, 3, 0]); 579 assert(B == [0.0, 0, 2, 0]); 580 } 581 582 { 583 double[4] A = [0.0, 0, 3, 9]; 584 double[4] B = [2.0, 0, 0, 8]; 585 LatencyResync!double lr; 586 int numChans = 1; 587 int maxFrames = 4; 588 int latencyA = 2; 589 int latencyB = 0; 590 lr.initialize(numChans, maxFrames, latencyA, latencyB); 591 lr.nextBufferMonoInPlace(A.ptr, B.ptr, 3); 592 assert(A == [0.0, 0, 3, 9]); 593 assert(B == [0.0, 0, 2, 8]); 594 } 595 }