/**
Mipmap pyramid implementation.

Copyright: Guillaume Piolat 2015-2023.
License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module dplug.graphics.mipmap;

import dplug.math.vector;
import dplug.math.box;
import dplug.graphics.image;
import dplug.core.nogc;
import dplug.core.vec;

import inteli.smmintrin;

version( D_InlineAsm_X86 )
{
    version = AsmX86;
}
else version( D_InlineAsm_X86_64 )
{
    version = AsmX86;
}

/// Mipmapped images.
/// Supports non power-of-two textures.
/// Size of the i+1-th mipmap is { (width)/2, (height)/2 }
/// The mipmap owns each of its levels.
final class Mipmap(COLOR) if (is(COLOR == RGBA) || is(COLOR == L16) || is(COLOR == RGBA16) )
{
public:
nothrow:
@nogc:

    /// Downsampling filter used when generating a level from the previous one.
    enum Quality
    {
        box,   // simple 2x2 filter, creates phase problems with NPOT. For higher levels, automatically uses cubic.
        cubic, // Very smooth kernel [1 3 3 1] x [1 3 3 1]

        /// Box-filter, and after such a step the next level is alpha-premultiplied.
        /// This is intended for the first level 0 to level 1 transition, in case of bloom.
        /// Within version(futurePBREmissive), this also transitions to linear space to have
        /// more natural highlights.
        boxAlphaCovIntoPremul,
    }

    /// The levels of the pyramid, `levels[0]` being the full-size base image.
    Vec!(OwnedImage!COLOR) levels;

    /// Creates empty
    this()
    {
        levels = makeVec!(OwnedImage!COLOR)();
    }

    /// Set number of levels and size
    /// maxLevel = 0 => only one image
    /// maxLevel = 1 => one image + one 2x downsampled mipmap
    /// etc...
    this(int maxLevel, int w, int h)
    {
        this();
        size(maxLevel, w, h);
    }


    /// Creates a Mipmap out of a flat OwnedImage.
    /// This takes ownership of the given image, which is now owned by the `Mipmap`.
    this(int maxLevel, OwnedImage!COLOR level0)
    {
        //PERF: could avoid to create the 0th level only to replace it later

        this(maxLevel, level0.w, level0.h);

        // replaces level 0
        levels[0].destroyFree();
        levels[0] = level0;
    }

    /// Sets the maximum level and the size of the base level, (re)allocating
    /// the levels as needed.
    /// Fewer than `maxLevel + 1` levels are created if a dimension reaches zero
    /// before `maxLevel` is attained.
    /// If fewer levels than currently allocated are needed, the excess levels
    /// are destroyed and freed.
    void size(int maxLevel, int w, int h)
    {
        // find number of needed levels
        int neededLevels = 0;
        {
            int wr = w;
            int hr = h;
            for (; neededLevels <= maxLevel; ++neededLevels)
            {
                if (wr == 0 || hr == 0)
                    break;
                wr  = (wr + 0) >> 1;
                hr  = (hr + 0) >> 1;
            }
        }

        void setLevels(int numLevels)
        {
            int previousLength = cast(int)levels.length;

            // Release the levels that are not needed anymore. This happens
            // when the mipmap is resized so that fewer levels fit.
            for (int level = numLevels; level < previousLength; ++level)
            {
                levels[level].destroyFree();
            }

            levels.resize(numLevels);

            // create empty image for new levels
            for(int level = previousLength; level < numLevels; ++level)
            {
                levels[level] = mallocNew!(OwnedImage!COLOR)();
            }
        }

        setLevels(neededLevels);

        // resize levels
        for (int level = 0; level < neededLevels; ++level)
        {
            assert(w != 0 && h != 0);
            levels[level].size(w, h);
            w  = (w + 0) >> 1;
            h  = (h + 0) >> 1;
        }
    }

    /// Destroys and frees every level.
    ~this()
    {
        foreach(level; levels)
            level.destroyFree();
    }

    /// Interpolates a color between mipmap levels.  Floating-point level, spatial linear interpolation.
    /// x and y are in base level coordinates (top-left pixel is on (0.5, 0.5) coordinates).
    /// Clamped to borders.
    auto linearMipmapSample(float level, float x, float y) nothrow @nogc
    {
        int ilevel = cast(int)level;
        float flevel = level - ilevel;
        vec4f levelN = linearSample(ilevel, x, y);
        if (flevel == 0)
            return levelN;

        auto levelNp1 = linearSample(ilevel + 1, x, y);

        return levelN * (1 - flevel) + levelNp1 * flevel;
    }

    /// Cubic filtering mode, using a Catmull-Rom bicubic filter.
    /// Integer level, spatial cubic interpolation over a 4x4 neighbourhood.
    /// x and y are in base level coordinates (top-left pixel is on (0.5, 0.5) coordinates).
    /// Clamped to borders. `level` is clamped to the existing levels.
    /// Returns: a `float` for `L16`, else a `vec4f` (r, g, b, a). Each component is
    ///          clamped to the 0 to 65535 range.
    /// Reference: https://registry.khronos.org/OpenGL/extensions/IMG/IMG_texture_filter_cubic.txt
    auto cubicSample(int level, float x, float y) nothrow @nogc
    {
        if (level < 0)
            level = 0;
        int numLevels = cast(int)levels.length;
        if (level >= numLevels)
            level = numLevels - 1;

        OwnedImage!COLOR image = levels[level];

        // Go from base level coordinates to this level's pixel coordinates.
        float divider = levelScale(level);
        x = x * divider - 0.5f;
        y = y * divider - 0.5f;

        // Indices of the 4 columns and 4 rows to read, clamped inside the image.
        __m128 mm0123 = _mm_setr_ps(-1, 0, 1, 2);
        __m128i x_indices = _mm_cvttps_epi32( _mm_set1_ps(x) + mm0123);
        __m128i y_indices = _mm_cvttps_epi32( _mm_set1_ps(y) + mm0123);
        __m128i zero = _mm_setzero_si128();
        x_indices = _mm_max_epi32(x_indices, zero);
        y_indices = _mm_max_epi32(y_indices, zero);
        x_indices = _mm_min_epi32(x_indices, _mm_set1_epi32(image.w-1));
        y_indices = _mm_min_epi32(y_indices, _mm_set1_epi32(image.h-1));

        int i0 = x_indices.array[0];
        int i1 = x_indices.array[1];
        int i2 = x_indices.array[2];
        int i3 = x_indices.array[3];

        // fractional part
        float a = x + 1.0f;
        float b = y + 1.0f;
        a = a - cast(int)(a);
        b = b - cast(int)(b);
        assert(a >= -0.01 && a <= 1.01);
        assert(b >= -0.01 && b <= 1.01);

        COLOR*[4] L = void;
        L[0] = image.scanlinePtr(y_indices.array[0]);
        L[1] = image.scanlinePtr(y_indices.array[1]);
        L[2] = image.scanlinePtr(y_indices.array[2]);
        L[3] = image.scanlinePtr(y_indices.array[3]);

        static if (is(COLOR == L16))
        {
            static float clamp_0_to_65535(float a)
            {
                if (a < 0) a = 0;
                if (a > 65535) a = 65535;
                return a;
            }
            static cubicInterp(float t, float x0, float x1, float x2, float x3) pure nothrow @nogc
            {
                // PERF: doesn't sound that great???
                return x1
                    + t * ((-0.5f * x0) + (0.5f * x2))
                    + t * t * (x0 - (2.5f * x1) + (2.0f * x2) - (0.5f * x3))
                    + t * t * t * ((-0.5f * x0) + (1.5f * x1) - (1.5f * x2) + 0.5f * x3);
            }

            // Interpolate horizontally in each of the 4 rows, then vertically.
            float[4] R;
            for (int row = 0; row < 4; ++row)
            {
                COLOR* pRow = L[row];
                COLOR ri0jn = pRow[i0];
                COLOR ri1jn = pRow[i1];
                COLOR ri2jn = pRow[i2];
                COLOR ri3jn = pRow[i3];
                float A = ri0jn.l;
                float B = ri1jn.l;
                float C = ri2jn.l;
                float D = ri3jn.l;
                R[row] = cubicInterp(a, A, B, C, D);
            }
            return clamp_0_to_65535(cubicInterp(b, R[0], R[1], R[2], R[3]));
        }
        else
        {
            // actually optimized ok by LDC
            static vec4f clamp_0_to_65535(vec4f a)
            {
                if (a[0] < 0) a[0] = 0;
                if (a[1] < 0) a[1] = 0;
                if (a[2] < 0) a[2] = 0;
                if (a[3] < 0) a[3] = 0;
                if (a[0] > 65535) a[0] = 65535;
                if (a[1] > 65535) a[1] = 65535;
                if (a[2] > 65535) a[2] = 65535;
                if (a[3] > 65535) a[3] = 65535;
                return a;
            }

            static cubicInterp(float t, vec4f x0, vec4f x1, vec4f x2, vec4f x3) pure nothrow @nogc
            {
                // PERF: doesn't sound that great???
                return x1
                    + t * ((-0.5f * x0) + (0.5f * x2))
                    + t * t * (x0 - (2.5f * x1) + (2.0f * x2) - (0.5f * x3))
                    + t * t * t * ((-0.5f * x0) + (1.5f * x1) - (1.5f * x2) + 0.5f * x3);
            }

            // Interpolate horizontally in each of the 4 rows, then vertically.
            vec4f[4] R = void;
            for (int row = 0; row < 4; ++row)
            {
                COLOR* pRow = L[row];
                COLOR ri0jn = pRow[i0];
                COLOR ri1jn = pRow[i1];
                COLOR ri2jn = pRow[i2];
                COLOR ri3jn = pRow[i3];
                vec4f A = vec4f(ri0jn.r, ri0jn.g, ri0jn.b, ri0jn.a);
                vec4f B = vec4f(ri1jn.r, ri1jn.g, ri1jn.b, ri1jn.a);
                vec4f C = vec4f(ri2jn.r, ri2jn.g, ri2jn.b, ri2jn.a);
                vec4f D = vec4f(ri3jn.r, ri3jn.g, ri3jn.b, ri3jn.a);
                R[row] = cubicInterp(a, A, B, C, D);
            }
            return clamp_0_to_65535(cubicInterp(b, R[0], R[1], R[2], R[3]));
        }
    }


    /// Interpolates a color.  Integer level, spatial linear interpolation.
    /// x and y are in base level coordinates (top-left pixel is on (0.5, 0.5) coordinates).
    /// Clamped to borders. `level` is clamped to the existing levels.
    /// Returns: a `float` for `L16`, else a `vec4f` (r, g, b, a), in the
    ///          same numeric range as the stored channels.
    auto linearSample(int level, float x, float y) nothrow @nogc
    {
        if (level < 0)
            level = 0;
        int numLevels = cast(int)levels.length;
        if (level >= numLevels)
            level = numLevels - 1;

        OwnedImage!COLOR image = levels[level];

        // Go from base level coordinates to this level's pixel coordinates.
        float divider = levelScale(level);
        x = x * divider - 0.5f;
        y = y * divider - 0.5f;

        if (x < 0)
            x = 0;
        if (y < 0)
            y = 0;

        __m128 floatCoords = _mm_setr_ps(x, y, 0, 0);
        __m128i truncatedCoord = _mm_cvttps_epi32(floatCoords);
        int ix = truncatedCoord.array[0];
        int iy = truncatedCoord.array[1];

        // Get fractional part
        float fx = x - ix;
        float fy = y - iy;

        const int maxX = image.w-1;
        const int maxY = image.h-1;
        if (ix > maxX)
            ix = maxX;
        if (iy > maxY)
            iy = maxY;

        int ixp1 = ix + 1;
        int iyp1 = iy + 1;
        if (ixp1 > maxX)
            ixp1 = maxX;
        if (iyp1 > maxY)
            iyp1 = maxY;

        float fxm1 = 1 - fx;
        float fym1 = 1 - fy;

        COLOR* L0 = image.scanlinePtr(iy);
        COLOR* L1 = image.scanlinePtr(iyp1);

        // The 2x2 neighbourhood:
        // A B
        // C D
        COLOR A = L0[ix];
        COLOR B = L0[ixp1];
        COLOR C = L1[ix];
        COLOR D = L1[ixp1];

        static if (is(COLOR == RGBA))
        {
            version(LDC)
            {
                int Ai = *cast(int*)(&A);
                int Bi = *cast(int*)(&B);
                int Ci = *cast(int*)(&C);
                int Di = *cast(int*)(&D);

                __m128i mmZero = _mm_setzero_si128();
                __m128i mmABCD = _mm_setr_epi32(Ai, Bi, Ci, Di);

                // Convert to float of the form (R, G, B, A)
                __m128i mmAB = _mm_unpacklo_epi8(mmABCD, mmZero);
                __m128i mmCD = _mm_unpackhi_epi8(mmABCD, mmZero);
                __m128 vA = _mm_cvtepi32_ps( _mm_unpacklo_epi16(mmAB, mmZero));
                __m128 vB = _mm_cvtepi32_ps( _mm_unpackhi_epi16(mmAB, mmZero));
                __m128 vC = _mm_cvtepi32_ps( _mm_unpacklo_epi16(mmCD, mmZero));
                __m128 vD = _mm_cvtepi32_ps( _mm_unpackhi_epi16(mmCD, mmZero));

                __m128 vfx = _mm_set1_ps(fx);
                __m128 vfxm1 = _mm_set1_ps(fxm1);
                __m128 up = vA * vfxm1 + vB * vfx;
                __m128 down = vC * vfxm1 + vD * vfx;

                __m128 vfy = _mm_set1_ps(fy);
                __m128 vfym1 = _mm_set1_ps(fym1);
                __m128 dResult = up * vfym1 + down * vfy;
                vec4f result = void;
                _mm_storeu_ps(result.ptr, dResult);
                return result;

            }
            else version( AsmX86 )
            {
                vec4f asmResult;

                asm nothrow @nogc
                {
                    movd XMM0, A;
                    movd XMM1, B;
                    movd XMM2, C;
                    movd XMM3, D;
                    pxor XMM4, XMM4;

                    punpcklbw XMM0, XMM4;
                    punpcklbw XMM1, XMM4;
                    punpcklbw XMM2, XMM4;
                    punpcklbw XMM3, XMM4;

                    punpcklwd XMM0, XMM4;
                    punpcklwd XMM1, XMM4;
                    punpcklwd XMM2, XMM4;
                    punpcklwd XMM3, XMM4;

                    cvtdq2ps XMM0, XMM0;
                    cvtdq2ps XMM1, XMM1;

                    cvtdq2ps XMM2, XMM2;
                    cvtdq2ps XMM3, XMM3;

                    movss XMM4, fxm1;
                    pshufd XMM4, XMM4, 0;
                    movss XMM5, fx;
                    pshufd XMM5, XMM5, 0;

                    mulps XMM0, XMM4;
                    mulps XMM1, XMM5;
                    mulps XMM2, XMM4;
                    mulps XMM3, XMM5;

                    movss XMM4, fym1;
                    pshufd XMM4, XMM4, 0;
                    movss XMM5, fy;
                    pshufd XMM5, XMM5, 0;

                    addps XMM0, XMM1;
                    addps XMM2, XMM3;

                    mulps XMM0, XMM4;
                    mulps XMM2, XMM5;

                    addps XMM0, XMM2;

                    movups asmResult, XMM0;
                }

                // Uncomment to check
    /*
                vec4f vA = vec4f(A.r, A.g, A.b, A.a);
                vec4f vB = vec4f(B.r, B.g, B.b, B.a);
                vec4f vC = vec4f(C.r, C.g, C.b, C.a);
                vec4f vD = vec4f(D.r, D.g, D.b, D.a);

                vec4f up = vA * fxm1 + vB * fx;
                vec4f down = vC * fxm1 + vD * fx;
                vec4f dResult = up * fym1 + down * fy;

                import gfm.core;

                if (dResult.distanceTo(result) < 1.0f)
                    debugBreak();
    */

                vec4f result = asmResult;
                return result;
            }
            else
            {
                vec4f vA = vec4f(A.r, A.g, A.b, A.a);
                vec4f vB = vec4f(B.r, B.g, B.b, B.a);
                vec4f vC = vec4f(C.r, C.g, C.b, C.a);
                vec4f vD = vec4f(D.r, D.g, D.b, D.a);



                vec4f up = vA * fxm1 + vB * fx;
                vec4f down = vC * fxm1 + vD * fx;
                vec4f dResult = up * fym1 + down * fy;

                // assert(dResult.distanceTo(asmResult) < 1.0f);

                return dResult;
            }
        }
        else static if (is(COLOR == L16))
        {
            float up = A.l * fxm1 + B.l * fx;
            float down = C.l * fxm1 + D.l * fx;
            return up * fym1 + down * fy;
        }
        else // RGBA16
        {
            vec4f vA = vec4f(A.r, A.g, A.b, A.a);
            vec4f vB = vec4f(B.r, B.g, B.b, B.a);
            vec4f vC = vec4f(C.r, C.g, C.b, C.a);
            vec4f vD = vec4f(D.r, D.g, D.b, D.a);

            vec4f up = vA * fxm1 + vB * fx;
            vec4f down = vC * fxm1 + vD * fx;
            vec4f result = up * fym1 + down * fy;
            return result;
        }
    }

    /// Returns: Width of the base level.
    int width() pure const nothrow @nogc
    {
        return levels[0].w;
    }

    /// Returns: Height of the base level.
    int height() pure const nothrow @nogc
    {
        return levels[0].h;
    }

    /// Returns: Number of levels. The maximum level is numLevels() - 1.
    int numLevels() pure const nothrow @nogc
    {
        return cast(int)levels.length;
    }

    /// Regenerates the whole upper levels.
    /// Does nothing if the mipmap has no level.
    void generateMipmaps(Quality quality) nothrow @nogc
    {
        // width() and height() read levels[0], which doesn't exist in an empty mipmap.
        if (numLevels() == 0)
            return;

        box2i updateRect = box2i(0, 0, width(), height());
        for (int level = 1; level < numLevels(); ++level)
        {
            // HACK: Force cubic filter past a level else it makes ugly looking mipmaps
            if (level >= 3 && quality == Quality.box)
                quality = Quality.cubic;

            updateRect = generateNextLevel(quality, updateRect, level);
        }
    }

    /// Regenerates a single mipmap level based on changes in the provided rectangle.
    /// `updateRectPreviousLevel` is expressed in the coordinates of level `level - 1`.
    /// In general if you have several subparts of mipmaps to update, make sure a level is fully completed
    /// before computing the next one.
    /// Returns: the rectangle that was regenerated, in the coordinates of `level`.
    box2i generateNextLevel(Quality quality, box2i updateRectPreviousLevel, int level) nothrow @nogc
    {
        OwnedImage!COLOR previousLevel = levels[level - 1];
        box2i updateRect = impactOnNextLevel(quality, updateRectPreviousLevel, previousLevel.w, previousLevel.h);
        generateLevel(level, quality, updateRect);
        return updateRect;
    }

    /// Regenerates one level
    /// updateRect expressed in level i-th coordinates
    void generateLevel(int level, Quality quality, box2i updateRect) nothrow @nogc
    {
        assert(level > 0);
        OwnedImage!COLOR thisLevel = levels[level];
        OwnedImage!COLOR previousLevel = levels[level - 1];

        final switch(quality) with (Quality)
        {
            case box:

                static if (is(COLOR == RGBA))
                    generateLevelBoxRGBA(thisLevel, previousLevel, updateRect);
                else static if (is(COLOR == L16))
                    generateLevelBoxL16(thisLevel, previousLevel, updateRect);
                else static if (is(COLOR == RGBA16))
                    generateLevelBoxRGBA16(thisLevel, previousLevel, updateRect);
                else
                    static assert(false, "not implemented");

                enum checkBoxMipmaps = false;

                static if (checkBoxMipmaps)
                {
                    for (int y = updateRect.min.y; y < updateRect.max.y; ++y)
                    {
                        COLOR[] L0 = previousLevel.scanline(y * 2);
                        COLOR[] L1 = previousLevel.scanline(y * 2 + 1);
                        COLOR[] dest = thisLevel.scanline(y);

                        for (int x = updateRect.min.x; x < updateRect.max.x; ++x)
                        {
                            // A B
                            // C D
                            COLOR A = L0[2 * x];
                            COLOR B = L0[2 * x + 1];
                            COLOR C = L1[2 * x];
                            COLOR D = L1[2 * x + 1];
                            assert(dest[x] == COLOR.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D));
                        }
                    }
                }
                break;

        case boxAlphaCovIntoPremul:

            // Only implemented for RGBA.
            static if (is(COLOR == RGBA))
            {
                generateLevelBoxAlphaCovIntoPremulRGBA(thisLevel, previousLevel, updateRect);
                break;
            }
            else
                assert(false);

        case cubic:
            static if (is(COLOR == RGBA))
            {
                generateLevelCubicRGBA(thisLevel, previousLevel, updateRect);
                break;
            }
            else static if (is(COLOR == L16))
            {
                generateLevelCubicL16(thisLevel, previousLevel, updateRect);
                break;
            }
            else static if (is(COLOR == RGBA16))
            {
                generateLevelCubicRGBA16(thisLevel, previousLevel, updateRect);
                break;
            }
            else
                static assert(false, "not implemented");


        }
    }


private:
    /// Computes impact of updating the area box on next level
    static box2i impactOnNextLevel(Quality quality, box2i area, int currentLevelWidth, int currentLevelHeight) pure nothrow @nogc
    {
        box2i maxArea = box2i(0, 0, currentLevelWidth / 2, currentLevelHeight / 2);

        final switch(quality) with (Quality)
        {
        case box:
        case boxAlphaCovIntoPremul:
            int xmin = area.min.x / 2;
            int ymin = area.min.y / 2;
            int xmax = (area.max.x + 1) / 2;
            int ymax = (area.max.y + 1) / 2;
            return box2i(xmin, ymin, xmax, ymax).intersection(maxArea);

        case cubic:
            // The cubic kernel reads one more pixel on each side.
            int xmin = (area.min.x - 1) / 2;
            int ymin = (area.min.y - 1) / 2;
            int xmax = (area.max.x + 2) / 2;
            int ymax = (area.max.y + 2) / 2;
            return box2i(xmin, ymin, xmax, ymax).intersection(maxArea);
        }

    }

    /// Returns: 2^-level, the factor that converts base level coordinates
    ///          into coordinates of the given level.
    /// The first 14 levels come from a table; higher levels are computed
    /// instead of reading past the end of that table.
    static float levelScale(int level) pure nothrow @nogc
    {
        static immutable float[14] factors = [ 1.0f, 0.5f, 0.25f, 0.125f,
                                               0.0625f, 0.03125f, 0.015625f, 0.0078125f,
                                               0.00390625f, 0.001953125f, 0.0009765625f, 0.00048828125f,
                                               0.000244140625f, 0.0001220703125f];

        // Image dimensions are `int`, so a pyramid has at most 31 levels.
        assert(level >= 0 && level < 31);

        if (level < cast(int)factors.length)
            return factors[level];

        // Exact, since both operands are powers of two.
        return 1.0f / cast(float)(1 << level);
    }
}

unittest
{
    Mipmap!RGBA a = new Mipmap!RGBA();
    a.size(4, 256, 256);
    assert(a.numLevels() == 5);

    // Resizing so that fewer levels are needed releases the excess levels.
    a.size(4, 4, 4);
    assert(a.numLevels() == 3);
    assert(a.width() == 4 && a.height() == 4);
    a.destroy();

    Mipmap!L16 b = new Mipmap!L16();
    b.size(16, 17, 333);
    b.destroy();

    // Level scale is valid both inside and past the lookup table.
    assert(Mipmap!RGBA.levelScale(0) == 1.0f);
    assert(Mipmap!RGBA.levelScale(3) == 0.125f);
    assert(Mipmap!RGBA.levelScale(20) == 1.0f / 1048576.0f);
}


private:

// 16-byte aligned constant tables.
// NOTE(review): none of these is referenced by the code visible in this module
// (generateLevelCubicRGBA declares its own locals with similar names); presumably
// leftovers from earlier assembly versions. Confirm before removing.
align(16) static immutable short[8] xmmTwoShort = [ 2, 2, 2, 2, 2, 2, 2, 2 ];
align(16) static immutable int[4] xmmTwoInt = [ 2, 2, 2, 2 ];
align(16) static immutable float[4] xmm0_5 = [ 0.5f, 0.5f, 0.5f, 0.5f ];
align(16) static immutable int[4] xmm512 = [ 512, 512, 512, 512 ];
align(16) static immutable short[8] xmm11113333 = [ 1, 1, 1, 1, 3, 3, 3, 3 ];
align(16) static immutable short[8] xmm33331111 = [ 3, 3, 3, 3, 1, 1, 1, 1 ];
align(16) static immutable short[8] xmm33339999 = [ 3, 3, 3, 3, 9, 9, 9, 9 ];
align(16) static immutable short[8] xmm99993333 = [ 9, 9, 9, 9, 3, 3, 3, 3 ];
align(16) static immutable short[8] xmm32 = [ 32, 32, 32, 32, 32, 32, 32, 32 ];


/// Box-filters `previousLevel` into `thisLevel` for RGBA images: each destination
/// pixel of `updateRect` is the rounded mean of a 2x2 patch of the previous level.
/// `updateRect` is expressed in `thisLevel` coordinates.
void generateLevelBoxRGBA(OwnedImage!RGBA thisLevel,
                          OwnedImage!RGBA previousLevel,
                          box2i updateRect) pure nothrow @nogc
{
    int width = updateRect.width();
    int height = updateRect.height();

    for (int y = 0; y < height; ++y)
    {
        RGBA* L0   = previousLevel.scanlinePtr( (updateRect.min.y + y) * 2    ) + updateRect.min.x * 2;
        RGBA* L1   = previousLevel.scanlinePtr( (updateRect.min.y + y) * 2 + 1) + updateRect.min.x * 2;
        RGBA* dest =     thisLevel.scanlinePtr(           updateRect.min.y + y) + updateRect.min.x;



        // PERF: enable later, this is faster on a full mipmap even without AVX2
        /// Requires a somewhat recent intel-intrinsics though
        /+
        int x = 0;
        __m256i zero = _mm256_setzero_si256();
        __m256i two = _mm256_set1_epi16(2);
        for ( ; x + 3 < width; x += 4)
        {
            // pixel patches:
            // A B E F   Goal = (A + B + C + D + 2) / 4 => res
            // C D G H          (E + F + G + H + 2) / 4 => res+1
            //
            __m256i ABEF = _mm256_loadu_si256(cast(const(__m256i)*) &L0[2*x]);
            __m256i CDGH = _mm256_loadu_si256(cast(const(__m256i)*) &L1[2*x]);
            __m256i AB = _mm256_unpacklo_epi8(ABEF, zero);
            __m256i EF = _mm256_unpackhi_epi8(ABEF, zero);
            __m256i CD = _mm256_unpacklo_epi8(CDGH, zero);
            __m256i GH = _mm256_unpackhi_epi8(CDGH, zero);
            AB = _mm256_add_epi16(AB, CD);                      // A + C   B + D
            EF = _mm256_add_epi16(EF, GH);                      // E + G   F + H
            __m256i AC_EG = _mm256_unpacklo_epi64(AB, EF);      // A+C  E+G
            __m256i BD_FH = _mm256_unpackhi_epi64(AB, EF);      // B+D  F+H
            __m256i sum = _mm256_add_epi16(AC_EG, BD_FH);       // A+B+C+D   E+F+G+H
            sum = _mm256_add_epi16(sum, two);                   // A+B+C+D+2 E+F+G+H+2
            sum = _mm256_srai_epi16(sum, 2);                    // (A+B+C+D+2)/4 (E+F+G+H+2)/4
            __m256i finalPixels = _mm256_packus_epi16(sum, zero);

            __m128i f_lo = _mm256_extractf128_si256!0(finalPixels);
            __m128i f_hi = _mm256_extractf128_si256!1(finalPixels);
            _mm_storeu_si64(&dest[x], f_lo);  // PERF Would need a vpermute here. In each lane, only the low 8 bytes are interesting.
            _mm_storeu_si64(&dest[x+2], f_hi);
        }
        }

        +/

        // Two destination pixels per iteration.
        __m128i zero = _mm_setzero_si128();
        __m128i two = _mm_set1_epi16(2);
        int x = 0;
        for ( ; x + 1 < width; x += 2)
        {
            // pixel patches:
            // A B E F   Goal = (A + B + C + D + 2) / 4 => res
            // C D G H          (E + F + G + H + 2) / 4 => res+1
            //
            __m128i ABEF = _mm_loadu_si128(cast(const(__m128i)*) &L0[2*x]);
            __m128i CDGH = _mm_loadu_si128(cast(const(__m128i)*) &L1[2*x]);
            __m128i AB = _mm_unpacklo_epi8(ABEF, zero);
            __m128i EF = _mm_unpackhi_epi8(ABEF, zero);
            __m128i CD = _mm_unpacklo_epi8(CDGH, zero);
            __m128i GH = _mm_unpackhi_epi8(CDGH, zero);
            AB = _mm_add_epi16(AB, CD);                      // A + C   B + D
            EF = _mm_add_epi16(EF, GH);                      // E + G   F + H
            __m128i AC_EG = _mm_unpacklo_epi64(AB, EF);      // A+C  E+G
            __m128i BD_FH = _mm_unpackhi_epi64(AB, EF);      // B+D  F+H
            __m128i sum = _mm_add_epi16(AC_EG, BD_FH);       // A+B+C+D   E+F+G+H
            sum = _mm_add_epi16(sum, two);                   // A+B+C+D+2 E+F+G+H+2
            sum = _mm_srai_epi16(sum, 2);                    // (A+B+C+D+2)/4 (E+F+G+H+2)/4
            __m128i finalPixels = _mm_packus_epi16(sum, zero);
            _mm_storeu_si64(&dest[x], finalPixels);
        }

        // Remaining pixel, if the width is odd.
        for (; x < width; ++x)
        {
            RGBA A = L0[2 * x];
            RGBA B = L0[2 * x + 1];
            RGBA C = L1[2 * x];
            RGBA D = L1[2 * x + 1];
            dest[x] = RGBA.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D);
        }
    }
}

/// Box-filters `previousLevel` into `thisLevel` for L16 images: each destination
/// pixel of `updateRect` is the rounded mean of a 2x2 patch of the previous level.
/// `updateRect` is expressed in `thisLevel` coordinates.
void generateLevelBoxL16(OwnedImage!L16 thisLevel,
                         OwnedImage!L16 previousLevel,
                         box2i updateRect) pure nothrow @nogc
{
    int width = updateRect.width();
    int height = updateRect.height();

    for (int y = 0; y < height; ++y)
    {
        L16* L0   = previousLevel.scanlinePtr( (updateRect.min.y + y) * 2    ) + updateRect.min.x * 2;
        L16* L1   = previousLevel.scanlinePtr( (updateRect.min.y + y) * 2 + 1) + updateRect.min.x * 2;
        L16* dest =     thisLevel.scanlinePtr(           updateRect.min.y + y) + updateRect.min.x;


        // Fun performance fact: for this loop (LDC 1.33, arch x86_64), assembly is slower than intrinsics,
        // themselves slower than normal D code.

        int x = 0;
        for (; x < width; ++x)
        {
            // A B
            // C D
            L16 A = L0[2 * x];
            L16 B = L0[2 * x + 1];
            L16 C = L1[2 * x];
            L16 D = L1[2 * x + 1];

            dest[x] = L16.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D);
        }
    }
}

/// Box-filters `previousLevel` into `thisLevel` for RGBA16 images: each destination
/// pixel of `updateRect` is the rounded mean of a 2x2 patch of the previous level.
/// `updateRect` is expressed in `thisLevel` coordinates.
void generateLevelBoxRGBA16(OwnedImage!RGBA16 thisLevel,
                            OwnedImage!RGBA16 previousLevel,
                            box2i updateRect) pure nothrow @nogc
{
    // untested and unused for now
    int width = updateRect.width();
    int height = updateRect.height();

    for (int y = 0; y < height; ++y)
    {
        RGBA16* L0   = previousLevel.scanlinePtr( (updateRect.min.y + y) * 2    ) + updateRect.min.x * 2;
        RGBA16* L1   = previousLevel.scanlinePtr( (updateRect.min.y + y) * 2 + 1) + updateRect.min.x * 2;
        RGBA16* dest =     thisLevel.scanlinePtr(           updateRect.min.y + y) + updateRect.min.x;
        for (int x = 0; x < width; ++x)
        {
            // A B
            // C D
            RGBA16 A = L0[2 * x];
            RGBA16 B = L0[2 * x + 1];
            RGBA16 C = L1[2 * x];
            RGBA16 D = L1[2 * x + 1];

            dest[x] = RGBA16.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D);
        }
    }
}

/// Box-filters `previousLevel` into `thisLevel` for RGBA images, multiplying each
/// source color by its alpha before averaging the 2x2 patch.
/// Within version(futurePBREmissive), source colors are also squared
/// (an approximate gamma to linear conversion) before being averaged.
/// `updateRect` is expressed in `thisLevel` coordinates.
void generateLevelBoxAlphaCovIntoPremulRGBA(OwnedImage!RGBA thisLevel,
                                            OwnedImage!RGBA previousLevel,
                                            box2i updateRect) nothrow @nogc
{
    int width = updateRect.width();
    int height = updateRect.height();

    for (int y = 0; y < height; ++y)
    {
        RGBA* L0   = previousLevel.scanlinePtr( (updateRect.min.y + y) * 2    ) + updateRect.min.x * 2;
        RGBA* L1   = previousLevel.scanlinePtr( (updateRect.min.y + y) * 2 + 1) + updateRect.min.x * 2;
        RGBA* dest =     thisLevel.scanlinePtr(           updateRect.min.y + y) + updateRect.min.x;

        version(futurePBREmissive)
        {
            // Note: basically very hard to beat with intrinsics.
            // Hours lost trying to do that: 4.
            // Neither float or integer intrinsics shenanigans do better than this plain code.

            for (int x = 0; x < width; ++x)
            {
                RGBA A = L0[2 * x];
                RGBA B = L0[2 * x + 1];
                RGBA C = L1[2 * x];
                RGBA D = L1[2 * x + 1];

                // This is only approximate, does a pow2
                static RGBAf convert_gammaspace_to_linear_premul (RGBA col)
                {
                    RGBAf res;
                    enum float inv_255 = 1.0f / 255;
                    res.a = col.a * inv_255; // alpha is linear
                    res.r = col.r * inv_255 *col.r * inv_255* res.a;
                    res.g = col.g * inv_255 *col.g * inv_255* res.a;
                    res.b = col.b * inv_255 *col.b * inv_255* res.a;
                    return res;
                }

                // Convert those into linear premultiplied colors
                RGBAf A_linear = convert_gammaspace_to_linear_premul(A);
                RGBAf B_linear = convert_gammaspace_to_linear_premul(B);
                RGBAf C_linear = convert_gammaspace_to_linear_premul(C);
                RGBAf D_linear = convert_gammaspace_to_linear_premul(D);

                // Sums of the 4 pixels; the division by 4 happens below.
                float meanR = A_linear.r + B_linear.r + C_linear.r + D_linear.r;
                float meanG = A_linear.g + B_linear.g + C_linear.g + D_linear.g;
                float meanB = A_linear.b + B_linear.b + C_linear.b + D_linear.b;
                float meanA = A_linear.a + B_linear.a + C_linear.a + D_linear.a;

                RGBA finalColor = RGBA( cast(ubyte)(meanR * 0.25f * 255.0f + 0.5f),
                                        cast(ubyte)(meanG * 0.25f * 255.0f + 0.5f),
                                        cast(ubyte)(meanB * 0.25f * 255.0f + 0.5f),
                                        cast(ubyte)(meanA * 0.25f * 255.0f + 0.5f) );
                dest[x] = finalColor;
            }
        }
        else
        {
            for (int x = 0; x < width; ++x)
            {
                RGBA A = L0[2 * x];
                RGBA B = L0[2 * x + 1];
                RGBA C = L1[2 * x];
                RGBA D = L1[2 * x + 1];
                int red =   (A.r * A.a + B.r * B.a + C.r * C.a + D.r * D.a);
                int green = (A.g * A.a + B.g * B.a + C.g * C.a + D.g * D.a);
                int blue =  (A.b * A.a + B.b* B.a + C.b * C.a + D.b * D.a);
                int alpha = (A.a * A.a + B.a* B.a + C.a * C.a + D.a * D.a);

                // Rounded division by 1024, that is 4 pixels x 256.
                RGBA finalColor = RGBA( cast(ubyte)((red + 512) >> 10),
                                        cast(ubyte)((green + 512) >> 10),
                                        cast(ubyte)((blue + 512) >> 10),
                                        cast(ubyte)((alpha + 512) >> 10));
                dest[x] = finalColor;
            }
        }
    }
}

/// Downsamples `previousLevel` into `thisLevel` for RGBA images with a 4x4 kernel
/// [1 3 3 1] x [1 3 3 1] / 64. Outer taps are clamped inside the previous level.
/// `updateRect` is expressed in `thisLevel` coordinates.
void generateLevelCubicRGBA(OwnedImage!RGBA thisLevel,
                            OwnedImage!RGBA previousLevel,
                            box2i updateRect) nothrow @nogc
{
    for (int y = updateRect.min.y; y < updateRect.max.y; ++y)
    {
        int y2m1 = 2 * y - 1;
        if (y2m1 < 0)
            y2m1 = 0;

        int y2p2 = 2 * y + 2;
        if (y2p2 > previousLevel.h - 1)
            y2p2 = previousLevel.h - 1;

        RGBA* LM1 = previousLevel.scanlinePtr(y2m1);
        RGBA* L0 = previousLevel.scanlinePtr(y * 2);
        RGBA* L1 = previousLevel.scanlinePtr(y * 2 + 1);
        RGBA* L2 = previousLevel.scanlinePtr(y2p2);
        RGBA* dest = thisLevel.scanlinePtr(y);

        for (int x = updateRect.min.x; x < updateRect.max.x; ++x)
        {
            // A B C D
            // E F G H
            // I J K L
            // M N O P

            int x2m1 = 2 * x - 1;
            if (x2m1 < 0)
                x2m1 = 0;
            int x2p0 = 2 * x;
            int x2p2 = 2 * x + 2;
            if (x2p2 > previousLevel.w - 1)
                x2p2 = previousLevel.w - 1;

            static if (true)
            {
                align(16) RGBA[16] buf = void;
                buf[0] = LM1[x2m1];
                buf[1] = LM1[x2p0];
                buf[2] = LM1[x2p0+1];
                buf[3] = LM1[x2p2];
                buf[4] = L0[x2m1];
                buf[5] = L0[x2p0];
                buf[6] = L0[x2p0+1];
                buf[7] = L0[x2p2];
                buf[8] = L1[x2m1];
                buf[9] = L1[x2p0];
                buf[10] = L1[x2p0+1];
                buf[11] = L1[x2p2];
                buf[12] = L2[x2m1];
                buf[13] = L2[x2p0];
                buf[14] = L2[x2p0+1];
                buf[15] = L2[x2p2];
                RGBA* pDest = dest + x;

                const __m128i mmZero = _mm_setzero_si128();

                // Note: no coefficients improvements really convince.
                // This was Issue #827, read for more context.

                const __m128i xmm11113333 = _mm_setr_epi16(1, 1, 1, 1, 3, 3, 3, 3);
                const __m128i xmm33339999 = _mm_setr_epi16(3, 3, 3, 3, 9, 9, 9, 9);

                __m128i ABCD = _mm_load_si128(cast(const(__m128i*)) &buf[0]);
                __m128i EFGH = _mm_load_si128(cast(const(__m128i*)) &buf[4]);
                __m128i IJKL = _mm_load_si128(cast(const(__m128i*)) &buf[8]);
                __m128i MNOP = _mm_load_si128(cast(const(__m128i*)) &buf[12]);

                __m128i AB = _mm_unpacklo_epi8(ABCD, mmZero);
                __m128i CD = _mm_unpackhi_epi8(ABCD, mmZero);
                __m128i EF = _mm_unpacklo_epi8(EFGH, mmZero);
                __m128i GH = _mm_unpackhi_epi8(EFGH, mmZero);
                __m128i IJ = _mm_unpacklo_epi8(IJKL, mmZero);
                __m128i KL = _mm_unpackhi_epi8(IJKL, mmZero);
                __m128i MN = _mm_unpacklo_epi8(MNOP, mmZero);
                __m128i OP = _mm_unpackhi_epi8(MNOP, mmZero);

                // This avoid a few multiplications
                AB = _mm_add_epi16(AB, MN);
                CD = _mm_add_epi16(CD, OP);
                EF = _mm_add_epi16(EF, IJ);
                GH = _mm_add_epi16(GH, KL);

                // Wrap a bit more, avoids two muls
                AB = _mm_add_epi16(AB, _mm_shuffle_epi32!0x4e(CD)); // invert quadwords
                EF = _mm_add_epi16(EF, _mm_shuffle_epi32!0x4e(GH)); // invert quadwords

                // PERF: we can win a few mul here
                __m128i sum01 = _mm_mullo_epi16(AB, xmm11113333);
                sum01 = _mm_add_epi16(sum01, _mm_mullo_epi16(EF, xmm33339999));
                sum01 = _mm_add_epi16(sum01, _mm_srli_si128!8(sum01));

                // Rounded division by 64
                __m128i sum = sum01;
                sum = _mm_add_epi16(sum, _mm_set1_epi16(32));
                sum = _mm_srli_epi16(sum, 6);
                __m128i finalPixels = _mm_packus_epi16(sum, mmZero);
                _mm_storeu_si32(pDest, finalPixels);
            }
            else
            {
                RGBA A = LM1[x2m1];
                RGBA B = LM1[x2p0];
                RGBA C = LM1[x2p0+1];
                RGBA D = LM1[x2p2];

                RGBA E = L0[x2m1];
                RGBA F = L0[x2p0];
                RGBA G = L0[x2p0+1];
                RGBA H = L0[x2p2];

                RGBA I = L1[x2m1];
                RGBA J = L1[x2p0];
                RGBA K = L1[x2p0+1];
                RGBA L = L1[x2p2];

                RGBA M = L2[x2m1];
                RGBA N = L2[x2p0];
                RGBA O = L2[x2p0+1];
                RGBA P = L2[x2p2];

                // Apply filter
                // 1 3 3 1
                // 3 9 9 3 / 64
                // 3 9 9 3
                // 1 3 3 1

                int rSum = (A.r + D.r + M.r + P.r) + 3 * (B.r + C.r + E.r + H.r + I.r + L.r + N.r + O.r) + 9 * (F.r + G.r + J.r + K.r);
                int gSum = (A.g + D.g + M.g + P.g) + 3 * (B.g + C.g + E.g + H.g + I.g + L.g + N.g + O.g) + 9 * (F.g + G.g + J.g + K.g);
                int bSum = (A.b + D.b + M.b + P.b) + 3 * (B.b + C.b + E.b + H.b + I.b + L.b + N.b + O.b) + 9 * (F.b + G.b + J.b + K.b);
                int aSum = (A.a + D.a + M.a + P.a) + 3 * (B.a + C.a + E.a + H.a + I.a + L.a + N.a + O.a) + 9 * (F.a + G.a + J.a + K.a);
                dest[x].r = cast(ubyte)((rSum + 32) >> 6);
                dest[x].g = cast(ubyte)((gSum + 32) >> 6);
                dest[x].b = cast(ubyte)((bSum + 32) >> 6);
                dest[x].a = cast(ubyte)((aSum + 32) >> 6);
            }
        }
    }
}

/// Downsamples `previousLevel` into `thisLevel` for L16 images with a 4x4 kernel
/// [1 3 3 1] x [1 3 3 1] / 64. Outer taps are clamped inside the previous level.
/// `updateRect` is expressed in `thisLevel` coordinates.
void generateLevelCubicL16(OwnedImage!L16 thisLevel,
                           OwnedImage!L16 previousLevel,
                           box2i updateRect) nothrow @nogc
{
    for (int y = updateRect.min.y; y < updateRect.max.y; ++y)
    {
        int y2m1 = 2 * y - 1;
        if (y2m1 < 0)
            y2m1 = 0;

        int y2p2 = 2 * y + 2;
        if (y2p2 > previousLevel.h - 1)
            y2p2 = previousLevel.h - 1;

        L16* LM1 = previousLevel.scanlinePtr(y2m1);
        L16* L0 = previousLevel.scanlinePtr(y * 2);
        L16* L1 = previousLevel.scanlinePtr(y * 2 + 1);
        L16* L2 = previousLevel.scanlinePtr(y2p2);
        L16* dest = thisLevel.scanlinePtr(y);

        for (int x = updateRect.min.x; x < updateRect.max.x; ++x)
        {
            // A B C D
            // E F G H
            // I J K L
            // M N O P

            int x2m1 = 2 * x - 1;
            if (x2m1 < 0)
                x2m1 = 0;
            int x2p0 = 2 * x;
            int x2p2 = 2 * x + 2;
            if (x2p2 > previousLevel.w - 1)
                x2p2 = previousLevel.w - 1;

            ushort A = LM1[x2m1].l;
            ushort B = LM1[x2p0].l;
            ushort C = LM1[x2p0+1].l;
            ushort D = LM1[x2p2].l;

            ushort E = L0[x2m1].l;
            ushort F = L0[x2p0].l;
            ushort G = L0[x2p0+1].l;
            ushort H = L0[x2p2].l;

            ushort I = L1[x2m1].l;
            ushort J = L1[x2p0].l;
            ushort K = L1[x2p0+1].l;
            ushort L = L1[x2p2].l;

            ushort M = L2[x2m1].l;
            ushort N = L2[x2p0].l;
            ushort O = L2[x2p0+1].l;
            ushort P = L2[x2p2].l;

            // Apply filter
            // 1 3 3 1    A B C D
            // 3 9 9 3    E F G H
            // 3 9 9 3    I J K L
            // 1 3 3 1    M N O P

            int depthSum = (A + D + M + P)
                         + 3 * (B + C + E + H + I + L + N + O)
                         + 9 * (F + G + J + K);
            dest[x].l = cast(ushort)((depthSum + 32) >> 6 );
        }
    }
}

/// Downsamples `previousLevel` into `thisLevel` for RGBA16 images with a 4x4 kernel
/// [1 3 3 1] x [1 3 3 1] / 64. Outer taps are clamped inside the previous level.
/// `updateRect` is expressed in `thisLevel` coordinates.
void generateLevelCubicRGBA16(OwnedImage!RGBA16 thisLevel,
                              OwnedImage!RGBA16 previousLevel,
                              box2i updateRect) nothrow @nogc
{
    // untested and unused for now
    for (int y = updateRect.min.y; y < updateRect.max.y; ++y)
    {
        int y2m1 = 2 * y - 1;
        if (y2m1 < 0)
            y2m1 = 0;

        int y2p2 = 2 * y + 2;
        if (y2p2 > previousLevel.h - 1)
            y2p2 = previousLevel.h - 1;

        RGBA16* LM1 = previousLevel.scanlinePtr(y2m1);
        RGBA16* L0 = previousLevel.scanlinePtr(y * 2);
        RGBA16* L1 = previousLevel.scanlinePtr(y * 2 + 1);
        RGBA16* L2 = previousLevel.scanlinePtr(y2p2);
        RGBA16* dest = thisLevel.scanlinePtr(y);

        for (int x = updateRect.min.x; x < updateRect.max.x; ++x)
        {
            // A B C D
            // E F G H
            // I J K L
            // M N O P

            int x2m1 = 2 * x - 1;
            if (x2m1 < 0)
                x2m1 = 0;
            int x2p0 = 2 * x;
            int x2p2 = 2 * x + 2;
            if (x2p2 > previousLevel.w - 1)
                x2p2 = previousLevel.w - 1;

            auto A = LM1[x2m1];
            auto B = LM1[x2p0];
            auto C = LM1[x2p0+1];
            auto D = LM1[x2p2];

            auto E = L0[x2m1];
            auto F = L0[x2p0];
            auto G = L0[x2p0+1];
            auto H = L0[x2p2];

            auto I = L1[x2m1];
            auto J = L1[x2p0];
            auto K = L1[x2p0+1];
            auto L = L1[x2p2];

            auto M = L2[x2m1];
            auto N = L2[x2p0];
            auto O = L2[x2p0+1];
            auto P = L2[x2p2];

            // Apply filter
            // 1 3 3 1
            // 3 9 9 3
            // 3 9 9 3
            // 1 3 3 1

            int rSum = (A.r + D.r + M.r + P.r) + 3 * (B.r + C.r + E.r + H.r + I.r + L.r + N.r + O.r) + 9 * (F.r + G.r + J.r + K.r);
            int gSum = (A.g + D.g + M.g + P.g) + 3 * (B.g + C.g + E.g + H.g + I.g + L.g + N.g + O.g) + 9 * (F.g + G.g + J.g + K.g);
            int bSum = (A.b + D.b + M.b + P.b) + 3 * (B.b + C.b + E.b + H.b + I.b + L.b + N.b + O.b) + 9 * (F.b + G.b + J.b + K.b);
            int aSum = (A.a + D.a + M.a + P.a) + 3 * (B.a + C.a + E.a + H.a + I.a + L.a + N.a + O.a) + 9 * (F.a + G.a + J.a + K.a);
            dest[x].r = cast(ushort)((rSum + 32) >> 6);
            dest[x].g = cast(ushort)((gSum + 32) >> 6);
            dest[x].b = cast(ushort)((bSum + 32) >> 6);
            dest[x].a = cast(ushort)((aSum + 32) >> 6);
        }
    }
}

unittest
{
    // Checks that the template instantiates for these pixel formats.
    Mipmap!RGBA rgbaMipmap;
    Mipmap!L16 l16Mipmap;
}