1 /** 2 Mipmap pyramid implementation. 3 4 Copyright: Guillaume Piolat 2015-2016. 5 License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 6 */ 7 module dplug.graphics.mipmap; 8 9 import std.algorithm.comparison; 10 11 import dplug.math.vector; 12 import dplug.math.box; 13 import dplug.graphics.image; 14 import dplug.core.nogc; 15 import dplug.core.vec; 16 17 18 import inteli.emmintrin; 19 20 version( D_InlineAsm_X86 ) 21 { 22 version = AsmX86; 23 } 24 else version( D_InlineAsm_X86_64 ) 25 { 26 version = AsmX86; 27 } 28 29 // Because of unability to load globals in PIC code with DMD, only enable some assembly with LDC 30 version(LDC) 31 { 32 version( D_InlineAsm_X86 ) 33 { 34 version = inlineAsmCanLoadGlobalsInPIC; 35 } 36 else version( D_InlineAsm_X86_64 ) 37 { 38 version = inlineAsmCanLoadGlobalsInPIC; 39 } 40 } 41 42 43 /// Mipmapped images. 44 /// Supports non power-of-two textures. 45 /// Size of the i+1-th mipmap is { (width)/2, (height)/2 } 46 /// The mipmap owns each of its levels. 47 final class Mipmap(COLOR) if (is(COLOR == RGBA) || is(COLOR == L16)) 48 { 49 public: 50 nothrow: 51 @nogc: 52 53 enum Quality 54 { 55 box, // simple 2x2 filter, creates phase problems with NPOT. For higher levels, automatically uses cubic. 56 cubic, // Very smooth kernel [1 2 1] x [1 2 1] 57 boxAlphaCov, // ditto but alpha is used as weight, only implemented for RGBA 58 boxAlphaCovIntoPremul, // same as boxAlphaConv but after such a step the next level is alpha-premultiplied 59 } 60 61 Vec!(OwnedImage!COLOR) levels; 62 63 /// Creates empty 64 this() 65 { 66 levels = makeVec!(OwnedImage!COLOR)(); 67 } 68 69 /// Set number of levels and size 70 /// maxLevel = 0 => only one image 71 /// maxLevel = 1 => one image + one 2x downsampled mipmap 72 /// etc... 73 this(int maxLevel, int w, int h) 74 { 75 this(); 76 size(maxLevel, w, h); 77 } 78 79 80 /// Creates a Mipmap out of a flat OwnedImage. 81 /// This takes ownership of the given image, which is now owned by the `Mipmap`. 82 this(int maxLevel, OwnedImage!COLOR level0) 83 { 84 //PERF: could avoid to create the 0th level only to replace it later 85 86 this(maxLevel, level0.w, level0.h); 87 88 // replaces level 0 89 levels[0].destroyFree(); 90 levels[0] = level0; 91 } 92 93 void size(int maxLevel, int w, int h) 94 { 95 // find number of needed levels 96 int neededLevels = 0; 97 { 98 int wr = w; 99 int hr = h; 100 for (; neededLevels <= maxLevel; ++neededLevels) 101 { 102 if (wr == 0 || hr == 0) 103 break; 104 wr = (wr + 0) >> 1; 105 hr = (hr + 0) >> 1; 106 } 107 } 108 109 void setLevels(int numLevels) 110 { 111 // FUTURE: cleanup excess levels 112 // should not happen until we have resizing 113 if (numLevels < levels.length) 114 { 115 assert(false); 116 } 117 118 int previousLength = cast(int)levels.length; 119 120 levels.resize(numLevels); 121 122 // create empty image for new levels 123 for(int level = previousLength; level < numLevels; ++level) 124 { 125 levels[level] = mallocNew!(OwnedImage!COLOR)(); 126 } 127 } 128 129 setLevels(neededLevels); 130 131 // resize levels 132 for (int level = 0; level < neededLevels; ++level) 133 { 134 assert(w != 0 && h != 0); 135 levels[level].size(w, h); 136 w = (w + 0) >> 1; 137 h = (h + 0) >> 1; 138 } 139 } 140 141 ~this() 142 { 143 foreach(level; levels) 144 level.destroyFree(); 145 } 146 147 /// Interpolates a color between mipmap levels. Floating-point level, spatial linear interpolation. 148 /// x and y are in base level coordinates (top-left pixel is on (0.5, 0.5) coordinates). 149 /// Clamped to borders. 150 auto linearMipmapSample(float level, float x, float y) nothrow @nogc 151 { 152 int ilevel = cast(int)level; 153 float flevel = level - ilevel; 154 vec4f levelN = linearSample(ilevel, x, y); 155 if (flevel == 0) 156 return levelN; 157 158 auto levelNp1 = linearSample(ilevel + 1, x, y); 159 160 return levelN * (1 - flevel) + levelNp1 * flevel; 161 } 162 163 164 /// Interpolates a color. Integer level, spatial linear interpolation. 165 /// x and y are in base level coordinates (top-left pixel is on (0.5, 0.5) coordinates). 166 /// Clamped to borders. 167 auto linearSample(int level, float x, float y) nothrow @nogc 168 { 169 if (level < 0) 170 level = 0; 171 int numLevels = cast(int)levels.length; 172 if (level >= numLevels) 173 level = numLevels - 1; 174 175 OwnedImage!COLOR image = levels[level]; 176 177 178 static immutable float[14] factors = [ 1.0f, 0.5f, 0.25f, 0.125f, 179 0.0625f, 0.03125f, 0.015625f, 0.0078125f, 180 0.00390625f, 0.001953125f, 0.0009765625f, 0.00048828125f, 181 0.000244140625f, 0.0001220703125f]; 182 183 float divider = factors[level]; 184 x = x * divider - 0.5f; 185 y = y * divider - 0.5f; 186 187 if (x < 0) 188 x = 0; 189 if (y < 0) 190 y = 0; 191 192 __m128 floatCoords = _mm_setr_ps(x, y, 0, 0); 193 __m128i truncatedCoord = _mm_cvttps_epi32(floatCoords); 194 int ix = truncatedCoord.array[0]; 195 int iy = truncatedCoord.array[1]; 196 197 // Get fractional part 198 float fx = x - ix; 199 float fy = y - iy; 200 201 const int maxX = image.w-1; 202 const int maxY = image.h-1; 203 if (ix > maxX) 204 ix = maxX; 205 if (iy > maxY) 206 iy = maxY; 207 208 int ixp1 = ix + 1; 209 int iyp1 = iy + 1; 210 if (ixp1 > maxX) 211 ixp1 = maxX; 212 if (iyp1 > maxY) 213 iyp1 = maxY; 214 215 float fxm1 = 1 - fx; 216 float fym1 = 1 - fy; 217 218 COLOR* L0 = image.scanlinePtr(iy); 219 COLOR* L1 = image.scanlinePtr(iyp1); 220 221 COLOR A = L0[ix]; 222 COLOR B = L0[ixp1]; 223 COLOR C = L1[ix]; 224 COLOR D = L1[ixp1]; 225 226 static if (is(COLOR == RGBA)) 227 { 228 float inv255 = 1 / 255.0f; 229 version(LDC) 230 { 231 int Ai = *cast(int*)(&A); 232 int Bi = *cast(int*)(&B); 233 int Ci = *cast(int*)(&C); 234 int Di = *cast(int*)(&D); 235 236 __m128i mmZero = _mm_setzero_si128(); 237 __m128i mmABCD = _mm_setr_epi32(Ai, Bi, Ci, Di); 238 239 // Convert to float of the form (R, G, B, A) 240 __m128i mmAB = _mm_unpacklo_epi8(mmABCD, mmZero); 241 __m128i mmCD = _mm_unpackhi_epi8(mmABCD, mmZero); 242 __m128 vA = _mm_cvtepi32_ps( _mm_unpacklo_epi16(mmAB, mmZero)); 243 __m128 vB = _mm_cvtepi32_ps( _mm_unpackhi_epi16(mmAB, mmZero)); 244 __m128 vC = _mm_cvtepi32_ps( _mm_unpacklo_epi16(mmCD, mmZero)); 245 __m128 vD = _mm_cvtepi32_ps( _mm_unpackhi_epi16(mmCD, mmZero)); 246 247 __m128 vfx = _mm_set1_ps(fx); 248 __m128 vfxm1 = _mm_set1_ps(fxm1); 249 __m128 up = vA * vfxm1 + vB * vfx; 250 __m128 down = vC * vfxm1 + vD * vfx; 251 252 __m128 vfy = _mm_set1_ps(fy); 253 __m128 vfym1 = _mm_set1_ps(fym1); 254 __m128 dResult = up * fym1 + down * fy; 255 vec4f result = void; 256 _mm_storeu_ps(result.ptr, dResult); 257 return result; 258 259 } 260 else version( AsmX86 ) 261 { 262 vec4f asmResult; 263 264 asm nothrow @nogc 265 { 266 movd XMM0, A; 267 movd XMM1, B; 268 movd XMM2, C; 269 movd XMM3, D; 270 pxor XMM4, XMM4; 271 272 punpcklbw XMM0, XMM4; 273 punpcklbw XMM1, XMM4; 274 punpcklbw XMM2, XMM4; 275 punpcklbw XMM3, XMM4; 276 277 punpcklwd XMM0, XMM4; 278 punpcklwd XMM1, XMM4; 279 punpcklwd XMM2, XMM4; 280 punpcklwd XMM3, XMM4; 281 282 cvtdq2ps XMM0, XMM0; 283 cvtdq2ps XMM1, XMM1; 284 285 cvtdq2ps XMM2, XMM2; 286 cvtdq2ps XMM3, XMM3; 287 288 movss XMM4, fxm1; 289 pshufd XMM4, XMM4, 0; 290 movss XMM5, fx; 291 pshufd XMM5, XMM5, 0; 292 293 mulps XMM0, XMM4; 294 mulps XMM1, XMM5; 295 mulps XMM2, XMM4; 296 mulps XMM3, XMM5; 297 298 movss XMM4, fym1; 299 pshufd XMM4, XMM4, 0; 300 movss XMM5, fy; 301 pshufd XMM5, XMM5, 0; 302 303 addps XMM0, XMM1; 304 addps XMM2, XMM3; 305 306 mulps XMM0, XMM4; 307 mulps XMM2, XMM5; 308 309 addps XMM0, XMM2; 310 311 movups asmResult, XMM0; 312 } 313 314 // Uncomment to check 315 /* 316 vec4f vA = vec4f(A.r, A.g, A.b, A.a); 317 vec4f vB = vec4f(B.r, B.g, B.b, B.a); 318 vec4f vC = vec4f(C.r, C.g, C.b, C.a); 319 vec4f vD = vec4f(D.r, D.g, D.b, D.a); 320 321 vec4f up = vA * fxm1 + vB * fx; 322 vec4f down = vC * fxm1 + vD * fx; 323 vec4f dResult = up * fym1 + down * fy; 324 325 import gfm.core; 326 327 if (dResult.distanceTo(result) < 1.0f) 328 debugBreak(); 329 */ 330 331 vec4f result = asmResult; 332 return result; 333 } 334 else 335 { 336 vec4f vA = vec4f(A.r, A.g, A.b, A.a); 337 vec4f vB = vec4f(B.r, B.g, B.b, B.a); 338 vec4f vC = vec4f(C.r, C.g, C.b, C.a); 339 vec4f vD = vec4f(D.r, D.g, D.b, D.a); 340 341 342 343 vec4f up = vA * fxm1 + vB * fx; 344 vec4f down = vC * fxm1 + vD * fx; 345 vec4f dResult = up * fym1 + down * fy; 346 347 // assert(dResult.distanceTo(asmResult) < 1.0f); 348 349 return dResult; 350 } 351 } 352 else 353 { 354 float up = A.l * fxm1 + B.l * fx; 355 float down = C.l * fxm1 + D.l * fx; 356 return up * fym1 + down * fy; 357 } 358 } 359 360 /// Returns: Width of the base level. 361 int width() pure const nothrow @nogc 362 { 363 return levels[0].w; 364 } 365 366 /// Returns: Height of the base level. 367 int height() pure const nothrow @nogc 368 { 369 return levels[0].h; 370 } 371 372 /// Returns: Number of levels. The maximum level is numLevels() - 1. 373 int numLevels() pure const nothrow @nogc 374 { 375 return cast(int)levels.length; 376 } 377 378 /// Regenerates the whole upper levels. 379 void generateMipmaps(Quality quality) nothrow @nogc 380 { 381 box2i updateRect = box2i(0, 0, width(), height()); 382 for (int level = 1; level < numLevels(); ++level) 383 { 384 // HACK: Force cubic filter past a level else it makes ugly looking mipmaps 385 if (level >= 3 && quality == Quality.box) 386 quality = Quality.cubic; 387 388 updateRect = generateNextLevel(quality, updateRect, level); 389 } 390 } 391 392 /// Regenerates a single mipmap level based on changes in the provided rectangle (expressed in level 0 coordinates). 393 /// updateRect expressed in level 0 coordinates 394 /// In general if you have several subparts of mipmaps to update, make sure a level is fully completed 395 /// before computing the next one. 396 box2i generateNextLevel(Quality quality, box2i updateRectPreviousLevel, int level) nothrow @nogc 397 { 398 OwnedImage!COLOR previousLevel = levels[level - 1]; 399 box2i updateRect = impactOnNextLevel(quality, updateRectPreviousLevel, previousLevel.w, previousLevel.h); 400 generateLevel(level, quality, updateRect); 401 return updateRect; 402 } 403 404 /// Regenerates one level 405 /// updateRect expressed in level i-th coordinates 406 void generateLevel(int level, Quality quality, box2i updateRect) nothrow @nogc 407 { 408 assert(level > 0); 409 OwnedImage!COLOR thisLevel = levels[level]; 410 OwnedImage!COLOR previousLevel = levels[level - 1]; 411 412 final switch(quality) with (Quality) 413 { 414 case box: 415 416 static if (is(COLOR == RGBA)) 417 generateLevelBoxRGBA(thisLevel, previousLevel, updateRect); 418 else static if (is(COLOR == L16)) 419 generateLevelBoxL16(thisLevel, previousLevel, updateRect); 420 else 421 static assert(false, "not implemented"); 422 423 enum checkBoxMipmaps = false; 424 425 static if (checkBoxMipmaps) 426 { 427 for (int y = updateRect.min.y; y < updateRect.max.y; ++y) 428 { 429 COLOR[] L0 = previousLevel.scanline(y * 2); 430 COLOR[] L1 = previousLevel.scanline(y * 2 + 1); 431 COLOR[] dest = thisLevel.scanline(y); 432 433 for (int x = updateRect.min.x; x < updateRect.max.x; ++x) 434 { 435 // A B 436 // C D 437 COLOR A = L0[2 * x]; 438 COLOR B = L0[2 * x + 1]; 439 COLOR C = L1[2 * x]; 440 COLOR D = L1[2 * x + 1]; 441 assert(dest[x] == COLOR.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D)); 442 } 443 } 444 } 445 break; 446 447 case boxAlphaCov: 448 449 static if (is(COLOR == RGBA)) 450 { 451 generateLevelBoxAlphaCovRGBA(thisLevel, previousLevel, updateRect); 452 453 static if (false) 454 { 455 void checkLevelBoxAlphaConvRGBA(Image!RGBA* thisLevel, Image!RGBA* previousLevel, box2i updateRect) 456 { 457 for (int y = updateRect.min.y; y < updateRect.max.y; ++y) 458 { 459 RGBA[] L0 = previousLevel.scanline(y * 2); 460 RGBA[] L1 = previousLevel.scanline(y * 2 + 1); 461 RGBA[] dest = thisLevel.scanline(y); 462 463 for (int x = updateRect.min.x; x < updateRect.max.x; ++x) 464 { 465 // A B 466 // C D 467 RGBA A = L0.ptr[2 * x]; 468 RGBA B = L0.ptr[2 * x + 1]; 469 RGBA C = L1.ptr[2 * x]; 470 RGBA D = L1.ptr[2 * x + 1]; 471 472 int alphaA = A.a; 473 int alphaB = B.a; 474 int alphaC = C.a; 475 int alphaD = D.a; 476 int sum = alphaA + alphaB + alphaC + alphaD; 477 if (sum == 0) 478 { 479 assert(dest.ptr[x] == A); 480 } 481 else 482 { 483 int destAlpha = cast(ubyte)( (alphaA + alphaB + alphaC + alphaD + 2) >> 2 ); 484 int red = (A.r * alphaA + B.r * alphaB + C.r * alphaC + D.r * alphaD); 485 int green = (A.g * alphaA + B.g * alphaB + C.g * alphaC + D.g * alphaD); 486 int blue = (A.b * alphaA + B.b* alphaB + C.b * alphaC + D.b * alphaD); 487 float invSum = 1 / cast(float)(sum); 488 489 RGBA finalColor = RGBA( cast(ubyte)(0.5f + red * invSum), 490 cast(ubyte)(0.5f + green * invSum), 491 cast(ubyte)(0.5f + blue * invSum), 492 cast(ubyte)destAlpha ); 493 assert(dest.ptr[x] == finalColor); 494 } 495 } 496 } 497 } 498 checkLevelBoxAlphaConvRGBA(thisLevel, previousLevel, updateRect); 499 } 500 break; 501 } 502 else 503 assert(false); 504 505 case boxAlphaCovIntoPremul: 506 507 static if (is(COLOR == RGBA)) 508 { 509 generateLevelBoxAlphaCovIntoPremulRGBA(thisLevel, previousLevel, updateRect); 510 break; 511 } 512 else 513 assert(false); 514 515 case cubic: 516 static if (is(COLOR == RGBA)) 517 { 518 generateLevelCubicRGBA(thisLevel, previousLevel, updateRect); 519 break; 520 } 521 else static if (is(COLOR == L16)) 522 { 523 generateLevelCubicL16(thisLevel, previousLevel, updateRect); 524 break; 525 } 526 else 527 static assert(false, "not implemented"); 528 529 530 } 531 } 532 533 534 private: 535 /// Computes impact of updating the area box on next level 536 static box2i impactOnNextLevel(Quality quality, box2i area, int currentLevelWidth, int currentLevelHeight) pure nothrow @nogc 537 { 538 box2i maxArea = box2i(0, 0, currentLevelWidth / 2, currentLevelHeight / 2); 539 540 final switch(quality) with (Quality) 541 { 542 case box: 543 case boxAlphaCov: 544 case boxAlphaCovIntoPremul: 545 int xmin = area.min.x / 2; 546 int ymin = area.min.y / 2; 547 int xmax = (area.max.x + 1) / 2; 548 int ymax = (area.max.y + 1) / 2; 549 return box2i(xmin, ymin, xmax, ymax).intersection(maxArea); 550 551 case cubic: 552 int xmin = (area.min.x - 1) / 2; 553 int ymin = (area.min.y - 1) / 2; 554 int xmax = (area.max.x + 2) / 2; 555 int ymax = (area.max.y + 2) / 2; 556 return box2i(xmin, ymin, xmax, ymax).intersection(maxArea); 557 } 558 559 } 560 } 561 562 unittest 563 { 564 Mipmap!RGBA a = new Mipmap!RGBA(); 565 a.size(4, 256, 256); 566 a.destroy(); 567 568 Mipmap!L16 b = new Mipmap!L16(); 569 b.size(16, 17, 333); 570 b.destroy(); 571 } 572 573 574 private: 575 576 align(16) static immutable short[8] xmmTwoShort = [ 2, 2, 2, 2, 2, 2, 2, 2 ]; 577 align(16) static immutable int[4] xmmTwoInt = [ 2, 2, 2, 2 ]; 578 align(16) static immutable float[4] xmm0_5 = [ 0.5f, 0.5f, 0.5f, 0.5f ]; 579 align(16) static immutable int[4] xmm512 = [ 512, 512, 512, 512 ]; 580 align(16) static immutable short[8] xmm11113333 = [ 1, 1, 1, 1, 3, 3, 3, 3 ]; 581 align(16) static immutable short[8] xmm33331111 = [ 3, 3, 3, 3, 1, 1, 1, 1 ]; 582 align(16) static immutable short[8] xmm33339999 = [ 3, 3, 3, 3, 9, 9, 9, 9 ]; 583 align(16) static immutable short[8] xmm99993333 = [ 9, 9, 9, 9, 3, 3, 3, 3 ]; 584 align(16) static immutable short[8] xmm32 = [ 32, 32, 32, 32, 32, 32, 32, 32 ]; 585 586 587 void generateLevelBoxRGBA(OwnedImage!RGBA thisLevel, 588 OwnedImage!RGBA previousLevel, 589 box2i updateRect) pure nothrow @nogc 590 { 591 int width = updateRect.width(); 592 int height = updateRect.height(); 593 594 for (int y = 0; y < height; ++y) 595 { 596 RGBA* L0 = previousLevel.scanlinePtr( (updateRect.min.y + y) * 2 ) + updateRect.min.x * 2; 597 RGBA* L1 = previousLevel.scanlinePtr( (updateRect.min.y + y) * 2 + 1) + updateRect.min.x * 2; 598 RGBA* dest = thisLevel.scanlinePtr( updateRect.min.y + y) + updateRect.min.x; 599 version(inlineAsmCanLoadGlobalsInPIC) 600 { 601 version(D_InlineAsm_X86) 602 { 603 asm pure nothrow @nogc 604 { 605 mov ECX, width; 606 shr ECX, 1; 607 jz no_need; // ECX = 0 => no pair of pixels to process 608 609 mov EAX, L0; 610 mov EDX, L1; 611 mov EDI, dest; 612 movaps XMM5, xmmTwoShort; 613 614 loop_ecx: 615 movdqu XMM0, [EAX]; // A B E F 616 pxor XMM4, XMM4; 617 movdqu XMM1, [EDX]; // C D G H 618 movdqa XMM2, XMM0; 619 movdqa XMM3, XMM1; 620 punpcklbw XMM0, XMM4; // A B in short 621 punpcklbw XMM1, XMM4; // C D in short 622 punpckhbw XMM2, XMM4; // E F in short 623 punpckhbw XMM3, XMM4; // G H in short 624 paddusw XMM0, XMM1; // A + C | B + D 625 paddusw XMM2, XMM3; // E + F | G + H 626 movdqa XMM1, XMM0; 627 movdqa XMM3, XMM2; 628 psrldq XMM1, 8; 629 psrldq XMM3, 8; 630 add EDI, 8; 631 paddusw XMM0, XMM1; // A + B + C + D | garbage 632 paddusw XMM2, XMM3; // E + F + G + H | garbage 633 paddusw XMM0, XMM5; // A + B + C + D + 2 | garbage 634 paddusw XMM2, XMM5; // E + F + G + H + 2 | garbage 635 psrlw XMM0, 2; // (A + B + C + D + 2) >> 2 | garbage 636 psrlw XMM2, 2; // (E + F + G + H + 2) >> 2 | garbage 637 add EAX, 16; 638 punpcklqdq XMM0, XMM2; 639 add EDX, 16; 640 packuswb XMM0, XMM4; // (A + B + C + D + 2) >> 2 | (E + F + G + H + 2) >> 2 | 0 | 0 641 movq [EDI-8], XMM0; 642 sub ECX, 1; 643 jnz loop_ecx; 644 no_need: ; 645 } 646 647 // Eventually filter the last pixel 648 int remaining = width & ~1; 649 for (int x = remaining; x < width; ++x) 650 { 651 RGBA A = L0[2 * x]; 652 RGBA B = L0[2 * x + 1]; 653 RGBA C = L1[2 * x]; 654 RGBA D = L1[2 * x + 1]; 655 dest[x] = RGBA.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D); 656 } 657 } 658 else version(D_InlineAsm_X86_64) 659 { 660 asm pure nothrow @nogc 661 { 662 mov ECX, width; 663 shr ECX, 1; 664 jz no_need; // ECX = 0 => no pair of pixels to process 665 666 mov RAX, L0; 667 mov RDX, L1; 668 mov RDI, dest; 669 movaps XMM5, xmmTwoShort; 670 671 loop_ecx: 672 movdqu XMM0, [RAX]; // A B E F 673 pxor XMM4, XMM4; 674 movdqu XMM1, [RDX]; // C D G H 675 movdqa XMM2, XMM0; 676 movdqa XMM3, XMM1; 677 punpcklbw XMM0, XMM4; // A B in short 678 punpcklbw XMM1, XMM4; // C D in short 679 punpckhbw XMM2, XMM4; // E F in short 680 punpckhbw XMM3, XMM4; // G H in short 681 paddusw XMM0, XMM1; // A + C | B + D 682 paddusw XMM2, XMM3; // E + F | G + H 683 movdqa XMM1, XMM0; 684 movdqa XMM3, XMM2; 685 psrldq XMM1, 8; 686 psrldq XMM3, 8; 687 add RDI, 8; 688 paddusw XMM0, XMM1; // A + B + C + D | garbage 689 paddusw XMM2, XMM3; // E + F + G + H | garbage 690 paddusw XMM0, XMM5; // A + B + C + D + 2 | garbage 691 paddusw XMM2, XMM5; // E + F + G + H + 2 | garbage 692 psrlw XMM0, 2; // (A + B + C + D + 2) >> 2 | garbage 693 psrlw XMM2, 2; // (E + F + G + H + 2) >> 2 | garbage 694 add RAX, 16; 695 punpcklqdq XMM0, XMM2; 696 add RDX, 16; 697 packuswb XMM0, XMM4; // (A + B + C + D + 2) >> 2 | (E + F + G + H + 2) >> 2 | 0 | 0 698 movq [RDI-8], XMM0; 699 sub ECX, 1; 700 jnz loop_ecx; 701 no_need: ; 702 } 703 704 // Eventually filter the last pixel 705 int remaining = width & ~1; 706 for (int x = remaining; x < width; ++x) 707 { 708 RGBA A = L0[2 * x]; 709 RGBA B = L0[2 * x + 1]; 710 RGBA C = L1[2 * x]; 711 RGBA D = L1[2 * x + 1]; 712 dest[x] = RGBA.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D); 713 } 714 } 715 else 716 static assert(false); 717 } 718 else 719 { 720 for (int x = 0; x < width; ++x) 721 { 722 // A B 723 // C D 724 RGBA A = L0[2 * x]; 725 RGBA B = L0[2 * x + 1]; 726 RGBA C = L1[2 * x]; 727 RGBA D = L1[2 * x + 1]; 728 729 dest[x] = RGBA.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D); 730 } 731 } 732 } 733 } 734 735 void generateLevelBoxL16(OwnedImage!L16 thisLevel, 736 OwnedImage!L16 previousLevel, 737 box2i updateRect) pure nothrow @nogc 738 { 739 int width = updateRect.width(); 740 int height = updateRect.height(); 741 742 for (int y = 0; y < height; ++y) 743 { 744 L16* L0 = previousLevel.scanlinePtr( (updateRect.min.y + y) * 2 ) + updateRect.min.x * 2; 745 L16* L1 = previousLevel.scanlinePtr( (updateRect.min.y + y) * 2 + 1) + updateRect.min.x * 2; 746 L16* dest = thisLevel.scanlinePtr( updateRect.min.y + y) + updateRect.min.x; 747 748 version(inlineAsmCanLoadGlobalsInPIC) 749 { 750 version(D_InlineAsm_X86) 751 { 752 asm pure nothrow @nogc 753 { 754 mov ECX, width; 755 shr ECX, 2; 756 jz no_need; // ECX = 0 => less than 4 pixels to process 757 758 mov EAX, L0; 759 mov EDX, L1; 760 mov EDI, dest; 761 movdqa XMM5, xmmTwoInt; 762 pxor XMM4, XMM4; 763 764 loop_ecx: 765 movdqu XMM0, [EAX]; // A B E F I J M N 766 movdqu XMM1, [EDX]; // C D G H K L O P 767 768 add EAX, 16; 769 add EDX, 16; 770 771 movdqa XMM2, XMM0; 772 movdqa XMM3, XMM1; 773 774 punpcklwd XMM0, XMM4; // A B E F in int32 775 punpckhwd XMM2, XMM4; // I J M N in int32 776 punpcklwd XMM1, XMM4; // C D G H in int32 777 punpckhwd XMM3, XMM4; // K L O P in int32 778 779 paddd XMM0, XMM1; // A+C B+D E+G F+H 780 paddd XMM2, XMM3; // I+K J+L M+O N+P 781 782 movdqa XMM1, XMM0; 783 movdqa XMM3, XMM2; 784 785 psrldq XMM1, 4; // B+D E+G F+H 0 786 psrldq XMM3, 4; // J+L M+O N+P 0 787 788 paddd XMM0, XMM1; // A+B+C+D garbage E+F+G+H garbage 789 paddd XMM2, XMM3; // I+J+K+L garbage M+N+O+P garbage 790 791 pshufd XMM0, XMM0, 0b00001000; // A+B+C+D E+F+G+H garbage garbage 792 pshufd XMM2, XMM2, 0b00001000; // I+J+K+L M+N+O+P garbage garbage 793 794 punpcklqdq XMM0, XMM2; // A+B+C+D E+F+G+H I+J+K+L M+N+O+P 795 paddd XMM0, XMM5; // add 2 796 psrld XMM0, 2; // >> 2 797 798 // because packusdw is not available before SSE4.1 799 // Extend sign bit to the right 800 pslld XMM0, 16; 801 psrad XMM0, 16; 802 add EDI, 8; 803 packssdw XMM0, XMM4; 804 805 movq [EDI-8], XMM0; 806 sub ECX, 1; 807 jnz loop_ecx; 808 no_need: ; 809 } 810 811 // Eventually filter the 0 to 3 pixels 812 int remaining = width & ~3; 813 for (int x = remaining; x < width; ++x) 814 { 815 L16 A = L0[2 * x]; 816 L16 B = L0[2 * x + 1]; 817 L16 C = L1[2 * x]; 818 L16 D = L1[2 * x + 1]; 819 dest[x] = L16.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D); 820 } 821 } 822 else version(D_InlineAsm_X86_64) 823 { 824 asm pure nothrow @nogc 825 { 826 mov ECX, width; 827 shr ECX, 2; 828 jz no_need; // ECX = 0 => less than 4 pixels to process 829 830 mov RAX, L0; 831 mov RDX, L1; 832 mov RDI, dest; 833 movdqa XMM5, xmmTwoInt; 834 pxor XMM4, XMM4; 835 836 loop_ecx: 837 movdqu XMM0, [RAX]; // A B E F I J M N 838 movdqu XMM1, [RDX]; // C D G H K L O P 839 840 add RAX, 16; 841 add RDX, 16; 842 843 movdqa XMM2, XMM0; 844 movdqa XMM3, XMM1; 845 846 punpcklwd XMM0, XMM4; // A B E F in int32 847 punpckhwd XMM2, XMM4; // I J M N in int32 848 punpcklwd XMM1, XMM4; // C D G H in int32 849 punpckhwd XMM3, XMM4; // K L O P in int32 850 851 paddd XMM0, XMM1; // A+C B+D E+G F+H 852 paddd XMM2, XMM3; // I+K J+L M+O N+P 853 854 movdqa XMM1, XMM0; 855 movdqa XMM3, XMM2; 856 857 psrldq XMM1, 4; // B+D E+G F+H 0 858 psrldq XMM3, 4; // J+L M+O N+P 0 859 860 paddd XMM0, XMM1; // A+B+C+D garbage E+F+G+H garbage 861 paddd XMM2, XMM3; // I+J+K+L garbage M+N+O+P garbage 862 863 pshufd XMM0, XMM0, 0b00001000; // A+B+C+D E+F+G+H garbage garbage 864 pshufd XMM2, XMM2, 0b00001000; // I+J+K+L M+N+O+P garbage garbage 865 866 punpcklqdq XMM0, XMM2; // A+B+C+D E+F+G+H I+J+K+L M+N+O+P 867 paddd XMM0, XMM5; // add 2 868 psrld XMM0, 2; // >> 2 869 870 // because packusdw is not available before SSE4.1 871 // Extend sign bit to the right 872 pslld XMM0, 16; 873 psrad XMM0, 16; 874 add RDI, 8; 875 packssdw XMM0, XMM4; 876 877 movq [RDI-8], XMM0; 878 sub ECX, 1; 879 jnz loop_ecx; 880 no_need: ; 881 } 882 883 // Eventually filter the 0 to 3 pixels 884 int remaining = width & ~3; 885 for (int x = remaining; x < width; ++x) 886 { 887 L16 A = L0[2 * x]; 888 L16 B = L0[2 * x + 1]; 889 L16 C = L1[2 * x]; 890 L16 D = L1[2 * x + 1]; 891 dest[x] = L16.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D); 892 } 893 } 894 else 895 static assert(false); 896 } 897 else 898 { 899 for (int x = 0; x < width; ++x) 900 { 901 // A B 902 // C D 903 L16 A = L0[2 * x]; 904 L16 B = L0[2 * x + 1]; 905 L16 C = L1[2 * x]; 906 L16 D = L1[2 * x + 1]; 907 908 dest[x] = L16.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D); 909 } 910 } 911 } 912 } 913 914 915 void generateLevelBoxAlphaCovRGBA(OwnedImage!RGBA thisLevel, 916 OwnedImage!RGBA previousLevel, 917 box2i updateRect) nothrow @nogc 918 { 919 int width = updateRect.width(); 920 int height = updateRect.height(); 921 922 for (int y = 0; y < height; ++y) 923 { 924 RGBA* L0 = previousLevel.scanlinePtr( (updateRect.min.y + y) * 2 ) + updateRect.min.x * 2; 925 RGBA* L1 = previousLevel.scanlinePtr( (updateRect.min.y + y) * 2 + 1) + updateRect.min.x * 2; 926 RGBA* dest = thisLevel.scanlinePtr( updateRect.min.y + y) + updateRect.min.x; 927 928 version(inlineAsmCanLoadGlobalsInPIC) 929 { 930 version(D_InlineAsm_X86) 931 { 932 // Note: this block of assembly created problems with LDC -a x86, 933 // and other problems with DMD, so it's currently disabled 934 935 // Generic code 936 assert(width > 0); 937 938 for (int x = 0; x < width; ++x) 939 { 940 // A B 941 // C D 942 RGBA A = L0[2 * x]; 943 RGBA B = L0[2 * x + 1]; 944 RGBA C = L1[2 * x]; 945 RGBA D = L1[2 * x + 1]; 946 947 int alphaA = A.a; 948 int alphaB = B.a; 949 int alphaC = C.a; 950 int alphaD = D.a; 951 int sum = alphaA + alphaB + alphaC + alphaD; 952 if (sum == 0) 953 { 954 dest[x] = RGBA(0,0,0,0); 955 } 956 else 957 { 958 int destAlpha = cast(ubyte)( (alphaA + alphaB + alphaC + alphaD + 2) >> 2 ); 959 int red = (A.r * alphaA + B.r * alphaB + C.r * alphaC + D.r * alphaD); 960 int green = (A.g * alphaA + B.g * alphaB + C.g * alphaC + D.g * alphaD); 961 int blue = (A.b * alphaA + B.b* alphaB + C.b * alphaC + D.b * alphaD); 962 float invSum = 1 / cast(float)(sum); 963 964 RGBA finalColor = RGBA( cast(ubyte)(0.5f + red * invSum), 965 cast(ubyte)(0.5f + green * invSum), 966 cast(ubyte)(0.5f + blue * invSum), 967 cast(ubyte)destAlpha ); 968 dest[x] = finalColor; 969 } 970 } 971 972 /+ 973 asm nothrow @nogc 974 { 975 mov ECX, width; 976 977 mov EAX, L0; 978 mov EDX, L1; 979 mov EDI, dest; 980 981 loop_ecx: 982 983 movq XMM0, [EAX]; // Ar Ag Ab Aa Br Bg Bb Ba + zeroes 984 movq XMM1, [EDX]; // Cr Cg Cb Ca Dr Dg Db Da + zeroes 985 pxor XMM4, XMM4; 986 add EAX, 8; 987 add EDX, 8; 988 989 punpcklbw XMM0, XMM4; // Ar Ag Ab Aa Br Bg Bb Ba 990 punpcklbw XMM1, XMM4; // Cr Cg Cb Ca Dr Dg Db Da 991 992 movdqa XMM2, XMM0; 993 punpcklwd XMM0, XMM1; // Ar Cr Ag Cg Ab Cb Aa Ca 994 punpckhwd XMM2, XMM1; // Br Dr Bg Dg Bb Db Ba Da 995 996 // perhaps unnecessary 997 movdqa XMM3, XMM0; 998 punpcklwd XMM0, XMM2; // Ar Br Cr Dr Ag Bg Cg Dg 999 punpckhwd XMM3, XMM2; // Ab Bb Cb Db Aa Ba Ca Da 1000 1001 movdqa XMM1, XMM3; 1002 punpckhqdq XMM1, XMM1; // Aa Ba Ca Da Aa Ba Ca Da 1003 1004 // Are alpha all zeroes? if so, early continue. 1005 movdqa XMM2, XMM1; 1006 pcmpeqb XMM2, XMM4; 1007 add EDI, 4; 1008 pmovmskb ESI, XMM2; 1009 cmp ESI, 0xffff; 1010 jnz non_null; 1011 1012 pxor XMM0, XMM0; 1013 sub ECX, 1; 1014 movd [EDI-4], XMM0; // dest[x] = A 1015 jnz loop_ecx; 1016 jmp end_of_loop; 1017 1018 non_null: 1019 1020 pmaddwd XMM0, XMM1; // Ar*Aa+Br*Ba Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da 1021 pmaddwd XMM3, XMM1; // Ab*Aa+Bb*Ba Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da 1022 1023 // Starting computing sum of coefficients too 1024 punpcklwd XMM1, XMM4; // Aa Ba Ca Da 1025 1026 movdqa XMM2, XMM0; 1027 movdqa XMM5, XMM3; 1028 movdqa XMM4, XMM1; 1029 psrldq XMM4, 8; 1030 1031 psrldq XMM2, 4; // Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da 0 1032 psrldq XMM5, 4; // Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da 0 1033 paddq XMM1, XMM4; // Aa+Ca Ba+Da garbage garbage 1034 movdqa XMM4, XMM1; 1035 1036 paddd XMM0, XMM2; // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da garbage Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage 1037 paddd XMM3, XMM5; // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da garbage Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage 1038 psrldq XMM4, 4; 1039 1040 pshufd XMM0, XMM0, 0b00001000; // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage garbage 1041 paddq XMM1, XMM4; // Aa+Ba+Ca+Da garbage garbage garbage 1042 pshufd XMM3, XMM3, 0b00001000; // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage garbage 1043 1044 punpcklqdq XMM0, XMM3; // fR fG fB fA 1045 pshufd XMM1, XMM1, 0; 1046 1047 cvtdq2ps XMM0, XMM0; 1048 1049 cvtdq2ps XMM3, XMM1; // sum sum sum sum 1050 1051 divps XMM0, XMM3; // fR/sum fG/sum fB/sum fA/sum 1052 addps XMM0, xmm0_5; 1053 cvttps2dq XMM0, XMM0; // return into integer domain using cast(int)(x + 0.5f) 1054 1055 paddd XMM1, xmmTwoInt; 1056 psrld XMM1, 2; // finalAlpha finalAlpha finalAlpha finalAlpha 1057 1058 pslldq XMM0, 4; // 0 fR/sum fG/sum fB/sum 1059 pslldq XMM1, 12; // 0 0 0 finalAlpha 1060 psrldq XMM0, 4; // fR/sum fG/sum fB/sum 0 1061 1062 por XMM0, XMM1; // fR/sum fG/sum fB/sum finalAlpha 1063 pxor XMM3, XMM3; 1064 packssdw XMM0, XMM3; // same in words 1065 packuswb XMM0, XMM3; // same in bytes 1066 1067 sub ECX, 1; 1068 movd [EDI-4], XMM0; // dest[x] = A 1069 jnz loop_ecx; 1070 end_of_loop: ; 1071 } 1072 +/ 1073 } 1074 else version(D_InlineAsm_X86_64) 1075 { 1076 assert(width > 0); 1077 asm nothrow @nogc 1078 { 1079 mov ECX, width; 1080 1081 mov RAX, L0; 1082 mov RDX, L1; 1083 mov RDI, dest; 1084 1085 loop_ecx: 1086 1087 movq XMM0, [RAX]; // Ar Ag Ab Aa Br Bg Bb Ba + zeroes 1088 movq XMM1, [RDX]; // Cr Cg Cb Ca Dr Dg Db Da + zeroes 1089 pxor XMM4, XMM4; 1090 add RAX, 8; 1091 add RDX, 8; 1092 1093 punpcklbw XMM0, XMM4; // Ar Ag Ab Aa Br Bg Bb Ba 1094 punpcklbw XMM1, XMM4; // Cr Cg Cb Ca Dr Dg Db Da 1095 1096 movdqa XMM2, XMM0; 1097 punpcklwd XMM0, XMM1; // Ar Cr Ag Cg Ab Cb Aa Ca 1098 punpckhwd XMM2, XMM1; // Br Dr Bg Dg Bb Db Ba Da 1099 1100 // perhaps unnecessary 1101 movdqa XMM3, XMM0; 1102 punpcklwd XMM0, XMM2; // Ar Br Cr Dr Ag Bg Cg Dg 1103 punpckhwd XMM3, XMM2; // Ab Bb Cb Db Aa Ba Ca Da 1104 1105 movdqa XMM1, XMM3; 1106 punpckhqdq XMM1, XMM1; // Aa Ba Ca Da Aa Ba Ca Da 1107 1108 // Are alpha all zeroes? if so, early continue. 1109 movdqa XMM2, XMM1; 1110 pcmpeqb XMM2, XMM4; 1111 add RDI, 4; 1112 pmovmskb ESI, XMM2; 1113 cmp ESI, 0xffff; 1114 jnz non_null; 1115 1116 pxor XMM0, XMM0; 1117 sub ECX, 1; 1118 movd [RDI-4], XMM0; // dest[x] = A 1119 jnz loop_ecx; 1120 jmp end_of_loop; 1121 1122 non_null: 1123 1124 pmaddwd XMM0, XMM1; // Ar*Aa+Br*Ba Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da 1125 pmaddwd XMM3, XMM1; // Ab*Aa+Bb*Ba Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da 1126 1127 // Starting computing sum of coefficients too 1128 punpcklwd XMM1, XMM4; // Aa Ba Ca Da 1129 1130 movdqa XMM2, XMM0; 1131 movdqa XMM5, XMM3; 1132 movdqa XMM4, XMM1; 1133 psrldq XMM4, 8; 1134 1135 psrldq XMM2, 4; // Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da 0 1136 psrldq XMM5, 4; // Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da 0 1137 paddq XMM1, XMM4; // Aa+Ca Ba+Da garbage garbage 1138 movdqa XMM4, XMM1; 1139 1140 paddd XMM0, XMM2; // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da garbage Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage 1141 paddd XMM3, XMM5; // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da garbage Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage 1142 psrldq XMM4, 4; 1143 1144 pshufd XMM0, XMM0, 0b00001000; // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage garbage 1145 paddq XMM1, XMM4; // Aa+Ba+Ca+Da garbage garbage garbage 1146 pshufd XMM3, XMM3, 0b00001000; // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage garbage 1147 1148 punpcklqdq XMM0, XMM3; // fR fG fB fA 1149 pshufd XMM1, XMM1, 0; 1150 1151 cvtdq2ps XMM0, XMM0; 1152 1153 cvtdq2ps XMM3, XMM1; // sum sum sum sum 1154 1155 divps XMM0, XMM3; // fR/sum fG/sum fB/sum fA/sum 1156 addps XMM0, xmm0_5; 1157 cvttps2dq XMM0, XMM0; // return into integer domain using cast(int)(x + 0.5f) 1158 1159 paddd XMM1, xmmTwoInt; 1160 psrld XMM1, 2; // finalAlpha finalAlpha finalAlpha finalAlpha 1161 1162 pslldq XMM0, 4; // 0 fR/sum fG/sum fB/sum 1163 pslldq XMM1, 12; // 0 0 0 finalAlpha 1164 psrldq XMM0, 4; // fR/sum fG/sum fB/sum 0 1165 1166 por XMM0, XMM1; // fR/sum fG/sum fB/sum finalAlpha 1167 pxor XMM3, XMM3; 1168 packssdw XMM0, XMM3; // same in words 1169 packuswb XMM0, XMM3; // same in bytes 1170 1171 sub ECX, 1; 1172 movd [RDI-4], XMM0; // dest[x] = A 1173 jnz loop_ecx; 1174 end_of_loop: ; 1175 } 1176 } 1177 else 1178 static assert(false); 1179 } 1180 else 1181 { 1182 for (int x = 0; x < width; ++x) 1183 { 1184 // A B 1185 // C D 1186 RGBA A = L0[2 * x]; 1187 RGBA B = L0[2 * x + 1]; 1188 RGBA C = L1[2 * x]; 1189 RGBA D = L1[2 * x + 1]; 1190 1191 int alphaA = A.a; 1192 int alphaB = B.a; 1193 int alphaC = C.a; 1194 int alphaD = D.a; 1195 int sum = alphaA + alphaB + alphaC + alphaD; 1196 if (sum == 0) 1197 { 1198 dest[x] = RGBA(0,0,0,0); 1199 } 1200 else 1201 { 1202 int destAlpha = cast(ubyte)( (alphaA + alphaB + alphaC + alphaD + 2) >> 2 ); 1203 int red = (A.r * alphaA + B.r * alphaB + C.r * alphaC + D.r * alphaD); 1204 int green = (A.g * alphaA + B.g * alphaB + C.g * alphaC + D.g * alphaD); 1205 int blue = (A.b * alphaA + B.b* alphaB + C.b * alphaC + D.b * alphaD); 1206 float invSum = 1 / cast(float)(sum); 1207 1208 RGBA finalColor = RGBA( cast(ubyte)(0.5f + red * invSum), 1209 cast(ubyte)(0.5f + green * invSum), 1210 cast(ubyte)(0.5f + blue * invSum), 1211 cast(ubyte)destAlpha ); 1212 dest[x] = finalColor; 1213 } 1214 } 1215 } 1216 1217 enum verify = false; 1218 1219 static if (verify) 1220 { 1221 for (int x = 0; x < width; ++x) 1222 { 1223 // A B 1224 // C D 1225 RGBA A = L0[2 * x]; 1226 RGBA B = L0[2 * x + 1]; 1227 RGBA C = L1[2 * x]; 1228 RGBA D = L1[2 * x + 1]; 1229 1230 int alphaA = A.a; 1231 int alphaB = B.a; 1232 int alphaC = C.a; 1233 int alphaD = D.a; 1234 int sum = alphaA + alphaB + alphaC + alphaD; 1235 if (sum == 0) 1236 { 1237 assert(dest[x] == RGBA(0,0,0,0)); 1238 } 1239 else 1240 { 1241 int destAlpha = cast(ubyte)( (alphaA + alphaB + alphaC + alphaD + 2) >> 2 ); 1242 int red = (A.r * alphaA + B.r * alphaB + C.r * alphaC + D.r * alphaD); 1243 int green = (A.g * alphaA + B.g * alphaB + C.g * alphaC + D.g * alphaD); 1244 int blue = (A.b * alphaA + B.b* alphaB + C.b * alphaC + D.b * alphaD); 1245 1246 float invSum = 1 / cast(float)(sum); 1247 1248 RGBA finalColor = RGBA( cast(ubyte)(0.5f + red * invSum), 1249 cast(ubyte)(0.5f + green * invSum), 1250 cast(ubyte)(0.5f + blue * invSum), 1251 cast(ubyte)destAlpha ); 1252 RGBA instead = dest[x]; 1253 1254 int insteadR = instead.r; 1255 int insteadG = instead.g; 1256 int insteadB = instead.b; 1257 int insteadA = instead.a; 1258 int finalColorR = finalColor.r; 1259 int finalColorG = finalColor.g; 1260 int finalColorB = finalColor.b; 1261 int finalColorA = finalColor.a; 1262 import std.math; 1263 assert(abs(insteadR - finalColorR) <= 1); // some remaining differences because of rounding 1264 assert(abs(insteadG - finalColorG) <= 1); 1265 assert(abs(insteadB - finalColorB) <= 1); 1266 assert(insteadA == finalColorA); 1267 } 1268 } 1269 } 1270 } 1271 } 1272 1273 void generateLevelBoxAlphaCovIntoPremulRGBA(OwnedImage!RGBA thisLevel, 1274 OwnedImage!RGBA previousLevel, 1275 box2i updateRect) nothrow @nogc 1276 { 1277 int width = updateRect.width(); 1278 int height = updateRect.height(); 1279 1280 for (int y = 0; y < height; ++y) 1281 { 1282 RGBA* L0 = previousLevel.scanlinePtr( (updateRect.min.y + y) * 2 ) + updateRect.min.x * 2; 1283 RGBA* L1 = previousLevel.scanlinePtr( (updateRect.min.y + y) * 2 + 1) + updateRect.min.x * 2; 1284 RGBA* dest = thisLevel.scanlinePtr( updateRect.min.y + y) + updateRect.min.x; 1285 1286 version(inlineAsmCanLoadGlobalsInPIC) 1287 { 1288 version(D_InlineAsm_X86) 1289 { 1290 asm nothrow @nogc 1291 { 1292 mov ECX, width; 1293 1294 mov EAX, L0; 1295 mov EDX, L1; 1296 mov EDI, dest; 1297 1298 movdqa XMM5, xmm512; // 512 512 5121 512 1299 pxor XMM4, XMM4; // all zeroes 1300 1301 loop_ecx: 1302 1303 movq XMM0, [EAX]; // Ar Ag Ab Aa Br Bg Bb Ba + zeroes 1304 movq XMM1, [EDX]; // Cr Cg Cb Ca Dr Dg Db Da + zeroes 1305 pxor XMM4, XMM4; 1306 add EAX, 8; 1307 add EDX, 8; 1308 1309 punpcklbw XMM0, XMM4; // Ar Ag Ab Aa Br Bg Bb Ba 1310 punpcklbw XMM1, XMM4; // Cr Cg Cb Ca Dr Dg Db Da 1311 1312 movdqa XMM2, XMM0; 1313 punpcklwd XMM0, XMM1; // Ar Cr Ag Cg Ab Cb Aa Ca 1314 punpckhwd XMM2, XMM1; // Br Dr Bg Dg Bb Db Ba Da 1315 1316 movdqa XMM3, XMM0; 1317 punpcklwd XMM0, XMM2; // Ar Br Cr Dr Ag Bg Cg Dg 1318 punpckhwd XMM3, XMM2; // Ab Bb Cb Db Aa Ba Ca Da 1319 1320 movdqa XMM1, XMM3; 1321 punpckhqdq XMM1, XMM1; // Aa Ba Ca Da Aa Ba Ca Da 1322 1323 add EDI, 4; 1324 1325 pmaddwd XMM0, XMM1; // Ar*Aa+Br*Ba Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da 1326 pmaddwd XMM3, XMM1; // Ab*Aa+Bb*Ba Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da 1327 1328 movdqa XMM2, XMM0; 1329 movdqa XMM1, XMM3; 1330 1331 psrldq XMM2, 4; // Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da 0 1332 psrldq XMM1, 4; // Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da 0 1333 1334 paddd XMM0, XMM2; // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da garbage Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage 1335 paddd XMM3, XMM1; // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da garbage Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage 1336 1337 pshufd XMM0, XMM0, 0b00001000; // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage garbage 1338 pshufd XMM3, XMM3, 0b00001000; // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage garbage 1339 1340 punpcklqdq XMM0, XMM3; // fR fG fB fA 1341 1342 1343 paddd XMM0, XMM5; 1344 psrld XMM0, 10; // final color in dwords 1345 1346 packssdw XMM0, XMM4; // same in words 1347 packuswb XMM0, XMM4; // same in bytes 1348 1349 sub ECX, 1; 1350 movd [EDI-4], XMM0; // dest[x] = A 1351 jnz loop_ecx; 1352 } 1353 } 1354 else version(D_InlineAsm_X86_64) 1355 { 1356 asm nothrow @nogc 1357 { 1358 mov ECX, width; 1359 1360 mov RAX, L0; 1361 mov RDX, L1; 1362 mov RDI, dest; 1363 1364 movdqa XMM5, xmm512; // 512 512 5121 512 1365 pxor XMM4, XMM4; // all zeroes 1366 1367 loop_ecx: 1368 1369 movq XMM0, [RAX]; // Ar Ag Ab Aa Br Bg Bb Ba + zeroes 1370 movq XMM1, [RDX]; // Cr Cg Cb Ca Dr Dg Db Da + zeroes 1371 pxor XMM4, XMM4; 1372 add RAX, 8; 1373 add RDX, 8; 1374 1375 punpcklbw XMM0, XMM4; // Ar Ag Ab Aa Br Bg Bb Ba 1376 punpcklbw XMM1, XMM4; // Cr Cg Cb Ca Dr Dg Db Da 1377 1378 movdqa XMM2, XMM0; 1379 punpcklwd XMM0, XMM1; // Ar Cr Ag Cg Ab Cb Aa Ca 1380 punpckhwd XMM2, XMM1; // Br Dr Bg Dg Bb Db Ba Da 1381 1382 movdqa XMM3, XMM0; 1383 punpcklwd XMM0, XMM2; // Ar Br Cr Dr Ag Bg Cg Dg 1384 punpckhwd XMM3, XMM2; // Ab Bb Cb Db Aa Ba Ca Da 1385 1386 movdqa XMM1, XMM3; 1387 punpckhqdq XMM1, XMM1; // Aa Ba Ca Da Aa Ba Ca Da 1388 1389 add RDI, 4; 1390 1391 pmaddwd XMM0, XMM1; // Ar*Aa+Br*Ba Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da 1392 pmaddwd XMM3, XMM1; // Ab*Aa+Bb*Ba Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da 1393 1394 movdqa XMM2, XMM0; 1395 movdqa XMM1, XMM3; 1396 1397 psrldq XMM2, 4; // Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da 0 1398 psrldq XMM1, 4; // Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da 0 1399 1400 paddd XMM0, XMM2; // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da garbage Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage 1401 paddd XMM3, XMM1; // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da garbage Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage 1402 1403 pshufd XMM0, XMM0, 0b00001000; // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage garbage 1404 pshufd XMM3, XMM3, 0b00001000; // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage garbage 1405 1406 punpcklqdq XMM0, XMM3; // fR fG fB fA 1407 1408 1409 paddd XMM0, XMM5; 1410 psrld XMM0, 10; // final color in dwords 1411 1412 packssdw XMM0, XMM4; // same in words 1413 packuswb XMM0, XMM4; // same in bytes 1414 1415 sub ECX, 1; 1416 movd [RDI-4], XMM0; // dest[x] = A 1417 jnz loop_ecx; 1418 } 1419 } 1420 else 1421 static assert(false); 1422 } 1423 else 1424 { 1425 for (int x = 0; x < width; ++x) 1426 { 1427 RGBA A = L0[2 * x]; 1428 RGBA B = L0[2 * x + 1]; 1429 RGBA C = L1[2 * x]; 1430 RGBA D = L1[2 * x + 1]; 1431 int red = (A.r * A.a + B.r * B.a + C.r * C.a + D.r * D.a); 1432 int green = (A.g * A.a + B.g * B.a + C.g * C.a + D.g * D.a); 1433 int blue = (A.b * A.a + B.b* B.a + C.b * C.a + D.b * D.a); 1434 int alpha = (A.a * A.a + B.a* B.a + C.a * C.a + D.a * D.a); 1435 RGBA finalColor = RGBA( cast(ubyte)((red + 512) >> 10), 1436 cast(ubyte)((green + 512) >> 10), 1437 cast(ubyte)((blue + 512) >> 10), 1438 cast(ubyte)((alpha + 512) >> 10)); 1439 dest[x] = finalColor; 1440 } 1441 } 1442 1443 enum bool verify = false; 1444 1445 static if (verify) 1446 { 1447 for (int x = 0; x < width; ++x) 1448 { 1449 RGBA A = L0[2 * x]; 1450 RGBA B = L0[2 * x + 1]; 1451 RGBA C = L1[2 * x]; 1452 RGBA D = L1[2 * x + 1]; 1453 int red = (A.r * A.a + B.r * B.a + C.r * C.a + D.r * D.a); 1454 int green = (A.g * A.a + B.g * B.a + C.g * C.a + D.g * D.a); 1455 int blue = (A.b * A.a + B.b* B.a + C.b * C.a + D.b * D.a); 1456 int alpha = (A.a * A.a + B.a* B.a + C.a * C.a + D.a * D.a); 1457 RGBA finalColor = RGBA( cast(ubyte)((red + 512) >> 10), 1458 cast(ubyte)((green + 512) >> 10), 1459 cast(ubyte)((blue + 512) >> 10), 1460 cast(ubyte)((alpha + 512) >> 10)); 1461 assert(dest[x] == finalColor); 1462 } 1463 } 1464 } 1465 } 1466 1467 void generateLevelCubicRGBA(OwnedImage!RGBA thisLevel, 1468 OwnedImage!RGBA previousLevel, 1469 box2i updateRect) nothrow @nogc 1470 { 1471 for (int y = updateRect.min.y; y < updateRect.max.y; ++y) 1472 { 1473 int y2m1 = 2 * y - 1; 1474 if (y2m1 < 0) 1475 y2m1 = 0; 1476 1477 int y2p2 = 2 * y + 2; 1478 if (y2p2 > previousLevel.h - 1) 1479 y2p2 = previousLevel.h - 1; 1480 1481 RGBA* LM1 = previousLevel.scanlinePtr(y2m1); 1482 RGBA* L0 = previousLevel.scanlinePtr(y * 2); 1483 RGBA* L1 = previousLevel.scanlinePtr(y * 2 + 1); 1484 RGBA* L2 = previousLevel.scanlinePtr(y2p2); 1485 RGBA* dest = thisLevel.scanlinePtr(y); 1486 1487 for (int x = updateRect.min.x; x < updateRect.max.x; ++x) 1488 { 1489 // A B C D 1490 // E F G H 1491 // I J K L 1492 // M N O P 1493 1494 int x2m1 = 2 * x - 1; 1495 if (x2m1 < 0) 1496 x2m1 = 0; 1497 int x2p0 = 2 * x; 1498 int x2p2 = 2 * x + 2; 1499 if (x2p2 > previousLevel.w - 1) 1500 x2p2 = previousLevel.w - 1; 1501 1502 version(inlineAsmCanLoadGlobalsInPIC) 1503 { 1504 version(D_InlineAsm_X86) 1505 { 1506 RGBA[16] buf = void; 1507 buf[0] = LM1[x2m1]; 1508 buf[1] = LM1[x2p0]; 1509 buf[2] = LM1[x2p0+1]; 1510 buf[3] = LM1[x2p2]; 1511 buf[4] = L0[x2m1]; 1512 buf[5] = L0[x2p0]; 1513 buf[6] = L0[x2p0+1]; 1514 buf[7] = L0[x2p2]; 1515 buf[8] = L1[x2m1]; 1516 buf[9] = L1[x2p0]; 1517 buf[10] = L1[x2p0+1]; 1518 buf[11] = L1[x2p2]; 1519 buf[12] = L2[x2m1]; 1520 buf[13] = L2[x2p0]; 1521 buf[14] = L2[x2p0+1]; 1522 buf[15] = L2[x2p2]; 1523 RGBA* pDest = dest + x; 1524 1525 asm nothrow @nogc 1526 { 1527 movdqu XMM0, buf; // A B C D 1528 movdqu XMM1, buf; 1529 pxor XMM2, XMM2; // zeroes 1530 punpcklbw XMM0, XMM2; // A B 1531 punpckhbw XMM1, XMM2; // C D 1532 pmullw XMM0, xmm11113333; // A*1 B*3 in shorts 1533 movdqa XMM3, XMM0; 1534 pmullw XMM1, xmm33331111; // C*3 D*3 in shorts 1535 movdqa XMM5, XMM1; 1536 1537 movdqu XMM0, buf+16; // E F G H 1538 movdqu XMM1, buf+16; 1539 punpcklbw XMM0, XMM2; // E F 1540 punpckhbw XMM1, XMM2; // G H 1541 pmullw XMM0, xmm33339999; // E*3 F*9 in shorts 1542 paddw XMM3, XMM0; 1543 pmullw XMM1, xmm99993333; // G*9 H*3 in shorts 1544 paddw XMM5, XMM1; 1545 1546 movdqu XMM0, buf+32; // I J K L 1547 movdqu XMM1, buf+32; 1548 punpcklbw XMM0, XMM2; // I J 1549 punpckhbw XMM1, XMM2; // K L 1550 pmullw XMM0, xmm33339999; // I*3 J*9 in shorts 1551 paddw XMM3, XMM0; 1552 pmullw XMM1, xmm99993333; // K*9 L*3 in shorts 1553 paddw XMM5, XMM1; 1554 1555 movdqu XMM0, buf+48; // M N O P 1556 movdqu XMM1, buf+48; 1557 punpcklbw XMM0, XMM2; // M N 1558 punpckhbw XMM1, XMM2; // O P 1559 pmullw XMM0, xmm11113333; // M*1 N*3 in shorts 1560 paddw XMM3, XMM0; // A+E*3+I*3+M B*3+F*9+J*9+3*N 1561 pmullw XMM1, xmm33331111; // O*3 P*1 in shorts 1562 paddw XMM5, XMM1; // C*3+G*9+K*9+O*3 D+H*3+L*3+P 1563 1564 movdqa XMM0, XMM3; 1565 movdqa XMM1, XMM5; 1566 psrldq XMM0, 8; 1567 psrldq XMM1, 8; 1568 paddw XMM3, XMM0; // A+E*3+I*3+M+B*3+F*9+J*9+3*N garbage(x4) 1569 paddw XMM5, XMM1; // C*3+G*9+K*9+O*3+D+H*3+L*3+P garbage(x4) 1570 paddw XMM3, XMM5; // total-sum garbage(x4) 1571 1572 paddw XMM3, xmm32; 1573 psrlw XMM3, 6; 1574 mov EAX, pDest; 1575 packuswb XMM3, XMM2; 1576 1577 movd [EAX], XMM3; 1578 } 1579 } 1580 else version(D_InlineAsm_X86_64) 1581 { 1582 RGBA[16] buf = void; 1583 buf[0] = LM1[x2m1]; 1584 buf[1] = LM1[x2p0]; 1585 buf[2] = LM1[x2p0+1]; 1586 buf[3] = LM1[x2p2]; 1587 buf[4] = L0[x2m1]; 1588 buf[5] = L0[x2p0]; 1589 buf[6] = L0[x2p0+1]; 1590 buf[7] = L0[x2p2]; 1591 buf[8] = L1[x2m1]; 1592 buf[9] = L1[x2p0]; 1593 buf[10] = L1[x2p0+1]; 1594 buf[11] = L1[x2p2]; 1595 buf[12] = L2[x2m1]; 1596 buf[13] = L2[x2p0]; 1597 buf[14] = L2[x2p0+1]; 1598 buf[15] = L2[x2p2]; 1599 RGBA* pDest = dest + x; 1600 1601 asm nothrow @nogc 1602 { 1603 movdqu XMM0, buf; // A B C D 1604 movdqu XMM1, buf; 1605 pxor XMM2, XMM2; // zeroes 1606 punpcklbw XMM0, XMM2; // A B 1607 punpckhbw XMM1, XMM2; // C D 1608 pmullw XMM0, xmm11113333; // A*1 B*3 in shorts 1609 movdqa XMM3, XMM0; 1610 pmullw XMM1, xmm33331111; // C*3 D*3 in shorts 1611 movdqa XMM5, XMM1; 1612 1613 movdqu XMM0, buf+16; // E F G H 1614 movdqu XMM1, buf+16; 1615 punpcklbw XMM0, XMM2; // E F 1616 punpckhbw XMM1, XMM2; // G H 1617 pmullw XMM0, xmm33339999; // E*3 F*9 in shorts 1618 paddw XMM3, XMM0; 1619 pmullw XMM1, xmm99993333; // G*9 H*3 in shorts 1620 paddw XMM5, XMM1; 1621 1622 movdqu XMM0, buf+32; // I J K L 1623 movdqu XMM1, buf+32; 1624 punpcklbw XMM0, XMM2; // I J 1625 punpckhbw XMM1, XMM2; // K L 1626 pmullw XMM0, xmm33339999; // I*3 J*9 in shorts 1627 paddw XMM3, XMM0; 1628 pmullw XMM1, xmm99993333; // K*9 L*3 in shorts 1629 paddw XMM5, XMM1; 1630 1631 movdqu XMM0, buf+48; // M N O P 1632 movdqu XMM1, buf+48; 1633 punpcklbw XMM0, XMM2; // M N 1634 punpckhbw XMM1, XMM2; // O P 1635 pmullw XMM0, xmm11113333; // M*1 N*3 in shorts 1636 paddw XMM3, XMM0; // A+E*3+I*3+M B*3+F*9+J*9+3*N 1637 pmullw XMM1, xmm33331111; // O*3 P*1 in shorts 1638 paddw XMM5, XMM1; // C*3+G*9+K*9+O*3 D+H*3+L*3+P 1639 1640 movdqa XMM0, XMM3; 1641 movdqa XMM1, XMM5; 1642 psrldq XMM0, 8; 1643 psrldq XMM1, 8; 1644 paddw XMM3, XMM0; // A+E*3+I*3+M+B*3+F*9+J*9+3*N garbage(x4) 1645 paddw XMM5, XMM1; // C*3+G*9+K*9+O*3+D+H*3+L*3+P garbage(x4) 1646 paddw XMM3, XMM5; // total-sum garbage(x4) 1647 1648 paddw XMM3, xmm32; 1649 psrlw XMM3, 6; 1650 mov RAX, pDest; 1651 packuswb XMM3, XMM2; 1652 1653 movd [RAX], XMM3; 1654 } 1655 } 1656 else 1657 static assert(false); 1658 } 1659 else 1660 { 1661 auto A = LM1[x2m1]; 1662 auto B = LM1[x2p0]; 1663 auto C = LM1[x2p0+1]; 1664 auto D = LM1[x2p2]; 1665 1666 auto E = L0[x2m1]; 1667 auto F = L0[x2p0]; 1668 auto G = L0[x2p0+1]; 1669 auto H = L0[x2p2]; 1670 1671 auto I = L1[x2m1]; 1672 auto J = L1[x2p0]; 1673 auto K = L1[x2p0+1]; 1674 auto L = L1[x2p2]; 1675 1676 auto M = L2[x2m1]; 1677 auto N = L2[x2p0]; 1678 auto O = L2[x2p0+1]; 1679 auto P = L2[x2p2]; 1680 1681 // Apply filter 1682 // 1 3 3 1 1683 // 3 9 9 3 1684 // 3 9 9 3 1685 // 1 3 3 1 1686 1687 int rSum = (A.r + D.r + M.r + P.r) + 3 * (B.r + C.r + E.r + H.r + I.r + L.r + N.r + O.r) + 9 * (F.r + G.r + J.r + K.r); 1688 int gSum = (A.g + D.g + M.g + P.g) + 3 * (B.g + C.g + E.g + H.g + I.g + L.g + N.g + O.g) + 9 * (F.g + G.g + J.g + K.g); 1689 int bSum = (A.b + D.b + M.b + P.b) + 3 * (B.b + C.b + E.b + H.b + I.b + L.b + N.b + O.b) + 9 * (F.b + G.b + J.b + K.b); 1690 int aSum = (A.a + D.a + M.a + P.a) + 3 * (B.a + C.a + E.a + H.a + I.a + L.a + N.a + O.a) + 9 * (F.a + G.a + J.a + K.a); 1691 dest[x].r = cast(ubyte)((rSum + 32) >> 6); 1692 dest[x].g = cast(ubyte)((gSum + 32) >> 6); 1693 dest[x].b = cast(ubyte)((bSum + 32) >> 6); 1694 dest[x].a = cast(ubyte)((aSum + 32) >> 6); 1695 } 1696 } 1697 } 1698 } 1699 1700 void generateLevelCubicL16(OwnedImage!L16 thisLevel, 1701 OwnedImage!L16 previousLevel, 1702 box2i updateRect) nothrow @nogc 1703 { 1704 for (int y = updateRect.min.y; y < updateRect.max.y; ++y) 1705 { 1706 int y2m1 = 2 * y - 1; 1707 if (y2m1 < 0) 1708 y2m1 = 0; 1709 1710 int y2p2 = 2 * y + 2; 1711 if (y2p2 > previousLevel.h - 1) 1712 y2p2 = previousLevel.h - 1; 1713 1714 L16* LM1 = previousLevel.scanlinePtr(y2m1); 1715 L16* L0 = previousLevel.scanlinePtr(y * 2); 1716 L16* L1 = previousLevel.scanlinePtr(y * 2 + 1); 1717 L16* L2 = previousLevel.scanlinePtr(y2p2); 1718 L16* dest = thisLevel.scanlinePtr(y); 1719 1720 for (int x = updateRect.min.x; x < updateRect.max.x; ++x) 1721 { 1722 // A B C D 1723 // E F G H 1724 // I J K L 1725 // M N O P 1726 1727 int x2m1 = 2 * x - 1; 1728 if (x2m1 < 0) 1729 x2m1 = 0; 1730 int x2p0 = 2 * x; 1731 int x2p2 = 2 * x + 2; 1732 if (x2p2 > previousLevel.w - 1) 1733 x2p2 = previousLevel.w - 1; 1734 1735 ushort A = LM1[x2m1].l; 1736 ushort B = LM1[x2p0].l; 1737 ushort C = LM1[x2p0+1].l; 1738 ushort D = LM1[x2p2].l; 1739 1740 ushort E = L0[x2m1].l; 1741 ushort F = L0[x2p0].l; 1742 ushort G = L0[x2p0+1].l; 1743 ushort H = L0[x2p2].l; 1744 1745 ushort I = L1[x2m1].l; 1746 ushort J = L1[x2p0].l; 1747 ushort K = L1[x2p0+1].l; 1748 ushort L = L1[x2p2].l; 1749 1750 ushort M = L2[x2m1].l; 1751 ushort N = L2[x2p0].l; 1752 ushort O = L2[x2p0+1].l; 1753 ushort P = L2[x2p2].l; 1754 1755 // Apply filter 1756 // 1 3 3 1 A B C D 1757 // 3 9 9 3 E F G H 1758 // 3 9 9 3 I J K L 1759 // 1 3 3 1 M N O P 1760 1761 int depthSum = (A + D + M + P) 1762 + 3 * (B + C + E + H + I + L + N + O) 1763 + 9 * (F + G + J + K); 1764 dest[x].l = cast(ushort)((depthSum + 32) >> 6 ); 1765 } 1766 } 1767 } 1768 1769 unittest 1770 { 1771 Mipmap!RGBA rgbaMipmap; 1772 Mipmap!L16 l16Mipmap; 1773 }