/**
* Mipmap pyramid implementation.
*
* Copyright: Copyright Auburn Sounds 2015 and later.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
*/
module dplug.graphics.mipmap;

import std.algorithm.comparison;

import gfm.math.vector;
import gfm.math.box;
import dplug.graphics.color;

import dplug.core.nogc;
import dplug.core.vec;
import dplug.graphics.drawex;

// `AsmX86` is set whenever the compiler offers x86 or x86-64 inline assembly.
version( D_InlineAsm_X86 )
{
    version = AsmX86;
}
else version( D_InlineAsm_X86_64 )
{
    version = AsmX86;
}

// Because of the inability to load globals in PIC code with DMD, only enable some assembly with LDC.
// `inlineAsmCanLoadGlobalsInPIC` gates the assembly paths that read module-level constants.
version(LDC)
{
    version( D_InlineAsm_X86 )
    {
        version = inlineAsmCanLoadGlobalsInPIC;
    }
    else version( D_InlineAsm_X86_64 )
    {
        version = inlineAsmCanLoadGlobalsInPIC;
    }
}


/// Mipmapped images.
/// Supports non power-of-two textures.
/// Size of the i+1-th mipmap is { (width)/2, (height)/2 }
/// The mipmap owns each of its levels.
final class Mipmap(COLOR) if (is(COLOR == RGBA) || is(COLOR == L16))
{
public:
nothrow:
@nogc:

    /// Downsampling filter used when a level is generated from the previous one.
    enum Quality
    {
        box,                   // simple 2x2 filter, creates phase problems with NPOT. For higher levels, automatically uses cubic.
        cubic,                 // Very smooth kernel [1 2 1] x [1 2 1]
        boxAlphaCov,           // 2x2 box filter where alpha is used as weight, only implemented for RGBA
        boxAlphaCovIntoPremul, // same as boxAlphaCov but after such a step the next level is alpha-premultiplied
    }

    /// The images of the pyramid; index 0 is the base level.
    Vec!(OwnedImage!COLOR) levels;

    /// Creates an empty mipmap, without any level.
    this()
    {
        levels = makeVec!(OwnedImage!COLOR)();
    }

    /// Set number of levels and size
    /// maxLevel = 0 => only one image
    /// maxLevel = 1 => one image + one 2x downsampled mipmap
    /// etc...
    this(int maxLevel, int w, int h)
    {
        this();
        size(maxLevel, w, h);
    }


    /// Creates a Mipmap out of a flat OwnedImage.
/// This takes ownership of the given image, which is now owned by the `Mipmap`.
    this(int maxLevel, OwnedImage!COLOR level0)
    {
        //PERF: could avoid to create the 0th level only to replace it later

        // Allocate the whole pyramid with the dimensions of the given image...
        this(maxLevel, level0.w, level0.h);

        // ...then swap the freshly created base level for the caller's image.
        levels[0].destroyFree();
        levels[0] = level0;

        // Fill the upper levels from the new base level.
        generateMipmaps(Quality.box);
    }

    /// Allocates the levels and gives each of them its size.
    /// At most `maxLevel + 1` levels are created; fewer when halving `w` or `h` reaches 0 first.
    /// Each level is half the size (rounded down) of the previous one.
    void size(int maxLevel, int w, int h)
    {
        // Count the levels: stop after maxLevel + 1 of them, or as soon as a dimension collapses.
        int levelCount = 0;
        for (int cw = w, ch = h; levelCount <= maxLevel && cw != 0 && ch != 0; ++levelCount)
        {
            cw >>= 1;
            ch >>= 1;
        }

        // FUTURE: cleanup excess levels
        // Shrinking should not happen until we have resizing, hence the hard stop.
        if (levelCount < levels.length)
            assert(false);

        immutable int firstNewLevel = cast(int)levels.length;
        levels.resize(levelCount);

        // Every newly added slot gets its own empty image.
        foreach (level; firstNewLevel .. levelCount)
            levels[level] = mallocNew!(OwnedImage!COLOR)();

        // Give each level its dimensions, halving at every step.
        int levelW = w;
        int levelH = h;
        foreach (level; 0 .. levelCount)
        {
            assert(levelW != 0 && levelH != 0);
            levels[level].size(levelW, levelH);
            levelW >>= 1;
            levelH >>= 1;
        }
    }

    /// Releases every level image owned by this mipmap.
    ~this()
    {
        foreach (ownedLevel; levels)
            ownedLevel.destroyFree();
    }

    /// Interpolates a color between mipmap levels.  Floating-point level, spatial linear interpolation.
    /// x and y are in base level coordinates (top-left pixel is on (0.5, 0.5) coordinates).
    /// Clamped to borders.
/// A negative `level` is treated as level 0.
    auto linearMipmapSample(float level, float x, float y) nothrow @nogc
    {
        // There is nothing finer than level 0. Without this clamp, a level in (-1, 0)
        // yields a negative blend weight and extrapolates beyond the base level.
        if (level < 0)
            level = 0;

        int ilevel = cast(int)level;
        float flevel = level - ilevel;
        vec4f levelN = linearSample(ilevel, x, y);
        if (flevel == 0)
            return levelN;

        // linearSample clamps the level index, so ilevel + 1 past the last level is safe.
        auto levelNp1 = linearSample(ilevel + 1, x, y);

        return levelN * (1 - flevel) + levelNp1 * flevel;
    }


    /// Interpolates a color.  Integer level, spatial linear interpolation.
    /// x and y are in base level coordinates (top-left pixel is on (0.5, 0.5) coordinates).
    /// Clamped to borders.
    /// `level` is clamped to the existing levels; the mipmap must have at least one level.
    /// Returns: the 4 channels as floats for RGBA, a single float for L16 (values are not normalized).
    auto linearSample(int level, float x, float y) nothrow @nogc
    {
        if (level < 0)
            level = 0;
        int numLevels = cast(int)levels.length;
        if (level >= numLevels)
            level = numLevels - 1;

        OwnedImage!COLOR image = levels[level];

        // Scale from base-level coordinates to this level's coordinates: 1 / 2^level.
        // This is computed rather than read from a fixed 14-entry table, so that pyramids
        // with more than 14 levels (base dimension >= 16384) no longer index out of bounds.
        // The reciprocal of a power of two is exact in float, so the result is unchanged
        // for the levels the table used to cover.
        float divider = 1.0f / cast(float)(1 << level);
        x = x * divider - 0.5f;
        y = y * divider - 0.5f;

        float maxX = image.w - 1.001f; // avoids an edge case with truncation
        float maxY = image.h - 1.001f;

        if (x < 0)
            x = 0;
        if (y < 0)
            y = 0;
        if (x > maxX)
            x = maxX;
        if (y > maxY)
            y = maxY;

        // Integer part selects the top-left texel, fractional part is the blend weight.
        int ix = cast(int)x;
        int iy = cast(int)y;
        float fx = x - ix;

        int ixp1 = ix + 1;
        if (ixp1 >= image.w)
            ixp1 = image.w - 1;
        int iyp1 = iy + 1;
        if (iyp1 >= image.h)
            iyp1 = image.h - 1;

        float fxm1 = 1 - fx;
        float fy = y - iy;
        float fym1 = 1 - fy;

        COLOR[] L0 = image.scanline(iy);
        COLOR[] L1 = image.scanline(iyp1);

        // A B
        // C D
        COLOR A = L0.ptr[ix];
        COLOR B = L0.ptr[ixp1];
        COLOR C = L1.ptr[ix];
        COLOR D = L1.ptr[ixp1];

        static if (is(COLOR == RGBA))
        {
            version( AsmX86 )
            {
                vec4f asmResult;

                asm nothrow @nogc
                {
                    // Load the four texels and widen each channel: byte -> word -> dword -> float.
                    movd XMM0, A;
                    movd XMM1, B;
                    movd XMM2, C;
                    movd XMM3, D;
                    pxor XMM4, XMM4;

                    punpcklbw XMM0, XMM4;
                    punpcklbw XMM1, XMM4;
                    punpcklbw XMM2, XMM4;
                    punpcklbw XMM3, XMM4;

                    punpcklwd XMM0, XMM4;
                    punpcklwd XMM1, XMM4;
                    punpcklwd XMM2, XMM4;
                    punpcklwd XMM3, XMM4;

                    cvtdq2ps XMM0, XMM0;
                    cvtdq2ps XMM1, XMM1;

                    cvtdq2ps XMM2, XMM2;
                    cvtdq2ps XMM3, XMM3;

                    // Horizontal blend weights, broadcast to all lanes.
                    movss XMM4, fxm1;
                    pshufd XMM4, XMM4, 0;
                    movss XMM5, fx;
                    pshufd XMM5, XMM5, 0;

                    mulps XMM0, XMM4;
                    mulps XMM1, XMM5;
                    mulps XMM2, XMM4;
                    mulps XMM3, XMM5;

                    // Vertical blend weights, broadcast to all lanes.
                    movss XMM4, fym1;
                    pshufd XMM4, XMM4, 0;
                    movss XMM5, fy;
                    pshufd XMM5, XMM5, 0;

                    addps XMM0, XMM1; // up   = A * fxm1 + B * fx
                    addps XMM2, XMM3; // down = C * fxm1 + D * fx

                    mulps XMM0, XMM4;
                    mulps XMM2, XMM5;

                    addps XMM0, XMM2; // up * fym1 + down * fy

                    movups asmResult, XMM0;
                }

                // Uncomment to check the assembly against the plain D computation
                /*
                vec4f vA = vec4f(A.r, A.g, A.b, A.a);
                vec4f vB = vec4f(B.r, B.g, B.b, B.a);
                vec4f vC = vec4f(C.r, C.g, C.b, C.a);
                vec4f vD = vec4f(D.r, D.g, D.b, D.a);

                vec4f up = vA * fxm1 + vB * fx;
                vec4f down = vC * fxm1 + vD * fx;
                vec4f dResult = up * fym1 + down * fy;

                import gfm.core;

                if (dResult.distanceTo(asmResult) >= 1.0f)
                    debugBreak();
                */

                vec4f result = asmResult;
                return result;
            }
            else
            {
                vec4f vA = vec4f(A.r, A.g, A.b, A.a);
                vec4f vB = vec4f(B.r, B.g, B.b, B.a);
                vec4f vC = vec4f(C.r, C.g, C.b, C.a);
                vec4f vD = vec4f(D.r, D.g, D.b, D.a);

                vec4f up = vA * fxm1 + vB * fx;
                vec4f down = vC * fxm1 + vD * fx;
                vec4f dResult = up * fym1 + down * fy;

                return dResult;
            }
        }
        else
        {
            float up = A.l * fxm1 + B.l * fx;
            float down = C.l * fxm1 + D.l * fx;
            return up * fym1 + down * fy;
        }
    }

    /// Returns: Width of the base level.
int width() pure const nothrow @nogc
    {
        return levels[0].w;
    }

    /// Returns: Height of the base level.
    int height() pure const nothrow @nogc
    {
        return levels[0].h;
    }

    /// Returns: Number of levels. The maximum level is numLevels() - 1.
    int numLevels() pure const nothrow @nogc
    {
        return cast(int)levels.length;
    }

    /// Regenerates all levels above the base level, each one from the level just below it.
    /// The rectangle regenerated at one level is carried over as the dirty rectangle of the next.
    void generateMipmaps(Quality quality) nothrow @nogc
    {
        // Start with the whole base level marked as changed.
        box2i dirty = box2i(0, 0, width(), height());

        int level = 1;
        while (level < numLevels())
        {
            // HACK: Force cubic filter past a level else it makes ugly looking mipmaps
            immutable bool boxTooCoarse = (quality == Quality.box) && (level >= 3);
            if (boxTooCoarse)
                quality = Quality.cubic;

            dirty = generateNextLevel(quality, dirty, level);
            ++level;
        }
    }

    /// Regenerates a single mipmap level based on changes in the provided rectangle.
    /// The rectangle is expressed in the coordinates of the previous level (`level - 1`).
    /// In general if you have several subparts of mipmaps to update, make sure a level is fully completed
    /// before computing the next one.
    /// Returns: the rectangle that was regenerated, in the coordinates of `level`.
box2i generateNextLevel(Quality quality, box2i updateRectPreviousLevel, int level) nothrow @nogc
    {
        OwnedImage!COLOR previousLevel = levels[level - 1];
        // Translate the changed area of the previous level into the area of this level that depends on it.
        box2i updateRect = impactOnNextLevel(quality, updateRectPreviousLevel, previousLevel.w, previousLevel.h);
        generateLevel(level, quality, updateRect);
        return updateRect;
    }

    /// Regenerates one level
    /// updateRect expressed in level i-th coordinates
    /// `level` must be greater than 0, since the pixels are computed from level `level - 1`.
    void generateLevel(int level, Quality quality, box2i updateRect) nothrow @nogc
    {
        assert(level > 0);
        OwnedImage!COLOR thisLevel = levels[level];
        OwnedImage!COLOR previousLevel = levels[level - 1];

        // Dispatch on the filter, then at compile-time on the pixel format.
        final switch(quality) with (Quality)
        {
            case box:

                static if (is(COLOR == RGBA))
                    generateLevelBoxRGBA(thisLevel, previousLevel, updateRect);
                else static if (is(COLOR == L16))
                    generateLevelBoxL16(thisLevel, previousLevel, updateRect);
                else
                    static assert(false, "not implemented");

                // Set to true to verify the optimized box filter against the reference formula.
                enum checkBoxMipmaps = false;

                static if (checkBoxMipmaps)
                {
                    for (int y = updateRect.min.y; y < updateRect.max.y; ++y)
                    {
                        COLOR[] L0 = previousLevel.scanline(y * 2);
                        COLOR[] L1 = previousLevel.scanline(y * 2 + 1);
                        COLOR[] dest = thisLevel.scanline(y);

                        for (int x = updateRect.min.x; x < updateRect.max.x; ++x)
                        {
                            // A B
                            // C D
                            COLOR A = L0[2 * x];
                            COLOR B = L0[2 * x + 1];
                            COLOR C = L1[2 * x];
                            COLOR D = L1[2 * x + 1];
                            assert(dest[x] == COLOR.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D));
                        }
                    }
                }
                break;

            case boxAlphaCov:

                // Alpha-weighted filtering only exists for RGBA; reaching it with another format is a bug.
                static if (is(COLOR == RGBA))
                {
                    generateLevelBoxAlphaCovRGBA(thisLevel, previousLevel, updateRect);

                    // NOTE(review): this disabled checker looks stale. It expects `dest == A` when all four
                    // alphas are zero, while generateLevelBoxAlphaCovRGBA writes RGBA(0,0,0,0) in that case,
                    // and it takes `Image!RGBA*` while OwnedImage!RGBA is passed. Update before re-enabling.
                    static if (false)
                    {
                        void checkLevelBoxAlphaConvRGBA(Image!RGBA* thisLevel, Image!RGBA* previousLevel, box2i updateRect)
                        {
                            for (int y = updateRect.min.y; y < updateRect.max.y; ++y)
                            {
                                RGBA[] L0 = previousLevel.scanline(y * 2);
                                RGBA[] L1 = previousLevel.scanline(y * 2 + 1);
                                RGBA[] dest = thisLevel.scanline(y);

                                for (int x = updateRect.min.x; x < updateRect.max.x; ++x)
                                {
                                    // A B
                                    // C D
                                    RGBA A = L0.ptr[2 * x];
                                    RGBA B = L0.ptr[2 * x + 1];
                                    RGBA C = L1.ptr[2 * x];
                                    RGBA D = L1.ptr[2 * x + 1];

                                    int alphaA = A.a;
                                    int alphaB = B.a;
                                    int alphaC = C.a;
                                    int alphaD = D.a;
                                    int sum = alphaA + alphaB + alphaC + alphaD;
                                    if (sum == 0)
                                    {
                                        assert(dest.ptr[x] == A);
                                    }
                                    else
                                    {
                                        int destAlpha = cast(ubyte)( (alphaA + alphaB + alphaC + alphaD + 2) >> 2 );
                                        int red =   (A.r * alphaA + B.r * alphaB + C.r * alphaC + D.r * alphaD);
                                        int green = (A.g * alphaA + B.g * alphaB + C.g * alphaC + D.g * alphaD);
                                        int blue =  (A.b * alphaA + B.b* alphaB + C.b * alphaC + D.b * alphaD);
                                        float invSum = 1 / cast(float)(sum);

                                        RGBA finalColor = RGBA( cast(ubyte)(0.5f + red * invSum),
                                                                cast(ubyte)(0.5f + green * invSum),
                                                                cast(ubyte)(0.5f + blue * invSum),
                                                                cast(ubyte)destAlpha );
                                        assert(dest.ptr[x] == finalColor);
                                    }
                                }
                            }
                        }
                        checkLevelBoxAlphaConvRGBA(thisLevel, previousLevel, updateRect);
                    }
                    break;
                }
                else
                    assert(false);

            case boxAlphaCovIntoPremul:

                // Same restriction as boxAlphaCov: RGBA only.
                static if (is(COLOR == RGBA))
                {
                    generateLevelBoxAlphaCovIntoPremulRGBA(thisLevel, previousLevel, updateRect);
                    break;
                }
                else
                    assert(false);

            case cubic:
                static if (is(COLOR == RGBA))
                {
                    generateLevelCubicRGBA(thisLevel, previousLevel, updateRect);
                    break;
                }
                else static if (is(COLOR == L16))
                {
                    generateLevelCubicL16(thisLevel, previousLevel, updateRect);
                    break;
                }
                else
                    static assert(false, "not implemented");


        }
    }


private:
    /// Computes impact of updating the area box on next level
    /// Params:
    ///     quality = filter that will be used; the cubic case widens the area by one more source pixel on each side
    ///     area = changed rectangle, in the coordinates of the current level
    ///     currentLevelWidth = width of the current level
    ///     currentLevelHeight = height of the current level
    /// Returns: the rectangle to regenerate in the next level, clipped to that level's size.
    static box2i impactOnNextLevel(Quality quality, box2i area, int currentLevelWidth, int currentLevelHeight) pure nothrow @nogc
    {
        // The next level is half the size of the current one, rounded down.
        box2i maxArea = box2i(0, 0, currentLevelWidth / 2, currentLevelHeight / 2);

        final switch(quality) with (Quality)
        {
        case box:
        case boxAlphaCov:
        case boxAlphaCovIntoPremul:
            // Halve the rectangle: min rounds down, max rounds up.
            int xmin = area.min.x / 2;
            int ymin = area.min.y / 2;
            int xmax = (area.max.x + 1) / 2;
            int ymax = (area.max.y + 1) / 2;
            return box2i(xmin, ymin, xmax, ymax).intersection(maxArea);

        case cubic:
            // Same halving, after extending the rectangle by one source pixel on each side.
            int xmin = (area.min.x - 1) / 2;
            int ymin = (area.min.y - 1) / 2;
            int xmax = (area.max.x + 2) / 2;
            int ymax = (area.max.y + 2) / 2;
            return box2i(xmin, ymin, xmax, ymax).intersection(maxArea);
        }

    }
}

// Smoke test: levels can be sized then destroyed, for both pixel formats, including a non power-of-two size.
unittest
{
    Mipmap!RGBA a = new Mipmap!RGBA();
    a.size(4, 256, 256);
    a.destroy();

    Mipmap!L16 b = new Mipmap!L16();
    b.size(16, 17, 333);
    b.destroy();
}


private:

// 16-byte aligned constants loaded by the SSE assembly of the functions below.
align(16) static immutable short[8] xmmTwoShort = [ 2, 2, 2, 2, 2, 2, 2, 2 ];
align(16) static immutable int[4] xmmTwoInt = [ 2, 2, 2, 2 ];
align(16) static immutable float[4] xmm0_5 = [ 0.5f, 0.5f, 0.5f, 0.5f ];
align(16) static immutable int[4] xmm512 = [ 512, 512, 512, 512 ];
align(16) static immutable short[8] xmm11113333 = [ 1, 1, 1, 1, 3, 3, 3, 3 ];
align(16) static immutable short[8] xmm33331111 = [ 3, 3, 3, 3, 1, 1, 1, 1 ];
align(16) static immutable short[8] xmm33339999 = [ 3, 3, 3, 3, 9, 9, 9, 9 ];
align(16) static immutable short[8] xmm99993333 = [ 9, 9, 9, 9, 3, 3, 3, 3 ];
align(16) static immutable short[8] xmm32 = [ 32, 32, 32, 32, 32, 32, 32, 32 ];


/// Fills `updateRect` of `thisLevel` with the 2x2 box average of `previousLevel`.
/// Each destination pixel is (A + B + C + D + 2) >> 2, per channel, of the 2x2 source block it covers.
/// `updateRect` is expressed in `thisLevel` coordinates.
void generateLevelBoxRGBA(OwnedImage!RGBA thisLevel,
                          OwnedImage!RGBA previousLevel,
                          box2i updateRect) pure nothrow @nogc
{
    int width = updateRect.width();
    int height = updateRect.height();

    // Rows are stepped by the image width, in pixels.
    int previousPitch = previousLevel.w;
    int thisPitch = thisLevel.w;

    // L0 and L1 are the two source rows feeding the current destination row.
    RGBA* L0 = previousLevel.scanline(updateRect.min.y * 2).ptr + updateRect.min.x * 2;
    RGBA* L1 = L0 + previousPitch;
    RGBA*
dest = thisLevel.scanline(updateRect.min.y).ptr + updateRect.min.x;

    for (int y = 0; y < height; ++y)
    {
        version(inlineAsmCanLoadGlobalsInPIC)
        {
            version(D_InlineAsm_X86)
            {
                // Each iteration reads 4 source pixels from both rows and writes 2 destination pixels.
                asm pure nothrow @nogc
                {
                    mov ECX, width;
                    shr ECX, 1;
                    jz no_need; // ECX = 0 => no pair of pixels to process

                    mov EAX, L0;
                    mov EDX, L1;
                    mov EDI, dest;
                    movaps XMM5, xmmTwoShort;

                loop_ecx:
                    movdqu XMM0, [EAX]; // A B E F
                    pxor XMM4, XMM4;
                    movdqu XMM1, [EDX]; // C D G H
                    movdqa XMM2, XMM0;
                    movdqa XMM3, XMM1;
                    punpcklbw XMM0, XMM4; // A B in short
                    punpcklbw XMM1, XMM4; // C D in short
                    punpckhbw XMM2, XMM4; // E F in short
                    punpckhbw XMM3, XMM4; // G H in short
                    paddusw XMM0, XMM1; // A + C | B + D
                    paddusw XMM2, XMM3; // E + G | F + H
                    movdqa XMM1, XMM0;
                    movdqa XMM3, XMM2;
                    psrldq XMM1, 8;
                    psrldq XMM3, 8;
                    add EDI, 8;
                    paddusw XMM0, XMM1; // A + B + C + D | garbage
                    paddusw XMM2, XMM3; // E + F + G + H | garbage
                    paddusw XMM0, XMM5; // A + B + C + D + 2 | garbage
                    paddusw XMM2, XMM5; // E + F + G + H + 2 | garbage
                    psrlw XMM0, 2; // (A + B + C + D + 2) >> 2 | garbage
                    psrlw XMM2, 2; // (E + F + G + H + 2) >> 2 | garbage
                    add EAX, 16;
                    punpcklqdq XMM0, XMM2;
                    add EDX, 16;
                    packuswb XMM0, XMM4; // (A + B + C + D + 2) >> 2 | (E + F + G + H + 2) >> 2 | 0 | 0
                    movq [EDI-8], XMM0;
                    sub ECX, 1;
                    jnz loop_ecx;
                no_need: ;
                }

                // Eventually filter the last pixel
                // (the assembly only advanced register copies, so L0, L1 and dest still point at the row start)
                int remaining = width & ~1;
                for (int x = remaining; x < width; ++x)
                {
                    RGBA A = L0[2 * x];
                    RGBA B = L0[2 * x + 1];
                    RGBA C = L1[2 * x];
                    RGBA D = L1[2 * x + 1];
                    dest[x] = RGBA.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D);
                }
            }
            else version(D_InlineAsm_X86_64)
            {
                // Same algorithm as the 32-bit version, with 64-bit pointer registers.
                asm pure nothrow @nogc
                {
                    mov ECX, width;
                    shr ECX, 1;
                    jz no_need; // ECX = 0 => no pair of pixels to process

                    mov RAX, L0;
                    mov RDX, L1;
                    mov RDI, dest;
                    movaps XMM5, xmmTwoShort;

                loop_ecx:
                    movdqu XMM0, [RAX]; // A B E F
                    pxor XMM4, XMM4;
                    movdqu XMM1, [RDX]; // C D G H
                    movdqa XMM2, XMM0;
                    movdqa XMM3, XMM1;
                    punpcklbw XMM0, XMM4; // A B in short
                    punpcklbw XMM1, XMM4; // C D in short
                    punpckhbw XMM2, XMM4; // E F in short
                    punpckhbw XMM3, XMM4; // G H in short
                    paddusw XMM0, XMM1; // A + C | B + D
                    paddusw XMM2, XMM3; // E + G | F + H
                    movdqa XMM1, XMM0;
                    movdqa XMM3, XMM2;
                    psrldq XMM1, 8;
                    psrldq XMM3, 8;
                    add RDI, 8;
                    paddusw XMM0, XMM1; // A + B + C + D | garbage
                    paddusw XMM2, XMM3; // E + F + G + H | garbage
                    paddusw XMM0, XMM5; // A + B + C + D + 2 | garbage
                    paddusw XMM2, XMM5; // E + F + G + H + 2 | garbage
                    psrlw XMM0, 2; // (A + B + C + D + 2) >> 2 | garbage
                    psrlw XMM2, 2; // (E + F + G + H + 2) >> 2 | garbage
                    add RAX, 16;
                    punpcklqdq XMM0, XMM2;
                    add RDX, 16;
                    packuswb XMM0, XMM4; // (A + B + C + D + 2) >> 2 | (E + F + G + H + 2) >> 2 | 0 | 0
                    movq [RDI-8], XMM0;
                    sub ECX, 1;
                    jnz loop_ecx;
                no_need: ;
                }

                // Eventually filter the last pixel
                int remaining = width & ~1;
                for (int x = remaining; x < width; ++x)
                {
                    RGBA A = L0[2 * x];
                    RGBA B = L0[2 * x + 1];
                    RGBA C = L1[2 * x];
                    RGBA D = L1[2 * x + 1];
                    dest[x] = RGBA.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D);
                }
            }
            else
                static assert(false);
        }
        else
        {
            // Portable reference implementation.
            for (int x = 0; x < width; ++x)
            {
                // A B
                // C D
                RGBA A = L0[2 * x];
                RGBA B = L0[2 * x + 1];
                RGBA C = L1[2 * x];
                RGBA D = L1[2 * x + 1];

                dest[x] = RGBA.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D);
            }
        }

        // One destination row consumes two source rows.
        L0 += (2 * previousPitch);
        L1 += (2 * previousPitch);
        dest += thisPitch;
    }
}

/// Fills `updateRect` of `thisLevel` with the 2x2 box average of `previousLevel`, for 16-bit luminance.
/// Each destination pixel is (A + B + C + D + 2) >> 2 of the 2x2 source block it covers.
/// `updateRect` is expressed in `thisLevel` coordinates.
void generateLevelBoxL16(OwnedImage!L16 thisLevel,
                         OwnedImage!L16 previousLevel,
                         box2i updateRect) pure nothrow @nogc
{
    int width = updateRect.width();
    int
height = updateRect.height();

    // Rows are stepped by the image width, in pixels.
    int previousPitch = previousLevel.w;
    int thisPitch = thisLevel.w;

    // L0 and L1 are the two source rows feeding the current destination row.
    L16* L0 = previousLevel.scanline(updateRect.min.y * 2).ptr + updateRect.min.x * 2;
    L16* L1 = L0 + previousPitch;

    L16* dest = thisLevel.scanline(updateRect.min.y).ptr + updateRect.min.x;

    for (int y = 0; y < height; ++y)
    {
        version(inlineAsmCanLoadGlobalsInPIC)
        {
            version(D_InlineAsm_X86)
            {
                // Each iteration reads 8 source pixels from both rows and writes 4 destination pixels.
                asm pure nothrow @nogc
                {
                    mov ECX, width;
                    shr ECX, 2;
                    jz no_need; // ECX = 0 => less than 4 pixels to process

                    mov EAX, L0;
                    mov EDX, L1;
                    mov EDI, dest;
                    movdqa XMM5, xmmTwoInt;
                    pxor XMM4, XMM4;

                loop_ecx:
                    movdqu XMM0, [EAX]; // A B E F I J M N
                    movdqu XMM1, [EDX]; // C D G H K L O P

                    add EAX, 16;
                    add EDX, 16;

                    movdqa XMM2, XMM0;
                    movdqa XMM3, XMM1;

                    punpcklwd XMM0, XMM4; // A B E F in int32
                    punpckhwd XMM2, XMM4; // I J M N in int32
                    punpcklwd XMM1, XMM4; // C D G H in int32
                    punpckhwd XMM3, XMM4; // K L O P in int32

                    paddd XMM0, XMM1; // A+C B+D E+G F+H
                    paddd XMM2, XMM3; // I+K J+L M+O N+P

                    movdqa XMM1, XMM0;
                    movdqa XMM3, XMM2;

                    psrldq XMM1, 4; // B+D E+G F+H 0
                    psrldq XMM3, 4; // J+L M+O N+P 0

                    paddd XMM0, XMM1; // A+B+C+D garbage E+F+G+H garbage
                    paddd XMM2, XMM3; // I+J+K+L garbage M+N+O+P garbage

                    pshufd XMM0, XMM0, 0b00001000; // A+B+C+D E+F+G+H garbage garbage
                    pshufd XMM2, XMM2, 0b00001000; // I+J+K+L M+N+O+P garbage garbage

                    punpcklqdq XMM0, XMM2; // A+B+C+D E+F+G+H I+J+K+L M+N+O+P
                    paddd XMM0, XMM5; // add 2
                    psrld XMM0, 2; // >> 2

                    // because packusdw is not available before SSE4.1
                    // Extend sign bit to the right
                    pslld XMM0, 16;
                    psrad XMM0, 16;
                    add EDI, 8;
                    packssdw XMM0, XMM4;

                    movq [EDI-8], XMM0;
                    sub ECX, 1;
                    jnz loop_ecx;
                no_need: ;
                }

                // Eventually filter the 0 to 3 pixels
                // (the assembly only advanced register copies, so L0, L1 and dest still point at the row start)
                int remaining = width & ~3;
                for (int x = remaining; x < width; ++x)
                {
                    L16 A = L0[2 * x];
                    L16 B = L0[2 * x + 1];
                    L16 C = L1[2 * x];
                    L16 D = L1[2 * x + 1];
                    dest[x] = L16.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D);
                }
            }
            else version(D_InlineAsm_X86_64)
            {
                // Same algorithm as the 32-bit version, with 64-bit pointer registers.
                asm pure nothrow @nogc
                {
                    mov ECX, width;
                    shr ECX, 2;
                    jz no_need; // ECX = 0 => less than 4 pixels to process

                    mov RAX, L0;
                    mov RDX, L1;
                    mov RDI, dest;
                    movdqa XMM5, xmmTwoInt;
                    pxor XMM4, XMM4;

                loop_ecx:
                    movdqu XMM0, [RAX]; // A B E F I J M N
                    movdqu XMM1, [RDX]; // C D G H K L O P

                    add RAX, 16;
                    add RDX, 16;

                    movdqa XMM2, XMM0;
                    movdqa XMM3, XMM1;

                    punpcklwd XMM0, XMM4; // A B E F in int32
                    punpckhwd XMM2, XMM4; // I J M N in int32
                    punpcklwd XMM1, XMM4; // C D G H in int32
                    punpckhwd XMM3, XMM4; // K L O P in int32

                    paddd XMM0, XMM1; // A+C B+D E+G F+H
                    paddd XMM2, XMM3; // I+K J+L M+O N+P

                    movdqa XMM1, XMM0;
                    movdqa XMM3, XMM2;

                    psrldq XMM1, 4; // B+D E+G F+H 0
                    psrldq XMM3, 4; // J+L M+O N+P 0

                    paddd XMM0, XMM1; // A+B+C+D garbage E+F+G+H garbage
                    paddd XMM2, XMM3; // I+J+K+L garbage M+N+O+P garbage

                    pshufd XMM0, XMM0, 0b00001000; // A+B+C+D E+F+G+H garbage garbage
                    pshufd XMM2, XMM2, 0b00001000; // I+J+K+L M+N+O+P garbage garbage

                    punpcklqdq XMM0, XMM2; // A+B+C+D E+F+G+H I+J+K+L M+N+O+P
                    paddd XMM0, XMM5; // add 2
                    psrld XMM0, 2; // >> 2

                    // because packusdw is not available before SSE4.1
                    // Extend sign bit to the right
                    pslld XMM0, 16;
                    psrad XMM0, 16;
                    add RDI, 8;
                    packssdw XMM0, XMM4;

                    movq [RDI-8], XMM0;
                    sub ECX, 1;
                    jnz loop_ecx;
                no_need: ;
                }

                // Eventually filter the 0 to 3 pixels
                int remaining = width & ~3;
                for (int x = remaining; x < width; ++x)
                {
                    L16 A = L0[2 * x];
                    L16 B = L0[2 * x + 1];
                    L16 C = L1[2 * x];
                    L16 D = L1[2 * x + 1];
                    dest[x] = L16.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D);
                }
            }
            else
                static assert(false);
        }
        else
        {
            // Portable reference implementation.
            for (int x = 0; x < width; ++x)
            {
                // A B
                // C D
                L16 A = L0[2 * x];
                L16 B = L0[2 * x + 1];
                L16 C = L1[2 * x];
                L16 D = L1[2 * x + 1];

                dest[x] = L16.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D);
            }
        }

        // One destination row consumes two source rows.
        L0 += (2 * previousPitch);
        L1 += (2 * previousPitch);
        dest += thisPitch;
    }
}


/// Fills `updateRect` of `thisLevel` from the 2x2 blocks of `previousLevel`, using alpha as the weight.
/// Color channels are the alpha-weighted average of the four source pixels, alpha is their rounded mean,
/// and a block whose four alphas are all zero produces RGBA(0,0,0,0).
/// `updateRect` is expressed in `thisLevel` coordinates. An empty rectangle is a no-op.
void generateLevelBoxAlphaCovRGBA(OwnedImage!RGBA thisLevel,
                                  OwnedImage!RGBA previousLevel,
                                  box2i updateRect) nothrow @nogc
{
    int width = updateRect.width();
    int height = updateRect.height();

    // The assembly row loop below tests its counter only after processing a pixel. With
    // width == 0 it would write outside the rectangle and wrap the counter around, and the
    // `assert(width > 0)` protecting it is compiled out of release builds. Bail out early
    // instead, which is also what the portable loop does for an empty row.
    if (width <= 0)
        return;

    // Rows are stepped by the image width, in pixels.
    int previousPitch = previousLevel.w;
    int thisPitch = thisLevel.w;

    // L0 and L1 are the two source rows feeding the current destination row.
    RGBA* L0 = previousLevel.scanline(updateRect.min.y * 2).ptr + updateRect.min.x * 2;
    RGBA* L1 = L0 + previousPitch;

    RGBA* dest = thisLevel.scanline(updateRect.min.y).ptr + updateRect.min.x;

    for (int y = 0; y < height; ++y)
    {
        version(inlineAsmCanLoadGlobalsInPIC)
        {
            version(D_InlineAsm_X86)
            {
                assert(width > 0);
                asm nothrow @nogc
                {
                    mov ECX, width;

                    mov EAX, L0;
                    mov EDX, L1;
                    mov EDI, dest;

                loop_ecx:

                    movq XMM0, [EAX]; // Ar Ag Ab Aa Br Bg Bb Ba + zeroes
                    movq XMM1, [EDX]; // Cr Cg Cb Ca Dr Dg Db Da + zeroes
                    pxor XMM4, XMM4;
                    add EAX, 8;
                    add EDX, 8;

                    punpcklbw XMM0, XMM4; // Ar Ag Ab Aa Br Bg Bb Ba
                    punpcklbw XMM1, XMM4; // Cr Cg Cb Ca Dr Dg Db Da

                    movdqa XMM2, XMM0;
                    punpcklwd XMM0, XMM1; // Ar Cr Ag Cg Ab Cb Aa Ca
                    punpckhwd XMM2, XMM1; // Br Dr Bg Dg Bb Db Ba Da

                    // perhaps unnecessary
                    movdqa XMM3, XMM0;
                    punpcklwd XMM0, XMM2; // Ar Br Cr Dr Ag Bg Cg Dg
                    punpckhwd XMM3, XMM2; // Ab Bb Cb Db Aa Ba Ca Da

                    movdqa XMM1, XMM3;
                    punpckhqdq XMM1, XMM1; // Aa Ba Ca Da Aa Ba Ca Da

                    // Are alpha all zeroes? if so, early continue.
movdqa XMM2, XMM1;
                    pcmpeqb XMM2, XMM4;
                    add EDI, 4;
                    pmovmskb ESI, XMM2;
                    cmp ESI, 0xffff; // all 16 bytes equal to zero <=> the four alphas are zero
                    jnz non_null;

                    pxor XMM0, XMM0;
                    sub ECX, 1;
                    movd [EDI-4], XMM0; // dest[x] = RGBA(0,0,0,0); movd leaves the flags from `sub` intact
                    jnz loop_ecx;
                    jmp end_of_loop;

                non_null:

                    pmaddwd XMM0, XMM1; // Ar*Aa+Br*Ba Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da
                    pmaddwd XMM3, XMM1; // Ab*Aa+Bb*Ba Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da

                    // Starting computing sum of coefficients too
                    punpcklwd XMM1, XMM4; // Aa Ba Ca Da

                    movdqa XMM2, XMM0;
                    movdqa XMM5, XMM3;
                    movdqa XMM4, XMM1;
                    psrldq XMM4, 8;

                    psrldq XMM2, 4; // Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da 0
                    psrldq XMM5, 4; // Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da 0
                    paddq XMM1, XMM4; // Aa+Ca Ba+Da garbage garbage
                    movdqa XMM4, XMM1;

                    paddd XMM0, XMM2; // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da garbage Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage
                    paddd XMM3, XMM5; // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da garbage Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage
                    psrldq XMM4, 4;

                    pshufd XMM0, XMM0, 0b00001000; // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage garbage
                    paddq XMM1, XMM4; // Aa+Ba+Ca+Da garbage garbage garbage
                    pshufd XMM3, XMM3, 0b00001000; // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage garbage

                    punpcklqdq XMM0, XMM3; // fR fG fB fA
                    pshufd XMM1, XMM1, 0;

                    cvtdq2ps XMM0, XMM0;

                    cvtdq2ps XMM3, XMM1; // sum sum sum sum

                    divps XMM0, XMM3; // fR/sum fG/sum fB/sum fA/sum
                    addps XMM0, xmm0_5;
                    cvttps2dq XMM0, XMM0; // return into integer domain using cast(int)(x + 0.5f)

                    paddd XMM1, xmmTwoInt;
                    psrld XMM1, 2; // finalAlpha finalAlpha finalAlpha finalAlpha

                    pslldq XMM0, 4; // 0 fR/sum fG/sum fB/sum
                    pslldq XMM1, 12; // 0 0 0 finalAlpha
                    psrldq XMM0, 4; // fR/sum fG/sum fB/sum 0

                    por XMM0, XMM1; // fR/sum fG/sum fB/sum finalAlpha
                    pxor XMM3, XMM3;
                    packssdw XMM0, XMM3; // same in words
                    packuswb XMM0, XMM3; // same in bytes

                    sub ECX, 1;
                    movd [EDI-4], XMM0; // dest[x] = final color
                    jnz loop_ecx;
                end_of_loop: ;
                }
            }
            else version(D_InlineAsm_X86_64)
            {
                // Same algorithm as the 32-bit version, with 64-bit pointer registers.
                assert(width > 0);
                asm nothrow @nogc
                {
                    mov ECX, width;

                    mov RAX, L0;
                    mov RDX, L1;
                    mov RDI, dest;

                loop_ecx:

                    movq XMM0, [RAX]; // Ar Ag Ab Aa Br Bg Bb Ba + zeroes
                    movq XMM1, [RDX]; // Cr Cg Cb Ca Dr Dg Db Da + zeroes
                    pxor XMM4, XMM4;
                    add RAX, 8;
                    add RDX, 8;

                    punpcklbw XMM0, XMM4; // Ar Ag Ab Aa Br Bg Bb Ba
                    punpcklbw XMM1, XMM4; // Cr Cg Cb Ca Dr Dg Db Da

                    movdqa XMM2, XMM0;
                    punpcklwd XMM0, XMM1; // Ar Cr Ag Cg Ab Cb Aa Ca
                    punpckhwd XMM2, XMM1; // Br Dr Bg Dg Bb Db Ba Da

                    // perhaps unnecessary
                    movdqa XMM3, XMM0;
                    punpcklwd XMM0, XMM2; // Ar Br Cr Dr Ag Bg Cg Dg
                    punpckhwd XMM3, XMM2; // Ab Bb Cb Db Aa Ba Ca Da

                    movdqa XMM1, XMM3;
                    punpckhqdq XMM1, XMM1; // Aa Ba Ca Da Aa Ba Ca Da

                    // Are alpha all zeroes? if so, early continue.
                    movdqa XMM2, XMM1;
                    pcmpeqb XMM2, XMM4;
                    add RDI, 4;
                    pmovmskb ESI, XMM2;
                    cmp ESI, 0xffff; // all 16 bytes equal to zero <=> the four alphas are zero
                    jnz non_null;

                    pxor XMM0, XMM0;
                    sub ECX, 1;
                    movd [RDI-4], XMM0; // dest[x] = RGBA(0,0,0,0); movd leaves the flags from `sub` intact
                    jnz loop_ecx;
                    jmp end_of_loop;

                non_null:

                    pmaddwd XMM0, XMM1; // Ar*Aa+Br*Ba Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da
                    pmaddwd XMM3, XMM1; // Ab*Aa+Bb*Ba Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da

                    // Starting computing sum of coefficients too
                    punpcklwd XMM1, XMM4; // Aa Ba Ca Da

                    movdqa XMM2, XMM0;
                    movdqa XMM5, XMM3;
                    movdqa XMM4, XMM1;
                    psrldq XMM4, 8;

                    psrldq XMM2, 4; // Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da 0
                    psrldq XMM5, 4; // Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da 0
                    paddq XMM1, XMM4; // Aa+Ca Ba+Da garbage garbage
                    movdqa XMM4, XMM1;

                    paddd XMM0, XMM2; // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da garbage Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage
                    paddd XMM3, XMM5; // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da garbage Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage
                    psrldq XMM4, 4;

                    pshufd XMM0, XMM0, 0b00001000; // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage garbage
                    paddq XMM1, XMM4; // Aa+Ba+Ca+Da garbage garbage garbage
                    pshufd XMM3, XMM3, 0b00001000; // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage garbage

                    punpcklqdq XMM0, XMM3; // fR fG fB fA
                    pshufd XMM1, XMM1, 0;

                    cvtdq2ps XMM0, XMM0;

                    cvtdq2ps XMM3, XMM1; // sum sum sum sum

                    divps XMM0, XMM3; // fR/sum fG/sum fB/sum fA/sum
                    addps XMM0, xmm0_5;
                    cvttps2dq XMM0, XMM0; // return into integer domain using cast(int)(x + 0.5f)

                    paddd XMM1, xmmTwoInt;
                    psrld XMM1, 2; // finalAlpha finalAlpha finalAlpha finalAlpha

                    pslldq XMM0, 4; // 0 fR/sum fG/sum fB/sum
                    pslldq XMM1, 12; // 0 0 0 finalAlpha
                    psrldq XMM0, 4; // fR/sum fG/sum fB/sum 0

                    por XMM0, XMM1; // fR/sum fG/sum fB/sum finalAlpha
                    pxor XMM3, XMM3;
                    packssdw XMM0, XMM3; // same in words
                    packuswb XMM0, XMM3; // same in bytes

                    sub ECX, 1;
                    movd [RDI-4], XMM0; // dest[x] = final color
                    jnz loop_ecx;
                end_of_loop: ;
                }
            }
            else
                static assert(false);
        }
        else
        {
            // Portable reference implementation.
            for (int x = 0; x < width; ++x)
            {
                // A B
                // C D
                RGBA A = L0[2 * x];
                RGBA B = L0[2 * x + 1];
                RGBA C = L1[2 * x];
                RGBA D = L1[2 * x + 1];

                int alphaA = A.a;
                int alphaB = B.a;
                int alphaC = C.a;
                int alphaD = D.a;
                int sum = alphaA + alphaB + alphaC + alphaD;
                if (sum == 0)
                {
                    // Fully transparent block: no color can be weighted, output transparent black.
                    dest[x] = RGBA(0,0,0,0);
                }
                else
                {
                    // Alpha is the rounded mean; colors are the alpha-weighted mean, rounded to nearest.
                    int destAlpha = cast(ubyte)( (alphaA + alphaB + alphaC + alphaD + 2) >> 2 );
                    int red =   (A.r * alphaA + B.r * alphaB + C.r * alphaC + D.r * alphaD);
                    int green = (A.g * alphaA + B.g * alphaB + C.g * alphaC + D.g * alphaD);
                    int blue =  (A.b * alphaA + B.b* alphaB + C.b * alphaC + D.b * alphaD);
                    float invSum = 1 / cast(float)(sum);

                    RGBA finalColor = RGBA( cast(ubyte)(0.5f + red * invSum),
                                            cast(ubyte)(0.5f + green * invSum),
                                            cast(ubyte)(0.5f + blue * invSum),
                                            cast(ubyte)destAlpha );
                    dest[x] = finalColor;
                }
            }
        }

        // Set to true to compare the row just written against the reference formula.
        enum verify = false;

        static if (verify)
        {
            for (int x = 0; x < width; ++x)
            {
                // A B
                // C D
                RGBA A = L0[2 * x];
                RGBA B = L0[2 * x + 1];
                RGBA C = L1[2 * x];
                RGBA D = L1[2 * x + 1];

                int alphaA = A.a;
                int alphaB = B.a;
                int alphaC = C.a;
                int alphaD = D.a;
                int sum = alphaA + alphaB + alphaC + alphaD;
                if (sum == 0)
                {
                    assert(dest[x] == RGBA(0,0,0,0));
                }
                else
                {
                    int destAlpha = cast(ubyte)( (alphaA + alphaB + alphaC + alphaD + 2) >> 2 );
                    int red =   (A.r * alphaA + B.r * alphaB + C.r * alphaC + D.r * alphaD);
                    int green = (A.g * alphaA + B.g * alphaB + C.g * alphaC + D.g * alphaD);
                    int blue =  (A.b * alphaA + B.b* alphaB + C.b * alphaC + D.b * alphaD);

                    float invSum = 1 / cast(float)(sum);

                    RGBA finalColor = RGBA( cast(ubyte)(0.5f + red * invSum),
                                            cast(ubyte)(0.5f + green * invSum),
                                            cast(ubyte)(0.5f + blue * invSum),
                                            cast(ubyte)destAlpha );
                    RGBA instead = dest[x];

                    int insteadR = instead.r;
                    int insteadG = instead.g;
                    int insteadB = instead.b;
                    int insteadA = instead.a;
                    int finalColorR = finalColor.r;
                    int finalColorG = finalColor.g;
                    int finalColorB = finalColor.b;
                    int finalColorA = finalColor.a;
                    import std.math;
                    assert(abs(insteadR - finalColorR) <= 1); // some remaining differences because of rounding
                    assert(abs(insteadG - finalColorG) <= 1);
                    assert(abs(insteadB - finalColorB) <= 1);
                    assert(insteadA == finalColorA);
                }
            }
        }

        // One destination row consumes two source rows.
        L0 += (2 * previousPitch);
        L1 += (2 * previousPitch);
        dest += thisPitch;
    }
}

// NOTE(review): unlike generateLevelBoxAlphaCovRGBA, this function has no `assert(width > 0)` before its
// assembly row loop, which also starts by processing a pixel. Presumably callers must never pass an
// empty-width rectangle - confirm, or add a guard.
void generateLevelBoxAlphaCovIntoPremulRGBA(OwnedImage!RGBA thisLevel,
                                            OwnedImage!RGBA previousLevel,
                                            box2i updateRect) nothrow @nogc
{
    int width = updateRect.width();
    int height = updateRect.height();

    int previousPitch = previousLevel.w;
    int thisPitch = thisLevel.w;

    RGBA* L0 = previousLevel.scanline(updateRect.min.y * 2).ptr + updateRect.min.x * 2;
    RGBA* L1 = L0 + previousPitch;

    RGBA* dest = thisLevel.scanline(updateRect.min.y).ptr + updateRect.min.x;

    for (int y = 0; y < height; ++y)
    {
        version(inlineAsmCanLoadGlobalsInPIC)
        {
            version(D_InlineAsm_X86)
            {
                asm nothrow @nogc
                {
                    mov ECX, width;

                    mov EAX, L0;
                    mov EDX, L1;
                    mov EDI, dest;

                    movdqa XMM5, xmm512; // 512 512 512 512
                    pxor XMM4, XMM4; // all zeroes

                loop_ecx:

                    movq XMM0, [EAX]; // Ar Ag Ab Aa Br Bg Bb Ba + zeroes
                    movq XMM1, [EDX]; // Cr Cg Cb Ca Dr Dg Db Da + zeroes
                    pxor XMM4, XMM4;
                    add EAX, 8;
                    add EDX, 8;

                    punpcklbw
XMM0, XMM4; // Ar Ag Ab Aa Br Bg Bb Ba 1264 punpcklbw XMM1, XMM4; // Cr Cg Cb Ca Dr Dg Db Da 1265 1266 movdqa XMM2, XMM0; 1267 punpcklwd XMM0, XMM1; // Ar Cr Ag Cg Ab Cb Aa Ca 1268 punpckhwd XMM2, XMM1; // Br Dr Bg Dg Bb Db Ba Da 1269 1270 movdqa XMM3, XMM0; 1271 punpcklwd XMM0, XMM2; // Ar Br Cr Dr Ag Bg Cg Dg 1272 punpckhwd XMM3, XMM2; // Ab Bb Cb Db Aa Ba Ca Da 1273 1274 movdqa XMM1, XMM3; 1275 punpckhqdq XMM1, XMM1; // Aa Ba Ca Da Aa Ba Ca Da 1276 1277 add EDI, 4; 1278 1279 pmaddwd XMM0, XMM1; // Ar*Aa+Br*Ba Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da 1280 pmaddwd XMM3, XMM1; // Ab*Aa+Bb*Ba Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da 1281 1282 movdqa XMM2, XMM0; 1283 movdqa XMM1, XMM3; 1284 1285 psrldq XMM2, 4; // Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da 0 1286 psrldq XMM1, 4; // Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da 0 1287 1288 paddd XMM0, XMM2; // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da garbage Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage 1289 paddd XMM3, XMM1; // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da garbage Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage 1290 1291 pshufd XMM0, XMM0, 0b00001000; // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage garbage 1292 pshufd XMM3, XMM3, 0b00001000; // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage garbage 1293 1294 punpcklqdq XMM0, XMM3; // fR fG fB fA 1295 1296 1297 paddd XMM0, XMM5; 1298 psrld XMM0, 10; // final color in dwords 1299 1300 packssdw XMM0, XMM4; // same in words 1301 packuswb XMM0, XMM4; // same in bytes 1302 1303 sub ECX, 1; 1304 movd [EDI-4], XMM0; // dest[x] = A 1305 jnz loop_ecx; 1306 } 1307 } 1308 else version(D_InlineAsm_X86_64) 1309 { 1310 asm nothrow @nogc 1311 { 1312 mov ECX, width; 1313 1314 mov RAX, L0; 1315 mov RDX, L1; 1316 mov RDI, dest; 1317 1318 movdqa XMM5, xmm512; // 512 512 5121 512 1319 pxor XMM4, XMM4; // all zeroes 1320 1321 loop_ecx: 1322 1323 movq XMM0, [RAX]; // Ar Ag Ab Aa Br Bg Bb Ba + zeroes 1324 movq XMM1, [RDX]; // Cr Cg Cb Ca Dr Dg Db Da + zeroes 1325 pxor XMM4, XMM4; 1326 add RAX, 8; 1327 add RDX, 8; 1328 1329 punpcklbw XMM0, XMM4; // 
Ar Ag Ab Aa Br Bg Bb Ba 1330 punpcklbw XMM1, XMM4; // Cr Cg Cb Ca Dr Dg Db Da 1331 1332 movdqa XMM2, XMM0; 1333 punpcklwd XMM0, XMM1; // Ar Cr Ag Cg Ab Cb Aa Ca 1334 punpckhwd XMM2, XMM1; // Br Dr Bg Dg Bb Db Ba Da 1335 1336 movdqa XMM3, XMM0; 1337 punpcklwd XMM0, XMM2; // Ar Br Cr Dr Ag Bg Cg Dg 1338 punpckhwd XMM3, XMM2; // Ab Bb Cb Db Aa Ba Ca Da 1339 1340 movdqa XMM1, XMM3; 1341 punpckhqdq XMM1, XMM1; // Aa Ba Ca Da Aa Ba Ca Da 1342 1343 add RDI, 4; 1344 1345 pmaddwd XMM0, XMM1; // Ar*Aa+Br*Ba Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da 1346 pmaddwd XMM3, XMM1; // Ab*Aa+Bb*Ba Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da 1347 1348 movdqa XMM2, XMM0; 1349 movdqa XMM1, XMM3; 1350 1351 psrldq XMM2, 4; // Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da 0 1352 psrldq XMM1, 4; // Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da 0 1353 1354 paddd XMM0, XMM2; // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da garbage Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage 1355 paddd XMM3, XMM1; // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da garbage Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage 1356 1357 pshufd XMM0, XMM0, 0b00001000; // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage garbage 1358 pshufd XMM3, XMM3, 0b00001000; // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage garbage 1359 1360 punpcklqdq XMM0, XMM3; // fR fG fB fA 1361 1362 1363 paddd XMM0, XMM5; 1364 psrld XMM0, 10; // final color in dwords 1365 1366 packssdw XMM0, XMM4; // same in words 1367 packuswb XMM0, XMM4; // same in bytes 1368 1369 sub ECX, 1; 1370 movd [RDI-4], XMM0; // dest[x] = A 1371 jnz loop_ecx; 1372 } 1373 } 1374 else 1375 static assert(false); 1376 } 1377 else 1378 { 1379 for (int x = 0; x < width; ++x) 1380 { 1381 RGBA A = L0[2 * x]; 1382 RGBA B = L0[2 * x + 1]; 1383 RGBA C = L1[2 * x]; 1384 RGBA D = L1[2 * x + 1]; 1385 int red = (A.r * A.a + B.r * B.a + C.r * C.a + D.r * D.a); 1386 int green = (A.g * A.a + B.g * B.a + C.g * C.a + D.g * D.a); 1387 int blue = (A.b * A.a + B.b* B.a + C.b * C.a + D.b * D.a); 1388 int alpha = (A.a * A.a + B.a* B.a + C.a * C.a + D.a * D.a); 1389 RGBA finalColor 
= RGBA( cast(ubyte)((red + 512) >> 10), 1390 cast(ubyte)((green + 512) >> 10), 1391 cast(ubyte)((blue + 512) >> 10), 1392 cast(ubyte)((alpha + 512) >> 10)); 1393 dest[x] = finalColor; 1394 } 1395 } 1396 1397 enum bool verify = false; 1398 1399 static if (verify) 1400 { 1401 for (int x = 0; x < width; ++x) 1402 { 1403 RGBA A = L0[2 * x]; 1404 RGBA B = L0[2 * x + 1]; 1405 RGBA C = L1[2 * x]; 1406 RGBA D = L1[2 * x + 1]; 1407 int red = (A.r * A.a + B.r * B.a + C.r * C.a + D.r * D.a); 1408 int green = (A.g * A.a + B.g * B.a + C.g * C.a + D.g * D.a); 1409 int blue = (A.b * A.a + B.b* B.a + C.b * C.a + D.b * D.a); 1410 int alpha = (A.a * A.a + B.a* B.a + C.a * C.a + D.a * D.a); 1411 RGBA finalColor = RGBA( cast(ubyte)((red + 512) >> 10), 1412 cast(ubyte)((green + 512) >> 10), 1413 cast(ubyte)((blue + 512) >> 10), 1414 cast(ubyte)((alpha + 512) >> 10)); 1415 assert(dest[x] == finalColor); 1416 } 1417 } 1418 1419 L0 += (2 * previousPitch); 1420 L1 += (2 * previousPitch); 1421 dest += thisPitch; 1422 } 1423 } 1424 1425 void generateLevelCubicRGBA(OwnedImage!RGBA thisLevel, 1426 OwnedImage!RGBA previousLevel, 1427 box2i updateRect) nothrow @nogc 1428 { 1429 for (int y = updateRect.min.y; y < updateRect.max.y; ++y) 1430 { 1431 int y2m1 = 2 * y - 1; 1432 if (y2m1 < 0) 1433 y2m1 = 0; 1434 1435 int y2p2 = 2 * y + 2; 1436 if (y2p2 > previousLevel.h - 1) 1437 y2p2 = previousLevel.h - 1; 1438 1439 RGBA* LM1 = previousLevel.scanline(y2m1).ptr; 1440 RGBA* L0 = previousLevel.scanline(y * 2).ptr; 1441 RGBA* L1 = previousLevel.scanline(y * 2 + 1).ptr; 1442 RGBA* L2 = previousLevel.scanline(y2p2).ptr; 1443 RGBA* dest = thisLevel.scanline(y).ptr; 1444 1445 for (int x = updateRect.min.x; x < updateRect.max.x; ++x) 1446 { 1447 // A B C D 1448 // E F G H 1449 // I J K L 1450 // M N O P 1451 1452 int x2m1 = 2 * x - 1; 1453 if (x2m1 < 0) 1454 x2m1 = 0; 1455 int x2p0 = 2 * x; 1456 int x2p2 = 2 * x + 2; 1457 if (x2p2 > previousLevel.w - 1) 1458 x2p2 = previousLevel.w - 1; 1459 1460 
version(inlineAsmCanLoadGlobalsInPIC) 1461 { 1462 version(D_InlineAsm_X86) 1463 { 1464 RGBA[16] buf = void; 1465 buf[0] = LM1[x2m1]; 1466 buf[1] = LM1[x2p0]; 1467 buf[2] = LM1[x2p0+1]; 1468 buf[3] = LM1[x2p2]; 1469 buf[4] = L0[x2m1]; 1470 buf[5] = L0[x2p0]; 1471 buf[6] = L0[x2p0+1]; 1472 buf[7] = L0[x2p2]; 1473 buf[8] = L1[x2m1]; 1474 buf[9] = L1[x2p0]; 1475 buf[10] = L1[x2p0+1]; 1476 buf[11] = L1[x2p2]; 1477 buf[12] = L2[x2m1]; 1478 buf[13] = L2[x2p0]; 1479 buf[14] = L2[x2p0+1]; 1480 buf[15] = L2[x2p2]; 1481 RGBA* pDest = dest + x; 1482 1483 asm nothrow @nogc 1484 { 1485 movdqu XMM0, buf; // A B C D 1486 movdqu XMM1, buf; 1487 pxor XMM2, XMM2; // zeroes 1488 punpcklbw XMM0, XMM2; // A B 1489 punpckhbw XMM1, XMM2; // C D 1490 pmullw XMM0, xmm11113333; // A*1 B*3 in shorts 1491 movdqa XMM3, XMM0; 1492 pmullw XMM1, xmm33331111; // C*3 D*3 in shorts 1493 movdqa XMM5, XMM1; 1494 1495 movdqu XMM0, buf+16; // E F G H 1496 movdqu XMM1, buf+16; 1497 punpcklbw XMM0, XMM2; // E F 1498 punpckhbw XMM1, XMM2; // G H 1499 pmullw XMM0, xmm33339999; // E*3 F*9 in shorts 1500 paddw XMM3, XMM0; 1501 pmullw XMM1, xmm99993333; // G*9 H*3 in shorts 1502 paddw XMM5, XMM1; 1503 1504 movdqu XMM0, buf+32; // I J K L 1505 movdqu XMM1, buf+32; 1506 punpcklbw XMM0, XMM2; // I J 1507 punpckhbw XMM1, XMM2; // K L 1508 pmullw XMM0, xmm33339999; // I*3 J*9 in shorts 1509 paddw XMM3, XMM0; 1510 pmullw XMM1, xmm99993333; // K*9 L*3 in shorts 1511 paddw XMM5, XMM1; 1512 1513 movdqu XMM0, buf+48; // M N O P 1514 movdqu XMM1, buf+48; 1515 punpcklbw XMM0, XMM2; // M N 1516 punpckhbw XMM1, XMM2; // O P 1517 pmullw XMM0, xmm11113333; // M*1 N*3 in shorts 1518 paddw XMM3, XMM0; // A+E*3+I*3+M B*3+F*9+J*9+3*N 1519 pmullw XMM1, xmm33331111; // O*3 P*1 in shorts 1520 paddw XMM5, XMM1; // C*3+G*9+K*9+O*3 D+H*3+L*3+P 1521 1522 movdqa XMM0, XMM3; 1523 movdqa XMM1, XMM5; 1524 psrldq XMM0, 8; 1525 psrldq XMM1, 8; 1526 paddw XMM3, XMM0; // A+E*3+I*3+M+B*3+F*9+J*9+3*N garbage(x4) 1527 paddw XMM5, XMM1; // 
C*3+G*9+K*9+O*3+D+H*3+L*3+P garbage(x4) 1528 paddw XMM3, XMM5; // total-sum garbage(x4) 1529 1530 paddw XMM3, xmm32; 1531 psrlw XMM3, 6; 1532 mov EAX, pDest; 1533 packuswb XMM3, XMM2; 1534 1535 movd [EAX], XMM3; 1536 } 1537 } 1538 else version(D_InlineAsm_X86_64) 1539 { 1540 RGBA[16] buf = void; 1541 buf[0] = LM1[x2m1]; 1542 buf[1] = LM1[x2p0]; 1543 buf[2] = LM1[x2p0+1]; 1544 buf[3] = LM1[x2p2]; 1545 buf[4] = L0[x2m1]; 1546 buf[5] = L0[x2p0]; 1547 buf[6] = L0[x2p0+1]; 1548 buf[7] = L0[x2p2]; 1549 buf[8] = L1[x2m1]; 1550 buf[9] = L1[x2p0]; 1551 buf[10] = L1[x2p0+1]; 1552 buf[11] = L1[x2p2]; 1553 buf[12] = L2[x2m1]; 1554 buf[13] = L2[x2p0]; 1555 buf[14] = L2[x2p0+1]; 1556 buf[15] = L2[x2p2]; 1557 RGBA* pDest = dest + x; 1558 1559 asm nothrow @nogc 1560 { 1561 movdqu XMM0, buf; // A B C D 1562 movdqu XMM1, buf; 1563 pxor XMM2, XMM2; // zeroes 1564 punpcklbw XMM0, XMM2; // A B 1565 punpckhbw XMM1, XMM2; // C D 1566 pmullw XMM0, xmm11113333; // A*1 B*3 in shorts 1567 movdqa XMM3, XMM0; 1568 pmullw XMM1, xmm33331111; // C*3 D*3 in shorts 1569 movdqa XMM5, XMM1; 1570 1571 movdqu XMM0, buf+16; // E F G H 1572 movdqu XMM1, buf+16; 1573 punpcklbw XMM0, XMM2; // E F 1574 punpckhbw XMM1, XMM2; // G H 1575 pmullw XMM0, xmm33339999; // E*3 F*9 in shorts 1576 paddw XMM3, XMM0; 1577 pmullw XMM1, xmm99993333; // G*9 H*3 in shorts 1578 paddw XMM5, XMM1; 1579 1580 movdqu XMM0, buf+32; // I J K L 1581 movdqu XMM1, buf+32; 1582 punpcklbw XMM0, XMM2; // I J 1583 punpckhbw XMM1, XMM2; // K L 1584 pmullw XMM0, xmm33339999; // I*3 J*9 in shorts 1585 paddw XMM3, XMM0; 1586 pmullw XMM1, xmm99993333; // K*9 L*3 in shorts 1587 paddw XMM5, XMM1; 1588 1589 movdqu XMM0, buf+48; // M N O P 1590 movdqu XMM1, buf+48; 1591 punpcklbw XMM0, XMM2; // M N 1592 punpckhbw XMM1, XMM2; // O P 1593 pmullw XMM0, xmm11113333; // M*1 N*3 in shorts 1594 paddw XMM3, XMM0; // A+E*3+I*3+M B*3+F*9+J*9+3*N 1595 pmullw XMM1, xmm33331111; // O*3 P*1 in shorts 1596 paddw XMM5, XMM1; // C*3+G*9+K*9+O*3 D+H*3+L*3+P 1597 
1598 movdqa XMM0, XMM3; 1599 movdqa XMM1, XMM5; 1600 psrldq XMM0, 8; 1601 psrldq XMM1, 8; 1602 paddw XMM3, XMM0; // A+E*3+I*3+M+B*3+F*9+J*9+3*N garbage(x4) 1603 paddw XMM5, XMM1; // C*3+G*9+K*9+O*3+D+H*3+L*3+P garbage(x4) 1604 paddw XMM3, XMM5; // total-sum garbage(x4) 1605 1606 paddw XMM3, xmm32; 1607 psrlw XMM3, 6; 1608 mov RAX, pDest; 1609 packuswb XMM3, XMM2; 1610 1611 movd [RAX], XMM3; 1612 } 1613 } 1614 else 1615 static assert(false); 1616 } 1617 else 1618 { 1619 auto A = LM1[x2m1]; 1620 auto B = LM1[x2p0]; 1621 auto C = LM1[x2p0+1]; 1622 auto D = LM1[x2p2]; 1623 1624 auto E = L0[x2m1]; 1625 auto F = L0[x2p0]; 1626 auto G = L0[x2p0+1]; 1627 auto H = L0[x2p2]; 1628 1629 auto I = L1[x2m1]; 1630 auto J = L1[x2p0]; 1631 auto K = L1[x2p0+1]; 1632 auto L = L1[x2p2]; 1633 1634 auto M = L2[x2m1]; 1635 auto N = L2[x2p0]; 1636 auto O = L2[x2p0+1]; 1637 auto P = L2[x2p2]; 1638 1639 // Apply filter 1640 // 1 3 3 1 1641 // 3 9 9 3 1642 // 3 9 9 3 1643 // 1 3 3 1 1644 1645 int rSum = (A.r + D.r + M.r + P.r) + 3 * (B.r + C.r + E.r + H.r + I.r + L.r + N.r + O.r) + 9 * (F.r + G.r + J.r + K.r); 1646 int gSum = (A.g + D.g + M.g + P.g) + 3 * (B.g + C.g + E.g + H.g + I.g + L.g + N.g + O.g) + 9 * (F.g + G.g + J.g + K.g); 1647 int bSum = (A.b + D.b + M.b + P.b) + 3 * (B.b + C.b + E.b + H.b + I.b + L.b + N.b + O.b) + 9 * (F.b + G.b + J.b + K.b); 1648 int aSum = (A.a + D.a + M.a + P.a) + 3 * (B.a + C.a + E.a + H.a + I.a + L.a + N.a + O.a) + 9 * (F.a + G.a + J.a + K.a); 1649 dest[x].r = cast(ubyte)((rSum + 32) >> 6); 1650 dest[x].g = cast(ubyte)((gSum + 32) >> 6); 1651 dest[x].b = cast(ubyte)((bSum + 32) >> 6); 1652 dest[x].a = cast(ubyte)((aSum + 32) >> 6); 1653 } 1654 } 1655 } 1656 } 1657 1658 void generateLevelCubicL16(OwnedImage!L16 thisLevel, 1659 OwnedImage!L16 previousLevel, 1660 box2i updateRect) nothrow @nogc 1661 { 1662 for (int y = updateRect.min.y; y < updateRect.max.y; ++y) 1663 { 1664 int y2m1 = 2 * y - 1; 1665 if (y2m1 < 0) 1666 y2m1 = 0; 1667 1668 int y2p2 = 2 * y + 2; 
1669 if (y2p2 > previousLevel.h - 1) 1670 y2p2 = previousLevel.h - 1; 1671 1672 L16* LM1 = previousLevel.scanline(y2m1).ptr; 1673 L16* L0 = previousLevel.scanline(y * 2).ptr; 1674 L16* L1 = previousLevel.scanline(y * 2 + 1).ptr; 1675 L16* L2 = previousLevel.scanline(y2p2).ptr; 1676 L16* dest = thisLevel.scanline(y).ptr; 1677 1678 for (int x = updateRect.min.x; x < updateRect.max.x; ++x) 1679 { 1680 // A B C D 1681 // E F G H 1682 // I J K L 1683 // M N O P 1684 1685 int x2m1 = 2 * x - 1; 1686 if (x2m1 < 0) 1687 x2m1 = 0; 1688 int x2p0 = 2 * x; 1689 int x2p2 = 2 * x + 2; 1690 if (x2p2 > previousLevel.w - 1) 1691 x2p2 = previousLevel.w - 1; 1692 1693 ushort A = LM1[x2m1].l; 1694 ushort B = LM1[x2p0].l; 1695 ushort C = LM1[x2p0+1].l; 1696 ushort D = LM1[x2p2].l; 1697 1698 ushort E = L0[x2m1].l; 1699 ushort F = L0[x2p0].l; 1700 ushort G = L0[x2p0+1].l; 1701 ushort H = L0[x2p2].l; 1702 1703 ushort I = L1[x2m1].l; 1704 ushort J = L1[x2p0].l; 1705 ushort K = L1[x2p0+1].l; 1706 ushort L = L1[x2p2].l; 1707 1708 ushort M = L2[x2m1].l; 1709 ushort N = L2[x2p0].l; 1710 ushort O = L2[x2p0+1].l; 1711 ushort P = L2[x2p2].l; 1712 1713 // Apply filter 1714 // 1 3 3 1 A B C D 1715 // 3 9 9 3 E F G H 1716 // 3 9 9 3 I J K L 1717 // 1 3 3 1 M N O P 1718 1719 int depthSum = (A + D + M + P) 1720 + 3 * (B + C + E + H + I + L + N + O) 1721 + 9 * (F + G + J + K); 1722 dest[x].l = cast(ushort)((depthSum + 32) >> 6 ); 1723 } 1724 } 1725 } 1726 1727 unittest 1728 { 1729 Mipmap!RGBA rgbaMipmap; 1730 Mipmap!L16 l16Mipmap; 1731 }