/**
* Copyright: Copyright Auburn Sounds 2015 and later.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Guillaume Piolat
*/
module dplug.graphics.mipmap;

import std.algorithm.comparison;

import gfm.math.vector;
import gfm.math.box;
import dplug.graphics.color;

import dplug.core.nogc;
import dplug.core.alignedbuffer;
import dplug.graphics.drawex;

// `AsmX86` is set whenever D inline assembly is available, for either x86 or x86-64.
version( D_InlineAsm_X86 )
{
    version = AsmX86;
}
else version( D_InlineAsm_X86_64 )
{
    version = AsmX86;
}

// DMD's inline assembler is unable to load globals in PIC code, so the assembly paths
// that read global constants are only enabled with LDC.
version(LDC)
{
    version( D_InlineAsm_X86 )
    {
        version = inlineAsmCanLoadGlobalsInPIC;
    }
    else version( D_InlineAsm_X86_64 )
    {
        version = inlineAsmCanLoadGlobalsInPIC;
    }
}


/// Mipmapped images.
/// Supports non power-of-two textures.
/// Size of the i+1-th mipmap is { (width)/2, (height)/2 } of the i-th one, rounded down.
/// The mipmap owns each of its levels.
final class Mipmap(COLOR) if (is(COLOR == RGBA) || is(COLOR == L16))
{
public:
nothrow:
@nogc:

    /// Filter used to compute a level from the previous one.
    enum Quality
    {
        box,                   // simple 2x2 filter, creates phase problems with NPOT. `generateMipmaps` automatically switches to cubic for levels >= 3.
        cubic,                 // Very smooth kernel [1 2 1] x [1 2 1]
        boxAlphaCov,           // 2x2 box filter where alpha is used as weight, only implemented for RGBA
        boxAlphaCovIntoPremul, // same as boxAlphaCov but after such a step the next level is alpha-premultiplied
    }

    /// The images of the mipmap; `levels[0]` is the base (full-size) level.
    AlignedBuffer!(OwnedImage!COLOR) levels;

    /// Creates a mipmap whose level buffer is freshly made; call `size` to allocate the levels.
    this()
    {
        levels = makeAlignedBuffer!(OwnedImage!COLOR)();
    }

    /// Set number of levels and size
    /// maxLevel = 0 => only one image
    /// maxLevel = 1 => one image + one 2x downsampled mipmap
    /// etc...
    /// `w` and `h` are the dimensions of the base level.
    this(int maxLevel, int w, int h)
    {
        this();
        size(maxLevel, w, h);
    }


    /// Creates a Mipmap out of a flat OwnedImage.
/// Ownership of `level0` is transferred to the `Mipmap`, which frees it together with its other levels.
/// The upper levels are generated right away with `Quality.box`.
this(int maxLevel, OwnedImage!COLOR level0)
{
    // PERF: a throw-away level 0 is allocated here only to be replaced just below.
    this(maxLevel, level0.w, level0.h);

    // Swap the freshly allocated base level for the caller's image.
    levels[0].destroyFree();
    levels[0] = level0;

    generateMipmaps(Quality.box);
}

/// Allocates the levels for a `w` x `h` base image. Each level has half the width and
/// half the height (rounded down) of the previous one.
/// At most `maxLevel + 1` levels are created; fewer if a dimension reaches zero first.
/// Reducing the number of levels of an existing mipmap is not supported yet.
void size(int maxLevel, int w, int h)
{
    // Count the levels: stop past `maxLevel`, or as soon as a level would be empty.
    int neededLevels = 0;
    for (int wr = w, hr = h; neededLevels <= maxLevel && wr != 0 && hr != 0; ++neededLevels)
    {
        wr >>= 1;
        hr >>= 1;
    }

    // FUTURE: cleanup excess levels
    // should not happen until we have resizing
    if (neededLevels < levels.length)
        assert(false);

    // Grow the buffer, then give every new slot an empty image.
    const int previousCount = cast(int) levels.length;
    levels.resize(neededLevels);
    foreach (int i; previousCount .. neededLevels)
        levels[i] = mallocEmplace!(OwnedImage!COLOR)();

    // Give each level its dimensions, halving at every step.
    int levelW = w;
    int levelH = h;
    foreach (int i; 0 .. neededLevels)
    {
        assert(levelW != 0 && levelH != 0);
        levels[i].size(levelW, levelH);
        levelW >>= 1;
        levelH >>= 1;
    }
}

/// Frees every level owned by this mipmap.
~this()
{
    foreach (int i; 0 .. cast(int) levels.length)
        levels[i].destroyFree();
}

/// Interpolates a color between mipmap levels.  Floating-point level, spatial linear interpolation.
/// x and y are in base level coordinates (top-left pixel is on (0.5, 0.5) coordinates).
/// Clamped to borders.
/// A `level` below 0 is treated as 0; levels past the last one sample the last level.
auto linearMipmapSample(float level, float x, float y) nothrow @nogc
{
    // Clamp negative levels to the base level. Without this, a level in (-1, 0) truncates
    // to level 0 with a negative fraction, and the blend below extrapolates away from level 1.
    if (level < 0)
        level = 0;

    int ilevel = cast(int)level;
    float flevel = level - ilevel;
    vec4f levelN = linearSample(ilevel, x, y);
    if (flevel == 0)
        return levelN; // exactly on a level: no need to sample the next one

    auto levelNp1 = linearSample(ilevel + 1, x, y);

    return levelN * (1 - flevel) + levelNp1 * flevel;
}


/// Interpolates a color.  Integer level, spatial linear interpolation.
/// x and y are in base level coordinates (top-left pixel is on (0.5, 0.5) coordinates).
/// Clamped to borders. `level` is clamped to the existing levels.
/// Returns: a `vec4f` holding r, g, b, a in the 0..255 range for `RGBA`, a `float` luminance for `L16`.
auto linearSample(int level, float x, float y) nothrow @nogc
{
    int numLevels = cast(int)levels.length;
    assert(numLevels > 0); // sampling needs at least the base level

    if (level < 0)
        level = 0;
    if (level >= numLevels)
        level = numLevels - 1;

    OwnedImage!COLOR image = levels[level];


    // 1 / 2^level for the most common levels.
    static immutable float[14] factors = [ 1.0f, 0.5f, 0.25f, 0.125f,
                                           0.0625f, 0.03125f, 0.015625f, 0.0078125f,
                                           0.00390625f, 0.001953125f, 0.0009765625f, 0.00048828125f,
                                           0.000244140625f, 0.0001220703125f];

    // The table only covers levels 0..13, but very large images can have more levels:
    // compute the (exact, power-of-two) factor instead of indexing past the table.
    float divider = (level < cast(int)factors.length) ? factors[level]
                                                      : 1.0f / (1 << level);

    // Convert base level coordinates into this level's texel coordinates.
    x = x * divider - 0.5f;
    y = y * divider - 0.5f;

    float maxX = image.w - 1.001f; // avoids an edge case with truncation
    float maxY = image.h - 1.001f;

    if (x < 0)
        x = 0;
    if (y < 0)
        y = 0;
    if (x > maxX)
        x = maxX;
    if (y > maxY)
        y = maxY;

    int ix = cast(int)x;
    int iy = cast(int)y;
    float fx = x - ix;

    // Neighbour texels, clamped so that 1-pixel wide or high images stay in bounds.
    int ixp1 = ix + 1;
    if (ixp1 >= image.w)
        ixp1 = image.w - 1;
    int iyp1 = iy + 1;
    if (iyp1 >= image.h)
        iyp1 = image.h - 1;

    float fxm1 = 1 - fx;
    float fy = y - iy;
    float fym1 = 1 - fy;

    COLOR[] L0 = image.scanline(iy);
    COLOR[] L1 = image.scanline(iyp1);

    // A B
    // C D
    COLOR A = L0.ptr[ix];
    COLOR B = L0.ptr[ixp1];
    COLOR C = L1.ptr[ix];
    COLOR D = L1.ptr[ixp1];

    static if (is(COLOR == RGBA))
    {
        version( AsmX86 )
        {
            vec4f asmResult;

            asm nothrow @nogc
            {
                // Load the four texels (4 bytes each).
                movd XMM0, A;
                movd XMM1, B;
                movd XMM2, C;
                movd XMM3, D;
                pxor XMM4, XMM4;

                // bytes => words
                punpcklbw XMM0, XMM4;
                punpcklbw XMM1, XMM4;
                punpcklbw XMM2, XMM4;
                punpcklbw XMM3, XMM4;

                // words => dwords
                punpcklwd XMM0, XMM4;
                punpcklwd XMM1, XMM4;
                punpcklwd XMM2, XMM4;
                punpcklwd XMM3, XMM4;

                // dwords => floats
                cvtdq2ps XMM0, XMM0;
                cvtdq2ps XMM1, XMM1;

                cvtdq2ps XMM2, XMM2;
                cvtdq2ps XMM3, XMM3;

                // Broadcast the horizontal weights.
                movss XMM4, fxm1;
                pshufd XMM4, XMM4, 0;
                movss XMM5, fx;
                pshufd XMM5, XMM5, 0;

                mulps XMM0, XMM4;       // A * fxm1
                mulps XMM1, XMM5;       // B * fx
                mulps XMM2, XMM4;       // C * fxm1
                mulps XMM3, XMM5;       // D * fx

                // Broadcast the vertical weights.
                movss XMM4, fym1;
                pshufd XMM4, XMM4, 0;
                movss XMM5, fy;
                pshufd XMM5, XMM5, 0;

                addps XMM0, XMM1;       // up   = A * fxm1 + B * fx
                addps XMM2, XMM3;       // down = C * fxm1 + D * fx

                mulps XMM0, XMM4;       // up * fym1
                mulps XMM2, XMM5;       // down * fy

                addps XMM0, XMM2;       // up * fym1 + down * fy

                movups asmResult, XMM0;
            }

            // Uncomment to check the assembly against the reference D computation
            /*
            vec4f vA = vec4f(A.r, A.g, A.b, A.a);
            vec4f vB = vec4f(B.r, B.g, B.b, B.a);
            vec4f vC = vec4f(C.r, C.g, C.b, C.a);
            vec4f vD = vec4f(D.r, D.g, D.b, D.a);

            vec4f up = vA * fxm1 + vB * fx;
            vec4f down = vC * fxm1 + vD * fx;
            vec4f dResult = up * fym1 + down * fy;

            import gfm.core;

            if (dResult.distanceTo(asmResult) >= 1.0f)
                debugBreak();
            */

            vec4f result = asmResult;
            return result;
        }
        else
        {
            vec4f vA = vec4f(A.r, A.g, A.b, A.a);
            vec4f vB = vec4f(B.r, B.g, B.b, B.a);
            vec4f vC = vec4f(C.r, C.g, C.b, C.a);
            vec4f vD = vec4f(D.r, D.g, D.b, D.a);

            // Bilinear blend: horizontally on both rows, then vertically.
            vec4f up = vA * fxm1 + vB * fx;
            vec4f down = vC * fxm1 + vD * fx;
            vec4f dResult = up * fym1 + down * fy;

            return dResult;
        }
    }
    else
    {
        // L16: same bilinear blend on the single luminance channel.
        float up = A.l * fxm1 + B.l * fx;
        float down = C.l * fxm1 + D.l * fx;
        return up * fym1 + down * fy;
    }
}

/// Returns: Width of the base level.
int width() pure const nothrow @nogc
{
    return levels[0].w;
}

/// Returns: Height of the base level (level 0).
int height() pure const nothrow @nogc
{
    return levels[0].h;
}

/// Returns: How many levels this mipmap holds. Valid level indices are 0 .. numLevels() - 1.
int numLevels() pure const nothrow @nogc
{
    return cast(int)levels.length;
}

/// Recomputes every level above the base one, each from the level just below it.
/// `quality` is the filter to use; `Quality.box` is only honoured for levels 1 and 2.
void generateMipmaps(Quality quality) nothrow @nogc
{
    // Start with the whole base level marked as changed.
    box2i dirty = box2i(0, 0, width(), height());

    const int count = numLevels();
    foreach (int level; 1 .. count)
    {
        // HACK: Force cubic filter past a level else it makes ugly looking mipmaps
        const Quality effective = (level >= 3 && quality == Quality.box) ? Quality.cubic
                                                                         : quality;

        // Each step returns the changed area in the coordinates of the level it just wrote.
        dirty = generateNextLevel(effective, dirty, level);
    }
}

/// Regenerates a single mipmap level based on changes in the provided rectangle.
/// The rectangle is expressed in the coordinates of the previous level (`level - 1`).
/// In general if you have several subparts of mipmaps to update, make sure a level is fully completed
/// before computing the next one.
361 box2i generateNextLevel(Quality quality, box2i updateRectPreviousLevel, int level) nothrow @nogc 362 { 363 OwnedImage!COLOR previousLevel = levels[level - 1]; 364 box2i updateRect = impactOnNextLevel(quality, updateRectPreviousLevel, previousLevel.w, previousLevel.h); 365 generateLevel(level, quality, updateRect); 366 return updateRect; 367 } 368 369 /// Regenerates one level 370 /// updateRect expressed in level i-th coordinates 371 void generateLevel(int level, Quality quality, box2i updateRect) nothrow @nogc 372 { 373 assert(level > 0); 374 OwnedImage!COLOR thisLevel = levels[level]; 375 OwnedImage!COLOR previousLevel = levels[level - 1]; 376 377 final switch(quality) with (Quality) 378 { 379 case box: 380 381 static if (is(COLOR == RGBA)) 382 generateLevelBoxRGBA(thisLevel, previousLevel, updateRect); 383 else static if (is(COLOR == L16)) 384 generateLevelBoxL16(thisLevel, previousLevel, updateRect); 385 else 386 static assert(false, "not implemented"); 387 388 enum checkBoxMipmaps = false; 389 390 static if (checkBoxMipmaps) 391 { 392 for (int y = updateRect.min.y; y < updateRect.max.y; ++y) 393 { 394 COLOR[] L0 = previousLevel.scanline(y * 2); 395 COLOR[] L1 = previousLevel.scanline(y * 2 + 1); 396 COLOR[] dest = thisLevel.scanline(y); 397 398 for (int x = updateRect.min.x; x < updateRect.max.x; ++x) 399 { 400 // A B 401 // C D 402 COLOR A = L0[2 * x]; 403 COLOR B = L0[2 * x + 1]; 404 COLOR C = L1[2 * x]; 405 COLOR D = L1[2 * x + 1]; 406 assert(dest[x] == COLOR.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D)); 407 } 408 } 409 } 410 break; 411 412 case boxAlphaCov: 413 414 static if (is(COLOR == RGBA)) 415 { 416 generateLevelBoxAlphaCovRGBA(thisLevel, previousLevel, updateRect); 417 418 static if (false) 419 { 420 void checkLevelBoxAlphaConvRGBA(Image!RGBA* thisLevel, Image!RGBA* previousLevel, box2i updateRect) 421 { 422 for (int y = updateRect.min.y; y < updateRect.max.y; ++y) 423 { 424 RGBA[] L0 = previousLevel.scanline(y * 2); 425 RGBA[] L1 = 
previousLevel.scanline(y * 2 + 1); 426 RGBA[] dest = thisLevel.scanline(y); 427 428 for (int x = updateRect.min.x; x < updateRect.max.x; ++x) 429 { 430 // A B 431 // C D 432 RGBA A = L0.ptr[2 * x]; 433 RGBA B = L0.ptr[2 * x + 1]; 434 RGBA C = L1.ptr[2 * x]; 435 RGBA D = L1.ptr[2 * x + 1]; 436 437 int alphaA = A.a; 438 int alphaB = B.a; 439 int alphaC = C.a; 440 int alphaD = D.a; 441 int sum = alphaA + alphaB + alphaC + alphaD; 442 if (sum == 0) 443 { 444 assert(dest.ptr[x] == A); 445 } 446 else 447 { 448 int destAlpha = cast(ubyte)( (alphaA + alphaB + alphaC + alphaD + 2) >> 2 ); 449 int red = (A.r * alphaA + B.r * alphaB + C.r * alphaC + D.r * alphaD); 450 int green = (A.g * alphaA + B.g * alphaB + C.g * alphaC + D.g * alphaD); 451 int blue = (A.b * alphaA + B.b* alphaB + C.b * alphaC + D.b * alphaD); 452 float invSum = 1 / cast(float)(sum); 453 454 RGBA finalColor = RGBA( cast(ubyte)(0.5f + red * invSum), 455 cast(ubyte)(0.5f + green * invSum), 456 cast(ubyte)(0.5f + blue * invSum), 457 cast(ubyte)destAlpha ); 458 assert(dest.ptr[x] == finalColor); 459 } 460 } 461 } 462 } 463 checkLevelBoxAlphaConvRGBA(thisLevel, previousLevel, updateRect); 464 } 465 break; 466 } 467 else 468 assert(false); 469 470 case boxAlphaCovIntoPremul: 471 472 static if (is(COLOR == RGBA)) 473 { 474 generateLevelBoxAlphaCovIntoPremulRGBA(thisLevel, previousLevel, updateRect); 475 break; 476 } 477 else 478 assert(false); 479 480 case cubic: 481 static if (is(COLOR == RGBA)) 482 { 483 generateLevelCubicRGBA(thisLevel, previousLevel, updateRect); 484 break; 485 } 486 else static if (is(COLOR == L16)) 487 { 488 generateLevelCubicL16(thisLevel, previousLevel, updateRect); 489 break; 490 } 491 else 492 static assert(false, "not implemented"); 493 494 495 } 496 } 497 498 499 private: 500 /// Computes impact of updating the area box on next level 501 static box2i impactOnNextLevel(Quality quality, box2i area, int currentLevelWidth, int currentLevelHeight) pure nothrow @nogc 502 { 503 box2i maxArea 
= box2i(0, 0, currentLevelWidth / 2, currentLevelHeight / 2); 504 505 final switch(quality) with (Quality) 506 { 507 case box: 508 case boxAlphaCov: 509 case boxAlphaCovIntoPremul: 510 int xmin = area.min.x / 2; 511 int ymin = area.min.y / 2; 512 int xmax = (area.max.x + 1) / 2; 513 int ymax = (area.max.y + 1) / 2; 514 return box2i(xmin, ymin, xmax, ymax).intersection(maxArea); 515 516 case cubic: 517 int xmin = (area.min.x - 1) / 2; 518 int ymin = (area.min.y - 1) / 2; 519 int xmax = (area.max.x + 2) / 2; 520 int ymax = (area.max.y + 2) / 2; 521 return box2i(xmin, ymin, xmax, ymax).intersection(maxArea); 522 } 523 524 } 525 } 526 527 unittest 528 { 529 Mipmap!RGBA a = new Mipmap!RGBA(); 530 a.size(4, 256, 256); 531 a.destroy(); 532 533 Mipmap!L16 b = new Mipmap!L16(); 534 b.size(16, 17, 333); 535 b.destroy(); 536 } 537 538 539 private: 540 541 align(16) static immutable short[8] xmmTwoShort = [ 2, 2, 2, 2, 2, 2, 2, 2 ]; 542 align(16) static immutable int[4] xmmTwoInt = [ 2, 2, 2, 2 ]; 543 align(16) static immutable float[4] xmm0_5 = [ 0.5f, 0.5f, 0.5f, 0.5f ]; 544 align(16) static immutable int[4] xmm512 = [ 512, 512, 512, 512 ]; 545 align(16) static immutable short[8] xmm11113333 = [ 1, 1, 1, 1, 3, 3, 3, 3 ]; 546 align(16) static immutable short[8] xmm33331111 = [ 3, 3, 3, 3, 1, 1, 1, 1 ]; 547 align(16) static immutable short[8] xmm33339999 = [ 3, 3, 3, 3, 9, 9, 9, 9 ]; 548 align(16) static immutable short[8] xmm99993333 = [ 9, 9, 9, 9, 3, 3, 3, 3 ]; 549 align(16) static immutable short[8] xmm32 = [ 32, 32, 32, 32, 32, 32, 32, 32 ]; 550 551 552 void generateLevelBoxRGBA(OwnedImage!RGBA thisLevel, 553 OwnedImage!RGBA previousLevel, 554 box2i updateRect) pure nothrow @nogc 555 { 556 int width = updateRect.width(); 557 int height = updateRect.height(); 558 559 int previousPitch = previousLevel.w; 560 int thisPitch = thisLevel.w; 561 562 RGBA* L0 = previousLevel.scanline(updateRect.min.y * 2).ptr + updateRect.min.x * 2; 563 RGBA* L1 = L0 + previousPitch; 564 RGBA* 
dest = thisLevel.scanline(updateRect.min.y).ptr + updateRect.min.x; 565 566 for (int y = 0; y < height; ++y) 567 { 568 version(inlineAsmCanLoadGlobalsInPIC) 569 { 570 version(D_InlineAsm_X86) 571 { 572 asm pure nothrow @nogc 573 { 574 mov ECX, width; 575 shr ECX, 1; 576 jz no_need; // ECX = 0 => no pair of pixels to process 577 578 mov EAX, L0; 579 mov EDX, L1; 580 mov EDI, dest; 581 movaps XMM5, xmmTwoShort; 582 583 loop_ecx: 584 movdqu XMM0, [EAX]; // A B E F 585 pxor XMM4, XMM4; 586 movdqu XMM1, [EDX]; // C D G H 587 movdqa XMM2, XMM0; 588 movdqa XMM3, XMM1; 589 punpcklbw XMM0, XMM4; // A B in short 590 punpcklbw XMM1, XMM4; // C D in short 591 punpckhbw XMM2, XMM4; // E F in short 592 punpckhbw XMM3, XMM4; // G H in short 593 paddusw XMM0, XMM1; // A + C | B + D 594 paddusw XMM2, XMM3; // E + F | G + H 595 movdqa XMM1, XMM0; 596 movdqa XMM3, XMM2; 597 psrldq XMM1, 8; 598 psrldq XMM3, 8; 599 add EDI, 8; 600 paddusw XMM0, XMM1; // A + B + C + D | garbage 601 paddusw XMM2, XMM3; // E + F + G + H | garbage 602 paddusw XMM0, XMM5; // A + B + C + D + 2 | garbage 603 paddusw XMM2, XMM5; // E + F + G + H + 2 | garbage 604 psrlw XMM0, 2; // (A + B + C + D + 2) >> 2 | garbage 605 psrlw XMM2, 2; // (E + F + G + H + 2) >> 2 | garbage 606 add EAX, 16; 607 punpcklqdq XMM0, XMM2; 608 add EDX, 16; 609 packuswb XMM0, XMM4; // (A + B + C + D + 2) >> 2 | (E + F + G + H + 2) >> 2 | 0 | 0 610 movq [EDI-8], XMM0; 611 sub ECX, 1; 612 jnz loop_ecx; 613 no_need: ; 614 } 615 616 // Eventually filter the last pixel 617 int remaining = width & ~1; 618 for (int x = remaining; x < width; ++x) 619 { 620 RGBA A = L0[2 * x]; 621 RGBA B = L0[2 * x + 1]; 622 RGBA C = L1[2 * x]; 623 RGBA D = L1[2 * x + 1]; 624 dest[x] = RGBA.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D); 625 } 626 } 627 else version(D_InlineAsm_X86_64) 628 { 629 asm pure nothrow @nogc 630 { 631 mov ECX, width; 632 shr ECX, 1; 633 jz no_need; // ECX = 0 => no pair of pixels to process 634 635 mov RAX, L0; 636 mov RDX, L1; 637 mov 
RDI, dest; 638 movaps XMM5, xmmTwoShort; 639 640 loop_ecx: 641 movdqu XMM0, [RAX]; // A B E F 642 pxor XMM4, XMM4; 643 movdqu XMM1, [RDX]; // C D G H 644 movdqa XMM2, XMM0; 645 movdqa XMM3, XMM1; 646 punpcklbw XMM0, XMM4; // A B in short 647 punpcklbw XMM1, XMM4; // C D in short 648 punpckhbw XMM2, XMM4; // E F in short 649 punpckhbw XMM3, XMM4; // G H in short 650 paddusw XMM0, XMM1; // A + C | B + D 651 paddusw XMM2, XMM3; // E + F | G + H 652 movdqa XMM1, XMM0; 653 movdqa XMM3, XMM2; 654 psrldq XMM1, 8; 655 psrldq XMM3, 8; 656 add RDI, 8; 657 paddusw XMM0, XMM1; // A + B + C + D | garbage 658 paddusw XMM2, XMM3; // E + F + G + H | garbage 659 paddusw XMM0, XMM5; // A + B + C + D + 2 | garbage 660 paddusw XMM2, XMM5; // E + F + G + H + 2 | garbage 661 psrlw XMM0, 2; // (A + B + C + D + 2) >> 2 | garbage 662 psrlw XMM2, 2; // (E + F + G + H + 2) >> 2 | garbage 663 add RAX, 16; 664 punpcklqdq XMM0, XMM2; 665 add RDX, 16; 666 packuswb XMM0, XMM4; // (A + B + C + D + 2) >> 2 | (E + F + G + H + 2) >> 2 | 0 | 0 667 movq [RDI-8], XMM0; 668 sub ECX, 1; 669 jnz loop_ecx; 670 no_need: ; 671 } 672 673 // Eventually filter the last pixel 674 int remaining = width & ~1; 675 for (int x = remaining; x < width; ++x) 676 { 677 RGBA A = L0[2 * x]; 678 RGBA B = L0[2 * x + 1]; 679 RGBA C = L1[2 * x]; 680 RGBA D = L1[2 * x + 1]; 681 dest[x] = RGBA.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D); 682 } 683 } 684 else 685 static assert(false); 686 } 687 else 688 { 689 for (int x = 0; x < width; ++x) 690 { 691 // A B 692 // C D 693 RGBA A = L0[2 * x]; 694 RGBA B = L0[2 * x + 1]; 695 RGBA C = L1[2 * x]; 696 RGBA D = L1[2 * x + 1]; 697 698 dest[x] = RGBA.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D); 699 } 700 } 701 702 L0 += (2 * previousPitch); 703 L1 += (2 * previousPitch); 704 dest += thisPitch; 705 } 706 } 707 708 void generateLevelBoxL16(OwnedImage!L16 thisLevel, 709 OwnedImage!L16 previousLevel, 710 box2i updateRect) pure nothrow @nogc 711 { 712 int width = updateRect.width(); 713 int 
height = updateRect.height(); 714 715 int previousPitch = previousLevel.w; 716 int thisPitch = thisLevel.w; 717 718 L16* L0 = previousLevel.scanline(updateRect.min.y * 2).ptr + updateRect.min.x * 2; 719 L16* L1 = L0 + previousPitch; 720 721 L16* dest = thisLevel.scanline(updateRect.min.y).ptr + updateRect.min.x; 722 723 for (int y = 0; y < height; ++y) 724 { 725 version(inlineAsmCanLoadGlobalsInPIC) 726 { 727 version(D_InlineAsm_X86) 728 { 729 asm pure nothrow @nogc 730 { 731 mov ECX, width; 732 shr ECX, 2; 733 jz no_need; // ECX = 0 => less than 4 pixels to process 734 735 mov EAX, L0; 736 mov EDX, L1; 737 mov EDI, dest; 738 movdqa XMM5, xmmTwoInt; 739 pxor XMM4, XMM4; 740 741 loop_ecx: 742 movdqu XMM0, [EAX]; // A B E F I J M N 743 movdqu XMM1, [EDX]; // C D G H K L O P 744 745 add EAX, 16; 746 add EDX, 16; 747 748 movdqa XMM2, XMM0; 749 movdqa XMM3, XMM1; 750 751 punpcklwd XMM0, XMM4; // A B E F in int32 752 punpckhwd XMM2, XMM4; // I J M N in int32 753 punpcklwd XMM1, XMM4; // C D G H in int32 754 punpckhwd XMM3, XMM4; // K L O P in int32 755 756 paddd XMM0, XMM1; // A+C B+D E+G F+H 757 paddd XMM2, XMM3; // I+K J+L M+O N+P 758 759 movdqa XMM1, XMM0; 760 movdqa XMM3, XMM2; 761 762 psrldq XMM1, 4; // B+D E+G F+H 0 763 psrldq XMM3, 4; // J+L M+O N+P 0 764 765 paddd XMM0, XMM1; // A+B+C+D garbage E+F+G+H garbage 766 paddd XMM2, XMM3; // I+J+K+L garbage M+N+O+P garbage 767 768 pshufd XMM0, XMM0, 0b00001000; // A+B+C+D E+F+G+H garbage garbage 769 pshufd XMM2, XMM2, 0b00001000; // I+J+K+L M+N+O+P garbage garbage 770 771 punpcklqdq XMM0, XMM2; // A+B+C+D E+F+G+H I+J+K+L M+N+O+P 772 paddd XMM0, XMM5; // add 2 773 psrld XMM0, 2; // >> 2 774 775 // because packusdw is not available before SSE4.1 776 // Extend sign bit to the right 777 pslld XMM0, 16; 778 psrad XMM0, 16; 779 add EDI, 8; 780 packssdw XMM0, XMM4; 781 782 movq [EDI-8], XMM0; 783 sub ECX, 1; 784 jnz loop_ecx; 785 no_need: ; 786 } 787 788 // Eventually filter the 0 to 3 pixels 789 int remaining = width & ~3; 
790 for (int x = remaining; x < width; ++x) 791 { 792 L16 A = L0[2 * x]; 793 L16 B = L0[2 * x + 1]; 794 L16 C = L1[2 * x]; 795 L16 D = L1[2 * x + 1]; 796 dest[x] = L16.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D); 797 } 798 } 799 else version(D_InlineAsm_X86_64) 800 { 801 asm pure nothrow @nogc 802 { 803 mov ECX, width; 804 shr ECX, 2; 805 jz no_need; // ECX = 0 => less than 4 pixels to process 806 807 mov RAX, L0; 808 mov RDX, L1; 809 mov RDI, dest; 810 movdqa XMM5, xmmTwoInt; 811 pxor XMM4, XMM4; 812 813 loop_ecx: 814 movdqu XMM0, [RAX]; // A B E F I J M N 815 movdqu XMM1, [RDX]; // C D G H K L O P 816 817 add RAX, 16; 818 add RDX, 16; 819 820 movdqa XMM2, XMM0; 821 movdqa XMM3, XMM1; 822 823 punpcklwd XMM0, XMM4; // A B E F in int32 824 punpckhwd XMM2, XMM4; // I J M N in int32 825 punpcklwd XMM1, XMM4; // C D G H in int32 826 punpckhwd XMM3, XMM4; // K L O P in int32 827 828 paddd XMM0, XMM1; // A+C B+D E+G F+H 829 paddd XMM2, XMM3; // I+K J+L M+O N+P 830 831 movdqa XMM1, XMM0; 832 movdqa XMM3, XMM2; 833 834 psrldq XMM1, 4; // B+D E+G F+H 0 835 psrldq XMM3, 4; // J+L M+O N+P 0 836 837 paddd XMM0, XMM1; // A+B+C+D garbage E+F+G+H garbage 838 paddd XMM2, XMM3; // I+J+K+L garbage M+N+O+P garbage 839 840 pshufd XMM0, XMM0, 0b00001000; // A+B+C+D E+F+G+H garbage garbage 841 pshufd XMM2, XMM2, 0b00001000; // I+J+K+L M+N+O+P garbage garbage 842 843 punpcklqdq XMM0, XMM2; // A+B+C+D E+F+G+H I+J+K+L M+N+O+P 844 paddd XMM0, XMM5; // add 2 845 psrld XMM0, 2; // >> 2 846 847 // because packusdw is not available before SSE4.1 848 // Extend sign bit to the right 849 pslld XMM0, 16; 850 psrad XMM0, 16; 851 add RDI, 8; 852 packssdw XMM0, XMM4; 853 854 movq [RDI-8], XMM0; 855 sub ECX, 1; 856 jnz loop_ecx; 857 no_need: ; 858 } 859 860 // Eventually filter the 0 to 3 pixels 861 int remaining = width & ~3; 862 for (int x = remaining; x < width; ++x) 863 { 864 L16 A = L0[2 * x]; 865 L16 B = L0[2 * x + 1]; 866 L16 C = L1[2 * x]; 867 L16 D = L1[2 * x + 1]; 868 dest[x] = L16.op!q{(a + b 
+ c + d + 2) >> 2}(A, B, C, D); 869 } 870 } 871 else 872 static assert(false); 873 } 874 else 875 { 876 for (int x = 0; x < width; ++x) 877 { 878 // A B 879 // C D 880 L16 A = L0[2 * x]; 881 L16 B = L0[2 * x + 1]; 882 L16 C = L1[2 * x]; 883 L16 D = L1[2 * x + 1]; 884 885 dest[x] = L16.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D); 886 } 887 } 888 889 L0 += (2 * previousPitch); 890 L1 += (2 * previousPitch); 891 dest += thisPitch; 892 } 893 } 894 895 896 void generateLevelBoxAlphaCovRGBA(OwnedImage!RGBA thisLevel, 897 OwnedImage!RGBA previousLevel, 898 box2i updateRect) nothrow @nogc 899 { 900 int width = updateRect.width(); 901 int height = updateRect.height(); 902 903 int previousPitch = previousLevel.w; 904 int thisPitch = thisLevel.w; 905 906 RGBA* L0 = previousLevel.scanline(updateRect.min.y * 2).ptr + updateRect.min.x * 2; 907 RGBA* L1 = L0 + previousPitch; 908 909 RGBA* dest = thisLevel.scanline(updateRect.min.y).ptr + updateRect.min.x; 910 911 for (int y = 0; y < height; ++y) 912 { 913 version(inlineAsmCanLoadGlobalsInPIC) 914 { 915 version(D_InlineAsm_X86) 916 { 917 assert(width > 0); 918 asm nothrow @nogc 919 { 920 mov ECX, width; 921 922 mov EAX, L0; 923 mov EDX, L1; 924 mov EDI, dest; 925 926 loop_ecx: 927 928 movq XMM0, [EAX]; // Ar Ag Ab Aa Br Bg Bb Ba + zeroes 929 movq XMM1, [EDX]; // Cr Cg Cb Ca Dr Dg Db Da + zeroes 930 pxor XMM4, XMM4; 931 add EAX, 8; 932 add EDX, 8; 933 934 punpcklbw XMM0, XMM4; // Ar Ag Ab Aa Br Bg Bb Ba 935 punpcklbw XMM1, XMM4; // Cr Cg Cb Ca Dr Dg Db Da 936 937 movdqa XMM2, XMM0; 938 punpcklwd XMM0, XMM1; // Ar Cr Ag Cg Ab Cb Aa Ca 939 punpckhwd XMM2, XMM1; // Br Dr Bg Dg Bb Db Ba Da 940 941 // perhaps unnecessary 942 movdqa XMM3, XMM0; 943 punpcklwd XMM0, XMM2; // Ar Br Cr Dr Ag Bg Cg Dg 944 punpckhwd XMM3, XMM2; // Ab Bb Cb Db Aa Ba Ca Da 945 946 movdqa XMM1, XMM3; 947 punpckhqdq XMM1, XMM1; // Aa Ba Ca Da Aa Ba Ca Da 948 949 // Are alpha all zeroes? if so, early continue. 
950 movdqa XMM2, XMM1; 951 pcmpeqb XMM2, XMM4; 952 add EDI, 4; 953 pmovmskb ESI, XMM2; 954 cmp ESI, 0xffff; 955 jnz non_null; 956 957 pxor XMM0, XMM0; 958 sub ECX, 1; 959 movd [EDI-4], XMM0; // dest[x] = A 960 jnz loop_ecx; 961 jmp end_of_loop; 962 963 non_null: 964 965 pmaddwd XMM0, XMM1; // Ar*Aa+Br*Ba Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da 966 pmaddwd XMM3, XMM1; // Ab*Aa+Bb*Ba Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da 967 968 // Starting computing sum of coefficients too 969 punpcklwd XMM1, XMM4; // Aa Ba Ca Da 970 971 movdqa XMM2, XMM0; 972 movdqa XMM5, XMM3; 973 movdqa XMM4, XMM1; 974 psrldq XMM4, 8; 975 976 psrldq XMM2, 4; // Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da 0 977 psrldq XMM5, 4; // Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da 0 978 paddq XMM1, XMM4; // Aa+Ca Ba+Da garbage garbage 979 movdqa XMM4, XMM1; 980 981 paddd XMM0, XMM2; // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da garbage Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage 982 paddd XMM3, XMM5; // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da garbage Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage 983 psrldq XMM4, 4; 984 985 pshufd XMM0, XMM0, 0b00001000; // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage garbage 986 paddq XMM1, XMM4; // Aa+Ba+Ca+Da garbage garbage garbage 987 pshufd XMM3, XMM3, 0b00001000; // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage garbage 988 989 punpcklqdq XMM0, XMM3; // fR fG fB fA 990 pshufd XMM1, XMM1, 0; 991 992 cvtdq2ps XMM0, XMM0; 993 994 cvtdq2ps XMM3, XMM1; // sum sum sum sum 995 996 divps XMM0, XMM3; // fR/sum fG/sum fB/sum fA/sum 997 addps XMM0, xmm0_5; 998 cvttps2dq XMM0, XMM0; // return into integer domain using cast(int)(x + 0.5f) 999 1000 paddd XMM1, xmmTwoInt; 1001 psrld XMM1, 2; // finalAlpha finalAlpha finalAlpha finalAlpha 1002 1003 pslldq XMM0, 4; // 0 fR/sum fG/sum fB/sum 1004 pslldq XMM1, 12; // 0 0 0 finalAlpha 1005 psrldq XMM0, 4; // fR/sum fG/sum fB/sum 0 1006 1007 por XMM0, XMM1; // fR/sum fG/sum fB/sum finalAlpha 1008 pxor XMM3, XMM3; 1009 packssdw XMM0, XMM3; // same in words 1010 packuswb XMM0, XMM3; // same in bytes 
1011 1012 sub ECX, 1; 1013 movd [EDI-4], XMM0; // dest[x] = A 1014 jnz loop_ecx; 1015 end_of_loop: ; 1016 } 1017 } 1018 else version(D_InlineAsm_X86_64) 1019 { 1020 assert(width > 0); 1021 asm nothrow @nogc 1022 { 1023 mov ECX, width; 1024 1025 mov RAX, L0; 1026 mov RDX, L1; 1027 mov RDI, dest; 1028 1029 loop_ecx: 1030 1031 movq XMM0, [RAX]; // Ar Ag Ab Aa Br Bg Bb Ba + zeroes 1032 movq XMM1, [RDX]; // Cr Cg Cb Ca Dr Dg Db Da + zeroes 1033 pxor XMM4, XMM4; 1034 add RAX, 8; 1035 add RDX, 8; 1036 1037 punpcklbw XMM0, XMM4; // Ar Ag Ab Aa Br Bg Bb Ba 1038 punpcklbw XMM1, XMM4; // Cr Cg Cb Ca Dr Dg Db Da 1039 1040 movdqa XMM2, XMM0; 1041 punpcklwd XMM0, XMM1; // Ar Cr Ag Cg Ab Cb Aa Ca 1042 punpckhwd XMM2, XMM1; // Br Dr Bg Dg Bb Db Ba Da 1043 1044 // perhaps unnecessary 1045 movdqa XMM3, XMM0; 1046 punpcklwd XMM0, XMM2; // Ar Br Cr Dr Ag Bg Cg Dg 1047 punpckhwd XMM3, XMM2; // Ab Bb Cb Db Aa Ba Ca Da 1048 1049 movdqa XMM1, XMM3; 1050 punpckhqdq XMM1, XMM1; // Aa Ba Ca Da Aa Ba Ca Da 1051 1052 // Are alpha all zeroes? if so, early continue. 
1053 movdqa XMM2, XMM1; 1054 pcmpeqb XMM2, XMM4; 1055 add RDI, 4; 1056 pmovmskb ESI, XMM2; 1057 cmp ESI, 0xffff; 1058 jnz non_null; 1059 1060 pxor XMM0, XMM0; 1061 sub ECX, 1; 1062 movd [RDI-4], XMM0; // dest[x] = A 1063 jnz loop_ecx; 1064 jmp end_of_loop; 1065 1066 non_null: 1067 1068 pmaddwd XMM0, XMM1; // Ar*Aa+Br*Ba Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da 1069 pmaddwd XMM3, XMM1; // Ab*Aa+Bb*Ba Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da 1070 1071 // Starting computing sum of coefficients too 1072 punpcklwd XMM1, XMM4; // Aa Ba Ca Da 1073 1074 movdqa XMM2, XMM0; 1075 movdqa XMM5, XMM3; 1076 movdqa XMM4, XMM1; 1077 psrldq XMM4, 8; 1078 1079 psrldq XMM2, 4; // Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da 0 1080 psrldq XMM5, 4; // Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da 0 1081 paddq XMM1, XMM4; // Aa+Ca Ba+Da garbage garbage 1082 movdqa XMM4, XMM1; 1083 1084 paddd XMM0, XMM2; // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da garbage Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage 1085 paddd XMM3, XMM5; // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da garbage Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage 1086 psrldq XMM4, 4; 1087 1088 pshufd XMM0, XMM0, 0b00001000; // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage garbage 1089 paddq XMM1, XMM4; // Aa+Ba+Ca+Da garbage garbage garbage 1090 pshufd XMM3, XMM3, 0b00001000; // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage garbage 1091 1092 punpcklqdq XMM0, XMM3; // fR fG fB fA 1093 pshufd XMM1, XMM1, 0; 1094 1095 cvtdq2ps XMM0, XMM0; 1096 1097 cvtdq2ps XMM3, XMM1; // sum sum sum sum 1098 1099 divps XMM0, XMM3; // fR/sum fG/sum fB/sum fA/sum 1100 addps XMM0, xmm0_5; 1101 cvttps2dq XMM0, XMM0; // return into integer domain using cast(int)(x + 0.5f) 1102 1103 paddd XMM1, xmmTwoInt; 1104 psrld XMM1, 2; // finalAlpha finalAlpha finalAlpha finalAlpha 1105 1106 pslldq XMM0, 4; // 0 fR/sum fG/sum fB/sum 1107 pslldq XMM1, 12; // 0 0 0 finalAlpha 1108 psrldq XMM0, 4; // fR/sum fG/sum fB/sum 0 1109 1110 por XMM0, XMM1; // fR/sum fG/sum fB/sum finalAlpha 1111 pxor XMM3, XMM3; 1112 packssdw XMM0, XMM3; // same in 
words 1113 packuswb XMM0, XMM3; // same in bytes 1114 1115 sub ECX, 1; 1116 movd [RDI-4], XMM0; // dest[x] = A 1117 jnz loop_ecx; 1118 end_of_loop: ; 1119 } 1120 } 1121 else 1122 static assert(false); 1123 } 1124 else 1125 { 1126 for (int x = 0; x < width; ++x) 1127 { 1128 // A B 1129 // C D 1130 RGBA A = L0[2 * x]; 1131 RGBA B = L0[2 * x + 1]; 1132 RGBA C = L1[2 * x]; 1133 RGBA D = L1[2 * x + 1]; 1134 1135 int alphaA = A.a; 1136 int alphaB = B.a; 1137 int alphaC = C.a; 1138 int alphaD = D.a; 1139 int sum = alphaA + alphaB + alphaC + alphaD; 1140 if (sum == 0) 1141 { 1142 dest[x] = RGBA(0,0,0,0); 1143 } 1144 else 1145 { 1146 int destAlpha = cast(ubyte)( (alphaA + alphaB + alphaC + alphaD + 2) >> 2 ); 1147 int red = (A.r * alphaA + B.r * alphaB + C.r * alphaC + D.r * alphaD); 1148 int green = (A.g * alphaA + B.g * alphaB + C.g * alphaC + D.g * alphaD); 1149 int blue = (A.b * alphaA + B.b* alphaB + C.b * alphaC + D.b * alphaD); 1150 float invSum = 1 / cast(float)(sum); 1151 1152 RGBA finalColor = RGBA( cast(ubyte)(0.5f + red * invSum), 1153 cast(ubyte)(0.5f + green * invSum), 1154 cast(ubyte)(0.5f + blue * invSum), 1155 cast(ubyte)destAlpha ); 1156 dest[x] = finalColor; 1157 } 1158 } 1159 } 1160 1161 enum verify = false; 1162 1163 static if (verify) 1164 { 1165 for (int x = 0; x < width; ++x) 1166 { 1167 // A B 1168 // C D 1169 RGBA A = L0[2 * x]; 1170 RGBA B = L0[2 * x + 1]; 1171 RGBA C = L1[2 * x]; 1172 RGBA D = L1[2 * x + 1]; 1173 1174 int alphaA = A.a; 1175 int alphaB = B.a; 1176 int alphaC = C.a; 1177 int alphaD = D.a; 1178 int sum = alphaA + alphaB + alphaC + alphaD; 1179 if (sum == 0) 1180 { 1181 assert(dest[x] == RGBA(0,0,0,0)); 1182 } 1183 else 1184 { 1185 int destAlpha = cast(ubyte)( (alphaA + alphaB + alphaC + alphaD + 2) >> 2 ); 1186 int red = (A.r * alphaA + B.r * alphaB + C.r * alphaC + D.r * alphaD); 1187 int green = (A.g * alphaA + B.g * alphaB + C.g * alphaC + D.g * alphaD); 1188 int blue = (A.b * alphaA + B.b* alphaB + C.b * alphaC + D.b * alphaD); 
// NOTE(review): the statements down to the first column-0 closing brace are the unchanged tail
// of the function that begins above this chunk (end of its `static if (verify)` branch).

                    float invSum = 1 / cast(float)(sum);

                    RGBA finalColor = RGBA( cast(ubyte)(0.5f + red * invSum),
                                            cast(ubyte)(0.5f + green * invSum),
                                            cast(ubyte)(0.5f + blue * invSum),
                                            cast(ubyte)destAlpha );
                    RGBA instead = dest[x];

                    int insteadR = instead.r;
                    int insteadG = instead.g;
                    int insteadB = instead.b;
                    int insteadA = instead.a;
                    int finalColorR = finalColor.r;
                    int finalColorG = finalColor.g;
                    int finalColorB = finalColor.b;
                    int finalColorA = finalColor.a;
                    import std.math;
                    assert(abs(insteadR - finalColorR) <= 1); // some remaining differences because of rounding
                    assert(abs(insteadG - finalColorG) <= 1);
                    assert(abs(insteadB - finalColorB) <= 1);
                    assert(insteadA == finalColorA);
                }
            }
        }

        L0 += (2 * previousPitch);
        L1 += (2 * previousPitch);
        dest += thisPitch;
    }
}

/// Fills `updateRect` of `thisLevel` from `previousLevel` with a 2x2 alpha-weighted box filter.
///
/// For every destination pixel the 2x2 source block
///     A B
///     C D
/// is read, each channel (alpha included) is multiplied by the alpha of its own pixel,
/// the four products are summed, and the result is rounded with `(sum + 512) >> 10`.
/// The output is therefore alpha-weighted (premultiplied-like) rather than straight alpha.
/// Because the divisor is 1024 and not 4 * 255, four fully opaque white pixels give 254, not 255.
///
/// Params:
///     thisLevel     = destination image, written inside `updateRect` only.
///     previousLevel = source image, read at coordinates `2 * x`, `2 * x + 1`
///                     (and likewise for rows). Its width is used as the row pitch.
///     updateRect    = area to regenerate, in `thisLevel` coordinates.
///                     An empty rectangle is a no-op.
void generateLevelBoxAlphaCovIntoPremulRGBA(OwnedImage!RGBA thisLevel,
                                            OwnedImage!RGBA previousLevel,
                                            box2i updateRect) nothrow @nogc
{
    int width = updateRect.width();
    int height = updateRect.height();

    // The assembly loops below test their counter at the bottom (do-while):
    // entering them with width == 0 would wrap ECX around and overrun both buffers.
    // Bail out early so that every code path treats an empty rectangle as "nothing to do".
    if (width <= 0 || height <= 0)
        return;

    // Rows are assumed contiguous: the pitch in pixels is the image width.
    int previousPitch = previousLevel.w;
    int thisPitch = thisLevel.w;

    // L0 / L1: the two source rows feeding the current destination row.
    RGBA* L0 = previousLevel.scanline(updateRect.min.y * 2).ptr + updateRect.min.x * 2;
    RGBA* L1 = L0 + previousPitch;

    RGBA* dest = thisLevel.scanline(updateRect.min.y).ptr + updateRect.min.x;

    for (int y = 0; y < height; ++y)
    {
        version(inlineAsmCanLoadGlobalsInPIC)
        {
            version(D_InlineAsm_X86)
            {
                asm nothrow @nogc
                {
                    mov ECX, width;                    // loop counter, guaranteed > 0 here

                    mov EAX, L0;
                    mov EDX, L1;
                    mov EDI, dest;

                    movdqa XMM5, xmm512;               // 512 512 512 512
                    pxor XMM4, XMM4;                   // all zeroes

                loop_ecx:

                    movq XMM0, [EAX];                  // Ar Ag Ab Aa Br Bg Bb Ba + zeroes
                    movq XMM1, [EDX];                  // Cr Cg Cb Ca Dr Dg Db Da + zeroes
                    pxor XMM4, XMM4;
                    add EAX, 8;
                    add EDX, 8;

                    punpcklbw XMM0, XMM4;              // Ar Ag Ab Aa Br Bg Bb Ba
                    punpcklbw XMM1, XMM4;              // Cr Cg Cb Ca Dr Dg Db Da

                    movdqa XMM2, XMM0;
                    punpcklwd XMM0, XMM1;              // Ar Cr Ag Cg Ab Cb Aa Ca
                    punpckhwd XMM2, XMM1;              // Br Dr Bg Dg Bb Db Ba Da

                    movdqa XMM3, XMM0;
                    punpcklwd XMM0, XMM2;              // Ar Br Cr Dr Ag Bg Cg Dg
                    punpckhwd XMM3, XMM2;              // Ab Bb Cb Db Aa Ba Ca Da

                    movdqa XMM1, XMM3;
                    punpckhqdq XMM1, XMM1;             // Aa Ba Ca Da Aa Ba Ca Da

                    add EDI, 4;

                    pmaddwd XMM0, XMM1;                // Ar*Aa+Br*Ba Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da
                    pmaddwd XMM3, XMM1;                // Ab*Aa+Bb*Ba Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da

                    movdqa XMM2, XMM0;
                    movdqa XMM1, XMM3;

                    psrldq XMM2, 4;                    // Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da 0
                    psrldq XMM1, 4;                    // Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da 0

                    paddd XMM0, XMM2;                  // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da garbage Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage
                    paddd XMM3, XMM1;                  // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da garbage Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage

                    pshufd XMM0, XMM0, 0b00001000;     // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage garbage
                    pshufd XMM3, XMM3, 0b00001000;     // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage garbage

                    punpcklqdq XMM0, XMM3;             // fR fG fB fA


                    paddd XMM0, XMM5;                  // rounding: + 512
                    psrld XMM0, 10;                    // final color in dwords

                    packssdw XMM0, XMM4;               // same in words
                    packuswb XMM0, XMM4;               // same in bytes

                    sub ECX, 1;
                    movd [EDI-4], XMM0;                // dest[x] = final color (movd leaves the flags of `sub` intact)
                    jnz loop_ecx;
                }
            }
            else version(D_InlineAsm_X86_64)
            {
                asm nothrow @nogc
                {
                    mov ECX, width;                    // loop counter, guaranteed > 0 here

                    mov RAX, L0;
                    mov RDX, L1;
                    mov RDI, dest;

                    movdqa XMM5, xmm512;               // 512 512 512 512
                    pxor XMM4, XMM4;                   // all zeroes

                loop_ecx:

                    movq XMM0, [RAX];                  // Ar Ag Ab Aa Br Bg Bb Ba + zeroes
                    movq XMM1, [RDX];                  // Cr Cg Cb Ca Dr Dg Db Da + zeroes
                    pxor XMM4, XMM4;
                    add RAX, 8;
                    add RDX, 8;

                    punpcklbw XMM0, XMM4;              // Ar Ag Ab Aa Br Bg Bb Ba
                    punpcklbw XMM1, XMM4;              // Cr Cg Cb Ca Dr Dg Db Da

                    movdqa XMM2, XMM0;
                    punpcklwd XMM0, XMM1;              // Ar Cr Ag Cg Ab Cb Aa Ca
                    punpckhwd XMM2, XMM1;              // Br Dr Bg Dg Bb Db Ba Da

                    movdqa XMM3, XMM0;
                    punpcklwd XMM0, XMM2;              // Ar Br Cr Dr Ag Bg Cg Dg
                    punpckhwd XMM3, XMM2;              // Ab Bb Cb Db Aa Ba Ca Da

                    movdqa XMM1, XMM3;
                    punpckhqdq XMM1, XMM1;             // Aa Ba Ca Da Aa Ba Ca Da

                    add RDI, 4;

                    pmaddwd XMM0, XMM1;                // Ar*Aa+Br*Ba Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da
                    pmaddwd XMM3, XMM1;                // Ab*Aa+Bb*Ba Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da

                    movdqa XMM2, XMM0;
                    movdqa XMM1, XMM3;

                    psrldq XMM2, 4;                    // Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da 0
                    psrldq XMM1, 4;                    // Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da 0

                    paddd XMM0, XMM2;                  // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da garbage Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage
                    paddd XMM3, XMM1;                  // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da garbage Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage

                    pshufd XMM0, XMM0, 0b00001000;     // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage garbage
                    pshufd XMM3, XMM3, 0b00001000;     // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage garbage

                    punpcklqdq XMM0, XMM3;             // fR fG fB fA


                    paddd XMM0, XMM5;                  // rounding: + 512
                    psrld XMM0, 10;                    // final color in dwords

                    packssdw XMM0, XMM4;               // same in words
                    packuswb XMM0, XMM4;               // same in bytes

                    sub ECX, 1;
                    movd [RDI-4], XMM0;                // dest[x] = final color (movd leaves the flags of `sub` intact)
                    jnz loop_ecx;
                }
            }
            else
                static assert(false);
        }
        else
        {
            // Portable reference implementation of the same fixed-point computation.
            for (int x = 0; x < width; ++x)
            {
                RGBA A = L0[2 * x];
                RGBA B = L0[2 * x + 1];
                RGBA C = L1[2 * x];
                RGBA D = L1[2 * x + 1];
                int red =   (A.r * A.a + B.r * B.a + C.r * C.a + D.r * D.a);
                int green = (A.g * A.a + B.g * B.a + C.g * C.a + D.g * D.a);
                int blue =  (A.b * A.a + B.b* B.a + C.b * C.a + D.b * D.a);
                int alpha =  (A.a * A.a + B.a* B.a + C.a * C.a + D.a * D.a);
                RGBA finalColor = RGBA( cast(ubyte)((red + 512) >> 10),
                                        cast(ubyte)((green + 512) >> 10),
                                        cast(ubyte)((blue + 512) >> 10),
                                        cast(ubyte)((alpha + 512) >> 10));
                dest[x] = finalColor;
            }
        }

        // Debug aid: set to true to re-compute the row in scalar code and compare with what was written.
        enum bool verify = false;

        static if (verify)
        {
            for (int x = 0; x < width; ++x)
            {
                RGBA A = L0[2 * x];
                RGBA B = L0[2 * x + 1];
                RGBA C = L1[2 * x];
                RGBA D = L1[2 * x + 1];
                int red =   (A.r * A.a + B.r * B.a + C.r * C.a + D.r * D.a);
                int green = (A.g * A.a + B.g * B.a + C.g * C.a + D.g * D.a);
                int blue =  (A.b * A.a + B.b* B.a + C.b * C.a + D.b * D.a);
                int alpha =  (A.a * A.a + B.a* B.a + C.a * C.a + D.a * D.a);
                RGBA finalColor = RGBA( cast(ubyte)((red + 512) >> 10),
                                        cast(ubyte)((green + 512) >> 10),
                                        cast(ubyte)((blue + 512) >> 10),
                                        cast(ubyte)((alpha + 512) >> 10));
                assert(dest[x] == finalColor);
            }
        }

        // Advance two source rows for each destination row.
        L0 += (2 * previousPitch);
        L1 += (2 * previousPitch);
        dest += thisPitch;
    }
}

/// Fills `updateRect` of `thisLevel` from `previousLevel` with a 4x4 kernel:
///
///     1 3 3 1
///     3 9 9 3
///     3 9 9 3
///     1 3 3 1
///
/// The weights sum to 64; each channel is rounded with `(sum + 32) >> 6`.
/// The outer taps (`2 * x - 1`, `2 * x + 2`, and the same for rows) are clamped to the
/// borders of `previousLevel`; the inner taps (`2 * x`, `2 * x + 1`) are read unclamped.
/// NOTE(review): this presumably relies on `previousLevel` being at least twice as large as
/// `thisLevel` in both directions - confirm against the caller.
///
/// Params:
///     thisLevel     = destination image, written inside `updateRect` only.
///     previousLevel = source image (next finer level).
///     updateRect    = area to regenerate, in `thisLevel` coordinates.
void generateLevelCubicRGBA(OwnedImage!RGBA thisLevel,
                            OwnedImage!RGBA previousLevel,
                            box2i updateRect) nothrow @nogc
{
    for (int y = updateRect.min.y; y < updateRect.max.y; ++y)
    {
        // Row above the 2x2 core, clamped to the top border.
        int y2m1 = 2 * y - 1;
        if (y2m1 < 0)
            y2m1 = 0;

        // Row below the 2x2 core, clamped to the bottom border.
        int y2p2 = 2 * y + 2;
        if (y2p2 > previousLevel.h - 1)
            y2p2 = previousLevel.h - 1;

        RGBA* LM1 = previousLevel.scanline(y2m1).ptr;
        RGBA* L0 = previousLevel.scanline(y * 2).ptr;
        RGBA* L1 = previousLevel.scanline(y * 2 + 1).ptr;
        RGBA* L2 = previousLevel.scanline(y2p2).ptr;
        RGBA* dest = thisLevel.scanline(y).ptr;

        for (int x = updateRect.min.x; x < updateRect.max.x; ++x)
        {
            // A B C D
            // E F G H
            // I J K L
            // M N O P

            // Column left of the core, clamped to the left border.
            int x2m1 = 2 * x - 1;
            if (x2m1 < 0)
                x2m1 = 0;
            int x2p0 = 2 * x;
            // Column right of the core, clamped to the right border.
            int x2p2 = 2 * x + 2;
            if (x2p2 > previousLevel.w - 1)
                x2p2 = previousLevel.w - 1;

            version(inlineAsmCanLoadGlobalsInPIC)
            {
                version(D_InlineAsm_X86)
                {
                    // Gather the 16 taps contiguously so the assembly can load one row per 16 bytes.
                    RGBA[16] buf = void;
                    buf[0] = LM1[x2m1];
                    buf[1] = LM1[x2p0];
                    buf[2] = LM1[x2p0+1];
                    buf[3] = LM1[x2p2];
                    buf[4] = L0[x2m1];
                    buf[5] = L0[x2p0];
                    buf[6] = L0[x2p0+1];
                    buf[7] = L0[x2p2];
                    buf[8] = L1[x2m1];
                    buf[9] = L1[x2p0];
                    buf[10] = L1[x2p0+1];
                    buf[11] = L1[x2p2];
                    buf[12] = L2[x2m1];
                    buf[13] = L2[x2p0];
                    buf[14] = L2[x2p0+1];
                    buf[15] = L2[x2p2];
                    RGBA* pDest = dest + x;

                    asm nothrow @nogc
                    {
                        movdqu XMM0, buf;  // A B C D
                        movdqu XMM1, buf;
                        pxor XMM2, XMM2;      // zeroes
                        punpcklbw XMM0, XMM2; // A B
                        punpckhbw XMM1, XMM2; // C D
                        pmullw XMM0, xmm11113333; // A*1 B*3 in shorts
                        movdqa XMM3, XMM0;
                        pmullw XMM1, xmm33331111; // C*3 D*1 in shorts
                        movdqa XMM5, XMM1;

                        movdqu XMM0, buf+16;  // E F G H
                        movdqu XMM1, buf+16;
                        punpcklbw XMM0, XMM2; // E F
                        punpckhbw XMM1, XMM2; // G H
                        pmullw XMM0, xmm33339999; // E*3 F*9 in shorts
                        paddw XMM3, XMM0;
                        pmullw XMM1, xmm99993333; // G*9 H*3 in shorts
                        paddw XMM5, XMM1;

                        movdqu XMM0, buf+32;  // I J K L
                        movdqu XMM1, buf+32;
                        punpcklbw XMM0, XMM2; // I J
                        punpckhbw XMM1, XMM2; // K L
                        pmullw XMM0, xmm33339999; // I*3 J*9 in shorts
                        paddw XMM3, XMM0;
                        pmullw XMM1, xmm99993333; // K*9 L*3 in shorts
                        paddw XMM5, XMM1;

                        movdqu XMM0, buf+48;  // M N O P
                        movdqu XMM1, buf+48;
                        punpcklbw XMM0, XMM2; // M N
                        punpckhbw XMM1, XMM2; // O P
                        pmullw XMM0, xmm11113333; // M*1 N*3 in shorts
                        paddw XMM3, XMM0; // A+E*3+I*3+M B*3+F*9+J*9+3*N
                        pmullw XMM1, xmm33331111; // O*3 P*1 in shorts
                        paddw XMM5, XMM1; // C*3+G*9+K*9+O*3 D+H*3+L*3+P

                        movdqa XMM0, XMM3;
                        movdqa XMM1, XMM5;
                        psrldq XMM0, 8;
                        psrldq XMM1, 8;
                        paddw XMM3, XMM0; // A+E*3+I*3+M+B*3+F*9+J*9+3*N garbage(x4)
                        paddw XMM5, XMM1; // C*3+G*9+K*9+O*3+D+H*3+L*3+P garbage(x4)
                        paddw XMM3, XMM5; // total-sum garbage(x4)

                        paddw XMM3, xmm32; // rounding: + 32
                        psrlw XMM3, 6;     // divide by 64
                        mov EAX, pDest;
                        packuswb XMM3, XMM2;

                        movd [EAX], XMM3;  // store the low 4 bytes: one RGBA pixel
                    }
                }
                else version(D_InlineAsm_X86_64)
                {
                    // Gather the 16 taps contiguously so the assembly can load one row per 16 bytes.
                    RGBA[16] buf = void;
                    buf[0] = LM1[x2m1];
                    buf[1] = LM1[x2p0];
                    buf[2] = LM1[x2p0+1];
                    buf[3] = LM1[x2p2];
                    buf[4] = L0[x2m1];
                    buf[5] = L0[x2p0];
                    buf[6] = L0[x2p0+1];
                    buf[7] = L0[x2p2];
                    buf[8] = L1[x2m1];
                    buf[9] = L1[x2p0];
                    buf[10] = L1[x2p0+1];
                    buf[11] = L1[x2p2];
                    buf[12] = L2[x2m1];
                    buf[13] = L2[x2p0];
                    buf[14] = L2[x2p0+1];
                    buf[15] = L2[x2p2];
                    RGBA* pDest = dest + x;

                    asm nothrow @nogc
                    {
                        movdqu XMM0, buf;  // A B C D
                        movdqu XMM1, buf;
                        pxor XMM2, XMM2;      // zeroes
                        punpcklbw XMM0, XMM2; // A B
                        punpckhbw XMM1, XMM2; // C D
                        pmullw XMM0, xmm11113333; // A*1 B*3 in shorts
                        movdqa XMM3, XMM0;
                        pmullw XMM1, xmm33331111; // C*3 D*1 in shorts
                        movdqa XMM5, XMM1;

                        movdqu XMM0, buf+16;  // E F G H
                        movdqu XMM1, buf+16;
                        punpcklbw XMM0, XMM2; // E F
                        punpckhbw XMM1, XMM2; // G H
                        pmullw XMM0, xmm33339999; // E*3 F*9 in shorts
                        paddw XMM3, XMM0;
                        pmullw XMM1, xmm99993333; // G*9 H*3 in shorts
                        paddw XMM5, XMM1;

                        movdqu XMM0, buf+32;  // I J K L
                        movdqu XMM1, buf+32;
                        punpcklbw XMM0, XMM2; // I J
                        punpckhbw XMM1, XMM2; // K L
                        pmullw XMM0, xmm33339999; // I*3 J*9 in shorts
                        paddw XMM3, XMM0;
                        pmullw XMM1, xmm99993333; // K*9 L*3 in shorts
                        paddw XMM5, XMM1;

                        movdqu XMM0, buf+48;  // M N O P
                        movdqu XMM1, buf+48;
                        punpcklbw XMM0, XMM2; // M N
                        punpckhbw XMM1, XMM2; // O P
                        pmullw XMM0, xmm11113333; // M*1 N*3 in shorts
                        paddw XMM3, XMM0; // A+E*3+I*3+M B*3+F*9+J*9+3*N
                        pmullw XMM1, xmm33331111; // O*3 P*1 in shorts
                        paddw XMM5, XMM1; // C*3+G*9+K*9+O*3 D+H*3+L*3+P

                        movdqa XMM0, XMM3;
                        movdqa XMM1, XMM5;
                        psrldq XMM0, 8;
                        psrldq XMM1, 8;
                        paddw XMM3, XMM0; // A+E*3+I*3+M+B*3+F*9+J*9+3*N garbage(x4)
                        paddw XMM5, XMM1; // C*3+G*9+K*9+O*3+D+H*3+L*3+P garbage(x4)
                        paddw XMM3, XMM5; // total-sum garbage(x4)

                        paddw XMM3, xmm32; // rounding: + 32
                        psrlw XMM3, 6;     // divide by 64
                        mov RAX, pDest;
                        packuswb XMM3, XMM2;

                        movd [RAX], XMM3;  // store the low 4 bytes: one RGBA pixel
                    }
                }
                else
                    static assert(false);
            }
            else
            {
                auto A = LM1[x2m1];
                auto B = LM1[x2p0];
                auto C = LM1[x2p0+1];
                auto D = LM1[x2p2];

                auto E = L0[x2m1];
                auto F = L0[x2p0];
                auto G = L0[x2p0+1];
                auto H = L0[x2p2];

                auto I = L1[x2m1];
                auto J = L1[x2p0];
                auto K = L1[x2p0+1];
                auto L = L1[x2p2];

                auto M = L2[x2m1];
                auto N = L2[x2p0];
                auto O = L2[x2p0+1];
                auto P = L2[x2p2];

                // Apply filter
                // 1 3 3 1
                // 3 9 9 3
                // 3 9 9 3
                // 1 3 3 1

                int rSum = (A.r + D.r + M.r + P.r) + 3 * (B.r + C.r + E.r + H.r + I.r + L.r + N.r + O.r) + 9 * (F.r + G.r + J.r + K.r);
                int gSum = (A.g + D.g + M.g + P.g) + 3 * (B.g + C.g + E.g + H.g + I.g + L.g + N.g + O.g) + 9 * (F.g + G.g + J.g + K.g);
                int bSum = (A.b + D.b + M.b + P.b) + 3 * (B.b + C.b + E.b + H.b + I.b + L.b + N.b + O.b) + 9 * (F.b + G.b + J.b + K.b);
                int aSum = (A.a + D.a + M.a + P.a) + 3 * (B.a + C.a + E.a + H.a + I.a + L.a + N.a + O.a) + 9 * (F.a + G.a + J.a + K.a);
                // Weights total 64: round to nearest, then divide.
                dest[x].r = cast(ubyte)((rSum + 32) >> 6);
                dest[x].g = cast(ubyte)((gSum + 32) >> 6);
                dest[x].b = cast(ubyte)((bSum + 32) >> 6);
                dest[x].a = cast(ubyte)((aSum + 32) >> 6);
            }
        }
    }
}

/// Single-channel 16-bit counterpart of `generateLevelCubicRGBA`: fills `updateRect` of
/// `thisLevel` from `previousLevel` with the same 4x4 kernel (weights 1/3/9, total 64),
/// rounded with `(sum + 32) >> 6`.
/// The outer taps are clamped to the borders of `previousLevel`; the inner taps are read unclamped.
///
/// Params:
///     thisLevel     = destination image, written inside `updateRect` only.
///     previousLevel = source image (next finer level).
///     updateRect    = area to regenerate, in `thisLevel` coordinates.
void generateLevelCubicL16(OwnedImage!L16 thisLevel,
                           OwnedImage!L16 previousLevel,
                           box2i updateRect) nothrow @nogc
{
    for (int y = updateRect.min.y; y < updateRect.max.y; ++y)
    {
        // Row above the 2x2 core, clamped to the top border.
        int y2m1 = 2 * y - 1;
        if (y2m1 < 0)
            y2m1 = 0;

        // Row below the 2x2 core, clamped to the bottom border.
        int y2p2 = 2 * y + 2;
        if (y2p2 > previousLevel.h - 1)
            y2p2 = previousLevel.h - 1;

        L16* LM1 = previousLevel.scanline(y2m1).ptr;
        L16* L0 = previousLevel.scanline(y * 2).ptr;
        L16* L1 = previousLevel.scanline(y * 2 + 1).ptr;
        L16* L2 = previousLevel.scanline(y2p2).ptr;
        L16* dest = thisLevel.scanline(y).ptr;

        for (int x = updateRect.min.x; x < updateRect.max.x; ++x)
        {
            // A B C D
            // E F G H
            // I J K L
            // M N O P

            // Column left of the core, clamped to the left border.
            int x2m1 = 2 * x - 1;
            if (x2m1 < 0)
                x2m1 = 0;
            int x2p0 = 2 * x;
            // Column right of the core, clamped to the right border.
            int x2p2 = 2 * x + 2;
            if (x2p2 > previousLevel.w - 1)
                x2p2 = previousLevel.w - 1;

            ushort A = LM1[x2m1].l;
            ushort B = LM1[x2p0].l;
            ushort C = LM1[x2p0+1].l;
            ushort D = LM1[x2p2].l;

            ushort E = L0[x2m1].l;
            ushort F = L0[x2p0].l;
            ushort G = L0[x2p0+1].l;
            ushort H = L0[x2p2].l;

            ushort I = L1[x2m1].l;
            ushort J = L1[x2p0].l;
            ushort K = L1[x2p0+1].l;
            ushort L = L1[x2p2].l;

            ushort M = L2[x2m1].l;
            ushort N = L2[x2p0].l;
            ushort O = L2[x2p0+1].l;
            ushort P = L2[x2p2].l;

            // Apply filter
            // 1 3 3 1    A B C D
            // 3 9 9 3    E F G H
            // 3 9 9 3    I J K L
            // 1 3 3 1    M N O P

            // Accumulated in int: 64 * 65535 does not fit in 16 bits.
            int depthSum = (A + D + M + P)
                         + 3 * (B + C + E + H + I + L + N + O)
                         + 9 * (F + G + J + K);
            dest[x].l = cast(ushort)((depthSum + 32) >> 6 );
        }
    }
}

// Compile-only check: declaring both references forces instantiation of the
// Mipmap template for RGBA and L16. Nothing is constructed or executed.
unittest
{
    Mipmap!RGBA rgbaMipmap;
    Mipmap!L16 l16Mipmap;
}