1 /**
2 Mipmap pyramid implementation.
3 
4 Copyright: Guillaume Piolat 2015-2016.
5 License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 */
7 module dplug.graphics.mipmap;
8 
9 import std.algorithm.comparison;
10 
11 import dplug.math.vector;
12 import dplug.math.box;
13 import dplug.graphics.image;
14 import dplug.core.nogc;
15 import dplug.core.vec;
16 
17 
18 import inteli.emmintrin;
19 
// `AsmX86` is set whenever D inline assembly is available, for either 32-bit or 64-bit x86.
version( D_InlineAsm_X86 )
{
    version = AsmX86;
}
else version( D_InlineAsm_X86_64 )
{
    version = AsmX86;
}

// Because DMD is unable to load globals in PIC code from inline assembly,
// the assembly paths that reference global constants are only enabled with LDC.
version(LDC)
{
    version( D_InlineAsm_X86 )
    {
        version = inlineAsmCanLoadGlobalsInPIC;
    }
    else version( D_InlineAsm_X86_64 )
    {
        version = inlineAsmCanLoadGlobalsInPIC;
    }
}
41 
42 
43 /// Mipmapped images.
44 /// Supports non power-of-two textures.
45 /// Size of the i+1-th mipmap is { (width)/2, (height)/2 }
46 /// The mipmap owns each of its levels.
47 final class Mipmap(COLOR) if (is(COLOR == RGBA) || is(COLOR == L16))
48 {
49 public:
50 nothrow:
51 @nogc:
52 
    /// Downsampling filter used to build a level from the previous one.
    enum Quality
    {
        box,                   // simple 2x2 filter, creates phase problems with NPOT. `generateMipmaps` automatically switches to cubic from level 3 onwards.
        cubic,                 // Very smooth kernel [1 2 1] x [1 2 1]
        boxAlphaCov,           // like box, but alpha is used as weight, only implemented for RGBA
        boxAlphaCovIntoPremul, // same as boxAlphaCov but after such a step the next level is alpha-premultiplied
    }
60 
61     Vec!(OwnedImage!COLOR) levels;
62 
63     /// Creates empty
64     this()
65     {
66         levels = makeVec!(OwnedImage!COLOR)();
67     }
68 
69     /// Set number of levels and size
70     /// maxLevel = 0 => only one image
71     /// maxLevel = 1 => one image + one 2x downsampled mipmap
72     /// etc...
73     this(int maxLevel, int w, int h)
74     {
75         this();
76         size(maxLevel, w, h);
77     }
78 
79 
80     /// Creates a Mipmap out of a flat OwnedImage.
81     /// This takes ownership of the given image, which is now owned by the `Mipmap`.
82     this(int maxLevel, OwnedImage!COLOR level0)
83     {
84         //PERF: could avoid to create the 0th level only to replace it later
85 
86         this(maxLevel, level0.w, level0.h);
87 
88         // replaces level 0
89         levels[0].destroyFree();
90         levels[0] = level0;
91     }
92 
93     void size(int maxLevel, int w, int h)
94     {
95         // find number of needed levels
96         int neededLevels = 0;
97         {
98             int wr = w;
99             int hr = h;
100             for (; neededLevels <= maxLevel; ++neededLevels)
101             {
102                 if (wr == 0 || hr == 0)
103                     break;
104                 wr  = (wr + 0) >> 1;
105                 hr  = (hr + 0) >> 1;
106             }
107         }
108 
109         void setLevels(int numLevels)
110         {
111             // FUTURE: cleanup excess levels
112             // should not happen until we have resizing
113             if (numLevels < levels.length)
114             {
115                 assert(false);
116             }
117 
118             int previousLength = cast(int)levels.length;
119 
120             levels.resize(numLevels);
121 
122             // create empty image for new levels
123             for(int level = previousLength; level < numLevels; ++level)
124             {
125                 levels[level] = mallocNew!(OwnedImage!COLOR)();
126             }
127         }
128 
129         setLevels(neededLevels);
130 
131         // resize levels
132         for (int level = 0; level < neededLevels; ++level)
133         {
134             assert(w != 0 && h != 0);
135             levels[level].size(w, h);
136             w  = (w + 0) >> 1;
137             h  = (h + 0) >> 1;
138         }
139     }
140 
141     ~this()
142     {
143         foreach(level; levels)
144             level.destroyFree();
145     }
146 
147     /// Interpolates a color between mipmap levels.  Floating-point level, spatial linear interpolation.
148     /// x and y are in base level coordinates (top-left pixel is on (0.5, 0.5) coordinates).
149     /// Clamped to borders.
150     auto linearMipmapSample(float level, float x, float y) nothrow @nogc
151     {
152         int ilevel = cast(int)level;
153         float flevel = level - ilevel;
154         vec4f levelN = linearSample(ilevel, x, y);
155         if (flevel == 0)
156             return levelN;
157 
158         auto levelNp1 = linearSample(ilevel + 1, x, y);
159 
160         return levelN * (1 - flevel) + levelNp1 * flevel;
161     }
162 
163 
164     /// Interpolates a color.  Integer level, spatial linear interpolation.
165     /// x and y are in base level coordinates (top-left pixel is on (0.5, 0.5) coordinates).
166     /// Clamped to borders.
167     auto linearSample(int level, float x, float y) nothrow @nogc
168     {
169         if (level < 0)
170             level = 0;
171         int numLevels = cast(int)levels.length;
172         if (level >= numLevels)
173             level = numLevels - 1;
174 
175         OwnedImage!COLOR image = levels[level];
176 
177 
178         static immutable float[14] factors = [ 1.0f, 0.5f, 0.25f, 0.125f,
179                                                0.0625f, 0.03125f, 0.015625f, 0.0078125f,
180                                                0.00390625f, 0.001953125f, 0.0009765625f, 0.00048828125f,
181                                                0.000244140625f, 0.0001220703125f];
182 
183         float divider = factors[level];
184         x = x * divider - 0.5f;
185         y = y * divider - 0.5f;
186 
187         if (x < 0)
188             x = 0;
189         if (y < 0)
190             y = 0;
191 
192         __m128 floatCoords = _mm_setr_ps(x, y, 0, 0);
193         __m128i truncatedCoord = _mm_cvttps_epi32(floatCoords);
194         int ix = truncatedCoord.array[0];
195         int iy = truncatedCoord.array[1];
196 
197         // Get fractional part
198         float fx = x - ix;
199         float fy = y - iy;
200 
201         const int maxX = image.w-1;
202         const int maxY = image.h-1;
203         if (ix > maxX)
204             ix = maxX;
205         if (iy > maxY)
206             iy = maxY;
207 
208         int ixp1 = ix + 1;
209         int iyp1 = iy + 1;
210         if (ixp1 > maxX)
211             ixp1 = maxX;
212         if (iyp1 > maxY)
213             iyp1 = maxY;  
214 
215         float fxm1 = 1 - fx;
216         float fym1 = 1 - fy;
217 
218         COLOR* L0 = image.scanlinePtr(iy);
219         COLOR* L1 = image.scanlinePtr(iyp1);
220 
221         COLOR A = L0[ix];
222         COLOR B = L0[ixp1];
223         COLOR C = L1[ix];
224         COLOR D = L1[ixp1];
225 
226         static if (is(COLOR == RGBA))
227         {
228             float inv255 = 1 / 255.0f;
229             version(LDC)
230             {
231                 int Ai = *cast(int*)(&A);
232                 int Bi = *cast(int*)(&B);
233                 int Ci = *cast(int*)(&C);
234                 int Di = *cast(int*)(&D);
235 
236                 __m128i mmZero = _mm_setzero_si128();
237                 __m128i mmABCD = _mm_setr_epi32(Ai, Bi, Ci, Di);
238 
239                 // Convert to float of the form (R, G, B, A)
240                 __m128i mmAB = _mm_unpacklo_epi8(mmABCD, mmZero);
241                 __m128i mmCD = _mm_unpackhi_epi8(mmABCD, mmZero);
242                 __m128 vA = _mm_cvtepi32_ps( _mm_unpacklo_epi16(mmAB, mmZero));
243                 __m128 vB = _mm_cvtepi32_ps( _mm_unpackhi_epi16(mmAB, mmZero));
244                 __m128 vC = _mm_cvtepi32_ps( _mm_unpacklo_epi16(mmCD, mmZero));
245                 __m128 vD = _mm_cvtepi32_ps( _mm_unpackhi_epi16(mmCD, mmZero));
246 
247                 __m128 vfx = _mm_set1_ps(fx);
248                 __m128 vfxm1 = _mm_set1_ps(fxm1);
249                 __m128 up = vA * vfxm1 + vB * vfx;
250                 __m128 down = vC * vfxm1 + vD * vfx;
251 
252                 __m128 vfy = _mm_set1_ps(fy);
253                 __m128 vfym1 = _mm_set1_ps(fym1);
254                 __m128 dResult = up * fym1 + down * fy;
255                 vec4f result = void;
256                 _mm_storeu_ps(result.ptr, dResult);
257                 return result;
258 
259             }
260             else version( AsmX86 )
261             {
262                 vec4f asmResult;
263 
264                 asm nothrow @nogc
265                 {
266                     movd XMM0, A;
267                     movd XMM1, B;
268                     movd XMM2, C;
269                     movd XMM3, D;
270                     pxor XMM4, XMM4;
271 
272                     punpcklbw XMM0, XMM4;
273                     punpcklbw XMM1, XMM4;
274                     punpcklbw XMM2, XMM4;
275                     punpcklbw XMM3, XMM4;
276 
277                     punpcklwd XMM0, XMM4;
278                     punpcklwd XMM1, XMM4;
279                     punpcklwd XMM2, XMM4;
280                     punpcklwd XMM3, XMM4;
281 
282                     cvtdq2ps XMM0, XMM0;
283                     cvtdq2ps XMM1, XMM1;
284 
285                     cvtdq2ps XMM2, XMM2;
286                     cvtdq2ps XMM3, XMM3;
287 
288                     movss XMM4, fxm1;
289                     pshufd XMM4, XMM4, 0;
290                     movss XMM5, fx;
291                     pshufd XMM5, XMM5, 0;
292 
293                     mulps XMM0, XMM4;
294                     mulps XMM1, XMM5;
295                     mulps XMM2, XMM4;
296                     mulps XMM3, XMM5;
297 
298                     movss XMM4, fym1;
299                     pshufd XMM4, XMM4, 0;
300                     movss XMM5, fy;
301                     pshufd XMM5, XMM5, 0;
302 
303                     addps XMM0, XMM1;
304                     addps XMM2, XMM3;
305 
306                     mulps XMM0, XMM4;
307                     mulps XMM2, XMM5;
308 
309                     addps XMM0, XMM2;
310 
311                     movups asmResult, XMM0;
312                 }
313 
314                 // Uncomment to check
315     /*
316                 vec4f vA = vec4f(A.r, A.g, A.b, A.a);
317                 vec4f vB = vec4f(B.r, B.g, B.b, B.a);
318                 vec4f vC = vec4f(C.r, C.g, C.b, C.a);
319                 vec4f vD = vec4f(D.r, D.g, D.b, D.a);
320 
321                 vec4f up = vA * fxm1 + vB * fx;
322                 vec4f down = vC * fxm1 + vD * fx;
323                 vec4f dResult = up * fym1 + down * fy;
324 
325                 import gfm.core;
326 
327                 if (dResult.distanceTo(result) < 1.0f)
328                     debugBreak();
329     */
330 
331                 vec4f result = asmResult;
332                 return result;
333             }
334             else
335             {
336                 vec4f vA = vec4f(A.r, A.g, A.b, A.a);
337                 vec4f vB = vec4f(B.r, B.g, B.b, B.a);
338                 vec4f vC = vec4f(C.r, C.g, C.b, C.a);
339                 vec4f vD = vec4f(D.r, D.g, D.b, D.a);
340 
341 
342 
343                 vec4f up = vA * fxm1 + vB * fx;
344                 vec4f down = vC * fxm1 + vD * fx;
345                 vec4f dResult = up * fym1 + down * fy;
346 
347               //  assert(dResult.distanceTo(asmResult) < 1.0f);
348 
349                 return dResult;
350             }
351         }
352         else
353         {
354             float up = A.l * fxm1 + B.l * fx;
355             float down = C.l * fxm1 + D.l * fx;
356             return up * fym1 + down * fy;
357         }
358     }
359 
    /// Returns: Width of the base level.
    /// Note: reads `levels[0]`, so the mipmap must have at least one level.
    int width() pure const nothrow @nogc
    {
        return levels[0].w;
    }

    /// Returns: Height of the base level.
    /// Note: reads `levels[0]`, so the mipmap must have at least one level.
    int height() pure const nothrow @nogc
    {
        return levels[0].h;
    }

    /// Returns: Number of levels. The maximum level is numLevels() - 1.
    int numLevels() pure const nothrow @nogc
    {
        return cast(int)levels.length;
    }
377 
378     /// Regenerates the whole upper levels.
379     void generateMipmaps(Quality quality) nothrow @nogc
380     {
381         box2i updateRect = box2i(0, 0, width(), height());
382         for (int level = 1; level < numLevels(); ++level)
383         {
384             // HACK: Force cubic filter past a level else it makes ugly looking mipmaps
385             if (level >= 3 && quality == Quality.box)
386                 quality = Quality.cubic;
387 
388             updateRect = generateNextLevel(quality, updateRect, level);
389         }
390     }
391 
392     /// Regenerates a single mipmap level based on changes in the provided rectangle (expressed in level 0 coordinates).
393     /// updateRect expressed in level 0 coordinates
394     /// In general if you have several subparts of mipmaps to update, make sure a level is fully completed
395     /// before computing the next one.
396     box2i generateNextLevel(Quality quality, box2i updateRectPreviousLevel, int level) nothrow @nogc
397     {
398         OwnedImage!COLOR previousLevel = levels[level - 1];
399         box2i updateRect = impactOnNextLevel(quality, updateRectPreviousLevel, previousLevel.w, previousLevel.h);
400         generateLevel(level, quality, updateRect);
401         return updateRect;
402     }
403 
    /// Regenerates one level from the level just below it (`level - 1`).
    /// updateRect expressed in level i-th coordinates
    /// Params:
    ///     level = index of the level to regenerate, must be > 0.
    ///     quality = filter to use. `boxAlphaCov` and `boxAlphaCovIntoPremul` are only implemented
    ///               for `RGBA`; for other pixel types they end in `assert(false)`.
    ///     updateRect = area of level `level` to regenerate.
    void generateLevel(int level, Quality quality, box2i updateRect) nothrow @nogc
    {
        assert(level > 0);
        OwnedImage!COLOR thisLevel = levels[level];
        OwnedImage!COLOR previousLevel = levels[level - 1];

        final switch(quality) with (Quality)
        {
            case box:

                // Dispatch on the pixel type at compile-time
                static if (is(COLOR == RGBA))
                    generateLevelBoxRGBA(thisLevel, previousLevel, updateRect);
                else static if (is(COLOR == L16))
                    generateLevelBoxL16(thisLevel, previousLevel, updateRect);
                else
                    static assert(false, "not implemented");

                // Set to true to verify the result against the reference 2x2 average (disabled by default)
                enum checkBoxMipmaps = false;

                static if (checkBoxMipmaps)
                {
                    for (int y = updateRect.min.y; y < updateRect.max.y; ++y)
                    {
                        COLOR[] L0 = previousLevel.scanline(y * 2);
                        COLOR[] L1 = previousLevel.scanline(y * 2 + 1);
                        COLOR[] dest = thisLevel.scanline(y);

                        for (int x = updateRect.min.x; x < updateRect.max.x; ++x)
                        {
                            // A B
                            // C D
                            COLOR A = L0[2 * x];
                            COLOR B = L0[2 * x + 1];
                            COLOR C = L1[2 * x];
                            COLOR D = L1[2 * x + 1];
                            assert(dest[x] == COLOR.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D));
                        }
                    }
                }
                break;

        case boxAlphaCov:

            static if (is(COLOR == RGBA))
            {
                generateLevelBoxAlphaCovRGBA(thisLevel, previousLevel, updateRect);

                // Disabled verification code, kept as a reference of the expected result
                static if (false)
                {
                    void checkLevelBoxAlphaConvRGBA(Image!RGBA* thisLevel, Image!RGBA* previousLevel, box2i updateRect)
                    {
                        for (int y = updateRect.min.y; y < updateRect.max.y; ++y)
                        {
                            RGBA[] L0 = previousLevel.scanline(y * 2);
                            RGBA[] L1 = previousLevel.scanline(y * 2 + 1);
                            RGBA[] dest = thisLevel.scanline(y);

                            for (int x = updateRect.min.x; x < updateRect.max.x; ++x)
                            {
                                // A B
                                // C D
                                RGBA A = L0.ptr[2 * x];
                                RGBA B = L0.ptr[2 * x + 1];
                                RGBA C = L1.ptr[2 * x];
                                RGBA D = L1.ptr[2 * x + 1];

                                int alphaA = A.a;
                                int alphaB = B.a;
                                int alphaC = C.a;
                                int alphaD = D.a;
                                int sum = alphaA + alphaB + alphaC + alphaD;
                                if (sum == 0)
                                {
                                    // All 4 pixels fully transparent: the top-left pixel is expected as is
                                    assert(dest.ptr[x] == A);
                                }
                                else
                                {
                                    // Alpha is a plain rounded average, colors are averaged with alpha as weight
                                    int destAlpha = cast(ubyte)( (alphaA + alphaB + alphaC + alphaD + 2) >> 2 );
                                    int red =   (A.r * alphaA + B.r * alphaB + C.r * alphaC + D.r * alphaD);
                                    int green = (A.g * alphaA + B.g * alphaB + C.g * alphaC + D.g * alphaD);
                                    int blue =  (A.b * alphaA + B.b* alphaB + C.b * alphaC + D.b * alphaD);
                                    float invSum = 1 / cast(float)(sum);

                                    RGBA finalColor = RGBA( cast(ubyte)(0.5f + red * invSum),
                                                            cast(ubyte)(0.5f + green * invSum),
                                                            cast(ubyte)(0.5f + blue * invSum),
                                                            cast(ubyte)destAlpha );
                                    assert(dest.ptr[x] == finalColor);
                                }
                            }
                        }
                    }
                    checkLevelBoxAlphaConvRGBA(thisLevel, previousLevel, updateRect);
                }
                break;
            }
            else
                assert(false); // alpha-weighted filters need an alpha channel

        case boxAlphaCovIntoPremul:

            static if (is(COLOR == RGBA))
            {
                generateLevelBoxAlphaCovIntoPremulRGBA(thisLevel, previousLevel, updateRect);
                break;
            }
            else
                assert(false); // alpha-weighted filters need an alpha channel

        case cubic:
            static if (is(COLOR == RGBA))
            {
                generateLevelCubicRGBA(thisLevel, previousLevel, updateRect);
                break;
            }
            else static if (is(COLOR == L16))
            {
                generateLevelCubicL16(thisLevel, previousLevel, updateRect);
                break;
            }
            else
                static assert(false, "not implemented");


        }
    }
532 
533 
534 private:
535     /// Computes impact of updating the area box on next level
536     static box2i impactOnNextLevel(Quality quality, box2i area, int currentLevelWidth, int currentLevelHeight) pure nothrow @nogc
537     {
538         box2i maxArea = box2i(0, 0, currentLevelWidth / 2, currentLevelHeight / 2);
539 
540         final  switch(quality) with (Quality)
541         {
542         case box:
543         case boxAlphaCov:
544         case boxAlphaCovIntoPremul:
545             int xmin = area.min.x / 2;
546             int ymin = area.min.y / 2;
547             int xmax = (area.max.x + 1) / 2;
548             int ymax = (area.max.y + 1) / 2;
549             return box2i(xmin, ymin, xmax, ymax).intersection(maxArea);
550 
551         case cubic:
552             int xmin = (area.min.x - 1) / 2;
553             int ymin = (area.min.y - 1) / 2;
554             int xmax = (area.max.x + 2) / 2;
555             int ymax = (area.max.y + 2) / 2;
556             return box2i(xmin, ymin, xmax, ymax).intersection(maxArea);
557         }
558 
559     }
560 }
561 
unittest
{
    // RGBA pyramid with a power-of-two base level
    auto rgbaPyramid = new Mipmap!RGBA();
    rgbaPyramid.size(4, 256, 256);
    rgbaPyramid.destroy();

    // L16 pyramid with non power-of-two dimensions, asking for more levels than the size allows
    auto l16Pyramid = new Mipmap!L16();
    l16Pyramid.size(16, 17, 333);
    l16Pyramid.destroy();
}
572 
573 
574 private:
575 
// 16-byte aligned constant vectors, meant to be loaded whole into an XMM register by the inline
// assembly of this file (e.g. `movaps XMM5, xmmTwoShort` and `movdqa XMM5, xmmTwoInt`).
// The name of each constant spells out the value of its lanes.
align(16) static immutable short[8] xmmTwoShort = [ 2, 2, 2, 2, 2, 2, 2, 2 ]; // rounding term of `(sum + 2) >> 2`, 16-bit lanes
align(16) static immutable int[4] xmmTwoInt = [ 2, 2, 2, 2 ];                  // rounding term of `(sum + 2) >> 2`, 32-bit lanes
align(16) static immutable float[4] xmm0_5 = [ 0.5f, 0.5f, 0.5f, 0.5f ];
align(16) static immutable int[4] xmm512 = [ 512, 512, 512, 512 ];
align(16) static immutable short[8] xmm11113333 = [ 1, 1, 1, 1, 3, 3, 3, 3 ];
align(16) static immutable short[8] xmm33331111 = [ 3, 3, 3, 3, 1, 1, 1, 1 ];
align(16) static immutable short[8] xmm33339999 = [ 3, 3, 3, 3, 9, 9, 9, 9 ];
align(16) static immutable short[8] xmm99993333 = [ 9, 9, 9, 9, 3, 3, 3, 3 ];
align(16) static immutable short[8] xmm32       = [ 32, 32, 32, 32, 32, 32, 32, 32 ];
585 
586 
/// Regenerates an area of an RGBA level with the 2x2 box filter.
/// Each destination pixel (x, y) is computed from the 4 pixels of `previousLevel` located at
/// columns 2x, 2x+1 and rows 2y, 2y+1, using the rounded average `(a + b + c + d + 2) >> 2`.
/// Params:
///     thisLevel = level being written.
///     previousLevel = level being read, the one just below `thisLevel`.
///     updateRect = area to regenerate, expressed in `thisLevel` coordinates.
void generateLevelBoxRGBA(OwnedImage!RGBA thisLevel,
                          OwnedImage!RGBA previousLevel,
                          box2i updateRect) pure nothrow @nogc
{
    int width = updateRect.width();
    int height = updateRect.height();

    for (int y = 0; y < height; ++y)
    {
        // Two source rows and one destination row, already offset to the left edge of the update area
        RGBA* L0   = previousLevel.scanlinePtr( (updateRect.min.y + y) * 2    ) + updateRect.min.x * 2;
        RGBA* L1   = previousLevel.scanlinePtr( (updateRect.min.y + y) * 2 + 1) + updateRect.min.x * 2;
        RGBA* dest =     thisLevel.scanlinePtr(           updateRect.min.y + y) + updateRect.min.x;
        version(inlineAsmCanLoadGlobalsInPIC)
        {
            version(D_InlineAsm_X86)
            {
                // Each iteration reads 4 pixels (16 bytes) on both source rows
                // and writes 2 destination pixels (8 bytes).
                asm pure nothrow @nogc
                {
                    mov ECX, width;
                    shr ECX, 1;
                    jz no_need; // ECX = 0 => no pair of pixels to process

                    mov EAX, L0;
                    mov EDX, L1;
                    mov EDI, dest;
                    movaps XMM5, xmmTwoShort;

                loop_ecx:
                    movdqu XMM0, [EAX]; // A B E F
                    pxor XMM4, XMM4;
                    movdqu XMM1, [EDX]; // C D G H
                    movdqa XMM2, XMM0;
                    movdqa XMM3, XMM1;
                    punpcklbw XMM0, XMM4; // A B in short
                    punpcklbw XMM1, XMM4; // C D in short
                    punpckhbw XMM2, XMM4; // E F in short
                    punpckhbw XMM3, XMM4; // G H in short
                    paddusw XMM0, XMM1; // A + C | B + D
                    paddusw XMM2, XMM3; // E + G | F + H
                    movdqa XMM1, XMM0;
                    movdqa XMM3, XMM2;
                    psrldq XMM1, 8;
                    psrldq XMM3, 8;
                    add EDI, 8;
                    paddusw XMM0, XMM1; // A + B + C + D | garbage
                    paddusw XMM2, XMM3; // E + F + G + H | garbage
                    paddusw XMM0, XMM5; // A + B + C + D + 2 | garbage
                    paddusw XMM2, XMM5; // E + F + G + H + 2 | garbage
                    psrlw XMM0, 2; // (A + B + C + D + 2) >> 2 | garbage
                    psrlw XMM2, 2; // (E + F + G + H + 2) >> 2 | garbage
                    add EAX, 16;
                    punpcklqdq XMM0, XMM2;
                    add EDX, 16;
                    packuswb XMM0, XMM4; // (A + B + C + D + 2) >> 2 | (E + F + G + H + 2) >> 2 | 0 | 0
                    movq [EDI-8], XMM0;
                    sub ECX, 1;
                    jnz loop_ecx;
                no_need: ;
                }

                // Eventually filter the last pixel (when width is odd)
                int remaining = width & ~1;
                for (int x = remaining; x < width; ++x)
                {
                    RGBA A = L0[2 * x];
                    RGBA B = L0[2 * x + 1];
                    RGBA C = L1[2 * x];
                    RGBA D = L1[2 * x + 1];
                    dest[x] = RGBA.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D);
                }
            }
            else version(D_InlineAsm_X86_64)
            {
                // Same loop as the 32-bit version, with 64-bit pointer registers
                asm pure nothrow @nogc
                {
                    mov ECX, width;
                    shr ECX, 1;
                    jz no_need; // ECX = 0 => no pair of pixels to process

                    mov RAX, L0;
                    mov RDX, L1;
                    mov RDI, dest;
                    movaps XMM5, xmmTwoShort;

                loop_ecx:
                    movdqu XMM0, [RAX]; // A B E F
                    pxor XMM4, XMM4;
                    movdqu XMM1, [RDX]; // C D G H
                    movdqa XMM2, XMM0;
                    movdqa XMM3, XMM1;
                    punpcklbw XMM0, XMM4; // A B in short
                    punpcklbw XMM1, XMM4; // C D in short
                    punpckhbw XMM2, XMM4; // E F in short
                    punpckhbw XMM3, XMM4; // G H in short
                    paddusw XMM0, XMM1; // A + C | B + D
                    paddusw XMM2, XMM3; // E + G | F + H
                    movdqa XMM1, XMM0;
                    movdqa XMM3, XMM2;
                    psrldq XMM1, 8;
                    psrldq XMM3, 8;
                    add RDI, 8;
                    paddusw XMM0, XMM1; // A + B + C + D | garbage
                    paddusw XMM2, XMM3; // E + F + G + H | garbage
                    paddusw XMM0, XMM5; // A + B + C + D + 2 | garbage
                    paddusw XMM2, XMM5; // E + F + G + H + 2 | garbage
                    psrlw XMM0, 2; // (A + B + C + D + 2) >> 2 | garbage
                    psrlw XMM2, 2; // (E + F + G + H + 2) >> 2 | garbage
                    add RAX, 16;
                    punpcklqdq XMM0, XMM2;
                    add RDX, 16;
                    packuswb XMM0, XMM4; // (A + B + C + D + 2) >> 2 | (E + F + G + H + 2) >> 2 | 0 | 0
                    movq [RDI-8], XMM0;
                    sub ECX, 1;
                    jnz loop_ecx;
                no_need: ;
                }

                // Eventually filter the last pixel (when width is odd)
                int remaining = width & ~1;
                for (int x = remaining; x < width; ++x)
                {
                    RGBA A = L0[2 * x];
                    RGBA B = L0[2 * x + 1];
                    RGBA C = L1[2 * x];
                    RGBA D = L1[2 * x + 1];
                    dest[x] = RGBA.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D);
                }
            }
            else
                static assert(false);
        }
        else
        {
            // Portable version, one destination pixel at a time
            for (int x = 0; x < width; ++x)
            {
                // A B
                // C D
                RGBA A = L0[2 * x];
                RGBA B = L0[2 * x + 1];
                RGBA C = L1[2 * x];
                RGBA D = L1[2 * x + 1];

                dest[x] = RGBA.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D);
            }
        }
    }
}
734 
735 void generateLevelBoxL16(OwnedImage!L16 thisLevel,
736                          OwnedImage!L16 previousLevel,
737                          box2i updateRect) pure nothrow @nogc
738 {
739     int width = updateRect.width();
740     int height = updateRect.height();
741 
742     for (int y = 0; y < height; ++y)
743     {
744         L16* L0   = previousLevel.scanlinePtr( (updateRect.min.y + y) * 2    ) + updateRect.min.x * 2;
745         L16* L1   = previousLevel.scanlinePtr( (updateRect.min.y + y) * 2 + 1) + updateRect.min.x * 2;
746         L16* dest =     thisLevel.scanlinePtr(           updateRect.min.y + y) + updateRect.min.x;
747 
748         version(inlineAsmCanLoadGlobalsInPIC)
749         {
750             version(D_InlineAsm_X86)
751             {
752                 asm pure nothrow @nogc
753                 {
754                     mov ECX, width;
755                     shr ECX, 2;
756                     jz no_need; // ECX = 0 => less than 4 pixels to process
757 
758                     mov EAX, L0;
759                     mov EDX, L1;
760                     mov EDI, dest;
761                     movdqa XMM5, xmmTwoInt;
762                     pxor XMM4, XMM4;
763 
764                 loop_ecx:
765                     movdqu XMM0, [EAX]; // A B E F I J M N
766                     movdqu XMM1, [EDX]; // C D G H K L O P
767 
768                     add EAX, 16;
769                     add EDX, 16;
770 
771                     movdqa XMM2, XMM0;
772                     movdqa XMM3, XMM1;
773 
774                     punpcklwd XMM0, XMM4; // A B E F in int32
775                     punpckhwd XMM2, XMM4; // I J M N in int32
776                     punpcklwd XMM1, XMM4; // C D G H in int32
777                     punpckhwd XMM3, XMM4; // K L O P in int32
778 
779                     paddd XMM0, XMM1; // A+C B+D E+G F+H
780                     paddd XMM2, XMM3; // I+K J+L M+O N+P
781 
782                     movdqa XMM1, XMM0;
783                     movdqa XMM3, XMM2;
784 
785                     psrldq XMM1, 4; // B+D E+G F+H 0
786                     psrldq XMM3, 4; // J+L M+O N+P 0
787 
788                     paddd XMM0, XMM1; // A+B+C+D garbage E+F+G+H garbage
789                     paddd XMM2, XMM3; // I+J+K+L garbage M+N+O+P garbage
790 
791                     pshufd XMM0, XMM0, 0b00001000; // A+B+C+D E+F+G+H garbage garbage
792                     pshufd XMM2, XMM2, 0b00001000; // I+J+K+L M+N+O+P garbage garbage
793 
794                     punpcklqdq XMM0, XMM2; // A+B+C+D E+F+G+H I+J+K+L M+N+O+P
795                     paddd XMM0, XMM5; // add 2
796                     psrld XMM0, 2; // >> 2
797 
798                     // because packusdw is not available before SSE4.1
799                     // Extend sign bit to the right
800                     pslld XMM0, 16;
801                     psrad XMM0, 16;
802                     add EDI, 8;
803                     packssdw XMM0, XMM4;
804 
805                     movq [EDI-8], XMM0;
806                     sub ECX, 1;
807                     jnz loop_ecx;
808                 no_need: ;
809                 }
810 
811                 // Eventually filter the 0 to 3 pixels
812                 int remaining = width & ~3;
813                 for (int x = remaining; x < width; ++x)
814                 {
815                     L16 A = L0[2 * x];
816                     L16 B = L0[2 * x + 1];
817                     L16 C = L1[2 * x];
818                     L16 D = L1[2 * x + 1];
819                     dest[x] = L16.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D);
820                 }
821             }
822             else version(D_InlineAsm_X86_64)
823             {
824                 asm pure nothrow @nogc
825                 {
826                     mov ECX, width;
827                     shr ECX, 2;
828                     jz no_need; // ECX = 0 => less than 4 pixels to process
829 
830                     mov RAX, L0;
831                     mov RDX, L1;
832                     mov RDI, dest;
833                     movdqa XMM5, xmmTwoInt;
834                     pxor XMM4, XMM4;
835 
836                 loop_ecx:
837                     movdqu XMM0, [RAX]; // A B E F I J M N
838                     movdqu XMM1, [RDX]; // C D G H K L O P
839 
840                     add RAX, 16;
841                     add RDX, 16;
842 
843                     movdqa XMM2, XMM0;
844                     movdqa XMM3, XMM1;
845 
846                     punpcklwd XMM0, XMM4; // A B E F in int32
847                     punpckhwd XMM2, XMM4; // I J M N in int32
848                     punpcklwd XMM1, XMM4; // C D G H in int32
849                     punpckhwd XMM3, XMM4; // K L O P in int32
850 
851                     paddd XMM0, XMM1; // A+C B+D E+G F+H
852                     paddd XMM2, XMM3; // I+K J+L M+O N+P
853 
854                     movdqa XMM1, XMM0;
855                     movdqa XMM3, XMM2;
856 
857                     psrldq XMM1, 4; // B+D E+G F+H 0
858                     psrldq XMM3, 4; // J+L M+O N+P 0
859 
860                     paddd XMM0, XMM1; // A+B+C+D garbage E+F+G+H garbage
861                     paddd XMM2, XMM3; // I+J+K+L garbage M+N+O+P garbage
862 
863                     pshufd XMM0, XMM0, 0b00001000; // A+B+C+D E+F+G+H garbage garbage
864                     pshufd XMM2, XMM2, 0b00001000; // I+J+K+L M+N+O+P garbage garbage
865 
866                     punpcklqdq XMM0, XMM2; // A+B+C+D E+F+G+H I+J+K+L M+N+O+P
867                     paddd XMM0, XMM5; // add 2
868                     psrld XMM0, 2; // >> 2
869 
870                     // because packusdw is not available before SSE4.1
871                     // Extend sign bit to the right
872                     pslld XMM0, 16;
873                     psrad XMM0, 16;
874                     add RDI, 8;
875                     packssdw XMM0, XMM4;
876 
877                     movq [RDI-8], XMM0;
878                     sub ECX, 1;
879                     jnz loop_ecx;
880                 no_need: ;
881                 }
882 
                // Process the 0 to 3 trailing pixels that the 4-pixels-per-iteration SIMD loop above did not handle
884                 int remaining = width & ~3;
885                 for (int x = remaining; x < width; ++x)
886                 {
887                     L16 A = L0[2 * x];
888                     L16 B = L0[2 * x + 1];
889                     L16 C = L1[2 * x];
890                     L16 D = L1[2 * x + 1];
891                     dest[x] = L16.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D);
892                 }
893             }
894             else
895                 static assert(false);
896         }
897         else
898         {
899             for (int x = 0; x < width; ++x)
900             {
901                 // A B
902                 // C D
903                 L16 A = L0[2 * x];
904                 L16 B = L0[2 * x + 1];
905                 L16 C = L1[2 * x];
906                 L16 D = L1[2 * x + 1];
907 
908                 dest[x] = L16.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D);
909             }
910         }
911     }
912 }
913 
914 
/// Scalar reference for one output pixel of the alpha-weighted 2x2 box filter.
///
/// The four inputs are laid out in the previous level as:
///     A B
///     C D
///
/// Returns: `RGBA(0,0,0,0)` when the four alphas are all zero. Otherwise, each color
///          channel is the average of the four inputs weighted by their respective alpha
///          (rounded by adding 0.5 and truncating), and alpha is `(sum of alphas + 2) >> 2`.
private RGBA boxAlphaCovPixelRGBA(RGBA A, RGBA B, RGBA C, RGBA D) nothrow @nogc
{
    int alphaA = A.a;
    int alphaB = B.a;
    int alphaC = C.a;
    int alphaD = D.a;
    int sum = alphaA + alphaB + alphaC + alphaD;

    // Fully transparent block: no color information, and this also avoids dividing by zero.
    if (sum == 0)
        return RGBA(0,0,0,0);

    int destAlpha = cast(ubyte)( (sum + 2) >> 2 );
    int red =   (A.r * alphaA + B.r * alphaB + C.r * alphaC + D.r * alphaD);
    int green = (A.g * alphaA + B.g * alphaB + C.g * alphaC + D.g * alphaD);
    int blue =  (A.b * alphaA + B.b * alphaB + C.b * alphaC + D.b * alphaD);
    float invSum = 1 / cast(float)(sum);

    return RGBA( cast(ubyte)(0.5f + red * invSum),
                 cast(ubyte)(0.5f + green * invSum),
                 cast(ubyte)(0.5f + blue * invSum),
                 cast(ubyte)destAlpha );
}

/// Fills `updateRect` of `thisLevel` by downsampling `previousLevel` with a 2x2 box filter
/// in which every source pixel is weighted by its own alpha.
///
/// For each destination pixel (x, y) of `updateRect`, the source pixels are
/// (2x, 2y), (2x+1, 2y), (2x, 2y+1) and (2x+1, 2y+1) in `previousLevel`.
///
/// Params:
///     thisLevel     = destination image, written inside `updateRect` only.
///     previousLevel = source image, read in the 2x-scaled `updateRect`.
///     updateRect    = area to regenerate, in `thisLevel` coordinates. An empty rectangle is a no-op.
///
/// Note: the x86_64 assembly path divides with `divps` while the scalar path multiplies by a
///       reciprocal, so color channels may differ by 1 between the two; alpha is identical.
void generateLevelBoxAlphaCovRGBA(OwnedImage!RGBA thisLevel,
                                  OwnedImage!RGBA previousLevel,
                                  box2i updateRect) nothrow @nogc
{
    int width = updateRect.width();
    int height = updateRect.height();

    // Nothing to do for an empty rectangle.
    // This check is required for memory safety and must not be a mere `assert`: the assembly
    // loop below decrements ECX *before* testing it, so entering it with width == 0 would wrap
    // the counter around and process about 2^32 pixels.
    if (width <= 0 || height <= 0)
        return;

    for (int y = 0; y < height; ++y)
    {
        RGBA* L0   = previousLevel.scanlinePtr( (updateRect.min.y + y) * 2    ) + updateRect.min.x * 2;
        RGBA* L1   = previousLevel.scanlinePtr( (updateRect.min.y + y) * 2 + 1) + updateRect.min.x * 2;
        RGBA* dest =     thisLevel.scanlinePtr(           updateRect.min.y + y) + updateRect.min.x;

        version(inlineAsmCanLoadGlobalsInPIC)
        {
            version(D_InlineAsm_X86)
            {
                // Note: a 32-bit assembly version of this loop (same as the x86_64 one below,
                // with EAX/EDX/EDI as pointer registers) created problems with LDC -a x86,
                // and other problems with DMD, so 32-bit x86 uses the generic code instead.
                for (int x = 0; x < width; ++x)
                {
                    dest[x] = boxAlphaCovPixelRGBA(L0[2 * x], L0[2 * x + 1],
                                                   L1[2 * x], L1[2 * x + 1]);
                }
            }
            else version(D_InlineAsm_X86_64)
            {
                // One destination pixel per iteration. width > 0 is guaranteed by the early return above.
                asm nothrow @nogc
                {
                    mov ECX, width;

                    mov RAX, L0;
                    mov RDX, L1;
                    mov RDI, dest;

                loop_ecx:

                    movq XMM0, [RAX];                  // Ar Ag Ab Aa Br Bg Bb Ba + zeroes
                    movq XMM1, [RDX];                  // Cr Cg Cb Ca Dr Dg Db Da + zeroes
                    pxor XMM4, XMM4;                   // zeroes (XMM4 is reused as a temporary further down)
                    add RAX, 8;
                    add RDX, 8;

                    punpcklbw XMM0, XMM4;              // Ar Ag Ab Aa Br Bg Bb Ba
                    punpcklbw XMM1, XMM4;              // Cr Cg Cb Ca Dr Dg Db Da

                    movdqa XMM2, XMM0;
                    punpcklwd XMM0, XMM1;              // Ar Cr Ag Cg Ab Cb Aa Ca
                    punpckhwd XMM2, XMM1;              // Br Dr Bg Dg Bb Db Ba Da

                    // perhaps unnecessary
                    movdqa XMM3, XMM0;
                    punpcklwd XMM0, XMM2;              // Ar Br Cr Dr Ag Bg Cg Dg
                    punpckhwd XMM3, XMM2;              // Ab Bb Cb Db Aa Ba Ca Da

                    movdqa XMM1, XMM3;
                    punpckhqdq XMM1, XMM1;             // Aa Ba Ca Da Aa Ba Ca Da

                    // Are alpha all zeroes? if so, early continue.
                    movdqa XMM2, XMM1;
                    pcmpeqb XMM2, XMM4;
                    add RDI, 4;
                    pmovmskb ESI, XMM2;
                    cmp ESI, 0xffff;
                    jnz non_null;

                    pxor XMM0, XMM0;
                    sub ECX, 1;
                    movd [RDI-4], XMM0;            // dest[x] = transparent black (movd leaves the flags of `sub` intact)
                    jnz loop_ecx;
                    jmp end_of_loop;

                non_null:

                    pmaddwd XMM0, XMM1;            // Ar*Aa+Br*Ba Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da
                    pmaddwd XMM3, XMM1;            // Ab*Aa+Bb*Ba Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da

                    // Starting computing sum of coefficients too
                    punpcklwd XMM1, XMM4;      // Aa Ba Ca Da

                    movdqa XMM2, XMM0;
                    movdqa XMM5, XMM3;
                    movdqa XMM4, XMM1;
                    psrldq XMM4, 8;

                    psrldq XMM2, 4;                // Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da 0
                    psrldq XMM5, 4;                // Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da 0
                    paddq XMM1, XMM4;              // Aa+Ca Ba+Da garbage garbage
                    movdqa XMM4, XMM1;

                    paddd XMM0, XMM2;              // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da garbage Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage
                    paddd XMM3, XMM5;              // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da garbage Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage
                    psrldq XMM4, 4;

                    pshufd XMM0, XMM0, 0b00001000; // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage garbage
                    paddq XMM1, XMM4;          // Aa+Ba+Ca+Da garbage garbage garbage
                    pshufd XMM3, XMM3, 0b00001000; // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage garbage

                    punpcklqdq XMM0, XMM3;     // fR fG fB fA
                    pshufd XMM1, XMM1, 0;      // broadcast the alpha sum to the four lanes

                    cvtdq2ps XMM0, XMM0;

                    cvtdq2ps XMM3, XMM1;       // sum sum sum sum

                    divps XMM0, XMM3;          // fR/sum fG/sum fB/sum fA/sum
                    addps XMM0, xmm0_5;
                    cvttps2dq XMM0, XMM0;      // return into integer domain using cast(int)(x + 0.5f)

                    paddd XMM1, xmmTwoInt;
                    psrld XMM1, 2;             // finalAlpha finalAlpha finalAlpha finalAlpha

                    pslldq XMM0, 4;            // 0 fR/sum fG/sum fB/sum
                    pslldq XMM1, 12;           // 0 0 0 finalAlpha
                    psrldq XMM0, 4;            // fR/sum fG/sum fB/sum 0

                    por XMM0, XMM1;            // fR/sum fG/sum fB/sum finalAlpha
                    pxor XMM3, XMM3;
                    packssdw XMM0, XMM3;       // same in words
                    packuswb XMM0, XMM3;       // same in bytes

                    sub ECX, 1;
                    movd [RDI-4], XMM0;            // dest[x] = final pixel
                    jnz loop_ecx;
                end_of_loop: ;
                }
            }
            else
                static assert(false);
        }
        else
        {
            // Portable path.
            for (int x = 0; x < width; ++x)
            {
                dest[x] = boxAlphaCovPixelRGBA(L0[2 * x], L0[2 * x + 1],
                                               L1[2 * x], L1[2 * x + 1]);
            }
        }

        // Set to true to check the line just written against the scalar reference.
        enum verify = false;

        static if (verify)
        {
            for (int x = 0; x < width; ++x)
            {
                // A B
                // C D
                RGBA A = L0[2 * x];
                RGBA B = L0[2 * x + 1];
                RGBA C = L1[2 * x];
                RGBA D = L1[2 * x + 1];

                RGBA expected = boxAlphaCovPixelRGBA(A, B, C, D);
                RGBA instead = dest[x];

                if (A.a + B.a + C.a + D.a == 0)
                {
                    assert(instead == RGBA(0,0,0,0));
                }
                else
                {
                    int diffR = instead.r - expected.r;
                    int diffG = instead.g - expected.g;
                    int diffB = instead.b - expected.b;
                    import std.math;
                    assert(abs(diffR) <= 1); // some remaining differences because of rounding
                    assert(abs(diffG) <= 1);
                    assert(abs(diffB) <= 1);
                    assert(instead.a == expected.a);
                }
            }
        }
    }
}
1272 
/// Scalar reference for one output pixel of the "alpha-coverage into premultiplied" 2x2 filter.
///
/// The four inputs are laid out in the previous level as:
///     A B
///     C D
///
/// Returns: a pixel whose color channels are `(A.c*A.a + B.c*B.a + C.c*C.a + D.c*D.a + 512) >> 10`
///          and whose alpha is `(A.a*A.a + B.a*B.a + C.a*C.a + D.a*D.a + 512) >> 10`.
///          The `>> 10` divides by 1024 (NOTE(review): presumably an approximation of 4 * 255).
private RGBA boxAlphaCovIntoPremulPixelRGBA(RGBA A, RGBA B, RGBA C, RGBA D) nothrow @nogc
{
    int red =   (A.r * A.a + B.r * B.a + C.r * C.a + D.r * D.a);
    int green = (A.g * A.a + B.g * B.a + C.g * C.a + D.g * D.a);
    int blue =  (A.b * A.a + B.b * B.a + C.b * C.a + D.b * D.a);
    int alpha = (A.a * A.a + B.a * B.a + C.a * C.a + D.a * D.a);
    return RGBA( cast(ubyte)((red + 512) >> 10),
                 cast(ubyte)((green + 512) >> 10),
                 cast(ubyte)((blue + 512) >> 10),
                 cast(ubyte)((alpha + 512) >> 10));
}

/// Fills `updateRect` of `thisLevel` by downsampling `previousLevel` with a 2x2 box filter
/// in which every channel of a source pixel (alpha included) is multiplied by that pixel's alpha.
///
/// For each destination pixel (x, y) of `updateRect`, the source pixels are
/// (2x, 2y), (2x+1, 2y), (2x, 2y+1) and (2x+1, 2y+1) in `previousLevel`.
///
/// Params:
///     thisLevel     = destination image, written inside `updateRect` only.
///     previousLevel = source image, read in the 2x-scaled `updateRect`.
///     updateRect    = area to regenerate, in `thisLevel` coordinates. An empty rectangle is a no-op.
void generateLevelBoxAlphaCovIntoPremulRGBA(OwnedImage!RGBA thisLevel,
                                            OwnedImage!RGBA previousLevel,
                                            box2i updateRect) nothrow @nogc
{
    int width = updateRect.width();
    int height = updateRect.height();

    // Nothing to do for an empty rectangle.
    // This check is required for memory safety: the assembly loops below decrement ECX *before*
    // testing it, so entering them with width == 0 would wrap the counter around and process
    // about 2^32 pixels.
    if (width <= 0 || height <= 0)
        return;

    for (int y = 0; y < height; ++y)
    {
        RGBA* L0   = previousLevel.scanlinePtr( (updateRect.min.y + y) * 2    ) + updateRect.min.x * 2;
        RGBA* L1   = previousLevel.scanlinePtr( (updateRect.min.y + y) * 2 + 1) + updateRect.min.x * 2;
        RGBA* dest =     thisLevel.scanlinePtr(           updateRect.min.y + y) + updateRect.min.x;

        version(inlineAsmCanLoadGlobalsInPIC)
        {
            version(D_InlineAsm_X86)
            {
                // One destination pixel per iteration. width > 0 is guaranteed by the early return above.
                asm nothrow @nogc
                {
                    mov ECX, width;

                    mov EAX, L0;
                    mov EDX, L1;
                    mov EDI, dest;

                    movdqa XMM5, xmm512;               // 512 512 512 512
                    pxor XMM4, XMM4;                   // all zeroes

                loop_ecx:

                    movq XMM0, [EAX];                  // Ar Ag Ab Aa Br Bg Bb Ba + zeroes
                    movq XMM1, [EDX];                  // Cr Cg Cb Ca Dr Dg Db Da + zeroes
                    pxor XMM4, XMM4;
                    add EAX, 8;
                    add EDX, 8;

                    punpcklbw XMM0, XMM4;              // Ar Ag Ab Aa Br Bg Bb Ba
                    punpcklbw XMM1, XMM4;              // Cr Cg Cb Ca Dr Dg Db Da

                    movdqa XMM2, XMM0;
                    punpcklwd XMM0, XMM1;              // Ar Cr Ag Cg Ab Cb Aa Ca
                    punpckhwd XMM2, XMM1;              // Br Dr Bg Dg Bb Db Ba Da

                    movdqa XMM3, XMM0;
                    punpcklwd XMM0, XMM2;              // Ar Br Cr Dr Ag Bg Cg Dg
                    punpckhwd XMM3, XMM2;              // Ab Bb Cb Db Aa Ba Ca Da

                    movdqa XMM1, XMM3;
                    punpckhqdq XMM1, XMM1;             // Aa Ba Ca Da Aa Ba Ca Da

                    add EDI, 4;

                    pmaddwd XMM0, XMM1;            // Ar*Aa+Br*Ba Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da
                    pmaddwd XMM3, XMM1;            // Ab*Aa+Bb*Ba Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da

                    movdqa XMM2, XMM0;
                    movdqa XMM1, XMM3;

                    psrldq XMM2, 4;                // Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da 0
                    psrldq XMM1, 4;                // Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da 0

                    paddd XMM0, XMM2;              // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da garbage Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage
                    paddd XMM3, XMM1;              // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da garbage Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage

                    pshufd XMM0, XMM0, 0b00001000; // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage garbage
                    pshufd XMM3, XMM3, 0b00001000; // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage garbage

                    punpcklqdq XMM0, XMM3;     // fR fG fB fA


                    paddd XMM0, XMM5;          // + 512 for rounding
                    psrld XMM0, 10;             // final color in dwords

                    packssdw XMM0, XMM4;       // same in words
                    packuswb XMM0, XMM4;       // same in bytes

                    sub ECX, 1;
                    movd [EDI-4], XMM0;            // dest[x] = final pixel (movd leaves the flags of `sub` intact)
                    jnz loop_ecx;
                }
            }
            else version(D_InlineAsm_X86_64)
            {
                // Same loop as the 32-bit one, with 64-bit pointer registers.
                asm nothrow @nogc
                {
                    mov ECX, width;

                    mov RAX, L0;
                    mov RDX, L1;
                    mov RDI, dest;

                    movdqa XMM5, xmm512;               // 512 512 512 512
                    pxor XMM4, XMM4;                   // all zeroes

                loop_ecx:

                    movq XMM0, [RAX];                  // Ar Ag Ab Aa Br Bg Bb Ba + zeroes
                    movq XMM1, [RDX];                  // Cr Cg Cb Ca Dr Dg Db Da + zeroes
                    pxor XMM4, XMM4;
                    add RAX, 8;
                    add RDX, 8;

                    punpcklbw XMM0, XMM4;              // Ar Ag Ab Aa Br Bg Bb Ba
                    punpcklbw XMM1, XMM4;              // Cr Cg Cb Ca Dr Dg Db Da

                    movdqa XMM2, XMM0;
                    punpcklwd XMM0, XMM1;              // Ar Cr Ag Cg Ab Cb Aa Ca
                    punpckhwd XMM2, XMM1;              // Br Dr Bg Dg Bb Db Ba Da

                    movdqa XMM3, XMM0;
                    punpcklwd XMM0, XMM2;              // Ar Br Cr Dr Ag Bg Cg Dg
                    punpckhwd XMM3, XMM2;              // Ab Bb Cb Db Aa Ba Ca Da

                    movdqa XMM1, XMM3;
                    punpckhqdq XMM1, XMM1;             // Aa Ba Ca Da Aa Ba Ca Da

                    add RDI, 4;

                    pmaddwd XMM0, XMM1;            // Ar*Aa+Br*Ba Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da
                    pmaddwd XMM3, XMM1;            // Ab*Aa+Bb*Ba Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da

                    movdqa XMM2, XMM0;
                    movdqa XMM1, XMM3;

                    psrldq XMM2, 4;                // Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da 0
                    psrldq XMM1, 4;                // Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da 0

                    paddd XMM0, XMM2;              // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da garbage Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage
                    paddd XMM3, XMM1;              // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da garbage Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage

                    pshufd XMM0, XMM0, 0b00001000; // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage garbage
                    pshufd XMM3, XMM3, 0b00001000; // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage garbage

                    punpcklqdq XMM0, XMM3;     // fR fG fB fA


                    paddd XMM0, XMM5;          // + 512 for rounding
                    psrld XMM0, 10;             // final color in dwords

                    packssdw XMM0, XMM4;       // same in words
                    packuswb XMM0, XMM4;       // same in bytes

                    sub ECX, 1;
                    movd [RDI-4], XMM0;            // dest[x] = final pixel (movd leaves the flags of `sub` intact)
                    jnz loop_ecx;
                }
            }
            else 
                static assert(false);
        }
        else
        {
            // Portable path.
            for (int x = 0; x < width; ++x)
            {
                dest[x] = boxAlphaCovIntoPremulPixelRGBA(L0[2 * x], L0[2 * x + 1],
                                                         L1[2 * x], L1[2 * x + 1]);
            }
        }

        // Set to true to check the line just written against the scalar reference.
        enum bool verify = false;

        static if (verify)
        {
            for (int x = 0; x < width; ++x)
            {
                RGBA expected = boxAlphaCovIntoPremulPixelRGBA(L0[2 * x], L0[2 * x + 1],
                                                               L1[2 * x], L1[2 * x + 1]);
                assert(dest[x] == expected);
            }
        }
    }
}
1466 
1467 void generateLevelCubicRGBA(OwnedImage!RGBA thisLevel,
1468                             OwnedImage!RGBA previousLevel,
1469                             box2i updateRect) nothrow @nogc
1470 {
1471     for (int y = updateRect.min.y; y < updateRect.max.y; ++y)
1472     {
1473         int y2m1 = 2 * y - 1;
1474         if (y2m1 < 0)
1475             y2m1 = 0;
1476 
1477         int y2p2 = 2 * y + 2;
1478         if (y2p2 > previousLevel.h - 1)
1479             y2p2 = previousLevel.h - 1;
1480 
1481         RGBA* LM1 = previousLevel.scanlinePtr(y2m1);
1482         RGBA* L0 = previousLevel.scanlinePtr(y * 2);
1483         RGBA* L1 = previousLevel.scanlinePtr(y * 2 + 1);
1484         RGBA* L2 = previousLevel.scanlinePtr(y2p2);
1485         RGBA* dest = thisLevel.scanlinePtr(y);
1486 
1487         for (int x = updateRect.min.x; x < updateRect.max.x; ++x)
1488         {
1489             // A B C D
1490             // E F G H
1491             // I J K L
1492             // M N O P
1493 
1494             int x2m1 = 2 * x - 1;
1495             if (x2m1 < 0)
1496                 x2m1 = 0;
1497             int x2p0 = 2 * x;
1498             int x2p2 = 2 * x + 2;
1499             if (x2p2 > previousLevel.w - 1)
1500                 x2p2 = previousLevel.w - 1;
1501 
1502             version(inlineAsmCanLoadGlobalsInPIC)
1503             {
1504                 version(D_InlineAsm_X86)
1505                 {
1506                     RGBA[16] buf = void;
1507                     buf[0] = LM1[x2m1];
1508                     buf[1] = LM1[x2p0];
1509                     buf[2] = LM1[x2p0+1];
1510                     buf[3] = LM1[x2p2];
1511                     buf[4] = L0[x2m1];
1512                     buf[5] = L0[x2p0];
1513                     buf[6] = L0[x2p0+1];
1514                     buf[7] = L0[x2p2];
1515                     buf[8] = L1[x2m1];
1516                     buf[9] = L1[x2p0];
1517                     buf[10] = L1[x2p0+1];
1518                     buf[11] = L1[x2p2];
1519                     buf[12] = L2[x2m1];
1520                     buf[13] = L2[x2p0];
1521                     buf[14] = L2[x2p0+1];
1522                     buf[15] = L2[x2p2];
1523                     RGBA* pDest = dest + x;
1524 
1525                     asm nothrow @nogc
1526                     {
1527                         movdqu XMM0, buf;  // A B C D
1528                         movdqu XMM1, buf;
1529                         pxor XMM2, XMM2;      // zeroes
1530                         punpcklbw XMM0, XMM2; // A B
1531                         punpckhbw XMM1, XMM2; // C D
1532                         pmullw XMM0, xmm11113333; // A*1 B*3 in shorts
1533                         movdqa XMM3, XMM0;
1534                         pmullw XMM1, xmm33331111; // C*3 D*3 in shorts
1535                         movdqa XMM5, XMM1;
1536 
1537                         movdqu XMM0, buf+16;  // E F G H
1538                         movdqu XMM1, buf+16;
1539                         punpcklbw XMM0, XMM2; // E F
1540                         punpckhbw XMM1, XMM2; // G H
1541                         pmullw XMM0, xmm33339999; // E*3 F*9 in shorts
1542                         paddw XMM3, XMM0;
1543                         pmullw XMM1, xmm99993333; // G*9 H*3 in shorts
1544                         paddw XMM5, XMM1;
1545 
1546                         movdqu XMM0, buf+32;  // I J K L
1547                         movdqu XMM1, buf+32;
1548                         punpcklbw XMM0, XMM2; // I J
1549                         punpckhbw XMM1, XMM2; // K L
1550                         pmullw XMM0, xmm33339999; // I*3 J*9 in shorts
1551                         paddw XMM3, XMM0;
1552                         pmullw XMM1, xmm99993333; // K*9 L*3 in shorts
1553                         paddw XMM5, XMM1;
1554 
1555                         movdqu XMM0, buf+48;  // M N O P
1556                         movdqu XMM1, buf+48;
1557                         punpcklbw XMM0, XMM2; // M N
1558                         punpckhbw XMM1, XMM2; // O P
1559                         pmullw XMM0, xmm11113333; // M*1 N*3 in shorts
1560                         paddw XMM3, XMM0; // A+E*3+I*3+M B*3+F*9+J*9+3*N
1561                         pmullw XMM1, xmm33331111; // O*3 P*1 in shorts
1562                         paddw XMM5, XMM1; // C*3+G*9+K*9+O*3 D+H*3+L*3+P
1563 
1564                         movdqa XMM0, XMM3;
1565                         movdqa XMM1, XMM5;
1566                         psrldq XMM0, 8;
1567                         psrldq XMM1, 8;
1568                         paddw XMM3, XMM0; // A+E*3+I*3+M+B*3+F*9+J*9+3*N garbage(x4)
1569                         paddw XMM5, XMM1; // C*3+G*9+K*9+O*3+D+H*3+L*3+P garbage(x4)
1570                         paddw XMM3, XMM5; // total-sum garbage(x4)
1571 
1572                         paddw XMM3, xmm32;
1573                         psrlw XMM3, 6;
1574                         mov EAX, pDest;
1575                         packuswb XMM3, XMM2;
1576 
1577                         movd [EAX], XMM3;
1578                     }
1579                 }
1580                 else version(D_InlineAsm_X86_64)
1581                 {
1582                     RGBA[16] buf = void;
1583                     buf[0] = LM1[x2m1];
1584                     buf[1] = LM1[x2p0];
1585                     buf[2] = LM1[x2p0+1];
1586                     buf[3] = LM1[x2p2];
1587                     buf[4] = L0[x2m1];
1588                     buf[5] = L0[x2p0];
1589                     buf[6] = L0[x2p0+1];
1590                     buf[7] = L0[x2p2];
1591                     buf[8] = L1[x2m1];
1592                     buf[9] = L1[x2p0];
1593                     buf[10] = L1[x2p0+1];
1594                     buf[11] = L1[x2p2];
1595                     buf[12] = L2[x2m1];
1596                     buf[13] = L2[x2p0];
1597                     buf[14] = L2[x2p0+1];
1598                     buf[15] = L2[x2p2];
1599                     RGBA* pDest = dest + x;
1600 
1601                     asm nothrow @nogc
1602                     {
1603                         movdqu XMM0, buf;  // A B C D
1604                         movdqu XMM1, buf;
1605                         pxor XMM2, XMM2;      // zeroes
1606                         punpcklbw XMM0, XMM2; // A B
1607                         punpckhbw XMM1, XMM2; // C D
1608                         pmullw XMM0, xmm11113333; // A*1 B*3 in shorts
1609                         movdqa XMM3, XMM0;
1610                         pmullw XMM1, xmm33331111; // C*3 D*3 in shorts
1611                         movdqa XMM5, XMM1;
1612 
1613                         movdqu XMM0, buf+16;  // E F G H
1614                         movdqu XMM1, buf+16;
1615                         punpcklbw XMM0, XMM2; // E F
1616                         punpckhbw XMM1, XMM2; // G H
1617                         pmullw XMM0, xmm33339999; // E*3 F*9 in shorts
1618                         paddw XMM3, XMM0;
1619                         pmullw XMM1, xmm99993333; // G*9 H*3 in shorts
1620                         paddw XMM5, XMM1;
1621 
1622                         movdqu XMM0, buf+32;  // I J K L
1623                         movdqu XMM1, buf+32;
1624                         punpcklbw XMM0, XMM2; // I J
1625                         punpckhbw XMM1, XMM2; // K L
1626                         pmullw XMM0, xmm33339999; // I*3 J*9 in shorts
1627                         paddw XMM3, XMM0;
1628                         pmullw XMM1, xmm99993333; // K*9 L*3 in shorts
1629                         paddw XMM5, XMM1;
1630 
1631                         movdqu XMM0, buf+48;  // M N O P
1632                         movdqu XMM1, buf+48;
1633                         punpcklbw XMM0, XMM2; // M N
1634                         punpckhbw XMM1, XMM2; // O P
1635                         pmullw XMM0, xmm11113333; // M*1 N*3 in shorts
1636                         paddw XMM3, XMM0; // A+E*3+I*3+M B*3+F*9+J*9+3*N
1637                         pmullw XMM1, xmm33331111; // O*3 P*1 in shorts
1638                         paddw XMM5, XMM1; // C*3+G*9+K*9+O*3 D+H*3+L*3+P
1639 
1640                         movdqa XMM0, XMM3;
1641                         movdqa XMM1, XMM5;
1642                         psrldq XMM0, 8;
1643                         psrldq XMM1, 8;
1644                         paddw XMM3, XMM0; // A+E*3+I*3+M+B*3+F*9+J*9+3*N garbage(x4)
1645                         paddw XMM5, XMM1; // C*3+G*9+K*9+O*3+D+H*3+L*3+P garbage(x4)
1646                         paddw XMM3, XMM5; // total-sum garbage(x4)
1647 
1648                         paddw XMM3, xmm32;
1649                         psrlw XMM3, 6;
1650                         mov RAX, pDest;
1651                         packuswb XMM3, XMM2;
1652 
1653                         movd [RAX], XMM3;
1654                     }
1655                 }
1656                 else
1657                     static assert(false);
1658             }
1659             else
1660             {
1661                 auto A = LM1[x2m1];
1662                 auto B = LM1[x2p0];
1663                 auto C = LM1[x2p0+1];
1664                 auto D = LM1[x2p2];
1665 
1666                 auto E = L0[x2m1];
1667                 auto F = L0[x2p0];
1668                 auto G = L0[x2p0+1];
1669                 auto H = L0[x2p2];
1670 
1671                 auto I = L1[x2m1];
1672                 auto J = L1[x2p0];
1673                 auto K = L1[x2p0+1];
1674                 auto L = L1[x2p2];
1675 
1676                 auto M = L2[x2m1];
1677                 auto N = L2[x2p0];
1678                 auto O = L2[x2p0+1];
1679                 auto P = L2[x2p2];
1680 
1681                 // Apply filter
1682                 // 1 3 3 1
1683                 // 3 9 9 3
1684                 // 3 9 9 3
1685                 // 1 3 3 1
1686 
1687                 int rSum = (A.r + D.r + M.r + P.r) + 3 * (B.r + C.r + E.r + H.r + I.r + L.r + N.r + O.r) + 9 * (F.r + G.r + J.r + K.r);
1688                 int gSum = (A.g + D.g + M.g + P.g) + 3 * (B.g + C.g + E.g + H.g + I.g + L.g + N.g + O.g) + 9 * (F.g + G.g + J.g + K.g);
1689                 int bSum = (A.b + D.b + M.b + P.b) + 3 * (B.b + C.b + E.b + H.b + I.b + L.b + N.b + O.b) + 9 * (F.b + G.b + J.b + K.b);
1690                 int aSum = (A.a + D.a + M.a + P.a) + 3 * (B.a + C.a + E.a + H.a + I.a + L.a + N.a + O.a) + 9 * (F.a + G.a + J.a + K.a);
1691                 dest[x].r = cast(ubyte)((rSum + 32) >> 6);
1692                 dest[x].g = cast(ubyte)((gSum + 32) >> 6);
1693                 dest[x].b = cast(ubyte)((bSum + 32) >> 6);
1694                 dest[x].a = cast(ubyte)((aSum + 32) >> 6);
1695             }
1696         }
1697     }
1698 }
1699 
/// Computes the `updateRect` portion of an L16 mipmap level from the level above it.
///
/// Each destination pixel (x, y) is a rounded weighted average of a 4x4 block of
/// `previousLevel` pixels spanning columns 2x-1 .. 2x+2 and rows 2y-1 .. 2y+2,
/// using these weights (sum = 64):
///
///     1 3 3 1
///     3 9 9 3
///     3 9 9 3
///     1 3 3 1
///
/// Only the outermost row and column indices (2y-1, 2y+2, 2x-1, 2x+2) are clamped
/// to the source image; the two centre rows and columns are read unclamped.
///
/// Params:
///     thisLevel     = destination image, written inside `updateRect` only.
///     previousLevel = source image that gets sampled.
///     updateRect    = destination pixels to compute; `min` inclusive, `max` exclusive.
void generateLevelCubicL16(OwnedImage!L16 thisLevel,
                           OwnedImage!L16 previousLevel,
                           box2i updateRect) nothrow @nogc
{
    // Horizontal taps [1 3 3 1] applied to a single source scanline.
    static int filterRow(L16* line, int left, int centre, int right) nothrow @nogc
    {
        return line[left].l + 3 * (line[centre].l + line[centre + 1].l) + line[right].l;
    }

    foreach (int dstY; updateRect.min.y .. updateRect.max.y)
    {
        immutable int srcY = 2 * dstY;
        immutable int lastRow = previousLevel.h - 1;

        // Outer rows are clamped to the source image, inner rows are not.
        immutable int rowAbove = (srcY - 1 < 0) ? 0 : srcY - 1;
        immutable int rowBelow = (srcY + 2 > lastRow) ? lastRow : srcY + 2;

        L16* lineAbove = previousLevel.scanlinePtr(rowAbove);
        L16* lineUpper = previousLevel.scanlinePtr(srcY);
        L16* lineLower = previousLevel.scanlinePtr(srcY + 1);
        L16* lineBelow = previousLevel.scanlinePtr(rowBelow);
        L16* output = thisLevel.scanlinePtr(dstY);

        immutable int lastCol = previousLevel.w - 1;

        foreach (int dstX; updateRect.min.x .. updateRect.max.x)
        {
            immutable int srcX = 2 * dstX;

            // Same clamping policy horizontally: only the outer columns.
            immutable int colLeft = (srcX - 1 < 0) ? 0 : srcX - 1;
            immutable int colRight = (srcX + 2 > lastCol) ? lastCol : srcX + 2;

            // Vertical taps [1 3 3 1] applied to the four filtered scanlines.
            // The total is at most 64 * 65535, so it cannot overflow an int.
            immutable int total = filterRow(lineAbove, colLeft, srcX, colRight)
                                + 3 * (filterRow(lineUpper, colLeft, srcX, colRight)
                                     + filterRow(lineLower, colLeft, srcX, colRight))
                                + filterRow(lineBelow, colLeft, srcX, colRight);

            // Divide by 64 with rounding to nearest.
            output[dstX].l = cast(ushort)((total + 32) >> 6);
        }
    }
}
1768 
// Compile-only check: declaring a reference of each supported pixel type makes
// the compiler instantiate Mipmap for both L16 and RGBA. Nothing is allocated;
// both references keep their default (null) value.
unittest
{
    Mipmap!L16 depthMipmap;
    Mipmap!RGBA colorMipmap;
}