1 /**
2 * Copyright: Copyright Auburn Sounds 2015 and later.
3 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
4 * Authors:   Guillaume Piolat
5 */
6 module dplug.graphics.mipmap;
7 
8 import std.algorithm.comparison;
9 
10 import gfm.math.vector;
11 import gfm.math.box;
12 import dplug.graphics.color;
13 
14 import dplug.core.nogc;
15 import dplug.core.alignedbuffer;
16 import dplug.graphics.drawex;
17 
// Define `AsmX86` whenever the compiler offers x86 inline assembly, be it 32-bit or 64-bit.
version( D_InlineAsm_X86 )
{
    version = AsmX86;
}
else version( D_InlineAsm_X86_64 )
{
    version = AsmX86;
}

// Because DMD is unable to load globals from inline assembly in PIC code,
// the assembly that reads global constants is only enabled with LDC.
version(LDC)
{
    version( D_InlineAsm_X86 )
    {
        version = inlineAsmCanLoadGlobalsInPIC;
    }
    else version( D_InlineAsm_X86_64 )
    {
        version = inlineAsmCanLoadGlobalsInPIC;
    }
}
39 
40 
41 /// Mipmapped images.
42 /// Supports non power-of-two textures.
/// Each level is half the size of the previous one: { width / 2, height / 2 }, rounded down.
44 /// The mipmap owns each of its levels.
45 final class Mipmap(COLOR) if (is(COLOR == RGBA) || is(COLOR == L16))
46 {
47 public:
48 nothrow:
49 @nogc:
50 
    /// Downsampling filter used to compute a level from the previous one.
    enum Quality
    {
        box,                  // simple 2x2 filter, creates phase problems with NPOT. `generateMipmaps` switches to cubic from level 3 on.
        cubic,                // Very smooth kernel [1 2 1] x [1 2 1]
        boxAlphaCov,          // same as box but alpha is used as weight, only implemented for RGBA
        boxAlphaCovIntoPremul, // same as boxAlphaCov but after such a step the next level is alpha-premultiplied
    }
58 
    /// Images of the mipmap, index 0 being the base level.
    /// Entries are allocated in `size` and released by the destructor.
    AlignedBuffer!(OwnedImage!COLOR) levels;
60 
    /// Creates an empty mipmap.
    /// Only the container of levels is set up here; levels themselves are allocated by `size`.
    this()
    {
        levels = makeAlignedBuffer!(OwnedImage!COLOR)();
    }
66 
    /// Creates a mipmap and immediately sets its number of levels and size, see `size`.
    /// maxLevel = 0 => only one image
    /// maxLevel = 1 => one image + one 2x downsampled mipmap
    /// etc...
    /// `w` and `h` are the dimensions of the base level.
    this(int maxLevel, int w, int h)
    {
        this();
        size(maxLevel, w, h);
    }
76 
77 
    /// Creates a Mipmap out of a flat OwnedImage.
    /// This takes ownership of the given image, which is now owned by the `Mipmap`.
    /// Upper levels are computed right away with the `Quality.box` setting.
    /// NOTE(review): `levels[0]` is accessed unconditionally, so `level0` presumably must
    /// have non-zero dimensions and `maxLevel` must be >= 0 -- confirm with callers.
    this(int maxLevel, OwnedImage!COLOR level0)
    {
        //PERF: could avoid to create the 0th level only to replace it later

        // Allocates every level, including a temporary level 0 of the same size as `level0`.
        this(maxLevel, level0.w, level0.h);

        // replaces level 0: the temporary image is freed, the given one is stored instead
        levels[0].destroyFree();
        levels[0] = level0;
        generateMipmaps(Quality.box);
    }
91 
92     void size(int maxLevel, int w, int h)
93     {
94         // find number of needed levels
95         int neededLevels = 0;
96         {
97             int wr = w;
98             int hr = h;
99             for (; neededLevels <= maxLevel; ++neededLevels)
100             {
101                 if (wr == 0 || hr == 0)
102                     break;
103                 wr  = (wr + 0) >> 1;
104                 hr  = (hr + 0) >> 1;
105             }
106         }
107 
108         void setLevels(int numLevels)
109         {
110             // FUTURE: cleanup excess levels
111             // should not happen until we have resizing
112             if (numLevels < levels.length)
113             {
114                 assert(false);
115             }
116 
117             int previousLength = cast(int)levels.length;
118 
119             levels.resize(numLevels);
120 
121             // create empty image for new levels
122             for(int level = previousLength; level < numLevels; ++level)
123             {
124                 levels[level] = mallocEmplace!(OwnedImage!COLOR)();
125             }
126         }
127 
128         setLevels(neededLevels);
129 
130         // resize levels
131         for (int level = 0; level < neededLevels; ++level)
132         {
133             assert(w != 0 && h != 0);
134             levels[level].size(w, h);
135             w  = (w + 0) >> 1;
136             h  = (h + 0) >> 1;
137         }
138     }
139 
140     ~this()
141     {
142         foreach(level; levels)
143             level.destroyFree();
144     }
145 
146     /// Interpolates a color between mipmap levels.  Floating-point level, spatial linear interpolation.
147     /// x and y are in base level coordinates (top-left pixel is on (0.5, 0.5) coordinates).
148     /// Clamped to borders.
149     auto linearMipmapSample(float level, float x, float y) nothrow @nogc
150     {
151         int ilevel = cast(int)level;
152         float flevel = level - ilevel;
153         vec4f levelN = linearSample(ilevel, x, y);
154         if (flevel == 0)
155             return levelN;
156 
157         auto levelNp1 = linearSample(ilevel + 1, x, y);
158 
159         return levelN * (1 - flevel) + levelNp1 * flevel;
160     }
161 
162 
163     /// Interpolates a color.  Integer level, spatial linear interpolation.
164     /// x and y are in base level coordinates (top-left pixel is on (0.5, 0.5) coordinates).
165     /// Clamped to borders.
166     auto linearSample(int level, float x, float y) nothrow @nogc
167     {
168         if (level < 0)
169             level = 0;
170         int numLevels = cast(int)levels.length;
171         if (level >= numLevels)
172             level = numLevels - 1;
173 
174         OwnedImage!COLOR image = levels[level];
175 
176 
177         static immutable float[14] factors = [ 1.0f, 0.5f, 0.25f, 0.125f,
178                                                0.0625f, 0.03125f, 0.015625f, 0.0078125f,
179                                                0.00390625f, 0.001953125f, 0.0009765625f, 0.00048828125f,
180                                                0.000244140625f, 0.0001220703125f];
181 
182         float divider = factors[level];
183         x = x * divider - 0.5f;
184         y = y * divider - 0.5f;
185 
186         float maxX = image.w - 1.001f; // avoids an edge case with truncation
187         float maxY = image.h - 1.001f;
188 
189         if (x < 0)
190             x = 0;
191         if (y < 0)
192             y = 0;
193         if (x > maxX)
194             x = maxX;
195         if (y > maxY)
196             y = maxY;
197 
198         int ix = cast(int)x;
199         int iy = cast(int)y;
200         float fx = x - ix;
201 
202         int ixp1 = ix + 1;
203         if (ixp1 >= image.w)
204             ixp1 = image.w - 1;
205         int iyp1 = iy + 1;
206         if (iyp1 >= image.h)
207             iyp1 = image.h - 1;
208 
209         float fxm1 = 1 - fx;
210         float fy = y - iy;
211         float fym1 = 1 - fy;
212 
213         COLOR[] L0 = image.scanline(iy);
214         COLOR[] L1 = image.scanline(iyp1);
215 
216         COLOR A = L0.ptr[ix];
217         COLOR B = L0.ptr[ixp1];
218         COLOR C = L1.ptr[ix];
219         COLOR D = L1.ptr[ixp1];
220 
221         static if (is(COLOR == RGBA))
222         {
223             float inv255 = 1 / 255.0f;
224 
225             version( AsmX86 )
226             {
227                 vec4f asmResult;
228 
229                 asm nothrow @nogc
230                 {
231                     movd XMM0, A;
232                     movd XMM1, B;
233                     movd XMM2, C;
234                     movd XMM3, D;
235                     pxor XMM4, XMM4;
236 
237                     punpcklbw XMM0, XMM4;
238                     punpcklbw XMM1, XMM4;
239                     punpcklbw XMM2, XMM4;
240                     punpcklbw XMM3, XMM4;
241 
242                     punpcklwd XMM0, XMM4;
243                     punpcklwd XMM1, XMM4;
244                     punpcklwd XMM2, XMM4;
245                     punpcklwd XMM3, XMM4;
246 
247                     cvtdq2ps XMM0, XMM0;
248                     cvtdq2ps XMM1, XMM1;
249 
250                     cvtdq2ps XMM2, XMM2;
251                     cvtdq2ps XMM3, XMM3;
252 
253                     movss XMM4, fxm1;
254                     pshufd XMM4, XMM4, 0;
255                     movss XMM5, fx;
256                     pshufd XMM5, XMM5, 0;
257 
258                     mulps XMM0, XMM4;
259                     mulps XMM1, XMM5;
260                     mulps XMM2, XMM4;
261                     mulps XMM3, XMM5;
262 
263                     movss XMM4, fym1;
264                     pshufd XMM4, XMM4, 0;
265                     movss XMM5, fy;
266                     pshufd XMM5, XMM5, 0;
267 
268                     addps XMM0, XMM1;
269                     addps XMM2, XMM3;
270 
271                     mulps XMM0, XMM4;
272                     mulps XMM2, XMM5;
273 
274                     addps XMM0, XMM2;
275 
276                     movups asmResult, XMM0;
277                 }
278 
279                 // Uncomment to check
280     /*
281                 vec4f vA = vec4f(A.r, A.g, A.b, A.a);
282                 vec4f vB = vec4f(B.r, B.g, B.b, B.a);
283                 vec4f vC = vec4f(C.r, C.g, C.b, C.a);
284                 vec4f vD = vec4f(D.r, D.g, D.b, D.a);
285 
286                 vec4f up = vA * fxm1 + vB * fx;
287                 vec4f down = vC * fxm1 + vD * fx;
288                 vec4f dResult = up * fym1 + down * fy;
289 
290                 import gfm.core;
291 
292                 if (dResult.distanceTo(result) < 1.0f)
293                     debugBreak();
294     */
295 
296                 vec4f result = asmResult;
297                 return result;
298             }
299             else
300             {
301                 vec4f vA = vec4f(A.r, A.g, A.b, A.a);
302                 vec4f vB = vec4f(B.r, B.g, B.b, B.a);
303                 vec4f vC = vec4f(C.r, C.g, C.b, C.a);
304                 vec4f vD = vec4f(D.r, D.g, D.b, D.a);
305 
306 
307 
308                 vec4f up = vA * fxm1 + vB * fx;
309                 vec4f down = vC * fxm1 + vD * fx;
310                 vec4f dResult = up * fym1 + down * fy;
311 
312               //  assert(dResult.distanceTo(asmResult) < 1.0f);
313 
314                 return dResult;
315             }
316         }
317         else
318         {
319             float up = A.l * fxm1 + B.l * fx;
320             float down = C.l * fxm1 + D.l * fx;
321             return up * fym1 + down * fy;
322         }
323     }
324 
    /// Returns: Width of the base level.
    /// Reads `levels[0]`, hence at least one level must exist.
    int width() pure const nothrow @nogc
    {
        return levels[0].w;
    }
330 
    /// Returns: Height of the base level.
    /// Reads `levels[0]`, hence at least one level must exist.
    int height() pure const nothrow @nogc
    {
        return levels[0].h;
    }
336 
    /// Returns: Number of levels. The maximum level is numLevels() - 1.
    /// This is the count of allocated levels, which can be lower than the `maxLevel + 1`
    /// requested in `size` when the image becomes too small to be halved further.
    int numLevels() pure const nothrow @nogc
    {
        return cast(int)levels.length;
    }
342 
343     /// Regenerates the whole upper levels.
344     void generateMipmaps(Quality quality) nothrow @nogc
345     {
346         box2i updateRect = box2i(0, 0, width(), height());
347         for (int level = 1; level < numLevels(); ++level)
348         {
349             // HACK: Force cubic filter past a level else it makes ugly looking mipmaps
350             if (level >= 3 && quality == Quality.box)
351                 quality = Quality.cubic;
352 
353             updateRect = generateNextLevel(quality, updateRect, level);
354         }
355     }
356 
357     /// Regenerates a single mipmap level based on changes in the provided rectangle (expressed in level 0 coordinates).
358     /// updateRect expressed in level 0 coordinates
359     /// In general if you have several subparts of mipmaps to update, make sure a level is fully completed
360     /// before computing the next one.
361     box2i generateNextLevel(Quality quality, box2i updateRectPreviousLevel, int level) nothrow @nogc
362     {
363         OwnedImage!COLOR previousLevel = levels[level - 1];
364         box2i updateRect = impactOnNextLevel(quality, updateRectPreviousLevel, previousLevel.w, previousLevel.h);
365         generateLevel(level, quality, updateRect);
366         return updateRect;
367     }
368 
    /// Regenerates the `updateRect` area of one level from the level right below it.
    /// Params:
    ///     level = Index of the level to regenerate, must be greater than 0.
    ///     quality = Filter to use. `boxAlphaCov` and `boxAlphaCovIntoPremul` are only
    ///               implemented for RGBA, and hit `assert(false)` for other color types.
    ///     updateRect = Area to regenerate, expressed in coordinates of level `level`.
    void generateLevel(int level, Quality quality, box2i updateRect) nothrow @nogc
    {
        assert(level > 0);
        OwnedImage!COLOR thisLevel = levels[level];
        OwnedImage!COLOR previousLevel = levels[level - 1];

        // Dispatch on the filter; the color type is resolved at compile-time.
        final switch(quality) with (Quality)
        {
            case box:

                static if (is(COLOR == RGBA))
                    generateLevelBoxRGBA(thisLevel, previousLevel, updateRect);
                else static if (is(COLOR == L16))
                    generateLevelBoxL16(thisLevel, previousLevel, updateRect);
                else
                    static assert(false, "not implemented");

                // Set to true to verify the optimized box filter against a naive implementation.
                enum checkBoxMipmaps = false;

                static if (checkBoxMipmaps)
                {
                    for (int y = updateRect.min.y; y < updateRect.max.y; ++y)
                    {
                        COLOR[] L0 = previousLevel.scanline(y * 2);
                        COLOR[] L1 = previousLevel.scanline(y * 2 + 1);
                        COLOR[] dest = thisLevel.scanline(y);

                        for (int x = updateRect.min.x; x < updateRect.max.x; ++x)
                        {
                            // A B
                            // C D
                            COLOR A = L0[2 * x];
                            COLOR B = L0[2 * x + 1];
                            COLOR C = L1[2 * x];
                            COLOR D = L1[2 * x + 1];
                            assert(dest[x] == COLOR.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D));
                        }
                    }
                }
                break;

        case boxAlphaCov:

            static if (is(COLOR == RGBA))
            {
                generateLevelBoxAlphaCovRGBA(thisLevel, previousLevel, updateRect);

                // Disabled reference implementation, kept to document the expected result:
                // an alpha-weighted average of the 2x2 block, and a plain rounded average for alpha.
                static if (false)
                {
                    void checkLevelBoxAlphaConvRGBA(Image!RGBA* thisLevel, Image!RGBA* previousLevel, box2i updateRect)
                    {
                        for (int y = updateRect.min.y; y < updateRect.max.y; ++y)
                        {
                            RGBA[] L0 = previousLevel.scanline(y * 2);
                            RGBA[] L1 = previousLevel.scanline(y * 2 + 1);
                            RGBA[] dest = thisLevel.scanline(y);

                            for (int x = updateRect.min.x; x < updateRect.max.x; ++x)
                            {
                                // A B
                                // C D
                                RGBA A = L0.ptr[2 * x];
                                RGBA B = L0.ptr[2 * x + 1];
                                RGBA C = L1.ptr[2 * x];
                                RGBA D = L1.ptr[2 * x + 1];

                                int alphaA = A.a;
                                int alphaB = B.a;
                                int alphaC = C.a;
                                int alphaD = D.a;
                                int sum = alphaA + alphaB + alphaC + alphaD;
                                if (sum == 0)
                                {
                                    // Fully transparent block: the top-left pixel is expected as is
                                    assert(dest.ptr[x] == A);
                                }
                                else
                                {
                                    int destAlpha = cast(ubyte)( (alphaA + alphaB + alphaC + alphaD + 2) >> 2 );
                                    int red =   (A.r * alphaA + B.r * alphaB + C.r * alphaC + D.r * alphaD);
                                    int green = (A.g * alphaA + B.g * alphaB + C.g * alphaC + D.g * alphaD);
                                    int blue =  (A.b * alphaA + B.b* alphaB + C.b * alphaC + D.b * alphaD);
                                    float invSum = 1 / cast(float)(sum);

                                    RGBA finalColor = RGBA( cast(ubyte)(0.5f + red * invSum),
                                                            cast(ubyte)(0.5f + green * invSum),
                                                            cast(ubyte)(0.5f + blue * invSum),
                                                            cast(ubyte)destAlpha );
                                    assert(dest.ptr[x] == finalColor);
                                }
                            }
                        }
                    }
                    checkLevelBoxAlphaConvRGBA(thisLevel, previousLevel, updateRect);
                }
                break;
            }
            else
                assert(false); // alpha-weighted filtering is only implemented for RGBA

        case boxAlphaCovIntoPremul:

            static if (is(COLOR == RGBA))
            {
                generateLevelBoxAlphaCovIntoPremulRGBA(thisLevel, previousLevel, updateRect);
                break;
            }
            else
                assert(false); // alpha-weighted filtering is only implemented for RGBA

        case cubic:
            static if (is(COLOR == RGBA))
            {
                generateLevelCubicRGBA(thisLevel, previousLevel, updateRect);
                break;
            }
            else static if (is(COLOR == L16))
            {
                generateLevelCubicL16(thisLevel, previousLevel, updateRect);
                break;
            }
            else
                static assert(false, "not implemented");


        }
    }
497 
498 
499 private:
500     /// Computes impact of updating the area box on next level
501     static box2i impactOnNextLevel(Quality quality, box2i area, int currentLevelWidth, int currentLevelHeight) pure nothrow @nogc
502     {
503         box2i maxArea = box2i(0, 0, currentLevelWidth / 2, currentLevelHeight / 2);
504 
505         final  switch(quality) with (Quality)
506         {
507         case box:
508         case boxAlphaCov:
509         case boxAlphaCovIntoPremul:
510             int xmin = area.min.x / 2;
511             int ymin = area.min.y / 2;
512             int xmax = (area.max.x + 1) / 2;
513             int ymax = (area.max.y + 1) / 2;
514             return box2i(xmin, ymin, xmax, ymax).intersection(maxArea);
515 
516         case cubic:
517             int xmin = (area.min.x - 1) / 2;
518             int ymin = (area.min.y - 1) / 2;
519             int xmax = (area.max.x + 2) / 2;
520             int ymax = (area.max.y + 2) / 2;
521             return box2i(xmin, ymin, xmax, ymax).intersection(maxArea);
522         }
523 
524     }
525 }
526 
// Checks the number of levels and the level sizes computed by `Mipmap.size`.
unittest
{
    // Power-of-two size: all of the 5 requested levels exist
    Mipmap!RGBA a = new Mipmap!RGBA();
    a.size(4, 256, 256);
    assert(a.numLevels() == 5);
    assert(a.width() == 256 && a.height() == 256);
    assert(a.levels[4].w == 16 && a.levels[4].h == 16);
    a.destroy();

    // Non-power-of-two size: levels stop once the width cannot be halved anymore
    // 17x333 => 8x166 => 4x83 => 2x41 => 1x20
    Mipmap!L16 b = new Mipmap!L16();
    b.size(16, 17, 333);
    assert(b.numLevels() == 5);
    assert(b.width() == 17 && b.height() == 333);
    assert(b.levels[4].w == 1 && b.levels[4].h == 20);
    b.destroy();
}
537 
538 
539 private:
540 
// Constants meant to be loaded as a whole into XMM registers by the inline assembly of this module.
// They are 16-byte aligned since they are read with aligned loads (`movaps`, `movdqa`).
align(16) static immutable short[8] xmmTwoShort = [ 2, 2, 2, 2, 2, 2, 2, 2 ]; // rounding term before `>> 2`, 16-bit lanes
align(16) static immutable int[4] xmmTwoInt = [ 2, 2, 2, 2 ];                 // rounding term before `>> 2`, 32-bit lanes
align(16) static immutable float[4] xmm0_5 = [ 0.5f, 0.5f, 0.5f, 0.5f ];
align(16) static immutable int[4] xmm512 = [ 512, 512, 512, 512 ];
align(16) static immutable short[8] xmm11113333 = [ 1, 1, 1, 1, 3, 3, 3, 3 ];
align(16) static immutable short[8] xmm33331111 = [ 3, 3, 3, 3, 1, 1, 1, 1 ];
align(16) static immutable short[8] xmm33339999 = [ 3, 3, 3, 3, 9, 9, 9, 9 ];
align(16) static immutable short[8] xmm99993333 = [ 9, 9, 9, 9, 3, 3, 3, 3 ];
align(16) static immutable short[8] xmm32       = [ 32, 32, 32, 32, 32, 32, 32, 32 ];
550 
551 
/// Box-filters an area of an RGBA level out of the previous level.
/// Each destination pixel is the per-channel rounded average `(a + b + c + d + 2) >> 2`
/// of the matching 2x2 block of `previousLevel`.
/// Params:
///     thisLevel = Destination level.
///     previousLevel = Source level, read at twice the destination coordinates.
///     updateRect = Area to compute, in `thisLevel` coordinates.
/// Note: rows are stepped by the image width, which assumes scanlines are stored contiguously.
void generateLevelBoxRGBA(OwnedImage!RGBA thisLevel,
                          OwnedImage!RGBA previousLevel,
                          box2i updateRect) pure nothrow @nogc
{
    int width = updateRect.width();
    int height = updateRect.height();

    int previousPitch = previousLevel.w;
    int thisPitch = thisLevel.w;

    // L0 and L1 are the two source rows feeding the current destination row
    RGBA* L0 = previousLevel.scanline(updateRect.min.y * 2).ptr + updateRect.min.x * 2;
    RGBA* L1 = L0 + previousPitch;
    RGBA* dest = thisLevel.scanline(updateRect.min.y).ptr + updateRect.min.x;

    for (int y = 0; y < height; ++y)
    {
        version(inlineAsmCanLoadGlobalsInPIC)
        {
            version(D_InlineAsm_X86)
            {
                // Each iteration reads 4 pixels from each source row and writes 2 destination pixels
                asm pure nothrow @nogc
                {
                    mov ECX, width;
                    shr ECX, 1;
                    jz no_need; // ECX = 0 => no pair of pixels to process

                    mov EAX, L0;
                    mov EDX, L1;
                    mov EDI, dest;
                    movaps XMM5, xmmTwoShort;

                loop_ecx:
                    movdqu XMM0, [EAX]; // A B E F
                    pxor XMM4, XMM4;
                    movdqu XMM1, [EDX]; // C D G H
                    movdqa XMM2, XMM0;
                    movdqa XMM3, XMM1;
                    punpcklbw XMM0, XMM4; // A B in short
                    punpcklbw XMM1, XMM4; // C D in short
                    punpckhbw XMM2, XMM4; // E F in short
                    punpckhbw XMM3, XMM4; // G H in short
                    paddusw XMM0, XMM1; // A + C | B + D
                    paddusw XMM2, XMM3; // E + G | F + H
                    movdqa XMM1, XMM0;
                    movdqa XMM3, XMM2;
                    psrldq XMM1, 8;
                    psrldq XMM3, 8;
                    add EDI, 8;
                    paddusw XMM0, XMM1; // A + B + C + D | garbage
                    paddusw XMM2, XMM3; // E + F + G + H | garbage
                    paddusw XMM0, XMM5; // A + B + C + D + 2 | garbage
                    paddusw XMM2, XMM5; // E + F + G + H + 2 | garbage
                    psrlw XMM0, 2; // (A + B + C + D + 2) >> 2 | garbage
                    psrlw XMM2, 2; // (E + F + G + H + 2) >> 2 | garbage
                    add EAX, 16;
                    punpcklqdq XMM0, XMM2;
                    add EDX, 16;
                    packuswb XMM0, XMM4; // (A + B + C + D + 2) >> 2 | (E + F + G + H + 2) >> 2 | 0 | 0
                    movq [EDI-8], XMM0;
                    sub ECX, 1;
                    jnz loop_ecx;
                no_need: ;
                }

                // Eventually filter the last pixel, left over when width is odd
                int remaining = width & ~1;
                for (int x = remaining; x < width; ++x)
                {
                    RGBA A = L0[2 * x];
                    RGBA B = L0[2 * x + 1];
                    RGBA C = L1[2 * x];
                    RGBA D = L1[2 * x + 1];
                    dest[x] = RGBA.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D);
                }
            }
            else version(D_InlineAsm_X86_64)
            {
                // Same loop as the 32-bit version, with 64-bit pointer registers
                asm pure nothrow @nogc
                {
                    mov ECX, width;
                    shr ECX, 1;
                    jz no_need; // ECX = 0 => no pair of pixels to process

                    mov RAX, L0;
                    mov RDX, L1;
                    mov RDI, dest;
                    movaps XMM5, xmmTwoShort;

                loop_ecx:
                    movdqu XMM0, [RAX]; // A B E F
                    pxor XMM4, XMM4;
                    movdqu XMM1, [RDX]; // C D G H
                    movdqa XMM2, XMM0;
                    movdqa XMM3, XMM1;
                    punpcklbw XMM0, XMM4; // A B in short
                    punpcklbw XMM1, XMM4; // C D in short
                    punpckhbw XMM2, XMM4; // E F in short
                    punpckhbw XMM3, XMM4; // G H in short
                    paddusw XMM0, XMM1; // A + C | B + D
                    paddusw XMM2, XMM3; // E + G | F + H
                    movdqa XMM1, XMM0;
                    movdqa XMM3, XMM2;
                    psrldq XMM1, 8;
                    psrldq XMM3, 8;
                    add RDI, 8;
                    paddusw XMM0, XMM1; // A + B + C + D | garbage
                    paddusw XMM2, XMM3; // E + F + G + H | garbage
                    paddusw XMM0, XMM5; // A + B + C + D + 2 | garbage
                    paddusw XMM2, XMM5; // E + F + G + H + 2 | garbage
                    psrlw XMM0, 2; // (A + B + C + D + 2) >> 2 | garbage
                    psrlw XMM2, 2; // (E + F + G + H + 2) >> 2 | garbage
                    add RAX, 16;
                    punpcklqdq XMM0, XMM2;
                    add RDX, 16;
                    packuswb XMM0, XMM4; // (A + B + C + D + 2) >> 2 | (E + F + G + H + 2) >> 2 | 0 | 0
                    movq [RDI-8], XMM0;
                    sub ECX, 1;
                    jnz loop_ecx;
                no_need: ;
                }

                // Eventually filter the last pixel, left over when width is odd
                int remaining = width & ~1;
                for (int x = remaining; x < width; ++x)
                {
                    RGBA A = L0[2 * x];
                    RGBA B = L0[2 * x + 1];
                    RGBA C = L1[2 * x];
                    RGBA D = L1[2 * x + 1];
                    dest[x] = RGBA.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D);
                }
            }
            else
                static assert(false);
        }
        else
        {
            // Portable version, one destination pixel at a time
            for (int x = 0; x < width; ++x)
            {
                // A B
                // C D
                RGBA A = L0[2 * x];
                RGBA B = L0[2 * x + 1];
                RGBA C = L1[2 * x];
                RGBA D = L1[2 * x + 1];

                dest[x] = RGBA.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D);
            }
        }

        // One destination row consumes two source rows
        L0 += (2 * previousPitch);
        L1 += (2 * previousPitch);
        dest += thisPitch;
    }
}
707 
/// Box-filters an area of a L16 level out of the previous level.
/// Each destination pixel is the rounded average `(a + b + c + d + 2) >> 2`
/// of the matching 2x2 block of `previousLevel`.
/// Params:
///     thisLevel = Destination level.
///     previousLevel = Source level, read at twice the destination coordinates.
///     updateRect = Area to compute, in `thisLevel` coordinates.
/// Note: rows are stepped by the image width, which assumes scanlines are stored contiguously.
void generateLevelBoxL16(OwnedImage!L16 thisLevel,
                         OwnedImage!L16 previousLevel,
                         box2i updateRect) pure nothrow @nogc
{
    int width = updateRect.width();
    int height = updateRect.height();

    int previousPitch = previousLevel.w;
    int thisPitch = thisLevel.w;

    // L0 and L1 are the two source rows feeding the current destination row
    L16* L0 = previousLevel.scanline(updateRect.min.y * 2).ptr + updateRect.min.x * 2;
    L16* L1 = L0 + previousPitch;

    L16* dest = thisLevel.scanline(updateRect.min.y).ptr + updateRect.min.x;

    for (int y = 0; y < height; ++y)
    {
        version(inlineAsmCanLoadGlobalsInPIC)
        {
            version(D_InlineAsm_X86)
            {
                // Each iteration reads 8 pixels from each source row and writes 4 destination pixels
                asm pure nothrow @nogc
                {
                    mov ECX, width;
                    shr ECX, 2;
                    jz no_need; // ECX = 0 => less than 4 pixels to process

                    mov EAX, L0;
                    mov EDX, L1;
                    mov EDI, dest;
                    movdqa XMM5, xmmTwoInt;
                    pxor XMM4, XMM4;

                loop_ecx:
                    movdqu XMM0, [EAX]; // A B E F I J M N
                    movdqu XMM1, [EDX]; // C D G H K L O P

                    add EAX, 16;
                    add EDX, 16;

                    movdqa XMM2, XMM0;
                    movdqa XMM3, XMM1;

                    punpcklwd XMM0, XMM4; // A B E F in int32
                    punpckhwd XMM2, XMM4; // I J M N in int32
                    punpcklwd XMM1, XMM4; // C D G H in int32
                    punpckhwd XMM3, XMM4; // K L O P in int32

                    paddd XMM0, XMM1; // A+C B+D E+G F+H
                    paddd XMM2, XMM3; // I+K J+L M+O N+P

                    movdqa XMM1, XMM0;
                    movdqa XMM3, XMM2;

                    psrldq XMM1, 4; // B+D E+G F+H 0
                    psrldq XMM3, 4; // J+L M+O N+P 0

                    paddd XMM0, XMM1; // A+B+C+D garbage E+F+G+H garbage
                    paddd XMM2, XMM3; // I+J+K+L garbage M+N+O+P garbage

                    pshufd XMM0, XMM0, 0b00001000; // A+B+C+D E+F+G+H garbage garbage
                    pshufd XMM2, XMM2, 0b00001000; // I+J+K+L M+N+O+P garbage garbage

                    punpcklqdq XMM0, XMM2; // A+B+C+D E+F+G+H I+J+K+L M+N+O+P
                    paddd XMM0, XMM5; // add 2
                    psrld XMM0, 2; // >> 2

                    // because packusdw is not available before SSE4.1
                    // Extend sign bit to the right, so that the signed saturation of packssdw
                    // leaves the low 16 bits of each value untouched
                    pslld XMM0, 16;
                    psrad XMM0, 16;
                    add EDI, 8;
                    packssdw XMM0, XMM4;

                    movq [EDI-8], XMM0;
                    sub ECX, 1;
                    jnz loop_ecx;
                no_need: ;
                }

                // Eventually filter the 0 to 3 pixels left over by the assembly loop
                int remaining = width & ~3;
                for (int x = remaining; x < width; ++x)
                {
                    L16 A = L0[2 * x];
                    L16 B = L0[2 * x + 1];
                    L16 C = L1[2 * x];
                    L16 D = L1[2 * x + 1];
                    dest[x] = L16.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D);
                }
            }
            else version(D_InlineAsm_X86_64)
            {
                // Same loop as the 32-bit version, with 64-bit pointer registers
                asm pure nothrow @nogc
                {
                    mov ECX, width;
                    shr ECX, 2;
                    jz no_need; // ECX = 0 => less than 4 pixels to process

                    mov RAX, L0;
                    mov RDX, L1;
                    mov RDI, dest;
                    movdqa XMM5, xmmTwoInt;
                    pxor XMM4, XMM4;

                loop_ecx:
                    movdqu XMM0, [RAX]; // A B E F I J M N
                    movdqu XMM1, [RDX]; // C D G H K L O P

                    add RAX, 16;
                    add RDX, 16;

                    movdqa XMM2, XMM0;
                    movdqa XMM3, XMM1;

                    punpcklwd XMM0, XMM4; // A B E F in int32
                    punpckhwd XMM2, XMM4; // I J M N in int32
                    punpcklwd XMM1, XMM4; // C D G H in int32
                    punpckhwd XMM3, XMM4; // K L O P in int32

                    paddd XMM0, XMM1; // A+C B+D E+G F+H
                    paddd XMM2, XMM3; // I+K J+L M+O N+P

                    movdqa XMM1, XMM0;
                    movdqa XMM3, XMM2;

                    psrldq XMM1, 4; // B+D E+G F+H 0
                    psrldq XMM3, 4; // J+L M+O N+P 0

                    paddd XMM0, XMM1; // A+B+C+D garbage E+F+G+H garbage
                    paddd XMM2, XMM3; // I+J+K+L garbage M+N+O+P garbage

                    pshufd XMM0, XMM0, 0b00001000; // A+B+C+D E+F+G+H garbage garbage
                    pshufd XMM2, XMM2, 0b00001000; // I+J+K+L M+N+O+P garbage garbage

                    punpcklqdq XMM0, XMM2; // A+B+C+D E+F+G+H I+J+K+L M+N+O+P
                    paddd XMM0, XMM5; // add 2
                    psrld XMM0, 2; // >> 2

                    // because packusdw is not available before SSE4.1
                    // Extend sign bit to the right, so that the signed saturation of packssdw
                    // leaves the low 16 bits of each value untouched
                    pslld XMM0, 16;
                    psrad XMM0, 16;
                    add RDI, 8;
                    packssdw XMM0, XMM4;

                    movq [RDI-8], XMM0;
                    sub ECX, 1;
                    jnz loop_ecx;
                no_need: ;
                }

                // Eventually filter the 0 to 3 pixels left over by the assembly loop
                int remaining = width & ~3;
                for (int x = remaining; x < width; ++x)
                {
                    L16 A = L0[2 * x];
                    L16 B = L0[2 * x + 1];
                    L16 C = L1[2 * x];
                    L16 D = L1[2 * x + 1];
                    dest[x] = L16.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D);
                }
            }
            else
                static assert(false);
        }
        else
        {
            // Portable version, one destination pixel at a time
            for (int x = 0; x < width; ++x)
            {
                // A B
                // C D
                L16 A = L0[2 * x];
                L16 B = L0[2 * x + 1];
                L16 C = L1[2 * x];
                L16 D = L1[2 * x + 1];

                dest[x] = L16.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D);
            }
        }

        // One destination row consumes two source rows
        L0 += (2 * previousPitch);
        L1 += (2 * previousPitch);
        dest += thisPitch;
    }
}
894 
895 
/// Fills `updateRect` of `thisLevel` by 2x2 box-filtering `previousLevel`,
/// using each source pixel's alpha as the weight of its color.
///
/// For every destination pixel, the four source pixels
///     A B
///     C D
/// at coordinates (2x, 2y) .. (2x+1, 2y+1) of `previousLevel` are combined as:
///   - color = sum(color_i * alpha_i) / sum(alpha_i), rounded to nearest;
///   - alpha = (Aa + Ba + Ca + Da + 2) >> 2;
///   - if all four alphas are zero the result is RGBA(0, 0, 0, 0).
///
/// Params:
///     thisLevel     = destination image, written inside `updateRect` only.
///     previousLevel = source image, read inside `updateRect` scaled by 2.
///     updateRect    = area to generate, in `thisLevel` coordinates.
///
/// An empty `updateRect` is a no-op. Rows are stepped using the images' `w`
/// member as the pitch in pixels.
void generateLevelBoxAlphaCovRGBA(OwnedImage!RGBA thisLevel,
                                  OwnedImage!RGBA previousLevel,
                                  box2i updateRect) nothrow @nogc
{
    int width = updateRect.width();
    int height = updateRect.height();

    // Nothing to generate for an empty rectangle. This must be checked before
    // entering the assembly paths: their loops decrement the counter *before*
    // testing it, so a zero width would wrap around and run out of bounds
    // (the asserts below vanish in release builds).
    if (width <= 0 || height <= 0)
        return;

    int previousPitch = previousLevel.w;
    int thisPitch = thisLevel.w;

    // L0 and L1 are the two source rows feeding the current destination row.
    RGBA* L0 = previousLevel.scanline(updateRect.min.y * 2).ptr + updateRect.min.x * 2;
    RGBA* L1 = L0 + previousPitch;

    RGBA* dest = thisLevel.scanline(updateRect.min.y).ptr + updateRect.min.x;

    for (int y = 0; y < height; ++y)
    {
        version(inlineAsmCanLoadGlobalsInPIC)
        {
            version(D_InlineAsm_X86)
            {
                assert(width > 0); // guaranteed by the early return above
                asm nothrow @nogc
                {
                    mov ECX, width;

                    mov EAX, L0;
                    mov EDX, L1;
                    mov EDI, dest;

                    loop_ecx:

                        movq XMM0, [EAX];                  // Ar Ag Ab Aa Br Bg Bb Ba + zeroes
                        movq XMM1, [EDX];                  // Cr Cg Cb Ca Dr Dg Db Da + zeroes
                        pxor XMM4, XMM4;                   // XMM4 is reused as scratch below, so re-zero it each iteration
                        add EAX, 8;
                        add EDX, 8;

                        punpcklbw XMM0, XMM4;              // Ar Ag Ab Aa Br Bg Bb Ba
                        punpcklbw XMM1, XMM4;              // Cr Cg Cb Ca Dr Dg Db Da

                        movdqa XMM2, XMM0;
                        punpcklwd XMM0, XMM1;              // Ar Cr Ag Cg Ab Cb Aa Ca
                        punpckhwd XMM2, XMM1;              // Br Dr Bg Dg Bb Db Ba Da

                        // perhaps unnecessary
                        movdqa XMM3, XMM0;
                        punpcklwd XMM0, XMM2;              // Ar Br Cr Dr Ag Bg Cg Dg
                        punpckhwd XMM3, XMM2;              // Ab Bb Cb Db Aa Ba Ca Da

                        movdqa XMM1, XMM3;
                        punpckhqdq XMM1, XMM1;             // Aa Ba Ca Da Aa Ba Ca Da

                        // Are alpha all zeroes? if so, early continue.
                        movdqa XMM2, XMM1;
                        pcmpeqb XMM2, XMM4;
                        add EDI, 4;
                        pmovmskb ESI, XMM2;
                        cmp ESI, 0xffff;
                        jnz non_null;

                            pxor XMM0, XMM0;
                            sub ECX, 1;
                            movd [EDI-4], XMM0;            // dest[x] = transparent black (movd leaves the flags of `sub` intact)
                            jnz loop_ecx;
                            jmp end_of_loop;

                        non_null:

                            pmaddwd XMM0, XMM1;            // Ar*Aa+Br*Ba Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da
                            pmaddwd XMM3, XMM1;            // Ab*Aa+Bb*Ba Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da

                            // Starting computing sum of coefficients too
                            punpcklwd XMM1, XMM4;      // Aa Ba Ca Da

                            movdqa XMM2, XMM0;
                            movdqa XMM5, XMM3;
                            movdqa XMM4, XMM1;
                            psrldq XMM4, 8;

                            psrldq XMM2, 4;                // Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da 0
                            psrldq XMM5, 4;                // Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da 0
                            paddq XMM1, XMM4;              // Aa+Ca Ba+Da garbage garbage
                            movdqa XMM4, XMM1;

                            paddd XMM0, XMM2;              // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da garbage Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage
                            paddd XMM3, XMM5;              // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da garbage Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage
                            psrldq XMM4, 4;

                            pshufd XMM0, XMM0, 0b00001000; // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage garbage
                            paddq XMM1, XMM4;          // Aa+Ba+Ca+Da garbage garbage garbage
                            pshufd XMM3, XMM3, 0b00001000; // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage garbage

                            punpcklqdq XMM0, XMM3;     // fR fG fB fA
                            pshufd XMM1, XMM1, 0;      // broadcast the alpha sum to all four lanes

                            cvtdq2ps XMM0, XMM0;

                            cvtdq2ps XMM3, XMM1;       // sum sum sum sum

                            divps XMM0, XMM3;          // fR/sum fG/sum fB/sum fA/sum
                            addps XMM0, xmm0_5;
                            cvttps2dq XMM0, XMM0;      // return into integer domain using cast(int)(x + 0.5f)

                            paddd XMM1, xmmTwoInt;
                            psrld XMM1, 2;             // finalAlpha finalAlpha finalAlpha finalAlpha

                            pslldq XMM0, 4;            // 0 fR/sum fG/sum fB/sum
                            pslldq XMM1, 12;           // 0 0 0 finalAlpha
                            psrldq XMM0, 4;            // fR/sum fG/sum fB/sum 0

                            por XMM0, XMM1;            // fR/sum fG/sum fB/sum finalAlpha
                            pxor XMM3, XMM3;
                            packssdw XMM0, XMM3;       // same in words
                            packuswb XMM0, XMM3;       // same in bytes

                            sub ECX, 1;
                            movd [EDI-4], XMM0;            // dest[x] = filtered pixel (movd leaves the flags of `sub` intact)
                    jnz loop_ecx;
                    end_of_loop: ;
                }
            }
            else version(D_InlineAsm_X86_64)
            {
                assert(width > 0); // guaranteed by the early return above
                asm nothrow @nogc
                {
                    mov ECX, width;

                    mov RAX, L0;
                    mov RDX, L1;
                    mov RDI, dest;

                loop_ecx:

                    movq XMM0, [RAX];                  // Ar Ag Ab Aa Br Bg Bb Ba + zeroes
                    movq XMM1, [RDX];                  // Cr Cg Cb Ca Dr Dg Db Da + zeroes
                    pxor XMM4, XMM4;                   // XMM4 is reused as scratch below, so re-zero it each iteration
                    add RAX, 8;
                    add RDX, 8;

                    punpcklbw XMM0, XMM4;              // Ar Ag Ab Aa Br Bg Bb Ba
                    punpcklbw XMM1, XMM4;              // Cr Cg Cb Ca Dr Dg Db Da

                    movdqa XMM2, XMM0;
                    punpcklwd XMM0, XMM1;              // Ar Cr Ag Cg Ab Cb Aa Ca
                    punpckhwd XMM2, XMM1;              // Br Dr Bg Dg Bb Db Ba Da

                    // perhaps unnecessary
                    movdqa XMM3, XMM0;
                    punpcklwd XMM0, XMM2;              // Ar Br Cr Dr Ag Bg Cg Dg
                    punpckhwd XMM3, XMM2;              // Ab Bb Cb Db Aa Ba Ca Da

                    movdqa XMM1, XMM3;
                    punpckhqdq XMM1, XMM1;             // Aa Ba Ca Da Aa Ba Ca Da

                    // Are alpha all zeroes? if so, early continue.
                    movdqa XMM2, XMM1;
                    pcmpeqb XMM2, XMM4;
                    add RDI, 4;
                    pmovmskb ESI, XMM2;
                    cmp ESI, 0xffff;
                    jnz non_null;

                    pxor XMM0, XMM0;
                    sub ECX, 1;
                    movd [RDI-4], XMM0;            // dest[x] = transparent black (movd leaves the flags of `sub` intact)
                    jnz loop_ecx;
                    jmp end_of_loop;

                non_null:

                    pmaddwd XMM0, XMM1;            // Ar*Aa+Br*Ba Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da
                    pmaddwd XMM3, XMM1;            // Ab*Aa+Bb*Ba Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da

                    // Starting computing sum of coefficients too
                    punpcklwd XMM1, XMM4;      // Aa Ba Ca Da

                    movdqa XMM2, XMM0;
                    movdqa XMM5, XMM3;
                    movdqa XMM4, XMM1;
                    psrldq XMM4, 8;

                    psrldq XMM2, 4;                // Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da 0
                    psrldq XMM5, 4;                // Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da 0
                    paddq XMM1, XMM4;              // Aa+Ca Ba+Da garbage garbage
                    movdqa XMM4, XMM1;

                    paddd XMM0, XMM2;              // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da garbage Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage
                    paddd XMM3, XMM5;              // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da garbage Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage
                    psrldq XMM4, 4;

                    pshufd XMM0, XMM0, 0b00001000; // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage garbage
                    paddq XMM1, XMM4;          // Aa+Ba+Ca+Da garbage garbage garbage
                    pshufd XMM3, XMM3, 0b00001000; // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage garbage

                    punpcklqdq XMM0, XMM3;     // fR fG fB fA
                    pshufd XMM1, XMM1, 0;      // broadcast the alpha sum to all four lanes

                    cvtdq2ps XMM0, XMM0;

                    cvtdq2ps XMM3, XMM1;       // sum sum sum sum

                    divps XMM0, XMM3;          // fR/sum fG/sum fB/sum fA/sum
                    addps XMM0, xmm0_5;
                    cvttps2dq XMM0, XMM0;      // return into integer domain using cast(int)(x + 0.5f)

                    paddd XMM1, xmmTwoInt;
                    psrld XMM1, 2;             // finalAlpha finalAlpha finalAlpha finalAlpha

                    pslldq XMM0, 4;            // 0 fR/sum fG/sum fB/sum
                    pslldq XMM1, 12;           // 0 0 0 finalAlpha
                    psrldq XMM0, 4;            // fR/sum fG/sum fB/sum 0

                    por XMM0, XMM1;            // fR/sum fG/sum fB/sum finalAlpha
                    pxor XMM3, XMM3;
                    packssdw XMM0, XMM3;       // same in words
                    packuswb XMM0, XMM3;       // same in bytes

                    sub ECX, 1;
                    movd [RDI-4], XMM0;            // dest[x] = filtered pixel (movd leaves the flags of `sub` intact)
                    jnz loop_ecx;
                end_of_loop: ;
                }
            }
            else
                static assert(false);
        }
        else
        {
            // Portable reference implementation.
            for (int x = 0; x < width; ++x)
            {
                // A B
                // C D
                RGBA A = L0[2 * x];
                RGBA B = L0[2 * x + 1];
                RGBA C = L1[2 * x];
                RGBA D = L1[2 * x + 1];

                int alphaA = A.a;
                int alphaB = B.a;
                int alphaC = C.a;
                int alphaD = D.a;
                int sum = alphaA + alphaB + alphaC + alphaD;
                if (sum == 0)
                {
                    // No coverage at all: avoid dividing by zero.
                    dest[x] = RGBA(0,0,0,0);
                }
                else
                {
                    int destAlpha = cast(ubyte)( (alphaA + alphaB + alphaC + alphaD + 2) >> 2 );
                    int red =   (A.r * alphaA + B.r * alphaB + C.r * alphaC + D.r * alphaD);
                    int green = (A.g * alphaA + B.g * alphaB + C.g * alphaC + D.g * alphaD);
                    int blue =  (A.b * alphaA + B.b* alphaB + C.b * alphaC + D.b * alphaD);
                    float invSum = 1 / cast(float)(sum);

                    RGBA finalColor = RGBA( cast(ubyte)(0.5f + red * invSum),
                                            cast(ubyte)(0.5f + green * invSum),
                                            cast(ubyte)(0.5f + blue * invSum),
                                            cast(ubyte)destAlpha );
                    dest[x] = finalColor;
                }
            }
        }

        // Flip to true to cross-check the row just written against the
        // reference formula (useful when touching the assembly paths).
        enum verify = false;

        static if (verify)
        {
            for (int x = 0; x < width; ++x)
            {
                // A B
                // C D
                RGBA A = L0[2 * x];
                RGBA B = L0[2 * x + 1];
                RGBA C = L1[2 * x];
                RGBA D = L1[2 * x + 1];

                int alphaA = A.a;
                int alphaB = B.a;
                int alphaC = C.a;
                int alphaD = D.a;
                int sum = alphaA + alphaB + alphaC + alphaD;
                if (sum == 0)
                {
                    assert(dest[x] == RGBA(0,0,0,0));
                }
                else
                {
                    int destAlpha = cast(ubyte)( (alphaA + alphaB + alphaC + alphaD + 2) >> 2 );
                    int red =   (A.r * alphaA + B.r * alphaB + C.r * alphaC + D.r * alphaD);
                    int green = (A.g * alphaA + B.g * alphaB + C.g * alphaC + D.g * alphaD);
                    int blue =  (A.b * alphaA + B.b* alphaB + C.b * alphaC + D.b * alphaD);

                    float invSum = 1 / cast(float)(sum);

                    RGBA finalColor = RGBA( cast(ubyte)(0.5f + red * invSum),
                                            cast(ubyte)(0.5f + green * invSum),
                                           cast(ubyte)(0.5f + blue * invSum),
                                           cast(ubyte)destAlpha );
                    RGBA instead = dest[x];

                    int insteadR = instead.r;
                    int insteadG = instead.g;
                    int insteadB = instead.b;
                    int insteadA = instead.a;
                    int finalColorR = finalColor.r;
                    int finalColorG = finalColor.g;
                    int finalColorB = finalColor.b;
                    int finalColorA = finalColor.a;
                    import std.math;
                    assert(abs(insteadR - finalColorR) <= 1); // some remaining differences because of rounding
                    assert(abs(insteadG - finalColorG) <= 1);
                    assert(abs(insteadB - finalColorB) <= 1);
                    assert(insteadA == finalColorA);
                }
            }
        }

        // Advance two source rows for every destination row.
        L0 += (2 * previousPitch);
        L1 += (2 * previousPitch);
        dest += thisPitch;
    }
}
1220 
/// Fills `updateRect` of `thisLevel` by 2x2 box-filtering `previousLevel`,
/// multiplying every channel of each source pixel by that pixel's alpha.
///
/// For every destination pixel, the four source pixels
///     A B
///     C D
/// at coordinates (2x, 2y) .. (2x+1, 2y+1) of `previousLevel` are combined,
/// for each channel c in { r, g, b, a }, as:
///     (A.c * A.a + B.c * B.a + C.c * C.a + D.c * D.a + 512) >> 10
/// Note that the alpha channel itself is weighted by alpha too (sum of a*a).
///
/// Params:
///     thisLevel     = destination image, written inside `updateRect` only.
///     previousLevel = source image, read inside `updateRect` scaled by 2.
///     updateRect    = area to generate, in `thisLevel` coordinates.
///
/// An empty `updateRect` is a no-op. Rows are stepped using the images' `w`
/// member as the pitch in pixels.
void generateLevelBoxAlphaCovIntoPremulRGBA(OwnedImage!RGBA thisLevel,
                                            OwnedImage!RGBA previousLevel,
                                            box2i updateRect) nothrow @nogc
{
    int width = updateRect.width();
    int height = updateRect.height();

    // Nothing to generate for an empty rectangle. This must be checked before
    // entering the assembly paths: their loops decrement the counter *before*
    // testing it, so a zero width would wrap around and run out of bounds.
    if (width <= 0 || height <= 0)
        return;

    int previousPitch = previousLevel.w;
    int thisPitch = thisLevel.w;

    // L0 and L1 are the two source rows feeding the current destination row.
    RGBA* L0 = previousLevel.scanline(updateRect.min.y * 2).ptr + updateRect.min.x * 2;
    RGBA* L1 = L0 + previousPitch;

    RGBA* dest = thisLevel.scanline(updateRect.min.y).ptr + updateRect.min.x;

    for (int y = 0; y < height; ++y)
    {
        version(inlineAsmCanLoadGlobalsInPIC)
        {
            version(D_InlineAsm_X86)
            {
                asm nothrow @nogc
                {
                    mov ECX, width;

                    mov EAX, L0;
                    mov EDX, L1;
                    mov EDI, dest;

                    movdqa XMM5, xmm512;               // 512 512 512 512
                    pxor XMM4, XMM4;                   // all zeroes

                loop_ecx:

                    movq XMM0, [EAX];                  // Ar Ag Ab Aa Br Bg Bb Ba + zeroes
                    movq XMM1, [EDX];                  // Cr Cg Cb Ca Dr Dg Db Da + zeroes
                    pxor XMM4, XMM4;
                    add EAX, 8;
                    add EDX, 8;

                    punpcklbw XMM0, XMM4;              // Ar Ag Ab Aa Br Bg Bb Ba
                    punpcklbw XMM1, XMM4;              // Cr Cg Cb Ca Dr Dg Db Da

                    movdqa XMM2, XMM0;
                    punpcklwd XMM0, XMM1;              // Ar Cr Ag Cg Ab Cb Aa Ca
                    punpckhwd XMM2, XMM1;              // Br Dr Bg Dg Bb Db Ba Da

                    movdqa XMM3, XMM0;
                    punpcklwd XMM0, XMM2;              // Ar Br Cr Dr Ag Bg Cg Dg
                    punpckhwd XMM3, XMM2;              // Ab Bb Cb Db Aa Ba Ca Da

                    movdqa XMM1, XMM3;
                    punpckhqdq XMM1, XMM1;             // Aa Ba Ca Da Aa Ba Ca Da

                    add EDI, 4;

                    pmaddwd XMM0, XMM1;            // Ar*Aa+Br*Ba Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da
                    pmaddwd XMM3, XMM1;            // Ab*Aa+Bb*Ba Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da

                    movdqa XMM2, XMM0;
                    movdqa XMM1, XMM3;

                    psrldq XMM2, 4;                // Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da 0
                    psrldq XMM1, 4;                // Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da 0

                    paddd XMM0, XMM2;              // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da garbage Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage
                    paddd XMM3, XMM1;              // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da garbage Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage

                    pshufd XMM0, XMM0, 0b00001000; // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage garbage
                    pshufd XMM3, XMM3, 0b00001000; // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage garbage

                    punpcklqdq XMM0, XMM3;     // fR fG fB fA


                    paddd XMM0, XMM5;           // + 512, rounding term for the shift below
                    psrld XMM0, 10;             // final color in dwords

                    packssdw XMM0, XMM4;       // same in words
                    packuswb XMM0, XMM4;       // same in bytes

                    sub ECX, 1;
                    movd [EDI-4], XMM0;            // dest[x] = filtered pixel (movd leaves the flags of `sub` intact)
                    jnz loop_ecx;
                }
            }
            else version(D_InlineAsm_X86_64)
            {
                asm nothrow @nogc
                {
                    mov ECX, width;

                    mov RAX, L0;
                    mov RDX, L1;
                    mov RDI, dest;

                    movdqa XMM5, xmm512;               // 512 512 512 512
                    pxor XMM4, XMM4;                   // all zeroes

                loop_ecx:

                    movq XMM0, [RAX];                  // Ar Ag Ab Aa Br Bg Bb Ba + zeroes
                    movq XMM1, [RDX];                  // Cr Cg Cb Ca Dr Dg Db Da + zeroes
                    pxor XMM4, XMM4;
                    add RAX, 8;
                    add RDX, 8;

                    punpcklbw XMM0, XMM4;              // Ar Ag Ab Aa Br Bg Bb Ba
                    punpcklbw XMM1, XMM4;              // Cr Cg Cb Ca Dr Dg Db Da

                    movdqa XMM2, XMM0;
                    punpcklwd XMM0, XMM1;              // Ar Cr Ag Cg Ab Cb Aa Ca
                    punpckhwd XMM2, XMM1;              // Br Dr Bg Dg Bb Db Ba Da

                    movdqa XMM3, XMM0;
                    punpcklwd XMM0, XMM2;              // Ar Br Cr Dr Ag Bg Cg Dg
                    punpckhwd XMM3, XMM2;              // Ab Bb Cb Db Aa Ba Ca Da

                    movdqa XMM1, XMM3;
                    punpckhqdq XMM1, XMM1;             // Aa Ba Ca Da Aa Ba Ca Da

                    add RDI, 4;

                    pmaddwd XMM0, XMM1;            // Ar*Aa+Br*Ba Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da
                    pmaddwd XMM3, XMM1;            // Ab*Aa+Bb*Ba Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da

                    movdqa XMM2, XMM0;
                    movdqa XMM1, XMM3;

                    psrldq XMM2, 4;                // Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da 0
                    psrldq XMM1, 4;                // Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da 0

                    paddd XMM0, XMM2;              // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da garbage Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage
                    paddd XMM3, XMM1;              // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da garbage Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage

                    pshufd XMM0, XMM0, 0b00001000; // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage garbage
                    pshufd XMM3, XMM3, 0b00001000; // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage garbage

                    punpcklqdq XMM0, XMM3;     // fR fG fB fA


                    paddd XMM0, XMM5;           // + 512, rounding term for the shift below
                    psrld XMM0, 10;             // final color in dwords

                    packssdw XMM0, XMM4;       // same in words
                    packuswb XMM0, XMM4;       // same in bytes

                    sub ECX, 1;
                    movd [RDI-4], XMM0;            // dest[x] = filtered pixel (movd leaves the flags of `sub` intact)
                    jnz loop_ecx;
                }
            }
            else 
                static assert(false);
        }
        else
        {
            // Portable reference implementation.
            for (int x = 0; x < width; ++x)
            {
                // A B
                // C D
                RGBA A = L0[2 * x];
                RGBA B = L0[2 * x + 1];
                RGBA C = L1[2 * x];
                RGBA D = L1[2 * x + 1];
                int red =   (A.r * A.a + B.r * B.a + C.r * C.a + D.r * D.a);
                int green = (A.g * A.a + B.g * B.a + C.g * C.a + D.g * D.a);
                int blue =  (A.b * A.a + B.b* B.a + C.b * C.a + D.b * D.a);
                int alpha =  (A.a * A.a + B.a* B.a + C.a * C.a + D.a * D.a);
                // Each sum is at most 4 * 255 * 255, so after (+512) >> 10
                // the value always fits in a ubyte.
                RGBA finalColor = RGBA( cast(ubyte)((red + 512) >> 10),
                                        cast(ubyte)((green + 512) >> 10),
                                        cast(ubyte)((blue + 512) >> 10),
                                        cast(ubyte)((alpha + 512) >> 10));
                dest[x] = finalColor;
            }
        }

        // Flip to true to cross-check the row just written against the
        // reference formula (useful when touching the assembly paths).
        enum bool verify = false;

        static if (verify)
        {
            for (int x = 0; x < width; ++x)
            {
                RGBA A = L0[2 * x];
                RGBA B = L0[2 * x + 1];
                RGBA C = L1[2 * x];
                RGBA D = L1[2 * x + 1];
                int red =   (A.r * A.a + B.r * B.a + C.r * C.a + D.r * D.a);
                int green = (A.g * A.a + B.g * B.a + C.g * C.a + D.g * D.a);
                int blue =  (A.b * A.a + B.b* B.a + C.b * C.a + D.b * D.a);
                int alpha =  (A.a * A.a + B.a* B.a + C.a * C.a + D.a * D.a);
                RGBA finalColor = RGBA( cast(ubyte)((red + 512) >> 10),
                                        cast(ubyte)((green + 512) >> 10),
                                       cast(ubyte)((blue + 512) >> 10),
                                       cast(ubyte)((alpha + 512) >> 10));
                assert(dest[x] == finalColor);
            }
        }

        // Advance two source rows for every destination row.
        L0 += (2 * previousPitch);
        L1 += (2 * previousPitch);
        dest += thisPitch;
    }
}
1422 
1423 void generateLevelCubicRGBA(OwnedImage!RGBA thisLevel,
1424                             OwnedImage!RGBA previousLevel,
1425                             box2i updateRect) nothrow @nogc
1426 {
1427     for (int y = updateRect.min.y; y < updateRect.max.y; ++y)
1428     {
1429         int y2m1 = 2 * y - 1;
1430         if (y2m1 < 0)
1431             y2m1 = 0;
1432 
1433         int y2p2 = 2 * y + 2;
1434         if (y2p2 > previousLevel.h - 1)
1435             y2p2 = previousLevel.h - 1;
1436 
1437         RGBA* LM1 = previousLevel.scanline(y2m1).ptr;
1438         RGBA* L0 = previousLevel.scanline(y * 2).ptr;
1439         RGBA* L1 = previousLevel.scanline(y * 2 + 1).ptr;
1440         RGBA* L2 = previousLevel.scanline(y2p2).ptr;
1441         RGBA* dest = thisLevel.scanline(y).ptr;
1442 
1443         for (int x = updateRect.min.x; x < updateRect.max.x; ++x)
1444         {
1445             // A B C D
1446             // E F G H
1447             // I J K L
1448             // M N O P
1449 
1450             int x2m1 = 2 * x - 1;
1451             if (x2m1 < 0)
1452                 x2m1 = 0;
1453             int x2p0 = 2 * x;
1454             int x2p2 = 2 * x + 2;
1455             if (x2p2 > previousLevel.w - 1)
1456                 x2p2 = previousLevel.w - 1;
1457 
1458             version(inlineAsmCanLoadGlobalsInPIC)
1459             {
1460                 version(D_InlineAsm_X86)
1461                 {
1462                     RGBA[16] buf = void;
1463                     buf[0] = LM1[x2m1];
1464                     buf[1] = LM1[x2p0];
1465                     buf[2] = LM1[x2p0+1];
1466                     buf[3] = LM1[x2p2];
1467                     buf[4] = L0[x2m1];
1468                     buf[5] = L0[x2p0];
1469                     buf[6] = L0[x2p0+1];
1470                     buf[7] = L0[x2p2];
1471                     buf[8] = L1[x2m1];
1472                     buf[9] = L1[x2p0];
1473                     buf[10] = L1[x2p0+1];
1474                     buf[11] = L1[x2p2];
1475                     buf[12] = L2[x2m1];
1476                     buf[13] = L2[x2p0];
1477                     buf[14] = L2[x2p0+1];
1478                     buf[15] = L2[x2p2];
1479                     RGBA* pDest = dest + x;
1480 
1481                     asm nothrow @nogc
1482                     {
1483                         movdqu XMM0, buf;  // A B C D
1484                         movdqu XMM1, buf;
1485                         pxor XMM2, XMM2;      // zeroes
1486                         punpcklbw XMM0, XMM2; // A B
1487                         punpckhbw XMM1, XMM2; // C D
1488                         pmullw XMM0, xmm11113333; // A*1 B*3 in shorts
1489                         movdqa XMM3, XMM0;
1490                         pmullw XMM1, xmm33331111; // C*3 D*3 in shorts
1491                         movdqa XMM5, XMM1;
1492 
1493                         movdqu XMM0, buf+16;  // E F G H
1494                         movdqu XMM1, buf+16;
1495                         punpcklbw XMM0, XMM2; // E F
1496                         punpckhbw XMM1, XMM2; // G H
1497                         pmullw XMM0, xmm33339999; // E*3 F*9 in shorts
1498                         paddw XMM3, XMM0;
1499                         pmullw XMM1, xmm99993333; // G*9 H*3 in shorts
1500                         paddw XMM5, XMM1;
1501 
1502                         movdqu XMM0, buf+32;  // I J K L
1503                         movdqu XMM1, buf+32;
1504                         punpcklbw XMM0, XMM2; // I J
1505                         punpckhbw XMM1, XMM2; // K L
1506                         pmullw XMM0, xmm33339999; // I*3 J*9 in shorts
1507                         paddw XMM3, XMM0;
1508                         pmullw XMM1, xmm99993333; // K*9 L*3 in shorts
1509                         paddw XMM5, XMM1;
1510 
1511                         movdqu XMM0, buf+48;  // M N O P
1512                         movdqu XMM1, buf+48;
1513                         punpcklbw XMM0, XMM2; // M N
1514                         punpckhbw XMM1, XMM2; // O P
1515                         pmullw XMM0, xmm11113333; // M*1 N*3 in shorts
1516                         paddw XMM3, XMM0; // A+E*3+I*3+M B*3+F*9+J*9+3*N
1517                         pmullw XMM1, xmm33331111; // O*3 P*1 in shorts
1518                         paddw XMM5, XMM1; // C*3+G*9+K*9+O*3 D+H*3+L*3+P
1519 
1520                         movdqa XMM0, XMM3;
1521                         movdqa XMM1, XMM5;
1522                         psrldq XMM0, 8;
1523                         psrldq XMM1, 8;
1524                         paddw XMM3, XMM0; // A+E*3+I*3+M+B*3+F*9+J*9+3*N garbage(x4)
1525                         paddw XMM5, XMM1; // C*3+G*9+K*9+O*3+D+H*3+L*3+P garbage(x4)
1526                         paddw XMM3, XMM5; // total-sum garbage(x4)
1527 
1528                         paddw XMM3, xmm32;
1529                         psrlw XMM3, 6;
1530                         mov EAX, pDest;
1531                         packuswb XMM3, XMM2;
1532 
1533                         movd [EAX], XMM3;
1534                     }
1535                 }
1536                 else version(D_InlineAsm_X86_64)
1537                 {
1538                     RGBA[16] buf = void;
1539                     buf[0] = LM1[x2m1];
1540                     buf[1] = LM1[x2p0];
1541                     buf[2] = LM1[x2p0+1];
1542                     buf[3] = LM1[x2p2];
1543                     buf[4] = L0[x2m1];
1544                     buf[5] = L0[x2p0];
1545                     buf[6] = L0[x2p0+1];
1546                     buf[7] = L0[x2p2];
1547                     buf[8] = L1[x2m1];
1548                     buf[9] = L1[x2p0];
1549                     buf[10] = L1[x2p0+1];
1550                     buf[11] = L1[x2p2];
1551                     buf[12] = L2[x2m1];
1552                     buf[13] = L2[x2p0];
1553                     buf[14] = L2[x2p0+1];
1554                     buf[15] = L2[x2p2];
1555                     RGBA* pDest = dest + x;
1556 
1557                     asm nothrow @nogc
1558                     {
1559                         movdqu XMM0, buf;  // A B C D
1560                         movdqu XMM1, buf;
1561                         pxor XMM2, XMM2;      // zeroes
1562                         punpcklbw XMM0, XMM2; // A B
1563                         punpckhbw XMM1, XMM2; // C D
1564                         pmullw XMM0, xmm11113333; // A*1 B*3 in shorts
1565                         movdqa XMM3, XMM0;
1566                         pmullw XMM1, xmm33331111; // C*3 D*3 in shorts
1567                         movdqa XMM5, XMM1;
1568 
1569                         movdqu XMM0, buf+16;  // E F G H
1570                         movdqu XMM1, buf+16;
1571                         punpcklbw XMM0, XMM2; // E F
1572                         punpckhbw XMM1, XMM2; // G H
1573                         pmullw XMM0, xmm33339999; // E*3 F*9 in shorts
1574                         paddw XMM3, XMM0;
1575                         pmullw XMM1, xmm99993333; // G*9 H*3 in shorts
1576                         paddw XMM5, XMM1;
1577 
1578                         movdqu XMM0, buf+32;  // I J K L
1579                         movdqu XMM1, buf+32;
1580                         punpcklbw XMM0, XMM2; // I J
1581                         punpckhbw XMM1, XMM2; // K L
1582                         pmullw XMM0, xmm33339999; // I*3 J*9 in shorts
1583                         paddw XMM3, XMM0;
1584                         pmullw XMM1, xmm99993333; // K*9 L*3 in shorts
1585                         paddw XMM5, XMM1;
1586 
1587                         movdqu XMM0, buf+48;  // M N O P
1588                         movdqu XMM1, buf+48;
1589                         punpcklbw XMM0, XMM2; // M N
1590                         punpckhbw XMM1, XMM2; // O P
1591                         pmullw XMM0, xmm11113333; // M*1 N*3 in shorts
1592                         paddw XMM3, XMM0; // A+E*3+I*3+M B*3+F*9+J*9+3*N
1593                         pmullw XMM1, xmm33331111; // O*3 P*1 in shorts
1594                         paddw XMM5, XMM1; // C*3+G*9+K*9+O*3 D+H*3+L*3+P
1595 
1596                         movdqa XMM0, XMM3;
1597                         movdqa XMM1, XMM5;
1598                         psrldq XMM0, 8;
1599                         psrldq XMM1, 8;
1600                         paddw XMM3, XMM0; // A+E*3+I*3+M+B*3+F*9+J*9+3*N garbage(x4)
1601                         paddw XMM5, XMM1; // C*3+G*9+K*9+O*3+D+H*3+L*3+P garbage(x4)
1602                         paddw XMM3, XMM5; // total-sum garbage(x4)
1603 
1604                         paddw XMM3, xmm32;
1605                         psrlw XMM3, 6;
1606                         mov RAX, pDest;
1607                         packuswb XMM3, XMM2;
1608 
1609                         movd [RAX], XMM3;
1610                     }
1611                 }
1612                 else
1613                     static assert(false);
1614             }
1615             else
1616             {
1617                 auto A = LM1[x2m1];
1618                 auto B = LM1[x2p0];
1619                 auto C = LM1[x2p0+1];
1620                 auto D = LM1[x2p2];
1621 
1622                 auto E = L0[x2m1];
1623                 auto F = L0[x2p0];
1624                 auto G = L0[x2p0+1];
1625                 auto H = L0[x2p2];
1626 
1627                 auto I = L1[x2m1];
1628                 auto J = L1[x2p0];
1629                 auto K = L1[x2p0+1];
1630                 auto L = L1[x2p2];
1631 
1632                 auto M = L2[x2m1];
1633                 auto N = L2[x2p0];
1634                 auto O = L2[x2p0+1];
1635                 auto P = L2[x2p2];
1636 
1637                 // Apply filter
1638                 // 1 3 3 1
1639                 // 3 9 9 3
1640                 // 3 9 9 3
1641                 // 1 3 3 1
1642 
1643                 int rSum = (A.r + D.r + M.r + P.r) + 3 * (B.r + C.r + E.r + H.r + I.r + L.r + N.r + O.r) + 9 * (F.r + G.r + J.r + K.r);
1644                 int gSum = (A.g + D.g + M.g + P.g) + 3 * (B.g + C.g + E.g + H.g + I.g + L.g + N.g + O.g) + 9 * (F.g + G.g + J.g + K.g);
1645                 int bSum = (A.b + D.b + M.b + P.b) + 3 * (B.b + C.b + E.b + H.b + I.b + L.b + N.b + O.b) + 9 * (F.b + G.b + J.b + K.b);
1646                 int aSum = (A.a + D.a + M.a + P.a) + 3 * (B.a + C.a + E.a + H.a + I.a + L.a + N.a + O.a) + 9 * (F.a + G.a + J.a + K.a);
1647                 dest[x].r = cast(ubyte)((rSum + 32) >> 6);
1648                 dest[x].g = cast(ubyte)((gSum + 32) >> 6);
1649                 dest[x].b = cast(ubyte)((bSum + 32) >> 6);
1650                 dest[x].a = cast(ubyte)((aSum + 32) >> 6);
1651             }
1652         }
1653     }
1654 }
1655 
/// Computes the `updateRect` region of an L16 mipmap level from the level below it.
///
/// Each destination pixel (x, y) is a weighted average of the 4x4 source
/// neighbourhood covering columns 2x-1 .. 2x+2 and rows 2y-1 .. 2y+2 of
/// `previousLevel`, with the weights
///
///     1 3 3 1
///     3 9 9 3
///     3 9 9 3
///     1 3 3 1
///
/// The weights sum to 64, so the total is rounded to nearest by adding 32
/// before shifting right by 6. The outermost row and column indices are clamped
/// to the bounds of `previousLevel`; the two inner ones are used as is.
///
/// Params:
///     thisLevel     = destination image, written only inside `updateRect`.
///     previousLevel = source image, only read by this function.
///     updateRect    = area to compute, in `thisLevel` coordinates (max is exclusive).
void generateLevelCubicL16(OwnedImage!L16 thisLevel,
                           OwnedImage!L16 previousLevel,
                           box2i updateRect) nothrow @nogc
{
    // Horizontal part of the kernel applied to one source row: 1 3 3 1.
    // ushort operands are promoted to int, so nothing wraps around.
    static int weightedRow(const(L16)* row, int left, int inner, int right) nothrow @nogc
    {
        return row[left].l + 3 * (row[inner].l + row[inner + 1].l) + row[right].l;
    }

    // Highest valid source coordinates, used to clamp the far taps.
    const int lastSrcRow = previousLevel.h - 1;
    const int lastSrcCol = previousLevel.w - 1;

    foreach (int y; updateRect.min.y .. updateRect.max.y)
    {
        const int innerRow = 2 * y;
        const int rowAbove = (innerRow - 1 < 0) ? 0 : innerRow - 1;
        const int rowBelow = (innerRow + 2 > lastSrcRow) ? lastSrcRow : innerRow + 2;

        // Four source scanlines feeding this destination row.
        L16* srcAbove  = previousLevel.scanline(rowAbove).ptr;
        L16* srcUpper  = previousLevel.scanline(innerRow).ptr;
        L16* srcLower  = previousLevel.scanline(innerRow + 1).ptr;
        L16* srcBelow  = previousLevel.scanline(rowBelow).ptr;
        L16* outRow    = thisLevel.scanline(y).ptr;

        foreach (int x; updateRect.min.x .. updateRect.max.x)
        {
            const int innerCol = 2 * x;
            const int colLeft  = (innerCol - 1 < 0) ? 0 : innerCol - 1;
            const int colRight = (innerCol + 2 > lastSrcCol) ? lastSrcCol : innerCol + 2;

            // Vertical part of the kernel: outer rows weigh 1, inner rows weigh 3.
            const int outerRows = weightedRow(srcAbove, colLeft, innerCol, colRight)
                                + weightedRow(srcBelow, colLeft, innerCol, colRight);
            const int innerRows = weightedRow(srcUpper, colLeft, innerCol, colRight)
                                + weightedRow(srcLower, colLeft, innerCol, colRight);
            const int total = outerRows + 3 * innerRows;

            // Divide by 64 with rounding to nearest.
            outRow[x].l = cast(ushort)((total + 32) >> 6);
        }
    }
}
1724 
// Compile-time smoke test: declaring a variable of each supported type makes the
// compiler instantiate the `Mipmap` template for both RGBA and L16.
// Only class references are declared here; no Mipmap object is constructed.
unittest
{
    Mipmap!RGBA rgbaMipmap;
    Mipmap!L16 l16Mipmap;
}