1 /**
2 * Mipmap pyramid implementation.
3 *
4 * Copyright: Copyright Auburn Sounds 2015 and later.
5 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 * Authors:   Guillaume Piolat
7 */
8 module dplug.graphics.mipmap;
9 
10 import std.algorithm.comparison;
11 
12 import gfm.math.vector;
13 import gfm.math.box;
14 import dplug.graphics.color;
15 
16 import dplug.core.nogc;
17 import dplug.core.vec;
18 import dplug.graphics.drawex;
19 
// `AsmX86` is set whenever the compiler supports x86 inline assembly,
// in either its 32-bit or its 64-bit flavour.
version( D_InlineAsm_X86 )
    version = AsmX86;
else version( D_InlineAsm_X86_64 )
    version = AsmX86;

// DMD is unable to load globals from inline assembly in PIC code, hence the
// assembly paths that read global constants are only enabled with LDC.
version(LDC)
{
    version( D_InlineAsm_X86 )
        version = inlineAsmCanLoadGlobalsInPIC;
    else version( D_InlineAsm_X86_64 )
        version = inlineAsmCanLoadGlobalsInPIC;
}
41 
42 
43 /// Mipmapped images.
44 /// Supports non power-of-two textures.
45 /// Size of the i+1-th mipmap is { (width)/2, (height)/2 }
46 /// The mipmap owns each of its levels.
47 final class Mipmap(COLOR) if (is(COLOR == RGBA) || is(COLOR == L16))
48 {
49 public:
50 nothrow:
51 @nogc:
52 
    /// Downsampling filter used to compute a level from the level below it.
    enum Quality
    {
        box,                  // simple 2x2 filter, creates phase problems with NPOT. For higher levels, generateMipmaps automatically uses cubic.
        cubic,                // Very smooth kernel [1 2 1] x [1 2 1]
        boxAlphaCov,          // same as box but alpha is used as weight, only implemented for RGBA
        boxAlphaCovIntoPremul, // same as boxAlphaCov but after such a step the next level is alpha-premultiplied
    }
60 
61     Vec!(OwnedImage!COLOR) levels;
62 
63     /// Creates empty
64     this()
65     {
66         levels = makeVec!(OwnedImage!COLOR)();
67     }
68 
69     /// Set number of levels and size
70     /// maxLevel = 0 => only one image
71     /// maxLevel = 1 => one image + one 2x downsampled mipmap
72     /// etc...
73     this(int maxLevel, int w, int h)
74     {
75         this();
76         size(maxLevel, w, h);
77     }
78 
79 
80     /// Creates a Mipmap out of a flat OwnedImage.
81     /// This takes ownership of the given image, which is now owned by the `Mipmap`.
82     this(int maxLevel, OwnedImage!COLOR level0)
83     {
84         //PERF: could avoid to create the 0th level only to replace it later
85 
86         this(maxLevel, level0.w, level0.h);
87 
88         // replaces level 0
89         levels[0].destroyFree();
90         levels[0] = level0;
91         generateMipmaps(Quality.box);
92     }
93 
94     void size(int maxLevel, int w, int h)
95     {
96         // find number of needed levels
97         int neededLevels = 0;
98         {
99             int wr = w;
100             int hr = h;
101             for (; neededLevels <= maxLevel; ++neededLevels)
102             {
103                 if (wr == 0 || hr == 0)
104                     break;
105                 wr  = (wr + 0) >> 1;
106                 hr  = (hr + 0) >> 1;
107             }
108         }
109 
110         void setLevels(int numLevels)
111         {
112             // FUTURE: cleanup excess levels
113             // should not happen until we have resizing
114             if (numLevels < levels.length)
115             {
116                 assert(false);
117             }
118 
119             int previousLength = cast(int)levels.length;
120 
121             levels.resize(numLevels);
122 
123             // create empty image for new levels
124             for(int level = previousLength; level < numLevels; ++level)
125             {
126                 levels[level] = mallocNew!(OwnedImage!COLOR)();
127             }
128         }
129 
130         setLevels(neededLevels);
131 
132         // resize levels
133         for (int level = 0; level < neededLevels; ++level)
134         {
135             assert(w != 0 && h != 0);
136             levels[level].size(w, h);
137             w  = (w + 0) >> 1;
138             h  = (h + 0) >> 1;
139         }
140     }
141 
142     ~this()
143     {
144         foreach(level; levels)
145             level.destroyFree();
146     }
147 
148     /// Interpolates a color between mipmap levels.  Floating-point level, spatial linear interpolation.
149     /// x and y are in base level coordinates (top-left pixel is on (0.5, 0.5) coordinates).
150     /// Clamped to borders.
151     auto linearMipmapSample(float level, float x, float y) nothrow @nogc
152     {
153         int ilevel = cast(int)level;
154         float flevel = level - ilevel;
155         vec4f levelN = linearSample(ilevel, x, y);
156         if (flevel == 0)
157             return levelN;
158 
159         auto levelNp1 = linearSample(ilevel + 1, x, y);
160 
161         return levelN * (1 - flevel) + levelNp1 * flevel;
162     }
163 
164 
165     /// Interpolates a color.  Integer level, spatial linear interpolation.
166     /// x and y are in base level coordinates (top-left pixel is on (0.5, 0.5) coordinates).
167     /// Clamped to borders.
168     auto linearSample(int level, float x, float y) nothrow @nogc
169     {
170         if (level < 0)
171             level = 0;
172         int numLevels = cast(int)levels.length;
173         if (level >= numLevels)
174             level = numLevels - 1;
175 
176         OwnedImage!COLOR image = levels[level];
177 
178 
179         static immutable float[14] factors = [ 1.0f, 0.5f, 0.25f, 0.125f,
180                                                0.0625f, 0.03125f, 0.015625f, 0.0078125f,
181                                                0.00390625f, 0.001953125f, 0.0009765625f, 0.00048828125f,
182                                                0.000244140625f, 0.0001220703125f];
183 
184         float divider = factors[level];
185         x = x * divider - 0.5f;
186         y = y * divider - 0.5f;
187 
188         float maxX = image.w - 1.001f; // avoids an edge case with truncation
189         float maxY = image.h - 1.001f;
190 
191         if (x < 0)
192             x = 0;
193         if (y < 0)
194             y = 0;
195         if (x > maxX)
196             x = maxX;
197         if (y > maxY)
198             y = maxY;
199 
200         int ix = cast(int)x;
201         int iy = cast(int)y;
202         float fx = x - ix;
203 
204         int ixp1 = ix + 1;
205         if (ixp1 >= image.w)
206             ixp1 = image.w - 1;
207         int iyp1 = iy + 1;
208         if (iyp1 >= image.h)
209             iyp1 = image.h - 1;
210 
211         float fxm1 = 1 - fx;
212         float fy = y - iy;
213         float fym1 = 1 - fy;
214 
215         COLOR[] L0 = image.scanline(iy);
216         COLOR[] L1 = image.scanline(iyp1);
217 
218         COLOR A = L0.ptr[ix];
219         COLOR B = L0.ptr[ixp1];
220         COLOR C = L1.ptr[ix];
221         COLOR D = L1.ptr[ixp1];
222 
223         static if (is(COLOR == RGBA))
224         {
225             float inv255 = 1 / 255.0f;
226 
227             version( AsmX86 )
228             {
229                 vec4f asmResult;
230 
231                 asm nothrow @nogc
232                 {
233                     movd XMM0, A;
234                     movd XMM1, B;
235                     movd XMM2, C;
236                     movd XMM3, D;
237                     pxor XMM4, XMM4;
238 
239                     punpcklbw XMM0, XMM4;
240                     punpcklbw XMM1, XMM4;
241                     punpcklbw XMM2, XMM4;
242                     punpcklbw XMM3, XMM4;
243 
244                     punpcklwd XMM0, XMM4;
245                     punpcklwd XMM1, XMM4;
246                     punpcklwd XMM2, XMM4;
247                     punpcklwd XMM3, XMM4;
248 
249                     cvtdq2ps XMM0, XMM0;
250                     cvtdq2ps XMM1, XMM1;
251 
252                     cvtdq2ps XMM2, XMM2;
253                     cvtdq2ps XMM3, XMM3;
254 
255                     movss XMM4, fxm1;
256                     pshufd XMM4, XMM4, 0;
257                     movss XMM5, fx;
258                     pshufd XMM5, XMM5, 0;
259 
260                     mulps XMM0, XMM4;
261                     mulps XMM1, XMM5;
262                     mulps XMM2, XMM4;
263                     mulps XMM3, XMM5;
264 
265                     movss XMM4, fym1;
266                     pshufd XMM4, XMM4, 0;
267                     movss XMM5, fy;
268                     pshufd XMM5, XMM5, 0;
269 
270                     addps XMM0, XMM1;
271                     addps XMM2, XMM3;
272 
273                     mulps XMM0, XMM4;
274                     mulps XMM2, XMM5;
275 
276                     addps XMM0, XMM2;
277 
278                     movups asmResult, XMM0;
279                 }
280 
281                 // Uncomment to check
282     /*
283                 vec4f vA = vec4f(A.r, A.g, A.b, A.a);
284                 vec4f vB = vec4f(B.r, B.g, B.b, B.a);
285                 vec4f vC = vec4f(C.r, C.g, C.b, C.a);
286                 vec4f vD = vec4f(D.r, D.g, D.b, D.a);
287 
288                 vec4f up = vA * fxm1 + vB * fx;
289                 vec4f down = vC * fxm1 + vD * fx;
290                 vec4f dResult = up * fym1 + down * fy;
291 
292                 import gfm.core;
293 
294                 if (dResult.distanceTo(result) < 1.0f)
295                     debugBreak();
296     */
297 
298                 vec4f result = asmResult;
299                 return result;
300             }
301             else
302             {
303                 vec4f vA = vec4f(A.r, A.g, A.b, A.a);
304                 vec4f vB = vec4f(B.r, B.g, B.b, B.a);
305                 vec4f vC = vec4f(C.r, C.g, C.b, C.a);
306                 vec4f vD = vec4f(D.r, D.g, D.b, D.a);
307 
308 
309 
310                 vec4f up = vA * fxm1 + vB * fx;
311                 vec4f down = vC * fxm1 + vD * fx;
312                 vec4f dResult = up * fym1 + down * fy;
313 
314               //  assert(dResult.distanceTo(asmResult) < 1.0f);
315 
316                 return dResult;
317             }
318         }
319         else
320         {
321             float up = A.l * fxm1 + B.l * fx;
322             float down = C.l * fxm1 + D.l * fx;
323             return up * fym1 + down * fy;
324         }
325     }
326 
327     /// Returns: Width of the base level.
328     int width() pure const nothrow @nogc
329     {
330         return levels[0].w;
331     }
332 
333     /// Returns: Height of the base level.
334     int height() pure const nothrow @nogc
335     {
336         return levels[0].h;
337     }
338 
339     /// Returns: Number of levels. The maximum level is numLevels() - 1.
340     int numLevels() pure const nothrow @nogc
341     {
342         return cast(int)levels.length;
343     }
344 
345     /// Regenerates the whole upper levels.
346     void generateMipmaps(Quality quality) nothrow @nogc
347     {
348         box2i updateRect = box2i(0, 0, width(), height());
349         for (int level = 1; level < numLevels(); ++level)
350         {
351             // HACK: Force cubic filter past a level else it makes ugly looking mipmaps
352             if (level >= 3 && quality == Quality.box)
353                 quality = Quality.cubic;
354 
355             updateRect = generateNextLevel(quality, updateRect, level);
356         }
357     }
358 
359     /// Regenerates a single mipmap level based on changes in the provided rectangle (expressed in level 0 coordinates).
360     /// updateRect expressed in level 0 coordinates
361     /// In general if you have several subparts of mipmaps to update, make sure a level is fully completed
362     /// before computing the next one.
363     box2i generateNextLevel(Quality quality, box2i updateRectPreviousLevel, int level) nothrow @nogc
364     {
365         OwnedImage!COLOR previousLevel = levels[level - 1];
366         box2i updateRect = impactOnNextLevel(quality, updateRectPreviousLevel, previousLevel.w, previousLevel.h);
367         generateLevel(level, quality, updateRect);
368         return updateRect;
369     }
370 
371     /// Regenerates one level
372     /// updateRect expressed in level i-th coordinates
373     void generateLevel(int level, Quality quality, box2i updateRect) nothrow @nogc
374     {
375         assert(level > 0);
376         OwnedImage!COLOR thisLevel = levels[level];
377         OwnedImage!COLOR previousLevel = levels[level - 1];
378 
379         final switch(quality) with (Quality)
380         {
381             case box:
382 
383                 static if (is(COLOR == RGBA))
384                     generateLevelBoxRGBA(thisLevel, previousLevel, updateRect);
385                 else static if (is(COLOR == L16))
386                     generateLevelBoxL16(thisLevel, previousLevel, updateRect);
387                 else
388                     static assert(false, "not implemented");
389 
390                 enum checkBoxMipmaps = false;
391 
392                 static if (checkBoxMipmaps)
393                 {
394                     for (int y = updateRect.min.y; y < updateRect.max.y; ++y)
395                     {
396                         COLOR[] L0 = previousLevel.scanline(y * 2);
397                         COLOR[] L1 = previousLevel.scanline(y * 2 + 1);
398                         COLOR[] dest = thisLevel.scanline(y);
399 
400                         for (int x = updateRect.min.x; x < updateRect.max.x; ++x)
401                         {
402                             // A B
403                             // C D
404                             COLOR A = L0[2 * x];
405                             COLOR B = L0[2 * x + 1];
406                             COLOR C = L1[2 * x];
407                             COLOR D = L1[2 * x + 1];
408                             assert(dest[x] == COLOR.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D));
409                         }
410                     }
411                 }
412                 break;
413 
414         case boxAlphaCov:
415 
416             static if (is(COLOR == RGBA))
417             {
418                 generateLevelBoxAlphaCovRGBA(thisLevel, previousLevel, updateRect);
419 
420                 static if (false)
421                 {
422                     void checkLevelBoxAlphaConvRGBA(Image!RGBA* thisLevel, Image!RGBA* previousLevel, box2i updateRect)
423                     {
424                         for (int y = updateRect.min.y; y < updateRect.max.y; ++y)
425                         {
426                             RGBA[] L0 = previousLevel.scanline(y * 2);
427                             RGBA[] L1 = previousLevel.scanline(y * 2 + 1);
428                             RGBA[] dest = thisLevel.scanline(y);
429 
430                             for (int x = updateRect.min.x; x < updateRect.max.x; ++x)
431                             {
432                                 // A B
433                                 // C D
434                                 RGBA A = L0.ptr[2 * x];
435                                 RGBA B = L0.ptr[2 * x + 1];
436                                 RGBA C = L1.ptr[2 * x];
437                                 RGBA D = L1.ptr[2 * x + 1];
438 
439                                 int alphaA = A.a;
440                                 int alphaB = B.a;
441                                 int alphaC = C.a;
442                                 int alphaD = D.a;
443                                 int sum = alphaA + alphaB + alphaC + alphaD;
444                                 if (sum == 0)
445                                 {
446                                     assert(dest.ptr[x] == A);
447                                 }
448                                 else
449                                 {
450                                     int destAlpha = cast(ubyte)( (alphaA + alphaB + alphaC + alphaD + 2) >> 2 );
451                                     int red =   (A.r * alphaA + B.r * alphaB + C.r * alphaC + D.r * alphaD);
452                                     int green = (A.g * alphaA + B.g * alphaB + C.g * alphaC + D.g * alphaD);
453                                     int blue =  (A.b * alphaA + B.b* alphaB + C.b * alphaC + D.b * alphaD);
454                                     float invSum = 1 / cast(float)(sum);
455 
456                                     RGBA finalColor = RGBA( cast(ubyte)(0.5f + red * invSum),
457                                                             cast(ubyte)(0.5f + green * invSum),
458                                                             cast(ubyte)(0.5f + blue * invSum),
459                                                             cast(ubyte)destAlpha );
460                                     assert(dest.ptr[x] == finalColor);
461                                 }
462                             }
463                         }
464                     }
465                     checkLevelBoxAlphaConvRGBA(thisLevel, previousLevel, updateRect);
466                 }
467                 break;
468             }
469             else
470                 assert(false);
471 
472         case boxAlphaCovIntoPremul:
473 
474             static if (is(COLOR == RGBA))
475             {
476                 generateLevelBoxAlphaCovIntoPremulRGBA(thisLevel, previousLevel, updateRect);
477                 break;
478             }
479             else
480                 assert(false);
481 
482         case cubic:
483             static if (is(COLOR == RGBA))
484             {
485                 generateLevelCubicRGBA(thisLevel, previousLevel, updateRect);
486                 break;
487             }
488             else static if (is(COLOR == L16))
489             {
490                 generateLevelCubicL16(thisLevel, previousLevel, updateRect);
491                 break;
492             }
493             else
494                 static assert(false, "not implemented");
495 
496 
497         }
498     }
499 
500 
501 private:
502     /// Computes impact of updating the area box on next level
503     static box2i impactOnNextLevel(Quality quality, box2i area, int currentLevelWidth, int currentLevelHeight) pure nothrow @nogc
504     {
505         box2i maxArea = box2i(0, 0, currentLevelWidth / 2, currentLevelHeight / 2);
506 
507         final  switch(quality) with (Quality)
508         {
509         case box:
510         case boxAlphaCov:
511         case boxAlphaCovIntoPremul:
512             int xmin = area.min.x / 2;
513             int ymin = area.min.y / 2;
514             int xmax = (area.max.x + 1) / 2;
515             int ymax = (area.max.y + 1) / 2;
516             return box2i(xmin, ymin, xmax, ymax).intersection(maxArea);
517 
518         case cubic:
519             int xmin = (area.min.x - 1) / 2;
520             int ymin = (area.min.y - 1) / 2;
521             int xmax = (area.max.x + 2) / 2;
522             int ymax = (area.max.y + 2) / 2;
523             return box2i(xmin, ymin, xmax, ymax).intersection(maxArea);
524         }
525 
526     }
527 }
528 
// Checks that `size` allocates the expected number of levels with the expected dimensions.
unittest
{
    // Power-of-two image: maxLevel is the limiting factor.
    Mipmap!RGBA a = new Mipmap!RGBA();
    a.size(4, 256, 256);
    assert(a.numLevels() == 5);
    assert(a.width() == 256 && a.height() == 256);
    assert(a.levels[4].w == 16 && a.levels[4].h == 16);
    a.destroy();

    // Non power-of-two image: levels stop when the width would reach zero.
    // Sizes are 17x333, 8x166, 4x83, 2x41, 1x20.
    Mipmap!L16 b = new Mipmap!L16();
    b.size(16, 17, 333);
    assert(b.numLevels() == 5);
    assert(b.width() == 17 && b.height() == 333);
    assert(b.levels[4].w == 1 && b.levels[4].h == 20);
    b.destroy();
}
539 
540 
541 private:
542 
// Constants for the SSE inline assembly. They are 16-byte aligned since the
// assembly loads them with aligned moves (movaps / movdqa).

// Rounding term added to 16-bit lanes before `>> 2` (used by generateLevelBoxRGBA).
align(16) static immutable short[8] xmmTwoShort = [ 2, 2, 2, 2, 2, 2, 2, 2 ];
// Rounding term added to 32-bit lanes before `>> 2` (used by generateLevelBoxL16).
align(16) static immutable int[4] xmmTwoInt = [ 2, 2, 2, 2 ];
// NOTE(review): the constants below are not referenced in the part of the file
// reviewed here; presumably they serve filters defined further down — confirm.
align(16) static immutable float[4] xmm0_5 = [ 0.5f, 0.5f, 0.5f, 0.5f ];
align(16) static immutable int[4] xmm512 = [ 512, 512, 512, 512 ];
align(16) static immutable short[8] xmm11113333 = [ 1, 1, 1, 1, 3, 3, 3, 3 ];
align(16) static immutable short[8] xmm33331111 = [ 3, 3, 3, 3, 1, 1, 1, 1 ];
align(16) static immutable short[8] xmm33339999 = [ 3, 3, 3, 3, 9, 9, 9, 9 ];
align(16) static immutable short[8] xmm99993333 = [ 9, 9, 9, 9, 3, 3, 3, 3 ];
align(16) static immutable short[8] xmm32       = [ 32, 32, 32, 32, 32, 32, 32, 32 ];
552 
553 
/// Fills `updateRect` of `thisLevel` with a 2x2 box filter of `previousLevel`:
/// each destination pixel is, per channel, `(A + B + C + D + 2) >> 2` of the
/// four source pixels it covers.
/// Params:
///     thisLevel = destination image.
///     previousLevel = source image; destination pixel (x, y) reads source
///                     pixels (2x, 2y) to (2x + 1, 2y + 1).
///     updateRect = area to compute, in `thisLevel` coordinates.
/// NOTE(review): rows are advanced by `w` pixels, which assumes that scanlines
/// of both images are contiguous with no padding — confirm against OwnedImage.
void generateLevelBoxRGBA(OwnedImage!RGBA thisLevel,
                          OwnedImage!RGBA previousLevel,
                          box2i updateRect) pure nothrow @nogc
{
    int width = updateRect.width();
    int height = updateRect.height();

    int previousPitch = previousLevel.w;
    int thisPitch = thisLevel.w;

    // L0 and L1 are the two source rows feeding the current destination row
    RGBA* L0 = previousLevel.scanline(updateRect.min.y * 2).ptr + updateRect.min.x * 2;
    RGBA* L1 = L0 + previousPitch;
    RGBA* dest = thisLevel.scanline(updateRect.min.y).ptr + updateRect.min.x;

    for (int y = 0; y < height; ++y)
    {
        version(inlineAsmCanLoadGlobalsInPIC)
        {
            version(D_InlineAsm_X86)
            {
                // Each iteration reads 4 source pixels on each row and writes 2 destination pixels
                asm pure nothrow @nogc
                {
                    mov ECX, width;
                    shr ECX, 1;
                    jz no_need; // ECX = 0 => no pair of pixels to process

                    mov EAX, L0;
                    mov EDX, L1;
                    mov EDI, dest;
                    movaps XMM5, xmmTwoShort;

                loop_ecx:
                    movdqu XMM0, [EAX]; // A B E F
                    pxor XMM4, XMM4;
                    movdqu XMM1, [EDX]; // C D G H
                    movdqa XMM2, XMM0;
                    movdqa XMM3, XMM1;
                    punpcklbw XMM0, XMM4; // A B in short
                    punpcklbw XMM1, XMM4; // C D in short
                    punpckhbw XMM2, XMM4; // E F in short
                    punpckhbw XMM3, XMM4; // G H in short
                    paddusw XMM0, XMM1; // A + C | B + D
                    paddusw XMM2, XMM3; // E + G | F + H
                    movdqa XMM1, XMM0;
                    movdqa XMM3, XMM2;
                    psrldq XMM1, 8;
                    psrldq XMM3, 8;
                    add EDI, 8;
                    paddusw XMM0, XMM1; // A + B + C + D | garbage
                    paddusw XMM2, XMM3; // E + F + G + H | garbage
                    paddusw XMM0, XMM5; // A + B + C + D + 2 | garbage
                    paddusw XMM2, XMM5; // E + F + G + H + 2 | garbage
                    psrlw XMM0, 2; // (A + B + C + D + 2) >> 2 | garbage
                    psrlw XMM2, 2; // (E + F + G + H + 2) >> 2 | garbage
                    add EAX, 16;
                    punpcklqdq XMM0, XMM2;
                    add EDX, 16;
                    packuswb XMM0, XMM4; // (A + B + C + D + 2) >> 2 | (E + F + G + H + 2) >> 2 | 0 | 0
                    movq [EDI-8], XMM0;
                    sub ECX, 1;
                    jnz loop_ecx;
                no_need: ;
                }

                // Filter the last pixel when width is odd (L0, L1 and dest were not advanced by the assembly)
                int remaining = width & ~1;
                for (int x = remaining; x < width; ++x)
                {
                    RGBA A = L0[2 * x];
                    RGBA B = L0[2 * x + 1];
                    RGBA C = L1[2 * x];
                    RGBA D = L1[2 * x + 1];
                    dest[x] = RGBA.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D);
                }
            }
            else version(D_InlineAsm_X86_64)
            {
                // Same algorithm as the 32-bit version, with 64-bit pointer registers
                asm pure nothrow @nogc
                {
                    mov ECX, width;
                    shr ECX, 1;
                    jz no_need; // ECX = 0 => no pair of pixels to process

                    mov RAX, L0;
                    mov RDX, L1;
                    mov RDI, dest;
                    movaps XMM5, xmmTwoShort;

                loop_ecx:
                    movdqu XMM0, [RAX]; // A B E F
                    pxor XMM4, XMM4;
                    movdqu XMM1, [RDX]; // C D G H
                    movdqa XMM2, XMM0;
                    movdqa XMM3, XMM1;
                    punpcklbw XMM0, XMM4; // A B in short
                    punpcklbw XMM1, XMM4; // C D in short
                    punpckhbw XMM2, XMM4; // E F in short
                    punpckhbw XMM3, XMM4; // G H in short
                    paddusw XMM0, XMM1; // A + C | B + D
                    paddusw XMM2, XMM3; // E + G | F + H
                    movdqa XMM1, XMM0;
                    movdqa XMM3, XMM2;
                    psrldq XMM1, 8;
                    psrldq XMM3, 8;
                    add RDI, 8;
                    paddusw XMM0, XMM1; // A + B + C + D | garbage
                    paddusw XMM2, XMM3; // E + F + G + H | garbage
                    paddusw XMM0, XMM5; // A + B + C + D + 2 | garbage
                    paddusw XMM2, XMM5; // E + F + G + H + 2 | garbage
                    psrlw XMM0, 2; // (A + B + C + D + 2) >> 2 | garbage
                    psrlw XMM2, 2; // (E + F + G + H + 2) >> 2 | garbage
                    add RAX, 16;
                    punpcklqdq XMM0, XMM2;
                    add RDX, 16;
                    packuswb XMM0, XMM4; // (A + B + C + D + 2) >> 2 | (E + F + G + H + 2) >> 2 | 0 | 0
                    movq [RDI-8], XMM0;
                    sub ECX, 1;
                    jnz loop_ecx;
                no_need: ;
                }

                // Filter the last pixel when width is odd (L0, L1 and dest were not advanced by the assembly)
                int remaining = width & ~1;
                for (int x = remaining; x < width; ++x)
                {
                    RGBA A = L0[2 * x];
                    RGBA B = L0[2 * x + 1];
                    RGBA C = L1[2 * x];
                    RGBA D = L1[2 * x + 1];
                    dest[x] = RGBA.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D);
                }
            }
            else
                static assert(false);
        }
        else
        {
            // Portable version
            for (int x = 0; x < width; ++x)
            {
                // A B
                // C D
                RGBA A = L0[2 * x];
                RGBA B = L0[2 * x + 1];
                RGBA C = L1[2 * x];
                RGBA D = L1[2 * x + 1];

                dest[x] = RGBA.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D);
            }
        }

        // One destination row consumes two source rows
        L0 += (2 * previousPitch);
        L1 += (2 * previousPitch);
        dest += thisPitch;
    }
}
709 
/// Fills `updateRect` of `thisLevel` with a 2x2 box filter of `previousLevel`:
/// each destination pixel is `(A + B + C + D + 2) >> 2` of the four source
/// pixels it covers.
/// Params:
///     thisLevel = destination image.
///     previousLevel = source image; destination pixel (x, y) reads source
///                     pixels (2x, 2y) to (2x + 1, 2y + 1).
///     updateRect = area to compute, in `thisLevel` coordinates.
/// NOTE(review): rows are advanced by `w` pixels, which assumes that scanlines
/// of both images are contiguous with no padding — confirm against OwnedImage.
void generateLevelBoxL16(OwnedImage!L16 thisLevel,
                         OwnedImage!L16 previousLevel,
                         box2i updateRect) pure nothrow @nogc
{
    int width = updateRect.width();
    int height = updateRect.height();

    int previousPitch = previousLevel.w;
    int thisPitch = thisLevel.w;

    // L0 and L1 are the two source rows feeding the current destination row
    L16* L0 = previousLevel.scanline(updateRect.min.y * 2).ptr + updateRect.min.x * 2;
    L16* L1 = L0 + previousPitch;

    L16* dest = thisLevel.scanline(updateRect.min.y).ptr + updateRect.min.x;

    for (int y = 0; y < height; ++y)
    {
        version(inlineAsmCanLoadGlobalsInPIC)
        {
            version(D_InlineAsm_X86)
            {
                // Each iteration reads 8 source pixels on each row and writes 4 destination pixels
                asm pure nothrow @nogc
                {
                    mov ECX, width;
                    shr ECX, 2;
                    jz no_need; // ECX = 0 => less than 4 pixels to process

                    mov EAX, L0;
                    mov EDX, L1;
                    mov EDI, dest;
                    movdqa XMM5, xmmTwoInt;
                    pxor XMM4, XMM4;

                loop_ecx:
                    movdqu XMM0, [EAX]; // A B E F I J M N
                    movdqu XMM1, [EDX]; // C D G H K L O P

                    add EAX, 16;
                    add EDX, 16;

                    movdqa XMM2, XMM0;
                    movdqa XMM3, XMM1;

                    punpcklwd XMM0, XMM4; // A B E F in int32
                    punpckhwd XMM2, XMM4; // I J M N in int32
                    punpcklwd XMM1, XMM4; // C D G H in int32
                    punpckhwd XMM3, XMM4; // K L O P in int32

                    paddd XMM0, XMM1; // A+C B+D E+G F+H
                    paddd XMM2, XMM3; // I+K J+L M+O N+P

                    movdqa XMM1, XMM0;
                    movdqa XMM3, XMM2;

                    psrldq XMM1, 4; // B+D E+G F+H 0
                    psrldq XMM3, 4; // J+L M+O N+P 0

                    paddd XMM0, XMM1; // A+B+C+D garbage E+F+G+H garbage
                    paddd XMM2, XMM3; // I+J+K+L garbage M+N+O+P garbage

                    pshufd XMM0, XMM0, 0b00001000; // A+B+C+D E+F+G+H garbage garbage
                    pshufd XMM2, XMM2, 0b00001000; // I+J+K+L M+N+O+P garbage garbage

                    punpcklqdq XMM0, XMM2; // A+B+C+D E+F+G+H I+J+K+L M+N+O+P
                    paddd XMM0, XMM5; // add 2
                    psrld XMM0, 2; // >> 2

                    // because packusdw is not available before SSE4.1
                    // Sign-extend the low 16 bits of each lane, so that the signed
                    // saturation of packssdw leaves values above 32767 unchanged
                    pslld XMM0, 16;
                    psrad XMM0, 16;
                    add EDI, 8;
                    packssdw XMM0, XMM4;

                    movq [EDI-8], XMM0;
                    sub ECX, 1;
                    jnz loop_ecx;
                no_need: ;
                }

                // Filter the 0 to 3 pixels left over (L0, L1 and dest were not advanced by the assembly)
                int remaining = width & ~3;
                for (int x = remaining; x < width; ++x)
                {
                    L16 A = L0[2 * x];
                    L16 B = L0[2 * x + 1];
                    L16 C = L1[2 * x];
                    L16 D = L1[2 * x + 1];
                    dest[x] = L16.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D);
                }
            }
            else version(D_InlineAsm_X86_64)
            {
                // Same algorithm as the 32-bit version, with 64-bit pointer registers
                asm pure nothrow @nogc
                {
                    mov ECX, width;
                    shr ECX, 2;
                    jz no_need; // ECX = 0 => less than 4 pixels to process

                    mov RAX, L0;
                    mov RDX, L1;
                    mov RDI, dest;
                    movdqa XMM5, xmmTwoInt;
                    pxor XMM4, XMM4;

                loop_ecx:
                    movdqu XMM0, [RAX]; // A B E F I J M N
                    movdqu XMM1, [RDX]; // C D G H K L O P

                    add RAX, 16;
                    add RDX, 16;

                    movdqa XMM2, XMM0;
                    movdqa XMM3, XMM1;

                    punpcklwd XMM0, XMM4; // A B E F in int32
                    punpckhwd XMM2, XMM4; // I J M N in int32
                    punpcklwd XMM1, XMM4; // C D G H in int32
                    punpckhwd XMM3, XMM4; // K L O P in int32

                    paddd XMM0, XMM1; // A+C B+D E+G F+H
                    paddd XMM2, XMM3; // I+K J+L M+O N+P

                    movdqa XMM1, XMM0;
                    movdqa XMM3, XMM2;

                    psrldq XMM1, 4; // B+D E+G F+H 0
                    psrldq XMM3, 4; // J+L M+O N+P 0

                    paddd XMM0, XMM1; // A+B+C+D garbage E+F+G+H garbage
                    paddd XMM2, XMM3; // I+J+K+L garbage M+N+O+P garbage

                    pshufd XMM0, XMM0, 0b00001000; // A+B+C+D E+F+G+H garbage garbage
                    pshufd XMM2, XMM2, 0b00001000; // I+J+K+L M+N+O+P garbage garbage

                    punpcklqdq XMM0, XMM2; // A+B+C+D E+F+G+H I+J+K+L M+N+O+P
                    paddd XMM0, XMM5; // add 2
                    psrld XMM0, 2; // >> 2

                    // because packusdw is not available before SSE4.1
                    // Sign-extend the low 16 bits of each lane, so that the signed
                    // saturation of packssdw leaves values above 32767 unchanged
                    pslld XMM0, 16;
                    psrad XMM0, 16;
                    add RDI, 8;
                    packssdw XMM0, XMM4;

                    movq [RDI-8], XMM0;
                    sub ECX, 1;
                    jnz loop_ecx;
                no_need: ;
                }

                // Filter the 0 to 3 pixels left over (L0, L1 and dest were not advanced by the assembly)
                int remaining = width & ~3;
                for (int x = remaining; x < width; ++x)
                {
                    L16 A = L0[2 * x];
                    L16 B = L0[2 * x + 1];
                    L16 C = L1[2 * x];
                    L16 D = L1[2 * x + 1];
                    dest[x] = L16.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D);
                }
            }
            else
                static assert(false);
        }
        else
        {
            // Portable version
            for (int x = 0; x < width; ++x)
            {
                // A B
                // C D
                L16 A = L0[2 * x];
                L16 B = L0[2 * x + 1];
                L16 C = L1[2 * x];
                L16 D = L1[2 * x + 1];

                dest[x] = L16.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D);
            }
        }

        // One destination row consumes two source rows
        L0 += (2 * previousPitch);
        L1 += (2 * previousPitch);
        dest += thisPitch;
    }
}
896 
897 
/// Fills `updateRect` of `thisLevel` by downsampling `previousLevel` with a 2x2
/// box filter in which every source pixel is weighted by its own alpha.
///
/// For a destination pixel (x, y) the four source pixels
///     A B
///     C D
/// are read at (2x, 2y), (2x+1, 2y), (2x, 2y+1) and (2x+1, 2y+1).
/// - If the four alphas sum to zero, the destination is RGBA(0, 0, 0, 0).
/// - Otherwise each colour channel is the alpha-weighted mean of the four
///   source channels, rounded with `cast(ubyte)(0.5f + value)`, and the
///   destination alpha is `(Aa + Ba + Ca + Da + 2) >> 2`.
///
/// Params:
///     thisLevel     = destination image, written inside `updateRect`.
///     previousLevel = source image, read at twice the destination coordinates.
///     updateRect    = rectangle to regenerate, in `thisLevel` coordinates.
///
/// The row stride of both images is taken to be their `w` member (in pixels).
///
/// NOTE(review): the inline-assembly paths decrement ECX before testing it, so
/// they rely on `width > 0`; the `assert` is the only guard for that. The
/// scalar fallback simply does nothing for an empty width.
void generateLevelBoxAlphaCovRGBA(OwnedImage!RGBA thisLevel,
                                  OwnedImage!RGBA previousLevel,
                                  box2i updateRect) nothrow @nogc
{
    int width = updateRect.width();
    int height = updateRect.height();

    // Pitches are expressed in pixels, not bytes.
    int previousPitch = previousLevel.w;
    int thisPitch = thisLevel.w;

    // L0 and L1 are the two consecutive source rows feeding one destination row.
    RGBA* L0 = previousLevel.scanline(updateRect.min.y * 2).ptr + updateRect.min.x * 2;
    RGBA* L1 = L0 + previousPitch;

    RGBA* dest = thisLevel.scanline(updateRect.min.y).ptr + updateRect.min.x;

    for (int y = 0; y < height; ++y)
    {
        version(inlineAsmCanLoadGlobalsInPIC)
        {
            version(D_InlineAsm_X86)
            {
                // The loop below is a do-while on ECX: it must run at least once.
                assert(width > 0);
                asm nothrow @nogc
                {
                    mov ECX, width;                        // ECX = destination pixels left on this row

                    mov EAX, L0;
                    mov EDX, L1;
                    mov EDI, dest;

                    loop_ecx:

                        movq XMM0, [EAX];                  // Ar Ag Ab Aa Br Bg Bb Ba + zeroes
                        movq XMM1, [EDX];                  // Cr Cg Cb Ca Dr Dg Db Da + zeroes
                        pxor XMM4, XMM4;                   // XMM4 = 0, re-zeroed each iteration since it is reused as scratch below
                        add EAX, 8;
                        add EDX, 8;

                        // Widen bytes to 16-bit words
                        punpcklbw XMM0, XMM4;              // Ar Ag Ab Aa Br Bg Bb Ba
                        punpcklbw XMM1, XMM4;              // Cr Cg Cb Ca Dr Dg Db Da

                        // Transpose so that each channel of the four pixels is contiguous
                        movdqa XMM2, XMM0;
                        punpcklwd XMM0, XMM1;              // Ar Cr Ag Cg Ab Cb Aa Ca
                        punpckhwd XMM2, XMM1;              // Br Dr Bg Dg Bb Db Ba Da

                        // perhaps unnecessary
                        movdqa XMM3, XMM0;
                        punpcklwd XMM0, XMM2;              // Ar Br Cr Dr Ag Bg Cg Dg
                        punpckhwd XMM3, XMM2;              // Ab Bb Cb Db Aa Ba Ca Da

                        movdqa XMM1, XMM3;
                        punpckhqdq XMM1, XMM1;             // Aa Ba Ca Da Aa Ba Ca Da

                        // Are alpha all zeroes? if so, early continue.
                        movdqa XMM2, XMM1;
                        pcmpeqb XMM2, XMM4;
                        add EDI, 4;
                        pmovmskb ESI, XMM2;
                        cmp ESI, 0xffff;                   // all 16 bytes equal to zero <=> the four alphas are zero
                        jnz non_null;

                            pxor XMM0, XMM0;
                            sub ECX, 1;
                            movd [EDI-4], XMM0;            // dest[x] = RGBA(0, 0, 0, 0); movd leaves the flags of `sub` intact
                            jnz loop_ecx;
                            jmp end_of_loop;

                        non_null:

                            pmaddwd XMM0, XMM1;            // Ar*Aa+Br*Ba Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da
                            pmaddwd XMM3, XMM1;            // Ab*Aa+Bb*Ba Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da

                            // Starting computing sum of coefficients too
                            punpcklwd XMM1, XMM4;      // Aa Ba Ca Da

                            movdqa XMM2, XMM0;
                            movdqa XMM5, XMM3;
                            movdqa XMM4, XMM1;             // from here on XMM4 is scratch, no longer zero
                            psrldq XMM4, 8;

                            psrldq XMM2, 4;                // Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da 0
                            psrldq XMM5, 4;                // Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da 0
                            paddq XMM1, XMM4;              // Aa+Ca Ba+Da garbage garbage
                            movdqa XMM4, XMM1;

                            paddd XMM0, XMM2;              // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da garbage Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage
                            paddd XMM3, XMM5;              // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da garbage Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage
                            psrldq XMM4, 4;

                            pshufd XMM0, XMM0, 0b00001000; // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage garbage
                            paddq XMM1, XMM4;          // Aa+Ba+Ca+Da garbage garbage garbage
                            pshufd XMM3, XMM3, 0b00001000; // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage garbage

                            punpcklqdq XMM0, XMM3;     // fR fG fB fA
                            pshufd XMM1, XMM1, 0;      // broadcast the alpha sum to the four lanes

                            cvtdq2ps XMM0, XMM0;

                            cvtdq2ps XMM3, XMM1;       // sum sum sum sum

                            divps XMM0, XMM3;          // fR/sum fG/sum fB/sum fA/sum
                            addps XMM0, xmm0_5;        // xmm0_5 is a global defined elsewhere in this file (presumably 0.5f in each lane)
                            cvttps2dq XMM0, XMM0;      // return into integer domain using cast(int)(x + 0.5f)

                            paddd XMM1, xmmTwoInt;     // xmmTwoInt is a global defined elsewhere in this file (presumably 2 in each lane)
                            psrld XMM1, 2;             // finalAlpha finalAlpha finalAlpha finalAlpha

                            // Drop the fA/sum lane and put finalAlpha in its place
                            pslldq XMM0, 4;            // 0 fR/sum fG/sum fB/sum
                            pslldq XMM1, 12;           // 0 0 0 finalAlpha
                            psrldq XMM0, 4;            // fR/sum fG/sum fB/sum 0

                            por XMM0, XMM1;            // fR/sum fG/sum fB/sum finalAlpha
                            pxor XMM3, XMM3;
                            packssdw XMM0, XMM3;       // same in words
                            packuswb XMM0, XMM3;       // same in bytes

                            sub ECX, 1;
                            movd [EDI-4], XMM0;            // dest[x] = packed RGBA result
                    jnz loop_ecx;
                    end_of_loop: ;
                }
            }
            else version(D_InlineAsm_X86_64)
            {
                // The loop below is a do-while on ECX: it must run at least once.
                assert(width > 0);
                asm nothrow @nogc
                {
                    mov ECX, width;                    // ECX = destination pixels left on this row

                    mov RAX, L0;
                    mov RDX, L1;
                    mov RDI, dest;

                loop_ecx:

                    movq XMM0, [RAX];                  // Ar Ag Ab Aa Br Bg Bb Ba + zeroes
                    movq XMM1, [RDX];                  // Cr Cg Cb Ca Dr Dg Db Da + zeroes
                    pxor XMM4, XMM4;                   // XMM4 = 0, re-zeroed each iteration since it is reused as scratch below
                    add RAX, 8;
                    add RDX, 8;

                    // Widen bytes to 16-bit words
                    punpcklbw XMM0, XMM4;              // Ar Ag Ab Aa Br Bg Bb Ba
                    punpcklbw XMM1, XMM4;              // Cr Cg Cb Ca Dr Dg Db Da

                    // Transpose so that each channel of the four pixels is contiguous
                    movdqa XMM2, XMM0;
                    punpcklwd XMM0, XMM1;              // Ar Cr Ag Cg Ab Cb Aa Ca
                    punpckhwd XMM2, XMM1;              // Br Dr Bg Dg Bb Db Ba Da

                    // perhaps unnecessary
                    movdqa XMM3, XMM0;
                    punpcklwd XMM0, XMM2;              // Ar Br Cr Dr Ag Bg Cg Dg
                    punpckhwd XMM3, XMM2;              // Ab Bb Cb Db Aa Ba Ca Da

                    movdqa XMM1, XMM3;
                    punpckhqdq XMM1, XMM1;             // Aa Ba Ca Da Aa Ba Ca Da

                    // Are alpha all zeroes? if so, early continue.
                    movdqa XMM2, XMM1;
                    pcmpeqb XMM2, XMM4;
                    add RDI, 4;
                    pmovmskb ESI, XMM2;
                    cmp ESI, 0xffff;                   // all 16 bytes equal to zero <=> the four alphas are zero
                    jnz non_null;

                    pxor XMM0, XMM0;
                    sub ECX, 1;
                    movd [RDI-4], XMM0;            // dest[x] = RGBA(0, 0, 0, 0); movd leaves the flags of `sub` intact
                    jnz loop_ecx;
                    jmp end_of_loop;

                non_null:

                    pmaddwd XMM0, XMM1;            // Ar*Aa+Br*Ba Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da
                    pmaddwd XMM3, XMM1;            // Ab*Aa+Bb*Ba Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da

                    // Starting computing sum of coefficients too
                    punpcklwd XMM1, XMM4;      // Aa Ba Ca Da

                    movdqa XMM2, XMM0;
                    movdqa XMM5, XMM3;
                    movdqa XMM4, XMM1;             // from here on XMM4 is scratch, no longer zero
                    psrldq XMM4, 8;

                    psrldq XMM2, 4;                // Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da 0
                    psrldq XMM5, 4;                // Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da 0
                    paddq XMM1, XMM4;              // Aa+Ca Ba+Da garbage garbage
                    movdqa XMM4, XMM1;

                    paddd XMM0, XMM2;              // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da garbage Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage
                    paddd XMM3, XMM5;              // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da garbage Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage
                    psrldq XMM4, 4;

                    pshufd XMM0, XMM0, 0b00001000; // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage garbage
                    paddq XMM1, XMM4;          // Aa+Ba+Ca+Da garbage garbage garbage
                    pshufd XMM3, XMM3, 0b00001000; // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage garbage

                    punpcklqdq XMM0, XMM3;     // fR fG fB fA
                    pshufd XMM1, XMM1, 0;      // broadcast the alpha sum to the four lanes

                    cvtdq2ps XMM0, XMM0;

                    cvtdq2ps XMM3, XMM1;       // sum sum sum sum

                    divps XMM0, XMM3;          // fR/sum fG/sum fB/sum fA/sum
                    addps XMM0, xmm0_5;        // xmm0_5 is a global defined elsewhere in this file (presumably 0.5f in each lane)
                    cvttps2dq XMM0, XMM0;      // return into integer domain using cast(int)(x + 0.5f)

                    paddd XMM1, xmmTwoInt;     // xmmTwoInt is a global defined elsewhere in this file (presumably 2 in each lane)
                    psrld XMM1, 2;             // finalAlpha finalAlpha finalAlpha finalAlpha

                    // Drop the fA/sum lane and put finalAlpha in its place
                    pslldq XMM0, 4;            // 0 fR/sum fG/sum fB/sum
                    pslldq XMM1, 12;           // 0 0 0 finalAlpha
                    psrldq XMM0, 4;            // fR/sum fG/sum fB/sum 0

                    por XMM0, XMM1;            // fR/sum fG/sum fB/sum finalAlpha
                    pxor XMM3, XMM3;
                    packssdw XMM0, XMM3;       // same in words
                    packuswb XMM0, XMM3;       // same in bytes

                    sub ECX, 1;
                    movd [RDI-4], XMM0;            // dest[x] = packed RGBA result
                    jnz loop_ecx;
                end_of_loop: ;
                }
            }
            else
                static assert(false);
        }
        else
        {
            // Portable reference implementation.
            for (int x = 0; x < width; ++x)
            {
                // A B
                // C D
                RGBA A = L0[2 * x];
                RGBA B = L0[2 * x + 1];
                RGBA C = L1[2 * x];
                RGBA D = L1[2 * x + 1];

                int alphaA = A.a;
                int alphaB = B.a;
                int alphaC = C.a;
                int alphaD = D.a;
                int sum = alphaA + alphaB + alphaC + alphaD;
                if (sum == 0)
                {
                    // Fully transparent footprint: no colour to average, and avoids dividing by zero.
                    dest[x] = RGBA(0,0,0,0);
                }
                else
                {
                    // Alpha is a plain rounded average of the four alphas.
                    int destAlpha = cast(ubyte)( (alphaA + alphaB + alphaC + alphaD + 2) >> 2 );
                    // Colour numerators: each channel weighted by the alpha of its pixel.
                    int red =   (A.r * alphaA + B.r * alphaB + C.r * alphaC + D.r * alphaD);
                    int green = (A.g * alphaA + B.g * alphaB + C.g * alphaC + D.g * alphaD);
                    int blue =  (A.b * alphaA + B.b* alphaB + C.b * alphaC + D.b * alphaD);
                    float invSum = 1 / cast(float)(sum);

                    RGBA finalColor = RGBA( cast(ubyte)(0.5f + red * invSum),
                                            cast(ubyte)(0.5f + green * invSum),
                                            cast(ubyte)(0.5f + blue * invSum),
                                            cast(ubyte)destAlpha );
                    dest[x] = finalColor;
                }
            }
        }

        // Debugging aid: set to true to re-compute the row with the scalar
        // formula and compare it against what was written above.
        enum verify = false;

        static if (verify)
        {
            for (int x = 0; x < width; ++x)
            {
                // A B
                // C D
                RGBA A = L0[2 * x];
                RGBA B = L0[2 * x + 1];
                RGBA C = L1[2 * x];
                RGBA D = L1[2 * x + 1];

                int alphaA = A.a;
                int alphaB = B.a;
                int alphaC = C.a;
                int alphaD = D.a;
                int sum = alphaA + alphaB + alphaC + alphaD;
                if (sum == 0)
                {
                    assert(dest[x] == RGBA(0,0,0,0));
                }
                else
                {
                    int destAlpha = cast(ubyte)( (alphaA + alphaB + alphaC + alphaD + 2) >> 2 );
                    int red =   (A.r * alphaA + B.r * alphaB + C.r * alphaC + D.r * alphaD);
                    int green = (A.g * alphaA + B.g * alphaB + C.g * alphaC + D.g * alphaD);
                    int blue =  (A.b * alphaA + B.b* alphaB + C.b * alphaC + D.b * alphaD);

                    float invSum = 1 / cast(float)(sum);

                    RGBA finalColor = RGBA( cast(ubyte)(0.5f + red * invSum),
                                            cast(ubyte)(0.5f + green * invSum),
                                           cast(ubyte)(0.5f + blue * invSum),
                                           cast(ubyte)destAlpha );
                    RGBA instead = dest[x];

                    int insteadR = instead.r;
                    int insteadG = instead.g;
                    int insteadB = instead.b;
                    int insteadA = instead.a;
                    int finalColorR = finalColor.r;
                    int finalColorG = finalColor.g;
                    int finalColorB = finalColor.b;
                    int finalColorA = finalColor.a;
                    import std.math;
                    assert(abs(insteadR - finalColorR) <= 1); // some remaining differences because of rounding
                    assert(abs(insteadG - finalColorG) <= 1);
                    assert(abs(insteadB - finalColorB) <= 1);
                    assert(insteadA == finalColorA);
                }
            }
        }

        // Advance two source rows for each destination row.
        L0 += (2 * previousPitch);
        L1 += (2 * previousPitch);
        dest += thisPitch;
    }
}
1222 
/// Fills `updateRect` of `thisLevel` by downsampling `previousLevel` with a 2x2
/// box filter weighted by alpha, producing an alpha-premultiplied result.
///
/// For a destination pixel (x, y) the four source pixels
///     A B
///     C D
/// are read at (2x, 2y), (2x+1, 2y), (2x, 2y+1) and (2x+1, 2y+1), and every
/// output channel `c` (alpha included) is computed as
///     (A.c * A.a + B.c * B.a + C.c * C.a + D.c * D.a + 512) >> 10
/// Unlike `generateLevelBoxAlphaCovRGBA` there is no division by the sum of
/// alphas, hence no special case for fully transparent footprints.
///
/// Params:
///     thisLevel     = destination image, written inside `updateRect`.
///     previousLevel = source image, read at twice the destination coordinates.
///     updateRect    = rectangle to regenerate, in `thisLevel` coordinates.
///                     An empty rectangle is a no-op.
///
/// The row stride of both images is taken to be their `w` member (in pixels).
void generateLevelBoxAlphaCovIntoPremulRGBA(OwnedImage!RGBA thisLevel,
                                            OwnedImage!RGBA previousLevel,
                                            box2i updateRect) nothrow @nogc
{
    int width = updateRect.width();
    int height = updateRect.height();

    // Nothing to do for an empty rectangle. This guard is required for the
    // inline-assembly paths below: their loop decrements ECX *before* testing
    // it, so a zero width would wrap the counter around and run far out of
    // bounds instead of doing nothing like the scalar fallback does.
    if (width <= 0 || height <= 0)
        return;

    // Pitches are expressed in pixels, not bytes.
    int previousPitch = previousLevel.w;
    int thisPitch = thisLevel.w;

    // L0 and L1 are the two consecutive source rows feeding one destination row.
    RGBA* L0 = previousLevel.scanline(updateRect.min.y * 2).ptr + updateRect.min.x * 2;
    RGBA* L1 = L0 + previousPitch;

    RGBA* dest = thisLevel.scanline(updateRect.min.y).ptr + updateRect.min.x;

    for (int y = 0; y < height; ++y)
    {
        version(inlineAsmCanLoadGlobalsInPIC)
        {
            version(D_InlineAsm_X86)
            {
                asm nothrow @nogc
                {
                    mov ECX, width;                    // ECX = destination pixels left on this row (> 0, see guard above)

                    mov EAX, L0;
                    mov EDX, L1;
                    mov EDI, dest;

                    movdqa XMM5, xmm512;               // 512 512 512 512 (rounding term, global defined elsewhere in this file)
                    pxor XMM4, XMM4;                   // all zeroes

                loop_ecx:

                    movq XMM0, [EAX];                  // Ar Ag Ab Aa Br Bg Bb Ba + zeroes
                    movq XMM1, [EDX];                  // Cr Cg Cb Ca Dr Dg Db Da + zeroes
                    pxor XMM4, XMM4;
                    add EAX, 8;
                    add EDX, 8;

                    // Widen bytes to 16-bit words
                    punpcklbw XMM0, XMM4;              // Ar Ag Ab Aa Br Bg Bb Ba
                    punpcklbw XMM1, XMM4;              // Cr Cg Cb Ca Dr Dg Db Da

                    // Transpose so that each channel of the four pixels is contiguous
                    movdqa XMM2, XMM0;
                    punpcklwd XMM0, XMM1;              // Ar Cr Ag Cg Ab Cb Aa Ca
                    punpckhwd XMM2, XMM1;              // Br Dr Bg Dg Bb Db Ba Da

                    movdqa XMM3, XMM0;
                    punpcklwd XMM0, XMM2;              // Ar Br Cr Dr Ag Bg Cg Dg
                    punpckhwd XMM3, XMM2;              // Ab Bb Cb Db Aa Ba Ca Da

                    movdqa XMM1, XMM3;
                    punpckhqdq XMM1, XMM1;             // Aa Ba Ca Da Aa Ba Ca Da

                    add EDI, 4;

                    pmaddwd XMM0, XMM1;            // Ar*Aa+Br*Ba Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da
                    pmaddwd XMM3, XMM1;            // Ab*Aa+Bb*Ba Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da

                    movdqa XMM2, XMM0;
                    movdqa XMM1, XMM3;

                    psrldq XMM2, 4;                // Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da 0
                    psrldq XMM1, 4;                // Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da 0

                    paddd XMM0, XMM2;              // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da garbage Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage
                    paddd XMM3, XMM1;              // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da garbage Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage

                    pshufd XMM0, XMM0, 0b00001000; // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage garbage
                    pshufd XMM3, XMM3, 0b00001000; // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage garbage

                    punpcklqdq XMM0, XMM3;     // fR fG fB fA


                    paddd XMM0, XMM5;          // + 512 for rounding
                    psrld XMM0, 10;             // final color in dwords

                    packssdw XMM0, XMM4;       // same in words
                    packuswb XMM0, XMM4;       // same in bytes

                    sub ECX, 1;
                    movd [EDI-4], XMM0;            // dest[x] = packed RGBA result; movd leaves the flags of `sub` intact
                    jnz loop_ecx;
                }
            }
            else version(D_InlineAsm_X86_64)
            {
                asm nothrow @nogc
                {
                    mov ECX, width;                    // ECX = destination pixels left on this row (> 0, see guard above)

                    mov RAX, L0;
                    mov RDX, L1;
                    mov RDI, dest;

                    movdqa XMM5, xmm512;               // 512 512 512 512 (rounding term, global defined elsewhere in this file)
                    pxor XMM4, XMM4;                   // all zeroes

                loop_ecx:

                    movq XMM0, [RAX];                  // Ar Ag Ab Aa Br Bg Bb Ba + zeroes
                    movq XMM1, [RDX];                  // Cr Cg Cb Ca Dr Dg Db Da + zeroes
                    pxor XMM4, XMM4;
                    add RAX, 8;
                    add RDX, 8;

                    // Widen bytes to 16-bit words
                    punpcklbw XMM0, XMM4;              // Ar Ag Ab Aa Br Bg Bb Ba
                    punpcklbw XMM1, XMM4;              // Cr Cg Cb Ca Dr Dg Db Da

                    // Transpose so that each channel of the four pixels is contiguous
                    movdqa XMM2, XMM0;
                    punpcklwd XMM0, XMM1;              // Ar Cr Ag Cg Ab Cb Aa Ca
                    punpckhwd XMM2, XMM1;              // Br Dr Bg Dg Bb Db Ba Da

                    movdqa XMM3, XMM0;
                    punpcklwd XMM0, XMM2;              // Ar Br Cr Dr Ag Bg Cg Dg
                    punpckhwd XMM3, XMM2;              // Ab Bb Cb Db Aa Ba Ca Da

                    movdqa XMM1, XMM3;
                    punpckhqdq XMM1, XMM1;             // Aa Ba Ca Da Aa Ba Ca Da

                    add RDI, 4;

                    pmaddwd XMM0, XMM1;            // Ar*Aa+Br*Ba Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da
                    pmaddwd XMM3, XMM1;            // Ab*Aa+Bb*Ba Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da

                    movdqa XMM2, XMM0;
                    movdqa XMM1, XMM3;

                    psrldq XMM2, 4;                // Cr*Ca+Dr*Da Ag*Aa+Bg*Ba Cg*Ca+Dg*Da 0
                    psrldq XMM1, 4;                // Cb*Ca+Db*Da Aa*Aa+Ba*Ba Ca*Ca+Da*Da 0

                    paddd XMM0, XMM2;              // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da garbage Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage
                    paddd XMM3, XMM1;              // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da garbage Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage

                    pshufd XMM0, XMM0, 0b00001000; // Ar*Aa+Br*Ba+Cr*Ca+Dr*Da Ag*Aa+Bg*Ba+Cg*Ca+Dg*Da garbage garbage
                    pshufd XMM3, XMM3, 0b00001000; // Ab*Aa+Bb*Ba+Cb*Ca+Db*Da Aa*Aa+Ba*Ba+Ca*Ca+Da*Da garbage garbage

                    punpcklqdq XMM0, XMM3;     // fR fG fB fA


                    paddd XMM0, XMM5;          // + 512 for rounding
                    psrld XMM0, 10;             // final color in dwords

                    packssdw XMM0, XMM4;       // same in words
                    packuswb XMM0, XMM4;       // same in bytes

                    sub ECX, 1;
                    movd [RDI-4], XMM0;            // dest[x] = packed RGBA result; movd leaves the flags of `sub` intact
                    jnz loop_ecx;
                }
            }
            else
                static assert(false);
        }
        else
        {
            // Portable reference implementation.
            for (int x = 0; x < width; ++x)
            {
                // A B
                // C D
                RGBA A = L0[2 * x];
                RGBA B = L0[2 * x + 1];
                RGBA C = L1[2 * x];
                RGBA D = L1[2 * x + 1];
                // Every channel, alpha included, is weighted by the alpha of its pixel.
                int red =   (A.r * A.a + B.r * B.a + C.r * C.a + D.r * D.a);
                int green = (A.g * A.a + B.g * B.a + C.g * C.a + D.g * D.a);
                int blue =  (A.b * A.a + B.b* B.a + C.b * C.a + D.b * D.a);
                int alpha =  (A.a * A.a + B.a* B.a + C.a * C.a + D.a * D.a);
                // Rounded division by 1024.
                RGBA finalColor = RGBA( cast(ubyte)((red + 512) >> 10),
                                        cast(ubyte)((green + 512) >> 10),
                                        cast(ubyte)((blue + 512) >> 10),
                                        cast(ubyte)((alpha + 512) >> 10));
                dest[x] = finalColor;
            }
        }

        // Debugging aid: set to true to re-compute the row with the scalar
        // formula and compare it against what was written above.
        enum bool verify = false;

        static if (verify)
        {
            for (int x = 0; x < width; ++x)
            {
                RGBA A = L0[2 * x];
                RGBA B = L0[2 * x + 1];
                RGBA C = L1[2 * x];
                RGBA D = L1[2 * x + 1];
                int red =   (A.r * A.a + B.r * B.a + C.r * C.a + D.r * D.a);
                int green = (A.g * A.a + B.g * B.a + C.g * C.a + D.g * D.a);
                int blue =  (A.b * A.a + B.b* B.a + C.b * C.a + D.b * D.a);
                int alpha =  (A.a * A.a + B.a* B.a + C.a * C.a + D.a * D.a);
                RGBA finalColor = RGBA( cast(ubyte)((red + 512) >> 10),
                                        cast(ubyte)((green + 512) >> 10),
                                        cast(ubyte)((blue + 512) >> 10),
                                        cast(ubyte)((alpha + 512) >> 10));
                assert(dest[x] == finalColor);
            }
        }

        // Advance two source rows for each destination row.
        L0 += (2 * previousPitch);
        L1 += (2 * previousPitch);
        dest += thisPitch;
    }
}
1424 
1425 void generateLevelCubicRGBA(OwnedImage!RGBA thisLevel,
1426                             OwnedImage!RGBA previousLevel,
1427                             box2i updateRect) nothrow @nogc
1428 {
1429     for (int y = updateRect.min.y; y < updateRect.max.y; ++y)
1430     {
1431         int y2m1 = 2 * y - 1;
1432         if (y2m1 < 0)
1433             y2m1 = 0;
1434 
1435         int y2p2 = 2 * y + 2;
1436         if (y2p2 > previousLevel.h - 1)
1437             y2p2 = previousLevel.h - 1;
1438 
1439         RGBA* LM1 = previousLevel.scanline(y2m1).ptr;
1440         RGBA* L0 = previousLevel.scanline(y * 2).ptr;
1441         RGBA* L1 = previousLevel.scanline(y * 2 + 1).ptr;
1442         RGBA* L2 = previousLevel.scanline(y2p2).ptr;
1443         RGBA* dest = thisLevel.scanline(y).ptr;
1444 
1445         for (int x = updateRect.min.x; x < updateRect.max.x; ++x)
1446         {
1447             // A B C D
1448             // E F G H
1449             // I J K L
1450             // M N O P
1451 
1452             int x2m1 = 2 * x - 1;
1453             if (x2m1 < 0)
1454                 x2m1 = 0;
1455             int x2p0 = 2 * x;
1456             int x2p2 = 2 * x + 2;
1457             if (x2p2 > previousLevel.w - 1)
1458                 x2p2 = previousLevel.w - 1;
1459 
1460             version(inlineAsmCanLoadGlobalsInPIC)
1461             {
1462                 version(D_InlineAsm_X86)
1463                 {
1464                     RGBA[16] buf = void;
1465                     buf[0] = LM1[x2m1];
1466                     buf[1] = LM1[x2p0];
1467                     buf[2] = LM1[x2p0+1];
1468                     buf[3] = LM1[x2p2];
1469                     buf[4] = L0[x2m1];
1470                     buf[5] = L0[x2p0];
1471                     buf[6] = L0[x2p0+1];
1472                     buf[7] = L0[x2p2];
1473                     buf[8] = L1[x2m1];
1474                     buf[9] = L1[x2p0];
1475                     buf[10] = L1[x2p0+1];
1476                     buf[11] = L1[x2p2];
1477                     buf[12] = L2[x2m1];
1478                     buf[13] = L2[x2p0];
1479                     buf[14] = L2[x2p0+1];
1480                     buf[15] = L2[x2p2];
1481                     RGBA* pDest = dest + x;
1482 
1483                     asm nothrow @nogc
1484                     {
1485                         movdqu XMM0, buf;  // A B C D
1486                         movdqu XMM1, buf;
1487                         pxor XMM2, XMM2;      // zeroes
1488                         punpcklbw XMM0, XMM2; // A B
1489                         punpckhbw XMM1, XMM2; // C D
1490                         pmullw XMM0, xmm11113333; // A*1 B*3 in shorts
1491                         movdqa XMM3, XMM0;
1492                         pmullw XMM1, xmm33331111; // C*3 D*3 in shorts
1493                         movdqa XMM5, XMM1;
1494 
1495                         movdqu XMM0, buf+16;  // E F G H
1496                         movdqu XMM1, buf+16;
1497                         punpcklbw XMM0, XMM2; // E F
1498                         punpckhbw XMM1, XMM2; // G H
1499                         pmullw XMM0, xmm33339999; // E*3 F*9 in shorts
1500                         paddw XMM3, XMM0;
1501                         pmullw XMM1, xmm99993333; // G*9 H*3 in shorts
1502                         paddw XMM5, XMM1;
1503 
1504                         movdqu XMM0, buf+32;  // I J K L
1505                         movdqu XMM1, buf+32;
1506                         punpcklbw XMM0, XMM2; // I J
1507                         punpckhbw XMM1, XMM2; // K L
1508                         pmullw XMM0, xmm33339999; // I*3 J*9 in shorts
1509                         paddw XMM3, XMM0;
1510                         pmullw XMM1, xmm99993333; // K*9 L*3 in shorts
1511                         paddw XMM5, XMM1;
1512 
1513                         movdqu XMM0, buf+48;  // M N O P
1514                         movdqu XMM1, buf+48;
1515                         punpcklbw XMM0, XMM2; // M N
1516                         punpckhbw XMM1, XMM2; // O P
1517                         pmullw XMM0, xmm11113333; // M*1 N*3 in shorts
1518                         paddw XMM3, XMM0; // A+E*3+I*3+M B*3+F*9+J*9+3*N
1519                         pmullw XMM1, xmm33331111; // O*3 P*1 in shorts
1520                         paddw XMM5, XMM1; // C*3+G*9+K*9+O*3 D+H*3+L*3+P
1521 
1522                         movdqa XMM0, XMM3;
1523                         movdqa XMM1, XMM5;
1524                         psrldq XMM0, 8;
1525                         psrldq XMM1, 8;
1526                         paddw XMM3, XMM0; // A+E*3+I*3+M+B*3+F*9+J*9+3*N garbage(x4)
1527                         paddw XMM5, XMM1; // C*3+G*9+K*9+O*3+D+H*3+L*3+P garbage(x4)
1528                         paddw XMM3, XMM5; // total-sum garbage(x4)
1529 
1530                         paddw XMM3, xmm32;
1531                         psrlw XMM3, 6;
1532                         mov EAX, pDest;
1533                         packuswb XMM3, XMM2;
1534 
1535                         movd [EAX], XMM3;
1536                     }
1537                 }
1538                 else version(D_InlineAsm_X86_64)
1539                 {
1540                     RGBA[16] buf = void;
1541                     buf[0] = LM1[x2m1];
1542                     buf[1] = LM1[x2p0];
1543                     buf[2] = LM1[x2p0+1];
1544                     buf[3] = LM1[x2p2];
1545                     buf[4] = L0[x2m1];
1546                     buf[5] = L0[x2p0];
1547                     buf[6] = L0[x2p0+1];
1548                     buf[7] = L0[x2p2];
1549                     buf[8] = L1[x2m1];
1550                     buf[9] = L1[x2p0];
1551                     buf[10] = L1[x2p0+1];
1552                     buf[11] = L1[x2p2];
1553                     buf[12] = L2[x2m1];
1554                     buf[13] = L2[x2p0];
1555                     buf[14] = L2[x2p0+1];
1556                     buf[15] = L2[x2p2];
1557                     RGBA* pDest = dest + x;
1558 
1559                     asm nothrow @nogc
1560                     {
1561                         movdqu XMM0, buf;  // A B C D
1562                         movdqu XMM1, buf;
1563                         pxor XMM2, XMM2;      // zeroes
1564                         punpcklbw XMM0, XMM2; // A B
1565                         punpckhbw XMM1, XMM2; // C D
1566                         pmullw XMM0, xmm11113333; // A*1 B*3 in shorts
1567                         movdqa XMM3, XMM0;
1568                         pmullw XMM1, xmm33331111; // C*3 D*3 in shorts
1569                         movdqa XMM5, XMM1;
1570 
1571                         movdqu XMM0, buf+16;  // E F G H
1572                         movdqu XMM1, buf+16;
1573                         punpcklbw XMM0, XMM2; // E F
1574                         punpckhbw XMM1, XMM2; // G H
1575                         pmullw XMM0, xmm33339999; // E*3 F*9 in shorts
1576                         paddw XMM3, XMM0;
1577                         pmullw XMM1, xmm99993333; // G*9 H*3 in shorts
1578                         paddw XMM5, XMM1;
1579 
1580                         movdqu XMM0, buf+32;  // I J K L
1581                         movdqu XMM1, buf+32;
1582                         punpcklbw XMM0, XMM2; // I J
1583                         punpckhbw XMM1, XMM2; // K L
1584                         pmullw XMM0, xmm33339999; // I*3 J*9 in shorts
1585                         paddw XMM3, XMM0;
1586                         pmullw XMM1, xmm99993333; // K*9 L*3 in shorts
1587                         paddw XMM5, XMM1;
1588 
1589                         movdqu XMM0, buf+48;  // M N O P
1590                         movdqu XMM1, buf+48;
1591                         punpcklbw XMM0, XMM2; // M N
1592                         punpckhbw XMM1, XMM2; // O P
1593                         pmullw XMM0, xmm11113333; // M*1 N*3 in shorts
1594                         paddw XMM3, XMM0; // A+E*3+I*3+M B*3+F*9+J*9+3*N
1595                         pmullw XMM1, xmm33331111; // O*3 P*1 in shorts
1596                         paddw XMM5, XMM1; // C*3+G*9+K*9+O*3 D+H*3+L*3+P
1597 
1598                         movdqa XMM0, XMM3;
1599                         movdqa XMM1, XMM5;
1600                         psrldq XMM0, 8;
1601                         psrldq XMM1, 8;
1602                         paddw XMM3, XMM0; // A+E*3+I*3+M+B*3+F*9+J*9+3*N garbage(x4)
1603                         paddw XMM5, XMM1; // C*3+G*9+K*9+O*3+D+H*3+L*3+P garbage(x4)
1604                         paddw XMM3, XMM5; // total-sum garbage(x4)
1605 
1606                         paddw XMM3, xmm32;
1607                         psrlw XMM3, 6;
1608                         mov RAX, pDest;
1609                         packuswb XMM3, XMM2;
1610 
1611                         movd [RAX], XMM3;
1612                     }
1613                 }
1614                 else
1615                     static assert(false);
1616             }
1617             else
1618             {
1619                 auto A = LM1[x2m1];
1620                 auto B = LM1[x2p0];
1621                 auto C = LM1[x2p0+1];
1622                 auto D = LM1[x2p2];
1623 
1624                 auto E = L0[x2m1];
1625                 auto F = L0[x2p0];
1626                 auto G = L0[x2p0+1];
1627                 auto H = L0[x2p2];
1628 
1629                 auto I = L1[x2m1];
1630                 auto J = L1[x2p0];
1631                 auto K = L1[x2p0+1];
1632                 auto L = L1[x2p2];
1633 
1634                 auto M = L2[x2m1];
1635                 auto N = L2[x2p0];
1636                 auto O = L2[x2p0+1];
1637                 auto P = L2[x2p2];
1638 
1639                 // Apply filter
1640                 // 1 3 3 1
1641                 // 3 9 9 3
1642                 // 3 9 9 3
1643                 // 1 3 3 1
1644 
1645                 int rSum = (A.r + D.r + M.r + P.r) + 3 * (B.r + C.r + E.r + H.r + I.r + L.r + N.r + O.r) + 9 * (F.r + G.r + J.r + K.r);
1646                 int gSum = (A.g + D.g + M.g + P.g) + 3 * (B.g + C.g + E.g + H.g + I.g + L.g + N.g + O.g) + 9 * (F.g + G.g + J.g + K.g);
1647                 int bSum = (A.b + D.b + M.b + P.b) + 3 * (B.b + C.b + E.b + H.b + I.b + L.b + N.b + O.b) + 9 * (F.b + G.b + J.b + K.b);
1648                 int aSum = (A.a + D.a + M.a + P.a) + 3 * (B.a + C.a + E.a + H.a + I.a + L.a + N.a + O.a) + 9 * (F.a + G.a + J.a + K.a);
1649                 dest[x].r = cast(ubyte)((rSum + 32) >> 6);
1650                 dest[x].g = cast(ubyte)((gSum + 32) >> 6);
1651                 dest[x].b = cast(ubyte)((bSum + 32) >> 6);
1652                 dest[x].a = cast(ubyte)((aSum + 32) >> 6);
1653             }
1654         }
1655     }
1656 }
1657 
/// Fills the pixels of `thisLevel` that lie inside `updateRect` by downsampling
/// `previousLevel` with a 4x4 kernel, the outer product [1 3 3 1] x [1 3 3 1]:
///
///     1 3 3 1
///     3 9 9 3
///     3 9 9 3
///     1 3 3 1
///
/// The weights add up to 64, so the weighted sum is rounded (+32) and shifted
/// right by 6 to get back into the 16-bit range.
///
/// Destination pixel (x, y) reads source rows/columns 2*i-1, 2*i, 2*i+1 and 2*i+2.
/// Only the 2*i-1 and 2*i+2 taps are clamped to the borders of `previousLevel`;
/// the 2*i and 2*i+1 taps are read as-is.
///
/// Params:
///     thisLevel     = destination image, written only inside `updateRect`.
///     previousLevel = source image, read only.
///     updateRect    = rectangle to regenerate, in `thisLevel` coordinates
///                     (min inclusive, max exclusive).
void generateLevelCubicL16(OwnedImage!L16 thisLevel,
                           OwnedImage!L16 previousLevel,
                           box2i updateRect) nothrow @nogc
{
    // Last valid row/column of the source, used to clamp the far taps.
    const lastSrcRow = previousLevel.h - 1;
    const lastSrcCol = previousLevel.w - 1;

    foreach (dstY; updateRect.min.y .. updateRect.max.y)
    {
        const int srcY = 2 * dstY;

        // Vertical taps 2*y-1 and 2*y+2, clamped to the source image.
        const int rowAbove = (srcY - 1 < 0) ? 0 : srcY - 1;
        const int rowBelow = (srcY + 2 > lastSrcRow) ? lastSrcRow : srcY + 2;

        L16* line0 = previousLevel.scanline(rowAbove).ptr;
        L16* line1 = previousLevel.scanline(srcY).ptr;
        L16* line2 = previousLevel.scanline(srcY + 1).ptr;
        L16* line3 = previousLevel.scanline(rowBelow).ptr;
        L16* outLine = thisLevel.scanline(dstY).ptr;

        foreach (dstX; updateRect.min.x .. updateRect.max.x)
        {
            // Horizontal taps: c0 and c3 are clamped, c1 and c2 are not.
            const int c1 = 2 * dstX;
            const int c2 = c1 + 1;
            const int c0 = (c1 - 1 < 0) ? 0 : c1 - 1;
            const int c3 = (c1 + 2 > lastSrcCol) ? lastSrcCol : c1 + 2;

            // Apply [1 3 3 1] horizontally on each of the four source rows.
            const int s0 = line0[c0].l + 3 * (line0[c1].l + line0[c2].l) + line0[c3].l;
            const int s1 = line1[c0].l + 3 * (line1[c1].l + line1[c2].l) + line1[c3].l;
            const int s2 = line2[c0].l + 3 * (line2[c1].l + line2[c2].l) + line2[c3].l;
            const int s3 = line3[c0].l + 3 * (line3[c1].l + line3[c2].l) + line3[c3].l;

            // Then [1 3 3 1] vertically across the row sums.
            const int weighted = (s0 + s3) + 3 * (s1 + s2);

            // Divide by 64 with rounding to nearest.
            outLine[dstX].l = cast(ushort)((weighted + 32) >> 6);
        }
    }
}
1726 
unittest
{
    // Compile-time smoke test: declaring a reference of each supported pixel
    // type makes the compiler instantiate Mipmap for both L16 and RGBA.
    // No object is constructed; both references stay null.
    Mipmap!L16 depthPyramid;
    Mipmap!RGBA colorPyramid;
}