dplug.graphics.mipmap source code

1 /**
2 Mipmap pyramid implementation.
3 
4 Copyright: Guillaume Piolat 2015-2023.
5 License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 */
7 module dplug.graphics.mipmap;
8 
9 import dplug.math.vector;
10 import dplug.math.box;
11 import dplug.graphics.image;
12 import dplug.core.nogc;
13 import dplug.core.vec;
14 
15 import inteli.smmintrin;
16 
// Define a single `AsmX86` version identifier whenever D inline assembly
// is available, for either the 32-bit or the 64-bit x86 flavour.
version( D_InlineAsm_X86 )
    version = AsmX86;
else version( D_InlineAsm_X86_64 )
    version = AsmX86;
25 
26 /// Mipmapped images.
27 /// Supports non power-of-two textures.
28 /// Size of the i+1-th mipmap is { (width)/2, (height)/2 }
29 /// The mipmap owns each of its levels.
30 final class Mipmap(COLOR) if (is(COLOR == RGBA) || is(COLOR == L16) || is(COLOR == RGBA16) )
31 {
32 public:
33 nothrow:
34 @nogc:
35 
36     enum Quality
37     {
38         box,                   // simple 2x2 filter, creates phase problems with NPOT. For higher levels, automatically uses cubic.
39         cubic,                 // Very smooth kernel [1 2 1] x [1 2 1]
40 
41         /// Box-filter, and after such a step the next level is alpha-premultiplied.
42         /// This is intended for the first level 0 to level 1 transition, in case of bloom.
43         /// Within version(futurePBREmissive), this also transitions to linear space to have 
44         /// more natural highlights.
45         boxAlphaCovIntoPremul, 
46     }
47 
48     Vec!(OwnedImage!COLOR) levels;
49 
50     /// Creates empty
51     this()
52     {
53         levels = makeVec!(OwnedImage!COLOR)();
54     }
55 
56     /// Set number of levels and size
57     /// maxLevel = 0 => only one image
58     /// maxLevel = 1 => one image + one 2x downsampled mipmap
59     /// etc...
60     this(int maxLevel, int w, int h)
61     {
62         this();
63         size(maxLevel, w, h);
64     }
65 
66 
67     /// Creates a Mipmap out of a flat OwnedImage.
68     /// This takes ownership of the given image, which is now owned by the `Mipmap`.
69     this(int maxLevel, OwnedImage!COLOR level0)
70     {
71         //PERF: could avoid to create the 0th level only to replace it later
72 
73         this(maxLevel, level0.w, level0.h);
74 
75         // replaces level 0
76         levels[0].destroyFree();
77         levels[0] = level0;
78     }
79 
80     void size(int maxLevel, int w, int h)
81     {
82         // find number of needed levels
83         int neededLevels = 0;
84         {
85             int wr = w;
86             int hr = h;
87             for (; neededLevels <= maxLevel; ++neededLevels)
88             {
89                 if (wr == 0 || hr == 0)
90                     break;
91                 wr  = (wr + 0) >> 1;
92                 hr  = (hr + 0) >> 1;
93             }
94         }
95 
96         void setLevels(int numLevels)
97         {
98             // FUTURE: cleanup excess levels
99             // should not happen until we have resizing
100             if (numLevels < levels.length)
101             {
102                 assert(false);
103             }
104 
105             int previousLength = cast(int)levels.length;
106 
107             levels.resize(numLevels);
108 
109             // create empty image for new levels
110             for(int level = previousLength; level < numLevels; ++level)
111             {
112                 levels[level] = mallocNew!(OwnedImage!COLOR)();
113             }
114         }
115 
116         setLevels(neededLevels);
117 
118         // resize levels
119         for (int level = 0; level < neededLevels; ++level)
120         {
121             assert(w != 0 && h != 0);
122             levels[level].size(w, h);
123             w  = (w + 0) >> 1;
124             h  = (h + 0) >> 1;
125         }
126     }
127 
128     ~this()
129     {
130         foreach(level; levels)
131             level.destroyFree();
132     }
133 
134     /// Interpolates a color between mipmap levels.  Floating-point level, spatial linear interpolation.
135     /// x and y are in base level coordinates (top-left pixel is on (0.5, 0.5) coordinates).
136     /// Clamped to borders.
137     auto linearMipmapSample(float level, float x, float y) nothrow @nogc
138     {
139         int ilevel = cast(int)level;
140         float flevel = level - ilevel;
141         vec4f levelN = linearSample(ilevel, x, y);
142         if (flevel == 0)
143             return levelN;
144 
145         auto levelNp1 = linearSample(ilevel + 1, x, y);
146 
147         return levelN * (1 - flevel) + levelNp1 * flevel;
148     }
149 
150     /// Cubic filtering mode, using a Catmull-Rom bicubic filter.
151     /// Integer level, spatial linear interpolation.
152     /// x and y are in base level coordinates (top-left pixel is on (0.5, 0.5) coordinates).
153     /// Clamped to borders.
154     /// Reference: https://registry.khronos.org/OpenGL/extensions/IMG/IMG_texture_filter_cubic.txt
155     auto cubicSample(int level, float x, float y) nothrow @nogc 
156     {
157         if (level < 0)
158             level = 0;
159         int numLevels = cast(int)levels.length;
160         if (level >= numLevels)
161             level = numLevels - 1;
162 
163         OwnedImage!COLOR image = levels[level];
164 
165         static immutable float[14] factors = [ 1.0f, 0.5f, 0.25f, 0.125f,
166                                                0.0625f, 0.03125f, 0.015625f, 0.0078125f,
167         0.00390625f, 0.001953125f, 0.0009765625f, 0.00048828125f,
168         0.000244140625f, 0.0001220703125f];
169 
170         float divider = factors[level];
171         x = x * divider - 0.5f;
172         y = y * divider - 0.5f;
173 
174         __m128 mm0123 = _mm_setr_ps(-1, 0, 1, 2);
175         __m128i x_indices = _mm_cvttps_epi32( _mm_set1_ps(x) + mm0123);
176         __m128i y_indices = _mm_cvttps_epi32( _mm_set1_ps(y) + mm0123);
177         __m128i zero = _mm_setzero_si128();
178         x_indices = _mm_max_epi32(x_indices, zero);
179         y_indices = _mm_max_epi32(y_indices, zero);
180         x_indices = _mm_min_epi32(x_indices, _mm_set1_epi32(image.w-1));
181         y_indices = _mm_min_epi32(y_indices, _mm_set1_epi32(image.h-1));
182 
183         int i0 = x_indices.array[0];
184         int i1 = x_indices.array[1];
185         int i2 = x_indices.array[2];
186         int i3 = x_indices.array[3];
187 
188         // fractional part
189         float a = x + 1.0f;
190         float b = y + 1.0f;
191         a = a - cast(int)(a);
192         b = b - cast(int)(b);
193         assert(a >= -0.01 && a <= 1.01);
194         assert(b >= -0.01 && b <= 1.01);
195 
196         COLOR*[4] L = void;
197         L[0] = image.scanlinePtr(y_indices.array[0]);
198         L[1] = image.scanlinePtr(y_indices.array[1]);
199         L[2] = image.scanlinePtr(y_indices.array[2]);
200         L[3] = image.scanlinePtr(y_indices.array[3]);
201 
202         static if (is(COLOR == L16))
203         {
204             static float clamp_0_to_65535(float a)
205             {
206                 if (a < 0) a = 0;
207                 if (a > 65535) a = 65535;
208                 return a;
209             }
210             static cubicInterp(float t, float x0, float x1, float x2, float x3) pure nothrow @nogc
211             {
212                 // PERF: doesn't sound that great???
213                 return x1 
214                     + t * ((-0.5f * x0) + (0.5f * x2))
215                     + t * t * (x0 - (2.5f * x1) + (2.0f * x2) - (0.5f * x3))
216                     + t * t * t * ((-0.5f * x0) + (1.5f * x1) - (1.5f * x2) + 0.5f * x3);
217             }
218 
219             float[4] R;
220             for (int row = 0; row < 4; ++row)
221             {
222                 COLOR* pRow = L[row];
223                 COLOR ri0jn = pRow[i0];
224                 COLOR ri1jn = pRow[i1];
225                 COLOR ri2jn = pRow[i2];
226                 COLOR ri3jn = pRow[i3];
227                 float A = ri0jn.l;
228                 float B = ri1jn.l;
229                 float C = ri2jn.l;
230                 float D = ri3jn.l;
231                 R[row] = cubicInterp(a, A, B, C, D);
232             }
233             return clamp_0_to_65535(cubicInterp(b, R[0], R[1], R[2], R[3]));
234         }
235         else
236         {
237             // actually optimized ok by LDC
238             static vec4f clamp_0_to_65535(vec4f a)
239             {
240                 if (a[0] < 0) a[0] = 0;
241                 if (a[1] < 0) a[1] = 0;
242                 if (a[2] < 0) a[2] = 0;
243                 if (a[3] < 0) a[3] = 0;
244                 if (a[0] > 65535) a[0] = 65535;
245                 if (a[1] > 65535) a[1] = 65535;
246                 if (a[2] > 65535) a[2] = 65535;
247                 if (a[3] > 65535) a[3] = 65535;
248                 return a;
249             }
250 
251             static cubicInterp(float t, vec4f x0, vec4f x1, vec4f x2, vec4f x3) pure nothrow @nogc
252             {
253                 // PERF: doesn't sound that great???
254                 return x1 
255                      + t * ((-0.5f * x0) + (0.5f * x2))
256                      + t * t * (x0 - (2.5f * x1) + (2.0f * x2) - (0.5f * x3))
257                      + t * t * t * ((-0.5f * x0) + (1.5f * x1) - (1.5f * x2) + 0.5f * x3);
258             }
259             vec4f[4] R = void;
260             for (int row = 0; row < 4; ++row)
261             {
262                 COLOR* pRow = L[row];
263                 COLOR ri0jn = pRow[i0];
264                 COLOR ri1jn = pRow[i1];
265                 COLOR ri2jn = pRow[i2];
266                 COLOR ri3jn = pRow[i3];
267                 vec4f A = vec4f(ri0jn.r, ri0jn.g, ri0jn.b, ri0jn.a);
268                 vec4f B = vec4f(ri1jn.r, ri1jn.g, ri1jn.b, ri1jn.a);
269                 vec4f C = vec4f(ri2jn.r, ri2jn.g, ri2jn.b, ri2jn.a);
270                 vec4f D = vec4f(ri3jn.r, ri3jn.g, ri3jn.b, ri3jn.a);
271                 R[row] = cubicInterp(a, A, B, C, D);
272             }
273             return clamp_0_to_65535(cubicInterp(b, R[0], R[1], R[2], R[3]));
274         }
275     }
276 
277 
278     /// Interpolates a color.  Integer level, spatial linear interpolation.
279     /// x and y are in base level coordinates (top-left pixel is on (0.5, 0.5) coordinates).
280     /// Clamped to borders.
281     auto linearSample(int level, float x, float y) nothrow @nogc
282     {
283         if (level < 0)
284             level = 0;
285         int numLevels = cast(int)levels.length;
286         if (level >= numLevels)
287             level = numLevels - 1;
288 
289         OwnedImage!COLOR image = levels[level];
290 
291 
292         static immutable float[14] factors = [ 1.0f, 0.5f, 0.25f, 0.125f,
293                                                0.0625f, 0.03125f, 0.015625f, 0.0078125f,
294                                                0.00390625f, 0.001953125f, 0.0009765625f, 0.00048828125f,
295                                                0.000244140625f, 0.0001220703125f];
296 
297         float divider = factors[level];
298         x = x * divider - 0.5f;
299         y = y * divider - 0.5f;
300 
301         if (x < 0)
302             x = 0;
303         if (y < 0)
304             y = 0;
305 
306         __m128 floatCoords = _mm_setr_ps(x, y, 0, 0);
307         __m128i truncatedCoord = _mm_cvttps_epi32(floatCoords);
308         int ix = truncatedCoord.array[0];
309         int iy = truncatedCoord.array[1];
310 
311         // Get fractional part
312         float fx = x - ix;
313         float fy = y - iy;
314 
315         const int maxX = image.w-1;
316         const int maxY = image.h-1;
317         if (ix > maxX)
318             ix = maxX;
319         if (iy > maxY)
320             iy = maxY;
321 
322         int ixp1 = ix + 1;
323         int iyp1 = iy + 1;
324         if (ixp1 > maxX)
325             ixp1 = maxX;
326         if (iyp1 > maxY)
327             iyp1 = maxY;  
328 
329         float fxm1 = 1 - fx;
330         float fym1 = 1 - fy;
331 
332         COLOR* L0 = image.scanlinePtr(iy);
333         COLOR* L1 = image.scanlinePtr(iyp1);
334 
335         COLOR A = L0[ix];
336         COLOR B = L0[ixp1];
337         COLOR C = L1[ix];
338         COLOR D = L1[ixp1];
339 
340         static if (is(COLOR == RGBA))
341         {
342             float inv255 = 1 / 255.0f;
343             version(LDC)
344             {
345                 int Ai = *cast(int*)(&A);
346                 int Bi = *cast(int*)(&B);
347                 int Ci = *cast(int*)(&C);
348                 int Di = *cast(int*)(&D);
349 
350                 __m128i mmZero = _mm_setzero_si128();
351                 __m128i mmABCD = _mm_setr_epi32(Ai, Bi, Ci, Di);
352 
353                 // Convert to float of the form (R, G, B, A)
354                 __m128i mmAB = _mm_unpacklo_epi8(mmABCD, mmZero);
355                 __m128i mmCD = _mm_unpackhi_epi8(mmABCD, mmZero);
356                 __m128 vA = _mm_cvtepi32_ps( _mm_unpacklo_epi16(mmAB, mmZero));
357                 __m128 vB = _mm_cvtepi32_ps( _mm_unpackhi_epi16(mmAB, mmZero));
358                 __m128 vC = _mm_cvtepi32_ps( _mm_unpacklo_epi16(mmCD, mmZero));
359                 __m128 vD = _mm_cvtepi32_ps( _mm_unpackhi_epi16(mmCD, mmZero));
360 
361                 __m128 vfx = _mm_set1_ps(fx);
362                 __m128 vfxm1 = _mm_set1_ps(fxm1);
363                 __m128 up = vA * vfxm1 + vB * vfx;
364                 __m128 down = vC * vfxm1 + vD * vfx;
365 
366                 __m128 vfy = _mm_set1_ps(fy);
367                 __m128 vfym1 = _mm_set1_ps(fym1);
368                 __m128 dResult = up * fym1 + down * fy;
369                 vec4f result = void;
370                 _mm_storeu_ps(result.ptr, dResult);
371                 return result;
372 
373             }
374             else version( AsmX86 )
375             {
376                 vec4f asmResult;
377 
378                 asm nothrow @nogc
379                 {
380                     movd XMM0, A;
381                     movd XMM1, B;
382                     movd XMM2, C;
383                     movd XMM3, D;
384                     pxor XMM4, XMM4;
385 
386                     punpcklbw XMM0, XMM4;
387                     punpcklbw XMM1, XMM4;
388                     punpcklbw XMM2, XMM4;
389                     punpcklbw XMM3, XMM4;
390 
391                     punpcklwd XMM0, XMM4;
392                     punpcklwd XMM1, XMM4;
393                     punpcklwd XMM2, XMM4;
394                     punpcklwd XMM3, XMM4;
395 
396                     cvtdq2ps XMM0, XMM0;
397                     cvtdq2ps XMM1, XMM1;
398 
399                     cvtdq2ps XMM2, XMM2;
400                     cvtdq2ps XMM3, XMM3;
401 
402                     movss XMM4, fxm1;
403                     pshufd XMM4, XMM4, 0;
404                     movss XMM5, fx;
405                     pshufd XMM5, XMM5, 0;
406 
407                     mulps XMM0, XMM4;
408                     mulps XMM1, XMM5;
409                     mulps XMM2, XMM4;
410                     mulps XMM3, XMM5;
411 
412                     movss XMM4, fym1;
413                     pshufd XMM4, XMM4, 0;
414                     movss XMM5, fy;
415                     pshufd XMM5, XMM5, 0;
416 
417                     addps XMM0, XMM1;
418                     addps XMM2, XMM3;
419 
420                     mulps XMM0, XMM4;
421                     mulps XMM2, XMM5;
422 
423                     addps XMM0, XMM2;
424 
425                     movups asmResult, XMM0;
426                 }
427 
428                 // Uncomment to check
429     /*
430                 vec4f vA = vec4f(A.r, A.g, A.b, A.a);
431                 vec4f vB = vec4f(B.r, B.g, B.b, B.a);
432                 vec4f vC = vec4f(C.r, C.g, C.b, C.a);
433                 vec4f vD = vec4f(D.r, D.g, D.b, D.a);
434 
435                 vec4f up = vA * fxm1 + vB * fx;
436                 vec4f down = vC * fxm1 + vD * fx;
437                 vec4f dResult = up * fym1 + down * fy;
438 
439                 import gfm.core;
440 
441                 if (dResult.distanceTo(result) < 1.0f)
442                     debugBreak();
443     */
444 
445                 vec4f result = asmResult;
446                 return result;
447             }
448             else
449             {
450                 vec4f vA = vec4f(A.r, A.g, A.b, A.a);
451                 vec4f vB = vec4f(B.r, B.g, B.b, B.a);
452                 vec4f vC = vec4f(C.r, C.g, C.b, C.a);
453                 vec4f vD = vec4f(D.r, D.g, D.b, D.a);
454 
455 
456 
457                 vec4f up = vA * fxm1 + vB * fx;
458                 vec4f down = vC * fxm1 + vD * fx;
459                 vec4f dResult = up * fym1 + down * fy;
460 
461               //  assert(dResult.distanceTo(asmResult) < 1.0f);
462 
463                 return dResult;
464             }
465         }
466         else static if (is(COLOR == L16))
467         {
468             float up = A.l * fxm1 + B.l * fx;
469             float down = C.l * fxm1 + D.l * fx;
470             return up * fym1 + down * fy;
471         }
472         else // RGBA16
473         {
474             vec4f vA = vec4f(A.r, A.g, A.b, A.a);
475             vec4f vB = vec4f(B.r, B.g, B.b, B.a);
476             vec4f vC = vec4f(C.r, C.g, C.b, C.a);
477             vec4f vD = vec4f(D.r, D.g, D.b, D.a);
478 
479             vec4f up = vA * fxm1 + vB * fx;
480             vec4f down = vC * fxm1 + vD * fx;
481             vec4f result = up * fym1 + down * fy;
482             return result;
483         }
484     }
485 
486     /// Returns: Width of the base level.
487     int width() pure const nothrow @nogc
488     {
489         return levels[0].w;
490     }
491 
492     /// Returns: Height of the base level.
493     int height() pure const nothrow @nogc
494     {
495         return levels[0].h;
496     }
497 
498     /// Returns: Number of levels. The maximum level is numLevels() - 1.
499     int numLevels() pure const nothrow @nogc
500     {
501         return cast(int)levels.length;
502     }
503 
504     /// Regenerates the whole upper levels.
505     void generateMipmaps(Quality quality) nothrow @nogc
506     {
507         box2i updateRect = box2i(0, 0, width(), height());
508         for (int level = 1; level < numLevels(); ++level)
509         {
510             // HACK: Force cubic filter past a level else it makes ugly looking mipmaps
511             if (level >= 3 && quality == Quality.box)
512                 quality = Quality.cubic;
513 
514             updateRect = generateNextLevel(quality, updateRect, level);
515         }
516     }
517 
518     /// Regenerates a single mipmap level based on changes in the provided rectangle (expressed in level 0 coordinates).
519     /// updateRect expressed in level 0 coordinates
520     /// In general if you have several subparts of mipmaps to update, make sure a level is fully completed
521     /// before computing the next one.
522     box2i generateNextLevel(Quality quality, box2i updateRectPreviousLevel, int level) nothrow @nogc
523     {
524         OwnedImage!COLOR previousLevel = levels[level - 1];
525         box2i updateRect = impactOnNextLevel(quality, updateRectPreviousLevel, previousLevel.w, previousLevel.h);
526         generateLevel(level, quality, updateRect);
527         return updateRect;
528     }
529 
530     /// Regenerates one level
531     /// updateRect expressed in level i-th coordinates
532     void generateLevel(int level, Quality quality, box2i updateRect) nothrow @nogc
533     {
534         assert(level > 0);
535         OwnedImage!COLOR thisLevel = levels[level];
536         OwnedImage!COLOR previousLevel = levels[level - 1];
537 
538         final switch(quality) with (Quality)
539         {
540             case box:
541 
542                 static if (is(COLOR == RGBA))
543                     generateLevelBoxRGBA(thisLevel, previousLevel, updateRect);
544                 else static if (is(COLOR == L16))
545                     generateLevelBoxL16(thisLevel, previousLevel, updateRect);
546                 else static if (is(COLOR == RGBA16))
547                     generateLevelBoxRGBA16(thisLevel, previousLevel, updateRect);
548                 else
549                     static assert(false, "not implemented");
550 
551                 enum checkBoxMipmaps = false;
552 
553                 static if (checkBoxMipmaps)
554                 {
555                     for (int y = updateRect.min.y; y < updateRect.max.y; ++y)
556                     {
557                         COLOR[] L0 = previousLevel.scanline(y * 2);
558                         COLOR[] L1 = previousLevel.scanline(y * 2 + 1);
559                         COLOR[] dest = thisLevel.scanline(y);
560 
561                         for (int x = updateRect.min.x; x < updateRect.max.x; ++x)
562                         {
563                             // A B
564                             // C D
565                             COLOR A = L0[2 * x];
566                             COLOR B = L0[2 * x + 1];
567                             COLOR C = L1[2 * x];
568                             COLOR D = L1[2 * x + 1];
569                             assert(dest[x] == COLOR.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D));
570                         }
571                     }
572                 }
573                 break;
574 
575         case boxAlphaCovIntoPremul:
576 
577             static if (is(COLOR == RGBA))
578             {
579                 generateLevelBoxAlphaCovIntoPremulRGBA(thisLevel, previousLevel, updateRect);
580                 break;
581             }
582             else
583                 assert(false);
584 
585         case cubic:
586             static if (is(COLOR == RGBA))
587             {
588                 generateLevelCubicRGBA(thisLevel, previousLevel, updateRect);
589                 break;
590             }
591             else static if (is(COLOR == L16))
592             {
593                 generateLevelCubicL16(thisLevel, previousLevel, updateRect);
594                 break;
595             }
596             else static if (is(COLOR == RGBA16))
597             {
598                 generateLevelCubicRGBA16(thisLevel, previousLevel, updateRect);
599                 break;
600             }
601             else
602                 static assert(false, "not implemented");
603 
604 
605         }
606     }
607 
608 
609 private:
610     /// Computes impact of updating the area box on next level
611     static box2i impactOnNextLevel(Quality quality, box2i area, int currentLevelWidth, int currentLevelHeight) pure nothrow @nogc
612     {
613         box2i maxArea = box2i(0, 0, currentLevelWidth / 2, currentLevelHeight / 2);
614 
615         final  switch(quality) with (Quality)
616         {
617         case box:
618         case boxAlphaCovIntoPremul:
619             int xmin = area.min.x / 2;
620             int ymin = area.min.y / 2;
621             int xmax = (area.max.x + 1) / 2;
622             int ymax = (area.max.y + 1) / 2;
623             return box2i(xmin, ymin, xmax, ymax).intersection(maxArea);
624 
625         case cubic:
626             int xmin = (area.min.x - 1) / 2;
627             int ymin = (area.min.y - 1) / 2;
628             int xmax = (area.max.x + 2) / 2;
629             int ymax = (area.max.y + 2) / 2;
630             return box2i(xmin, ymin, xmax, ymax).intersection(maxArea);
631         }
632 
633     }
634 }
635 
// Checks that `size` allocates the expected number of levels.
unittest
{
    // 256x256 with maxLevel 4: levels are 256, 128, 64, 32 and 16 pixels wide => 5 levels.
    Mipmap!RGBA a = new Mipmap!RGBA();
    a.size(4, 256, 256);
    assert(a.numLevels() == 5);
    a.destroy();

    // 17x333 with maxLevel 16: widths are 17, 8, 4, 2, 1, then 0,
    // so the pyramid stops early with 5 levels.
    Mipmap!L16 b = new Mipmap!L16();
    b.size(16, 17, 333);
    assert(b.numLevels() == 5);
    b.destroy();
}
646 
647 
648 private:
649 
// 16-byte aligned constant tables, each one filling a 128-bit SIMD register.
// NOTE(review): none is referenced in the visible part of this file; presumably they
// are operands of the level generation kernels further down - confirm before removing.

// Constants for 8 x 16-bit lanes.
align(16) static immutable short[8] xmmTwoShort = [2, 2, 2, 2, 2, 2, 2, 2];
align(16) static immutable short[8] xmm11113333 = [1, 1, 1, 1, 3, 3, 3, 3];
align(16) static immutable short[8] xmm33331111 = [3, 3, 3, 3, 1, 1, 1, 1];
align(16) static immutable short[8] xmm33339999 = [3, 3, 3, 3, 9, 9, 9, 9];
align(16) static immutable short[8] xmm99993333 = [9, 9, 9, 9, 3, 3, 3, 3];
align(16) static immutable short[8] xmm32       = [32, 32, 32, 32, 32, 32, 32, 32];

// Constants for 4 x 32-bit lanes.
align(16) static immutable int[4] xmmTwoInt = [2, 2, 2, 2];
align(16) static immutable int[4] xmm512 = [512, 512, 512, 512];
align(16) static immutable float[4] xmm0_5 = [0.5f, 0.5f, 0.5f, 0.5f];
659 
660 
/// Box-filters an area of an RGBA level out of the previous level.
/// Each output pixel is, per channel, `(a + b + c + d + 2) >> 2` of the matching 2x2 source patch.
/// Params:
///     thisLevel = Level being written.
///     previousLevel = Level being read, twice as large.
///     updateRect = Area to regenerate, in `thisLevel` coordinates.
void generateLevelBoxRGBA(OwnedImage!RGBA thisLevel,
                          OwnedImage!RGBA previousLevel,
                          box2i updateRect) pure nothrow @nogc
{
    const int outW = updateRect.width();
    const int outH = updateRect.height();
    const int x0 = updateRect.min.x;

    __m128i zero = _mm_setzero_si128();
    __m128i two = _mm_set1_epi16(2);

    // PERF: enable later, an AVX variant of the loop below (4 output pixels per iteration,
    // 256-bit loads) is faster on a full mipmap even without AVX2.
    // Requires a somewhat recent intel-intrinsics though.

    for (int row = 0; row < outH; ++row)
    {
        const int dstY = updateRect.min.y + row;
        RGBA* srcTop    = previousLevel.scanlinePtr(dstY * 2    ) + x0 * 2;
        RGBA* srcBottom = previousLevel.scanlinePtr(dstY * 2 + 1) + x0 * 2;
        RGBA* dst       =     thisLevel.scanlinePtr(dstY)         + x0;

        // SIMD part: two output pixels per iteration.
        int col = 0;
        while (col + 1 < outW)
        {
            // pixel patches:
            // A B E F   Goal = (A + B + C + D + 2) / 4   => res
            // C D G H          (E + F + G + H + 2) / 4   => res+1
            //
            __m128i top    = _mm_loadu_si128(cast(const(__m128i)*) &srcTop[2*col]);
            __m128i bottom = _mm_loadu_si128(cast(const(__m128i)*) &srcBottom[2*col]);

            // Widen to 16-bit and add vertically.
            __m128i vertAB = _mm_add_epi16(_mm_unpacklo_epi8(top, zero),
                                           _mm_unpacklo_epi8(bottom, zero)); // A + C   B + D
            __m128i vertEF = _mm_add_epi16(_mm_unpackhi_epi8(top, zero),
                                           _mm_unpackhi_epi8(bottom, zero)); // E + G   F + H

            // Regroup left and right columns, then add horizontally.
            __m128i leftCols  = _mm_unpacklo_epi64(vertAB, vertEF);          // A+C  E+G
            __m128i rightCols = _mm_unpackhi_epi64(vertAB, vertEF);          // B+D  F+H
            __m128i total = _mm_add_epi16(leftCols, rightCols);              // A+B+C+D   E+F+G+H
            total = _mm_add_epi16(total, two);                               // rounding
            total = _mm_srai_epi16(total, 2);                                // divide by 4

            // Back to 8-bit, only the low 8 bytes (2 pixels) are stored.
            _mm_storeu_si64(&dst[col], _mm_packus_epi16(total, zero));
            col += 2;
        }

        // Scalar part: remaining pixel, if any.
        while (col < outW)
        {
            const int sx = 2 * col;
            RGBA A = srcTop[sx];
            RGBA B = srcTop[sx + 1];
            RGBA C = srcBottom[sx];
            RGBA D = srcBottom[sx + 1];
            dst[col] = RGBA.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D);
            ++col;
        }
    }
}
748 
/// Box-filters an area of a L16 level out of the previous level.
/// Each output pixel is `(a + b + c + d + 2) >> 2` of the matching 2x2 source patch.
/// Params:
///     thisLevel = Level being written.
///     previousLevel = Level being read, twice as large.
///     updateRect = Area to regenerate, in `thisLevel` coordinates.
void generateLevelBoxL16(OwnedImage!L16 thisLevel,
                         OwnedImage!L16 previousLevel,
                         box2i updateRect) pure nothrow @nogc
{
    const int outW = updateRect.width();
    const int outH = updateRect.height();
    const int x0 = updateRect.min.x;

    foreach (row; 0 .. outH)
    {
        const int dstY = updateRect.min.y + row;
        L16* srcTop    = previousLevel.scanlinePtr(dstY * 2    ) + x0 * 2;
        L16* srcBottom = previousLevel.scanlinePtr(dstY * 2 + 1) + x0 * 2;
        L16* dst       =     thisLevel.scanlinePtr(dstY)         + x0;

        // Fun performance fact: for this loop (LDC 1.33, arch x86_64), assembly is slower than intrinsics, 
        // themselves slower than normal D code.
        foreach (col; 0 .. outW)
        {
            // A B
            // C D
            const int sx = 2 * col;
            L16 A = srcTop[sx];
            L16 B = srcTop[sx + 1];
            L16 C = srcBottom[sx];
            L16 D = srcBottom[sx + 1];

            dst[col] = L16.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D);
        }
    }
}
780 
/// Box-filters an area of a RGBA16 level out of the previous level.
/// Each output pixel is `(a + b + c + d + 2) >> 2` of the matching 2x2 source patch.
/// Params:
///     thisLevel = Level being written.
///     previousLevel = Level being read, twice as large.
///     updateRect = Area to regenerate, in `thisLevel` coordinates.
void generateLevelBoxRGBA16(OwnedImage!RGBA16 thisLevel,
                            OwnedImage!RGBA16 previousLevel,
                            box2i updateRect) pure nothrow @nogc
{
    // untested and unused for now
    const int outW = updateRect.width();
    const int outH = updateRect.height();
    const int x0 = updateRect.min.x;

    foreach (row; 0 .. outH)
    {
        const int dstY = updateRect.min.y + row;
        RGBA16* srcTop    = previousLevel.scanlinePtr(dstY * 2    ) + x0 * 2;
        RGBA16* srcBottom = previousLevel.scanlinePtr(dstY * 2 + 1) + x0 * 2;
        RGBA16* dst       =     thisLevel.scanlinePtr(dstY)         + x0;

        foreach (col; 0 .. outW)
        {
            // A B
            // C D
            const int sx = 2 * col;
            RGBA16 A = srcTop[sx];
            RGBA16 B = srcTop[sx + 1];
            RGBA16 C = srcBottom[sx];
            RGBA16 D = srcBottom[sx + 1];

            dst[col] = RGBA16.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D);
        }
    }
}
807 
/// Generates part of an 8-bit RGBA mipmap level from 2x2 blocks of `previousLevel`,
/// weighting every color channel by the alpha of its source pixel.
///
/// Default build: each color channel is the sum of `channel * alpha` over the four
/// source pixels, and the alpha channel is the sum of `alpha * alpha`; each sum is
/// then reduced with `(sum + 512) >> 10`.
///
/// With `version(futurePBREmissive)`: each color channel is normalized to 0..1,
/// squared (an approximate gamma-to-linear conversion), multiplied by the normalized
/// alpha, then the four pixels are averaged in floating point and scaled back to
/// 0..255. Alpha is averaged without squaring.
///
/// Params:
///     thisLevel     = Destination level, written inside `updateRect` only.
///     previousLevel = Source level, read at doubled coordinates.
///     updateRect    = Area to regenerate, expressed in `thisLevel` coordinates.
void generateLevelBoxAlphaCovIntoPremulRGBA(OwnedImage!RGBA thisLevel,
                                            OwnedImage!RGBA previousLevel,
                                            box2i updateRect) nothrow @nogc
{
    int firstCol = updateRect.min.x;
    int firstRow = updateRect.min.y;
    int numCols = updateRect.width();
    int numRows = updateRect.height();

    foreach (int row; 0 .. numRows)
    {
        int dstY = firstRow + row;
        RGBA* srcTop    = previousLevel.scanlinePtr(dstY * 2) + firstCol * 2;
        RGBA* srcBottom = previousLevel.scanlinePtr(dstY * 2 + 1) + firstCol * 2;
        RGBA* outRow    = thisLevel.scanlinePtr(dstY) + firstCol;

        version(futurePBREmissive)
        {
            // Note: basically very hard to beat with intrinsics.
            // Hours lost trying to do that: 4.
            // Neither float or integer intrinsics shenanigans do better than this plain code.

            // This is only approximate: the gamma decoding is a plain square.
            static RGBAf toLinearPremul(RGBA c)
            {
                enum float inv_255 = 1.0f / 255;
                RGBAf lin;
                lin.a = c.a * inv_255; // alpha is linear
                lin.r = c.r * inv_255 * c.r * inv_255 * lin.a;
                lin.g = c.g * inv_255 * c.g * inv_255 * lin.a;
                lin.b = c.b * inv_255 * c.b * inv_255 * lin.a;
                return lin;
            }

            foreach (int col; 0 .. numCols)
            {
                int srcX = col * 2;
                RGBA p00 = srcTop[srcX];
                RGBA p01 = srcTop[srcX + 1];
                RGBA p10 = srcBottom[srcX];
                RGBA p11 = srcBottom[srcX + 1];

                // Convert the four pixels into linear, alpha-premultiplied floats.
                RGBAf q00 = toLinearPremul(p00);
                RGBAf q01 = toLinearPremul(p01);
                RGBAf q10 = toLinearPremul(p10);
                RGBAf q11 = toLinearPremul(p11);

                // Plain sums; the division by 4 happens together with the rescale below.
                float sumR = q00.r + q01.r + q10.r + q11.r;
                float sumG = q00.g + q01.g + q10.g + q11.g;
                float sumB = q00.b + q01.b + q10.b + q11.b;
                float sumA = q00.a + q01.a + q10.a + q11.a;

                outRow[col] = RGBA( cast(ubyte)(sumR * 0.25f * 255.0f + 0.5f),
                                    cast(ubyte)(sumG * 0.25f * 255.0f + 0.5f),
                                    cast(ubyte)(sumB * 0.25f * 255.0f + 0.5f),
                                    cast(ubyte)(sumA * 0.25f * 255.0f + 0.5f) );
            }
        }
        else
        {
            foreach (int col; 0 .. numCols)
            {
                int srcX = col * 2;
                RGBA p00 = srcTop[srcX];
                RGBA p01 = srcTop[srcX + 1];
                RGBA p10 = srcBottom[srcX];
                RGBA p11 = srcBottom[srcX + 1];

                // Alpha-weighted sums of the four source pixels.
                int sumR = p00.r * p00.a + p01.r * p01.a + p10.r * p10.a + p11.r * p11.a;
                int sumG = p00.g * p00.a + p01.g * p01.a + p10.g * p10.a + p11.g * p11.a;
                int sumB = p00.b * p00.a + p01.b * p01.a + p10.b * p10.a + p11.b * p11.a;
                int sumA = p00.a * p00.a + p01.a * p01.a + p10.a * p10.a + p11.a * p11.a;

                // Rounded shift by 10 bits brings the sums back into ubyte range.
                outRow[col] = RGBA( cast(ubyte)((sumR + 512) >> 10),
                                    cast(ubyte)((sumG + 512) >> 10),
                                    cast(ubyte)((sumB + 512) >> 10),
                                    cast(ubyte)((sumA + 512) >> 10) );
            }
        }
    }
}
885 
/// Generates part of an 8-bit RGBA mipmap level with a 4x4 kernel:
///
///     1 3 3 1
///     3 9 9 3      / 64
///     3 9 9 3
///     1 3 3 1
///
/// The outer row/column taps are clamped to the borders of `previousLevel`.
/// Params:
///     thisLevel     = Destination level, written inside `updateRect` only.
///     previousLevel = Source level, read around doubled coordinates.
///     updateRect    = Area to regenerate, expressed in `thisLevel` coordinates.
void generateLevelCubicRGBA(OwnedImage!RGBA thisLevel,
                            OwnedImage!RGBA previousLevel,
                            box2i updateRect) nothrow @nogc
{
    // Last valid source coordinates, used to clamp the outermost taps.
    const int lastSrcX = previousLevel.w - 1;
    const int lastSrcY = previousLevel.h - 1;

    foreach (int y; updateRect.min.y .. updateRect.max.y)
    {
        const int rawAbove = 2 * y - 1;
        const int rowAbove = rawAbove < 0 ? 0 : rawAbove;

        const int rawBelow = 2 * y + 2;
        const int rowBelow = rawBelow > lastSrcY ? lastSrcY : rawBelow;

        RGBA* lineM1 = previousLevel.scanlinePtr(rowAbove);
        RGBA* line0  = previousLevel.scanlinePtr(y * 2);
        RGBA* line1  = previousLevel.scanlinePtr(y * 2 + 1);
        RGBA* line2  = previousLevel.scanlinePtr(rowBelow);
        RGBA* dest   = thisLevel.scanlinePtr(y);

        foreach (int x; updateRect.min.x .. updateRect.max.x)
        {
            // Taps, named by (row, column):
            // p00 p01 p02 p03
            // p10 p11 p12 p13
            // p20 p21 p22 p23
            // p30 p31 p32 p33

            const int rawLeft = 2 * x - 1;
            const int xLeft = rawLeft < 0 ? 0 : rawLeft;
            const int xMid = 2 * x;
            const int rawRight = 2 * x + 2;
            const int xRight = rawRight > lastSrcX ? lastSrcX : rawRight;

            static if (true)
            {
                // Gather the 4x4 neighbourhood row by row, so that each row of
                // four pixels can be fetched with one aligned 128-bit load.
                align(16) RGBA[16] taps = void;
                taps[0]  = lineM1[xLeft];
                taps[1]  = lineM1[xMid];
                taps[2]  = lineM1[xMid + 1];
                taps[3]  = lineM1[xRight];
                taps[4]  = line0[xLeft];
                taps[5]  = line0[xMid];
                taps[6]  = line0[xMid + 1];
                taps[7]  = line0[xRight];
                taps[8]  = line1[xLeft];
                taps[9]  = line1[xMid];
                taps[10] = line1[xMid + 1];
                taps[11] = line1[xRight];
                taps[12] = line2[xLeft];
                taps[13] = line2[xMid];
                taps[14] = line2[xMid + 1];
                taps[15] = line2[xRight];

                const __m128i zero = _mm_setzero_si128();

                // Note: no coefficients improvements really convince.
                // This was Issue #827, read for more context.

                // Low 4 lanes weight the outer column pair, high 4 lanes the inner one.
                const __m128i weightsOuterRows = _mm_setr_epi16(1, 1, 1, 1, 3, 3, 3, 3);
                const __m128i weightsInnerRows = _mm_setr_epi16(3, 3, 3, 3, 9, 9, 9, 9);

                __m128i row0 = _mm_load_si128(cast(const(__m128i*)) &taps[0]);
                __m128i row1 = _mm_load_si128(cast(const(__m128i*)) &taps[4]);
                __m128i row2 = _mm_load_si128(cast(const(__m128i*)) &taps[8]);
                __m128i row3 = _mm_load_si128(cast(const(__m128i*)) &taps[12]);

                // Widen every 8-bit channel to 16-bit, two pixels per register.
                __m128i row0Lo = _mm_unpacklo_epi8(row0, zero); // p00 p01
                __m128i row0Hi = _mm_unpackhi_epi8(row0, zero); // p02 p03
                __m128i row1Lo = _mm_unpacklo_epi8(row1, zero); // p10 p11
                __m128i row1Hi = _mm_unpackhi_epi8(row1, zero); // p12 p13
                __m128i row2Lo = _mm_unpacklo_epi8(row2, zero); // p20 p21
                __m128i row2Hi = _mm_unpackhi_epi8(row2, zero); // p22 p23
                __m128i row3Lo = _mm_unpacklo_epi8(row3, zero); // p30 p31
                __m128i row3Hi = _mm_unpackhi_epi8(row3, zero); // p32 p33

                // Rows 0 and 3 share the same weights, and so do rows 1 and 2:
                // adding them first avoids a few multiplications.
                __m128i outerLo = _mm_add_epi16(row0Lo, row3Lo);
                __m128i outerHi = _mm_add_epi16(row0Hi, row3Hi);
                __m128i innerLo = _mm_add_epi16(row1Lo, row2Lo);
                __m128i innerHi = _mm_add_epi16(row1Hi, row2Hi);

                // Columns 0/3 and 1/2 also share weights: swap the quadwords of the
                // high halves so mirrored columns line up, which avoids two more muls.
                __m128i outer = _mm_add_epi16(outerLo, _mm_shuffle_epi32!0x4e(outerHi));
                __m128i inner = _mm_add_epi16(innerLo, _mm_shuffle_epi32!0x4e(innerHi));

                // PERF: we can win a few mul here
                __m128i acc = _mm_add_epi16(_mm_mullo_epi16(outer, weightsOuterRows),
                                            _mm_mullo_epi16(inner, weightsInnerRows));

                // Fold the high quadword onto the low one: one weighted sum per channel.
                acc = _mm_add_epi16(acc, _mm_srli_si128!8(acc));

                // Rounded division by 64, then pack back to 8-bit and store one pixel.
                acc = _mm_add_epi16(acc, _mm_set1_epi16(32));
                acc = _mm_srli_epi16(acc, 6);
                _mm_storeu_si32(dest + x, _mm_packus_epi16(acc, zero));
            }
            else
            {
                // Scalar version of the same filter.
                RGBA p00 = lineM1[xLeft];
                RGBA p01 = lineM1[xMid];
                RGBA p02 = lineM1[xMid + 1];
                RGBA p03 = lineM1[xRight];

                RGBA p10 = line0[xLeft];
                RGBA p11 = line0[xMid];
                RGBA p12 = line0[xMid + 1];
                RGBA p13 = line0[xRight];

                RGBA p20 = line1[xLeft];
                RGBA p21 = line1[xMid];
                RGBA p22 = line1[xMid + 1];
                RGBA p23 = line1[xRight];

                RGBA p30 = line2[xLeft];
                RGBA p31 = line2[xMid];
                RGBA p32 = line2[xMid + 1];
                RGBA p33 = line2[xRight];

                // corners * 1 + edges * 3 + center * 9
                int rSum = (p00.r + p03.r + p30.r + p33.r)
                         + 3 * (p01.r + p02.r + p10.r + p13.r + p20.r + p23.r + p31.r + p32.r)
                         + 9 * (p11.r + p12.r + p21.r + p22.r);
                int gSum = (p00.g + p03.g + p30.g + p33.g)
                         + 3 * (p01.g + p02.g + p10.g + p13.g + p20.g + p23.g + p31.g + p32.g)
                         + 9 * (p11.g + p12.g + p21.g + p22.g);
                int bSum = (p00.b + p03.b + p30.b + p33.b)
                         + 3 * (p01.b + p02.b + p10.b + p13.b + p20.b + p23.b + p31.b + p32.b)
                         + 9 * (p11.b + p12.b + p21.b + p22.b);
                int aSum = (p00.a + p03.a + p30.a + p33.a)
                         + 3 * (p01.a + p02.a + p10.a + p13.a + p20.a + p23.a + p31.a + p32.a)
                         + 9 * (p11.a + p12.a + p21.a + p22.a);

                dest[x].r = cast(ubyte)((rSum + 32) >> 6);
                dest[x].g = cast(ubyte)((gSum + 32) >> 6);
                dest[x].b = cast(ubyte)((bSum + 32) >> 6);
                dest[x].a = cast(ubyte)((aSum + 32) >> 6);
            }
        }
    }
}
1025 
/// Generates part of a 16-bit luminance mipmap level with a 4x4 kernel:
///
///     1 3 3 1
///     3 9 9 3      / 64
///     3 9 9 3
///     1 3 3 1
///
/// The outer row/column taps are clamped to the borders of `previousLevel`.
/// Params:
///     thisLevel     = Destination level, written inside `updateRect` only.
///     previousLevel = Source level, read around doubled coordinates.
///     updateRect    = Area to regenerate, expressed in `thisLevel` coordinates.
void generateLevelCubicL16(OwnedImage!L16 thisLevel,
                           OwnedImage!L16 previousLevel,
                           box2i updateRect) nothrow @nogc
{
    // Last valid source coordinates, used to clamp the outermost taps.
    const int lastSrcX = previousLevel.w - 1;
    const int lastSrcY = previousLevel.h - 1;

    foreach (int y; updateRect.min.y .. updateRect.max.y)
    {
        const int rawAbove = 2 * y - 1;
        const int rowAbove = rawAbove < 0 ? 0 : rawAbove;

        const int rawBelow = 2 * y + 2;
        const int rowBelow = rawBelow > lastSrcY ? lastSrcY : rawBelow;

        L16* lineM1 = previousLevel.scanlinePtr(rowAbove);
        L16* line0  = previousLevel.scanlinePtr(y * 2);
        L16* line1  = previousLevel.scanlinePtr(y * 2 + 1);
        L16* line2  = previousLevel.scanlinePtr(rowBelow);
        L16* dest   = thisLevel.scanlinePtr(y);

        foreach (int x; updateRect.min.x .. updateRect.max.x)
        {
            const int rawLeft = 2 * x - 1;
            const int xLeft = rawLeft < 0 ? 0 : rawLeft;
            const int xMid = 2 * x;
            const int rawRight = 2 * x + 2;
            const int xRight = rawRight > lastSrcX ? lastSrcX : rawRight;

            // The kernel is separable: filter each source row with [1 3 3 1] first...
            int hM1 = lineM1[xLeft].l + 3 * (lineM1[xMid].l + lineM1[xMid + 1].l) + lineM1[xRight].l;
            int h0  = line0[xLeft].l  + 3 * (line0[xMid].l  + line0[xMid + 1].l)  + line0[xRight].l;
            int h1  = line1[xLeft].l  + 3 * (line1[xMid].l  + line1[xMid + 1].l)  + line1[xRight].l;
            int h2  = line2[xLeft].l  + 3 * (line2[xMid].l  + line2[xMid + 1].l)  + line2[xRight].l;

            // ...then combine the four row results with [1 3 3 1] vertically.
            int depthSum = (hM1 + h2) + 3 * (h0 + h1);

            // Rounded division by 64, the sum of all kernel weights.
            dest[x].l = cast(ushort)((depthSum + 32) >> 6);
        }
    }
}
1094 
/// Generates part of a 16-bit RGBA mipmap level with a 4x4 kernel:
///
///     1 3 3 1
///     3 9 9 3      / 64
///     3 9 9 3
///     1 3 3 1
///
/// The outer row/column taps are clamped to the borders of `previousLevel`.
/// Params:
///     thisLevel     = Destination level, written inside `updateRect` only.
///     previousLevel = Source level, read around doubled coordinates.
///     updateRect    = Area to regenerate, expressed in `thisLevel` coordinates.
void generateLevelCubicRGBA16(OwnedImage!RGBA16 thisLevel,
                              OwnedImage!RGBA16 previousLevel,
                              box2i updateRect) nothrow @nogc
{
    // untested and unused for now

    // 1-D kernel; the weight of a tap is the product of its row and column entries.
    static immutable int[4] kernel = [1, 3, 3, 1];

    // Last valid source coordinates, used to clamp the outermost taps.
    const int lastSrcX = previousLevel.w - 1;
    const int lastSrcY = previousLevel.h - 1;

    foreach (int y; updateRect.min.y .. updateRect.max.y)
    {
        const int rawAbove = 2 * y - 1;
        const int rowAbove = rawAbove < 0 ? 0 : rawAbove;

        const int rawBelow = 2 * y + 2;
        const int rowBelow = rawBelow > lastSrcY ? lastSrcY : rawBelow;

        // The four source scanlines covered by the kernel, top to bottom.
        RGBA16*[4] lines = void;
        lines[0] = previousLevel.scanlinePtr(rowAbove);
        lines[1] = previousLevel.scanlinePtr(y * 2);
        lines[2] = previousLevel.scanlinePtr(y * 2 + 1);
        lines[3] = previousLevel.scanlinePtr(rowBelow);
        RGBA16* dest = thisLevel.scanlinePtr(y);

        foreach (int x; updateRect.min.x .. updateRect.max.x)
        {
            const int rawLeft = 2 * x - 1;
            const int rawRight = 2 * x + 2;

            // The four source columns covered by the kernel, left to right.
            int[4] cols = void;
            cols[0] = rawLeft < 0 ? 0 : rawLeft;
            cols[1] = 2 * x;
            cols[2] = 2 * x + 1;
            cols[3] = rawRight > lastSrcX ? lastSrcX : rawRight;

            int rSum = 0;
            int gSum = 0;
            int bSum = 0;
            int aSum = 0;

            foreach (int j; 0 .. 4)
            {
                RGBA16* line = lines[j];
                foreach (int i; 0 .. 4)
                {
                    RGBA16 tap = line[cols[i]];
                    int weight = kernel[j] * kernel[i];
                    rSum += weight * tap.r;
                    gSum += weight * tap.g;
                    bSum += weight * tap.b;
                    aSum += weight * tap.a;
                }
            }

            // Rounded division by 64, the sum of all kernel weights.
            dest[x].r = cast(ushort)((rSum + 32) >> 6);
            dest[x].g = cast(ushort)((gSum + 32) >> 6);
            dest[x].b = cast(ushort)((bSum + 32) >> 6);
            dest[x].a = cast(ushort)((aSum + 32) >> 6);
        }
    }
}
1168 
unittest
{
    // Compile-time smoke test: only declares references to force both template
    // instantiations. No Mipmap object is constructed here.
    Mipmap!L16 depthPyramid;
    Mipmap!RGBA colorPyramid;
}