1 /**
2 Mipmap pyramid implementation.
3
4 Copyright: Guillaume Piolat 2015-2023.
5 License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 */
7 module dplug.graphics.mipmap;
8
9 import dplug.math.vector;
10 import dplug.math.box;
11 import dplug.graphics.image;
12 import dplug.core.nogc;
13 import dplug.core.vec;
14
15 import inteli.smmintrin;
16
// Define the `AsmX86` version identifier whenever the compiler provides D inline
// assembly for 32-bit or 64-bit x86. The RGBA path of `Mipmap.linearSample`
// has an `asm` variant gated on this identifier.
version( D_InlineAsm_X86 )
{
    version = AsmX86;
}
else version( D_InlineAsm_X86_64 )
{
    version = AsmX86;
}
25
26 /// Mipmapped images.
27 /// Supports non power-of-two textures.
28 /// Size of the i+1-th mipmap is { (width)/2, (height)/2 }
29 /// The mipmap owns each of its levels.
30 final class Mipmap(COLOR) if (is(COLOR == RGBA) || is(COLOR == L16) || is(COLOR == RGBA16) )
31 {
32 public:
33 nothrow:
34 @nogc:
35
/// Downsampling filter used to build one level from the previous one.
enum Quality
{
    box,   // simple 2x2 filter, creates phase problems with NPOT. For higher levels (>= 3), `generateMipmaps` automatically uses cubic.
    cubic, // Very smooth 4x4 kernel [1 3 3 1] x [1 3 3 1] / 64 (see the generateLevelCubic* functions)

    /// Box-filter, and after such a step the next level is alpha-premultiplied.
    /// This is intended for the first level 0 to level 1 transition, in case of bloom.
    /// Within version(futurePBREmissive), this also transitions to linear space to have
    /// more natural highlights.
    /// Only implemented for `RGBA`; `generateLevel` asserts for other pixel types.
    boxAlphaCovIntoPremul,
}
47
48 Vec!(OwnedImage!COLOR) levels;
49
/// Constructs a mipmap that holds no level yet.
/// Call `size` afterwards to allocate the pyramid.
this()
{
    this.levels = makeVec!(OwnedImage!COLOR)();
}
55
/// Constructs a mipmap and immediately allocates its levels.
/// Params:
///     maxLevel = index of the last wanted level: 0 => only the base image,
///                1 => base image + one 2x downsampled level, and so on.
///     w = width of the base level.
///     h = height of the base level.
this(int maxLevel, int w, int h)
{
    this();
    this.size(maxLevel, w, h);
}
65
66
/// Builds a Mipmap whose base level is an already existing flat OwnedImage.
/// Ownership of `level0` is transferred: the `Mipmap` frees it along with the other levels.
/// Params:
///     maxLevel = index of the last wanted level (0 => only the base image).
///     level0 = image installed as level 0; its dimensions size the whole pyramid.
this(int maxLevel, OwnedImage!COLOR level0)
{
    // PERF: a level 0 image is allocated here only to be thrown away just below.
    this(maxLevel, level0.w, level0.h);

    // Install the caller's image as level 0, then release the placeholder it replaces.
    OwnedImage!COLOR placeholder = levels[0];
    levels[0] = level0;
    placeholder.destroyFree();
}
79
/// Allocates the levels of the pyramid and sizes each of them.
/// Level 0 is `w` x `h`; each following level halves both dimensions (rounding down).
/// Levels stop at `maxLevel`, or earlier as soon as a dimension reaches zero.
/// Reducing the number of levels of an existing mipmap is not supported (asserts).
void size(int maxLevel, int w, int h)
{
    // Count how many levels are needed.
    int neededLevels = 0;
    int probeW = w;
    int probeH = h;
    while (neededLevels <= maxLevel && probeW != 0 && probeH != 0)
    {
        probeW >>= 1;
        probeH >>= 1;
        ++neededLevels;
    }

    // FUTURE: cleanup excess levels
    // should not happen until we have resizing
    if (neededLevels < levels.length)
        assert(false);

    // Grow the level list, and create an empty image for every new slot.
    immutable int firstNewLevel = cast(int) levels.length;
    levels.resize(neededLevels);
    foreach (int level; firstNewLevel .. neededLevels)
        levels[level] = mallocNew!(OwnedImage!COLOR)();

    // Give each level its dimensions.
    int levelW = w;
    int levelH = h;
    foreach (int level; 0 .. neededLevels)
    {
        assert(levelW != 0 && levelH != 0);
        levels[level].size(levelW, levelH);
        levelW >>= 1;
        levelH >>= 1;
    }
}
127
/// Frees every level, since the mipmap owns them.
~this()
{
    foreach (ownedLevel; levels)
    {
        ownedLevel.destroyFree();
    }
}
133
/// Interpolates a color between mipmap levels. Floating-point level, spatial linear interpolation.
/// x and y are in base level coordinates (top-left pixel is on (0.5, 0.5) coordinates).
/// Clamped to borders.
/// `level` is clamped to [0, numLevels - 1] before sampling, so that out-of-range
/// levels return the nearest existing level instead of an extrapolated blend.
/// Returns: The blend of `linearSample` at the two nearest integer levels.
auto linearMipmapSample(float level, float x, float y) nothrow @nogc
{
    // Clamp the level first. Without this, a level in (-1, 0) truncates to 0 with a
    // negative fractional part, which extrapolates (weights > 1 and < 0), and a huge
    // level overflows the float-to-int conversion.
    float lastLevel = cast(float)(cast(int)levels.length - 1);
    if (level > lastLevel)
        level = lastLevel;
    if (level < 0)
        level = 0;

    int ilevel = cast(int)level;
    float flevel = level - ilevel; // fractional part, in [0, 1)
    vec4f levelN = linearSample(ilevel, x, y);
    if (flevel == 0)
        return levelN; // exactly on a level: a single sample is enough

    auto levelNp1 = linearSample(ilevel + 1, x, y);

    return levelN * (1 - flevel) + levelNp1 * flevel;
}
149
/// Cubic filtering mode, using a Catmull-Rom bicubic filter.
/// Integer level, spatial cubic interpolation over a 4x4 pixel neighbourhood.
/// x and y are in base level coordinates (top-left pixel is on (0.5, 0.5) coordinates).
/// Clamped to borders. `level` is clamped to the existing levels.
/// Returns: A `float` for `L16`, else a `vec4f` made of (r, g, b, a).
///          The result is clamped to [0, 65535] whatever `COLOR` is.
/// Reference: https://registry.khronos.org/OpenGL/extensions/IMG/IMG_texture_filter_cubic.txt
auto cubicSample(int level, float x, float y) nothrow @nogc
{
    if (level < 0)
        level = 0;
    int numLevels = cast(int)levels.length;
    if (level >= numLevels)
        level = numLevels - 1;

    OwnedImage!COLOR image = levels[level];

    // 2^-level, for levels 0 to 13.
    static immutable float[14] factors = [ 1.0f, 0.5f, 0.25f, 0.125f,
                                           0.0625f, 0.03125f, 0.015625f, 0.0078125f,
                                           0.00390625f, 0.001953125f, 0.0009765625f, 0.00048828125f,
                                           0.000244140625f, 0.0001220703125f];

    // Scale from base-level coordinates to coordinates of this level.
    // The table stops at level 13: deeper levels compute the (exact) power of two
    // rather than indexing past the end of the table.
    float divider = (level < cast(int)factors.length) ? factors[level]
                                                      : 1.0f / cast(float)(1 << level);
    x = x * divider - 0.5f;
    y = y * divider - 0.5f;

    // Indices of the 4 columns and 4 rows of the neighbourhood, clamped to the image.
    __m128 mm0123 = _mm_setr_ps(-1, 0, 1, 2);
    __m128i x_indices = _mm_cvttps_epi32( _mm_set1_ps(x) + mm0123);
    __m128i y_indices = _mm_cvttps_epi32( _mm_set1_ps(y) + mm0123);
    __m128i zero = _mm_setzero_si128();
    x_indices = _mm_max_epi32(x_indices, zero);
    y_indices = _mm_max_epi32(y_indices, zero);
    x_indices = _mm_min_epi32(x_indices, _mm_set1_epi32(image.w-1));
    y_indices = _mm_min_epi32(y_indices, _mm_set1_epi32(image.h-1));

    int i0 = x_indices.array[0];
    int i1 = x_indices.array[1];
    int i2 = x_indices.array[2];
    int i3 = x_indices.array[3];

    // fractional part
    // (1 is added before truncation so that the truncated value is not negative
    //  when x or y lies in [-0.5, 0))
    float a = x + 1.0f;
    float b = y + 1.0f;
    a = a - cast(int)(a);
    b = b - cast(int)(b);
    assert(a >= -0.01 && a <= 1.01);
    assert(b >= -0.01 && b <= 1.01);

    // The 4 scanlines of the neighbourhood.
    COLOR*[4] L = void;
    L[0] = image.scanlinePtr(y_indices.array[0]);
    L[1] = image.scanlinePtr(y_indices.array[1]);
    L[2] = image.scanlinePtr(y_indices.array[2]);
    L[3] = image.scanlinePtr(y_indices.array[3]);

    static if (is(COLOR == L16))
    {
        static float clamp_0_to_65535(float a)
        {
            if (a < 0) a = 0;
            if (a > 65535) a = 65535;
            return a;
        }

        // Catmull-Rom interpolation between x1 and x2, with t in [0, 1].
        static cubicInterp(float t, float x0, float x1, float x2, float x3) pure nothrow @nogc
        {
            // PERF: doesn't sound that great???
            return x1
                 + t * ((-0.5f * x0) + (0.5f * x2))
                 + t * t * (x0 - (2.5f * x1) + (2.0f * x2) - (0.5f * x3))
                 + t * t * t * ((-0.5f * x0) + (1.5f * x1) - (1.5f * x2) + 0.5f * x3);
        }

        // Interpolate horizontally on each row, then vertically between the rows.
        float[4] R;
        for (int row = 0; row < 4; ++row)
        {
            COLOR* pRow = L[row];
            COLOR ri0jn = pRow[i0];
            COLOR ri1jn = pRow[i1];
            COLOR ri2jn = pRow[i2];
            COLOR ri3jn = pRow[i3];
            float A = ri0jn.l;
            float B = ri1jn.l;
            float C = ri2jn.l;
            float D = ri3jn.l;
            R[row] = cubicInterp(a, A, B, C, D);
        }
        return clamp_0_to_65535(cubicInterp(b, R[0], R[1], R[2], R[3]));
    }
    else
    {
        // actually optimized ok by LDC
        static vec4f clamp_0_to_65535(vec4f a)
        {
            if (a[0] < 0) a[0] = 0;
            if (a[1] < 0) a[1] = 0;
            if (a[2] < 0) a[2] = 0;
            if (a[3] < 0) a[3] = 0;
            if (a[0] > 65535) a[0] = 65535;
            if (a[1] > 65535) a[1] = 65535;
            if (a[2] > 65535) a[2] = 65535;
            if (a[3] > 65535) a[3] = 65535;
            return a;
        }

        // Catmull-Rom interpolation between x1 and x2, with t in [0, 1].
        static cubicInterp(float t, vec4f x0, vec4f x1, vec4f x2, vec4f x3) pure nothrow @nogc
        {
            // PERF: doesn't sound that great???
            return x1
                 + t * ((-0.5f * x0) + (0.5f * x2))
                 + t * t * (x0 - (2.5f * x1) + (2.0f * x2) - (0.5f * x3))
                 + t * t * t * ((-0.5f * x0) + (1.5f * x1) - (1.5f * x2) + 0.5f * x3);
        }

        // Interpolate horizontally on each row, then vertically between the rows.
        vec4f[4] R = void;
        for (int row = 0; row < 4; ++row)
        {
            COLOR* pRow = L[row];
            COLOR ri0jn = pRow[i0];
            COLOR ri1jn = pRow[i1];
            COLOR ri2jn = pRow[i2];
            COLOR ri3jn = pRow[i3];
            vec4f A = vec4f(ri0jn.r, ri0jn.g, ri0jn.b, ri0jn.a);
            vec4f B = vec4f(ri1jn.r, ri1jn.g, ri1jn.b, ri1jn.a);
            vec4f C = vec4f(ri2jn.r, ri2jn.g, ri2jn.b, ri2jn.a);
            vec4f D = vec4f(ri3jn.r, ri3jn.g, ri3jn.b, ri3jn.a);
            R[row] = cubicInterp(a, A, B, C, D);
        }
        return clamp_0_to_65535(cubicInterp(b, R[0], R[1], R[2], R[3]));
    }
}
276
277
/// Interpolates a color. Integer level, spatial linear (bilinear) interpolation.
/// x and y are in base level coordinates (top-left pixel is on (0.5, 0.5) coordinates).
/// Clamped to borders. `level` is clamped to the existing levels.
/// Returns: A `float` for `L16`, else a `vec4f` made of (r, g, b, a).
///          Channel values are not rescaled: they keep the range of the stored pixels.
auto linearSample(int level, float x, float y) nothrow @nogc
{
    if (level < 0)
        level = 0;
    int numLevels = cast(int)levels.length;
    if (level >= numLevels)
        level = numLevels - 1;

    OwnedImage!COLOR image = levels[level];

    // 2^-level, for levels 0 to 13.
    static immutable float[14] factors = [ 1.0f, 0.5f, 0.25f, 0.125f,
                                           0.0625f, 0.03125f, 0.015625f, 0.0078125f,
                                           0.00390625f, 0.001953125f, 0.0009765625f, 0.00048828125f,
                                           0.000244140625f, 0.0001220703125f];

    // Scale from base-level coordinates to coordinates of this level.
    // The table stops at level 13: deeper levels compute the (exact) power of two
    // rather than indexing past the end of the table.
    float divider = (level < cast(int)factors.length) ? factors[level]
                                                      : 1.0f / cast(float)(1 << level);
    x = x * divider - 0.5f;
    y = y * divider - 0.5f;

    // Clamp to the top and left borders.
    if (x < 0)
        x = 0;
    if (y < 0)
        y = 0;

    __m128 floatCoords = _mm_setr_ps(x, y, 0, 0);
    __m128i truncatedCoord = _mm_cvttps_epi32(floatCoords);
    int ix = truncatedCoord.array[0];
    int iy = truncatedCoord.array[1];

    // Get fractional part
    float fx = x - ix;
    float fy = y - iy;

    // Clamp to the bottom and right borders.
    const int maxX = image.w-1;
    const int maxY = image.h-1;
    if (ix > maxX)
        ix = maxX;
    if (iy > maxY)
        iy = maxY;

    int ixp1 = ix + 1;
    int iyp1 = iy + 1;
    if (ixp1 > maxX)
        ixp1 = maxX;
    if (iyp1 > maxY)
        iyp1 = maxY;

    float fxm1 = 1 - fx;
    float fym1 = 1 - fy;

    COLOR* L0 = image.scanlinePtr(iy);
    COLOR* L1 = image.scanlinePtr(iyp1);

    // The 2x2 neighbourhood:
    //   A B
    //   C D
    COLOR A = L0[ix];
    COLOR B = L0[ixp1];
    COLOR C = L1[ix];
    COLOR D = L1[ixp1];

    static if (is(COLOR == RGBA))
    {
        version(LDC)
        {
            // Reinterpret each 4-byte pixel as one 32-bit integer.
            int Ai = *cast(int*)(&A);
            int Bi = *cast(int*)(&B);
            int Ci = *cast(int*)(&C);
            int Di = *cast(int*)(&D);

            __m128i mmZero = _mm_setzero_si128();
            __m128i mmABCD = _mm_setr_epi32(Ai, Bi, Ci, Di);

            // Convert to float of the form (R, G, B, A)
            __m128i mmAB = _mm_unpacklo_epi8(mmABCD, mmZero);
            __m128i mmCD = _mm_unpackhi_epi8(mmABCD, mmZero);
            __m128 vA = _mm_cvtepi32_ps( _mm_unpacklo_epi16(mmAB, mmZero));
            __m128 vB = _mm_cvtepi32_ps( _mm_unpackhi_epi16(mmAB, mmZero));
            __m128 vC = _mm_cvtepi32_ps( _mm_unpacklo_epi16(mmCD, mmZero));
            __m128 vD = _mm_cvtepi32_ps( _mm_unpackhi_epi16(mmCD, mmZero));

            // Horizontal blend of both rows.
            __m128 vfx = _mm_set1_ps(fx);
            __m128 vfxm1 = _mm_set1_ps(fxm1);
            __m128 up = vA * vfxm1 + vB * vfx;
            __m128 down = vC * vfxm1 + vD * vfx;

            // Vertical blend, using the broadcast weights (same values as the scalars).
            __m128 vfy = _mm_set1_ps(fy);
            __m128 vfym1 = _mm_set1_ps(fym1);
            __m128 dResult = up * vfym1 + down * vfy;
            vec4f result = void;
            _mm_storeu_ps(result.ptr, dResult);
            return result;
        }
        else version( AsmX86 )
        {
            vec4f asmResult;

            asm nothrow @nogc
            {
                // Load the 4 pixels.
                movd XMM0, A;
                movd XMM1, B;
                movd XMM2, C;
                movd XMM3, D;
                pxor XMM4, XMM4;

                // Widen bytes to 16-bit...
                punpcklbw XMM0, XMM4;
                punpcklbw XMM1, XMM4;
                punpcklbw XMM2, XMM4;
                punpcklbw XMM3, XMM4;

                // ...then to 32-bit...
                punpcklwd XMM0, XMM4;
                punpcklwd XMM1, XMM4;
                punpcklwd XMM2, XMM4;
                punpcklwd XMM3, XMM4;

                // ...then to float.
                cvtdq2ps XMM0, XMM0;
                cvtdq2ps XMM1, XMM1;

                cvtdq2ps XMM2, XMM2;
                cvtdq2ps XMM3, XMM3;

                // Broadcast the horizontal weights.
                movss XMM4, fxm1;
                pshufd XMM4, XMM4, 0;
                movss XMM5, fx;
                pshufd XMM5, XMM5, 0;

                mulps XMM0, XMM4;
                mulps XMM1, XMM5;
                mulps XMM2, XMM4;
                mulps XMM3, XMM5;

                // Broadcast the vertical weights.
                movss XMM4, fym1;
                pshufd XMM4, XMM4, 0;
                movss XMM5, fy;
                pshufd XMM5, XMM5, 0;

                addps XMM0, XMM1; // up   = A * fxm1 + B * fx
                addps XMM2, XMM3; // down = C * fxm1 + D * fx

                mulps XMM0, XMM4;
                mulps XMM2, XMM5;

                addps XMM0, XMM2; // up * fym1 + down * fy

                movups asmResult, XMM0;
            }

            // Uncomment to check
            /*
            vec4f vA = vec4f(A.r, A.g, A.b, A.a);
            vec4f vB = vec4f(B.r, B.g, B.b, B.a);
            vec4f vC = vec4f(C.r, C.g, C.b, C.a);
            vec4f vD = vec4f(D.r, D.g, D.b, D.a);

            vec4f up = vA * fxm1 + vB * fx;
            vec4f down = vC * fxm1 + vD * fx;
            vec4f dResult = up * fym1 + down * fy;

            import gfm.core;

            if (dResult.distanceTo(result) < 1.0f)
                debugBreak();
            */

            vec4f result = asmResult;
            return result;
        }
        else
        {
            // Portable scalar fallback.
            vec4f vA = vec4f(A.r, A.g, A.b, A.a);
            vec4f vB = vec4f(B.r, B.g, B.b, B.a);
            vec4f vC = vec4f(C.r, C.g, C.b, C.a);
            vec4f vD = vec4f(D.r, D.g, D.b, D.a);

            vec4f up = vA * fxm1 + vB * fx;
            vec4f down = vC * fxm1 + vD * fx;
            vec4f dResult = up * fym1 + down * fy;

            // assert(dResult.distanceTo(asmResult) < 1.0f);

            return dResult;
        }
    }
    else static if (is(COLOR == L16))
    {
        float up = A.l * fxm1 + B.l * fx;
        float down = C.l * fxm1 + D.l * fx;
        return up * fym1 + down * fy;
    }
    else // RGBA16
    {
        vec4f vA = vec4f(A.r, A.g, A.b, A.a);
        vec4f vB = vec4f(B.r, B.g, B.b, B.a);
        vec4f vC = vec4f(C.r, C.g, C.b, C.a);
        vec4f vD = vec4f(D.r, D.g, D.b, D.a);

        vec4f up = vA * fxm1 + vB * fx;
        vec4f down = vC * fxm1 + vD * fx;
        vec4f result = up * fym1 + down * fy;
        return result;
    }
}
485
/// Returns: Width of level 0 (the base image). Requires at least one level.
int width() pure const nothrow @nogc
{
    const baseLevel = levels[0];
    return baseLevel.w;
}
491
/// Returns: Height of level 0 (the base image). Requires at least one level.
int height() pure const nothrow @nogc
{
    const baseLevel = levels[0];
    return baseLevel.h;
}
497
/// Returns: How many levels the pyramid holds. Valid level indices are 0 .. numLevels() - 1.
int numLevels() pure const nothrow @nogc
{
    immutable size_t count = levels.length;
    return cast(int) count;
}
503
/// Rebuilds every level above level 0, each one from the level just below it.
/// Params:
///     quality = filter to use. `Quality.box` is replaced by `Quality.cubic` from level 3 onwards.
void generateMipmaps(Quality quality) nothrow @nogc
{
    immutable int levelCount = numLevels();

    // Start with the whole base image marked as changed.
    box2i dirty = box2i(0, 0, width(), height());

    foreach (int level; 1 .. levelCount)
    {
        // HACK: Force cubic filter past a level else it makes ugly looking mipmaps
        if (quality == Quality.box && level >= 3)
            quality = Quality.cubic;

        // The area regenerated in this level is the changed area for the next one.
        dirty = generateNextLevel(quality, dirty, level);
    }
}
517
/// Regenerates the part of level `level` that is affected by a change in level `level - 1`.
/// Params:
///     quality = filter used for downsampling.
///     updateRectPreviousLevel = changed area, expressed in coordinates of level `level - 1`
///                               (which are level 0 coordinates only when `level` is 1).
///     level = index of the level to regenerate, must be > 0.
/// Returns: The regenerated area, in coordinates of level `level`, suitable as input
///          for regenerating the following level.
/// In general if you have several subparts of mipmaps to update, make sure a level is fully completed
/// before computing the next one.
box2i generateNextLevel(Quality quality, box2i updateRectPreviousLevel, int level) nothrow @nogc
{
    OwnedImage!COLOR source = levels[level - 1];
    box2i regenerated = impactOnNextLevel(quality, updateRectPreviousLevel, source.w, source.h);
    generateLevel(level, quality, regenerated);
    return regenerated;
}
529
/// Regenerates an area of one level from the level below it.
/// Params:
///     level = index of the level to regenerate, must be > 0.
///     quality = filter used; `boxAlphaCovIntoPremul` is only available for `RGBA`.
///     updateRect = area to regenerate, in coordinates of level `level`.
void generateLevel(int level, Quality quality, box2i updateRect) nothrow @nogc
{
    assert(level > 0);
    OwnedImage!COLOR dst = levels[level];
    OwnedImage!COLOR src = levels[level - 1];

    final switch(quality) with (Quality)
    {
        case box:

            static if (is(COLOR == RGBA))
                generateLevelBoxRGBA(dst, src, updateRect);
            else static if (is(COLOR == L16))
                generateLevelBoxL16(dst, src, updateRect);
            else static if (is(COLOR == RGBA16))
                generateLevelBoxRGBA16(dst, src, updateRect);
            else
                static assert(false, "not implemented");

            // Flip to true to verify the optimized box filters against the reference formula.
            enum checkBoxMipmaps = false;

            static if (checkBoxMipmaps)
            {
                for (int y = updateRect.min.y; y < updateRect.max.y; ++y)
                {
                    COLOR[] rowTop = src.scanline(y * 2);
                    COLOR[] rowBottom = src.scanline(y * 2 + 1);
                    COLOR[] rowOut = dst.scanline(y);

                    for (int x = updateRect.min.x; x < updateRect.max.x; ++x)
                    {
                        // A B
                        // C D
                        COLOR A = rowTop[2 * x];
                        COLOR B = rowTop[2 * x + 1];
                        COLOR C = rowBottom[2 * x];
                        COLOR D = rowBottom[2 * x + 1];
                        assert(rowOut[x] == COLOR.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D));
                    }
                }
            }
            break;

        case boxAlphaCovIntoPremul:

            static if (is(COLOR == RGBA))
            {
                generateLevelBoxAlphaCovIntoPremulRGBA(dst, src, updateRect);
                break;
            }
            else
                assert(false); // no implementation for this pixel type

        case cubic:
            static if (is(COLOR == RGBA))
            {
                generateLevelCubicRGBA(dst, src, updateRect);
                break;
            }
            else static if (is(COLOR == L16))
            {
                generateLevelCubicL16(dst, src, updateRect);
                break;
            }
            else static if (is(COLOR == RGBA16))
            {
                generateLevelCubicRGBA16(dst, src, updateRect);
                break;
            }
            else
                static assert(false, "not implemented");
    }
}
607
608
609 private:
/// Computes which area of the next level must be regenerated when `area` changed
/// in the current level.
/// Params:
///     quality = filter that will be used to generate the next level.
///     area = changed area, in current level coordinates.
///     currentLevelWidth = width of the current level.
///     currentLevelHeight = height of the current level.
/// Returns: The impacted area in next level coordinates, clipped to the next level
///          (whose size is half of the current one, rounded down).
static box2i impactOnNextLevel(Quality quality, box2i area, int currentLevelWidth, int currentLevelHeight) pure nothrow @nogc
{
    // How far the impacted area extends before halving, depending on the filter.
    int marginLow;
    int marginHigh;

    final switch(quality) with (Quality)
    {
        case box:
        case boxAlphaCovIntoPremul:
            marginLow = 0;
            marginHigh = 1;
            break;

        case cubic:
            marginLow = 1;
            marginHigh = 2;
            break;
    }

    box2i impacted = box2i((area.min.x - marginLow) / 2,
                           (area.min.y - marginLow) / 2,
                           (area.max.x + marginHigh) / 2,
                           (area.max.y + marginHigh) / 2);

    box2i nextLevelBounds = box2i(0, 0, currentLevelWidth / 2, currentLevelHeight / 2);
    return impacted.intersection(nextLevelBounds);
}
634 }
635
unittest
{
    // Power-of-two base image, with as many levels as requested.
    Mipmap!RGBA colorMipmap = new Mipmap!RGBA();
    colorMipmap.size(4, 256, 256);
    colorMipmap.destroy();

    // Non-power-of-two base image, with more levels requested than can exist.
    Mipmap!L16 depthMipmap = new Mipmap!L16();
    depthMipmap.size(16, 17, 333);
    depthMipmap.destroy();
}
646
647
648 private:
649
// 16-byte aligned constant tables, each the size of one SSE register (8 shorts,
// 4 ints or 4 floats).
// NOTE(review): none of the functions visible in this file reads them; in particular
// generateLevelCubicRGBA declares its own local `xmm11113333` and `xmm33339999`.
// Presumably leftovers of earlier assembly versions -- confirm before removing.
align(16) static immutable short[8] xmmTwoShort = [ 2, 2, 2, 2, 2, 2, 2, 2 ];
align(16) static immutable int[4] xmmTwoInt = [ 2, 2, 2, 2 ];
align(16) static immutable float[4] xmm0_5 = [ 0.5f, 0.5f, 0.5f, 0.5f ];
align(16) static immutable int[4] xmm512 = [ 512, 512, 512, 512 ];
align(16) static immutable short[8] xmm11113333 = [ 1, 1, 1, 1, 3, 3, 3, 3 ];
align(16) static immutable short[8] xmm33331111 = [ 3, 3, 3, 3, 1, 1, 1, 1 ];
align(16) static immutable short[8] xmm33339999 = [ 3, 3, 3, 3, 9, 9, 9, 9 ];
align(16) static immutable short[8] xmm99993333 = [ 9, 9, 9, 9, 3, 3, 3, 3 ];
align(16) static immutable short[8] xmm32 = [ 32, 32, 32, 32, 32, 32, 32, 32 ];
659
660
/// Box-filters an area of an RGBA level out of the previous (twice larger) level.
/// Each destination pixel is the rounded mean `(A + B + C + D + 2) >> 2`, per channel,
/// of a 2x2 block of source pixels.
/// Params:
///     thisLevel = destination level.
///     previousLevel = source level.
///     updateRect = area to regenerate, in `thisLevel` coordinates.
void generateLevelBoxRGBA(OwnedImage!RGBA thisLevel,
                          OwnedImage!RGBA previousLevel,
                          box2i updateRect) pure nothrow @nogc
{
    int width = updateRect.width();
    int height = updateRect.height();

    for (int y = 0; y < height; ++y)
    {
        // The two source rows, and the destination row, all offset to the left edge of the area.
        RGBA* L0 = previousLevel.scanlinePtr( (updateRect.min.y + y) * 2 ) + updateRect.min.x * 2;
        RGBA* L1 = previousLevel.scanlinePtr( (updateRect.min.y + y) * 2 + 1) + updateRect.min.x * 2;
        RGBA* dest = thisLevel.scanlinePtr( updateRect.min.y + y) + updateRect.min.x;



        // PERF: enable later, this is faster on a full mipmap even without AVX2
        /// Requires a somewhat recent intel-intrinsics though
        /+
        int x = 0;
        __m256i zero = _mm256_setzero_si256();
        __m256i two = _mm256_set1_epi16(2);
        for ( ; x + 3 < width; x += 4)
        {
            // pixel patches:
            // A B E F Goal = (A + B + C + D + 2) / 4 => res
            // C D G H (E + F + G + H + 2) / 4 => res+1
            //
            __m256i ABEF = _mm256_loadu_si256(cast(const(__m256i)*) &L0[2*x]);
            __m256i CDGH = _mm256_loadu_si256(cast(const(__m256i)*) &L1[2*x]);
            __m256i AB = _mm256_unpacklo_epi8(ABEF, zero);
            __m256i EF = _mm256_unpackhi_epi8(ABEF, zero);
            __m256i CD = _mm256_unpacklo_epi8(CDGH, zero);
            __m256i GH = _mm256_unpackhi_epi8(CDGH, zero);
            AB = _mm256_add_epi16(AB, CD); // A + C B + D
            EF = _mm256_add_epi16(EF, GH); // E + G F + H
            __m256i AC_EG = _mm256_unpacklo_epi64(AB, EF); // A+C E+G
            __m256i BD_FH = _mm256_unpackhi_epi64(AB, EF); // B+D F+H
            __m256i sum = _mm256_add_epi16(AC_EG, BD_FH); // A+B+C+D E+F+G+H
            sum = _mm256_add_epi16(sum, two); // A+B+C+D+2 E+F+G+H+2
            sum = _mm256_srai_epi16(sum, 2); // (A+B+C+D+2)/4 (E+F+G+H+2)/4
            __m256i finalPixels = _mm256_packus_epi16(sum, zero);

            __m128i f_lo = _mm256_extractf128_si256!0(finalPixels);
            __m128i f_hi = _mm256_extractf128_si256!1(finalPixels);
            _mm_storeu_si64(&dest[x], f_lo); // PERF Would need a vpermute here. In each lane, only the low 8 bytes are interesting.
            _mm_storeu_si64(&dest[x+2], f_hi);
        }

        +/

        // SSE path: two destination pixels (four source pixels per row) per iteration.
        __m128i zero = _mm_setzero_si128();
        __m128i two = _mm_set1_epi16(2);
        int x = 0;
        for ( ; x + 1 < width; x += 2)
        {
            // pixel patches:
            // A B E F Goal = (A + B + C + D + 2) / 4 => res
            // C D G H (E + F + G + H + 2) / 4 => res+1
            //
            __m128i ABEF = _mm_loadu_si128(cast(const(__m128i)*) &L0[2*x]);
            __m128i CDGH = _mm_loadu_si128(cast(const(__m128i)*) &L1[2*x]);
            // Widen the 8-bit channels to 16-bit so that the sums cannot overflow.
            __m128i AB = _mm_unpacklo_epi8(ABEF, zero);
            __m128i EF = _mm_unpackhi_epi8(ABEF, zero);
            __m128i CD = _mm_unpacklo_epi8(CDGH, zero);
            __m128i GH = _mm_unpackhi_epi8(CDGH, zero);
            AB = _mm_add_epi16(AB, CD); // A + C B + D
            EF = _mm_add_epi16(EF, GH); // E + G F + H
            __m128i AC_EG = _mm_unpacklo_epi64(AB, EF); // A+C E+G
            __m128i BD_FH = _mm_unpackhi_epi64(AB, EF); // B+D F+H
            __m128i sum = _mm_add_epi16(AC_EG, BD_FH); // A+B+C+D E+F+G+H
            sum = _mm_add_epi16(sum, two); // A+B+C+D+2 E+F+G+H+2
            sum = _mm_srai_epi16(sum, 2); // (A+B+C+D+2)/4 (E+F+G+H+2)/4
            // Narrow back to 8-bit channels; the two result pixels are in the low 8 bytes.
            __m128i finalPixels = _mm_packus_epi16(sum, zero);
            _mm_storeu_si64(&dest[x], finalPixels);
        }

        // Scalar path for the last pixel of the row, when the width is odd.
        for (; x < width; ++x)
        {
            RGBA A = L0[2 * x];
            RGBA B = L0[2 * x + 1];
            RGBA C = L1[2 * x];
            RGBA D = L1[2 * x + 1];
            dest[x] = RGBA.op!q{(a + b + c + d + 2) >> 2}(A, B, C, D);
        }
    }
}
748
/// Box-filters an area of an L16 level out of the previous (twice larger) level.
/// Each destination pixel is the rounded mean `(A + B + C + D + 2) >> 2` of a 2x2 block
/// of source pixels.
/// Params:
///     thisLevel = destination level.
///     previousLevel = source level.
///     updateRect = area to regenerate, in `thisLevel` coordinates.
void generateLevelBoxL16(OwnedImage!L16 thisLevel,
                         OwnedImage!L16 previousLevel,
                         box2i updateRect) pure nothrow @nogc
{
    immutable int cols = updateRect.width();
    immutable int rows = updateRect.height();

    foreach (int row; 0 .. rows)
    {
        immutable int dstY = updateRect.min.y + row;

        // Source rows and destination row, all starting at the left edge of the area.
        L16* srcTop = previousLevel.scanlinePtr(dstY * 2) + updateRect.min.x * 2;
        L16* srcBottom = previousLevel.scanlinePtr(dstY * 2 + 1) + updateRect.min.x * 2;
        L16* dstRow = thisLevel.scanlinePtr(dstY) + updateRect.min.x;

        // Fun performance fact: for this loop (LDC 1.33, arch x86_64), assembly is slower than intrinsics,
        // themselves slower than normal D code.
        foreach (int col; 0 .. cols)
        {
            immutable int srcCol = 2 * col;

            // topLeft    topRight
            // bottomLeft bottomRight
            L16 topLeft = srcTop[srcCol];
            L16 topRight = srcTop[srcCol + 1];
            L16 bottomLeft = srcBottom[srcCol];
            L16 bottomRight = srcBottom[srcCol + 1];

            dstRow[col] = L16.op!q{(a + b + c + d + 2) >> 2}(topLeft, topRight, bottomLeft, bottomRight);
        }
    }
}
780
/// Box-filters an area of an RGBA16 level out of the previous (twice larger) level.
/// Each destination pixel is the rounded mean `(A + B + C + D + 2) >> 2`, per channel,
/// of a 2x2 block of source pixels.
/// Params:
///     thisLevel = destination level.
///     previousLevel = source level.
///     updateRect = area to regenerate, in `thisLevel` coordinates.
void generateLevelBoxRGBA16(OwnedImage!RGBA16 thisLevel,
                            OwnedImage!RGBA16 previousLevel,
                            box2i updateRect) pure nothrow @nogc
{
    // untested and unused for now
    immutable int cols = updateRect.width();
    immutable int rows = updateRect.height();

    foreach (int row; 0 .. rows)
    {
        immutable int dstY = updateRect.min.y + row;

        // Source rows and destination row, all starting at the left edge of the area.
        RGBA16* srcTop = previousLevel.scanlinePtr(dstY * 2) + updateRect.min.x * 2;
        RGBA16* srcBottom = previousLevel.scanlinePtr(dstY * 2 + 1) + updateRect.min.x * 2;
        RGBA16* dstRow = thisLevel.scanlinePtr(dstY) + updateRect.min.x;

        foreach (int col; 0 .. cols)
        {
            immutable int srcCol = 2 * col;

            // topLeft    topRight
            // bottomLeft bottomRight
            RGBA16 topLeft = srcTop[srcCol];
            RGBA16 topRight = srcTop[srcCol + 1];
            RGBA16 bottomLeft = srcBottom[srcCol];
            RGBA16 bottomRight = srcBottom[srcCol + 1];

            dstRow[col] = RGBA16.op!q{(a + b + c + d + 2) >> 2}(topLeft, topRight, bottomLeft, bottomRight);
        }
    }
}
807
/// Box-filters an area of an RGBA level out of the previous (twice larger) level,
/// weighting every source channel (alpha included) by the alpha of its pixel, so that
/// the destination holds alpha-premultiplied values.
/// Within version(futurePBREmissive), the source color is additionally squared
/// (an approximate gamma-to-linear conversion) before being averaged.
/// Params:
///     thisLevel = destination level.
///     previousLevel = source level.
///     updateRect = area to regenerate, in `thisLevel` coordinates.
void generateLevelBoxAlphaCovIntoPremulRGBA(OwnedImage!RGBA thisLevel,
                                            OwnedImage!RGBA previousLevel,
                                            box2i updateRect) nothrow @nogc
{
    immutable int cols = updateRect.width();
    immutable int rows = updateRect.height();

    version(futurePBREmissive)
    {
        // This is only approximate, does a pow2
        static RGBAf convert_gammaspace_to_linear_premul (RGBA col)
        {
            RGBAf res;
            enum float inv_255 = 1.0f / 255;
            res.a = col.a * inv_255; // alpha is linear
            res.r = col.r * inv_255 *col.r * inv_255* res.a;
            res.g = col.g * inv_255 *col.g * inv_255* res.a;
            res.b = col.b * inv_255 *col.b * inv_255* res.a;
            return res;
        }
    }
    else
    {
        // Rounded division by 1024, that is 4 pixels x 256 (approximating the
        // division by 255 of the alpha weight).
        static ubyte roundedDiv1024(int weightedSum) pure nothrow @nogc
        {
            return cast(ubyte)((weightedSum + 512) >> 10);
        }
    }

    foreach (int row; 0 .. rows)
    {
        immutable int dstY = updateRect.min.y + row;

        // Source rows and destination row, all starting at the left edge of the area.
        RGBA* srcTop = previousLevel.scanlinePtr(dstY * 2) + updateRect.min.x * 2;
        RGBA* srcBottom = previousLevel.scanlinePtr(dstY * 2 + 1) + updateRect.min.x * 2;
        RGBA* dstRow = thisLevel.scanlinePtr(dstY) + updateRect.min.x;

        foreach (int col; 0 .. cols)
        {
            // A B
            // C D
            RGBA A = srcTop[2 * col];
            RGBA B = srcTop[2 * col + 1];
            RGBA C = srcBottom[2 * col];
            RGBA D = srcBottom[2 * col + 1];

            version(futurePBREmissive)
            {
                // Note: basically very hard to beat with intrinsics.
                // Hours lost trying to do that: 4.
                // Neither float or integer intrinsics shenanigans do better than this plain code.
                RGBAf A_linear = convert_gammaspace_to_linear_premul(A);
                RGBAf B_linear = convert_gammaspace_to_linear_premul(B);
                RGBAf C_linear = convert_gammaspace_to_linear_premul(C);
                RGBAf D_linear = convert_gammaspace_to_linear_premul(D);

                float meanR = A_linear.r + B_linear.r + C_linear.r + D_linear.r;
                float meanG = A_linear.g + B_linear.g + C_linear.g + D_linear.g;
                float meanB = A_linear.b + B_linear.b + C_linear.b + D_linear.b;
                float meanA = A_linear.a + B_linear.a + C_linear.a + D_linear.a;

                dstRow[col] = RGBA( cast(ubyte)(meanR * 0.25f * 255.0f + 0.5f),
                                    cast(ubyte)(meanG * 0.25f * 255.0f + 0.5f),
                                    cast(ubyte)(meanB * 0.25f * 255.0f + 0.5f),
                                    cast(ubyte)(meanA * 0.25f * 255.0f + 0.5f) );
            }
            else
            {
                // Alpha-weighted sums of the 4 pixels, for each channel.
                int red   = (A.r * A.a + B.r * B.a + C.r * C.a + D.r * D.a);
                int green = (A.g * A.a + B.g * B.a + C.g * C.a + D.g * D.a);
                int blue  = (A.b * A.a + B.b * B.a + C.b * C.a + D.b * D.a);
                int alpha = (A.a * A.a + B.a * B.a + C.a * C.a + D.a * D.a);

                dstRow[col] = RGBA( roundedDiv1024(red),
                                    roundedDiv1024(green),
                                    roundedDiv1024(blue),
                                    roundedDiv1024(alpha) );
            }
        }
    }
}
885
/// Filters an area of an RGBA level out of the previous (twice larger) level, using
/// the 4x4 kernel [1 3 3 1] x [1 3 3 1] / 64 on each channel.
/// The outer ring of the 4x4 footprint is clamped to the borders of the source level.
/// Params:
///     thisLevel = destination level.
///     previousLevel = source level.
///     updateRect = area to regenerate, in `thisLevel` coordinates.
void generateLevelCubicRGBA(OwnedImage!RGBA thisLevel,
                            OwnedImage!RGBA previousLevel,
                            box2i updateRect) nothrow @nogc
{
    for (int y = updateRect.min.y; y < updateRect.max.y; ++y)
    {
        // Rows 2y-1 and 2y+2 may fall outside of the source: clamp them.
        int y2m1 = 2 * y - 1;
        if (y2m1 < 0)
            y2m1 = 0;

        int y2p2 = 2 * y + 2;
        if (y2p2 > previousLevel.h - 1)
            y2p2 = previousLevel.h - 1;

        RGBA* LM1 = previousLevel.scanlinePtr(y2m1);
        RGBA* L0 = previousLevel.scanlinePtr(y * 2);
        RGBA* L1 = previousLevel.scanlinePtr(y * 2 + 1);
        RGBA* L2 = previousLevel.scanlinePtr(y2p2);
        RGBA* dest = thisLevel.scanlinePtr(y);

        for (int x = updateRect.min.x; x < updateRect.max.x; ++x)
        {
            // A B C D
            // E F G H
            // I J K L
            // M N O P

            // Columns 2x-1 and 2x+2 may fall outside of the source: clamp them.
            int x2m1 = 2 * x - 1;
            if (x2m1 < 0)
                x2m1 = 0;
            int x2p0 = 2 * x;
            int x2p2 = 2 * x + 2;
            if (x2p2 > previousLevel.w - 1)
                x2p2 = previousLevel.w - 1;

            // SIMD version enabled; the `else` branch is the equivalent scalar code.
            static if (true)
            {
                // Gather the 16 pixels in an aligned buffer, one 16-byte row at a time.
                align(16) RGBA[16] buf = void;
                buf[0] = LM1[x2m1];
                buf[1] = LM1[x2p0];
                buf[2] = LM1[x2p0+1];
                buf[3] = LM1[x2p2];
                buf[4] = L0[x2m1];
                buf[5] = L0[x2p0];
                buf[6] = L0[x2p0+1];
                buf[7] = L0[x2p2];
                buf[8] = L1[x2m1];
                buf[9] = L1[x2p0];
                buf[10] = L1[x2p0+1];
                buf[11] = L1[x2p2];
                buf[12] = L2[x2m1];
                buf[13] = L2[x2p0];
                buf[14] = L2[x2p0+1];
                buf[15] = L2[x2p2];
                RGBA* pDest = dest + x;

                const __m128i mmZero = _mm_setzero_si128();

                // Note: no coefficients improvements really convince.
                // This was Issue #827, read for more context.

                // Per-pixel weights, one 16-bit lane per channel (these locals shadow
                // the module-level tables of the same name).
                const __m128i xmm11113333 = _mm_setr_epi16(1, 1, 1, 1, 3, 3, 3, 3);
                const __m128i xmm33339999 = _mm_setr_epi16(3, 3, 3, 3, 9, 9, 9, 9);

                __m128i ABCD = _mm_load_si128(cast(const(__m128i*)) &buf[0]);
                __m128i EFGH = _mm_load_si128(cast(const(__m128i*)) &buf[4]);
                __m128i IJKL = _mm_load_si128(cast(const(__m128i*)) &buf[8]);
                __m128i MNOP = _mm_load_si128(cast(const(__m128i*)) &buf[12]);

                // Widen the 8-bit channels to 16-bit, two pixels per register.
                __m128i AB = _mm_unpacklo_epi8(ABCD, mmZero);
                __m128i CD = _mm_unpackhi_epi8(ABCD, mmZero);
                __m128i EF = _mm_unpacklo_epi8(EFGH, mmZero);
                __m128i GH = _mm_unpackhi_epi8(EFGH, mmZero);
                __m128i IJ = _mm_unpacklo_epi8(IJKL, mmZero);
                __m128i KL = _mm_unpackhi_epi8(IJKL, mmZero);
                __m128i MN = _mm_unpacklo_epi8(MNOP, mmZero);
                __m128i OP = _mm_unpackhi_epi8(MNOP, mmZero);

                // This avoid a few multiplications
                // (rows with the same weights are added together: first with last,
                //  and both middle rows)
                AB = _mm_add_epi16(AB, MN);
                CD = _mm_add_epi16(CD, OP);
                EF = _mm_add_epi16(EF, IJ);
                GH = _mm_add_epi16(GH, KL);

                // Wrap a bit more, avoids two muls
                // (columns with the same weights are added together: AB = (A+D+M+P, B+C+N+O),
                //  EF = (E+H+I+L, F+G+J+K))
                AB = _mm_add_epi16(AB, _mm_shuffle_epi32!0x4e(CD)); // invert quadwords
                EF = _mm_add_epi16(EF, _mm_shuffle_epi32!0x4e(GH)); // invert quadwords

                // PERF: we can win a few mul here
                __m128i sum01 = _mm_mullo_epi16(AB, xmm11113333);
                sum01 = _mm_add_epi16(sum01, _mm_mullo_epi16(EF, xmm33339999));
                // Add the high 8 bytes to the low ones: the low 4 lanes hold the weighted sums.
                sum01 = _mm_add_epi16(sum01, _mm_srli_si128!8(sum01));

                // Rounded division by 64, the sum of all weights.
                __m128i sum = sum01;
                sum = _mm_add_epi16(sum, _mm_set1_epi16(32));
                sum = _mm_srli_epi16(sum, 6);
                __m128i finalPixels = _mm_packus_epi16(sum, mmZero);
                _mm_storeu_si32(pDest, finalPixels);
            }
            else
            {
                RGBA A = LM1[x2m1];
                RGBA B = LM1[x2p0];
                RGBA C = LM1[x2p0+1];
                RGBA D = LM1[x2p2];

                RGBA E = L0[x2m1];
                RGBA F = L0[x2p0];
                RGBA G = L0[x2p0+1];
                RGBA H = L0[x2p2];

                RGBA I = L1[x2m1];
                RGBA J = L1[x2p0];
                RGBA K = L1[x2p0+1];
                RGBA L = L1[x2p2];

                RGBA M = L2[x2m1];
                RGBA N = L2[x2p0];
                RGBA O = L2[x2p0+1];
                RGBA P = L2[x2p2];

                // Apply filter
                // 1 3 3 1
                // 3 9 9 3 / 64
                // 3 9 9 3
                // 1 3 3 1

                int rSum = (A.r + D.r + M.r + P.r) + 3 * (B.r + C.r + E.r + H.r + I.r + L.r + N.r + O.r) + 9 * (F.r + G.r + J.r + K.r);
                int gSum = (A.g + D.g + M.g + P.g) + 3 * (B.g + C.g + E.g + H.g + I.g + L.g + N.g + O.g) + 9 * (F.g + G.g + J.g + K.g);
                int bSum = (A.b + D.b + M.b + P.b) + 3 * (B.b + C.b + E.b + H.b + I.b + L.b + N.b + O.b) + 9 * (F.b + G.b + J.b + K.b);
                int aSum = (A.a + D.a + M.a + P.a) + 3 * (B.a + C.a + E.a + H.a + I.a + L.a + N.a + O.a) + 9 * (F.a + G.a + J.a + K.a);
                dest[x].r = cast(ubyte)((rSum + 32) >> 6);
                dest[x].g = cast(ubyte)((gSum + 32) >> 6);
                dest[x].b = cast(ubyte)((bSum + 32) >> 6);
                dest[x].a = cast(ubyte)((aSum + 32) >> 6);
            }
        }
    }
}
1025
/// Filters an area of an L16 level out of the previous (twice larger) level, using
/// the 4x4 kernel [1 3 3 1] x [1 3 3 1] / 64.
/// The outer ring of the 4x4 footprint is clamped to the borders of the source level.
/// Params:
///     thisLevel = destination level.
///     previousLevel = source level.
///     updateRect = area to regenerate, in `thisLevel` coordinates.
void generateLevelCubicL16(OwnedImage!L16 thisLevel,
                           OwnedImage!L16 previousLevel,
                           box2i updateRect) nothrow @nogc
{
    // Horizontal pass of the separable kernel: [1 3 3 1] applied on four pixels of a row.
    static int rowSum1331(L16* row, int c0, int c1, int c2, int c3) pure nothrow @nogc
    {
        return (row[c0].l + row[c3].l) + 3 * (row[c1].l + row[c2].l);
    }

    for (int y = updateRect.min.y; y < updateRect.max.y; ++y)
    {
        // Source rows 2y-1, 2y, 2y+1, 2y+2: the first and last ones are clamped to the image.
        immutable int rowAbove = (2 * y - 1 < 0) ? 0 : 2 * y - 1;
        immutable int rowBelow = (2 * y + 2 > previousLevel.h - 1) ? previousLevel.h - 1 : 2 * y + 2;

        L16* srcAbove = previousLevel.scanlinePtr(rowAbove);
        L16* srcTop = previousLevel.scanlinePtr(y * 2);
        L16* srcBottom = previousLevel.scanlinePtr(y * 2 + 1);
        L16* srcBelow = previousLevel.scanlinePtr(rowBelow);
        L16* dstRow = thisLevel.scanlinePtr(y);

        for (int x = updateRect.min.x; x < updateRect.max.x; ++x)
        {
            // Source columns 2x-1, 2x, 2x+1, 2x+2: the first and last ones are clamped to the image.
            immutable int c1 = 2 * x;
            immutable int c2 = c1 + 1;
            immutable int c0 = (c1 - 1 < 0) ? 0 : c1 - 1;
            immutable int c3 = (c1 + 2 > previousLevel.w - 1) ? previousLevel.w - 1 : c1 + 2;

            // Apply filter
            // 1 3 3 1
            // 3 9 9 3
            // 3 9 9 3
            // 1 3 3 1
            // Vertical pass: weights [1 3 3 1] on the four row sums.
            int depthSum = rowSum1331(srcAbove, c0, c1, c2, c3)
                         + 3 * (rowSum1331(srcTop, c0, c1, c2, c3) + rowSum1331(srcBottom, c0, c1, c2, c3))
                         + rowSum1331(srcBelow, c0, c1, c2, c3);

            // Rounded division by 64, the sum of all weights.
            dstRow[x].l = cast(ushort)((depthSum + 32) >> 6 );
        }
    }
}
1094
/// Filters an area of an RGBA16 level out of the previous (twice larger) level, using
/// the 4x4 kernel [1 3 3 1] x [1 3 3 1] / 64 on each channel.
/// The outer ring of the 4x4 footprint is clamped to the borders of the source level.
/// Params:
///     thisLevel = destination level.
///     previousLevel = source level.
///     updateRect = area to regenerate, in `thisLevel` coordinates.
void generateLevelCubicRGBA16(OwnedImage!RGBA16 thisLevel,
                              OwnedImage!RGBA16 previousLevel,
                              box2i updateRect) nothrow @nogc
{
    // untested and unused for now

    // Horizontal pass of the separable kernel: [1 3 3 1] applied on four pixels of a row.
    // Returns the sums for the r, g, b and a channels, in this order.
    static int[4] rowSum1331(RGBA16* row, int c0, int c1, int c2, int c3) pure nothrow @nogc
    {
        int[4] sums;
        sums[0] = (row[c0].r + row[c3].r) + 3 * (row[c1].r + row[c2].r);
        sums[1] = (row[c0].g + row[c3].g) + 3 * (row[c1].g + row[c2].g);
        sums[2] = (row[c0].b + row[c3].b) + 3 * (row[c1].b + row[c2].b);
        sums[3] = (row[c0].a + row[c3].a) + 3 * (row[c1].a + row[c2].a);
        return sums;
    }

    for (int y = updateRect.min.y; y < updateRect.max.y; ++y)
    {
        // Source rows 2y-1, 2y, 2y+1, 2y+2: the first and last ones are clamped to the image.
        immutable int rowAbove = (2 * y - 1 < 0) ? 0 : 2 * y - 1;
        immutable int rowBelow = (2 * y + 2 > previousLevel.h - 1) ? previousLevel.h - 1 : 2 * y + 2;

        RGBA16* srcAbove = previousLevel.scanlinePtr(rowAbove);
        RGBA16* srcTop = previousLevel.scanlinePtr(y * 2);
        RGBA16* srcBottom = previousLevel.scanlinePtr(y * 2 + 1);
        RGBA16* srcBelow = previousLevel.scanlinePtr(rowBelow);
        RGBA16* dstRow = thisLevel.scanlinePtr(y);

        for (int x = updateRect.min.x; x < updateRect.max.x; ++x)
        {
            // Source columns 2x-1, 2x, 2x+1, 2x+2: the first and last ones are clamped to the image.
            immutable int c1 = 2 * x;
            immutable int c2 = c1 + 1;
            immutable int c0 = (c1 - 1 < 0) ? 0 : c1 - 1;
            immutable int c3 = (c1 + 2 > previousLevel.w - 1) ? previousLevel.w - 1 : c1 + 2;

            int[4] above = rowSum1331(srcAbove, c0, c1, c2, c3);
            int[4] top = rowSum1331(srcTop, c0, c1, c2, c3);
            int[4] bottom = rowSum1331(srcBottom, c0, c1, c2, c3);
            int[4] below = rowSum1331(srcBelow, c0, c1, c2, c3);

            // Apply filter
            // 1 3 3 1
            // 3 9 9 3
            // 3 9 9 3
            // 1 3 3 1
            // Vertical pass: weights [1 3 3 1] on the four row sums, for each channel.
            int[4] total;
            foreach (ch; 0 .. 4)
                total[ch] = above[ch] + 3 * (top[ch] + bottom[ch]) + below[ch];

            // Rounded division by 64, the sum of all weights.
            dstRow[x].r = cast(ushort)((total[0] + 32) >> 6);
            dstRow[x].g = cast(ushort)((total[1] + 32) >> 6);
            dstRow[x].b = cast(ushort)((total[2] + 32) >> 6);
            dstRow[x].a = cast(ushort)((total[3] + 32) >> 6);
        }
    }
}
1168
unittest
{
    // Compile-time check only: both instantiations must compile.
    // The references are left null, nothing is allocated or run.
    Mipmap!RGBA colorMipmap;
    Mipmap!L16 depthMipmap;
}