1 /**
2 * Original fixed-function PBR rendering in Dplug.
3 * For compatibility purpose.
4 *
5 * Copyright: Copyright Auburn Sounds 2015-2019.
6 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 */
8 module dplug.gui.legacypbr;
9 
10 
11 import core.stdc.stdio;
12 import std.math;
13 
14 import dplug.math.vector;
15 import dplug.math.box;
16 import dplug.math.matrix;
17 
18 import dplug.core.vec;
19 import dplug.core.nogc;
20 import dplug.core.math;
21 import dplug.core.thread;
22 
23 import dplug.gui.compositor;
24 
25 import dplug.graphics;
26 import dplug.window.window;
27 
28 import dplug.gui.ransac;
29 
30 import inteli.math;
31 import inteli.smmintrin;
32 import dplug.gui.profiler;
33 
// FUTURE: introduce a tonemap operator that doesn't break existing things and only "adds" to the final render.
// TODO: PBR rendering doesn't properly depend on the size of the plugin.
//       The #RESIZE tag below marks all areas that need updating.
37 
38 
/// Data exchanged between the passes of the legacy PBR compositor.
///
/// When inheriting from `MultipassCompositor`, you can define what the passes exchange
/// between each other. However, the first field has to be a `CompositorPassBuffers`.
struct PBRCompositorPassBuffers
{
    // First field must be `CompositorPassBuffers` for ABI compatibility of `MultipassCompositor`.
    // Passes receive a `CompositorPassBuffers*` and cast it back to `PBRCompositorPassBuffers*`,
    // so this field must stay first.
    CompositorPassBuffers parent;
    alias parent this;

    // Computed normals, one buffer per thread (indexed by thread index).
    OwnedImage!RGBf[] normalBuffers;

    // Accumulates light for each deferred pass, one buffer per thread.
    OwnedImage!RGBAf[] accumBuffers;

    // Approximation of normal variance, one buffer per thread.
    OwnedImage!L32f[] varianceBuffers;
}
56 
57 
/// Equivalence factor between Z samples and pixels.
/// Tuned once by hand to match the other normal computation algorithm.
/// This affects virtual geometry, and as such: normals and raymarching into depth.
/// In this file, the normal pass multiplies raw 16-bit depth samples by `1 / FACTOR_Z`
/// before fitting a plane.
/// Future: this should be modifiable in order to have more Z range in plugins (more 3D).
/// Bug: resizing should affect this factor.
enum float FACTOR_Z = 4655.0f; // #RESIZE: this factor depends on DPI
64 
/// Legacy compositor.
/// The Dplug compositor used to be fixed-function; this class reproduces that
/// pipeline as a fixed sequence of passes sharing per-thread scratch buffers.
class PBRCompositor : MultipassCompositor
{
nothrow @nogc:


    // <LEGACY> parameters, exposed as write-only properties for compatibility.
    // New code is supposed to tweak settings on the passes when creating them.

    /// Sets the color of the oblique shadow light pass.
    void light1Color(vec3f color)
    {
        auto shadowLight = cast(PassObliqueShadowLight) getPass(PASS_OBLIQUE_SHADOW);
        shadowLight.color = color;
    }

    /// Sets the direction of the directional light pass.
    void light2Dir(vec3f dir)
    {
        auto directional = cast(PassDirectionalLight) getPass(PASS_DIRECTIONAL);
        directional.direction = dir;
    }

    /// Sets the color of the directional light pass.
    void light2Color(vec3f color)
    {
        auto directional = cast(PassDirectionalLight) getPass(PASS_DIRECTIONAL);
        directional.color = color;
    }

    /// Sets the direction of the specular light pass.
    void light3Dir(vec3f dir)
    {
        auto specular = cast(PassSpecularLight) getPass(PASS_SPECULAR);
        specular.direction = dir;
    }

    /// Sets the color of the specular light pass.
    void light3Color(vec3f color)
    {
        auto specular = cast(PassSpecularLight) getPass(PASS_SPECULAR);
        specular.color = color;
    }

    /// Sets the amount of the skybox reflections pass.
    void skyboxAmount(float amount)
    {
        auto skybox = cast(PassSkyboxReflections) getPass(PASS_SKYBOX);
        skybox.amount = amount;
    }

    /// Sets the amount of the ambient occlusion pass.
    void ambientLight(float amount)
    {
        auto ambient = cast(PassAmbientOcclusion) getPass(PASS_AO);
        ambient.amount = amount;
    }

    version(futurePBREmissive)
    {
        /// Sets the tonemap threshold of the final clamp/convert pass.
        void tonemapThreshold(float value)
        {
            auto clampPass = cast(PassClampAndConvertTo8bit) getPass(PASS_CLAMP);
            clampPass.tonemapThreshold = value;
        }

        /// Sets the tonemap ratio of the final clamp/convert pass.
        void tonemapRatio(float value)
        {
            auto clampPass = cast(PassClampAndConvertTo8bit) getPass(PASS_CLAMP);
            clampPass.tonemapRatio = value;
        }
    }

    // </LEGACY>



    // Indices of the passes, in the order the constructor adds them.
    // MUST be kept in sync with the `addPass` calls below; the legacy
    // properties above look passes up through these indices.
    private enum
    {
        PASS_NORMAL      = 0,
        PASS_AO          = 1,
        PASS_OBLIQUE_SHADOW = 2,
        PASS_DIRECTIONAL = 3,
        PASS_SPECULAR    = 4,
        PASS_SKYBOX      = 5,
        PASS_EMISSIVE    = 6,
        PASS_CLAMP       = 7
    }

    /// Allocates one (still unsized) normal/accumulation/variance image per
    /// thread, then registers the eight passes in their fixed order.
    this(CompositorCreationContext* context)
    {
        super(context);

        const int threadCount = numThreads();

        _normalBuffers = mallocSlice!(OwnedImage!RGBf)(threadCount);
        _accumBuffers = mallocSlice!(OwnedImage!RGBAf)(threadCount);
        _varianceBuffers = mallocSlice!(OwnedImage!L32f)(threadCount);

        foreach (t; 0 .. threadCount)
        {
            _normalBuffers[t] = mallocNew!(OwnedImage!RGBf)();
            _accumBuffers[t] = mallocNew!(OwnedImage!RGBAf)();
            _varianceBuffers[t] = mallocNew!(OwnedImage!L32f)();
        }

        // Pass creation order defines the PASS_xxx indices.
        addPass( mallocNew!PassComputeNormal(this) );         // PASS_NORMAL
        addPass( mallocNew!PassAmbientOcclusion(this) );      // PASS_AO
        addPass( mallocNew!PassObliqueShadowLight(this) );    // PASS_OBLIQUE_SHADOW
        addPass( mallocNew!PassDirectionalLight(this) );      // PASS_DIRECTIONAL
        addPass( mallocNew!PassSpecularLight(this) );         // PASS_SPECULAR
        addPass( mallocNew!PassSkyboxReflections(this) );     // PASS_SKYBOX
        addPass( mallocNew!PassEmissiveContribution(this) );  // PASS_EMISSIVE
        addPass( mallocNew!PassClampAndConvertTo8bit(this) ); // PASS_CLAMP
    }

    /// Releases the per-thread images, then the slices that held them.
    ~this()
    {
        foreach (t; 0 .. _normalBuffers.length)
        {
            _normalBuffers[t].destroyFree();
            _accumBuffers[t].destroyFree();
            _varianceBuffers[t].destroyFree();
        }

        freeSlice(_normalBuffers);
        freeSlice(_accumBuffers);
        freeSlice(_varianceBuffers);
    }

    /// Forwards to the parent, then sizes every per-thread buffer to the
    /// largest area a single `compositeTile` job may be given.
    override void resizeBuffers(int width, 
                                int height,
                                int areaMaxWidth,
                                int areaMaxHeight)
    {
        super.resizeBuffers(width, height, areaMaxWidth, areaMaxHeight);

        const int noBorder = 0;
        const int rowAlignNone = 1;
        const int rowAlign16 = 16; // accumulation rows are accessed with aligned SSE loads/stores

        foreach (t; 0 .. numThreads())
        {
            _normalBuffers[t].size(areaMaxWidth, areaMaxHeight, noBorder, rowAlignNone);
            _accumBuffers[t].size(areaMaxWidth, areaMaxHeight, noBorder, rowAlign16);
            _varianceBuffers[t].size(areaMaxWidth, areaMaxHeight, noBorder, rowAlignNone);
        }
    }


    /// Renders every area of `areas` into `wfb`, distributing areas over the
    /// thread pool. For one area, the accumulation buffer of the executing
    /// thread is cleared and then all passes run in order.
    override void compositeTile(ImageRef!RGBA wfb, 
                                const(box2i)[] areas,
                                Mipmap!RGBA diffuseMap,
                                Mipmap!RGBA materialMap,
                                Mipmap!L16 depthMap,
                                IProfiler profiler)
    {
        // Everything the passes need, shared by all areas of this call.
        PBRCompositorPassBuffers passData;
        passData.outputBuf = &wfb;
        passData.diffuseMap = diffuseMap;
        passData.materialMap = materialMap;
        passData.depthMap = depthMap;
        passData.accumBuffers = _accumBuffers;
        passData.normalBuffers = _normalBuffers;
        passData.varianceBuffers = _varianceBuffers;

        // Job body: runs all passes, one after the other, on a single area.
        void renderArea(int areaIndex, int threadIndex) nothrow @nogc
        {
            version(Dplug_ProfileUI) 
            {
                profiler.category("PBR");
            }

            box2i area = areas[areaIndex];

            // All passes add into the accumulation buffer, so start from zero.
            OwnedImage!RGBAf accum = _accumBuffers[threadIndex];
            RGBAf cleared = RGBAf(0.0f, 0.0f, 0.0f, 0.0f);
            foreach (row; 0 .. area.height)
            {
                RGBAf* line = accum.scanline(row).ptr;
                line[0..area.width] = cleared;
            }

            foreach(pass; passes())
            {
                version(Dplug_ProfileUI) 
                {
                    char[96] buf;
                    snprintf(buf.ptr, 96, "Pass %s".ptr, pass.name.ptr);
                    profiler.begin(buf);
                }

                pass.renderIfActive(threadIndex, area, cast(CompositorPassBuffers*)&passData);

                version(Dplug_ProfileUI) 
                {
                    profiler.end;
                }
            }
        }

        threadPool().parallelFor(cast(int)areas.length, &renderArea);
    }

private:
    OwnedImage!RGBf[] _normalBuffers; // store computed normals
    OwnedImage!RGBAf[] _accumBuffers; // store accumulated color
    OwnedImage!L32f[] _varianceBuffers; // store computed normal variance, useful for anti-aliasing
}
265 
/// First pass: from level 0 of the depth map, computes a normal and an
/// estimate of normal variance for each pixel of the area. Results go to the
/// per-thread normal and variance buffers, addressed relative to the area origin.
class PassComputeNormal : CompositorPass
{
nothrow:
@nogc:

    this(MultipassCompositor parent)
    {
        super(parent);
    }

    override void render(int threadIndex, const(box2i) area, CompositorPassBuffers* buffers)
    {
        PBRCompositorPassBuffers* pbr = cast(PBRCompositorPassBuffers*) buffers;
        OwnedImage!L16 depth0 = pbr.depthMap.levels[0];
        OwnedImage!RGBf normalOut = pbr.normalBuffers[threadIndex];
        OwnedImage!L32f varianceOut = pbr.varianceBuffers[threadIndex];

        // Byte distance between two consecutive depth rows.
        const int pitch = depth0.pitchInBytes();

        // Scale applied to raw 16-bit depth before plane fitting.
        enum float depthScale = 1.0 / FACTOR_Z;

        for (int y = area.min.y; y < area.max.y; ++y)
        {
            const int tileY = y - area.min.y;
            RGBf* normalRow = normalOut.scanline(tileY).ptr;
            L32f* varianceRow = varianceOut.scanline(tileY).ptr;

            // Note (from the original author): level 0 of the depth map has a border
            //       of 1 and a trailingSamples of 2, which is what allows reading
            //       4 depth samples at once below.
            const(L16)* depthRow = depth0.scanlinePtr(y);

            for (int x = area.min.x; x < area.max.x; ++x)
            {
                const int tileX = x - area.min.x;
                const(L16)* center = depthRow + x;
                const(ubyte)* centerBytes = cast(const(ubyte)*) center;

                // Normal: plane fitting over the scaled 3x3 depth neighbourhood.
                {
                    const(L16)* rowAbove = cast(const(L16)*) (centerBytes - pitch);
                    const(L16)* rowBelow = cast(const(L16)*) (centerBytes + pitch);

                    float[9] patch = void;
                    patch[0] = rowAbove[-1].l * depthScale;
                    patch[1] = rowAbove[ 0].l * depthScale;
                    patch[2] = rowAbove[+1].l * depthScale;
                    patch[3] = center[-1].l   * depthScale;
                    patch[4] = center[ 0].l   * depthScale;
                    patch[5] = center[+1].l   * depthScale;
                    patch[6] = rowBelow[-1].l * depthScale;
                    patch[7] = rowBelow[ 0].l * depthScale;
                    patch[8] = rowBelow[+1].l * depthScale;

                    vec3f n = computePlaneFittingNormal(patch.ptr);
                    normalRow[tileX] = RGBf(n.x, n.y, n.z);
                }

                // Normal variance (old method).
                {
                    // Each load fetches 4 samples starting one sample (2 bytes) left of
                    // the center; 12 samples in total, the rightmost of each row unused.
                    __m128i rawAbove  = _mm_loadl_epi64( cast(const(__m128i)*)(centerBytes - pitch - 2) );
                    __m128i rawCenter = _mm_loadl_epi64( cast(const(__m128i)*)(centerBytes - 2) );
                    __m128i rawBelow  = _mm_loadl_epi64( cast(const(__m128i)*)(centerBytes + pitch - 2) );

                    // Zero-extend 16-bit samples to 32-bit, then convert to float.
                    __m128i zeroes = _mm_setzero_si128();
                    __m128 depthM1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(rawAbove, zeroes));
                    __m128 depthP0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(rawCenter, zeroes));
                    __m128 depthP1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(rawBelow, zeroes));

                    enum useLaplacian = false;
                    static if (useLaplacian)
                    {
                        // Possibly a bit better, not tried further since
                        // it is a pain to make it match for the passes that use it.
                        // 3x3 Laplacian-like kernel applied to depth.
                        align(16) static immutable float[12] LAPLACIAN =
                        [
                            0.25,  0.5, 0.25, 0,
                            0.5, -3.0,  0.5, 0,
                            0.25,  0.5, 0.25, 0,
                        ];

                        __m128 mul = depthM1 * _mm_load_ps(&LAPLACIAN[0]) 
                                   + depthP0 * _mm_load_ps(&LAPLACIAN[4])
                                   + depthP1 * _mm_load_ps(&LAPLACIAN[8]);
                        float laplace = mul.array[0] + mul.array[1] + mul.array[2] + mul.array[3];
                        laplace /= 256.0f;
                        float variance = laplace*laplace;
                    }
                    else
                    {
                        // Second-order derivative of depth along X, kernel:
                        //  1 -2  1
                        //  1 -2  1
                        //  1 -2  1
                        const(__m128) kernelDDX = _mm_setr_ps( 1.0f, -2.0f,  1.0f, 0.0f);   
                        __m128 prodDDX = kernelDDX * (depthM1 + depthP0 + depthP1);
                        float depthDX = prodDDX.array[0] + prodDDX.array[1] + prodDDX.array[2];

                        // Second-order derivative of depth along Y, kernel:
                        //  1  1  1
                        // -2 -2 -2
                        //  1  1  1
                        const(__m128) kernelDDY_outer  = _mm_setr_ps( 1.0f,  1.0f,  1.0f, 0.0f);
                        const(__m128) kernelDDY_center = _mm_setr_ps(-2.0f, -2.0f, -2.0f, 0.0f);
                        __m128 prodDDY = kernelDDY_outer * (depthM1 + depthP1) + depthP0 * kernelDDY_center;
                        float depthDY = prodDDY.array[0] + prodDDY.array[1] + prodDDY.array[2];

                        depthDX *= (1 / 256.0f); // #RESIZE: sounds strange
                        depthDY *= (1 / 256.0f);
                        float variance = (depthDX * depthDX + depthDY * depthDY);
                    }

                    // `static if` introduces no scope, so `variance` is visible here.
                    varianceRow[tileX] = L32f(variance);
                }
            }
        }
    }
}
381 
382 
/// Ambient pass: gives light depending on whether a pixel is statistically
/// above its neighbours. The pixel depth is compared to an average of the
/// depth mipmap levels 1 to 4, and the resulting "cavity" term scales the base color.
class PassAmbientOcclusion : CompositorPass
{
nothrow:
@nogc:

    /// Strength of the ambient contribution.
    float amount = 0.08125f;

    // TODO: add ambient light color

    this(MultipassCompositor parent)
    {
        super(parent);
    }

    override void render(int threadIndex, const(box2i) area, CompositorPassBuffers* buffers)
    {
        PBRCompositorPassBuffers* pbr = cast(PBRCompositorPassBuffers*) buffers;
        Mipmap!L16 depthMipmap = pbr.depthMap;
        OwnedImage!L16 depth0 = pbr.depthMap.levels[0];
        OwnedImage!RGBA diffuse0 = pbr.diffuseMap.levels[0];
        OwnedImage!RGBAf accum = pbr.accumBuffers[threadIndex];

        // A depth difference of -23040 maps to cavity 0, a difference of 0 to cavity 1.
        enum float divider23040 = 1.0f / 23040;

        for (int y = area.min.y; y < area.max.y; ++y)
        {
            RGBA* diffuseRow = diffuse0.scanlinePtr(y);
            const(L16*) depthRow = depth0.scanlinePtr(y);
            RGBAf* accumRow = accum.scanlinePtr(y - area.min.y);

            // Sampling position is the pixel center.
            const float centerY = y + 0.5f;

            for (int x = area.min.x; x < area.max.x; ++x)
            {
                const float centerX = x + 0.5f;

                // #RESIZE: if the plugin is large, should sample higher in mipmap levels
                float neighbourhoodDepth =
                    ( depthMipmap.linearSample(1, centerX, centerY)
                        + depthMipmap.linearSample(2, centerX, centerY)
                        + depthMipmap.linearSample(3, centerX, centerY)
                        + depthMipmap.linearSample(4, centerX, centerY) ) * 0.25f;

                float above = depthRow[x].l - neighbourhoodDepth;

                // Clamp to [0, 1].
                float cavity = (above + 23040.0f) * divider23040;
                cavity = (cavity >= 1) ? 1.0f
                       : (cavity < 0)  ? 0.0f
                       : cavity;

                __m128 albedo = convertBaseColorToFloat4(diffuseRow[x]);
                __m128 contribution = albedo * _mm_set1_ps(cavity * amount);

                // Aligned read-modify-write of one RGBAf accumulator.
                float* dest = cast(float*)(&accumRow[x - area.min.x]);
                _mm_store_ps(dest, _mm_load_ps(dest) + contribution);
            }
        }
    }
}
444 
/// Primary light pass: an oblique light that casts shadows.
///
/// For each pixel, depth is sampled at distances 1 to 10 along two diagonals
/// going towards row 0: `(i + s, j - s)` and `(i - s, j - s)`, with coordinates
/// clamped to the image. A sample lets all light through when it is not higher
/// than `centerDepth + s`, and progressively blocks it until it is 15360 depth
/// units higher. Contributions are weighted by `fallOff ^^ s`, the second
/// diagonal counting for 0.7 of the first, then normalized and multiplied by
/// the base color and `color`.
class PassObliqueShadowLight : CompositorPass
{
nothrow:
@nogc:

    /// Color of this light pass.
    vec3f color = vec3f(0.25f, 0.25f, 0.25f) * 1.3f;

    this(MultipassCompositor parent)
    {
        super(parent);
    }

    override void render(int threadIndex, const(box2i) area, CompositorPassBuffers* buffers)
    {
        PBRCompositorPassBuffers* PBRbuf = cast(PBRCompositorPassBuffers*) buffers;
        OwnedImage!L16 depthLevel0 = PBRbuf.depthMap.levels[0];
        OwnedImage!RGBA diffuseLevel0 = PBRbuf.diffuseMap.levels[0];
        OwnedImage!RGBAf accumBuffer = PBRbuf.accumBuffers[threadIndex];

        // Add a primary light that cast shadows

        enum float fallOff = 0.78f; // #RESIZE, recompute that table as needed

        // Number of table entries; samples 1 .. samples-1 are actually taken.
        enum int samples = 11; // #RESIZE ditto

        // PERF: align(16) on weight[1]
        static immutable float[samples] weights =
        [
            1.0f,
            fallOff,
            fallOff ^^ 2,
            fallOff ^^ 3,
            fallOff ^^ 4,
            fallOff ^^ 5,
            fallOff ^^ 6,
            fallOff ^^ 7,
            fallOff ^^ 8,
            fallOff ^^ 9,
            fallOff ^^ 10
        ];

        // Geometric series sum of weights[0 .. samples], minus weights[0] which is never used.
        enum float totalWeights = (1.0f - (fallOff ^^ samples)) / (1.0f - fallOff) - 1;
        // 1.7 = 1 (first diagonal) + 0.7 (second diagonal).
        enum float invTotalWeights = 1 / (1.7f * totalWeights);

        // Depth difference over which a sample goes from fully lit to fully shadowed.
        // Contribution is f(x) = clamp(A*x + B, 0, 1) with A = 1/15360 and B = 1,
        // i.e. diff 0 maps to 1 and diff -15360 maps to 0.
        // BUG (original note): not consistent with FACTOR_Z, this is steeper...
        enum int shadowRange = 15360;
        enum float divider15360 = 1.0f / shadowRange;

        const int wholeWidth = depthLevel0.w;

        // Loop-invariant SIMD constants.
        const __m128 mmZeroesf = _mm_setzero_ps();
        const __m128i mmZero = _mm_setzero_si128();
        const __m128 mmOnes = _mm_set1_ps(1.0f);
        const __m128 mm0_7 = _mm_set1_ps(0.7f);
        const __m128i maxX = _mm_set1_epi32(wholeWidth - 1);
        const __m128i mm0123 = _mm_setr_epi32(0, 1, 2, 3);
        // Slope A of the SIMD path. Kept as the original literal (approximately 1 / 15360)
        // rather than `divider15360`, so that the rendering stays bit-identical.
        const __m128 mmA = _mm_set1_ps(0.00006510416f);

        for (int j = area.min.y; j < area.max.y; ++j)
        {
            RGBA* diffuseScan = diffuseLevel0.scanlinePtr(j);

            const(L16*) depthScan = depthLevel0.scanlinePtr(j);
            RGBAf* accumScan = accumBuffer.scanlinePtr(j - area.min.y);

            const __m128i mmJ = _mm_set1_epi32(j); // Y coord

            for (int i = area.min.x; i < area.max.x; ++i)
            {
                const(L16)* depthHere = depthScan + i;
                RGBA ibaseColor = diffuseScan[i];
                vec3f baseColor = vec3f(ibaseColor.r, ibaseColor.g, ibaseColor.b) * div255;

                float lightPassed = 0.0f;

                int depthCenter = (*depthHere).l;
                {
                    int sample = 1;
                    const __m128i mmI = _mm_set1_epi32(i); // X coord
                    const __m128i mmDepthCenter = _mm_set1_epi32(depthCenter);

                    // Four samples at a time.
                    for (; sample + 3 < samples; sample += 4)
                    {
                        __m128i mmSample = _mm_set1_epi32(sample) + mm0123;
                        __m128i x1 = mmI + mmSample;
                        __m128i x2 = mmI - mmSample;
                        __m128i y  = mmJ - mmSample;

                        // clamp source indices
                        // (x1 only grows and x2, y only shrink, hence one-sided clamps)

                        // PERF: _mm_min_epi32 and _mm_max_epi32 not available in SSE3, use _mm_min_epi16 instead
                        x1 = _mm_min_epi32(x1, maxX);
                        x2 = _mm_max_epi32(x2, mmZero);
                        y  = _mm_max_epi32( y, mmZero);

                        // Reference height rises by one depth unit per pixel of distance.
                        // NOTE(review): the original author flagged this `+ sample` as not
                        // making sense; it is preserved for visual compatibility.
                        __m128i z = mmDepthCenter + mmSample;

                        L16* scan0 = depthLevel0.scanlinePtr(y.array[0]);
                        L16* scan1 = depthLevel0.scanlinePtr(y.array[1]);
                        L16* scan2 = depthLevel0.scanlinePtr(y.array[2]);
                        L16* scan3 = depthLevel0.scanlinePtr(y.array[3]);

                        __m128 diff1 = _mm_cvtepi32_ps(
                                       z - _mm_setr_epi32( scan0[x1.array[0]].l, 
                                                           scan1[x1.array[1]].l,
                                                           scan2[x1.array[2]].l,
                                                           scan3[x1.array[3]].l ) );

                        __m128 diff2 = _mm_cvtepi32_ps(
                                       z - _mm_setr_epi32( scan0[x2.array[0]].l, 
                                                           scan1[x2.array[1]].l,
                                                           scan2[x2.array[2]].l,
                                                           scan3[x2.array[3]].l ) );

                        __m128 contrib1 = _mm_max_ps(mmZeroesf, _mm_min_ps(mmOnes, mmOnes + diff1 * mmA));
                        __m128 contrib2 = _mm_max_ps(mmZeroesf, _mm_min_ps(mmOnes, mmOnes + diff2 * mmA));
                        __m128 mmWeight = _mm_loadu_ps(&weights[sample]);
                        __m128 contrib = (contrib1 + contrib2 * mm0_7) * mmWeight;
                        lightPassed += contrib.array[0];
                        lightPassed += contrib.array[1];
                        lightPassed += contrib.array[2];
                        lightPassed += contrib.array[3];
                    }

                    // Remaining samples, scalar version of the same computation.
                    for ( ; sample < samples; ++sample)
                    {
                        int x1 = i + sample;
                        if (x1 >= wholeWidth)
                            x1 = wholeWidth - 1;
                        int x2 = i - sample;
                        if (x2 < 0)
                            x2 = 0;
                        int y = j - sample;
                        if (y < 0)
                            y = 0;
                        int z = depthCenter + sample; // NOTE(review): see remark in the SIMD loop
                        L16* scan = depthLevel0.scanlinePtr(y);

                        int diff1 = z - scan[x1].l; // FUTURE: use pointer offsets here instead of opIndex
                        int diff2 = z - scan[x2].l;

                        float contrib1 = void, 
                            contrib2 = void;

                        if (diff1 >= 0)
                            contrib1 = 1;
                        else if (diff1 < -shadowRange)
                            contrib1 = 0;
                        else
                            contrib1 = (diff1 + shadowRange) * divider15360;

                        if (diff2 >= 0)
                            contrib2 = 1;
                        else if (diff2 < -shadowRange)
                            contrib2 = 0;
                        else
                            contrib2 = (diff2 + shadowRange) * divider15360;

                        lightPassed += (contrib1 + contrib2 * 0.7f) * weights[sample];
                    }
                }
                vec3f finalColor = baseColor * color * (lightPassed * invTotalWeights);
                __m128 mmColor = _mm_setr_ps(finalColor.r, finalColor.g, finalColor.b, 0.0f);

                // Aligned read-modify-write of one RGBAf accumulator; alpha is left untouched.
                float* accumHere = cast(float*)(&accumScan[i - area.min.x]);
                _mm_store_ps(accumHere, _mm_load_ps(accumHere) + mmColor);
            }
        }
    }
}
616 
/// Secondary light pass: a directional light without shadows.
/// The diffuse term is a half-Lambert-like `0.5 + 0.5 * dot(normal, direction)`,
/// remapped with a lower bound that depends on the roughness read from the
/// red channel of the material map.
class PassDirectionalLight : CompositorPass
{
nothrow:
@nogc:
public:

    /// World-space direction. Unsure of the particular space it lives in.
    vec3f direction = vec3f(0.0f, 1.0f, 0.1f).normalized;

    /// Color of this light pass.
    vec3f color = vec3f(0.481f, 0.481f, 0.481f);

    this(MultipassCompositor parent)
    {
        super(parent);
    }

    override void render(int threadIndex, const(box2i) area, CompositorPassBuffers* buffers)
    {
        PBRCompositorPassBuffers* pbr = cast(PBRCompositorPassBuffers*) buffers;
        OwnedImage!RGBA diffuse0 = pbr.diffuseMap.levels[0];
        OwnedImage!RGBA material0 = pbr.materialMap.levels[0];
        OwnedImage!RGBf normals = pbr.normalBuffers[threadIndex];
        OwnedImage!RGBAf accum = pbr.accumBuffers[threadIndex];

        for (int y = area.min.y; y < area.max.y; ++y)
        {
            // Maps are addressed in whole-image coordinates,
            // per-thread buffers relative to the area origin.
            const int tileY = y - area.min.y;
            RGBA* materialRow = material0.scanlinePtr(y);
            RGBA* diffuseRow = diffuse0.scanlinePtr(y);
            RGBf* normalRow = normals.scanlinePtr(tileY);
            RGBAf* accumRow = accum.scanlinePtr(tileY);

            for (int x = area.min.x; x < area.max.x; ++x)
            {
                const int tileX = x - area.min.x;

                RGBf n = normalRow[tileX];
                vec3f normal = vec3f(n.r, n.g, n.b);

                float roughness = materialRow[x].r * div255;

                RGBA albedo8 = diffuseRow[x];
                vec3f baseColor = vec3f(albedo8.r, albedo8.g, albedo8.b) * div255;

                float diffuseFactor = 0.5f + 0.5f * dot(normal, direction);
                diffuseFactor = linmap!float(diffuseFactor, 0.24f - roughness * 0.5f, 1, 0, 1.0f);

                vec3f lit = baseColor * color * diffuseFactor;
                accumRow[tileX] += RGBAf(lit.r, lit.g, lit.b, 0.0f);
            }
        }
    }
}
666 
667 class PassSpecularLight : CompositorPass
668 {
669 nothrow:
670 @nogc:
671 public:
672 
673     /// World-space direction. Unsure of the particular space it lives in.
674     vec3f direction = vec3f(0.0f, 1.0f, 0.1f).normalized;
675 
676     /// Color of this light pass.
677     vec3f color = vec3f(0.26f, 0.26f, 0.26f);
678 
679     this(MultipassCompositor parent)
680     {
681         super(parent);
682         _specularFactor.reallocBuffer(numThreads());
683         _exponentFactor.reallocBuffer(numThreads());
684         _toksvigScaleFactor.reallocBuffer(numThreads());
685 
686         // initialize new elements in the array, else realloc wouldn't work well next
687         for (int thread = 0; thread < numThreads(); ++thread)
688         {
689             _specularFactor[thread] = null;
690             _exponentFactor[thread] = null;
691             _toksvigScaleFactor[thread] = null;
692         }
693 
694         for (int roughByte = 0; roughByte < 256; ++roughByte)
695         {
696             _exponentTable[roughByte] = 0.8f * exp( (1-roughByte / 255.0f) * 5.5f);
697 
698             // Convert Phong exponent to Blinn-phong exponent
699             _exponentTable[roughByte] *= 2.8f; // tuned by hand to match the former "legacy" Phong specular highlight. This makes very little difference.
700         }
701 
702     }
703 
704     override void resizeBuffers(int width, 
705                                 int height,
706                                 int areaMaxWidth,
707                                 int areaMaxHeight)
708     {
709         // resize all thread-local buffers
710         for (int thread = 0; thread < numThreads(); ++thread)
711         {
712             _specularFactor[thread].reallocBuffer(width);
713             _exponentFactor[thread].reallocBuffer(width);
714             _toksvigScaleFactor[thread].reallocBuffer(width);
715         }
716     }
717 
718     /// Adds the specular contribution of this light for `area` into the accum buffer of
719     /// `threadIndex`. Each scanline runs in three steps: base and exponent, batched pow, shading.
720     override void render(int threadIndex, const(box2i) area, CompositorPassBuffers* buffers)
721     {
722         PBRCompositorPassBuffers* PBRbuf = cast(PBRCompositorPassBuffers*) buffers;
723         OwnedImage!RGBA diffuseLevel0 = PBRbuf.diffuseMap.levels[0];
724         OwnedImage!RGBA materialLevel0 = PBRbuf.materialMap.levels[0];
725         OwnedImage!RGBf normalBuffer = PBRbuf.normalBuffers[threadIndex];
726         OwnedImage!RGBAf accumBuffer = PBRbuf.accumBuffers[threadIndex];
727         OwnedImage!L32f varianceBuffer = PBRbuf.varianceBuffers[threadIndex];
728 
729         immutable float invW = 1.0f / diffuseLevel0.w;
730         immutable float invH = 1.0f / diffuseLevel0.h;
731 
732         // The light direction and color are the same for every pixel: build both vectors once.
733         __m128 mmlight3Dir = _mm_setr_ps(-direction.x, -direction.y, -direction.z, 0.0f);
734         __m128 mmLightColor = _mm_setr_ps(color.x, color.y, color.z, 0.0f);
735         float* pSpecular = _specularFactor[threadIndex].ptr;
736         float* pExponent = _exponentFactor[threadIndex].ptr;
737         float* pToksvigScale = _toksvigScaleFactor[threadIndex].ptr;
738 
739         for (int j = area.min.y; j < area.max.y; ++j)
740         {
741             RGBA* materialScan = materialLevel0.scanlinePtr(j);
742             RGBA* diffuseScan = diffuseLevel0.scanlinePtr(j);
743             RGBf* normalScan = normalBuffer.scanlinePtr(j - area.min.y);
744             RGBAf* accumScan = accumBuffer.scanlinePtr(j - area.min.y);
745             L32f* varianceScan = varianceBuffer.scanlinePtr(j - area.min.y);
746 
747             for (int i = area.min.x; i < area.max.x; ++i)
748             {
749                 RGBA materialHere = materialScan[i];
750                 RGBf normalFromBuf = normalScan[i - area.min.x];
751                 __m128 normal = convertNormalToFloat4(normalFromBuf);
752 
753                 // TODO: this should be tuned interactively, maybe it's annoying to feel
754                 //       Need to compute the viewer distance from screen... and DPI.
755                 // #RESIZE
756                 __m128 toEye = _mm_setr_ps(0.5f - i * invW, j * invH - 0.5f, 1.0f, 0.0f);
757                 toEye = _mm_fast_normalize_ps(toEye);
758 
759                 __m128 halfVector = toEye - mmlight3Dir;
760                 halfVector = _mm_fast_normalize_ps(halfVector);
761                 float specularFactor = _mm_dot_ps(halfVector, normal);
762 
763                 if (specularFactor < 1e-3f) 
764                     specularFactor = 1e-3f;
765 
766                 float exponent = _exponentTable[materialHere.r];
767 
768                 // From NVIDIA Technical Brief: "Mipmapping Normal Maps"
769                 // We use normal variance to reduce exponent and scale of the specular
770                 // highlight, which should avoid aliasing.
771                 float VARIANCE_FACTOR = 4e-5f; // was very hard to tune, probably should not be dx*dx+dy*dy?
772                 float variance = varianceScan[i - area.min.x].l;
773                 float Ft = 1.0f / (1.0f + exponent * variance * VARIANCE_FACTOR);
774                 float scaleFactorToksvig = ( (1.0f + exponent * Ft) / (1.0f + exponent) );
775                 assert(scaleFactorToksvig <= 1);
776                 pToksvigScale[i] = scaleFactorToksvig;
777                 pSpecular[i] = specularFactor;
778                 pExponent[i] = exponent * Ft;
779             }
780 
781             // Just the pow operation for this line: 4 pixels at a time, then the scalar remainder.
782             {
783                 int i = area.min.x;
784                 for (; i + 3 < area.max.x; i += 4)
785                 {
786                     _mm_storeu_ps(&pSpecular[i], _mm_pow_ps(_mm_loadu_ps(&pSpecular[i]), _mm_loadu_ps(&pExponent[i])));
787                 }
788                 for (; i < area.max.x; ++i)
789                 {
790                     pSpecular[i] = _mm_pow_ss(pSpecular[i], pExponent[i]);
791                 }
792             }
793 
794             for (int i = area.min.x; i < area.max.x; ++i)
795             {
796                 float specularFactor = pSpecular[i];
797 
798                 __m128 material = convertMaterialToFloat4(materialScan[i]);
799                 float roughness = material.array[0];
800                 float metalness = material.array[1];
801                 float specular  = material.array[2];
802                 __m128 baseColor = convertBaseColorToFloat4(diffuseScan[i]);
803 
804                 float roughFactor = 10 * (1.0f - roughness) * (1 - metalness * 0.5f);
805                 specularFactor = specularFactor * roughFactor * pToksvigScale[i];
806                 __m128 finalColor = baseColor * mmLightColor * _mm_set1_ps(specularFactor * specular);
807 
808                 _mm_store_ps(cast(float*)(&accumScan[i - area.min.x]), _mm_load_ps(cast(float*)(&accumScan[i - area.min.x])) + finalColor);
809             }
810         }
811     }
812 
813     ~this() // Shrinks the per-thread rows to zero length first, then the arrays holding them.
814     {
815         for (int t = 0, count = numThreads(); t < count; ++t)
816         {
817             _toksvigScaleFactor[t].reallocBuffer(0);
818             _exponentFactor[t].reallocBuffer(0);
819             _specularFactor[t].reallocBuffer(0);
820         }
821         _toksvigScaleFactor.reallocBuffer(0);
822         _exponentFactor.reallocBuffer(0);
823         _specularFactor.reallocBuffer(0);
824     }
825 
826 private:
827     float[256] _exponentTable; // specular exponent per roughness byte; `render` looks it up with the material `r` channel
828 
829     // Note: these are per-thread scratch buffers; the first index is the thread index, the second the pixel x.
830     float[][] _specularFactor; // dot(halfVector, normal), then replaced in place by its power
831     float[][] _exponentFactor; // exponent used for that power (table exponent times Ft)
832     float[][] _toksvigScaleFactor; // Toksvig scale multiplied into the final specular factor
833 }
834 
835 /// Compositor pass that fakes environment reflections: each pixel samples a mipmapped skybox
836 /// along the view direction reflected by the pixel normal, and adds it to the accum buffer.
837 class PassSkyboxReflections : CompositorPass
838 {
839 nothrow:
840 @nogc:
841 public:
842 
843     /// Global strength of the reflection contribution.
844     float amount = 0.52f;
845 
846     this(MultipassCompositor parent)
847     {
848         super(parent);
849     }
850 
851     ~this()
852     {
853         releaseSkybox();
854     }
855 
856     /// Replaces the skybox and rebuilds its mipmaps; a previously set skybox is destroyed.
857     /// Note: take ownership of image. That image must have been built with `mallocNew`.
858     void setSkybox(OwnedImage!RGBA image)
859     {
860         releaseSkybox();
861         _skybox = mallocNew!(Mipmap!RGBA)(12, image);
862         _skybox.generateMipmaps(Mipmap!RGBA.Quality.box);
863     }
864 
865     /// Adds the skybox reflection of `area` to the accum buffer of `threadIndex`; no-op without a skybox.
866     override void render(int threadIndex, const(box2i) area, CompositorPassBuffers* buffers)
867     {
868         if (_skybox is null)
869             return;
870 
871         PBRCompositorPassBuffers* PBRbuf = cast(PBRCompositorPassBuffers*) buffers;
872         OwnedImage!RGBA diffuseLevel0 = PBRbuf.diffuseMap.levels[0];
873         OwnedImage!RGBA materialLevel0 = PBRbuf.materialMap.levels[0];
874         OwnedImage!RGBf normalBuffer = PBRbuf.normalBuffers[threadIndex];
875         OwnedImage!RGBAf accumBuffer = PBRbuf.accumBuffers[threadIndex];
876         OwnedImage!L32f varianceBuffer = PBRbuf.varianceBuffers[threadIndex];
877 
878         immutable float invW = 1.0f / diffuseLevel0.w;
879         immutable float invH = 1.0f / diffuseLevel0.h;
880         // None of the values below depend on the pixel, so they are computed once per call.
881         enum float ROUGH_FACT = 6.0f / 255.0f;
882         immutable float amountOfSkyboxPixels = _skybox.width * _skybox.height;
883         immutable float fskyX = (_skybox.width - 1.0f);
884         immutable float fSkyY = (_skybox.height - 1.0f);
885         immutable float amountFactor = amount * div255;
886 
887         // skybox reflection (use the same shininess as specular)
888         for (int j = area.min.y; j < area.max.y; ++j)
889         {
890             RGBA* materialScan = materialLevel0.scanlinePtr(j);
891             RGBA* diffuseScan = diffuseLevel0.scanlinePtr(j);
892             RGBf* normalScan = normalBuffer.scanlinePtr(j - area.min.y);
893             RGBAf* accumScan = accumBuffer.scanlinePtr(j - area.min.y);
894             L32f* varianceScan = varianceBuffer.scanlinePtr(j - area.min.y);
895 
896             for (int i = area.min.x; i < area.max.x; ++i)
897             {
898                 // First compute the needed mipmap level for this pixel, from variance and roughness
899                 float mipmapLevel = varianceScan[i - area.min.x].l * amountOfSkyboxPixels;
900                 float roughness = materialScan[i].r;
901                 mipmapLevel = 0.5f * fastlog2(1.0f + mipmapLevel * 0.00001f) + ROUGH_FACT * roughness;
902 
903                 // TODO: same remark as in the specular pass about toEye, something to think about. #RESIZE
904                 __m128 toEye = _mm_setr_ps(0.5f - i * invW, j * invH - 0.5f, 1.0f, 0.0f);
905                 toEye = _mm_fast_normalize_ps(toEye);
906 
907                 __m128 normal = convertNormalToFloat4(normalScan[i - area.min.x]);
908                 __m128 pureReflection = _mm_reflectnormal_ps(toEye, normal);
909                 __m128 material = convertMaterialToFloat4(materialScan[i]);
910                 float metalness = material.array[1];
911                 __m128 baseColor = convertBaseColorToFloat4(diffuseScan[i]);
912                 float skyx = 0.5f + ((0.5f - pureReflection.array[0] * 0.5f) * fskyX);
913                 float skyy = 0.5f + ((0.5f + pureReflection.array[1] * 0.5f) * fSkyY);
914                 __m128 skyColorAtThisPoint = convertVec4fToFloat4( _skybox.linearMipmapSample(mipmapLevel, skyx, skyy) );
915                 __m128 color = baseColor * skyColorAtThisPoint * _mm_set1_ps(metalness * amountFactor);
916                 _mm_store_ps(cast(float*)(&accumScan[i - area.min.x]), _mm_load_ps(cast(float*)(&accumScan[i - area.min.x])) + color);
917             }
918         }
919     }
920 
921 private:
922     /// Used for faking environment reflections.
923     Mipmap!RGBA _skybox = null;
924 
925     /// Destroys the current skybox, if any, and resets the reference to null.
926     void releaseSkybox()
927     {
928         if (_skybox is null)
929             return;
930         _skybox.destroyFree();
931         _skybox = null;
932     }
933 }
934 
935 /// Compositor pass adding a bloom-like glow: samples of mipmap levels 1 to 5 of the diffuse
936 /// map are weighted and summed into the accum buffer.
937 class PassEmissiveContribution : CompositorPass
938 {
939 nothrow:
940 @nogc:
941 public:
942 
943     this(MultipassCompositor parent)
944     {
945         super(parent);
946     }
947 
948     override void render(int threadIndex, const(box2i) area, CompositorPassBuffers* buffers)
949     {
950         PBRCompositorPassBuffers* passBuffers = cast(PBRCompositorPassBuffers*) buffers;
951         Mipmap!RGBA diffuse = passBuffers.diffuseMap;
952         OwnedImage!RGBAf accumImage = passBuffers.accumBuffers[threadIndex];
953 
954         // Add light emitted by neighbours (bloom-like).
955         for (int y = area.min.y; y < area.max.y; ++y)
956         {
957             RGBAf* accumRow = accumImage.scanlinePtr(y - area.min.y);
958             float centerY = y + 0.5f; // sample at the pixel center; constant along the row
959             for (int x = area.min.x; x < area.max.x; ++x)
960             {
961                 float centerX = x + 0.5f;
962                 // Get alpha-premultiplied, avoids to have to do alpha-aware mipmapping
963                 // #RESIZE: more pixels => light travels further
964                 vec4f colorLevel1 = diffuse.linearSample(1, centerX, centerY);
965                 vec4f colorLevel2 = diffuse.linearSample(2, centerX, centerY);
966                 vec4f colorLevel3 = diffuse.linearSample(3, centerX, centerY);
967 
968                 version(futurePBREmissive)
969                 {
970                     // See Issue #827; this was a problem for Emissive highlights.
971                     vec4f colorLevel4 = diffuse.cubicSample(4, centerX, centerY);
972                     vec4f colorLevel5 = diffuse.cubicSample(5, centerX, centerY);
973                 }
974                 else
975                 {
976                     vec4f colorLevel4 = diffuse.linearSample(4, centerX, centerY);
977                     vec4f colorLevel5 = diffuse.linearSample(5, centerX, centerY);
978                 }
979 
980                 version(futurePBREmissive)
981                 {
982                     // With the linear-space mipmap in Diffuse, every level gets the same weight;
983                     // only the coarsest level is modulated by a small blue-noise term.
984                     float noise = (BLUE_NOISE_16x16[(x & 15)*16 + (y & 15)] - 127.5f) * 0.003f;
985                     enum float AMT = 0.002f * 0.67f; // good values for Couture: 0.67f (and 0.66f in 2nd pos)
986                     vec4f emitted = colorLevel1 * AMT;
987                     emitted += colorLevel2      * AMT;
988                     emitted += colorLevel3      * AMT;
989                     emitted += colorLevel4      * AMT;
990                     emitted += colorLevel5      * AMT * (1 + noise);
991                 }
992                 else
993                 {
994                     vec4f emitted = colorLevel1 * 0.00117647f;
995                     emitted += colorLevel2      * 0.00176471f;
996                     emitted += colorLevel3      * 0.00147059f;
997                     emitted += colorLevel4      * 0.00088235f;
998                     emitted += colorLevel5      * 0.00058823f;
999                 }
1000                 accumRow[x - area.min.x] += RGBAf(emitted.r, emitted.g, emitted.b, emitted.a);
1001             }
1002         }
1003     }
1004 }
1005 
1006 
1007 // 16x16 patch of 8-bit blue noise, tileable. Stored flat (256 entries); the emissive pass indexes it with pixel coordinates wrapped to 0..15.
1008 private static immutable ubyte[256] BLUE_NOISE_16x16 =
1009 [
1010     127, 194, 167,  79,  64, 173,  22,  83, 167, 105, 119, 250, 201,  34, 214, 145, 
1011     233,  56,  13, 251, 203, 124, 243,  42, 216,  34,  73, 175, 133,  64, 185,  73, 
1012      93, 156, 109, 144,  34,  98, 153, 138, 187, 238, 155,  46,  13, 102, 247,   0,
1013      28, 180,  46, 218, 183,  13, 212,  69,  13,  92, 126, 228, 211, 161, 117, 197, 
1014     134, 240, 121,  75, 234,  88,  53, 170, 109, 204,  59,  22,  86, 141,  38, 222,
1015      81, 205,  13,  59, 160, 198, 129, 252,   0, 147, 176, 193, 244,  71, 173,  56,
1016      22, 168, 104, 139,  22, 114,  38, 220, 101, 231,  77,  34, 113,  13, 189,  96, 
1017     253, 148, 227, 190, 246, 174,  66, 155,  28,  50, 164, 131, 217, 151, 232, 128, 
1018     115,  69,  34,  50,  93,  13, 209,  85, 192, 120, 248,  64,  90,  28, 208,  42,
1019       0, 200, 215,  79, 125, 148, 239, 136, 181,  22, 206,  13, 185, 108,  59, 179,
1020      90, 130, 159, 182, 235,  42, 106,   0,  56,  99, 226, 140, 157, 237,  77, 165, 
1021     249,  28, 105,  13,  61, 170, 224,  75, 202, 163, 114,  81,  46,  22, 137, 223, 
1022     189,  53, 219, 142, 196,  28, 122, 154, 254,  42,  28, 242, 196, 210, 119,  38, 
1023     149,  86, 118, 245,  71,  96, 213,  13,  88, 178,  66, 129, 171,   0,  99,  69, 
1024     178,  13, 207,  38, 159, 187,  50, 132, 236, 146, 191,  95,  53, 229, 163, 241,
1025      46, 225, 102, 135,   0, 230, 110, 199,  61,   0, 221,  22, 150,  83, 112, 22
1026 ];
1027 
1028 /// Final compositor pass: converts the float accum buffer to 8-bit RGBA in the output buffer.
1029 class PassClampAndConvertTo8bit : CompositorPass
1030 {
1031 nothrow:
1032 @nogc:
1033 public:
1034 
1035     version(futurePBREmissive)
1036     {
1037         /// Level above which a color channel starts to "bleed" into the others in a gray way.
1038         /// Normally not much reason to change this.
1039         float tonemapThreshold = 1.0f;
1040 
1041         /// Tuned on Auburn plugins. This brings a sense of dynamic range, 
1042         /// possibly lower would be a bit better. 0.3f wins over 0.5f and 1.0f.
1043         float tonemapRatio     = 0.3f; 
1044     }
1045 
1046     this(MultipassCompositor parent)
1047     {
1048         super(parent);
1049     }
1050 
1051     override void render(int threadIndex, const(box2i) area, CompositorPassBuffers* buffers)
1052     {
1053         PBRCompositorPassBuffers* passBuffers = cast(PBRCompositorPassBuffers*) buffers;
1054         OwnedImage!RGBAf accumImage = passBuffers.accumBuffers[threadIndex];
1055         ImageRef!RGBA* output = passBuffers.outputBuf;
1056         immutable __m128 toByteScale = _mm_set1_ps(255.99f);
1057         immutable __m128i noBits = _mm_setzero_si128();
1058 
1059         version(futurePBREmissive)
1060         {
1061             float toneRatio = tonemapRatio / 3;
1062         }
1063 
1064         // Final pass, clamp, convert to ubyte
1065         for (int y = area.min.y; y < area.max.y; ++y)
1066         {
1067             int* outRow = cast(int*)(output.scanline(y).ptr);
1068             const(RGBAf)* accumRow = accumImage.scanlinePtr(y - area.min.y);
1069 
1070             for (int x = area.min.x; x < area.max.x; ++x)
1071             {
1072                 RGBAf texel = accumRow[x - area.min.x];
1073                 __m128 color = _mm_setr_ps(texel.r, texel.g, texel.b, 1.0f); // accumulated alpha is dropped, output is opaque
1074 
1075                 version(futurePBREmissive)
1076                 {
1077                     // Energy above the threshold, per channel.
1078                     // Weighting green higher below avoids shifting hue when tonemapping.
1079                     __m128 exceed = _mm_max_ps(_mm_setzero_ps(), color - _mm_set1_ps(tonemapThreshold));
1080 
1081                     // Compute luma of exceed energy. Note that we're operating in gamma-space still.
1082                     // Should it be applied equivalently to all components? not sure
1083                     float exceedLuma = 0.212655f * exceed.array[0] 
1084                                      + 0.715158f * exceed.array[1] 
1085                                      + 0.072187f * exceed.array[2];
1086 
1087                     // The same gray amount is added to all channels, then alpha is restored.
1088                     color += _mm_set1_ps(exceedLuma * toneRatio); 
1089                     color.ptr[3] = 1.0f;
1090                 }
1091 
1092                 __m128i asInt32 = _mm_cvttps_epi32(color * toByteScale);
1093                 __m128i asInt16 = _mm_packs_epi32(asInt32, noBits);
1094                 __m128i asBytes = _mm_packus_epi16(asInt16, noBits);
1095                 outRow[x] = asBytes.array[0];
1096             }
1097         }
1098     }
1099 }
1100 
1101 
1102 
1103 
1104 private:
1105 
1106 // Fast log2 approximation by Laurent de Soras, computed from the bit pattern of `val`.
1107 // http://www.flipcode.com/archives/Fast_log_Function.shtml
1108 float fastlog2(float val) pure nothrow @nogc
1109 {
1110     enum int EXPONENT_MASK = 255 << 23;
1111     enum int EXPONENT_OF_ONE = 127 << 23;
1112 
1113     // Reinterpret the float as its 32-bit pattern.
1114     int bits = *cast(int*)(&val);
1115 
1116     // Integer part of the result: the 8-bit exponent field minus 128.
1117     immutable int exponentPart = ((bits >> 23) & 255) - 128;
1118 
1119     // Replace the exponent field with the one of 1.0f, keeping the other bits untouched.
1120     bits = (bits & ~EXPONENT_MASK) + EXPONENT_OF_ONE;
1121     float mantissa = *cast(float*)(&bits);
1122 
1123     return mantissa + exponentPart;
1124 }
1125 
1126 // Same as `fastlog2`, but for the 4 floats of `val` at once.
1127 // log2 approximation by Laurent de Soras
1128 // http://www.flipcode.com/archives/Fast_log_Function.shtml
1129 __m128 _mm_fastlog2_ps(__m128 val) pure nothrow @nogc
1130 {
1131     __m128i bits = _mm_castps_si128(val);
1132     // Integer part of the result: the 8-bit exponent field minus 128, per lane.
1133     __m128i exponentField = _mm_and_si128(_mm_srai_epi32(bits, 23), _mm_set1_epi32(255));
1134     __m128i exponentPart = _mm_sub_epi32(exponentField, _mm_set1_epi32(128));
1135     // Replace the exponent field with the one of 1.0f, keeping the other bits untouched.
1136     bits = _mm_and_si128(bits, _mm_set1_epi32(~(255 << 23)));
1137     bits = _mm_add_epi32(bits, _mm_set1_epi32(127 << 23));
1138     return _mm_add_ps(_mm_castsi128_ps(bits), _mm_cvtepi32_ps(exponentPart));
1139 }
1140 
1141 
1142 
1143 // Material texels use the same 8-bit to float conversion as base colors.
1144 alias convertMaterialToFloat4 = convertBaseColorToFloat4;
1145 
1146 // Widens the four 8-bit channels of `rgba` to floats and scales them by 1/255.
1147 __m128 convertBaseColorToFloat4(RGBA rgba) nothrow @nogc pure
1148 {
1149     __m128i zeroes = _mm_setzero_si128();
1150     __m128i bytes = _mm_cvtsi32_si128(*cast(int*)(&rgba));
1151     __m128i words = _mm_unpacklo_epi8(bytes, zeroes);
1152     __m128i dwords = _mm_unpacklo_epi16(words, zeroes);
1153     return _mm_cvtepi32_ps(dwords) * _mm_set1_ps(div255);
1154 }
1155 // Loads a normal into lanes 0..2 (r, g, b) of a 4xfloat vector; lane 3 is set to zero.
1156 __m128 convertNormalToFloat4(RGBf normal) nothrow @nogc pure
1157 {
1158     return _mm_set_ps(0.0f, normal.b, normal.g, normal.r);
1159 }
1160 // Copies the x, y, z, w components of `vec` into lanes 0..3 of a 4xfloat vector.
1161 __m128 convertVec4fToFloat4(vec4f vec) nothrow @nogc pure
1162 {
1163     return _mm_set_ps(vec.w, vec.z, vec.y, vec.x);
1164 }
1165 // Multiplier used by this module to bring 8-bit channel values (0..255) into the 0..1 range.
1166 private enum float div255 = 1 / 255.0f;
1167 
1168 
1169 // Removed Options: building with one of these version identifiers is a compile-time error.
1170 version(legacyBlinnPhong)
1171 {
1172     static assert(false, "legacyBlinnPhong was removed in Dplug v13");
1173 }
1174 
1175 version(legacyPBRNormals)
1176 {
1177     static assert(false, "legacyPBRNormals was removed in Dplug v12");
1178 }