/**
 * Implement the linear gradient fill style. dplug:canvas internals.
 *
 * Copyright: Copyright Chris Jones 2020.
 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 */
module dplug.canvas.linearblit;

import dplug.core.math;

import dplug.canvas.rasterizer;
import dplug.canvas.gradient;
import dplug.canvas.misc;

/*
  Linear gradient blit.
*/

struct LinearBlit
{
nothrow:
@nogc:

    void init(ubyte* pixels, size_t strideBytes, int height,
              Gradient g, float x0, float y0, float x1, float y1)
    {
        assert(height > 0);
        assert(g !is null);
        assert(isPow2(g.lutLength));

        this.pixels = pixels;
        this.strideBytes = strideBytes;
        this.height = height;
        this.gradient = g;
        int lutsize = g.lutLength;

        // The gradient axis runs from (x0,y0) to (x1,y1). Precompute the
        // per-pixel LUT increment along x and y.
        xctr = x0;
        yctr = y0;
        float w = x1-x0;
        float h = y1-y0;
        float hsq = w*w + h*h;
        if (hsq < 0.1) hsq = 0.1; // avoid division by (near) zero for degenerate gradients
        xstep = lutsize * w / hsq;
        ystep = lutsize * h / hsq;
    }
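
    /*
      Added note (illustrative, not library documentation): for a pixel at (x, y)
      the LUT position computed from the steps above is

          t = (x - x0) * xstep + (y - y0) * ystep
            = lutLength * ((x - x0) * w + (y - y0) * h) / (w*w + h*h)

      i.e. the projection of the pixel onto the gradient axis, scaled so that
      (x0,y0) maps to 0 and (x1,y1) maps to lutLength. For example, with a
      horizontal gradient x0=0, x1=100 and lutLength=256, a pixel at x=50 lands
      at t = 128, the middle of the lookup table.
    */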

private:

    void linear_blit(WindingRule wr)(int* delta, DMWord* mask, int x0, int x1, int y)
    {
        assert(x0 >= 0);
        assert(x1*4 <= strideBytes);
        assert(y >= 0);
        assert(y < height);
        assert((x0 & 3) == 0);
        assert((x1 & 3) == 0);

        // main blit variables

        int bpos = x0 / 4;
        int endbit = x1 / 4;
        uint* dest = cast(uint*)(&pixels[y*strideBytes]);
        __m128i xmWinding = 0;
        uint* lut = gradient.getLookup.ptr;
        assert(gradient.lutLength <= short.max); // the LUT could be non-power-of-2 as far as LinearBlit is concerned, but that case was never needed
        short lutMax = cast(short)(gradient.lutLength - 1);

        bool isopaque = false; // opaque fast path disabled for now, would be gradient.isOpaque

        // XMM constants

        immutable __m128i XMZERO = 0;
        immutable __m128i XMFFFF = [0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF];
        immutable __m128i XMMSK16 = [0xFFFF,0xFFFF,0xFFFF,0xFFFF];

        // paint variables

        float t0 = (bpos*4-xctr)*xstep + (y-yctr)*ystep;
        __m128 xmT0 = _mm_mul_ps(_mm_set1_ps(xstep), _mm_setr_ps(0.0f,1.0f,2.0f,3.0f));
        xmT0 = _mm_add_ps(xmT0, _mm_set1_ps(t0));
        __m128 xmStep0 = _mm_set1_ps(xstep*4);
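
        // Added note: xmT0 holds the gradient LUT positions of the 4 pixels in
        // the current block, t0 + {0,1,2,3}*xstep, and xmStep0 advances all four
        // lanes by one block (4 pixels) at a time. The scalar equivalent for a
        // pixel at x would be: float t = (x - xctr)*xstep + (y - yctr)*ystep;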

        // main loop

        while (bpos < endbit)
        {
            int nsb = nextSetBit(mask, bpos, endbit);

            // do we have a span of unchanging coverage?

            if (bpos < nsb)
            {
                // Calc coverage of first pixel

                static if (wr == WindingRule.NonZero)
                {
                    int cover = xmWinding[3]+delta[bpos*4];
                    cover = abs(cover)*2;
                    if (cover > 0xFFFF) cover = 0xFFFF;
                }
                else
                {
                    int cover = xmWinding[3]+delta[bpos*4];
                    short tsc = cast(short) cover;
                    cover = (tsc ^ (tsc >> 15)) * 2;
                }
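
                // Added note: under NonZero the winding count is abs()'d, doubled
                // and clamped to 0xFFFF (full pixel coverage appears to correspond
                // to a count of 0x8000, judging by the doubling and the clamp).
                // Under EvenOdd the count is truncated to 16 bits and folded into
                // a triangle wave: e.g. a count of 0x8000 (covered once) gives
                // ~0xFFFF, while 0x10000 (covered twice) wraps back to 0.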

                // We can skip the span

                if (cover == 0)
                {
                    __m128 tsl = _mm_set1_ps(nsb-bpos);
                    xmT0 = _mm_add_ps(xmT0, _mm_mul_ps(tsl,xmStep0));
                    bpos = nsb;
                }

                // Or fill span with solid color

                else if (isopaque && (cover > 0xFF00))
                {
                    uint* ptr = &dest[bpos*4];
                    uint* end = ptr + ((nsb-bpos)*4);

                    while (ptr < end)
                    {
                        __m128i ipos = _mm_cvttps_epi32 (xmT0);
                        ipos = _mm_clamp_0_to_N_epi32(ipos, lutMax);
                        xmT0 = xmT0 + xmStep0;

                        ptr[0] = lut[ ipos.array[0] ];
                        ptr[1] = lut[ ipos.array[1] ];
                        ptr[2] = lut[ ipos.array[2] ];
                        ptr[3] = lut[ ipos.array[3] ];

                        ptr+=4;
                    }

                    bpos = nsb;
                }

                // Or blend span with the gradient color

                else
                {
                    __m128i tqcvr = _mm_set1_epi16 (cast(ushort) cover);

                    uint* ptr = &dest[bpos*4];
                    uint* end = &dest[nsb*4];

                    while (ptr < end)
                    {
                        __m128i ipos = _mm_cvttps_epi32 (xmT0);
                        ipos = _mm_clamp_0_to_N_epi32(ipos, lutMax);
                        xmT0 = xmT0 + xmStep0;

                        __m128i d01 = _mm_loadu_si128(cast(__m128i*) ptr);
                        __m128i d0 = _mm_unpacklo_epi8 (d01, XMZERO);
                        __m128i d1 = _mm_unpackhi_epi8 (d01, XMZERO);

                        __m128i c0 = _mm_loadu_si32 (&lut[ ipos.array[0] ]);
                        __m128i tnc = _mm_loadu_si32 (&lut[ ipos.array[1] ]);
                        c0 = _mm_unpacklo_epi32 (c0, tnc);
                        c0 = _mm_unpacklo_epi8 (c0, XMZERO);
                        __m128i a0 = _mm_broadcast_alpha(c0);
                        a0 = _mm_mulhi_epu16(a0, tqcvr);

                        __m128i c1 = _mm_loadu_si32 (&lut[ ipos.array[2] ]);
                        tnc = _mm_loadu_si32 (&lut[ ipos.array[3] ]);
                        c1 = _mm_unpacklo_epi32 (c1, tnc);
                        c1 = _mm_unpacklo_epi8 (c1, XMZERO);
                        __m128i a1 = _mm_broadcast_alpha(c1);
                        a1 = _mm_mulhi_epu16(a1, tqcvr);

                        // alpha*source + dest - alpha*dest

                        c0 = _mm_mulhi_epu16 (c0,a0);
                        c1 = _mm_mulhi_epu16 (c1,a1);
                        c0 = _mm_adds_epi16 (c0,d0);
                        c1 = _mm_adds_epi16 (c1,d1);
                        d0 = _mm_mulhi_epu16 (d0,a0);
                        d1 = _mm_mulhi_epu16 (d1,a1);
                        c0 = _mm_subs_epi16 (c0, d0);
                        c1 = _mm_subs_epi16 (c1, d1);

                        d0 = _mm_packus_epi16 (c0,c1);

                        _mm_storeu_si128 (cast(__m128i*)ptr,d0);

                        ptr+=4;
                    }

                    bpos = nsb;
                }
            }

            // At this point we need to integrate scandelta

            uint* ptr = &dest[bpos*4];
            uint* end = &dest[endbit*4];
            int* dlptr = &delta[bpos*4];

            while (bpos < endbit)
            {
                // Integrate delta values

                __m128i tqw = _mm_loadu_si128(cast(__m128i*)dlptr);
                tqw = _mm_add_epi32(tqw, _mm_slli_si128!4(tqw));
                tqw = _mm_add_epi32(tqw, _mm_slli_si128!8(tqw));
                tqw = _mm_add_epi32(tqw, xmWinding);
                xmWinding = _mm_shuffle_epi32!255(tqw);
                _mm_storeu_si128(cast(__m128i*)dlptr,XMZERO);

                // Process coverage values taking account of winding rule

                static if (wr == WindingRule.NonZero)
                {
                    __m128i tcvr = _mm_srai_epi32(tqw,31);
                    tqw = _mm_add_epi32(tcvr,tqw);
                    tqw = _mm_xor_si128(tqw,tcvr);         // abs
                    tcvr = _mm_packs_epi32(tqw,XMZERO);    // saturate/pack to int16
                    tcvr = _mm_slli_epi16(tcvr, 1);        // << to uint16
                }
                else
                {
                    __m128i tcvr = _mm_and_si128(tqw,XMMSK16);
                    tqw = _mm_srai_epi16(tcvr,15);         // mask
                    tcvr = _mm_xor_si128(tcvr,tqw);        // fold in half
                    tcvr = _mm_packs_epi32(tcvr,XMZERO);   // pack to int16
                    tcvr = _mm_slli_epi16(tcvr, 1);        // << to uint16
                }
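
                // Added note: this is the vectorized counterpart of the scalar
                // coverage code at the start of the span handling, done for 4
                // pixels at once. The prefix sums turn per-pixel deltas into
                // running winding counts, and xmWinding carries the last count
                // into the next block of 4 pixels.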

                // convert grad pos to integer

                __m128i ipos = _mm_cvttps_epi32 (xmT0);
                ipos = _mm_clamp_0_to_N_epi32(ipos, lutMax);
                xmT0 = xmT0 + xmStep0;

                // Load destination pixels
                __m128i d01 = _mm_loadu_si128(cast(__m128i*) ptr);
                __m128i d0 = _mm_unpacklo_epi8 (d01, XMZERO);
                __m128i d1 = _mm_unpackhi_epi8 (d01, XMZERO);

                // load grad colors

                tcvr = _mm_unpacklo_epi16 (tcvr, tcvr);
                __m128i tcvr2 = _mm_unpackhi_epi32 (tcvr, tcvr);
                tcvr = _mm_unpacklo_epi32 (tcvr, tcvr);

                __m128i c0 = _mm_loadu_si32 (&lut[ ipos.array[0] ]);
                __m128i tnc = _mm_loadu_si32 (&lut[ ipos.array[1] ]);
                c0 = _mm_unpacklo_epi32 (c0, tnc);
                c0 = _mm_unpacklo_epi8 (c0, XMZERO);
                __m128i a0 = _mm_broadcast_alpha(c0);
                a0 = _mm_mulhi_epu16(a0, tcvr);

                __m128i c1 = _mm_loadu_si32 (&lut[ ipos.array[2] ]);
                tnc = _mm_loadu_si32 (&lut[ ipos.array[3] ]);
                c1 = _mm_unpacklo_epi32 (c1, tnc);
                c1 = _mm_unpacklo_epi8 (c1, XMZERO);
                __m128i a1 = _mm_broadcast_alpha(c1);
                a1 = _mm_mulhi_epu16(a1, tcvr2);

                // alpha*source + dest - alpha*dest

                c0 = _mm_mulhi_epu16 (c0,a0);
                c1 = _mm_mulhi_epu16 (c1,a1);
                c0 = _mm_adds_epi16 (c0,d0);
                c1 = _mm_adds_epi16 (c1,d1);
                d0 = _mm_mulhi_epu16 (d0,a0);
                d1 = _mm_mulhi_epu16 (d1,a1);
                c0 = _mm_subs_epi16 (c0, d0);
                c1 = _mm_subs_epi16 (c1, d1);

                d0 = _mm_packus_epi16 (c0,c1);

                _mm_storeu_si128 (cast(__m128i*)ptr,d0);

                bpos++;
                ptr+=4;
                dlptr+=4;

                // If the next block of delta values is all zero there are no new
                // edges ahead, so coverage is constant again; drop back to the span loop.
                if (((cast(ulong*)dlptr)[0] | (cast(ulong*)dlptr)[1]) == 0) break;
            }
        }
    }

    // Member variables

    ubyte* pixels;
    size_t strideBytes;
    int height;
    Gradient gradient;
    float xctr,yctr;
    float xstep,ystep;
}

nothrow:
@nogc:

void doBlit_LinearBlit(void* userData, int* delta, DMWord* mask, int x0, int x1, int y)
{
    LinearBlit* lb = cast(LinearBlit*)userData;
    return lb.linear_blit!(WindingRule.NonZero)(delta, mask, x0, x1, y);
}
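
/*
  Illustrative usage sketch (added note, not part of the library API docs); the
  buffer names below are placeholders. A canvas-internal caller that has already
  prepared the per-scanline edge deltas and the 4-pixel block mask would do
  something like:

      LinearBlit blit;
      blit.init(framePixels, frameStrideBytes, frameHeight,
                someGradient, 0, 0, 100, 0);   // gradient axis from x=0 to x=100

      // for each scanline y touched by the filled path:
      //     doBlit_LinearBlit(&blit, deltaBuffer, maskBuffer, spanX0, spanX1, y);
*/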