/**
* Implement the linear gradient fill style. dplug:canvas internals.
*
* Copyright: Copyright Chris Jones 2020.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
7 module dplug.canvas.linearblit;
8 
9 import dplug.core.math;
10 
11 import dplug.canvas.rasterizer;
12 import dplug.canvas.gradient;
13 import dplug.canvas.misc;
14 
15 /*
16   linear gradient blit
17 */
18 
struct LinearBlit
{
nothrow:
@nogc:

    /// Prepare the blitter for a linear gradient running from (x0,y0) to (x1,y1).
    ///
    /// Params:
    ///     g  = gradient supplying the colour look-up table (LUT); its length
    ///          must be a power of two (asserted).
    ///     x0 = x of the gradient start point (maps to the start of the LUT).
    ///     y0 = y of the gradient start point.
    ///     x1 = x of the gradient end point (maps to the end of the LUT).
    ///     y1 = y of the gradient end point.
    void init(Gradient g, float x0, float y0, float x1, float y1)
    {
        assert(g !is null);
        assert(isPow2(g.lutLength));

        this.gradient = g;
        int lutsize = g.lutLength;

        xctr = x0;
        yctr = y0;
        float w = x1-x0;
        float h = y1-y0;
        // (xstep,ystep) = lutsize * (w,h) / |(w,h)|^2 : stepping one pixel in x
        // (resp. y) advances the LUT position by that pixel's projection onto
        // the gradient axis, scaled into LUT indices.
        float hsq = w*w + h*h;
        if (hsq < 0.1) hsq = 0.1; // avoid div by zero
        xstep = lutsize * w / hsq; 
        ystep = lutsize * h / hsq;
    }

private:

    /// Blit one scanline span [x0,x1) of row y, blending the gradient into
    /// `dest` according to the rasterizer's coverage information.
    ///
    /// wr    = winding rule (NonZero or EvenOdd), chosen at compile time.
    /// dest  = destination row of 32-bit pixels.
    /// delta = per-pixel winding deltas for this row; integrated and zeroed here.
    /// mask  = bitmask over 4-pixel groups; a set bit appears to mark a group
    ///         containing non-zero deltas (see nextSetBit usage below) --
    ///         TODO confirm against the rasterizer's DMWord contract.
    /// x0,x1 = pixel span, both multiples of 4 (asserted).
    /// y     = row coordinate, used to evaluate the gradient position.
    void linear_blit(WindingRule wr)(uint* dest, int* delta, DMWord* mask, int x0, int x1, int y)
    {
        assert(x0 >= 0);
        assert(y >= 0);
        assert((x0 & 3) == 0);
        assert((x1 & 3) == 0);

        // main blit variables

        int bpos = x0 / 4;      // current position, in 4-pixel groups
        int endbit = x1 / 4;    // one past the last group
        __m128i xmWinding = 0;  // running winding count; lane 3 holds the total so far
        uint* lut = gradient.getLookup.ptr;
        assert(gradient.lutLength <= short.max); // LUT can be non-power-of-2 as far as LinearBlit is concerned, but this held low interest
        short lutMax = cast(short)(gradient.lutLength - 1);

        // NOTE(review): the opaque fast path is disabled; presumably pending a
        // Gradient.isOpaque query -- confirm before enabling.
        bool isopaque = false;//gradient.isOpaque

        // XMM constants

        immutable __m128i XMZERO = 0;
        immutable __m128i XMFFFF = [0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF];
        immutable __m128i XMMSK16 = [0xFFFF,0xFFFF,0xFFFF,0xFFFF];

        // paint variables

        // t0 = fractional LUT position of the first pixel in the span.
        // xmT0 holds positions for 4 consecutive pixels; xmStep0 advances it
        // by 4 pixels worth of xstep per iteration.
        float t0 = (bpos*4-xctr)*xstep + (y-yctr)*ystep;
        __m128 xmT0 = _mm_mul_ps(_mm_set1_ps(xstep), _mm_setr_ps(0.0f,1.0f,2.0f,3.0f));
        xmT0 = _mm_add_ps(xmT0, _mm_set1_ps(t0));
        __m128 xmStep0 = _mm_set1_ps(xstep*4);

        // main loop

        while (bpos < endbit)
        {
            int nsb = nextSetBit(mask, bpos, endbit);

            // do we have a span of unchanging coverage?

            if (bpos < nsb)
            {
                // Calc coverage of first pixel

                static if (wr == WindingRule.NonZero)
                {
                    // non-zero rule: |winding|, scaled to 16-bit and clamped
                    int cover = xmWinding[3]+delta[bpos*4];
                    cover = abs(cover)*2;
                    if (cover > 0xFFFF) cover = 0xFFFF;
                }
                else
                {
                    // even-odd rule: wrap winding into 16 bits and mirror the
                    // negative half (tsc ^ (tsc >> 15) folds sign), then scale
                    int cover = xmWinding[3]+delta[bpos*4];
                    short tsc = cast(short) cover;
                    cover = (tsc ^ (tsc >> 15)) * 2;
                }

                // We can skip the span

                if (cover == 0)
                {
                    // still need to advance the gradient position across the span
                    __m128 tsl = _mm_set1_ps(nsb-bpos);
                    xmT0 = _mm_add_ps(xmT0, _mm_mul_ps(tsl,xmStep0));
                    bpos = nsb;
                }

                // Or fill span with solid color

                else if (isopaque && (cover > 0xFF00))
                {
                    uint* ptr = &dest[bpos*4];
                    uint* end = ptr + ((nsb-bpos)*4);

                    while (ptr < end)
                    {
                        // convert gradient positions to LUT indices and copy
                        // the colours straight to the destination
                        __m128i ipos = _mm_cvttps_epi32 (xmT0);
                        ipos = _mm_clamp_0_to_N_epi32(ipos, lutMax);
                        xmT0 = xmT0 + xmStep0;

                        ptr[0] = lut[ ipos.array[0] ];
                        ptr[1] = lut[ ipos.array[1] ];
                        ptr[2] = lut[ ipos.array[2] ];
                        ptr[3] = lut[ ipos.array[3] ];

                        ptr+=4;                        
                    }

                    bpos = nsb;
                }

                // Or fill span with transparent color

                else
                {
                    // constant coverage for the whole span, replicated to 8 lanes
                    __m128i tqcvr = _mm_set1_epi16 (cast(ushort) cover);

                    uint* ptr = &dest[bpos*4];
                    uint* end = &dest[nsb*4];

                    while (ptr < end)
                    {
                        // gradient positions -> clamped LUT indices
                        __m128i ipos = _mm_cvttps_epi32 (xmT0);
                        ipos = _mm_clamp_0_to_N_epi32(ipos, lutMax);
                        xmT0 = xmT0 + xmStep0;

                        // load 4 destination pixels, widen to 16-bit lanes
                        __m128i d01 = _mm_loadu_si128(cast(__m128i*) ptr);
                        __m128i d0 = _mm_unpacklo_epi8 (d01, XMZERO);
                        __m128i d1 = _mm_unpackhi_epi8 (d01, XMZERO);

                        // gather 2 gradient colours, widen, alpha *= coverage
                        __m128i c0 = _mm_loadu_si32 (&lut[ ipos.array[0] ]);
                        __m128i tnc = _mm_loadu_si32 (&lut[ ipos.array[1] ]);
                        c0 = _mm_unpacklo_epi32 (c0, tnc);
                        c0 = _mm_unpacklo_epi8 (c0, XMZERO);
                        __m128i a0 = _mm_broadcast_alpha(c0);
                        a0 = _mm_mulhi_epu16(a0, tqcvr);

                        // same for the next 2 pixels
                        __m128i c1 = _mm_loadu_si32 (&lut[ ipos.array[2] ]);
                        tnc = _mm_loadu_si32 (&lut[ ipos.array[3] ]);
                        c1 = _mm_unpacklo_epi32 (c1, tnc);
                        c1 = _mm_unpacklo_epi8 (c1, XMZERO);
                        __m128i a1 = _mm_broadcast_alpha(c1);
                        a1 = _mm_mulhi_epu16(a1, tqcvr);

                       // alpha*source + dest - alpha*dest

                        c0 = _mm_mulhi_epu16 (c0,a0);
                        c1 = _mm_mulhi_epu16 (c1,a1);
                        c0 = _mm_adds_epi16 (c0,d0);
                        c1 = _mm_adds_epi16 (c1,d1);
                        d0 = _mm_mulhi_epu16 (d0,a0);
                        d1 = _mm_mulhi_epu16 (d1,a1);
                        c0 =  _mm_subs_epi16 (c0, d0);
                        c1 =  _mm_subs_epi16 (c1, d1);

                        // pack back to 8-bit channels and store
                        d0 = _mm_packus_epi16 (c0,c1);

                        _mm_storeu_si128 (cast(__m128i*)ptr,d0);
                        
                        ptr+=4;
                    }

                    bpos = nsb;
                }
            }

            // At this point we need to integrate scandelta

            uint* ptr = &dest[bpos*4];
            uint* end = &dest[endbit*4];
            int* dlptr = &delta[bpos*4];

            while (bpos < endbit)
            {
                // Integrate delta values

                // prefix-sum the 4 deltas, add the running winding total,
                // keep the new total for the next group, and clear the deltas
                __m128i tqw = _mm_loadu_si128(cast(__m128i*)dlptr);
                tqw = _mm_add_epi32(tqw, _mm_slli_si128!4(tqw)); 
                tqw = _mm_add_epi32(tqw, _mm_slli_si128!8(tqw)); 
                tqw = _mm_add_epi32(tqw, xmWinding); 
                xmWinding = _mm_shuffle_epi32!255(tqw);  
                _mm_storeu_si128(cast(__m128i*)dlptr,XMZERO);

                // Process coverage values taking account of winding rule
                
                static if (wr == WindingRule.NonZero)
                {
                    __m128i tcvr = _mm_srai_epi32(tqw,31); 
                    tqw = _mm_add_epi32(tcvr,tqw);
                    tqw = _mm_xor_si128(tqw,tcvr);         // abs
                    tcvr = _mm_packs_epi32(tqw,XMZERO);    // saturate/pack to int16
                    tcvr = _mm_slli_epi16(tcvr, 1);        // << to uint16
                }
                else
                {
                    __m128i tcvr = _mm_and_si128(tqw,XMMSK16); 
                    tqw = _mm_srai_epi16(tcvr,15);         // mask
                    tcvr = _mm_xor_si128(tcvr,tqw);        // fold in half
                    tcvr = _mm_packs_epi32(tcvr,XMZERO);   // pack to int16
                    tcvr = _mm_slli_epi16(tcvr, 1);        // << to uint16
                }

                // convert grad pos to integer

                __m128i ipos = _mm_cvttps_epi32 (xmT0);
                ipos = _mm_clamp_0_to_N_epi32(ipos, lutMax);
                xmT0 = xmT0 + xmStep0;

                // Load destination pixels
                __m128i d01 = _mm_loadu_si128(cast(__m128i*) ptr);
                __m128i d0 = _mm_unpacklo_epi8 (d01, XMZERO);
                __m128i d1 = _mm_unpackhi_epi8 (d01, XMZERO);

                // load grad colors

                // spread the 4 per-pixel coverage words so each pixel's
                // coverage lines up with its 4 colour channels
                tcvr = _mm_unpacklo_epi16 (tcvr, tcvr);
                __m128i tcvr2 = _mm_unpackhi_epi32 (tcvr, tcvr);
                tcvr = _mm_unpacklo_epi32 (tcvr, tcvr);

                __m128i c0 = _mm_loadu_si32 (&lut[ ipos.array[0] ]);
                __m128i tnc = _mm_loadu_si32 (&lut[ ipos.array[1] ]);
                c0 = _mm_unpacklo_epi32 (c0, tnc);
                c0 = _mm_unpacklo_epi8 (c0, XMZERO);
                __m128i a0 = _mm_broadcast_alpha(c0);
                a0 = _mm_mulhi_epu16(a0, tcvr);


                __m128i c1 = _mm_loadu_si32 (&lut[ ipos.array[2] ]);
                tnc = _mm_loadu_si32 (&lut[ ipos.array[3] ]);
                c1 = _mm_unpacklo_epi32 (c1, tnc);
                c1 = _mm_unpacklo_epi8 (c1, XMZERO);
                __m128i a1 = _mm_broadcast_alpha(c1);
                a1 = _mm_mulhi_epu16(a1, tcvr2);

                // alpha*source + dest - alpha*dest

                c0 = _mm_mulhi_epu16 (c0,a0);
                c1 = _mm_mulhi_epu16 (c1,a1);
                c0 = _mm_adds_epi16 (c0,d0);
                c1 = _mm_adds_epi16 (c1,d1);
                d0 = _mm_mulhi_epu16 (d0,a0);
                d1 = _mm_mulhi_epu16 (d1,a1);
                c0 =  _mm_subs_epi16 (c0, d0);
                c1 =  _mm_subs_epi16 (c1, d1);

                d0 = _mm_packus_epi16 (c0,c1);

                _mm_storeu_si128 (cast(__m128i*)ptr,d0);
                
                bpos++;
                ptr+=4;
                dlptr+=4;

                // next 4 deltas all zero => constant coverage from here on;
                // drop back to the span fast path in the outer loop
                if (((cast(ulong*)dlptr)[0] | (cast(ulong*)dlptr)[1]) == 0)  break;
            }
        }
    }

    // Member variables

    Gradient gradient;   // supplies the colour LUT
    float xctr,yctr;     // gradient start point
    float xstep,ystep;   // LUT-index step per pixel in x and y
}
286 
nothrow:
@nogc:

/// Type-erased blit entry point for the non-zero winding rule.
/// `userData` must point at a LinearBlit previously configured with init();
/// the remaining arguments are forwarded to linear_blit unchanged.
void doBlit_LinearBlit_NonZero(void* userData, uint* dest, int* delta, DMWord* mask, int x0, int x1, int y)
{
    auto blitter = cast(LinearBlit*) userData;
    blitter.linear_blit!(WindingRule.NonZero)(dest, delta, mask, x0, x1, y);
}
295 
/// Type-erased blit entry point for the even-odd winding rule.
/// `userData` must point at a LinearBlit previously configured with init();
/// the remaining arguments are forwarded to linear_blit unchanged.
void doBlit_LinearBlit_EvenOdd(void* userData, uint* dest, int* delta, DMWord* mask, int x0, int x1, int y)
{
    auto blitter = cast(LinearBlit*) userData;
    blitter.linear_blit!(WindingRule.EvenOdd)(dest, delta, mask, x0, x1, y);
}