1 /**
2 * Implement the elliptic gradient fill style. dplug:canvas internals.
3 *
4 * Copyright: Copyright Chris Jones 2020.
5 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 */
7 module dplug.canvas.ellipticalblit;
8 
9 import dplug.canvas.rasterizer;
10 import dplug.canvas.gradient;
11 import dplug.canvas.misc;
12 
/**
 * Scanline blitter for the elliptical gradient fill.
 *
 * `init` binds a destination surface and a gradient, and turns the gradient
 * geometry into per-pixel steps along two perpendicular axes. `color_blit`
 * then processes one scanline: for every pixel it evaluates
 * `sqrt(t0*t0 + t1*t1)`, truncates that to an index into the gradient lookup
 * table, and alpha-blends the looked-up color into the destination, weighted
 * by the coverage accumulated from the `delta` buffer.
 */
struct EllipticalBlit
{
nothrow:
@nogc:

    /**
     * Binds the blitter to a surface and a gradient and precomputes the steps.
     *
     * Params:
     *     pixels      = first byte of the destination surface; rows are
     *                   accessed as 32-bit pixels.
     *     strideBytes = byte offset between two consecutive rows.
     *     height      = number of rows, must be > 0.
     *     g           = gradient supplying the lookup table; must not be null
     *                   and its `lutLength` must be a power of two.
     *     x0 = x coordinate of the point where both gradient coordinates are zero.
     *     y0 = y coordinate of that point.
     *     x1 = x coordinate of the end of the main axis.
     *     y1 = y coordinate of the end of the main axis. At (x1, y1) the main
     *          axis coordinate reaches `lutLength`, provided the squared axis
     *          length is at least 1 (smaller values are clamped to 1).
     *     r2 = distance along the perpendicular axis at which the second
     *          coordinate reaches `lutLength` (same clamping caveat). It is
     *          used as a divisor and is not checked against zero here.
     */
    void init(ubyte* pixels, size_t strideBytes, int height,
              Gradient g, float x0, float y0, float x1, float y1, float r2)
    {
        assert(height > 0);
        assert(g !is null);
        assert(isPow2(g.lutLength));

        this.pixels = pixels;
        this.strideBytes = strideBytes;
        this.height = height;
        this.gradient = g;
        int lutsize = g.lutLength;

        xctr = x0;
        yctr = y0;

        // Main axis vector and its squared length, clamped so that the
        // divisions below never see a value smaller than 1.
        float w = x1-x0;
        float h = y1-y0;
        float lenSq = w*w + h*h;
        if (lenSq < 1.0) lenSq = 1.0;

        // Step of the main-axis coordinate per pixel in x and per pixel in y.
        xstep0 = lutsize * w / lenSq;
        ystep0 = lutsize * h / lenSq;

        // Same for the perpendicular axis (h, -w), scaled by r2.
        float len = sqrt(lenSq);
        xstep1 = lutsize * h / (r2*len);
        ystep1 = lutsize * -w / (r2*len);
    }

private:

    /**
     * Blends the gradient into one scanline.
     *
     * The line is processed in blocks of 4 pixels. Blocks whose coverage does
     * not change are handled as spans (skipped when the coverage is zero,
     * otherwise filled with constant coverage); the remaining blocks have
     * their `delta` values integrated into per-pixel coverage.
     *
     * Template params:
     *     wr = winding rule used to turn the accumulated winding into coverage.
     *
     * Params:
     *     delta = winding deltas for this scanline, one `int` per pixel.
     *             Every block that gets integrated is reset to zero.
     *     mask  = bit mask handed to `nextSetBit` to find the next block that
     *             needs integrating.
     *     x0    = first pixel to process, multiple of 4, >= 0.
     *     x1    = one past the last pixel to process, multiple of 4.
     *     y     = scanline index, in [0, height).
     */
    void color_blit(WindingRule wr)(int* delta, DMWord* mask, int x0, int x1, int y)
    {
        assert(x0 >= 0);
        assert(x1*4 <= strideBytes);
        assert(y >= 0);
        assert(y < height);
        assert((x0 & 3) == 0);
        assert((x1 & 3) == 0);

        // main blit variables (bpos and endbit are counted in 4-pixel blocks)

        int bpos = x0 / 4;
        int endbit = x1 / 4;
        uint* dest = cast(uint*)(&pixels[y*strideBytes]);
        __m128i xmWinding = 0;
        uint* lut = gradient.getLookup.ptr;
        short lutMax = cast(short)(gradient.lutLength - 1);
        bool isopaque = false;//gradient.isOpaque

        // XMM constants

        immutable __m128i XMZERO = 0;
        immutable __m128i XMMSK16 = [0xFFFF,0xFFFF,0xFFFF,0xFFFF];

        // paint variables: gradient coordinates of the 4 pixels of the
        // current block along both axes, and their advance per block

        __m128 xmLane = _mm_setr_ps(0.0f,1.0f,2.0f,3.0f);

        float t0 = (bpos*4-xctr)*xstep0 + (y-yctr)*ystep0;
        __m128 xmT0 = _mm_mul_ps(_mm_set1_ps(xstep0), xmLane);
        xmT0 = _mm_add_ps(xmT0, _mm_set1_ps(t0));
        __m128 xmStep0 = _mm_set1_ps(xstep0*4);

        float t1 = (bpos*4-xctr)*xstep1 + (y-yctr)*ystep1;
        __m128 xmT1 = _mm_mul_ps(_mm_set1_ps(xstep1), xmLane);
        xmT1 = _mm_add_ps(xmT1, _mm_set1_ps(t1));
        __m128 xmStep1 = _mm_set1_ps(xstep1*4);

        // main loop

        while (bpos < endbit)
        {
            int nsb = nextSetBit(mask, bpos, endbit);

            // do we have a span of unchanging coverage?

            if (bpos < nsb)
            {
                // Calc coverage of first pixel: running winding plus the
                // delta of the first pixel of the span

                int cover = xmWinding[3]+delta[bpos*4];

                static if (wr == WindingRule.NonZero)
                {
                    cover = abs(cover)*2;
                    if (cover > 0xFFFF) cover = 0xFFFF;
                }
                else
                {
                    short tsc = cast(short) cover;
                    cover = (tsc ^ (tsc >> 15)) * 2;
                }

                // We can skip the span

                if (cover == 0)
                {
                    // only the gradient coordinates need to move forward
                    __m128 tsl = _mm_set1_ps(nsb-bpos);
                    xmT0 = _mm_add_ps(xmT0, _mm_mul_ps(tsl,xmStep0));
                    xmT1 = _mm_add_ps(xmT1, _mm_mul_ps(tsl,xmStep1));
                    bpos = nsb;
                }

                // Or fill span with solid color
                // (never taken at the moment, isopaque is hard-wired to false)

                else if (isopaque && (cover > 0xFF00))
                {
                    uint* ptr = &dest[bpos*4];
                    uint* end = ptr + ((nsb-bpos)*4);

                    while (ptr < end)
                    {
                        __m128 rad = _mm_add_ps(_mm_mul_ps(xmT0, xmT0),_mm_mul_ps(xmT1, xmT1));
                        rad = _mm_sqrt_ps(rad);
                        xmT0 = _mm_add_ps(xmT0, xmStep0);
                        xmT1 = _mm_add_ps(xmT1, xmStep1);
                        __m128i ipos = _mm_cvttps_epi32 (rad);
                        ipos = _mm_clamp_0_to_N_epi32(ipos, lutMax);

                        ptr[0] = lut[ ipos.array[0] ];
                        ptr[1] = lut[ ipos.array[1] ];
                        ptr[2] = lut[ ipos.array[2] ];
                        ptr[3] = lut[ ipos.array[3] ];

                        ptr+=4;
                    }

                    bpos = nsb;
                }

                // Or fill span with transparent color

                else
                {
                    // same coverage for every channel of every pixel
                    __m128i tqcvr = _mm_set1_epi16 (cast(ushort) cover);

                    uint* ptr = &dest[bpos*4];
                    uint* end = &dest[nsb*4];

                    while (ptr < end)
                    {
                        __m128 rad = _mm_add_ps(_mm_mul_ps(xmT0, xmT0),_mm_mul_ps(xmT1, xmT1));
                        xmT0 = _mm_add_ps(xmT0, xmStep0);
                        xmT1 = _mm_add_ps(xmT1, xmStep1);
                        rad = _mm_sqrt_ps(rad);

                        // Load destination pixels, widened to 16 bits per channel

                        __m128i d0 = _mm_loadu_si64 (ptr);
                        d0 = _mm_unpacklo_epi8 (d0, XMZERO);
                        __m128i d1 = _mm_loadu_si64 (ptr+2);
                        d1 = _mm_unpacklo_epi8 (d1, XMZERO);

                        // convert grad pos to integer

                        __m128i ipos = _mm_cvttps_epi32 (rad);
                        ipos = _mm_clamp_0_to_N_epi32(ipos, lutMax);

                        // load grad colors, two pixels per register

                        __m128i c0 = _mm_loadu_si32 (&lut[ ipos.array[0] ]);
                        __m128i tnc = _mm_loadu_si32 (&lut[ ipos.array[1] ]);
                        c0 = _mm_unpacklo_epi32 (c0, tnc);
                        c0 = _mm_unpacklo_epi8 (c0, XMZERO);
                        __m128i a0 = _mm_broadcast_alpha(c0);
                        a0 = _mm_mulhi_epu16(a0, tqcvr);

                        __m128i c1 = _mm_loadu_si32 (&lut[ ipos.array[2] ]);
                        tnc = _mm_loadu_si32 (&lut[ ipos.array[3] ]);
                        c1 = _mm_unpacklo_epi32 (c1, tnc);
                        c1 = _mm_unpacklo_epi8 (c1, XMZERO);
                        __m128i a1 = _mm_broadcast_alpha(c1);
                        a1 = _mm_mulhi_epu16(a1, tqcvr);

                        // alpha*source + dest - alpha*dest

                        c0 = _mm_mulhi_epu16 (c0,a0);
                        c1 = _mm_mulhi_epu16 (c1,a1);
                        c0 = _mm_adds_epi16 (c0,d0);
                        c1 = _mm_adds_epi16 (c1,d1);
                        d0 = _mm_mulhi_epu16 (d0,a0);
                        d1 = _mm_mulhi_epu16 (d1,a1);
                        c0 =  _mm_subs_epi16 (c0, d0);
                        c1 =  _mm_subs_epi16 (c1, d1);

                        d0 = _mm_packus_epi16 (c0,c1);

                        _mm_storeu_si128 (cast(__m128i*)ptr,d0);

                        ptr+=4;
                    }

                    bpos = nsb;
                }
            }

            // At this point we need to integrate scandelta

            uint* ptr = &dest[bpos*4];
            int* dlptr = &delta[bpos*4];

            while (bpos < endbit)
            {
                __m128 rad = _mm_add_ps(_mm_mul_ps(xmT0, xmT0),_mm_mul_ps(xmT1, xmT1));
                rad = _mm_sqrt_ps(rad);

                // Integrate delta values: prefix sum over the 4 deltas plus
                // the winding carried in from the previous block. The last
                // lane becomes the new carry and the consumed deltas are
                // reset to zero.

                __m128i tqw = _mm_loadu_si128(cast(__m128i*)dlptr);
                tqw = _mm_add_epi32(tqw, _mm_slli_si128!4(tqw));
                tqw = _mm_add_epi32(tqw, _mm_slli_si128!8(tqw));
                tqw = _mm_add_epi32(tqw, xmWinding);
                xmWinding = _mm_shuffle_epi32!255(tqw);
                _mm_storeu_si128(cast(__m128i*)dlptr,XMZERO);

                // convert grad pos to integer

                __m128i ipos = _mm_cvttps_epi32(rad);
                ipos = _mm_clamp_0_to_N_epi32(ipos, lutMax);
                xmT0 = _mm_add_ps(xmT0, xmStep0);
                xmT1 = _mm_add_ps(xmT1, xmStep1);

                // Process coverage values taking account of winding rule

                static if (wr == WindingRule.NonZero)
                {
                    __m128i tcvr = _mm_srai_epi32(tqw,31);
                    tqw = _mm_add_epi32(tcvr,tqw);
                    tqw = _mm_xor_si128(tqw,tcvr);        // abs
                    tcvr = _mm_packs_epi32(tqw,XMZERO);   // saturate/pack to int16
                    tcvr = _mm_slli_epi16(tcvr, 1);       // << to uint16
                }
                else
                {
                    __m128i tcvr = _mm_and_si128(tqw,XMMSK16);
                    tqw = _mm_srai_epi16(tcvr,15);       // mask
                    tcvr = _mm_xor_si128(tcvr,tqw);      // fold in half
                    tcvr = _mm_packs_epi32(tcvr,XMZERO); // pack to int16
                    tcvr = _mm_slli_epi16(tcvr, 1);      // << to uint16
                }

                // Load destination pixels

                __m128i d0 = _mm_loadu_si64 (ptr);
                d0 = _mm_unpacklo_epi8 (d0, XMZERO);
                __m128i d1 = _mm_loadu_si64 (ptr+2);
                d1 = _mm_unpacklo_epi8 (d1, XMZERO);

                // Replicate each pixel's coverage over its 4 channels:
                // tcvr serves pixels 0 and 1, tcvr2 serves pixels 2 and 3

                tcvr = _mm_unpacklo_epi16 (tcvr, tcvr);
                __m128i tcvr2 = _mm_unpackhi_epi32 (tcvr, tcvr);
                tcvr = _mm_unpacklo_epi32 (tcvr, tcvr);

                // load grad colors

                __m128i c0 = _mm_loadu_si32 (&lut[ ipos.array[0] ]);
                __m128i tnc = _mm_loadu_si32 (&lut[ ipos.array[1] ]);
                c0 = _mm_unpacklo_epi32 (c0, tnc);
                c0 = _mm_unpacklo_epi8 (c0, XMZERO);
                __m128i a0 = _mm_broadcast_alpha(c0);
                a0 = _mm_mulhi_epu16(a0, tcvr);

                __m128i c1 = _mm_loadu_si32 (&lut[ ipos.array[2] ]);
                tnc = _mm_loadu_si32 (&lut[ ipos.array[3] ]);
                c1 = _mm_unpacklo_epi32 (c1, tnc);
                c1 = _mm_unpacklo_epi8 (c1, XMZERO);
                __m128i a1 = _mm_broadcast_alpha(c1);
                a1 = _mm_mulhi_epu16(a1, tcvr2);

                // alpha*source + dest - alpha*dest

                c0 = _mm_mulhi_epu16 (c0,a0);
                c1 = _mm_mulhi_epu16 (c1,a1);
                c0 = _mm_adds_epi16 (c0,d0);
                c1 = _mm_adds_epi16 (c1,d1);
                d0 = _mm_mulhi_epu16 (d0,a0);
                d1 = _mm_mulhi_epu16 (d1,a1);
                c0 =  _mm_subs_epi16 (c0, d0);
                c1 =  _mm_subs_epi16 (c1, d1);

                d0 = _mm_packus_epi16 (c0,c1);

                _mm_storeu_si128 (cast(__m128i*)ptr,d0);

                bpos++;
                ptr+=4;
                dlptr+=4;

                // The row is finished: do not peek at deltas beyond x1, this
                // function cannot tell how far the caller's buffer extends.

                if (bpos >= endbit) break;

                // The next block has no deltas, so coverage is constant from
                // here on: go back to the span handling of the outer loop.

                if (((cast(ulong*)dlptr)[0] | (cast(ulong*)dlptr)[1]) == 0)  break;
            }
        }
    }

    // Member variables

    ubyte*      pixels;          // first byte of the destination surface
    size_t      strideBytes;     // byte offset between consecutive rows
    int         height;          // number of rows, only used by assertions here
    Gradient    gradient;        // source of the color lookup table
    float       xctr,yctr;       // point where both gradient coordinates are zero
    float       xstep0,ystep0;   // main axis coordinate change per pixel in x / y
    float       xstep1,ystep1;   // perpendicular axis coordinate change per pixel in x / y
}
312 
/// Scanline callback for the elliptical gradient: `userData` is reinterpreted
/// as an `EllipticalBlit*` and the scanline described by the remaining
/// arguments is forwarded to its blitter, using the non-zero winding rule.
void doBlit_EllipticalBlit(void* userData, int* delta, DMWord* mask, int x0, int x1, int y) nothrow @nogc
{
    auto blitter = cast(EllipticalBlit*) userData;
    blitter.color_blit!(WindingRule.NonZero)(delta, mask, x0, x1, y);
}
318