1 /**
2 * Implement the elliptic gradient fill style. dplug:canvas internals.
3 *
4 * Copyright: Copyright Chris Jones 2020.
5 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 */
7 module dplug.canvas.ellipticalblit;
8 
9 import dplug.canvas.rasterizer;
10 import dplug.canvas.gradient;
11 import dplug.canvas.misc;
12 
/**
* Blitter that fills rasterized coverage with an elliptical gradient.
*
* `init` stores the gradient and the per-pixel steps of two gradient
* coordinates (t0, t1); `color_blit` evaluates sqrt(t0*t0 + t1*t1) for every
* pixel, uses the truncated result as an index into the gradient lookup
* table and blends the looked-up color into the destination.
*/
struct EllipticalBlit
{
nothrow:
@nogc:

    /**
    * Prepare the blitter for a gradient and an ellipse.
    *
    * Params:
    *   g  = gradient to sample; must be non-null and its `lutLength` must be
    *        a power of two (both are asserted).
    *   x0 = X of the ellipse center.
    *   y0 = Y of the ellipse center.
    *   x1 = X of the point that ends the first axis; (x1-x0, y1-y0) is the
    *        first axis vector.
    *   y1 = Y of the point that ends the first axis.
    *   r2 = extent of the second axis, which is perpendicular to the first
    *        one. It is used as a divisor and is not checked, so the caller
    *        must pass a non-zero value.
    *
    * After this call, t0 grows by `lutLength` when moving from the center to
    * (x1, y1), and t1 grows by `lutLength` when moving `r2` units along the
    * perpendicular direction (as long as the squared axis length was not
    * clamped, see below).
    */
    void init(Gradient g, float x0, float y0, float x1, float y1, float r2)
    {
        assert(g !is null);
        assert(isPow2(g.lutLength));
        this.gradient = g;
        int lutsize = g.lutLength;

        xctr = x0;
        yctr = y0;
        float w = x1-x0;
        float h = y1-y0;

        // Squared length of the first axis, clamped so that a degenerate
        // (near zero length) axis cannot blow up the divisions below.
        float hyp = w*w + h*h;
        if (hyp < 1.0) hyp = 1.0;

        // First axis: projection onto (w,h), scaled to reach lutsize at (x1,y1).
        xstep0 = lutsize * w / hyp;
        ystep0 = lutsize * h / hyp;

        // Second axis: projection onto the perpendicular (h,-w), normalized
        // by the axis length and scaled to reach lutsize at distance r2.
        hyp = sqrt(hyp);
        xstep1 = lutsize * h / (r2*hyp);
        ystep1 = lutsize * -w / (r2*hyp);
    }

private:

    /**
    * Blend one scanline of the gradient into `dest`, four pixels at a time.
    *
    * Template params:
    *   wr = winding rule used to turn the integrated winding value into a
    *        16-bit coverage value.
    *
    * Params:
    *   dest  = destination pixels of the scanline, one `uint` per pixel,
    *           indexed by X.
    *   delta = per-pixel winding deltas of the scanline, indexed by X. The
    *           entries consumed by the integration loop are reset to zero.
    *   mask  = bit mask passed to `nextSetBit` together with block indices to
    *           find the next block of four pixels that needs integration
    *           (helper defined elsewhere; presumably one bit per block).
    *   x0    = first pixel to process; must be >= 0 and a multiple of 4.
    *   x1    = end pixel (exclusive); must be a multiple of 4.
    *   y     = scanline Y; must be >= 0.
    */
    void color_blit(WindingRule wr)(uint* dest, int* delta, DMWord* mask, int x0, int x1, int y)
    {
        assert(x0 >= 0);
        assert(y >= 0);
        assert((x0 & 3) == 0);
        assert((x1 & 3) == 0);

        // main blit variables, bpos and endbit count blocks of four pixels

        int bpos = x0 / 4;
        int endbit = x1 / 4;

        __m128i xmWinding = 0; // running winding value, broadcast to all lanes
        uint* lut = gradient.getLookup.ptr;
        short lutMax = cast(short)(gradient.lutLength - 1);
        bool isopaque = false;//gradient.isOpaque

        // XMM constants

        immutable __m128i XMZERO = 0;
        immutable __m128i XMMSK16 = [0xFFFF,0xFFFF,0xFFFF,0xFFFF];

        // paint variables: gradient coordinates of the four pixels of the
        // current block, and their increment from one block to the next

        float t0 = (bpos*4-xctr)*xstep0 + (y-yctr)*ystep0;
        __m128 xmT0 = _mm_mul_ps(_mm_set1_ps(xstep0), _mm_setr_ps(0.0f,1.0f,2.0f,3.0f));
        xmT0 = _mm_add_ps(xmT0, _mm_set1_ps(t0));
        __m128 xmStep0 = _mm_set1_ps(xstep0*4);

        float t1 = (bpos*4-xctr)*xstep1 + (y-yctr)*ystep1;
        __m128 xmT1 = _mm_mul_ps(_mm_set1_ps(xstep1), _mm_setr_ps(0.0f,1.0f,2.0f,3.0f));
        xmT1 = _mm_add_ps(xmT1, _mm_set1_ps(t1));
        __m128 xmStep1 = _mm_set1_ps(xstep1*4);

        // main loop

        while (bpos < endbit)
        {
            int nsb = nextSetBit(mask, bpos, endbit);

            // do we have a span of unchanging coverage?

            if (bpos < nsb)
            {
                // Calc coverage of first pixel, it applies to the whole span

                static if (wr == WindingRule.NonZero)
                {
                    int cover = xmWinding[3]+delta[bpos*4];
                    cover = abs(cover)*2;
                    if (cover > 0xFFFF) cover = 0xFFFF;
                }
                else
                {
                    int cover = xmWinding[3]+delta[bpos*4];
                    short tsc = cast(short) cover;
                    cover = (tsc ^ (tsc >> 15)) * 2;
                }

                // We can skip the span, only the gradient coordinates advance

                if (cover == 0)
                {
                    __m128 tsl = _mm_set1_ps(nsb-bpos);
                    xmT0 = _mm_add_ps(xmT0, _mm_mul_ps(tsl,xmStep0));
                    xmT1 = _mm_add_ps(xmT1, _mm_mul_ps(tsl,xmStep1));
                    bpos = nsb;
                }

                // Or fill span with solid color.
                // Currently never taken: isopaque is hard-coded to false above.

                else if (isopaque && (cover > 0xFF00))
                {
                    uint* ptr = &dest[bpos*4];
                    uint* end = ptr + ((nsb-bpos)*4);

                    while (ptr < end)
                    {
                        __m128 rad = _mm_add_ps(_mm_mul_ps(xmT0, xmT0),_mm_mul_ps(xmT1, xmT1));
                        rad = _mm_sqrt_ps(rad);
                        xmT0 = xmT0 + xmStep0;
                        xmT1 = xmT1 + xmStep1;
                        __m128i ipos = _mm_cvttps_epi32 (rad);
                        ipos = _mm_clamp_0_to_N_epi32(ipos, lutMax);

                        ptr[0] = lut[ ipos.array[0] ];
                        ptr[1] = lut[ ipos.array[1] ];
                        ptr[2] = lut[ ipos.array[2] ];
                        ptr[3] = lut[ ipos.array[3] ];

                        ptr+=4;
                    }

                    bpos = nsb;
                }

                // Or fill span with transparent color

                else
                {
                    // the same coverage for every 16-bit channel of every pixel
                    __m128i tqcvr = _mm_set1_epi16 (cast(ushort) cover);

                    uint* ptr = &dest[bpos*4];
                    uint* end = &dest[nsb*4];

                    while (ptr < end)
                    {
                        __m128 rad = _mm_add_ps(_mm_mul_ps(xmT0, xmT0),_mm_mul_ps(xmT1, xmT1));
                        xmT0 = xmT0 + xmStep0;
                        xmT1 = xmT1 + xmStep1;
                        rad = _mm_sqrt_ps(rad);

                        // Load destination pixels, widened to 16 bits per channel

                        __m128i d0 = _mm_loadu_si64 (ptr);
                        d0 = _mm_unpacklo_epi8 (d0, XMZERO);
                        __m128i d1 = _mm_loadu_si64 (ptr+2);
                        d1 = _mm_unpacklo_epi8 (d1, XMZERO);

                        // convert grad pos to integer LUT index

                        __m128i ipos = _mm_cvttps_epi32 (rad);
                        ipos = _mm_clamp_0_to_N_epi32(ipos, lutMax);

                        // load grad colors for pixels 0,1 and scale alpha by coverage

                        __m128i c0 = _mm_loadu_si32 (&lut[ ipos.array[0] ]);
                        __m128i tnc = _mm_loadu_si32 (&lut[ ipos.array[1] ]);
                        c0 = _mm_unpacklo_epi32 (c0, tnc);
                        c0 = _mm_unpacklo_epi8 (c0, XMZERO);
                        __m128i a0 = _mm_broadcast_alpha(c0);
                        a0 = _mm_mulhi_epu16(a0, tqcvr);

                        // same for pixels 2,3

                        __m128i c1 = _mm_loadu_si32 (&lut[ ipos.array[2] ]);
                        tnc = _mm_loadu_si32 (&lut[ ipos.array[3] ]);
                        c1 = _mm_unpacklo_epi32 (c1, tnc);
                        c1 = _mm_unpacklo_epi8 (c1, XMZERO);
                        __m128i a1 = _mm_broadcast_alpha(c1);
                        a1 = _mm_mulhi_epu16(a1, tqcvr);

                        // alpha*source + dest - alpha*dest

                        c0 = _mm_mulhi_epu16 (c0,a0);
                        c1 = _mm_mulhi_epu16 (c1,a1);
                        c0 = _mm_adds_epi16 (c0,d0);
                        c1 = _mm_adds_epi16 (c1,d1);
                        d0 = _mm_mulhi_epu16 (d0,a0);
                        d1 = _mm_mulhi_epu16 (d1,a1);
                        c0 =  _mm_subs_epi16 (c0, d0);
                        c1 =  _mm_subs_epi16 (c1, d1);

                        d0 = _mm_packus_epi16 (c0,c1);

                        _mm_storeu_si128 (cast(__m128i*)ptr,d0);

                        ptr+=4;
                    }

                    bpos = nsb;
                }
            }

            // At this point we need to integrate scandelta

            uint* ptr = &dest[bpos*4];
            int* dlptr = &delta[bpos*4];

            while (bpos < endbit)
            {
                __m128 rad = _mm_add_ps(_mm_mul_ps(xmT0, xmT0),_mm_mul_ps(xmT1, xmT1));
                rad = _mm_sqrt_ps(rad);

                // Integrate delta values: prefix sum of the four deltas plus
                // the winding carried over from the previous block. The last
                // lane becomes the new carried winding, and the consumed
                // deltas are cleared.

                __m128i tqw = _mm_loadu_si128(cast(__m128i*)dlptr);
                tqw = _mm_add_epi32(tqw, _mm_slli_si128!4(tqw));
                tqw = _mm_add_epi32(tqw, _mm_slli_si128!8(tqw));
                tqw = _mm_add_epi32(tqw, xmWinding);
                xmWinding = _mm_shuffle_epi32!255(tqw);
                _mm_storeu_si128(cast(__m128i*)dlptr,XMZERO);

                // convert grad pos to integer

                __m128i ipos = _mm_cvttps_epi32(rad);
                ipos = _mm_clamp_0_to_N_epi32(ipos, lutMax);
                xmT0 = xmT0 + xmStep0;
                xmT1 = xmT1 + xmStep1;

                // Process coverage values taking account of winding rule

                static if (wr == WindingRule.NonZero)
                {
                    __m128i tcvr = _mm_srai_epi32(tqw,31);
                    tqw = _mm_add_epi32(tcvr,tqw);
                    tqw = _mm_xor_si128(tqw,tcvr);        // abs
                    tcvr = _mm_packs_epi32(tqw,XMZERO);   // saturate/pack to int16
                    tcvr = _mm_slli_epi16(tcvr, 1);       // << to uint16
                }
                else
                {
                    __m128i tcvr = _mm_and_si128(tqw,XMMSK16);
                    tqw = _mm_srai_epi16(tcvr,15);       // mask
                    tcvr = _mm_xor_si128(tcvr,tqw);      // fold in half
                    tcvr = _mm_packs_epi32(tcvr,XMZERO); // pack to int16
                    tcvr = _mm_slli_epi16(tcvr, 1);      // << to uint16
                }

                // Load destination pixels

                __m128i d0 = _mm_loadu_si64 (ptr);
                d0 = _mm_unpacklo_epi8 (d0, XMZERO);
                __m128i d1 = _mm_loadu_si64 (ptr+2);
                d1 = _mm_unpacklo_epi8 (d1, XMZERO);

                // Replicate each pixel's coverage into its four 16-bit
                // channels: tcvr holds pixels 0,1 and tcvr2 holds pixels 2,3

                tcvr = _mm_unpacklo_epi16 (tcvr, tcvr);
                __m128i tcvr2 = _mm_unpackhi_epi32 (tcvr, tcvr);
                tcvr = _mm_unpacklo_epi32 (tcvr, tcvr);

                // load grad colors

                __m128i c0 = _mm_loadu_si32 (&lut[ ipos.array[0] ]);
                __m128i tnc = _mm_loadu_si32 (&lut[ ipos.array[1] ]);
                c0 = _mm_unpacklo_epi32 (c0, tnc);
                c0 = _mm_unpacklo_epi8 (c0, XMZERO);
                __m128i a0 = _mm_broadcast_alpha(c0);
                a0 = _mm_mulhi_epu16(a0, tcvr);

                __m128i c1 = _mm_loadu_si32 (&lut[ ipos.array[2] ]);
                tnc = _mm_loadu_si32 (&lut[ ipos.array[3] ]);
                c1 = _mm_unpacklo_epi32 (c1, tnc);
                c1 = _mm_unpacklo_epi8 (c1, XMZERO);
                __m128i a1 = _mm_broadcast_alpha(c1);
                a1 = _mm_mulhi_epu16(a1, tcvr2);

                // alpha*source + dest - alpha*dest

                c0 = _mm_mulhi_epu16 (c0,a0);
                c1 = _mm_mulhi_epu16 (c1,a1);
                c0 = _mm_adds_epi16 (c0,d0);
                c1 = _mm_adds_epi16 (c1,d1);
                d0 = _mm_mulhi_epu16 (d0,a0);
                d1 = _mm_mulhi_epu16 (d1,a1);
                c0 =  _mm_subs_epi16 (c0, d0);
                c1 =  _mm_subs_epi16 (c1, d1);

                d0 = _mm_packus_epi16 (c0,c1);

                _mm_storeu_si128 (cast(__m128i*)ptr,d0);

                bpos++;
                ptr+=4;
                dlptr+=4;

                // Leave the integration loop as soon as the next four deltas
                // are all zero; the outer loop then looks for a constant span.
                // NOTE(review): when bpos has reached endbit this still reads
                // the four deltas past the processed range; presumably the
                // delta buffer is padded for that, confirm with the caller.
                if (((cast(ulong*)dlptr)[0] | (cast(ulong*)dlptr)[1]) == 0)  break;
            }
        }
    }

    // Member variables

    Gradient  gradient;        // gradient whose lookup table is sampled
    float      xctr,yctr;      // ellipse center, as given to init
    float      xstep0,ystep0;  // per-pixel X/Y increments of the first gradient coordinate
    float      xstep1,ystep1;  // per-pixel X/Y increments of the second gradient coordinate
}
301 
/**
* Type-erased entry point for the non-zero winding rule.
*
* `userData` must point to an `EllipticalBlit`; every other argument is
* forwarded unchanged to `EllipticalBlit.color_blit`.
*/
void doBlit_EllipticalBlit_NonZero(void* userData, uint* dest, int* delta, DMWord* mask, int x0, int x1, int y) nothrow @nogc
{
    auto blitter = cast(EllipticalBlit*) userData;
    blitter.color_blit!(WindingRule.NonZero)(dest, delta, mask, x0, x1, y);
}
307 
/**
* Type-erased entry point for the even-odd winding rule.
*
* `userData` must point to an `EllipticalBlit`; every other argument is
* forwarded unchanged to `EllipticalBlit.color_blit`.
*/
void doBlit_EllipticalBlit_EvenOdd(void* userData, uint* dest, int* delta, DMWord* mask, int x0, int x1, int y) nothrow @nogc
{
    auto blitter = cast(EllipticalBlit*) userData;
    blitter.color_blit!(WindingRule.EvenOdd)(dest, delta, mask, x0, x1, y);
}
313