1 /**
2 * Not supported for now.
3 *
4 * Copyright: Copyright Chris Jones 2020.
5 * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
6 */
7 module dplug.canvas.angularblit;
8 
9 // disabled for now
10 /+
11 
12 import dplug.canvas.rasterizer;
13 import dplug.canvas.gradient;
14 import dplug.canvas.misc;
15 
16 /*
17    angular gradient blit
18 */
19 
/**
   Angular (conic / "sweep") gradient blitter.

   For each pixel two linear functions t0 and t1 of the pixel position are
   maintained incrementally (projections onto two axes derived from the
   gradient line). A fast SSE atan2-style approximation turns (t0, t1) into
   an angle, which indexes the gradient's colour lookup table. The looked-up
   colour is alpha-composited against the destination using the coverage
   data produced by the rasterizer, four pixels at a time.
*/
struct AngularBlit
{   
    /**
       Initialise the blitter for a destination surface and gradient.

       Params:
         pixels = destination pixel buffer, must be 16-byte aligned
         stride = row pitch in uints; multiple of 4 uints (16 bytes)
         height = destination height in rows
         g      = gradient providing the colour LUT; lutLength must be a
                  power of two (the lookup mask below relies on it)
         x0,y0  = centre of the angular sweep
         x1,y1  = end of the gradient line; defines the zero-angle axis
         r2     = scale applied to the perpendicular axis
    */
    void init(uint* pixels, int stride, int height,
              Gradient g, float x0, float y0, float x1, float y1, float r2)
    {
        // NOTE(review): the cast truncates the pointer on 64-bit targets,
        // but the low 4 bits are all the alignment check needs.
        assert(((cast(int)pixels) & 15) == 0); // must be 16 byte aligned
        assert((stride & 3) == 0);             // stride must be 16 byte aligned (4 uints)
        assert(height > 0);
        assert(g !is null);
        assert(isPow2(g.lutLength));

        this.pixels = pixels;
        this.stride = stride;
        this.height = height;
        this.gradient = g;
        int lutsize = g.lutLength;

        xctr = x0;
        yctr = y0;
        float w = x1-x0;
        float h = y1-y0;
        float hyp = w*w + h*h;                 // squared length of the gradient line
        if (hyp < 0.1) hyp = 0.1;              // guard against a degenerate (zero-length) line
        // Per-pixel increments of t0: projection onto the gradient line,
        // scaled to LUT index space.
        xstep0 = lutsize * w / hyp; 
        ystep0 = lutsize * h / hyp;
        hyp = sqrt(hyp);
        // Per-pixel increments of t1: projection onto the perpendicular
        // axis, additionally scaled by r2.
        xstep1 = lutsize * h / (r2*hyp);
        ystep1 = lutsize * -w / (r2*hyp); 
    }

    /// Returns: a Blitter delegate bound to the requested winding rule.
    Blitter getBlitter(WindingRule wr)
    {
        if (wr == WindingRule.NonZero)
        {
            return &angular_blit!(WindingRule.NonZero);
        }
        else
        {
            return &angular_blit!(WindingRule.EvenOdd);
        }
    }

private:

    /**
       Blit the span [x0, x1) on row y.

       delta = the rasterizer's per-pixel winding deltas for this row
               (zeroed here as they are consumed)
       mask  = bitmask marking which 4-pixel groups contain non-zero deltas
       x0,x1 = span bounds, both multiples of 4, so all work proceeds in
               4-pixel (16-byte) groups
       y     = destination row
    */
    void angular_blit(WindingRule wr)(int* delta, DMWord* mask, int x0, int x1, int y)
    {
        assert(x0 >= 0);
        assert(x1 <= stride);
        assert(y >= 0);
        assert(y < height);
        assert((x0 & 3) == 0);
        assert((x1 & 3) == 0);

        // main blit variables

        int bpos = x0 / 4;                     // current 4-pixel group index
        int endbit = x1 / 4;                   // one-past-last group index
        uint* dest = &pixels[y*stride];
        __m128i xmWinding = 0;                 // running winding total; lane 3 carries it forward
        uint* lut = gradient.getLookup.ptr;
        uint lutmsk = gradient.lutLength - 1;  // power-of-two length -> index mask
        // NOTE(review): opaque fast path disabled pending gradient.isOpaque
        bool isopaque = false;//gradient.isOpaque

        // XMM constants

        immutable __m128i XMZERO = 0;
        immutable __m128i XMFFFF = [0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF];
        immutable __m128i XMMSK16 = [0xFFFF,0xFFFF,0xFFFF,0xFFFF];

        // paint variables

        // t0 at the first pixel of the span; the vector holds t0 for the
        // 4 pixels of the current group, stepped by 4 pixels per iteration.
        float t0 = (bpos*4-xctr)*xstep0 + (y-yctr)*ystep0;
        __m128 xmT0 = _mm_mul_ps(_mm_set1_ps(xstep0), _mm_setr_ps(0.0f,1.0f,2.0f,3.0f));
        xmT0 = _mm_add_ps(xmT0, _mm_set1_ps(t0));
        __m128 xmStep0 = _mm_set1_ps(xstep0*4);

        // same for t1, the perpendicular-axis projection
        float t1 = (bpos*4-xctr)*xstep1 + (y-yctr)*ystep1;
        __m128 xmT1 = _mm_mul_ps(_mm_set1_ps(xstep1), _mm_setr_ps(0.0f,1.0f,2.0f,3.0f));
        xmT1 = _mm_add_ps(xmT1, _mm_set1_ps(t1));
        __m128 xmStep1 = _mm_set1_ps(xstep1*4);

        // main loop

        while (bpos < endbit)
        {
            int nsb = nextSetBit(mask, bpos, endbit);

            // do we have a span of unchanging coverage?

            if (bpos < nsb)
            {
                // Calc coverage of first pixel

                static if (wr == WindingRule.NonZero)
                {
                    int cover = xmWinding[3]+delta[bpos*4];
                    cover = abs(cover)*2;
                    if (cover > 0xFFFF) cover = 0xFFFF;
                }
                else
                {
                    // even-odd: wrap to 16 bits, then fold negatives back
                    int cover = xmWinding[3]+delta[bpos*4];
                    short tsc = cast(short) cover;
                    cover = (tsc ^ (tsc >> 15)) * 2;
                }

                // We can skip the span, but must still advance t0/t1 by its length

                if (cover == 0)
                {
                    __m128 tsl = _mm_set1_ps(nsb-bpos);
                    xmT0 = _mm_add_ps(xmT0, _mm_mul_ps(tsl,xmStep0));
                    xmT1 = _mm_add_ps(xmT1, _mm_mul_ps(tsl,xmStep1));
                    bpos = nsb;
                }

                // Or fill span with solid color (fully covered, opaque gradient)

                else if (isopaque && (cover > 0xFF00))
                {
                    uint* ptr = &dest[bpos*4];
                    uint* end = ptr + ((nsb-bpos)*4);

                    while (ptr < end)
                    {
                        // angle -> LUT index for 4 pixels, no blending needed
                        __m128 grad = gradOfSorts(xmT0,xmT1);
                        __m128 poly = polyAprox(grad);
                        __m128i ipos = _mm_cvtps_epi32(poly);
                        ipos = fixupQuadrant(ipos,xmT0,xmT1);

                        xmT0 = xmT0 + xmStep0;
                        xmT1 = xmT1 + xmStep1;

                        // extract the 4 32-bit indices two at a time
                        long tlip = _mm_cvtsi128_si64 (ipos);
                        ipos = _mm_shuffle_epi32!14(ipos);
                        ptr[0] = lut[tlip & lutmsk];
                        ptr[1] = lut[(tlip >> 32) & lutmsk];
                        tlip = _mm_cvtsi128_si64 (ipos);
                        ptr[2] = lut[tlip & lutmsk];
                        ptr[3] = lut[(tlip >> 32) & lutmsk];

                        ptr+=4;                        
                    }

                    bpos = nsb;
                }

                // Or fill span with transparent color (constant partial coverage)

                else
                {
                    __m128i tqcvr = _mm_set1_epi16 (cast(ushort) cover);

                    uint* ptr = &dest[bpos*4];
                    uint* end = &dest[nsb*4];

                    while (ptr < end)
                    {
                        __m128 grad = gradOfSorts(xmT0,xmT1);

                        // load 4 destination pixels, widen to 16 bits/channel
                        __m128i d0 = _mm_loadu_si64 (ptr);
                        d0 = _mm_unpacklo_epi8 (d0, XMZERO);
                        __m128i d1 = _mm_loadu_si64 (ptr+2);
                        d1 = _mm_unpacklo_epi8 (d1, XMZERO);

                        __m128 poly = polyAprox(grad);
                        __m128i ipos = _mm_cvtps_epi32(poly);
                        ipos = fixupQuadrant(ipos,xmT0,xmT1);

                        long tlip = _mm_cvtsi128_si64 (ipos);
                        ipos = _mm_unpackhi_epi64 (ipos, ipos);

                        // gather 2 LUT colours, widen, scale alpha by coverage
                        __m128i c0 = _mm_loadu_si32 (&lut[tlip & lutmsk]);
                        __m128i tnc = _mm_loadu_si32 (&lut[(tlip >> 32) & lutmsk]);
                        c0 = _mm_unpacklo_epi32 (c0, tnc);
                        c0 = _mm_unpacklo_epi8 (c0, XMZERO);
                        __m128i a0 = _mm_broadcast_alpha(c0);
                        a0 = _mm_mulhi_epu16(a0, tqcvr);

                        tlip = _mm_cvtsi128_si64 (ipos);
                        
                        __m128i c1 = _mm_loadu_si32 (&lut[tlip & lutmsk]);
                        tnc = _mm_loadu_si32 (&lut[(tlip >> 32) & lutmsk]);
                        c1 = _mm_unpacklo_epi32 (c1, tnc);
                        c1 = _mm_unpacklo_epi8 (c1, XMZERO);
                        __m128i a1 = _mm_broadcast_alpha(c1);
                        a1 = _mm_mulhi_epu16(a1, tqcvr);

                        xmT0 = xmT0 + xmStep0;
                        xmT1 = xmT1 + xmStep1;

                       // alpha*source + dest - alpha*dest

                        c0 = _mm_mulhi_epu16 (c0,a0);
                        c1 = _mm_mulhi_epu16 (c1,a1);
                        c0 = _mm_adds_epi16 (c0,d0);
                        c1 = _mm_adds_epi16 (c1,d1);
                        d0 = _mm_mulhi_epu16 (d0,a0);
                        d1 = _mm_mulhi_epu16 (d1,a1);
                        c0 =  _mm_subs_epi16 (c0, d0);
                        c1 =  _mm_subs_epi16 (c1, d1);

                        // narrow back to 8 bits/channel and store 4 pixels
                        d0 = _mm_packus_epi16 (c0,c1);

                        _mm_store_si128 (cast(__m128i*)ptr,d0);
                        
                        ptr+=4;
                    }

                    bpos = nsb;
                }
            }

            // At this point we need to integrate scandelta

            uint* ptr = &dest[bpos*4];
            uint* end = &dest[endbit*4];
            int* dlptr = &delta[bpos*4];

            while (bpos < endbit)
            {
                __m128 grad = gradOfSorts(xmT0,xmT1);

                // Integrate delta values: in-register prefix sum of the 4
                // deltas plus the running winding carried in xmWinding

                __m128i tqw = _mm_load_si128(cast(__m128i*)dlptr);
                tqw = _mm_add_epi32(tqw, _mm_slli_si128!4(tqw)); 
                tqw = _mm_add_epi32(tqw, _mm_slli_si128!8(tqw)); 
                tqw = _mm_add_epi32(tqw, xmWinding); 
                xmWinding = _mm_shuffle_epi32!255(tqw);  // broadcast lane 3 as next carry
                _mm_store_si128(cast(__m128i*)dlptr,XMZERO); // consume deltas

                __m128 poly = polyAprox(grad);

                // Process coverage values taking account of winding rule
                
                static if (wr == WindingRule.NonZero)
                {
                    __m128i tcvr = _mm_srai_epi32(tqw,31); 
                    tqw = _mm_add_epi32(tcvr,tqw);
                    tqw = _mm_xor_si128(tqw,tcvr);        // abs
                    tcvr = _mm_packs_epi32(tqw,XMZERO);   // saturate/pack to int16
                    tcvr = _mm_slli_epi16(tcvr, 1);       // << to uint16
                }
                else
                {
                    __m128i tcvr = _mm_and_si128(tqw,XMMSK16); 
                    tqw = _mm_srai_epi16(tcvr,15);       // mask
                    tcvr = _mm_xor_si128(tcvr,tqw);      // fold in half
                    tcvr = _mm_packs_epi32(tcvr,XMZERO); // pack to int16
                    tcvr = _mm_slli_epi16(tcvr, 1);      // << to uint16
                }

                // convert grad pos to integer

                __m128i ipos = _mm_cvtps_epi32(poly);

                // Load destination pixels

                __m128i d0 = _mm_loadu_si64 (ptr);
                d0 = _mm_unpacklo_epi8 (d0, XMZERO);
                __m128i d1 = _mm_loadu_si64 (ptr+2);
                d1 = _mm_unpacklo_epi8 (d1, XMZERO);

                ipos = fixupQuadrant(ipos,xmT0,xmT1);

                xmT0 = xmT0 + xmStep0;
                xmT1 = xmT1 + xmStep1;

                // load grad colors

                long tlip = _mm_cvtsi128_si64 (ipos);
                ipos = _mm_unpackhi_epi64 (ipos, ipos);

                // expand the 4 16-bit coverages to per-channel replicates
                tcvr = _mm_unpacklo_epi16 (tcvr, tcvr);
                __m128i tcvr2 = _mm_unpackhi_epi32 (tcvr, tcvr);
                tcvr = _mm_unpacklo_epi32 (tcvr, tcvr);

                __m128i c0 = _mm_loadu_si32 (&lut[tlip & lutmsk]);
                __m128i tnc = _mm_loadu_si32 (&lut[(tlip >> 32) & lutmsk]);
                c0 = _mm_unpacklo_epi32 (c0, tnc);
                c0 = _mm_unpacklo_epi8 (c0, XMZERO);
                __m128i a0 = _mm_broadcast_alpha(c0);
                a0 = _mm_mulhi_epu16(a0, tcvr);

                tlip = _mm_cvtsi128_si64 (ipos);

                __m128i c1 = _mm_loadu_si32 (&lut[tlip & lutmsk]);
                tnc = _mm_loadu_si32 (&lut[(tlip >> 32) & lutmsk]);
                c1 = _mm_unpacklo_epi32 (c1, tnc);
                c1 = _mm_unpacklo_epi8 (c1, XMZERO);
                __m128i a1 = _mm_broadcast_alpha(c1);
                a1 = _mm_mulhi_epu16(a1, tcvr2);

                // alpha*source + dest - alpha*dest

                c0 = _mm_mulhi_epu16 (c0,a0);
                c1 = _mm_mulhi_epu16 (c1,a1);
                c0 = _mm_adds_epi16 (c0,d0);
                c1 = _mm_adds_epi16 (c1,d1);
                d0 = _mm_mulhi_epu16 (d0,a0);
                d1 = _mm_mulhi_epu16 (d1,a1);
                c0 =  _mm_subs_epi16 (c0, d0);
                c1 =  _mm_subs_epi16 (c1, d1);

                d0 = _mm_packus_epi16 (c0,c1);

                _mm_store_si128 (cast(__m128i*)ptr,d0);
                
                bpos++;
                ptr+=4;
                dlptr+=4;

                // no more deltas ahead in this run -> back to span logic
                if (((cast(ulong*)dlptr)[0] | (cast(ulong*)dlptr)[1]) == 0)  break;
            }
        }
    }

    // Member variables

    uint*      pixels;      // destination surface (16-byte aligned)
    int        stride;      // row pitch in uints
    int        height;      // rows in destination
    Gradient   gradient;    // colour LUT provider
    float      xctr,yctr;   // sweep centre
    float      xstep0,ystep0; // per-pixel increments of t0 (x and y direction)
    float      xstep1,ystep1; // per-pixel increments of t1 (x and y direction)
}
348 
349 // helpers for fast atan2
350 // these should be inlined by ldc
351 
352 private:
353 
// Lane mask that clears the IEEE-754 sign bit: ANDing a float vector with
// this computes fabs() on all four lanes at once.
immutable __m128i ABSMASK = [0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff];
// Lower bound applied to the atan2 denominator so a (0,0) input cannot
// cause a division by zero.
immutable __m128 MINSUM = [0.001,0.001,0.001,0.001];
356 
/*
   Per-lane octant measure for the fast atan2 approximation:
   (|x| - |y|) / (|x| + |y|), with the denominator clamped to MINSUM so a
   zero vector cannot divide by zero. Result lies in [-1, 1].
*/
__m128 gradOfSorts(__m128 x, __m128 y)
{
    __m128 ax = _mm_and_ps(x, cast(__m128) ABSMASK);   // |x| per lane
    __m128 ay = _mm_and_ps(y, cast(__m128) ABSMASK);   // |y| per lane
    __m128 numer = _mm_sub_ps(ax, ay);
    __m128 denom = _mm_max_ps(_mm_add_ps(ax, ay), MINSUM);
    return numer / denom;
}
366 
// Coefficients of the cubic polynomial used to approximate atan on [-1,1]
// (constant term ~ pi/4), plus a scale mapping the resulting angle into
// gradient LUT index space. NOTE(review): 128/3.142 suggests a 256-entry
// LUT per full turn — confirm against Gradient.lutLength before re-enabling.
immutable __m128 PCOEF0  = [0.785398163f,0.785398163f,0.785398163f,0.785398163f];
immutable __m128 PCOEF1  = [0.972394341f,0.972394341f,0.972394341f,0.972394341f];
immutable __m128 PCOEF3  = [0.19194811f,0.19194811f,0.19194811f,0.19194811f];
immutable __m128 PSCALE  = [128.0f / 3.142f,128.0f / 3.142f,128.0f / 3.142f,128.0f / 3.142f];
371 
/*
   Evaluates the cubic atan approximation on each lane of g and scales the
   angle into LUT index space:
       (PCOEF0 - PCOEF1*g + PCOEF3*g^3) * PSCALE
   The operations are kept in the original order so rounding is unchanged.
*/
__m128 polyAprox(__m128 g)
{
    __m128 gSquared = g*g;
    __m128 cubicPart = PCOEF3*g;      // becomes PCOEF3 * g^3 below
    __m128 linearPart = PCOEF1*g;
    __m128 angle = PCOEF0 - linearPart + cubicPart*gSquared;
    return angle * PSCALE;
}
380 
/*
   Adjusts the raw polynomial output (valid in one octant) to the correct
   quadrant using the sign bits of t0 and t1. Each sign bit is smeared
   across its 32-bit lane; XOR-ing with those masks mirrors the index, and
   XOR-ing with (sign(t0) << 7) offsets by half the LUT for the lower half
   of the circle. XOR is associative, so the regrouping below is exact.
*/
__m128i fixupQuadrant(__m128i ipos, __m128 t0, __m128 t1)
{
    __m128i signT1 = _mm_srai_epi32(cast(__m128i)t1, 31);   // all-ones lane where t1 < 0
    __m128i signT0 = _mm_srai_epi32(cast(__m128i)t0, 31);   // all-ones lane where t0 < 0
    __m128i mirrored = (ipos ^ signT1) ^ signT0;
    return mirrored ^ _mm_slli_epi32(signT0, 7);
}
388 
389 // test mixing in rather than inlining???
390 
391 /*
392 string gradOfSorts(string res, string x, string y)
393 {
394     return 
395         "{ __m128 absx = _mm_and_ps("~x~", ABSMASK);" ~
396         "__m128 absy = _mm_and_ps(y, ABSMASK);"
397         "__m128 sum = _mm_add_ps(absx,absy);"
398         "__m128 diff = _mm_sub_ps(absx,absy);"
399         "sum = _mm_max_ps(sum,MINSUM);"
400         res ~ " = diff / sum;"
401 }*/
402 
403 
404 +/