1 /**
2  * Analytic antialiasing rasterizer.
3  * Copyright: Copyright Chris Jones 2020.
4  * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
5  */
6 module dplug.canvas.rasterizer;
7 
8 import std.traits;
9 import dplug.core.math;
10 import dplug.core.vec;
11 import dplug.canvas.misc;
12 
13 // Those asserts are disabled by default, since they are very slow in debug mode.
14 //debug = checkRasterizer;
15 
16 /*
17   Analytic antialiasing rasterizer.
18   =================================
19 
20   Internally works with 24:8 fixed point integer coordinates.
21 
22   You need 8 bits fractional to get 256 levels of gray for almost
23   horizontal or almost vertical lines. Hence 24:8 fixed point.
24 
25   It's a scanline based rasterizer. You add path data in the form
26   of lines and curves etc... Those are converted to edges. The edges
27   are converted to scanline coverage, then coverage is combined with
28   paint and blended to the destination pixels.
29 
30   The coverage is stored in differentiated form, each cell in
31   the scanline stores the difference between cells rather than
32   the actual coverage. This means we dont have to track coverage
33   for long spans where nothing happens.
34 
35   It also uses a bitmask to track changes in coverage. It's an idea
36   taken from Blend2D although i think my implementation is different.
37   Basically anywhere an edge crosses the current scanline the bit
38   for the leftmost pixel it touches is set in the mask. So we can
39   use a bitscan instruction to find long spans of unchanging
40   coverage and where we need to start processing the coverage again.
41 
42   The mask uses 1 bit for every 4 pixels, because the blitters are
43   processing 4 pixels at once with SIMD.
44 
45   Cliping is handled by having left and right clip buffers, any edges
46   crossing the left or righ boundry are spit at the boundry. Parts inside
47   are handled normaly, parts outside are added to the clip buffers so
48   that we keep track of what coverage they contribute. This is then
49   added to the scandelta at the start of processing each line. These
50   buffers use differentiated coverage.
51 */
52 
53 /*
54    Winding rule
55    Gradient repeat mode
56    Angular gradient mode
57 
58    (repeat modes not implemented yet)
59 */
60 
61 nothrow:
62 @nogc:
63 
64 enum WindingRule
65 {
66     NonZero,
67     EvenOdd
68 }
69 
70 enum RepeatMode
71 {
72     Pad,
73     Repeat,
74     Mirror
75 }
76 
77 enum AngularMode
78 {
79     Single,
80     Double,
81     Quad
82 }
83 
84 enum CompositeOp
85 {
86     SourceOver,
87     Add,
88     Subtract,
89     LightenOnly,
90     DarkenOnly
91 }
92 
93 /*
94   Delta mask stuff
95   what word type to use for mask
96   how many pixels per bit
97   how many pixels per word
98   bit mask for width of DMWord
99 */
100 
101 static if ((void*).sizeof == 4)
102     alias DMWord = uint;
103 else static if ((void*).sizeof == 8)
104     alias DMWord = ulong;
105 else
106     static assert(0);
107 
108 private:
109 
110 enum dmPixPerBit = 4; 
111 enum dmPixPerWord = dmPixPerBit * 8 * DMWord.sizeof;
112 enum dmWordMask = 8 * DMWord.sizeof - 1;
113 
114 /*
115   set a bit in the delta mask, 'x' is pixel cordinate, not bit index
116 */
117 
118 void DMSetBit(DMWord* mask, uint x)
119 {
120     mask[x/dmPixPerWord] |= (cast(DMWord)1) << ((x / dmPixPerBit) & dmWordMask);  
121 }
122 
123 void DMSetBitRange(DMWord* mask, uint x0, int x1)
124 {
125     while (x0 <= x1)
126     {
127         mask[x0/dmPixPerWord] |= (cast(DMWord)1) << ((x0 / dmPixPerBit) & dmWordMask);
128         x0+=4;
129     }
130 }
131 
132 /*
133   Few constants for fixed point coordinates / gradients
134 */
135 
136 enum fpFracBits = 8;    // 8 bits fractional
137 enum fpScale = 256.0f;  // for converting from float
138 enum fpDXScale = 4294967296.0; // convert to dx gradient in 32:32
139 enum fpDYScale = 1073741824.0; // as above but div 4
140 
141 /*
142   Blitter delegate. A callback that does the actual blitting once coverage
143   for the given scanline has been calculated.
144     dest  - destination pixels
145     delta - pointer to the delta buffer
146     mask  - pointer to delta mask
147     x0    - start x
148     x1    - end x
149     y     - current y (needed for gradients)
150 */
151 
152 alias BlitFunction = void function(void* userData, uint* dest, int* delta, DMWord* mask, int x0, int x1, int y);
153 
154 
155 
156 /*
157   a*b/c, with the intermediate result of a*b in 64 bit
158   the asm version might be faster in 32 bit mode, havent tested yet, but the
159   plain D version is same speed with 64bit / LDC
160 */
161 
162 public:
163 
164 struct Blitter
165 {
166     void* userData; // used for retrieving context
167     BlitFunction doBlit;
168 }
169 
170 /// Rasterizer
171 package struct Rasterizer
172 {
173 public:
174 nothrow:
175 @nogc:
176 
177     @disable this(this);
178 
179 
180     /*
181       initialise -- This sets the clip rectange, flushes any existing state
182       and preps for drawing.
183 
184       The clip window left,top is inside, right,bottom is outside. So if the
185       window is 100,100 --> 200,200, then pixel 100,100 can be modified but
186       pixel 200,200 will not.
187 
188       The rasterizer however needs to allow coordinates that fall directly on
189       the right side and bottom side of the clip even though those pixels are
190       techically outside. It's easier and faster to give the temporary buffers
191       a bit extra room for overspill than it is to check and special case
192       when it happens.
193 
194       Also the delta buffer and two clip buffers use differentiated coverage
195       which also causes one extra pixel overspill. If you differentiate a
196       sequence of length N you get a sequence of length N+1. Again it's easier
197       and faster to just allow for the overspill than it is to check for and
198       special case it.
199     */
200 
201     void initialise(int left, int top, int right, int bottom)
202     {
203         assert((left >= 0) && (left < right));
204         assert((top >= 0) && (top < bottom));
205 
206         m_clipleft = left << fpFracBits;
207         m_cliptop = top << fpFracBits;
208         m_clipright = right << fpFracBits;
209         m_clipbottom = bottom << fpFracBits;
210 
211         // reset edge buffer and Y extent tracking
212 
213         m_edgepool.reset();
214         m_yrmin = bottom;
215         m_yrmax = top;
216 
217         // init buffers
218         m_scandelta.resize(roundUpPow2((right+3)|63));
219 
220         m_deltamaskSize = cast(size_t) roundUpPow2(1+right/dmPixPerWord);
221         if (m_deltamaskSize > m_deltamaskAlloc)
222         {
223             m_deltamask = cast(DMWord*) alignedReallocDiscard(m_deltamask, m_deltamaskSize * (DMWord*).sizeof, 1);
224             m_deltamaskAlloc = m_deltamaskSize;
225         }
226 
227         m_buckets.resize(roundUpPow2(bottom+1));
228         m_clipbfr_l.resize(roundUpPow2((bottom+2)|63));
229         m_clipbfr_r.resize(roundUpPow2((bottom+2)|63));
230 
231         m_destBuf.resize( (right+3)*4 ); // 3 additional pixels, size of a scanline
232 
233         m_scandelta.fill(0);
234         // m_deltamask is init on each rasterized line
235         m_buckets.fill(null);
236         m_clipbfr_l.fill(0);
237         m_clipbfr_r.fill(0);
238 
239         // init prev x,y and sub path start x,y
240 
241         m_prevx = 0;
242         m_prevy = 0;
243         m_subpx = 0;
244         m_subpy = 0;
245         m_fprevx = 0;
246         m_fprevy = 0;
247     }
248 
249     ~this()
250     {
251         if (m_deltamask)
252         {
253             alignedFree(m_deltamask, 1);
254             m_deltamask = null;
255         }
256     }
257 
258     // rasterize
259 
260     void rasterize(ubyte* imagePixels,
261                    const size_t imagePitchInBytes,
262                    const int imageHeight,
263                    Blitter blitter,
264                    CompositeOp compositeOp)
265     {
266         Edge dummy;
267         Edge* prev = &dummy;
268         Edge* edge = null;
269 
270         int startx = (m_clipleft >> fpFracBits) & 0xFFFFFFFC;
271         int endx = ((m_clipright >> fpFracBits) + 3) & 0xFFFFFFFC;
272         int starty = m_yrmin >> fpFracBits;
273         int endy = (m_yrmax+255) >> fpFracBits;
274 
275         int cl_acc,cr_acc;
276         int cl_pos = m_clipleft >> fpFracBits;
277         int cr_pos = m_clipright >> fpFracBits;
278 
279         ubyte* pDest = imagePixels + imagePitchInBytes * starty;
280 
281         for (int y = starty; y < endy; y++)
282         {
283             m_deltamask[0..m_deltamaskSize] = 0;
284             int ly = (y << fpFracBits) + 256;
285 
286             // clip accumulator
287 
288             cl_acc += m_clipbfr_l[y];
289             m_clipbfr_l[y] = 0;
290             cr_acc += m_clipbfr_r[y];
291             m_clipbfr_r[y] = 0;
292 
293             if (cl_acc) DMSetBit(m_deltamask, cl_pos);
294             if (cr_acc) DMSetBit(m_deltamask, cr_pos);
295 
296             m_scandelta[cl_pos] += cl_acc;
297             m_scandelta[cr_pos] += cr_acc;
298 
299             // At this point 'prev' either points at 'dummy' or at the last node in
300             //   active edges linked list, so we just add the new edges to it.
301 
302             prev.next = m_buckets[y];
303             m_buckets[y] = null;
304 
305             // loop through the active edges
306 
307             prev = &dummy;
308             edge = dummy.next;
309 
310             while (edge)
311             {
312                 int ny = void;
313 
314                 if (edge.y2 <= ly)
315                 {
316                     ny = edge.y2;
317                     prev.next = edge.next;
318                 }
319                 else
320                 {
321                     ny = ly;
322                     prev = edge;
323                 }
324 
325                 int span = ny - edge.y;
326                 long nx = edge.x + edge.dx * span;
327 
328                 int bpspan = span * ((cast(int)(edge.dy>>63))|1);
329 
330                 int x0 = cast(int)(edge.x >> 40);
331                 int x1 = cast(int)(nx >> 40);
332                 int steps = x1 - x0;
333 
334                 if (steps == 0)
335                 {
336                     DMSetBit(m_deltamask, x0);
337 
338                     int w = (edge.x >> 32) & 0xFF;
339                     int v = (nx >> 32) & 0xFF;
340                     int area = (bpspan * (512 - w - v)) >> 2;
341                     m_scandelta[x0] += area;
342                     x0++;
343                     m_scandelta[x0] += bpspan * 128 - area;
344                 }
345                 else if (steps > 0)
346                 {
347                     DMSetBitRange(m_deltamask, x0, x1);
348 
349                     int w = 256 - ((edge.x >> 32) & 0xFF);
350                     long acc = w * edge.dy;
351                     int area = cast(int)((w * acc) >> 32);
352                     m_scandelta[x0] += area;
353                     x0++;
354                     acc += edge.dy << 7;
355 
356                     while (x0 < x1)
357                     {
358                         int lc = area;
359                         area = cast(int)(acc >> 23);
360                         m_scandelta[x0] += area - lc;
361                         x0++;
362                         acc += edge.dy << 8;
363                     }
364 
365                     int q = (nx >> 32) & 0xFF;
366                     int rect = bpspan * 128;
367                     int lc = area;
368                     area = rect - cast(int)((q * q * edge.dy) >> 32);
369                     m_scandelta[x0] += area - lc;
370                     x0++;
371                     m_scandelta[x0] += rect - area;
372                 }
373                 else if (steps < 0)
374                 {
375                     DMSetBitRange(m_deltamask, x1, x0);
376 
377                     int w = 256 - ((nx >> 32) & 0xFF);
378                     long acc = w * edge.dy;
379                     int area = cast(int)((w * acc) >> 32);
380                     m_scandelta[x1] += area;
381                     x1++;
382                     acc += edge.dy << 7;
383 
384                     while (x1 < x0)
385                     {
386                         int lc = area;
387                         area = cast(int)(acc >> 23);
388                         m_scandelta[x1] += area - lc;
389                         x1++;
390                         acc += edge.dy << 8;
391                     }
392 
393                     int q = (edge.x >> 32) & 0xFF;
394                     int rect = bpspan * 128;
395                     int lc = area;
396                     area = rect - cast(int)((q * q * edge.dy) >> 32);
397                     m_scandelta[x1] += area - lc;
398                     x1++;
399                     m_scandelta[x1] += rect - area;
400                 }
401 
402                 edge.x = nx;
403                 edge.y = ny;
404                 edge = edge.next;
405             }
406 
407             // Blit scanline
408             if (compositeOp == CompositeOp.SourceOver)
409             {
410                 // special case for fastest blit, source-over is the default.
411                 blitter.doBlit(blitter.userData, cast(uint*)pDest, m_scandelta.ptr, m_deltamask, startx, endx, y);                
412             }
413             else
414             {
415                 uint* tmpBuf = cast(uint*)m_destBuf.ptr;
416                 uint* dest = cast(uint*)pDest;
417 
418                 assert((startx & 3) == 0);
419                 assert((endx & 3) == 0);
420 
421                 // Fill with zeroes
422                 tmpBuf[startx..endx] = 0; // black in rgba8
423 
424                 // Write into destbuf.
425                 // There is an additional difficulty, because the pixels that were touched have an excess in -3 to +3 potentially.
426                 // This is done with coverage being zero. So our Porter-Duff would overwrite if we don't multiply with source-alpha!
427                 blitter.doBlit(blitter.userData, tmpBuf, m_scandelta.ptr, m_deltamask, startx, endx, y);
428 
429                 // Custom composite operations go here.
430 
431                 // Note: in HTML standard, destination alpha is considered, and the resulting color
432                 // is a weighted average of the dest and blended source. I suppose alpha also gets to be
433                 // a weighted average, but this implies a divide, and we assume background alpha to be 255.
434                 //
435                 // alpha-res = (255 * 255 + src-alpha * src-alpha) / (src-alpha + 255)
436                 // Under these circumstances, alpha-res is always close to 255.
437 
438                 __m128i alphaMask = _mm_set1_epi32(0xff000000);
439 
440                 final switch (compositeOp)
441                 {
442                     case CompositeOp.SourceOver:
443                         assert(false);
444 
445                     case CompositeOp.Add:
446                         for (int x = startx; x < endx; x += 4)
447                         {
448                             __m128i D = _mm_loadu_si128(cast(__m128i*) &dest[x]);
449                             __m128i S = _mm_loadu_si128(cast(__m128i*) &tmpBuf[x]);
450                             __m128i Z = _mm_setzero_si128();
451                             __m128i D0_3 = _mm_unpacklo_epi8(D, Z); 
452                             __m128i D4_7 = _mm_unpackhi_epi8(D, Z);
453                             __m128i S0_3 = _mm_unpacklo_epi8(S, Z); 
454                             __m128i S4_7 = _mm_unpackhi_epi8(S, Z);
455                             D0_3 = _mm_add_epi16(D0_3, S0_3);
456                             D4_7 = _mm_add_epi16(D4_7, S4_7);
457                             __m128i R = _mm_packus_epi16(D0_3, D4_7);
458                             R = _mm_or_si128(R, alphaMask);
459                             _mm_storeu_si128(cast(__m128i*) &dest[x], R);
460                         }
461                         break;
462 
463                     case CompositeOp.Subtract:
464                         for (int x = startx; x < endx; x += 4)
465                         {
466                             __m128i D = _mm_loadu_si128(cast(__m128i*) &dest[x]);
467                             __m128i S = _mm_loadu_si128(cast(__m128i*) &tmpBuf[x]);
468                             __m128i Z = _mm_setzero_si128();
469                             __m128i D0_3 = _mm_unpacklo_epi8(D, Z); 
470                             __m128i D4_7 = _mm_unpackhi_epi8(D, Z);
471                             __m128i S0_3 = _mm_unpacklo_epi8(S, Z); 
472                             __m128i S4_7 = _mm_unpackhi_epi8(S, Z);
473                             D0_3 = _mm_sub_epi16(D0_3, S0_3);
474                             D4_7 = _mm_sub_epi16(D4_7, S4_7);
475                             __m128i R = _mm_packus_epi16(D0_3, D4_7);
476                             R = _mm_or_si128(R, alphaMask);
477                             _mm_storeu_si128(cast(__m128i*) &dest[x], R);
478                         }
479                         break;
480 
481                     case CompositeOp.LightenOnly:
482                         //    (1 - alpha) * bg + max(bg, fg) * alpha
483                         // == (1 - alpha) * bg + max(bg * alpha, fg * alpha) and we get fg.alpha
484                         for (int x = startx; x < endx; x += 4)
485                         {
486                             __m128i D = _mm_loadu_si128(cast(__m128i*) &dest[x]);   // BG
487                             __m128i S = _mm_loadu_si128(cast(__m128i*) &tmpBuf[x]); // FG * A, A
488                             __m128i Z = _mm_setzero_si128();
489                             
490                             __m128i D0_1 = _mm_unpacklo_epi8(D, Z); // two background pixels
491                             __m128i D2_3 = _mm_unpackhi_epi8(D, Z); // ditto
492                             __m128i S0_1 = _mm_unpacklo_epi8(S, Z); // two foreground pixels
493                             __m128i S2_3 = _mm_unpackhi_epi8(S, Z); // ditto
494 
495                             __m128i A0_1 = _mm_shufflelo_epi16!0xff(_mm_shufflehi_epi16!0xff(S0_1)); // A0 A0 A0 A0 A1 A1 A1 A1
496                             __m128i A2_3 = _mm_shufflelo_epi16!0xff(_mm_shufflehi_epi16!0xff(S2_3)); // A2 A2 A2 A2 A3 A3 A3 A3
497                             __m128i m255 = _mm_set1_epi16(255);
498                             __m128i mA0_1 = _mm_sub_epi16(m255, A0_1); // (255-A0)x4 (255-A1)x4
499                             __m128i mA2_3 = _mm_sub_epi16(m255, A2_3);
500                             A0_1 = _mm_slli_epi16(A0_1, 8);
501                             A2_3 = _mm_slli_epi16(A2_3, 8);
502                             mA0_1 = _mm_slli_epi16(mA0_1, 8);
503                             mA2_3 = _mm_slli_epi16(mA2_3, 8);
504 
505                             // multiply bg * alpha
506                             __m128i BGA0_1 = _mm_mulhi_epu16(A0_1, D0_1);
507                             __m128i BGA2_3 = _mm_mulhi_epu16(A2_3, D2_3);
508 
509                             // multiply bg * (1 - alpha)
510                             __m128i mBGA0_1 = _mm_mulhi_epu16(mA0_1, D0_1);
511                             __m128i mBGA2_3 = _mm_mulhi_epu16(mA2_3, D2_3);
512 
513                             __m128i R0_1 = _mm_add_epi16( _mm_max_epi16(S0_1, BGA0_1), mBGA0_1);
514                             __m128i R2_3 = _mm_add_epi16( _mm_max_epi16(S2_3, BGA2_3), mBGA2_3);
515 
516                             __m128i R = _mm_packus_epi16(R0_1, R2_3);
517                             R = _mm_or_si128(R, alphaMask);
518                             _mm_storeu_si128(cast(__m128i*) &dest[x], R);
519                         }
520                         break;
521 
522                     case CompositeOp.DarkenOnly:
523                         //    (1 - alpha) * bg + min(bg, fg) * alpha
524                         // == (1 - alpha) * bg + min(bg * alpha, fg * alpha) and we get fg.alpha
525                         for (int x = startx; x < endx; x += 4)
526                         {
527                             __m128i D = _mm_loadu_si128(cast(__m128i*) &dest[x]);   // BG
528                             __m128i S = _mm_loadu_si128(cast(__m128i*) &tmpBuf[x]); // FG * A, A
529                             __m128i Z = _mm_setzero_si128();
530 
531                             __m128i D0_1 = _mm_unpacklo_epi8(D, Z); // two background pixels
532                             __m128i D2_3 = _mm_unpackhi_epi8(D, Z); // ditto
533                             __m128i S0_1 = _mm_unpacklo_epi8(S, Z); // two foreground pixels
534                             __m128i S2_3 = _mm_unpackhi_epi8(S, Z); // ditto
535 
536                             __m128i A0_1 = _mm_shufflelo_epi16!0xff(_mm_shufflehi_epi16!0xff(S0_1)); // A0 A0 A0 A0 A1 A1 A1 A1
537                             __m128i A2_3 = _mm_shufflelo_epi16!0xff(_mm_shufflehi_epi16!0xff(S2_3)); // A2 A2 A2 A2 A3 A3 A3 A3
538                             __m128i m255 = _mm_set1_epi16(255);
539                             __m128i mA0_1 = _mm_sub_epi16(m255, A0_1); // (255-A0)x4 (255-A1)x4
540                             __m128i mA2_3 = _mm_sub_epi16(m255, A2_3);
541                             A0_1 = _mm_slli_epi16(A0_1, 8);
542                             A2_3 = _mm_slli_epi16(A2_3, 8);
543                             mA0_1 = _mm_slli_epi16(mA0_1, 8);
544                             mA2_3 = _mm_slli_epi16(mA2_3, 8);
545 
546                             // multiply bg * alpha
547                             __m128i BGA0_1 = _mm_mulhi_epu16(A0_1, D0_1);
548                             __m128i BGA2_3 = _mm_mulhi_epu16(A2_3, D2_3);
549 
550                             // multiply bg * (1 - alpha)
551                             __m128i mBGA0_1 = _mm_mulhi_epu16(mA0_1, D0_1);
552                             __m128i mBGA2_3 = _mm_mulhi_epu16(mA2_3, D2_3);
553 
554                             __m128i R0_1 = _mm_add_epi16( _mm_min_epi16(S0_1, BGA0_1), mBGA0_1);
555                             __m128i R2_3 = _mm_add_epi16( _mm_min_epi16(S2_3, BGA2_3), mBGA2_3);
556 
557                             __m128i R = _mm_packus_epi16(R0_1, R2_3);
558                             R = _mm_or_si128(R, alphaMask);
559                             _mm_storeu_si128(cast(__m128i*) &dest[x], R);
560                         }
561                         break;
562                 }
563             }
564 
565             pDest += imagePitchInBytes;
566 
567             // clear scandelta overspill
568 
569             m_scandelta[endx] = 0;
570 
571             debug(checkRasterizer)
572             {
573 
574                 size_t size = m_scandelta.length;
575                 foreach(e; m_scandelta) assert(e == 0);
576             }
577         }
578 
579         // clear clip buffers overspill
580 
581         m_clipbfr_l[endy] = 0;
582         m_clipbfr_r[endy] = 0;
583 
584         debug(checkRasterizer)
585         {
586            foreach(e; m_clipbfr_l) assert(e == 0);
587            foreach(e; m_clipbfr_r) assert(e == 0);
588         }
589 
590         // clear m_buckets overspill, this is only needed because in very
591         // rare cases we could end up with an edge could end up on the
592         // bottom clip boundry after spliting an edge, these should really
593         // be removed in the clipping code
594 
595         m_buckets[endy] = null;
596         
597         debug(checkRasterizer)
598         {
599            foreach(e; m_buckets) assert(e == null);
600         } 
601 
602         m_edgepool.reset();
603     }
604 
605     /*
606       drawing methods
607     */
608 
609     void moveTo(double x, double y)
610     {
611         intMoveTo(cast(int)(x * fpScale), cast(int)(y * fpScale));
612         m_fprevx = x;
613         m_fprevy = y;
614     }
615 
616     void moveTo(float x, float y)
617     {
618         intMoveTo(cast(int)(x * fpScale), cast(int)(y * fpScale));
619         m_fprevx = x;
620         m_fprevy = y;
621     }
622 
623     void lineTo(double x, double y)
624     {
625         intLineTo(cast(int)(x * fpScale), cast(int)(y * fpScale));
626         m_fprevx = x;
627         m_fprevy = y;
628     }
629 
630     void lineTo(float x, float y)
631     {
632         intLineTo(cast(int)(x * fpScale), cast(int)(y * fpScale));
633         m_fprevx = x;
634         m_fprevy = y;
635     }
636 
637     void quadTo(float x1, float y1, float x2, float y2)
638     {
639         float x01 = (m_fprevx+x1)*0.5;
640         float y01 = (m_fprevy+y1)*0.5;
641         float x12 = (x1+x2)*0.5;
642         float y12 = (y1+y2)*0.5;
643         float xctr = (x01+x12)*0.5;
644         float yctr = (y01+y12)*0.5;
645         float err = (x1-xctr)*(x1-xctr)+(y1-yctr)*(y1-yctr);
646 
647         if (err > 0.1)
648         {
649             quadTo(x01,y01,xctr,yctr);
650             quadTo(x12,y12,x2,y2);
651         }
652         else
653         {
654             intLineTo(cast(int)(x2 * fpScale), cast(int)(y2 * fpScale));
655         }
656 
657         m_fprevx = x2;
658         m_fprevy = y2;
659     }
660 
661     void cubicTo(float x1, float y1, float x2, float y2, float x3, float y3)
662     {
663         bool error = (x1 == x2 && x2 == x3 && y1 == y2 && y2 == y3);
664 
665         // Avoid stack overflow with infinite recursion.
666         // It is poorly understood why that happens.
667         if (error)
668             return;
669 
670         float x01 = (m_fprevx+x1)*0.5;
671         float y01 = (m_fprevy+y1)*0.5;
672         float x12 = (x1+x2)*0.5;
673         float y12 = (y1+y2)*0.5;
674         float x23 = (x2+x3)*0.5;
675         float y23 = (y2+y3)*0.5;
676         
677         float xc0 = (x01+x12)*0.5;
678         float yc0 = (y01+y12)*0.5;
679         float xc1 = (x12+x23)*0.5;
680         float yc1 = (y12+y23)*0.5;
681         float xctr = (xc0+xc1)*0.5;
682         float yctr = (yc0+yc1)*0.5;
683         
684         // this flattenening test code was from a page on the antigrain geometry
685         // website.
686 
687         float dx = x3-m_fprevx;
688         float dy = y3-m_fprevy;
689 
690         double d2 = fast_fabs(((x1 - x3) * dy - (y1 - y3) * dx));
691         double d3 = fast_fabs(((x2 - x3) * dy - (y2 - y3) * dx));
692 
693         if((d2 + d3)*(d2 + d3) < 0.5 * (dx*dx + dy*dy))
694         {
695             intLineTo(cast(int)(x3 * fpScale), cast(int)(y3 * fpScale));
696         }
697         else
698         {
699             cubicTo(x01,y01,xc0,yc0,xctr,yctr);
700             cubicTo(xc1,yc1,x23,y23,x3,y3);
701         }
702 
703         m_fprevx = x3;
704         m_fprevy = y3;
705     }
706 
707     void closePath()
708     {
709         if ((m_prevx != m_subpx) || (m_prevy != m_subpy))
710         {
711             intLineTo(m_subpx, m_subpy);
712         }
713     }
714 
715 private:
716 
717     // internal moveTo. Note this will close any existing subpath because
718     // unclosed paths cause bad things to happen. (visually at least)
719 
720     void intMoveTo(int x, int y)
721     {
722         closePath();
723         m_prevx = x;
724         m_prevy = y;
725         m_subpx = x;
726         m_subpy = y;
727     }
728 
729     // internal lineTo, clips and adds the line to edge buckets and clip
730     // buffers as appropriate
731 
732     void intLineTo(int x, int y)
733     {
734         // mixin for adding edges. For some reason LDC wouldnt inline this when
735         // it was a separate function, and it was 15% slower that way
736 
737         string addEdgeM(string x0, string y0, string x1, string y1, string dir)
738         {
739             string tmp = (dir == "+") ? (y1~"-"~y0) : (y0~"-"~y1);
740             return
741                 "Edge* edge = m_edgepool.allocate();" ~
742                 "edge.dx = cast(long) (fpDXScale * ("~x1~"-"~x0~") / ("~y1~"-"~y0~"));" ~
743                 "edge.x = (cast(long) "~x0~") << 32;" ~
744                 "edge.y = "~y0~";" ~
745                 "edge.y2 = "~y1~";" ~
746                 "int by = "~y0~" >> fpFracBits;" ~
747                 "int xxx = max(abs("~x1~"-"~x0~"),1);" ~
748                 "edge.dy = cast(long) (fpDYScale * ("~tmp~") /  xxx);" ~
749                 "edge.next = m_buckets[by];" ~
750                 "m_buckets[by] = edge;";
751         }
752 
753         // mixin for clip accumulator
754 
755         string addToClip(string y0, string y1, string side, string dir)
756         {
757             return
758                 "{ int i0 = "~y0~" >> fpFracBits;" ~
759                 "int f0 = ("~y0~" & 0xFF) << 7;" ~
760                 "int i1 = "~y1~" >> fpFracBits;" ~
761                 "int f1 = ("~y1~" & 0xFF) << 7;" ~
762                 "m_clipbfr_"~side~"[i0] "~dir~"= 32768-f0;" ~
763                 "m_clipbfr_"~side~"[i0+1] "~dir~"= f0;" ~
764                 "m_clipbfr_"~side~"[i1] "~dir~"= f1-32768;" ~
765                 "m_clipbfr_"~side~"[i1+1] "~dir~"= -f1; }";
766         }
767 
768         // handle upward and downward lines seperately
769 
770         if (m_prevy < y)
771         {
772             int x0 = m_prevx, y0 = m_prevy, x1 = x, y1 = y;
773 
774             // edge is outside clip box or horizontal
775 
776             if ((y0 == y1) || (y0 >= m_clipbottom) || (y1 <= m_cliptop))
777             {
778                 goto finished;
779             }
780 
781             // clip to top and bottom
782 
783             if (y0 < m_cliptop)
784             {
785                 x0 = x0 + MulDiv64(m_cliptop - y0, x1 - x0,  y1 - y0);
786                 y0 = m_cliptop;
787             }
788 
789             if (y1 > m_clipbottom)
790             {
791                 x1 = x0 + MulDiv64(m_clipbottom - y0, x1 - x0, y1 - y0);
792                 y1 = m_clipbottom;
793             }
794 
795             // track y extent
796 
797             if (y0 < m_yrmin) m_yrmin = y0;
798             if (y1 > m_yrmax) m_yrmax = y1;
799 
800             // generate horizontal zoning flags, these are set depending on where
801             // x0 and x1 are in respect of the clip box.
802 
803             uint a = cast(uint)(x0<m_clipleft);
804             uint b = cast(uint)(x0>m_clipright);
805             uint c = cast(uint)(x1<m_clipleft);
806             uint d = cast(uint)(x1>m_clipright);
807             uint flags = a | (b*2) | (c*4) | (d*8);
808 
809             if (flags == 0) // bit faster to pull no clip out front
810             {             
811                 mixin(addEdgeM("x0","y0","x1","y1","+"));
812                 goto finished;
813             }
814 
815             // note cliping here can occasionaly result in horizontals, and can
816             // ocaisionaly put a horizontal on bucket for clipbotttom, which is
817             // outside the drawable area, currently it allows it and zeros that
818             // bucket after rasterization. 
819 
820             switch (flags)
821             {
822             case (1): // 0001 --> x0 left, x1 center
823                 int sy = y0 + MulDiv64(y1 - y0, m_clipleft - x0, x1 - x0);
824                 mixin(addToClip("y0","sy","l","+"));
825                 mixin(addEdgeM("m_clipleft","sy","x1","y1","+"));
826                 break;
827             case (2): // 0010 --> x0 right, x1 center
828                 int sy = y0 + MulDiv64(y1 - y0, m_clipright - x0, x1 - x0);
829                 mixin(addToClip("y0","sy","r","+"));
830                 mixin(addEdgeM("m_clipright","sy","x1","y1","+"));
831                 break;
832             case (4): // 0100 --> x0 center, x1 left
833                 int sy = y0 + MulDiv64(y1 - y0, m_clipleft - x0, x1 - x0);
834                 mixin(addEdgeM("x0","y0","m_clipleft","sy","+"));
835                 mixin(addToClip("sy","y1","l","+"));
836                 break;
837             case (5): // 0101 --> x0 left, x1 left
838                 mixin(addToClip("y0","y1","l","+"));
839                 break;
840             case (6): // 0110 --> x0 right, x1 left
841                 int sl = y0 + MulDiv64(y1 - y0, m_clipleft - x0, x1 - x0);
842                 int sr = y0 + MulDiv64(y1 - y0, m_clipright - x0, x1 - x0);
843                 mixin(addToClip("y0","sr","r","+"));
844                 mixin(addEdgeM("m_clipright","sr","m_clipleft","sl","+"));
845                 mixin(addToClip("sl","y1","l","+"));
846                 break;
847             case (8): // 1000 --> x0 center, x1 right
848                 int sy = y0 + MulDiv64(y1 - y0, m_clipright - x0, x1 - x0);
849                 mixin(addEdgeM("x0","y0","m_clipright","sy","+"));
850                 mixin(addToClip("sy","y1","r","+"));
851                 break;
852             case (9): // 1001 --> x0 left, x1 right
853                 int sl = y0 + MulDiv64(y1 - y0, m_clipleft - x0, x1 - x0);
854                 int sr = y0 + MulDiv64(y1 - y0, m_clipright - x0, x1 - x0);
855                 mixin(addToClip("y0","sl","l","+"));
856                 mixin(addEdgeM("m_clipleft","sl","m_clipright","sr","+"));
857                 mixin(addToClip("sr","y1","r","+"));
858                 break;
859             case (10): // 1001 --> x0 right, x1 right
860                 mixin(addToClip("y0","y1","r","+"));
861                 break;
862             default: // everything else is NOP
863                 break; 
864             }
865         }
866         else
867         {
868             int x1 = m_prevx, y1 = m_prevy, x0 = x, y0 = y;
869 
870             // edge is outside clip box or horizontal
871 
872             if ((y0 == y1) || (y0 >= m_clipbottom) || (y1 <= m_cliptop))
873             {
874                 goto finished;
875             }
876 
877             // clip to top and bottom
878 
879             if (y0 < m_cliptop)
880             {
881                 x0 = x0 + MulDiv64(m_cliptop - y0, x1 - x0,  y1 - y0);
882                 y0 = m_cliptop;
883             }
884 
885             if (y1 > m_clipbottom)
886             {
887                 x1 = x0 + MulDiv64(m_clipbottom - y0, x1 - x0, y1 - y0);
888                 y1 = m_clipbottom;
889             }
890 
891             // track y extent
892 
893             if (y0 < m_yrmin) m_yrmin = y0;
894             if (y1 > m_yrmax) m_yrmax = y1;
895 
896             // generate horizontal zoning flags, these are set depending on where
897             // x0 and x1 are in respect of the clip box.
898 
899             uint a = cast(uint)(x0<m_clipleft);
900             uint b = cast(uint)(x0>m_clipright);
901             uint c = cast(uint)(x1<m_clipleft);
902             uint d = cast(uint)(x1>m_clipright);
903             uint flags = a | (b*2) | (c*4) | (d*8);
904 
905             if (flags == 0) // bit faster to pull no clip out front
906             {             
907                 mixin(addEdgeM("x0","y0","x1","y1","-"));
908                 goto finished;
909             }
910          
911             // note cliping here can occasionaly result in horizontals, and can
912             // occasionally put a horizontal on bucket for clipbotttom, which is
913             // outside the drawable area, currently it allows it and zeros that
914             // bucket after rasterization. 
915 
916             switch (flags)
917             {
918             case (1): // 0001 --> x0 left, x1 center
919                 int sy = y0 + MulDiv64(y1 - y0, m_clipleft - x0, x1 - x0);
920                 mixin(addToClip("y0","sy","l","-"));
921                 mixin(addEdgeM("m_clipleft","sy","x1","y1","-"));
922                 break;
923             case (2): // 0010 --> x0 right, x1 center
924                 int sy = y0 + MulDiv64(y1 - y0, m_clipright - x0, x1 - x0);
925                 mixin(addToClip("y0","sy","r","-"));
926                 mixin(addEdgeM("m_clipright","sy","x1","y1","-"));
927                 break;
928             case (4): // 0100 --> x0 center, x1 left
929                 int sy = y0 + MulDiv64(y1 - y0, m_clipleft - x0, x1 - x0);
930                 mixin(addEdgeM("x0","y0","m_clipleft","sy","-"));
931                 mixin(addToClip("sy","y1","l","-"));
932                 break;
933             case (5): // 0101 --> x0 left, x1 left
934                 mixin(addToClip("y0","y1","l","-"));
935                 break;
936             case (6): // 0110 --> x0 right, x1 left
937                 int sl = y0 + MulDiv64(y1 - y0, m_clipleft - x0, x1 - x0);
938                 int sr = y0 + MulDiv64(y1 - y0, m_clipright - x0, x1 - x0);
939                 mixin(addToClip("y0","sr","r","-"));
940                 mixin(addEdgeM("m_clipright","sr","m_clipleft","sl","-"));
941                 mixin(addToClip("sl","y1","l","-"));
942                 break;
943             case (8): // 1000 --> x0 center, x1 right
944                 int sy = y0 + MulDiv64(y1 - y0, m_clipright - x0, x1 - x0);
945                 mixin(addEdgeM("x0","y0","m_clipright","sy","-"));
946                 mixin(addToClip("sy","y1","r","-"));
947                 break;
948             case (9): // 1001 --> x0 left, x1 right
949                 int sl = y0 + MulDiv64(y1 - y0, m_clipleft - x0, x1 - x0);
950                 int sr = y0 + MulDiv64(y1 - y0, m_clipright - x0, x1 - x0);
951                 mixin(addToClip("y0","sl","l","-"));
952                 mixin(addEdgeM("m_clipleft","sl","m_clipright","sr","-"));
953                 mixin(addToClip("sr","y1","r","-"));
954                 break;
955             case (10): // 1001 --> x0 right, x1 right
956                 mixin(addToClip("y0","y1","r","-"));
957                 break;
958             default: // everything else is NOP
959                 break; 
960             }
961         }
962     
963     finished:
964 
965         m_prevx = x;
966         m_prevy = y;
967     }
968 
969     // edge struct
970 
971     struct Edge
972     {
973         long x, dx, dy;
974         int y, y2;
975         Edge* next;
976     }
977 
978     ArenaAllocator!(Edge,100) m_edgepool;
979 
980     // Note: the reason why it's Vec is to avoid an initialization that would reallocate down.
981 
982     Vec!(Edge*) m_buckets;
983     Vec!int m_scandelta;
984 
985     size_t m_deltamaskSize = 0;
986     size_t m_deltamaskAlloc = 0;
987     DMWord* m_deltamask;
988 
989     Vec!int m_clipbfr_l;
990     Vec!int m_clipbfr_r;
991 
992     // clip rectangle, in 24:8 fixed point
993 
994     int m_clipleft;
995     int m_cliptop;
996     int m_clipright;
997     int m_clipbottom;
998 
999     // keeps track of y extent
1000 
1001     int m_yrmin,m_yrmax;
1002 
1003     // start of current subpath, 
1004 
1005     int m_subpx,m_subpy;
1006 
1007     // previous x,y (internal coords)
1008 
1009     int m_prevx,m_prevy;
1010 
1011     // previous x,y float coords
1012 
1013     float m_fprevx,m_fprevy;
1014 
1015     // Temporary dest buffer for Porter Duff operations.
1016     Vec!ubyte m_destBuf;
1017 }
1018 
1019 // the rasterizer itself should be a reusable, small object suitable for the stack
1020 static assert(Rasterizer.sizeof < 280);
1021 
1022 private:
1023 
1024 int MulDiv64(int a, int b, int c)
1025 {
1026     return cast(int) ((cast(long) a * b) / c);
1027 }