/**
 * Implement the elliptic gradient fill style. dplug:canvas internals.
 *
 * Copyright: Copyright Chris Jones 2020.
 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 */
module dplug.canvas.ellipticalblit;

import dplug.canvas.rasterizer;
import dplug.canvas.gradient;
import dplug.canvas.misc;

/// Elliptical gradient blitter.
///
/// For each pixel, two skewed coordinates (t0 along the primary axis, t1
/// along the perpendicular axis scaled by 1/r2) are advanced incrementally
/// across the scanline; the gradient position is sqrt(t0^2 + t1^2), clamped
/// to the gradient LUT and used to look up the color.
struct EllipticalBlit
{
nothrow:
@nogc:

    /// Set up the blitter for a destination surface and gradient.
    ///
    /// Params:
    ///   pixels      = destination pixel buffer, 4 bytes per pixel
    ///   strideBytes = bytes per destination scanline
    ///   height      = destination height in scanlines, must be > 0
    ///   g           = gradient; its LUT length must be a power of 2
    ///   x0, y0      = center of the ellipse
    ///   x1, y1      = end point of the primary axis
    ///   r2          = secondary-axis scale relative to the primary axis —
    ///                 NOTE(review): inferred from the /(r2*hyp) scaling
    ///                 below; confirm the exact meaning against callers
    void init(ubyte* pixels, size_t strideBytes, int height,
        Gradient g, float x0, float y0, float x1, float y1, float r2)
    {
        assert(height > 0);
        assert(g !is null);
        assert(isPow2(g.lutLength));

        this.pixels = pixels;
        this.strideBytes = strideBytes;
        this.height = height;
        this.gradient = g;
        int lutsize = g.lutLength;

        xctr = x0;
        yctr = y0;
        float w = x1-x0;
        float h = y1-y0;
        float hyp = w*w + h*h;          // squared length of the primary axis
        if (hyp < 1.0) hyp = 1.0;       // guard against a degenerate (zero-length) axis
        // Per-pixel step of the primary-axis coordinate, scaled so that one
        // full axis length spans the whole LUT.
        xstep0 = lutsize * w / hyp;
        ystep0 = lutsize * h / hyp;
        hyp = sqrt(hyp);
        // Per-pixel step of the perpendicular coordinate; the extra division
        // by r2 stretches/squashes the gradient into an ellipse.
        xstep1 = lutsize * h / (r2*hyp);
        ystep1 = lutsize * -w / (r2*hyp);
    }

private:

    /// Blit one scanline span [x0,x1) of row y, 4 pixels at a time.
    ///
    /// delta holds per-pixel coverage deltas from the rasterizer (zeroed as
    /// they are consumed); mask is a bitmask with one bit per 4-pixel block
    /// indicating where delta contains nonzero entries. x0 and x1 must be
    /// multiples of 4. wr selects how accumulated winding is converted to
    /// coverage (NonZero vs even-odd).
    void color_blit(WindingRule wr)(int* delta, DMWord* mask, int x0, int x1, int y)
    {
        assert(x0 >= 0);
        assert(x1*4 <= strideBytes);
        assert(y >= 0);
        assert(y < height);
        assert((x0 & 3) == 0);
        assert((x1 & 3) == 0);

        // main blit variables

        int bpos = x0 / 4;                                  // current 4-pixel block
        int endbit = x1 / 4;                                // one-past-last block
        uint* dest = cast(uint*)(&pixels[y*strideBytes]);
        __m128i xmWinding = 0;                              // running winding; lane 3 carries the total
        uint* lut = gradient.getLookup.ptr;
        short lutMax = cast(short)(gradient.lutLength - 1);
        bool isopaque = false;//gradient.isOpaque — NOTE(review): opaque fast path deliberately disabled

        // XMM constants

        immutable __m128i XMZERO = 0;
        immutable __m128i XMFFFF = [0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF];
        immutable __m128i XMMSK16 = [0xFFFF,0xFFFF,0xFFFF,0xFFFF];

        // paint variables: t0/t1 are the two gradient coordinates of the
        // first pixel of the span; the xmm registers hold them for 4
        // consecutive pixels and advance by 4 pixels per iteration.

        float t0 = (bpos*4-xctr)*xstep0 + (y-yctr)*ystep0;
        __m128 xmT0 = _mm_mul_ps(_mm_set1_ps(xstep0), _mm_setr_ps(0.0f,1.0f,2.0f,3.0f));
        xmT0 = _mm_add_ps(xmT0, _mm_set1_ps(t0));
        __m128 xmStep0 = _mm_set1_ps(xstep0*4);

        float t1 = (bpos*4-xctr)*xstep1 + (y-yctr)*ystep1;
        __m128 xmT1 = _mm_mul_ps(_mm_set1_ps(xstep1), _mm_setr_ps(0.0f,1.0f,2.0f,3.0f));
        xmT1 = _mm_add_ps(xmT1, _mm_set1_ps(t1));
        __m128 xmStep1 = _mm_set1_ps(xstep1*4);

        // main loop

        while (bpos < endbit)
        {
            int nsb = nextSetBit(mask, bpos, endbit);

            // do we have a span of unchanging coverage?

            if (bpos < nsb)
            {
                // Calc coverage of first pixel

                static if (wr == WindingRule.NonZero)
                {
                    int cover = xmWinding[3]+delta[bpos*4];
                    cover = abs(cover)*2;
                    if (cover > 0xFFFF) cover = 0xFFFF;
                }
                else
                {
                    int cover = xmWinding[3]+delta[bpos*4];
                    short tsc = cast(short) cover;
                    cover = (tsc ^ (tsc >> 15)) * 2;    // even-odd fold to [0,0xFFFF]
                }

                // We can skip the span

                if (cover == 0)
                {
                    // Advance the gradient coordinates past the skipped blocks.
                    __m128 tsl = _mm_set1_ps(nsb-bpos);
                    xmT0 = _mm_add_ps(xmT0, _mm_mul_ps(tsl,xmStep0));
                    xmT1 = _mm_add_ps(xmT1, _mm_mul_ps(tsl,xmStep1));
                    bpos = nsb;
                }

                // Or fill span with solid color (only when the gradient is
                // fully opaque and coverage is essentially full)

                else if (isopaque && (cover > 0xFF00))
                {
                    uint* ptr = &dest[bpos*4];
                    uint* end = ptr + ((nsb-bpos)*4);

                    while (ptr < end)
                    {
                        // gradient position = sqrt(t0^2 + t1^2), per pixel
                        __m128 rad = _mm_add_ps(_mm_mul_ps(xmT0, xmT0),_mm_mul_ps(xmT1, xmT1));
                        rad = _mm_sqrt_ps(rad);
                        xmT0 = xmT0 + xmStep0;
                        xmT1 = xmT1 + xmStep1;
                        __m128i ipos = _mm_cvttps_epi32 (rad);
                        ipos = _mm_clamp_0_to_N_epi32(ipos, lutMax);

                        ptr[0] = lut[ ipos.array[0] ];
                        ptr[1] = lut[ ipos.array[1] ];
                        ptr[2] = lut[ ipos.array[2] ];
                        ptr[3] = lut[ ipos.array[3] ];

                        ptr+=4;
                    }

                    bpos = nsb;
                }

                // Or fill span with transparent color (alpha blend with the
                // destination at constant coverage)

                else
                {
                    __m128i tqcvr = _mm_set1_epi16 (cast(ushort) cover);

                    uint* ptr = &dest[bpos*4];
                    uint* end = &dest[nsb*4];

                    while (ptr < end)
                    {
                        __m128 rad = _mm_add_ps(_mm_mul_ps(xmT0, xmT0),_mm_mul_ps(xmT1, xmT1));
                        xmT0 = xmT0 + xmStep0;
                        xmT1 = xmT1 + xmStep1;
                        rad = _mm_sqrt_ps(rad);

                        // destination pixels widened to 16 bits per channel
                        __m128i d0 = _mm_loadu_si64 (ptr);
                        d0 = _mm_unpacklo_epi8 (d0, XMZERO);
                        __m128i d1 = _mm_loadu_si64 (ptr+2);
                        d1 = _mm_unpacklo_epi8 (d1, XMZERO);

                        __m128i ipos = _mm_cvttps_epi32 (rad);
                        ipos = _mm_clamp_0_to_N_epi32(ipos, lutMax);

                        // gradient colors, widened; alpha scaled by coverage
                        __m128i c0 = _mm_loadu_si32 (&lut[ ipos.array[0] ]);
                        __m128i tnc = _mm_loadu_si32 (&lut[ ipos.array[1] ]);
                        c0 = _mm_unpacklo_epi32 (c0, tnc);
                        c0 = _mm_unpacklo_epi8 (c0, XMZERO);
                        __m128i a0 = _mm_broadcast_alpha(c0);
                        a0 = _mm_mulhi_epu16(a0, tqcvr);

                        __m128i c1 = _mm_loadu_si32 (&lut[ ipos.array[2] ]);
                        tnc = _mm_loadu_si32 (&lut[ ipos.array[3] ]);
                        c1 = _mm_unpacklo_epi32 (c1, tnc);
                        c1 = _mm_unpacklo_epi8 (c1, XMZERO);
                        __m128i a1 = _mm_broadcast_alpha(c1);
                        a1 = _mm_mulhi_epu16(a1, tqcvr);

                        // alpha*source + dest - alpha*dest

                        c0 = _mm_mulhi_epu16 (c0,a0);
                        c1 = _mm_mulhi_epu16 (c1,a1);
                        c0 = _mm_adds_epi16 (c0,d0);
                        c1 = _mm_adds_epi16 (c1,d1);
                        d0 = _mm_mulhi_epu16 (d0,a0);
                        d1 = _mm_mulhi_epu16 (d1,a1);
                        c0 = _mm_subs_epi16 (c0, d0);
                        c1 = _mm_subs_epi16 (c1, d1);

                        d0 = _mm_packus_epi16 (c0,c1);

                        _mm_storeu_si128 (cast(__m128i*)ptr,d0);

                        ptr+=4;
                    }

                    bpos = nsb;
                }
            }

            // At this point we need to integrate scandelta

            uint* ptr = &dest[bpos*4];
            uint* end = &dest[endbit*4];
            int* dlptr = &delta[bpos*4];

            while (bpos < endbit)
            {
                __m128 rad = _mm_add_ps(_mm_mul_ps(xmT0, xmT0),_mm_mul_ps(xmT1, xmT1));
                rad = _mm_sqrt_ps(rad);

                // Integrate delta values: SIMD prefix sum of the 4 deltas,
                // plus the winding carried in from the previous block; then
                // broadcast the new running total into xmWinding and zero
                // the consumed deltas.

                __m128i tqw = _mm_loadu_si128(cast(__m128i*)dlptr);
                tqw = _mm_add_epi32(tqw, _mm_slli_si128!4(tqw));
                tqw = _mm_add_epi32(tqw, _mm_slli_si128!8(tqw));
                tqw = _mm_add_epi32(tqw, xmWinding);
                xmWinding = _mm_shuffle_epi32!255(tqw);
                _mm_storeu_si128(cast(__m128i*)dlptr,XMZERO);

                // convert grad pos to integer

                __m128i ipos = _mm_cvttps_epi32(rad);
                ipos = _mm_clamp_0_to_N_epi32(ipos, lutMax);
                xmT0 = xmT0 + xmStep0;
                xmT1 = xmT1 + xmStep1;

                // Process coverage values taking account of winding rule

                static if (wr == WindingRule.NonZero)
                {
                    __m128i tcvr = _mm_srai_epi32(tqw,31);
                    tqw = _mm_add_epi32(tcvr,tqw);
                    tqw = _mm_xor_si128(tqw,tcvr); // abs
                    tcvr = _mm_packs_epi32(tqw,XMZERO); // saturate/pack to int16
                    tcvr = _mm_slli_epi16(tcvr, 1); // << to uint16
                }
                else
                {
                    __m128i tcvr = _mm_and_si128(tqw,XMMSK16);
                    tqw = _mm_srai_epi16(tcvr,15); // mask
                    tcvr = _mm_xor_si128(tcvr,tqw); // fold in half
                    tcvr = _mm_packs_epi32(tcvr,XMZERO); // pack to int16
                    tcvr = _mm_slli_epi16(tcvr, 1); // << to uint16
                }

                // Load destination pixels

                __m128i d0 = _mm_loadu_si64 (ptr);
                d0 = _mm_unpacklo_epi8 (d0, XMZERO);
                __m128i d1 = _mm_loadu_si64 (ptr+2);
                d1 = _mm_unpacklo_epi8 (d1, XMZERO);

                // load grad colors; spread each 16-bit coverage value across
                // the 4 channels of its pixel

                tcvr = _mm_unpacklo_epi16 (tcvr, tcvr);
                __m128i tcvr2 = _mm_unpackhi_epi32 (tcvr, tcvr);
                tcvr = _mm_unpacklo_epi32 (tcvr, tcvr);

                __m128i c0 = _mm_loadu_si32 (&lut[ ipos.array[0] ]);
                __m128i tnc = _mm_loadu_si32 (&lut[ ipos.array[1] ]);
                c0 = _mm_unpacklo_epi32 (c0, tnc);
                c0 = _mm_unpacklo_epi8 (c0, XMZERO);
                __m128i a0 = _mm_broadcast_alpha(c0);
                a0 = _mm_mulhi_epu16(a0, tcvr);

                __m128i c1 = _mm_loadu_si32 (&lut[ ipos.array[2] ]);
                tnc = _mm_loadu_si32 (&lut[ ipos.array[3] ]);
                c1 = _mm_unpacklo_epi32 (c1, tnc);
                c1 = _mm_unpacklo_epi8 (c1, XMZERO);
                __m128i a1 = _mm_broadcast_alpha(c1);
                a1 = _mm_mulhi_epu16(a1, tcvr2);

                // alpha*source + dest - alpha*dest

                c0 = _mm_mulhi_epu16 (c0,a0);
                c1 = _mm_mulhi_epu16 (c1,a1);
                c0 = _mm_adds_epi16 (c0,d0);
                c1 = _mm_adds_epi16 (c1,d1);
                d0 = _mm_mulhi_epu16 (d0,a0);
                d1 = _mm_mulhi_epu16 (d1,a1);
                c0 = _mm_subs_epi16 (c0, d0);
                c1 = _mm_subs_epi16 (c1, d1);

                d0 = _mm_packus_epi16 (c0,c1);

                _mm_storeu_si128 (cast(__m128i*)ptr,d0);

                bpos++;
                ptr+=4;
                dlptr+=4;

                // If the NEXT 16 bytes of deltas (4 ints) are all zero we are
                // past the edge crossings: drop back to the span fast path.
                if (((cast(ulong*)dlptr)[0] | (cast(ulong*)dlptr)[1]) == 0) break;
            }
        }
    }

    // Member variables

    ubyte* pixels;          // destination surface, 4 bytes per pixel
    size_t strideBytes;     // bytes per destination scanline
    int height;             // destination height in scanlines
    Gradient gradient;      // color lookup source
    float xctr,yctr;        // ellipse center
    float xstep0,ystep0;    // per-pixel step of the primary-axis coordinate
    float xstep1,ystep1;    // per-pixel step of the perpendicular coordinate
}

/// Rasterizer callback: forwards to EllipticalBlit.color_blit with the
/// NonZero winding rule. userData must point to an initialized EllipticalBlit.
void doBlit_EllipticalBlit(void* userData, int* delta, DMWord* mask, int x0, int x1, int y) nothrow @nogc
{
    EllipticalBlit* cb = cast(EllipticalBlit*)userData;
    return cb.color_blit!(WindingRule.NonZero)(delta, mask, x0, x1, y);
}