/**
* Not supported for now.
*
* Copyright: Copyright Chris Jones 2020.
* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module dplug.canvas.angularblit;

// disabled for now
/+

import dplug.canvas.rasterizer;
import dplug.canvas.gradient;
import dplug.canvas.misc;

/*
  angular gradient blit
*/

struct AngularBlit
{
    void init(uint* pixels, int stride, int height,
              Gradient g, float x0, float y0, float x1, float y1, float r2)
    {
        assert(((cast(size_t)pixels) & 15) == 0); // pixels must be 16-byte aligned
        assert((stride & 3) == 0);                // stride must be 16-byte aligned
        assert(height > 0);
        assert(g !is null);
        assert(isPow2(g.lutLength));

        this.pixels = pixels;
        this.stride = stride;
        this.height = height;
        this.gradient = g;
        int lutsize = g.lutLength;

        xctr = x0;
        yctr = y0;
        float w = x1-x0;
        float h = y1-y0;
        float hyp = w*w + h*h;
        if (hyp < 0.1) hyp = 0.1;
        xstep0 = lutsize * w / hyp;
        ystep0 = lutsize * h / hyp;
        hyp = sqrt(hyp);
        xstep1 = lutsize * h / (r2*hyp);
        ystep1 = lutsize * -w / (r2*hyp);
    }

    Blitter getBlitter(WindingRule wr)
    {
        if (wr == WindingRule.NonZero)
        {
            return &angular_blit!(WindingRule.NonZero);
        }
        else
        {
            return &angular_blit!(WindingRule.EvenOdd);
        }
    }
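    /*
       Usage sketch (added for illustration; not from the original source --
       `grad`, the surface variables and the coordinates are placeholders):

           AngularBlit ab;
           ab.init(surfacePixels, surfaceStride, surfaceHeight,
                   grad, x0, y0, x1, y1, r2);
           Blitter b = ab.getBlitter(WindingRule.NonZero);
           // the rasterizer then calls `b` once per scanline span

       (x0,y0) is the centre of the sweep, (x1,y1) sets the zero-angle
       direction, and r2 appears to scale the perpendicular axis, which
       squashes or stretches the sweep.
    */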
private:

    void angular_blit(WindingRule wr)(int* delta, DMWord* mask, int x0, int x1, int y)
    {
        assert(x0 >= 0);
        assert(x1 <= stride);
        assert(y >= 0);
        assert(y < height);
        assert((x0 & 3) == 0);
        assert((x1 & 3) == 0);

        // main blit variables

        int bpos = x0 / 4;
        int endbit = x1 / 4;
        uint* dest = &pixels[y*stride];
        __m128i xmWinding = 0;
        uint* lut = gradient.getLookup.ptr;
        uint lutmsk = gradient.lutLength - 1;
        bool isopaque = false;//gradient.isOpaque

        // XMM constants

        immutable __m128i XMZERO = 0;
        immutable __m128i XMFFFF = [0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF];
        immutable __m128i XMMSK16 = [0xFFFF,0xFFFF,0xFFFF,0xFFFF];

        // paint variables

        float t0 = (bpos*4-xctr)*xstep0 + (y-yctr)*ystep0;
        __m128 xmT0 = _mm_mul_ps(_mm_set1_ps(xstep0), _mm_setr_ps(0.0f,1.0f,2.0f,3.0f));
        xmT0 = _mm_add_ps(xmT0, _mm_set1_ps(t0));
        __m128 xmStep0 = _mm_set1_ps(xstep0*4);

        float t1 = (bpos*4-xctr)*xstep1 + (y-yctr)*ystep1;
        __m128 xmT1 = _mm_mul_ps(_mm_set1_ps(xstep1), _mm_setr_ps(0.0f,1.0f,2.0f,3.0f));
        xmT1 = _mm_add_ps(xmT1, _mm_set1_ps(t1));
        __m128 xmStep1 = _mm_set1_ps(xstep1*4);

        // main loop

        while (bpos < endbit)
        {
            int nsb = nextSetBit(mask, bpos, endbit);

            // do we have a span of unchanging coverage?

            if (bpos < nsb)
            {
                // Calc coverage of first pixel

                static if (wr == WindingRule.NonZero)
                {
                    int cover = xmWinding[3]+delta[bpos*4];
                    cover = abs(cover)*2;
                    if (cover > 0xFFFF) cover = 0xFFFF;
                }
                else
                {
                    int cover = xmWinding[3]+delta[bpos*4];
                    short tsc = cast(short) cover;
                    cover = (tsc ^ (tsc >> 15)) * 2;
                }

                // We can skip the span

                if (cover == 0)
                {
                    __m128 tsl = _mm_set1_ps(nsb-bpos);
                    xmT0 = _mm_add_ps(xmT0, _mm_mul_ps(tsl,xmStep0));
                    xmT1 = _mm_add_ps(xmT1, _mm_mul_ps(tsl,xmStep1));
                    bpos = nsb;
                }

                // Or fill span with solid color

                else if (isopaque && (cover > 0xFF00))
                {
                    uint* ptr = &dest[bpos*4];
                    uint* end = ptr + ((nsb-bpos)*4);

                    while (ptr < end)
                    {
                        __m128 grad = gradOfSorts(xmT0,xmT1);
                        __m128 poly = polyAprox(grad);
                        __m128i ipos = _mm_cvtps_epi32(poly);
                        ipos = fixupQuadrant(ipos,xmT0,xmT1);

                        xmT0 = xmT0 + xmStep0;
                        xmT1 = xmT1 + xmStep1;

                        long tlip = _mm_cvtsi128_si64 (ipos);
                        ipos = _mm_shuffle_epi32!14(ipos);
                        ptr[0] = lut[tlip & lutmsk];
                        ptr[1] = lut[(tlip >> 32) & lutmsk];
                        tlip = _mm_cvtsi128_si64 (ipos);
                        ptr[2] = lut[tlip & lutmsk];
                        ptr[3] = lut[(tlip >> 32) & lutmsk];

                        ptr+=4;
                    }

                    bpos = nsb;
                }

                // Or fill span with transparent color

                else
                {
                    __m128i tqcvr = _mm_set1_epi16 (cast(ushort) cover);

                    uint* ptr = &dest[bpos*4];
                    uint* end = &dest[nsb*4];

                    while (ptr < end)
                    {
                        __m128 grad = gradOfSorts(xmT0,xmT1);

                        __m128i d0 = _mm_loadu_si64 (ptr);
                        d0 = _mm_unpacklo_epi8 (d0, XMZERO);
                        __m128i d1 = _mm_loadu_si64 (ptr+2);
                        d1 = _mm_unpacklo_epi8 (d1, XMZERO);

                        __m128 poly = polyAprox(grad);
                        __m128i ipos = _mm_cvtps_epi32(poly);
                        ipos = fixupQuadrant(ipos,xmT0,xmT1);

                        long tlip = _mm_cvtsi128_si64 (ipos);
                        ipos = _mm_unpackhi_epi64 (ipos, ipos);

                        __m128i c0 = _mm_loadu_si32 (&lut[tlip & lutmsk]);
                        __m128i tnc = _mm_loadu_si32 (&lut[(tlip >> 32) & lutmsk]);
                        c0 = _mm_unpacklo_epi32 (c0, tnc);
                        c0 = _mm_unpacklo_epi8 (c0, XMZERO);
                        __m128i a0 = _mm_broadcast_alpha(c0);
                        a0 = _mm_mulhi_epu16(a0, tqcvr);

                        tlip = _mm_cvtsi128_si64 (ipos);

                        __m128i c1 = _mm_loadu_si32 (&lut[tlip & lutmsk]);
                        tnc = _mm_loadu_si32 (&lut[(tlip >> 32) & lutmsk]);
                        c1 = _mm_unpacklo_epi32 (c1, tnc);
                        c1 = _mm_unpacklo_epi8 (c1, XMZERO);
                        __m128i a1 = _mm_broadcast_alpha(c1);
                        a1 = _mm_mulhi_epu16(a1, tqcvr);

                        xmT0 = xmT0 + xmStep0;
                        xmT1 = xmT1 + xmStep1;

                        // alpha*source + dest - alpha*dest
                        // i.e. dest = alpha*src + (1-alpha)*dest, rearranged
                        // to avoid computing 1-alpha

                        c0 = _mm_mulhi_epu16 (c0,a0);
                        c1 = _mm_mulhi_epu16 (c1,a1);
                        c0 = _mm_adds_epi16 (c0,d0);
                        c1 = _mm_adds_epi16 (c1,d1);
                        d0 = _mm_mulhi_epu16 (d0,a0);
                        d1 = _mm_mulhi_epu16 (d1,a1);
                        c0 = _mm_subs_epi16 (c0, d0);
                        c1 = _mm_subs_epi16 (c1, d1);

                        d0 = _mm_packus_epi16 (c0,c1);

                        _mm_store_si128 (cast(__m128i*)ptr,d0);

                        ptr+=4;
                    }

                    bpos = nsb;
                }
            }

            // At this point we need to integrate scandelta

            uint* ptr = &dest[bpos*4];
            uint* end = &dest[endbit*4];
            int* dlptr = &delta[bpos*4];

            while (bpos < endbit)
            {
                __m128 grad = gradOfSorts(xmT0,xmT1);

                // Integrate delta values

                __m128i tqw = _mm_load_si128(cast(__m128i*)dlptr);
                tqw = _mm_add_epi32(tqw, _mm_slli_si128!4(tqw));
                tqw = _mm_add_epi32(tqw, _mm_slli_si128!8(tqw));
                tqw = _mm_add_epi32(tqw, xmWinding);
                xmWinding = _mm_shuffle_epi32!255(tqw);
                _mm_store_si128(cast(__m128i*)dlptr,XMZERO);

                __m128 poly = polyAprox(grad);

                // Process coverage values taking account of winding rule

                static if (wr == WindingRule.NonZero)
                {
                    __m128i tcvr = _mm_srai_epi32(tqw,31);
                    tqw = _mm_add_epi32(tcvr,tqw);
                    tqw = _mm_xor_si128(tqw,tcvr);        // abs
                    tcvr = _mm_packs_epi32(tqw,XMZERO);   // saturate/pack to int16
                    tcvr = _mm_slli_epi16(tcvr, 1);       // << to uint16
                }
                else
                {
                    __m128i tcvr = _mm_and_si128(tqw,XMMSK16);
                    tqw = _mm_srai_epi16(tcvr,15);        // mask
                    tcvr = _mm_xor_si128(tcvr,tqw);       // fold in half
                    tcvr = _mm_packs_epi32(tcvr,XMZERO);  // pack to int16
                    tcvr = _mm_slli_epi16(tcvr, 1);       // << to uint16
                }

                // convert grad pos to integer

                __m128i ipos = _mm_cvtps_epi32(poly);

                // Load destination pixels

                __m128i d0 = _mm_loadu_si64 (ptr);
                d0 = _mm_unpacklo_epi8 (d0, XMZERO);
                __m128i d1 = _mm_loadu_si64 (ptr+2);
                d1 = _mm_unpacklo_epi8 (d1, XMZERO);

                ipos = fixupQuadrant(ipos,xmT0,xmT1);

                xmT0 = xmT0 + xmStep0;
                xmT1 = xmT1 + xmStep1;

                // load grad colors

                long tlip = _mm_cvtsi128_si64 (ipos);
                ipos = _mm_unpackhi_epi64 (ipos, ipos);

                tcvr = _mm_unpacklo_epi16 (tcvr, tcvr);
                __m128i tcvr2 = _mm_unpackhi_epi32 (tcvr, tcvr);
                tcvr = _mm_unpacklo_epi32 (tcvr, tcvr);

                __m128i c0 = _mm_loadu_si32 (&lut[tlip & lutmsk]);
                __m128i tnc = _mm_loadu_si32 (&lut[(tlip >> 32) & lutmsk]);
                c0 = _mm_unpacklo_epi32 (c0, tnc);
                c0 = _mm_unpacklo_epi8 (c0, XMZERO);
                __m128i a0 = _mm_broadcast_alpha(c0);
                a0 = _mm_mulhi_epu16(a0, tcvr);

                tlip = _mm_cvtsi128_si64 (ipos);

                __m128i c1 = _mm_loadu_si32 (&lut[tlip & lutmsk]);
                tnc = _mm_loadu_si32 (&lut[(tlip >> 32) & lutmsk]);
                c1 = _mm_unpacklo_epi32 (c1, tnc);
                c1 = _mm_unpacklo_epi8 (c1, XMZERO);
                __m128i a1 = _mm_broadcast_alpha(c1);
                a1 = _mm_mulhi_epu16(a1, tcvr2);

                // alpha*source + dest - alpha*dest

                c0 = _mm_mulhi_epu16 (c0,a0);
                c1 = _mm_mulhi_epu16 (c1,a1);
                c0 = _mm_adds_epi16 (c0,d0);
                c1 = _mm_adds_epi16 (c1,d1);
                d0 = _mm_mulhi_epu16 (d0,a0);
                d1 = _mm_mulhi_epu16 (d1,a1);
                c0 = _mm_subs_epi16 (c0, d0);
                c1 = _mm_subs_epi16 (c1, d1);

                d0 = _mm_packus_epi16 (c0,c1);

                _mm_store_si128 (cast(__m128i*)ptr,d0);

                bpos++;
                ptr+=4;
                dlptr+=4;

                if (((cast(ulong*)dlptr)[0] | (cast(ulong*)dlptr)[1]) == 0) break;
            }
        }
    }

    // Member variables

    uint* pixels;
    int stride;
    int height;
    Gradient gradient;
    float xctr,yctr;
    float xstep0,ystep0;
    float xstep1,ystep1;
}
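/*
   For orientation, a scalar model (added; not in the original source, and
   the name is made up) of the per-pixel work in angular_blit: evaluate the
   two linear fields t0 and t1, turn their angle into a LUT index via the
   helpers below, then blend the looked-up color as dest = a*src + (1-a)*dest:

       uint scalarPixel(float t0, float t1, uint* lut, uint lutmsk)
       {
           __m128 vt0 = _mm_set1_ps(t0);
           __m128 vt1 = _mm_set1_ps(t1);
           __m128i pos = _mm_cvtps_epi32(polyAprox(gradOfSorts(vt0, vt1)));
           pos = fixupQuadrant(pos, vt0, vt1);
           return lut[pos[0] & lutmsk];
       }
*/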
// helpers for fast atan2
// these should be inlined by ldc

private:

immutable __m128i ABSMASK = [0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff];
immutable __m128 MINSUM = [0.001,0.001,0.001,0.001];

__m128 gradOfSorts(__m128 x, __m128 y)
{
    __m128 absx = _mm_and_ps(x, cast(__m128) ABSMASK);
    __m128 absy = _mm_and_ps(y, cast(__m128) ABSMASK);
    __m128 sum = _mm_add_ps(absx,absy);
    __m128 diff = _mm_sub_ps(absx,absy);
    sum = _mm_max_ps(sum,MINSUM);
    return diff / sum;
}
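/*
   Worked examples for gradOfSorts (added for clarity; not in the original):

       (x,y) = (1,0):  g = (1-0)/(1+0) =  1    -> angle 0
       (x,y) = (1,1):  g = (1-1)/(1+1) =  0    -> angle PI/4
       (x,y) = (0,1):  g = (0-1)/(0+1) = -1    -> angle PI/2

   So g maps the first-quadrant angle of (|x|,|y|) monotonically onto
   [1,-1]; polyAprox below converts g into an approximate scaled angle, and
   fixupQuadrant restores the sign information the abs discarded.
*/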
immutable __m128 PCOEF0 = [0.785398163f,0.785398163f,0.785398163f,0.785398163f];
immutable __m128 PCOEF1 = [0.972394341f,0.972394341f,0.972394341f,0.972394341f];
immutable __m128 PCOEF3 = [0.19194811f,0.19194811f,0.19194811f,0.19194811f];
immutable __m128 PSCALE = [128.0f / 3.142f,128.0f / 3.142f,128.0f / 3.142f,128.0f / 3.142f];

__m128 polyAprox(__m128 g)
{
    __m128 sqr = g*g;
    __m128 p3 = PCOEF3*g;
    __m128 p1 = PCOEF1*g;
    __m128 poly = PCOEF0 - p1 + p3*sqr;
    return poly * PSCALE;
}

__m128i fixupQuadrant(__m128i ipos, __m128 t0, __m128 t1)
{
    __m128i xmsk = _mm_srai_epi32(cast(__m128i)t1,31);
    __m128i ymsk = _mm_srai_epi32(cast(__m128i)t0,31);
    ipos = ipos ^ (xmsk ^ ymsk);
    return ipos ^ _mm_slli_epi32(ymsk,7);
}

// test mixing in rather than inlining???

/*
string gradOfSorts(string res, string x, string y)
{
    return
        "{ __m128 absx = _mm_and_ps("~x~", ABSMASK);" ~
        "__m128 absy = _mm_and_ps("~y~", ABSMASK);" ~
        "__m128 sum = _mm_add_ps(absx,absy);" ~
        "__m128 diff = _mm_sub_ps(absx,absy);" ~
        "sum = _mm_max_ps(sum,MINSUM);" ~
        res ~ " = diff / sum; }";
}
*/

+/
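// For reference, a scalar model (added; not part of the original module) of
// the fast atan2 pipeline above: gradOfSorts folds the angle of (t0,t1) into
// a single octant-style parameter, polyAprox maps it to roughly
// angle * 128/PI, and fixupQuadrant's sign tricks recover the full circle,
// matching the 256-entry LUT that PSCALE and the shift-by-7 hardcode:
//
//     int refIndex(float t0, float t1)   // hypothetical helper
//     {
//         import std.math : atan2, PI;
//         float a = atan2(t1, t0);             // -PI .. PI
//         if (a < 0) a += 2 * PI;              // 0 .. 2*PI
//         return cast(int)(a * 128 / PI) & 255;
//     }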