/**
 * Implement the linear gradient fill style. dplug:canvas internals.
 *
 * Copyright: Copyright Chris Jones 2020.
 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 */
module dplug.canvas.linearblit;

import dplug.core.math;

import dplug.canvas.rasterizer;
import dplug.canvas.gradient;
import dplug.canvas.misc;

/*
  linear gradient blit
*/

/// Rasterizer back-end that fills spans of a scanline with a linear
/// gradient, looking colors up in a Gradient LUT and alpha-blending
/// them onto a 32-bit (4 bytes/pixel) destination surface with SSE.
struct LinearBlit
{
nothrow:
@nogc:

    /// Configure the blitter for one fill operation.
    ///
    /// Params:
    ///   pixels      = base of the destination surface (4 bytes per pixel).
    ///   strideBytes = bytes per destination row.
    ///   height      = number of rows in the destination; y passed to the
    ///                 blit must stay below this.
    ///   g           = gradient whose lookup table supplies the colors;
    ///                 its LUT length must be a power of two (asserted).
    ///   x0, y0      = point mapping to gradient position 0.
    ///   x1, y1      = point mapping to gradient position lutLength.
    void init(ubyte* pixels, size_t strideBytes, int height,
              Gradient g, float x0, float y0, float x1, float y1)
    {
        assert(height > 0);
        assert(g !is null);
        assert(isPow2(g.lutLength));

        this.pixels = pixels;
        this.strideBytes = strideBytes;
        this.height = height;
        this.gradient = g;
        int lutsize = g.lutLength;

        xctr = x0;
        yctr = y0;
        float w = x1-x0;
        float h = y1-y0;
        // Project pixel coordinates onto the gradient axis: step sizes are
        // the axis vector scaled so that moving from (x0,y0) to (x1,y1)
        // advances the LUT index by exactly lutsize.
        float hsq = w*w + h*h;
        if (hsq < 0.1) hsq = 0.1; // avoid div by zero
        xstep = lutsize * w / hsq;
        ystep = lutsize * h / hsq;
    }

private:

    /// Blit one scanline span [x0, x1) at row y.
    ///
    /// delta = per-pixel winding deltas accumulated by the rasterizer; this
    ///         routine integrates them left-to-right and zeroes them as it
    ///         goes. mask = one DMWord bitmask per 4-pixel group marking
    ///         groups that contain nonzero deltas (scanned via nextSetBit).
    /// x0/x1 must be multiples of 4 (asserted); wr selects how winding
    /// counts are folded into coverage at compile time.
    void linear_blit(WindingRule wr)(int* delta, DMWord* mask, int x0, int x1, int y)
    {
        assert(x0 >= 0);
        assert(x1*4 <= strideBytes);
        assert(y >= 0);
        assert(y < height);
        assert((x0 & 3) == 0);
        assert((x1 & 3) == 0);

        // main blit variables
        // bpos/endbit count in 4-pixel groups; dest is the row start.

        int bpos = x0 / 4;
        int endbit = x1 / 4;
        uint* dest = cast(uint*)(&pixels[y*strideBytes]);
        __m128i xmWinding = 0;
        uint* lut = gradient.getLookup.ptr;
        assert(gradient.lutLength <= short.max); // LUT can be non-power-of-2 as far as LinearBlit is concerned, but this held low interest
        short lutMax = cast(short)(gradient.lutLength - 1);

        // Opaque fast path deliberately disabled: coverage 0xFF00+ spans
        // would otherwise be copied without blending.
        bool isopaque = false;//gradient.isOpaque

        // XMM constants

        immutable __m128i XMZERO = 0;
        immutable __m128i XMFFFF = [0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF];
        immutable __m128i XMMSK16 = [0xFFFF,0xFFFF,0xFFFF,0xFFFF];

        // paint variables
        // t0 = gradient LUT position of the first pixel in the span;
        // xmT0 holds positions for 4 consecutive pixels, advanced by
        // xmStep0 (= 4 pixels worth of xstep) per iteration.

        float t0 = (bpos*4-xctr)*xstep + (y-yctr)*ystep;
        __m128 xmT0 = _mm_mul_ps(_mm_set1_ps(xstep), _mm_setr_ps(0.0f,1.0f,2.0f,3.0f));
        xmT0 = _mm_add_ps(xmT0, _mm_set1_ps(t0));
        __m128 xmStep0 = _mm_set1_ps(xstep*4);

        // main loop

        while (bpos < endbit)
        {
            int nsb = nextSetBit(mask, bpos, endbit);

            // do we have a span of unchanging coverage?

            if (bpos < nsb)
            {
                // Calc coverage of first pixel
                // (constant across the span since no deltas occur in it).
                // Coverage is scaled to 0..0xFFFF so _mm_mulhi_epu16 acts
                // as a fixed-point multiply by coverage/65536.

                static if (wr == WindingRule.NonZero)
                {
                    int cover = xmWinding[3]+delta[bpos*4];
                    cover = abs(cover)*2;
                    if (cover > 0xFFFF) cover = 0xFFFF;
                }
                else
                {
                    // EvenOdd: keep low 16 bits, fold negatives by the
                    // (x ^ (x >> 15)) absolute-value trick, then scale.
                    int cover = xmWinding[3]+delta[bpos*4];
                    short tsc = cast(short) cover;
                    cover = (tsc ^ (tsc >> 15)) * 2;
                }

                // We can skip the span entirely, but must still advance
                // the gradient position across the skipped pixels.

                if (cover == 0)
                {
                    __m128 tsl = _mm_set1_ps(nsb-bpos);
                    xmT0 = _mm_add_ps(xmT0, _mm_mul_ps(tsl,xmStep0));
                    bpos = nsb;
                }

                // Or fill span with solid color (opaque gradient, near-full
                // coverage): straight LUT copy, no blending. Currently
                // unreachable because isopaque is hard-coded false above.

                else if (isopaque && (cover > 0xFF00))
                {
                    uint* ptr = &dest[bpos*4];
                    uint* end = ptr + ((nsb-bpos)*4);

                    while (ptr < end)
                    {
                        // Gradient position -> clamped integer LUT index,
                        // 4 pixels at a time.
                        __m128i ipos = _mm_cvttps_epi32 (xmT0);
                        ipos = _mm_clamp_0_to_N_epi32(ipos, lutMax);
                        xmT0 = xmT0 + xmStep0;

                        ptr[0] = lut[ ipos.array[0] ];
                        ptr[1] = lut[ ipos.array[1] ];
                        ptr[2] = lut[ ipos.array[2] ];
                        ptr[3] = lut[ ipos.array[3] ];

                        ptr+=4;
                    }

                    bpos = nsb;
                }

                // Or blend the span using the (constant) coverage

                else
                {
                    // Coverage replicated into all 8 uint16 lanes.
                    __m128i tqcvr = _mm_set1_epi16 (cast(ushort) cover);

                    uint* ptr = &dest[bpos*4];
                    uint* end = &dest[nsb*4];

                    while (ptr < end)
                    {
                        __m128i ipos = _mm_cvttps_epi32 (xmT0);
                        ipos = _mm_clamp_0_to_N_epi32(ipos, lutMax);
                        xmT0 = xmT0 + xmStep0;

                        // Load 4 destination pixels, widen to 16-bit lanes.
                        __m128i d01 = _mm_loadu_si128(cast(__m128i*) ptr);
                        __m128i d0 = _mm_unpacklo_epi8 (d01, XMZERO);
                        __m128i d1 = _mm_unpackhi_epi8 (d01, XMZERO);

                        // Gather gradient colors for pixels 0,1; broadcast
                        // each pixel's alpha and scale it by coverage.
                        __m128i c0 = _mm_loadu_si32 (&lut[ ipos.array[0] ]);
                        __m128i tnc = _mm_loadu_si32 (&lut[ ipos.array[1] ]);
                        c0 = _mm_unpacklo_epi32 (c0, tnc);
                        c0 = _mm_unpacklo_epi8 (c0, XMZERO);
                        __m128i a0 = _mm_broadcast_alpha(c0);
                        a0 = _mm_mulhi_epu16(a0, tqcvr);

                        // Same for pixels 2,3.
                        __m128i c1 = _mm_loadu_si32 (&lut[ ipos.array[2] ]);
                        tnc = _mm_loadu_si32 (&lut[ ipos.array[3] ]);
                        c1 = _mm_unpacklo_epi32 (c1, tnc);
                        c1 = _mm_unpacklo_epi8 (c1, XMZERO);
                        __m128i a1 = _mm_broadcast_alpha(c1);
                        a1 = _mm_mulhi_epu16(a1, tqcvr);

                        // alpha*source + dest - alpha*dest

                        c0 = _mm_mulhi_epu16 (c0,a0);
                        c1 = _mm_mulhi_epu16 (c1,a1);
                        c0 = _mm_adds_epi16 (c0,d0);
                        c1 = _mm_adds_epi16 (c1,d1);
                        d0 = _mm_mulhi_epu16 (d0,a0);
                        d1 = _mm_mulhi_epu16 (d1,a1);
                        c0 = _mm_subs_epi16 (c0, d0);
                        c1 = _mm_subs_epi16 (c1, d1);

                        d0 = _mm_packus_epi16 (c0,c1);

                        _mm_storeu_si128 (cast(__m128i*)ptr,d0);

                        ptr+=4;
                    }

                    bpos = nsb;
                }
            }

            // At this point we need to integrate scandelta
            // (per-pixel coverage changes inside this 4-pixel group).

            uint* ptr = &dest[bpos*4];
            uint* end = &dest[endbit*4];
            int* dlptr = &delta[bpos*4];

            while (bpos < endbit)
            {
                // Integrate delta values: prefix-sum the 4 lanes via two
                // shifted adds, carry in the running winding from the left,
                // keep the rightmost lane as the new running winding, and
                // zero the consumed deltas.

                __m128i tqw = _mm_loadu_si128(cast(__m128i*)dlptr);
                tqw = _mm_add_epi32(tqw, _mm_slli_si128!4(tqw));
                tqw = _mm_add_epi32(tqw, _mm_slli_si128!8(tqw));
                tqw = _mm_add_epi32(tqw, xmWinding);
                xmWinding = _mm_shuffle_epi32!255(tqw);
                _mm_storeu_si128(cast(__m128i*)dlptr,XMZERO);

                // Process coverage values taking account of winding rule

                static if (wr == WindingRule.NonZero)
                {
                    __m128i tcvr = _mm_srai_epi32(tqw,31);
                    tqw = _mm_add_epi32(tcvr,tqw);
                    tqw = _mm_xor_si128(tqw,tcvr); // abs
                    tcvr = _mm_packs_epi32(tqw,XMZERO); // saturate/pack to int16
                    tcvr = _mm_slli_epi16(tcvr, 1); // << to uint16
                }
                else
                {
                    __m128i tcvr = _mm_and_si128(tqw,XMMSK16);
                    tqw = _mm_srai_epi16(tcvr,15); // mask
                    tcvr = _mm_xor_si128(tcvr,tqw); // fold in half
                    tcvr = _mm_packs_epi32(tcvr,XMZERO); // pack to int16
                    tcvr = _mm_slli_epi16(tcvr, 1); // << to uint16
                }

                // convert grad pos to integer

                __m128i ipos = _mm_cvttps_epi32 (xmT0);
                ipos = _mm_clamp_0_to_N_epi32(ipos, lutMax);
                xmT0 = xmT0 + xmStep0;

                // Load destination pixels
                __m128i d01 = _mm_loadu_si128(cast(__m128i*) ptr);
                __m128i d0 = _mm_unpacklo_epi8 (d01, XMZERO);
                __m128i d1 = _mm_unpackhi_epi8 (d01, XMZERO);

                // load grad colors
                // Expand the 4 per-pixel coverages so each pixel's 4
                // channels get that pixel's coverage (tcvr = pixels 0,1;
                // tcvr2 = pixels 2,3).

                tcvr = _mm_unpacklo_epi16 (tcvr, tcvr);
                __m128i tcvr2 = _mm_unpackhi_epi32 (tcvr, tcvr);
                tcvr = _mm_unpacklo_epi32 (tcvr, tcvr);

                __m128i c0 = _mm_loadu_si32 (&lut[ ipos.array[0] ]);
                __m128i tnc = _mm_loadu_si32 (&lut[ ipos.array[1] ]);
                c0 = _mm_unpacklo_epi32 (c0, tnc);
                c0 = _mm_unpacklo_epi8 (c0, XMZERO);
                __m128i a0 = _mm_broadcast_alpha(c0);
                a0 = _mm_mulhi_epu16(a0, tcvr);


                __m128i c1 = _mm_loadu_si32 (&lut[ ipos.array[2] ]);
                tnc = _mm_loadu_si32 (&lut[ ipos.array[3] ]);
                c1 = _mm_unpacklo_epi32 (c1, tnc);
                c1 = _mm_unpacklo_epi8 (c1, XMZERO);
                __m128i a1 = _mm_broadcast_alpha(c1);
                a1 = _mm_mulhi_epu16(a1, tcvr2);

                // alpha*source + dest - alpha*dest

                c0 = _mm_mulhi_epu16 (c0,a0);
                c1 = _mm_mulhi_epu16 (c1,a1);
                c0 = _mm_adds_epi16 (c0,d0);
                c1 = _mm_adds_epi16 (c1,d1);
                d0 = _mm_mulhi_epu16 (d0,a0);
                d1 = _mm_mulhi_epu16 (d1,a1);
                c0 = _mm_subs_epi16 (c0, d0);
                c1 = _mm_subs_epi16 (c1, d1);

                d0 = _mm_packus_epi16 (c0,c1);

                _mm_storeu_si128 (cast(__m128i*)ptr,d0);

                bpos++;
                ptr+=4;
                dlptr+=4;

                // If the next two 4-pixel delta groups are all zero, drop
                // back to the outer loop's span handling (reads 16 bytes
                // of the delta buffer as two ulongs).
                if (((cast(ulong*)dlptr)[0] | (cast(ulong*)dlptr)[1]) == 0) break;
            }
        }
    }

    // Member variables

    ubyte* pixels;        // destination surface base
    size_t strideBytes;   // destination row pitch in bytes
    int height;           // destination height in rows
    Gradient gradient;    // color source; LUT length is a power of two
    float xctr,yctr;      // gradient start point (maps to LUT index 0)
    float xstep,ystep;    // LUT index advance per pixel in x and y
}

nothrow:
@nogc:

/// Rasterizer callback: forwards to LinearBlit.linear_blit.
/// NOTE(review): always instantiates WindingRule.NonZero — presumably the
/// even-odd variant is dispatched elsewhere; verify against callers.
void doBlit_LinearBlit(void* userData, int* delta, DMWord* mask, int x0, int x1, int y)
{
    LinearBlit* lb = cast(LinearBlit*)userData;
    return lb.linear_blit!(WindingRule.NonZero)(delta, mask, x0, x1, y);
}