/**
* Implement the linear gradient fill style. dplug:canvas internals.
*
* Copyright: Copyright Chris Jones 2020.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module dplug.canvas.linearblit;

import dplug.core.math;

import dplug.canvas.rasterizer;
import dplug.canvas.gradient;
import dplug.canvas.misc;

/*
  linear gradient blit
*/

/// Paints spans of a scanline with a linear gradient, blending against the
/// destination using per-pixel coverage computed by the rasterizer.
struct LinearBlit
{
nothrow:
@nogc:

    /// Set up the gradient axis.
    ///
    /// Params:
    ///     g  = gradient providing the color lookup table (LUT length must be
    ///          a power of two, per the assert below).
    ///     x0, y0 = start point of the gradient axis (maps to LUT index 0).
    ///     x1, y1 = end point of the gradient axis (maps to the end of the LUT).
    ///
    /// (xstep, ystep) is the axis vector scaled so that the dot product of a
    /// pixel offset from (x0,y0) with (xstep,ystep) yields a LUT position:
    /// it reaches exactly lutsize at (x1,y1), since (w,h).(w,h) = hsq.
    void init(Gradient g, float x0, float y0, float x1, float y1)
    {
        assert(g !is null);
        assert(isPow2(g.lutLength));

        this.gradient = g;
        int lutsize = g.lutLength;

        xctr = x0;
        yctr = y0;
        float w = x1-x0;
        float h = y1-y0;
        float hsq = w*w + h*h;
        if (hsq < 0.1) hsq = 0.1; // avoid div by zero for degenerate (near-zero-length) axes
        xstep = lutsize * w / hsq;
        ystep = lutsize * h / hsq;
    }

private:

    /// Blend one scanline of the gradient into `dest`.
    ///
    /// Params:
    ///     wr    = winding rule (compile-time): NonZero or EvenOdd.
    ///     dest  = destination pixels, 32-bit per pixel (presumably BGRA/RGBA
    ///             8-bit — NOTE(review): exact channel order/premultiplication
    ///             is determined elsewhere in dplug:canvas).
    ///     delta = scandelta buffer; winding changes per pixel, zeroed as consumed.
    ///     mask  = bitmask marking which 4-pixel quads have nonzero deltas.
    ///     x0,x1 = span in pixels, both multiples of 4 (see asserts).
    ///     y     = scanline y coordinate, used to evaluate the gradient position.
    ///
    /// Pixels are processed in quads of 4; `bpos`/`endbit` index quads, not pixels.
    void linear_blit(WindingRule wr)(uint* dest, int* delta, DMWord* mask, int x0, int x1, int y)
    {
        assert(x0 >= 0);
        assert(y >= 0);
        assert((x0 & 3) == 0);
        assert((x1 & 3) == 0);

        // main blit variables

        int bpos = x0 / 4;
        int endbit = x1 / 4;
        __m128i xmWinding = 0;                // running winding totals, one per lane
        uint* lut = gradient.getLookup.ptr;   // gradient color lookup table
        assert(gradient.lutLength <= short.max); // LUT can be non-power-of-2 as far as LinearBlit is concerned, but this held low interest
        short lutMax = cast(short)(gradient.lutLength - 1);

        // Opaque fast path is currently disabled (hard-coded false); the
        // `isopaque` branch below is therefore dead code kept for future use.
        bool isopaque = false;//gradient.isOpaque

        // XMM constants

        immutable __m128i XMZERO = 0;
        immutable __m128i XMFFFF = [0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF];
        immutable __m128i XMMSK16 = [0xFFFF,0xFFFF,0xFFFF,0xFFFF];

        // paint variables

        // t0 = gradient LUT position of the first pixel of the span; the four
        // lanes of xmT0 hold positions of 4 consecutive pixels, advanced by
        // xmStep0 (= 4 * xstep) per quad.
        float t0 = (bpos*4-xctr)*xstep + (y-yctr)*ystep;
        __m128 xmT0 = _mm_mul_ps(_mm_set1_ps(xstep), _mm_setr_ps(0.0f,1.0f,2.0f,3.0f));
        xmT0 = _mm_add_ps(xmT0, _mm_set1_ps(t0));
        __m128 xmStep0 = _mm_set1_ps(xstep*4);

        // main loop

        while (bpos < endbit)
        {
            // Next quad whose deltas are set; everything before it has
            // constant coverage.
            int nsb = nextSetBit(mask, bpos, endbit);

            // do we have a span of unchanging coverage?

            if (bpos < nsb)
            {
                // Calc coverage of first pixel

                static if (wr == WindingRule.NonZero)
                {
                    int cover = xmWinding[3]+delta[bpos*4];
                    cover = abs(cover)*2;
                    if (cover > 0xFFFF) cover = 0xFFFF; // clamp to 16-bit coverage
                }
                else
                {
                    // Even-odd: keep low 16 bits and mirror negative values
                    // ((x ^ (x >> 15)) is abs for int16), then scale to 16-bit.
                    int cover = xmWinding[3]+delta[bpos*4];
                    short tsc = cast(short) cover;
                    cover = (tsc ^ (tsc >> 15)) * 2;
                }

                // We can skip the span

                if (cover == 0)
                {
                    // Advance the gradient position past the skipped quads.
                    __m128 tsl = _mm_set1_ps(nsb-bpos);
                    xmT0 = _mm_add_ps(xmT0, _mm_mul_ps(tsl,xmStep0));
                    bpos = nsb;
                }

                // Or fill span with solid color (opaque gradient, near-full
                // coverage: straight LUT copy, no blending)

                else if (isopaque && (cover > 0xFF00))
                {
                    uint* ptr = &dest[bpos*4];
                    uint* end = ptr + ((nsb-bpos)*4);

                    while (ptr < end)
                    {
                        // Convert the 4 gradient positions to clamped LUT indices.
                        __m128i ipos = _mm_cvttps_epi32 (xmT0);
                        ipos = _mm_clamp_0_to_N_epi32(ipos, lutMax);
                        xmT0 = xmT0 + xmStep0;

                        ptr[0] = lut[ ipos.array[0] ];
                        ptr[1] = lut[ ipos.array[1] ];
                        ptr[2] = lut[ ipos.array[2] ];
                        ptr[3] = lut[ ipos.array[3] ];

                        ptr+=4;
                    }

                    bpos = nsb;
                }

                // Or fill span with transparent color (blend with constant
                // coverage `cover` across the whole span)

                else
                {
                    __m128i tqcvr = _mm_set1_epi16 (cast(ushort) cover);

                    uint* ptr = &dest[bpos*4];
                    uint* end = &dest[nsb*4];

                    while (ptr < end)
                    {
                        // Gradient positions -> clamped LUT indices for 4 pixels.
                        __m128i ipos = _mm_cvttps_epi32 (xmT0);
                        ipos = _mm_clamp_0_to_N_epi32(ipos, lutMax);
                        xmT0 = xmT0 + xmStep0;

                        // Load 4 destination pixels, widen 8-bit channels to 16-bit.
                        __m128i d01 = _mm_loadu_si128(cast(__m128i*) ptr);
                        __m128i d0 = _mm_unpacklo_epi8 (d01, XMZERO);
                        __m128i d1 = _mm_unpackhi_epi8 (d01, XMZERO);

                        // Gather gradient colors for pixels 0,1; alpha scaled by coverage.
                        __m128i c0 = _mm_loadu_si32 (&lut[ ipos.array[0] ]);
                        __m128i tnc = _mm_loadu_si32 (&lut[ ipos.array[1] ]);
                        c0 = _mm_unpacklo_epi32 (c0, tnc);
                        c0 = _mm_unpacklo_epi8 (c0, XMZERO);
                        __m128i a0 = _mm_broadcast_alpha(c0);
                        a0 = _mm_mulhi_epu16(a0, tqcvr);

                        // Same for pixels 2,3.
                        __m128i c1 = _mm_loadu_si32 (&lut[ ipos.array[2] ]);
                        tnc = _mm_loadu_si32 (&lut[ ipos.array[3] ]);
                        c1 = _mm_unpacklo_epi32 (c1, tnc);
                        c1 = _mm_unpacklo_epi8 (c1, XMZERO);
                        __m128i a1 = _mm_broadcast_alpha(c1);
                        a1 = _mm_mulhi_epu16(a1, tqcvr);

                        // alpha*source + dest - alpha*dest

                        c0 = _mm_mulhi_epu16 (c0,a0);
                        c1 = _mm_mulhi_epu16 (c1,a1);
                        c0 = _mm_adds_epi16 (c0,d0);
                        c1 = _mm_adds_epi16 (c1,d1);
                        d0 = _mm_mulhi_epu16 (d0,a0);
                        d1 = _mm_mulhi_epu16 (d1,a1);
                        c0 = _mm_subs_epi16 (c0, d0);
                        c1 = _mm_subs_epi16 (c1, d1);

                        // Repack 16-bit channels to 8-bit with saturation and store.
                        d0 = _mm_packus_epi16 (c0,c1);

                        _mm_storeu_si128 (cast(__m128i*)ptr,d0);

                        ptr+=4;
                    }

                    bpos = nsb;
                }
            }

            // At this point we need to integrate scandelta

            uint* ptr = &dest[bpos*4];
            uint* end = &dest[endbit*4];
            int* dlptr = &delta[bpos*4];

            while (bpos < endbit)
            {
                // Integrate delta values: prefix-sum the 4 lanes via shifted
                // adds, add the running winding, then broadcast the last lane
                // as the carry into the next quad. Deltas are zeroed once used.

                __m128i tqw = _mm_loadu_si128(cast(__m128i*)dlptr);
                tqw = _mm_add_epi32(tqw, _mm_slli_si128!4(tqw));
                tqw = _mm_add_epi32(tqw, _mm_slli_si128!8(tqw));
                tqw = _mm_add_epi32(tqw, xmWinding);
                xmWinding = _mm_shuffle_epi32!255(tqw);
                _mm_storeu_si128(cast(__m128i*)dlptr,XMZERO);

                // Process coverage values taking account of winding rule

                static if (wr == WindingRule.NonZero)
                {
                    __m128i tcvr = _mm_srai_epi32(tqw,31);
                    tqw = _mm_add_epi32(tcvr,tqw);
                    tqw = _mm_xor_si128(tqw,tcvr);      // abs
                    tcvr = _mm_packs_epi32(tqw,XMZERO); // saturate/pack to int16
                    tcvr = _mm_slli_epi16(tcvr, 1);     // << to uint16
                }
                else
                {
                    __m128i tcvr = _mm_and_si128(tqw,XMMSK16);
                    tqw = _mm_srai_epi16(tcvr,15);      // mask
                    tcvr = _mm_xor_si128(tcvr,tqw);     // fold in half
                    tcvr = _mm_packs_epi32(tcvr,XMZERO); // pack to int16
                    tcvr = _mm_slli_epi16(tcvr, 1);     // << to uint16
                }

                // convert grad pos to integer

                __m128i ipos = _mm_cvttps_epi32 (xmT0);
                ipos = _mm_clamp_0_to_N_epi32(ipos, lutMax);
                xmT0 = xmT0 + xmStep0;

                // Load destination pixels
                __m128i d01 = _mm_loadu_si128(cast(__m128i*) ptr);
                __m128i d0 = _mm_unpacklo_epi8 (d01, XMZERO);
                __m128i d1 = _mm_unpackhi_epi8 (d01, XMZERO);

                // load grad colors

                // Expand 4 x 16-bit coverage to per-channel 16-bit words:
                // tcvr covers pixels 0,1 and tcvr2 pixels 2,3.
                tcvr = _mm_unpacklo_epi16 (tcvr, tcvr);
                __m128i tcvr2 = _mm_unpackhi_epi32 (tcvr, tcvr);
                tcvr = _mm_unpacklo_epi32 (tcvr, tcvr);

                __m128i c0 = _mm_loadu_si32 (&lut[ ipos.array[0] ]);
                __m128i tnc = _mm_loadu_si32 (&lut[ ipos.array[1] ]);
                c0 = _mm_unpacklo_epi32 (c0, tnc);
                c0 = _mm_unpacklo_epi8 (c0, XMZERO);
                __m128i a0 = _mm_broadcast_alpha(c0);
                a0 = _mm_mulhi_epu16(a0, tcvr);

                __m128i c1 = _mm_loadu_si32 (&lut[ ipos.array[2] ]);
                tnc = _mm_loadu_si32 (&lut[ ipos.array[3] ]);
                c1 = _mm_unpacklo_epi32 (c1, tnc);
                c1 = _mm_unpacklo_epi8 (c1, XMZERO);
                __m128i a1 = _mm_broadcast_alpha(c1);
                a1 = _mm_mulhi_epu16(a1, tcvr2);

                // alpha*source + dest - alpha*dest

                c0 = _mm_mulhi_epu16 (c0,a0);
                c1 = _mm_mulhi_epu16 (c1,a1);
                c0 = _mm_adds_epi16 (c0,d0);
                c1 = _mm_adds_epi16 (c1,d1);
                d0 = _mm_mulhi_epu16 (d0,a0);
                d1 = _mm_mulhi_epu16 (d1,a1);
                c0 = _mm_subs_epi16 (c0, d0);
                c1 = _mm_subs_epi16 (c1, d1);

                d0 = _mm_packus_epi16 (c0,c1);

                _mm_storeu_si128 (cast(__m128i*)ptr,d0);

                bpos++;
                ptr+=4;
                dlptr+=4;

                // If the next quad's deltas are all zero, coverage is constant
                // again — drop back to the span fast path in the outer loop.
                if (((cast(ulong*)dlptr)[0] | (cast(ulong*)dlptr)[1]) == 0) break;
            }
        }
    }

    // Member variables

    Gradient gradient;     // color lookup provider
    float xctr,yctr;       // gradient axis origin (x0,y0)
    float xstep,ystep;     // LUT position advance per pixel in x and y

}

nothrow:
@nogc:

/// Rasterizer callback: blit with the non-zero winding rule.
/// `userData` must point to an initialized LinearBlit.
void doBlit_LinearBlit_NonZero(void* userData, uint* dest, int* delta, DMWord* mask, int x0, int x1, int y)
{
    LinearBlit* lb = cast(LinearBlit*)userData;
    return lb.linear_blit!(WindingRule.NonZero)(dest, delta, mask, x0, x1, y);
}

/// Rasterizer callback: blit with the even-odd winding rule.
/// `userData` must point to an initialized LinearBlit.
void doBlit_LinearBlit_EvenOdd(void* userData, uint* dest, int* delta, DMWord* mask, int x0, int x1, int y)
{
    LinearBlit* lb = cast(LinearBlit*)userData;
    return lb.linear_blit!(WindingRule.EvenOdd)(dest, delta, mask, x0, x1, y);
}