SSE Optimized Compositing
As part of a graphics API I've been working on (for my own use; it's hardly ready for production), I decided to try learning SSE optimization by making the compositing routine faster.
I came up with an implementation that, according to my tests, is 3-5x faster than the non-optimized version. The routine composites a single color onto a destination pixel buffer for a specified run of pixels, using source-over compositing on an RGBA, 8bpc, integer pixel buffer.
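Per channel, the blend being performed works out to dst = comp + dst - dst*alpha/255 (source over, with the color treated as premultiplied by its alpha). The scalar sketch below spells that out, using the same "shift right by 8 plus a conditional +1" approximation of the division by 255 that the optimized routine relies on; the helper name is mine and not part of the API.

void composite_pixel_scalar(unsigned char *dst, const unsigned char *comp)
{
    for (int c = 0; c < 4; c++) {
        /* (dst * alpha) >> 8, plus 1 when dst is non-zero, approximates dst * alpha / 255 */
        unsigned short scaled = ((unsigned short)dst[c] * comp[3]) >> 8;
        dst[c] = comp[c] + dst[c] - (dst[c] > 0 ? scaled + 1 : 0);
    }
}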
The optimized version is presented below:
/* All-zero vector */
static __m128i zero  = _mm_set_epi32(0x0, 0x0, 0x0, 0x0);
/* 1 in each 16-bit lane */
static __m128i one   = _mm_set_epi32(0x00010001U, 0x00010001U, 0x00010001U, 0x00010001U);
/* Selects the even bytes (R and B) of each pixel */
static __m128i mask1 = _mm_set_epi32(0x00FF00FFU, 0x00FF00FFU, 0x00FF00FFU, 0x00FF00FFU);
/* Selects the odd bytes (G and A) of each pixel */
static __m128i mask2 = _mm_set_epi32(0xFF00FF00U, 0xFF00FF00U, 0xFF00FF00U, 0xFF00FF00U);
if (_comp[3] == 255) {
    /* We're compositing a fully opaque pixel; just copy it onto the destination */
    unsigned int *dst = (unsigned int *)_dst;
    /* The pixel offset from dst */
    size_t i = 0;
    /* The number of quad-pixels processed */
    size_t j = 0;
    /* Leading 0..3 pixels, until the destination is 16-byte aligned */
    while (run > 0 && ((intptr_t)(dst + i) & (intptr_t)0xF) != 0) {
        *(dst + i) = *(unsigned int *)_comp;
        i++; run--;
    }
    /* Start at pixel i */
    __m128i *mdst = (__m128i *)(dst + i);
    /* Store 4-pixel chunks */
    for (j = 0; j < (run >> 2); j++) {
        _mm_prefetch(mdst + 1, _MM_HINT_T0);
        _mm_store_si128(mdst++, _src);
    }
    /* If we couldn't get all of the run in 4-pixel chunks, finish the tail now */
    for (size_t k = 0; k < run % 4; k++) {
        *(dst + i + k + (j << 2)) = *(unsigned int *)_comp;
    }
} else {
    /* We need full composition as this pixel has transparency */
    unsigned char *dst = (unsigned char *)_dst;
    /* Byte offset from dst */
    size_t i = 0;
    /* Quad-pixels processed */
    size_t j = 0;
    /* Composite the non-aligned leading 0..3 pixels */
    while (run > 0 && ((intptr_t)(dst + i) & (intptr_t)0xF) != 0) {
        dst[0 + i] = _comp[0] + dst[0 + i] -
            (dst[0 + i] > 0 ? ((((unsigned short)dst[0 + i] * _comp[3]) >> 8) + 1) : 0); // R
        dst[1 + i] = _comp[1] + dst[1 + i] -
            (dst[1 + i] > 0 ? ((((unsigned short)dst[1 + i] * _comp[3]) >> 8) + 1) : 0); // G
        dst[2 + i] = _comp[2] + dst[2 + i] -
            (dst[2 + i] > 0 ? ((((unsigned short)dst[2 + i] * _comp[3]) >> 8) + 1) : 0); // B
        dst[3 + i] = _comp[3] + dst[3 + i] -
            (dst[3 + i] > 0 ? ((((unsigned short)dst[3 + i] * _comp[3]) >> 8) + 1) : 0); // A
        i += 4; run--;
    }
    /* Point at the first aligned 4-pixel block */
    __m128i *mdst = (__m128i *)(dst + i);
    __m128i d, d1, d2, res;
    for (; j < (run >> 2); j++) {
        /* Load 4 pixels */
        d = _mm_load_si128(mdst);
        /* Take bytes 0,2,4,6,8,10,12,14 (R and B) and composite */
        d1 = _mm_and_si128(d, mask1);
        d1 = _mm_add_epi16(_mm_subs_epi16(d1, _mm_add_epi16(
                 _mm_srli_epi16(_mm_mullo_epi16(d1, _c3), 8),
                 _mm_and_si128(_mm_cmpgt_epi16(d1, zero), one))), _ce);
        /* Take bytes 1,3,5,7,9,11,13,15 (G and A) and composite */
        d2 = _mm_srli_epi16(_mm_and_si128(d, mask2), 8);
        /* Prefetch the next 4 pixels */
        _mm_prefetch(mdst + 1, _MM_HINT_T0);
        d2 = _mm_add_epi16(_mm_subs_epi16(d2, _mm_add_epi16(
                 _mm_srli_epi16(_mm_mullo_epi16(d2, _c3), 8),
                 _mm_and_si128(_mm_cmpgt_epi16(d2, zero), one))), _co);
        /* Shift the odd bytes (d2) back to the high 8 bits and OR with d1 */
        res = _mm_or_si128(d1, _mm_slli_epi16(d2, 8));
        /* Store 4 pixels */
        _mm_store_si128(mdst, res);
        mdst++;
    }
    /* Composite the trailing 0..3 pixels */
    dst += i + (j << 4);
    for (size_t k = 0; k < (run % 4) * 4; k += 4) {
        dst[0 + k] = _comp[0] + dst[0 + k] -
            (dst[0 + k] > 0 ? ((((unsigned short)dst[0 + k] * _comp[3]) >> 8) + 1) : 0); // R
        dst[1 + k] = _comp[1] + dst[1 + k] -
            (dst[1 + k] > 0 ? ((((unsigned short)dst[1 + k] * _comp[3]) >> 8) + 1) : 0); // G
        dst[2 + k] = _comp[2] + dst[2 + k] -
            (dst[2 + k] > 0 ? ((((unsigned short)dst[2 + k] * _comp[3]) >> 8) + 1) : 0); // B
        dst[3 + k] = _comp[3] + dst[3 + k] -
            (dst[3 + k] > 0 ? ((((unsigned short)dst[3 + k] * _comp[3]) >> 8) + 1) : 0); // A
    }
}
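The core trick in the vector loop is to split each 16-byte block into its even-indexed bytes (R and B, kept in the low half of every 16-bit lane by mask1) and its odd-indexed bytes (G and A, masked by mask2 and shifted down), blend each half with 16-bit arithmetic, and then merge the halves back together with a shift and an OR. A stripped-down sketch of just that data movement, with the blend left out (the function name is mine), looks like this:

#include <emmintrin.h>

/* Split 16 interleaved RGBA bytes into even-byte (R,B) and odd-byte (G,A)
 * 16-bit lanes, then recombine them. With no arithmetic in between, the
 * output equals the input; the blend would operate on d1 and d2 here. */
static __m128i split_recombine(__m128i d)
{
    __m128i mask1 = _mm_set1_epi32(0x00FF00FF);                /* even bytes */
    __m128i mask2 = _mm_set1_epi32((int)0xFF00FF00);           /* odd bytes  */
    __m128i d1 = _mm_and_si128(d, mask1);                      /* R,B in the low byte of each lane */
    __m128i d2 = _mm_srli_epi16(_mm_and_si128(d, mask2), 8);   /* G,A shifted into the low byte    */
    /* ... per-channel 16-bit blend would go here ... */
    return _mm_or_si128(d1, _mm_slli_epi16(d2, 8));
}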
_c3 is a vector composed solely of the alpha value of the color.
_ce is a vector composed of pairs of the red and blue components.
_co is a vector composed of pairs of the green and alpha components.
_src is a vector of the composite color replicated across four pixels.
_c3, _ce, _co, and _src are set from the following function:
unsigned short r = _comp[0], g = _comp[1], b = _comp[2], a = _comp[3];
_c3  = _mm_setr_epi16(a, a, a, a, a, a, a, a);
_ce  = _mm_setr_epi16(r, b, r, b, r, b, r, b);
_co  = _mm_setr_epi16(g, a, g, a, g, a, g, a);
_src = _mm_set_epi32(*(int *)_comp, *(int *)_comp, *(int *)_comp, *(int *)_comp);
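One way to sanity-check the vector math against the scalar formula is to run both over a single 16-byte block and compare the results. The sketch below does that for an arbitrary test color and destination; the values, the use of unaligned loads/stores, and the standalone main() are assumptions for illustration, not part of the original code.

#include <emmintrin.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    /* Arbitrary test data: a half-transparent premultiplied red over 4 pixels */
    unsigned char comp[4] = { 128, 0, 0, 128 };
    unsigned char dst[16], ref[16];
    for (int i = 0; i < 16; i++)
        dst[i] = (unsigned char)(i * 13 + 7);
    memcpy(ref, dst, 16);

    /* Scalar reference: the same formula as the head/tail loops above */
    for (int i = 0; i < 16; i++)
        ref[i] = comp[i & 3] + ref[i] -
            (ref[i] > 0 ? ((((unsigned short)ref[i] * comp[3]) >> 8) + 1) : 0);

    /* Vector version of the same block, mirroring the main loop body */
    unsigned short r = comp[0], g = comp[1], b = comp[2], a = comp[3];
    __m128i _c3   = _mm_setr_epi16(a, a, a, a, a, a, a, a);
    __m128i _ce   = _mm_setr_epi16(r, b, r, b, r, b, r, b);
    __m128i _co   = _mm_setr_epi16(g, a, g, a, g, a, g, a);
    __m128i zero  = _mm_setzero_si128();
    __m128i one   = _mm_set1_epi16(1);
    __m128i mask1 = _mm_set1_epi32(0x00FF00FF);
    __m128i mask2 = _mm_set1_epi32((int)0xFF00FF00);

    __m128i d  = _mm_loadu_si128((__m128i *)dst);
    __m128i d1 = _mm_and_si128(d, mask1);
    d1 = _mm_add_epi16(_mm_subs_epi16(d1, _mm_add_epi16(
             _mm_srli_epi16(_mm_mullo_epi16(d1, _c3), 8),
             _mm_and_si128(_mm_cmpgt_epi16(d1, zero), one))), _ce);
    __m128i d2 = _mm_srli_epi16(_mm_and_si128(d, mask2), 8);
    d2 = _mm_add_epi16(_mm_subs_epi16(d2, _mm_add_epi16(
             _mm_srli_epi16(_mm_mullo_epi16(d2, _c3), 8),
             _mm_and_si128(_mm_cmpgt_epi16(d2, zero), one))), _co);
    _mm_storeu_si128((__m128i *)dst, _mm_or_si128(d1, _mm_slli_epi16(d2, 8)));

    printf("%s\n", memcmp(dst, ref, 16) == 0 ? "match" : "mismatch");
    return 0;
}

With these particular values every per-channel result stays within 0..255, so the scalar and vector paths should agree exactly and the program should print "match".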