diff options
-rw-r--r-- | src/modules/mixer/mixer.c | 80 |
1 files changed, 58 insertions, 22 deletions
diff --git a/src/modules/mixer/mixer.c b/src/modules/mixer/mixer.c index efa7de3..d0974d9 100644 --- a/src/modules/mixer/mixer.c +++ b/src/modules/mixer/mixer.c @@ -151,32 +151,32 @@ static void mixer_prepare_frame(til_module_context_t *context, til_stream_t *str /* derived from modules/drizzle pixel_mult_scalar(), there's definitely room for optimizations */ -static inline uint32_t pixels_lerp(uint32_t a_pixel, uint32_t b_pixel, float t) +static inline uint32_t pixels_lerp(uint32_t a_pixel, uint32_t b_pixel, float one_sub_T, float T) { uint32_t pixel; float a, b; /* r */ - a = (a_pixel >> 16) & 0xff; - a *= 1.f - t; - b = (b_pixel >> 16) & 0xff; - b *= t; + a = ((uint8_t)(a_pixel >> 16)); + a *= one_sub_T; + b = ((uint8_t)(b_pixel >> 16)); + b *= T; pixel = (((uint32_t)(a+b)) << 16); /* g */ - a = (a_pixel >> 8) & 0xff; - a *= 1.f - t; - b = (b_pixel >> 8) & 0xff; - b *= t; + a = ((uint8_t)(a_pixel >> 8)); + a *= one_sub_T; + b = ((uint8_t)(b_pixel >> 8)); + b *= T; pixel |= (((uint32_t)(a+b)) << 8); /* b */ - a = a_pixel & 0xff; - a *= 1.f - t; - b = b_pixel & 0xff; - b *= t; + a = ((uint8_t)a_pixel); + a *= one_sub_T; + b = ((uint8_t)b_pixel); + b *= T; pixel |= ((uint32_t)(a+b)); @@ -195,7 +195,11 @@ static void mixer_render_fragment(til_module_context_t *context, til_stream_t *s break; case MIXER_STYLE_FADE: { - float T = ctxt->vars.T; + uint32_t *dest = fragment->buf; + til_fb_fragment_t *snapshot_a, *snapshot_b; + uint32_t *a, *b; + float T = ctxt->vars.T; + float one_sub_T = 1.f - T; if (T <= 0.f || T >= 1.f) break; @@ -203,18 +207,50 @@ static void mixer_render_fragment(til_module_context_t *context, til_stream_t *s assert(ctxt->snapshots[0]); assert(ctxt->snapshots[1]); + snapshot_a = ctxt->snapshots[0]; + snapshot_b = ctxt->snapshots[1]; + a = snapshot_a->buf + (fragment->y - snapshot_a->y) * snapshot_a->pitch + (fragment->x - snapshot_a->x); + b = snapshot_b->buf + (fragment->y - snapshot_b->y) * snapshot_b->pitch + (fragment->x - snapshot_b->x); + /* for the tweens, we already have snapshots sitting in ctxt->snapshots[], * which we now interpolate the pixels out of in parallel */ - for (int y = fragment->y; y < fragment->y + fragment->height; y++) { - for (int x = fragment->x; x < fragment->x + fragment->width; x++) { - uint32_t a_pixel = til_fb_fragment_get_pixel_unchecked(ctxt->snapshots[0], x, y); - uint32_t b_pixel = til_fb_fragment_get_pixel_unchecked(ctxt->snapshots[1], x, y); - uint32_t pixel; - - pixel = pixels_lerp(a_pixel, b_pixel, T); - til_fb_fragment_put_pixel_unchecked(fragment, 0, x, y, pixel); + for (unsigned y = 0, h = fragment->height, w = fragment->width; y < h; y++) { + unsigned x = 0; + + /* go four-wide if there's enough, note even without SSE this is a bit quicker a la unrolled loop */ + if ((w & ~3U)) { + for (; x < (w & ~3U); x += 4) { + /* TODO: explore adding a SIMD/SSE implementation, this is an ideal application for it */ + *dest = pixels_lerp(*a, *b, one_sub_T, T); + dest++; + a++; + b++; + + *dest = pixels_lerp(*a, *b, one_sub_T, T); + dest++; + a++; + b++; + + *dest = pixels_lerp(*a, *b, one_sub_T, T); + dest++; + a++; + b++; + + *dest = pixels_lerp(*a, *b, one_sub_T, T); + dest++; + a++; + b++; + } } + + /* pick up any tail pixels */ + for (; x < w; a++, b++, dest++, x++) + *dest = pixels_lerp(*a, *b, one_sub_T, T); + + a += snapshot_a->pitch - w; /* things are a little awkward because we're fragmenting a threaded render within what was snapshotted */ + b += snapshot_b->pitch - w; + dest += fragment->stride; } break; |