diff options
author | Vito Caputo <vcaputo@pengaru.com> | 2023-07-11 12:26:10 -0700 |
---|---|---|
committer | Vito Caputo <vcaputo@pengaru.com> | 2023-07-11 12:29:25 -0700 |
commit | b6362c546a41d58650859a82b91581edc5e7fe1e (patch) | |
tree | 6901b974e94808266ea968beb128a444b26ce344 /src/modules/shapes | |
parent | 02745f14a6b534f7a704ef3c512c9a0b4c999665 (diff) |
modules/shapes: cache per-pixel atan2 results
computing the angle for every pixel coordinate from the origin is
costly, even with the approximate method added by c690303.
An easy speedup is to only do this once for a given frame
dimensions, and cache those results. In the form of a 32-bit
float, it's equivalent to caching a full page of pixel data.
This is slightly complicated by needing to be an effectively
global cache and the potential for multiple shapes contexts
rendering concurrently when part of a composition.
I think this particular situation highlights a need for something
equivalent generalized on-stream where modules can register
discoverable caches of costly to compute information, having a
high probability of being useful to others.
In this particular case it was alphazed's use of shapes in two
layers that made it an obvious win, even without any other
modules needing atan2() per-pixel with a centered origin.
With this commit:
Configured settings as flags:
--seed=0x64adabae '--module=compose,layers=blank\,shapes\\\,type\\\=pinwheel\\\,scale\\\=.9\\\,pinch\\\=.25\\\,pinch_spin\\\=.1\\\,pinches\\\=7\\\,points\\\=19\\\,spin\\\=.01\,shapes\\\,type\\\=pinwheel\\\,scale\\\=.9\\\,pinch\\\=1\\\,pinch_spin\\\=-.25\\\,pinches\\\=8\\\,points\\\=5\\\,spin\\\=0,texture=moire\,centers\=3' '--video=mem,size=1366x768'
FPS: 73
FPS: 74
FPS: 73
Without:
Configured settings as flags:
--seed=0x64adb857 '--module=compose,layers=blank\,shapes\\\,type\\\=pinwheel\\\,scale\\\=.9\\\,pinch\\\=.25\\\,pinch_spin\\\=.1\\\,pinches\\\=7\\\,points\\\=19\\\,spin\\\=.01\,shapes\\\,type\\\=pinwheel\\\,scale\\\=.9\\\,pinch\\\=1\\\,pinch_spin\\\=-.25\\\,pinches\\\=8\\\,points\\\=5\\\,spin\\\=0,texture=moire\,centers\=3' '--video=mem,size=1366x768'
FPS: 55
FPS: 54
FPS: 54
So it's significant, and in alphazed there's also a transition
from one scene with two full-screen shapes layers into a
checkered scene with shapes as the fill_module. Further
amplifying the payoff — in fact, whenever shapes is used for a
fill_module in checkers, there's n_cpus shapes contexts created
because checkers is threaded. All of those would be benefitting
from the cache.
Diffstat (limited to 'src/modules/shapes')
-rw-r--r-- | src/modules/shapes/shapes.c | 272 |
1 files changed, 237 insertions, 35 deletions
diff --git a/src/modules/shapes/shapes.c b/src/modules/shapes/shapes.c index 864c1c8..3943652 100644 --- a/src/modules/shapes/shapes.c +++ b/src/modules/shapes/shapes.c @@ -50,8 +50,10 @@ */ +#include <assert.h> #include <errno.h> #include <math.h> +#include <pthread.h> #include <stdlib.h> #include <unistd.h> @@ -69,6 +71,8 @@ #define SHAPES_SPIN_BASE .0025f +typedef struct shapes_radcache_t shapes_radcache_t; + typedef enum shapes_type_t { SHAPES_TYPE_CIRCLE, SHAPES_TYPE_PINWHEEL, @@ -90,8 +94,89 @@ typedef struct shapes_setup_t { typedef struct shapes_context_t { til_module_context_t til_module_context; shapes_setup_t *setup; + shapes_radcache_t *radcache; } shapes_context_t; +struct shapes_radcache_t { + shapes_radcache_t *next, *prev; + unsigned width, height; + unsigned refcount; + unsigned initialized:1; + float rads[]; +}; + +static struct { + shapes_radcache_t *head; + pthread_mutex_t lock; +} shapes_radcache_list = { .lock = PTHREAD_MUTEX_INITIALIZER }; + + +static void * shapes_radcache_unref(shapes_radcache_t *radcache) +{ + if (!radcache) + return NULL; + + if (__sync_fetch_and_sub(&radcache->refcount, 1) == 1) { + + pthread_mutex_lock(&shapes_radcache_list.lock); + if (radcache->prev) + radcache->prev->next = radcache->next; + else + shapes_radcache_list.head = radcache->next; + + if (radcache->next) + radcache->next->prev = radcache->prev; + pthread_mutex_unlock(&shapes_radcache_list.lock); + + free(radcache); + } + + return NULL; +} + + +static shapes_radcache_t * shapes_radcache_find(unsigned width, unsigned height) +{ + shapes_radcache_t *radcache; + + pthread_mutex_lock(&shapes_radcache_list.lock); + for (radcache = shapes_radcache_list.head; radcache; radcache = radcache->next) { + if (radcache->width == width && + radcache->height == height) { + /* if we race with removal, refcount will be zero and we can't use it */ + if (!__sync_fetch_and_add(&radcache->refcount, 1)) + radcache = NULL; + break; + } + } + 
pthread_mutex_unlock(&shapes_radcache_list.lock); + + return radcache; +} + + +static shapes_radcache_t * shapes_radcache_new(unsigned width, unsigned height) +{ + size_t size = width * height; + shapes_radcache_t *radcache; + + radcache = malloc(sizeof(shapes_radcache_t) + size * sizeof(radcache->rads[0])); + assert(radcache); + radcache->initialized = 0; + radcache->width = width; + radcache->height = height; + radcache->refcount = 1; + radcache->prev = NULL; + + pthread_mutex_lock(&shapes_radcache_list.lock); + radcache->next = shapes_radcache_list.head; + if (radcache->next) + radcache->next->prev = radcache; + pthread_mutex_unlock(&shapes_radcache_list.lock); + + return radcache; +} + static til_module_context_t * shapes_create_context(const til_module_t *module, til_stream_t *stream, unsigned seed, unsigned ticks, unsigned n_cpus, til_setup_t *setup) { @@ -107,9 +192,49 @@ static til_module_context_t * shapes_create_context(const til_module_t *module, } +static void shapes_destroy_context(til_module_context_t *context) +{ + shapes_context_t *ctxt = (shapes_context_t *)context; + + shapes_radcache_unref(ctxt->radcache); +} + + static void shapes_prepare_frame(til_module_context_t *context, til_stream_t *stream, unsigned ticks, til_fb_fragment_t **fragment_ptr, til_frame_plan_t *res_frame_plan) { + *res_frame_plan = (til_frame_plan_t){ .fragmenter = til_fragmenter_slice_per_cpu }; + + /* TODO: + * I've implemented this ad-hoc here for shapes, but I think there's a case to be made that + * such caching should be generalized and added to til_stream_t in a generalized manner. + * + * So shapes should be able to just register a cache of arbitrary type and dimensions with + * some identifier which can then be discovered by shapes and others via that potentially + * well-known identifier. + * + * In a sense this is just a prototype of what part of that might look like... 
it's pretty clear + * that something like "atan2() of every pixel coordinate in a centered origin coordinate system" + * could have cached value to many modules + */ + { /* radcache maintenance */ + til_fb_fragment_t *fragment = *fragment_ptr; + shapes_context_t *ctxt = (shapes_context_t *)context; + shapes_radcache_t *radcache = ctxt->radcache; + + if (radcache && + (radcache->width != fragment->frame_width || + radcache->height != fragment->frame_height)) + radcache = ctxt->radcache = shapes_radcache_unref(radcache); + + if (!radcache) + radcache = shapes_radcache_find(fragment->frame_width, fragment->frame_height); + + if (!radcache) + radcache = shapes_radcache_new(fragment->frame_width, fragment->frame_height); + + ctxt->radcache = radcache; + } } @@ -164,6 +289,8 @@ static void shapes_render_fragment(til_module_context_t *context, til_stream_t * unsigned xskip = (fragment->x > xoff ? (fragment->x - xoff) : 0); unsigned ystart = MAX(fragment->y, yoff), yend = MIN(yoff + size, fragment->y + fragment->height); unsigned xstart = MAX(fragment->x, xoff), xend = MIN(xoff + size, fragment->x + fragment->width); + shapes_radcache_t *radcache = ctxt->radcache; + float *rads = radcache->rads; if (!fragment->cleared) { /* when {letter,pillar}boxed we need to clear the padding */ @@ -208,14 +335,27 @@ static void shapes_render_fragment(til_module_context_t *context, til_stream_t * XX = -1.f + xskip * s; X = -(size >> 1) + xskip; YYY = Y * Y; - for (unsigned x = xstart; x < xend; x++, X++, XX += s) { - float a = atan2_approx(YY, XX); - - if (YYY+X*X < r_sq * (1.f - fabsf(cosf(n_pinches * a + pinch)) * pinch_s)) - til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x, y, 0xffffffff); /* TODO: stop relying on checked for clipping */ - else if (!fragment->cleared) - til_fb_fragment_put_pixel_unchecked(fragment, 0, x, y, 0x0); - + if (!radcache->initialized) { + for (unsigned x = xstart; x < xend; x++, X++, XX += s) { + float a = rads[y * radcache->width 
+ x] = atan2_approx(YY, XX); + + if (YYY+X*X < r_sq * (1.f - fabsf(cosf(n_pinches * a + pinch)) * pinch_s)) + til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x, y, 0xffffffff); /* TODO: stop relying on checked for clipping */ + else if (!fragment->cleared) + til_fb_fragment_put_pixel_unchecked(fragment, 0, x, y, 0x0); + + } + } else { + float *rads = radcache->rads; + for (unsigned x = xstart; x < xend; x++, X++, XX += s) { + float a = rads[y * radcache->width + x]; + + if (YYY+X*X < r_sq * (1.f - fabsf(cosf(n_pinches * a + pinch)) * pinch_s)) + til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x, y, 0xffffffff); /* TODO: stop relying on checked for clipping */ + else if (!fragment->cleared) + til_fb_fragment_put_pixel_unchecked(fragment, 0, x, y, 0x0); + + } } } break; @@ -236,17 +376,32 @@ static void shapes_render_fragment(til_module_context_t *context, til_stream_t * for (unsigned y = ystart; y < yend; y++, YY += s) { XX = -1.f + xskip * s; YYYY = YY * YY; - for (unsigned x = xstart; x < xend; x++, XX += s) { - float a = atan2_approx(YY, XX); - float r = cosf(n_points * (a + spin)) * .5f + .5f; + if (!radcache->initialized) { + for (unsigned x = xstart; x < xend; x++, XX += s) { + float a = rads[y * radcache->width + x] = atan2_approx(YY, XX); + float r = cosf(n_points * (a + spin)) * .5f + .5f; - r *= 1.f - fabsf(cosf(n_pinches * (a + pinch))) * pinch_s; + r *= 1.f - fabsf(cosf(n_pinches * (a + pinch))) * pinch_s; - if (XX * XX + YYYY < r * r) - til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x, y, 0xffffffff); - else if (!fragment->cleared) - til_fb_fragment_put_pixel_unchecked(fragment, 0, x, y, 0x0); + if (XX * XX + YYYY < r * r) + til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x, y, 0xffffffff); + else if (!fragment->cleared) + til_fb_fragment_put_pixel_unchecked(fragment, 0, x, y, 0x0); + + } + } else { + for (unsigned x = xstart; x < xend; x++, XX 
+= s) { + float a = rads[y * radcache->width + x]; + float r = cosf(n_points * (a + spin)) * .5f + .5f; + r *= 1.f - fabsf(cosf(n_pinches * (a + pinch))) * pinch_s; + + if (XX * XX + YYYY < r * r) + til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x, y, 0xffffffff); + else if (!fragment->cleared) + til_fb_fragment_put_pixel_unchecked(fragment, 0, x, y, 0x0); + + } } } break; @@ -266,16 +421,29 @@ static void shapes_render_fragment(til_module_context_t *context, til_stream_t * YY = -1.f + yskip * s; Y = -(size >> 1) + yskip; for (unsigned y = ystart; y < yend; y++, Y++, YY += s) { + float *rads = radcache->rads; XX = -1.f + xskip * s; X = -(size >> 1) + xskip; - for (unsigned x = xstart; x < xend; x++, X++, XX += s) { - float rad = atan2_approx(YY, XX); - - if (abs(Y) + abs(X) < r * (1.f - fabsf(cosf(n_pinches * rad + pinch)) * pinch_s)) - til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x, y, 0xffffffff); - else if (!fragment->cleared) - til_fb_fragment_put_pixel_unchecked(fragment, 0, x, y, 0x0); - + if (!radcache->initialized) { + for (unsigned x = xstart; x < xend; x++, X++, XX += s) { + float a = rads[y * radcache->width + x] = atan2_approx(YY, XX); + + if (abs(Y) + abs(X) < r * (1.f - fabsf(cosf(n_pinches * a + pinch)) * pinch_s)) + til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x, y, 0xffffffff); + else if (!fragment->cleared) + til_fb_fragment_put_pixel_unchecked(fragment, 0, x, y, 0x0); + + } + } else { + for (unsigned x = xstart; x < xend; x++, X++, XX += s) { + float a = rads[y * radcache->width + x]; + + if (abs(Y) + abs(X) < r * (1.f - fabsf(cosf(n_pinches * a + pinch)) * pinch_s)) + til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x, y, 0xffffffff); + else if (!fragment->cleared) + til_fb_fragment_put_pixel_unchecked(fragment, 0, x, y, 0x0); + + } } } break; @@ -296,17 +464,33 @@ static void shapes_render_fragment(til_module_context_t *context, 
til_stream_t * for (unsigned y = ystart; y < yend; y++, YY += s) { XX = -1.f + xskip * s; YYYY = YY * YY; - for (unsigned x = xstart; x < xend; x++, XX += s) { - float a = atan2_approx(YY, XX); - float r = (M_2_PI * asinf(sinf(n_points * (a + spin)) * .5f + .5f)) * .5f + .5f; - /* ^^^^^^^^^^^^^^^^^^^ approximates a triangle wave */ - - r *= 1.f - fabsf(cosf(n_pinches * a + pinch)) * pinch_s; - - if (XX * XX + YYYY < r * r) - til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x, y, 0xffffffff); - else if (!fragment->cleared) - til_fb_fragment_put_pixel_unchecked(fragment, 0, x, y, 0x0); + if (!radcache->initialized) { + for (unsigned x = xstart; x < xend; x++, XX += s) { + float a = rads[y * radcache->width + x] = atan2_approx(YY, XX); + float r = (M_2_PI * asinf(sinf(n_points * (a + spin)) * .5f + .5f)) * .5f + .5f; + /* ^^^^^^^^^^^^^^^^^^^ approximates a triangle wave */ + + r *= 1.f - fabsf(cosf(n_pinches * a + pinch)) * pinch_s; + + if (XX * XX + YYYY < r * r) + til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x, y, 0xffffffff); + else if (!fragment->cleared) + til_fb_fragment_put_pixel_unchecked(fragment, 0, x, y, 0x0); + } + } else { + float *rads = radcache->rads; + for (unsigned x = xstart; x < xend; x++, XX += s) { + float a = rads[y * radcache->width + x]; + float r = (M_2_PI * asinf(sinf(n_points * (a + spin)) * .5f + .5f)) * .5f + .5f; + /* ^^^^^^^^^^^^^^^^^^^ approximates a triangle wave */ + + r *= 1.f - fabsf(cosf(n_pinches * a + pinch)) * pinch_s; + + if (XX * XX + YYYY < r * r) + til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x, y, 0xffffffff); + else if (!fragment->cleared) + til_fb_fragment_put_pixel_unchecked(fragment, 0, x, y, 0x0); + } } } break; @@ -315,6 +499,22 @@ static void shapes_render_fragment(til_module_context_t *context, til_stream_t * } +static void shapes_finish_frame(til_module_context_t *context, til_stream_t *stream, unsigned int ticks, 
til_fb_fragment_t **fragment_ptr) +{ + shapes_context_t *ctxt = (shapes_context_t *)context; + + /* XXX: note that in rendering, initialized is checked racily and it's entirely possible + * for multiple contexts to be rendering and populating the radcache when !initialized + * simultaneously... but since they'd be producing identical data for the cache anyways, + * it seems mostly harmless for now. What should probably be done is make initialized a + * tri-state that's atomically advanced towards initialized with an "initializing" mid-state + * that only one renderer can enter, then the others treat "initializing" as !radcache at all + * TODO FIXME + */ + ctxt->radcache->initialized = 1; +} + + static int shapes_setup(const til_settings_t *settings, til_setting_t **res_setting, const til_setting_desc_t **res_desc, til_setup_t **res_setup) { const char *type; @@ -560,8 +760,10 @@ static int shapes_setup(const til_settings_t *settings, til_setting_t **res_sett til_module_t shapes_module = { .create_context = shapes_create_context, + .destroy_context = shapes_destroy_context, .prepare_frame = shapes_prepare_frame, .render_fragment = shapes_render_fragment, + .finish_frame = shapes_finish_frame, .setup = shapes_setup, .name = "shapes", .description = "Procedural 2D shapes (threaded)", |