diff options
author | Vito Caputo <vcaputo@pengaru.com> | 2023-07-11 12:26:10 -0700 |
---|---|---|
committer | Vito Caputo <vcaputo@pengaru.com> | 2023-07-11 12:29:25 -0700 |
commit | b6362c546a41d58650859a82b91581edc5e7fe1e (patch) | |
tree | 6901b974e94808266ea968beb128a444b26ce344 /src/modules/shapes | |
parent | 02745f14a6b534f7a704ef3c512c9a0b4c999665 (diff) |
modules/shapes: cache per-pixel atan2 results
computing the angle for every pixel coordinate from the origin is
costly, even with the approximate method added by c690303.
An easy speedup is to only do this once for a given frame
dimensions, and cache those results. In the form of a 32-bit
float, it's equivalent to caching a full page of pixel data.
This is slightly complicated by needing to be an effectively
global cache and the potential for multiple shapes contexts
rendering concurrently when part of a composition.
I think this particular situation highlights a need for something
equivalent generalized on-stream where modules can register
discoverable caches of costly to compute information, having a
high probability of being useful to others.
In this particular case it was alphazed's use of shapes in two
layers that made it an obvious win, even without any other
modules needing atan2() per-pixel with a centered origin.
With this commit:
Configured settings as flags:
--seed=0x64adabae '--module=compose,layers=blank\,shapes\\\,type\\\=pinwheel\\\,scale\\\=.9\\\,pinch\\\=.25\\\,pinch_spin\\\=.1\\\,pinches\\\=7\\\,points\\\=19\\\,spin\\\=.01\,shapes\\\,type\\\=pinwheel\\\,scale\\\=.9\\\,pinch\\\=1\\\,pinch_spin\\\=-.25\\\,pinches\\\=8\\\,points\\\=5\\\,spin\\\=0,texture=moire\,centers\=3' '--video=mem,size=1366x768'
FPS: 73
FPS: 74
FPS: 73
Without:
Configured settings as flags:
--seed=0x64adb857 '--module=compose,layers=blank\,shapes\\\,type\\\=pinwheel\\\,scale\\\=.9\\\,pinch\\\=.25\\\,pinch_spin\\\=.1\\\,pinches\\\=7\\\,points\\\=19\\\,spin\\\=.01\,shapes\\\,type\\\=pinwheel\\\,scale\\\=.9\\\,pinch\\\=1\\\,pinch_spin\\\=-.25\\\,pinches\\\=8\\\,points\\\=5\\\,spin\\\=0,texture=moire\,centers\=3' '--video=mem,size=1366x768'
FPS: 55
FPS: 54
FPS: 54
So it's significant, and in alphazed there's also a transition
from one scene with two full-screen shapes layers into a
checkered scene with shapes as the fill_module. Further
amplifying the payoff — in fact, whenever shapes is used for a
fill_module in checkers, there's n_cpus shapes contexts created
because checkers is threaded. All of those would be benefitting
from the cache.
Diffstat (limited to 'src/modules/shapes')
-rw-r--r-- | src/modules/shapes/shapes.c | 272 |
1 files changed, 237 insertions, 35 deletions
diff --git a/src/modules/shapes/shapes.c b/src/modules/shapes/shapes.c index 864c1c8..3943652 100644 --- a/src/modules/shapes/shapes.c +++ b/src/modules/shapes/shapes.c @@ -50,8 +50,10 @@ */ +#include <assert.h> #include <errno.h> #include <math.h> +#include <pthread.h> #include <stdlib.h> #include <unistd.h> @@ -69,6 +71,8 @@ #define SHAPES_SPIN_BASE .0025f +typedef struct shapes_radcache_t shapes_radcache_t; + typedef enum shapes_type_t { SHAPES_TYPE_CIRCLE, SHAPES_TYPE_PINWHEEL, @@ -90,8 +94,89 @@ typedef struct shapes_setup_t { typedef struct shapes_context_t { til_module_context_t til_module_context; shapes_setup_t *setup; + shapes_radcache_t *radcache; } shapes_context_t; +struct shapes_radcache_t { + shapes_radcache_t *next, *prev; + unsigned width, height; + unsigned refcount; + unsigned initialized:1; + float rads[]; +}; + +static struct { + shapes_radcache_t *head; + pthread_mutex_t lock; +} shapes_radcache_list = { .lock = PTHREAD_MUTEX_INITIALIZER }; + + +static void * shapes_radcache_unref(shapes_radcache_t *radcache) +{ + if (!radcache) + return NULL; + + if (__sync_fetch_and_sub(&radcache->refcount, 1) == 1) { + + pthread_mutex_lock(&shapes_radcache_list.lock); + if (radcache->prev) + radcache->prev->next = radcache->next; + else + shapes_radcache_list.head = radcache->next; + + if (radcache->next) + radcache->next->prev = radcache->prev; + pthread_mutex_unlock(&shapes_radcache_list.lock); + + free(radcache); + } + + return NULL; +} + + +static shapes_radcache_t * shapes_radcache_find(unsigned width, unsigned height) +{ + shapes_radcache_t *radcache; + + pthread_mutex_lock(&shapes_radcache_list.lock); + for (radcache = shapes_radcache_list.head; radcache; radcache = radcache->next) { + if (radcache->width == width && + radcache->height == height) { + /* if we race with removal, refcount will be zero and we can't use it */ + if (!__sync_fetch_and_add(&radcache->refcount, 1)) + radcache = NULL; + break; + } + } + 
pthread_mutex_unlock(&shapes_radcache_list.lock); + + return radcache; +} + + +static shapes_radcache_t * shapes_radcache_new(unsigned width, unsigned height) +{ + size_t size = width * height; + shapes_radcache_t *radcache; + + radcache = malloc(sizeof(shapes_radcache_t) + size * sizeof(radcache->rads[0])); + assert(radcache); + radcache->initialized = 0; + radcache->width = width; + radcache->height = height; + radcache->refcount = 1; + radcache->prev = NULL; + + pthread_mutex_lock(&shapes_radcache_list.lock); + radcache->next = shapes_radcache_list.head; + if (radcache->next) + radcache->next->prev = radcache; + pthread_mutex_unlock(&shapes_radcache_list.lock); + + return radcache; +} + static til_module_context_t * shapes_create_context(const til_module_t *module, til_stream_t *stream, unsigned seed, unsigned ticks, unsigned n_cpus, til_setup_t *setup) { @@ -107,9 +192,49 @@ static til_module_context_t * shapes_create_context(const til_module_t *module, } +static void shapes_destroy_context(til_module_context_t *context) +{ + shapes_context_t *ctxt = (shapes_context_t *)context; + + shapes_radcache_unref(ctxt->radcache); +} + + static void shapes_prepare_frame(til_module_context_t *context, til_stream_t *stream, unsigned ticks, til_fb_fragment_t **fragment_ptr, til_frame_plan_t *res_frame_plan) { + *res_frame_plan = (til_frame_plan_t){ .fragmenter = til_fragmenter_slice_per_cpu }; + + /* TODO: + * I've implemented this ad-hoc here for shapes, but I think there's a case to be made that + * such caching should be generalized and added to til_stream_t in a generalized manner. + * + * So shapes should be able to just register a cache of arbitrary type and dimensions with + * some identifier which can then be discovered by shapes and others via that potentially + * well-known identifier. + * + * In a sense this is just a prototype of what part of that might look like... 
it's pretty clear + * that something like "atan2() of every pixel coordinate in a centered origin coordinate system" + * could have cached value to many modules + */ + { /* radcache maintenance */ + til_fb_fragment_t *fragment = *fragment_ptr; + shapes_context_t *ctxt = (shapes_context_t *)context; + shapes_radcache_t *radcache = ctxt->radcache; + + if (radcache && + (radcache->width != fragment->frame_width || + radcache->height != fragment->frame_height)) + radcache = ctxt->radcache = shapes_radcache_unref(radcache); + + if (!radcache) + radcache = shapes_radcache_find(fragment->frame_width, fragment->frame_height); + + if (!radcache) + radcache = shapes_radcache_new(fragment->frame_width, fragment->frame_height); + + ctxt->radcache = radcache; + } } @@ -164,6 +289,8 @@ static void shapes_render_fragment(til_module_context_t *context, til_stream_t * unsigned xskip = (fragment->x > xoff ? (fragment->x - xoff) : 0); unsigned ystart = MAX(fragment->y, yoff), yend = MIN(yoff + size, fragment->y + fragment->height); unsigned xstart = MAX(fragment->x, xoff), xend = MIN(xoff + size, fragment->x + fragment->width); + shapes_radcache_t *radcache = ctxt->radcache; + float *rads = radcache->rads; if (!fragment->cleared) { /* when {letter,pillar}boxed we need to clear the padding */ @@ -208,14 +335,27 @@ static void shapes_render_fragment(til_module_context_t *context, til_stream_t * XX = -1.f + xskip * s; X = -(size >> 1) + xskip; YYY = Y * Y; - for (unsigned x = xstart; x < xend; x++, X++, XX += s) { - float a = atan2_approx(YY, XX); - - if (YYY+X*X < r_sq * (1.f - fabsf(cosf(n_pinches * a + pinch)) * pinch_s)) - til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x, y, 0xffffffff); /* TODO: stop relying on checked for clipping */ - else if (!fragment->cleared) - til_fb_fragment_put_pixel_unchecked(fragment, 0, x, y, 0x0); - + if (!radcache->initialized) { + for (unsigned x = xstart; x < xend; x++, X++, XX += s) { + float a = rads[y * radcache->width 
+ x] = atan2_approx(YY, XX); + + if (YYY+X*X < r_sq * (1.f - fabsf(cosf(n_pinches * a + pinch)) * pinch_s)) + til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x, y, 0xffffffff); /* TODO: stop relying on checked for clipping */ + else if (!fragment->cleared) + til_fb_fragment_put_pixel_unchecked(fragment, 0, x, y, 0x0); + + } + } else { + float *rads = radcache->rads; + for (unsigned x = xstart; x < xend; x++, X++, XX += s) { + float a = rads[y * radcache->width + x]; + + if (YYY+X*X < r_sq * (1.f - fabsf(cosf(n_pinches * a + pinch)) * pinch_s)) + til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x, y, 0xffffffff); /* TODO: stop relying on checked for clipping */ + else if (!fragment->cleared) + til_fb_fragment_put_pixel_unchecked(fragment, 0, x, y, 0x0); + + } } } break; @@ -236,17 +376,32 @@ static void shapes_render_fragment(til_module_context_t *context, til_stream_t * for (unsigned y = ystart; y < yend; y++, YY += s) { XX = -1.f + xskip * s; YYYY = YY * YY; - for (unsigned x = xstart; x < xend; x++, XX += s) { - float a = atan2_approx(YY, XX); - float r = cosf(n_points * (a + spin)) * .5f + .5f; + if (!radcache->initialized) { + for (unsigned x = xstart; x < xend; x++, XX += s) { + float a = rads[y * radcache->width + x] = atan2_approx(YY, XX); + float r = cosf(n_points * (a + spin)) * .5f + .5f; - r *= 1.f - fabsf(cosf(n_pinches * (a + pinch))) * pinch_s; + r *= 1.f - fabsf(cosf(n_pinches * (a + pinch))) * pinch_s; - if (XX * XX + YYYY < r * r) - til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x, y, 0xffffffff); - else if (!fragment->cleared) - til_fb_fragment_put_pixel_unchecked(fragment, 0, x, y, 0x0); + if (XX * XX + YYYY < r * r) + til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x, y, 0xffffffff); + else if (!fragment->cleared) + til_fb_fragment_put_pixel_unchecked(fragment, 0, x, y, 0x0); + + } + } else { + for (unsigned x = xstart; x < xend; x++, XX 
+= s) { + float a = rads[y * radcache->width + x]; + float r = cosf(n_points * (a + spin)) * .5f + .5f; + r *= 1.f - fabsf(cosf(n_pinches * (a + pinch))) * pinch_s; + + if (XX * XX + YYYY < r * r) + til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x, y, 0xffffffff); + else if (!fragment->cleared) + til_fb_fragment_put_pixel_unchecked(fragment, 0, x, y, 0x0); + + } } } break; @@ -266,16 +421,29 @@ static void shapes_render_fragment(til_module_context_t *context, til_stream_t * YY = -1.f + yskip * s; Y = -(size >> 1) + yskip; for (unsigned y = ystart; y < yend; y++, Y++, YY += s) { + float *rads = radcache->rads; XX = -1.f + xskip * s; X = -(size >> 1) + xskip; - for (unsigned x = xstart; x < xend; x++, X++, XX += s) { - float rad = atan2_approx(YY, XX); - - if (abs(Y) + abs(X) < r * (1.f - fabsf(cosf(n_pinches * rad + pinch)) * pinch_s)) - til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x, y, 0xffffffff); - else if (!fragment->cleared) - til_fb_fragment_put_pixel_unchecked(fragment, 0, x, y, 0x0); - + if (!radcache->initialized) { + for (unsigned x = xstart; x < xend; x++, X++, XX += s) { + float a = rads[y * radcache->width + x] = atan2_approx(YY, XX); + + if (abs(Y) + abs(X) < r * (1.f - fabsf(cosf(n_pinches * a + pinch)) * pinch_s)) + til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x, y, 0xffffffff); + else if (!fragment->cleared) + til_fb_fragment_put_pixel_unchecked(fragment, 0, x, y, 0x0); + + } + } else { + for (unsigned x = xstart; x < xend; x++, X++, XX += s) { + float a = rads[y * radcache->width + x]; + + if (abs(Y) + abs(X) < r * (1.f - fabsf(cosf(n_pinches * a + pinch)) * pinch_s)) + til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x, y, 0xffffffff); + else if (!fragment->cleared) + til_fb_fragment_put_pixel_unchecked(fragment, 0, x, y, 0x0); + + } } } break; @@ -296,17 +464,33 @@ static void shapes_render_fragment(til_module_context_t *context, 
til_stream_t * for (unsigned y = ystart; y < yend; y++, YY += s) { XX = -1.f + xskip * s; YYYY = YY * YY; - for (unsigned x = xstart; x < xend; x++, XX += s) { - float a = atan2_approx(YY, XX); - float r = (M_2_PI * asinf(sinf(n_points * (a + spin)) * .5f + .5f)) * .5f + .5f; - /* ^^^^^^^^^^^^^^^^^^^ approximates a triangle wave */ - - r *= 1.f - fabsf(cosf(n_pinches * a + pinch)) * pinch_s; - - if (XX * XX + YYYY < r * r) - til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x, y, 0xffffffff); - else if (!fragment->cleared) - til_fb_fragment_put_pixel_unchecked(fragment, 0, x, y, 0x0); + if (!radcache->initialized) { + for (unsigned x = xstart; x < xend; x++, XX += s) { + float a = rads[y * radcache->width + x] = atan2_approx(YY, XX); + float r = (M_2_PI * asinf(sinf(n_points * (a + spin)) * .5f + .5f)) * .5f + .5f; + /* ^^^^^^^^^^^^^^^^^^^ approximates a triangle wave */ + + r *= 1.f - fabsf(cosf(n_pinches * a + pinch)) * pinch_s; + + if (XX * XX + YYYY < r * r) + til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x, y, 0xffffffff); + else if (!fragment->cleared) + til_fb_fragment_put_pixel_unchecked(fragment, 0, x, y, 0x0); + } + } else { + float *rads = radcache->rads; + for (unsigned x = xstart; x < xend; x++, XX += s) { + float a = rads[y * radcache->width + x]; + float r = (M_2_PI * asinf(sinf(n_points * (a + spin)) * .5f + .5f)) * .5f + .5f; + /* ^^^^^^^^^^^^^^^^^^^ approximates a triangle wave */ + + r *= 1.f - fabsf(cosf(n_pinches * a + pinch)) * pinch_s; + + if (XX * XX + YYYY < r * r) + til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x, y, 0xffffffff); + else if (!fragment->cleared) + til_fb_fragment_put_pixel_unchecked(fragment, 0, x, y, 0x0); + } } } break; @@ -315,6 +499,22 @@ static void shapes_render_fragment(til_module_context_t *context, til_stream_t * } +static void shapes_finish_frame(til_module_context_t *context, til_stream_t *stream, unsigned int ticks, 
til_fb_fragment_t **fragment_ptr) +{ + shapes_context_t *ctxt = (shapes_context_t *)context; + + /* XXX: note that in rendering, initialized is checked racily and it's entirely possible + * for multiple contexts to be rendering and populating the radcache when !initialized + * simultaneously... but since they'd be producing identical data for the cache anyways, + * it seems mostly harmless for now. What should probably be done is make initialized a + * tri-state that's atomically advanced towards initialized with an "initializing" mid-state + * that only one renderer can enter, then the others treat "initializing" as !radcache at all + * TODO FIXME + */ + ctxt->radcache->initialized = 1; +} + + static int shapes_setup(const til_settings_t *settings, til_setting_t **res_setting, const til_setting_desc_t **res_desc, til_setup_t **res_setup) { const char *type; @@ -560,8 +760,10 @@ static int shapes_setup(const til_settings_t *settings, til_setting_t **res_sett til_module_t shapes_module = { .create_context = shapes_create_context, + .destroy_context = shapes_destroy_context, .prepare_frame = shapes_prepare_frame, .render_fragment = shapes_render_fragment, + .finish_frame = shapes_finish_frame, .setup = shapes_setup, .name = "shapes", .description = "Procedural 2D shapes (threaded)", |