modules/flow: implement threaded rendering

This exploits the just-added multipass rendering support. In the first pass, the flow-field is sampled and applied to the elements, with every thread operating on its own subset of the elements list. Since the flow-field sampling is all read-only, it's perfectly safe to do in parallel. Nothing is drawn in the first pass; it only updates the elements according to the flow-field. In the second pass, the elements are rendered in parallel using the slice_per_cpu fragmenter. Since the elements are kept in a simple array, with no spatial indexing, every thread must visit every element. Since the fragmenter used divides the frame into horizontal slices, every thread needing to reject elements not overlapping its region can take some shortcuts in easily identifying elements entirely outside its region. But the whole 3d->2d projection step must still be performed for every element's current position and +n_iters final position for the frame, which unfortunately involves a divide.
Nonetheless, this change improves frame rates substantially on my 2c/4t i7 X230 as benchmarked w/--video=mem,1366x768: --seed=0x64fa9508 '--module=rtv,channels=flow,duration=3,context_duration=3,caption_duration=0,log_channels=on,snow_duration=0,snow_module=none' '--video=mem,size=1366x768' rtv channel settings: 'flow,size=4,count=40000,speed=.8' FPS: 261 FPS: 265 rtv channel settings: 'flow,size=4,count=1000,speed=.9' FPS: 1153 FPS: 3204 FPS: 2934 rtv channel settings: 'flow,size=8,count=5000,speed=.9' FPS: 2923 FPS: 1634 FPS: 1592 rtv channel settings: 'flow,size=2,count=50000,speed=.4' FPS: 1006 FPS: 219 FPS: 268 rtv channel settings: 'flow,size=16,count=30000,speed=.8' FPS: 304 FPS: 350 FPS: 343 rtv channel settings: 'flow,size=16,count=30000,speed=.02' FPS: 379 FPS: 503 FPS: 472 rtv channel settings: 'flow,size=8,count=1000,speed=.16' FPS: 1393 FPS: 3822 FPS: 3876 --- Prior to this commit: --seed=0x64fa9508 '--module=rtv,channels=flow,duration=3,context_duration=3,caption_duration=0,log_channels=on,snow_duration=0,snow_module=none' '--video=mem,size=1366x768' rtv channel settings: 'flow,size=4,count=40000,speed=.8' FPS: 53 FPS: 53 rtv channel settings: 'flow,size=4,count=1000,speed=.9' FPS: 426 FPS: 1366 FPS: 1335 rtv channel settings: 'flow,size=8,count=5000,speed=.9' FPS: 1097 FPS: 368 FPS: 367 rtv channel settings: 'flow,size=2,count=50000,speed=.4' FPS: 279 FPS: 73 FPS: 74 rtv channel settings: 'flow,size=16,count=30000,speed=.8' FPS: 71 FPS: 71 FPS: 70 rtv channel settings: 'flow,size=16,count=30000,speed=.02' FPS: 136 FPS: 305 FPS: 305 rtv channel settings: 'flow,size=8,count=1000,speed=.16' FPS: 972 FPS: 2593 FPS: 2634
author: Vito Caputo <vcaputo@pengaru.com> 2023-09-04 20:45:42 -0700
committer: Vito Caputo <vcaputo@pengaru.com> 2023-09-04 20:54:50 -0700
commit: 6d6c14180096fc4417aeecd489146e0f6fe6f721 (patch)
tree: 9dcc9ac0e90eb2b369a6651559b8a086203e480b
parent: bab16b070f95687ec11d43e55f3dd6e69f96a576 (diff)
1 files changed, 211 insertions, 52 deletions
diff --git a/src/modules/flow/flow.c b/src/modules/flow/flow.c
index 936bafe..ddbae52 100644
--- a/src/modules/flow/flow.c
+++ b/src/modules/flow/flow.c
@@ -17,8 +17,8 @@
 /* Copyright (C) 2017 Vito Caputo <vcaputo@pengaru.com> */
 
 /* TODO:
- * - make threaded
- * - make colorful
+ * - improve the second pass's element rejection efficiency, a spatial data structure
+ *   could probably help here.
  */
 
 #define FLOW_DEFAULT_SIZE	"8"
@@ -29,7 +29,9 @@
 
 typedef struct flow_element_t {
 	float	lifetime;
-	v3f_t	position;
+	v3f_t	position_a, position_b;
+	v3f_t	velocity;	/* per-iter step + direction applicable directly to position_a */
+	v3f_t	color;
 } flow_element_t;
 
 typedef struct flow_context_t {
@@ -50,6 +52,9 @@ typedef struct flow_context_t {
 	unsigned		last_populate_idx;
 	unsigned		n_iters;
 	unsigned		n_elements;
+	unsigned		n_elements_per_cpu;
+	unsigned		pass;
+	float			w;
 	flow_element_t		elements[];
 } flow_context_t;
 
@@ -62,7 +67,7 @@ typedef struct flow_setup_t {
 } flow_setup_t;
 
 
-static void populator(void *context, unsigned size, const ff_data_t *other, ff_data_t *field)
+static void flow_ff_populator(void *context, unsigned size, const ff_data_t *other, ff_data_t *field)
 {
 	flow_context_t	*ctxt = context;
 	unsigned	*seedp = &ctxt->til_module_context.seed;
@@ -91,10 +96,12 @@ static inline float rand_within_range(unsigned *seed, float min, float max)
 
 static inline flow_element_t rand_element(unsigned *seed)
 {
-	flow_element_t	e;
+	flow_element_t	e = {
+				.lifetime = rand_within_range(seed, .5f, 20.f),
+				.position_a = v3f_rand(seed, -1.f, 1.f),
+			};
 
-	e.lifetime = rand_within_range(seed, 0.5f, 20.0f);
-	e.position = v3f_rand(seed, 0.0f, 1.0f);
+	e.position_b = e.position_a;
 
 	return e;
 }
@@ -121,20 +128,22 @@ static til_module_context_t * flow_create_context(const til_module_t *module, ti
 {
 	flow_setup_t	*s = (flow_setup_t *)setup;
 	flow_context_t	*ctxt;
-	unsigned	i;
+	unsigned	elements_per_cpu;
 
-	ctxt = til_module_context_new(module, sizeof(flow_context_t) + sizeof(ctxt->elements[0]) * s->count, stream, seed, ticks, n_cpus, setup);
+	elements_per_cpu  = s->count / n_cpus;
+	ctxt = til_module_context_new(module, sizeof(flow_context_t) + sizeof(ctxt->elements[0]) * elements_per_cpu * n_cpus, stream, seed, ticks, n_cpus, setup);
 	if (!ctxt)
 		return NULL;
 
-	ctxt->ff = ff_new(s->size, populator, ctxt);
+	ctxt->ff = ff_new(s->size, flow_ff_populator, ctxt);
 	if (!ctxt->ff)
 		return til_module_context_free(&ctxt->til_module_context);
 
-	for (i = 0; i < s->count; i++)
+	for (unsigned i = 0; i < s->count; i++)
 		ctxt->elements[i] = rand_element(&ctxt->til_module_context.seed);
 
-	ctxt->n_elements = s->count;
+	ctxt->n_elements_per_cpu = elements_per_cpu;
+	ctxt->n_elements = elements_per_cpu * n_cpus;
 
 	ctxt->taps.speed = til_tap_init_float(ctxt, &ctxt->speed, 1, &ctxt->vars.speed, "speed");
 	flow_update_taps(ctxt, stream);
@@ -169,65 +178,213 @@ static inline uint32_t color_to_uint32_rgb(v3f_t color) {
 }
 
 
+static void flow_prepare_frame(til_module_context_t *context, til_stream_t *stream, unsigned ticks, til_fb_fragment_t **fragment_ptr, til_frame_plan_t *res_frame_plan)
+{
+	flow_context_t	*ctxt = (flow_context_t *)context;
+
+	switch (ctxt->pass) {
+	case 0:
+		flow_update_taps(ctxt, stream);
+
+		ctxt->w = (M_2_PI * asinf(fabsf(sinf((ticks * .001f))))) * 2.f - 1.f;
+		/* ^^ this approximates a triangle wave,
+		 * a sine wave dwells too long for the illusion of continuously evolving
+		 */
+
+		*res_frame_plan = (til_frame_plan_t){ .fragmenter = til_fragmenter_noop_per_cpu };
+		return;
+
+	case 1:
+		*res_frame_plan = (til_frame_plan_t){ .fragmenter = til_fragmenter_slice_per_cpu };
+		return;
+
+	default:
+		assert(0);
+	}
+}
+
+
 static void flow_render_fragment(til_module_context_t *context, til_stream_t *stream, unsigned ticks, unsigned cpu, til_fb_fragment_t **fragment_ptr)
 {
 	flow_context_t		*ctxt = (flow_context_t *)context;
 	til_fb_fragment_t	*fragment = *fragment_ptr;
-	float			w;
 
-	flow_update_taps(ctxt, stream);
+	switch (ctxt->pass) {
+	case 0: {
+		flow_element_t	*e = &ctxt->elements[fragment->number * ctxt->n_elements_per_cpu];
+		unsigned	n = ctxt->n_elements_per_cpu;
+		float		w = ctxt->w * .5f + .5f;
+
+		/* XXX: note the fragment->number is used above as the cpu number, this is to ensure all cpu #s
+		 * are actually used.  Since our noop_fragmenter_per_cpu always produces a fragment per cpu,
+		 * the fragment->number should exhaust the cpu space.  Relying on the actual cpu number could
+		 * skip entire regions of the elements, since there's no guarantee we get scheduled on all CPUs
+		 * in a given frame, despite having a fragment per cpu.  An alternative would be to set the
+		 * .cpu_affinity flag in the frame_plan, but that just slows things down pointlessly.
+		 */
+
+		/* sample the flow-field and update the elements accordingly, splitting ctxt->elements
+		 * into elements_per_cpu chunks indexed by cpu, only working on the chunk for this cpu
+		 */
+		for (unsigned i = 0; i < n; e++, i++) {
+			v3f_t		pos;
+			ff_data_t	d;
+
+			e->lifetime -= .1f;
+			if (e->lifetime <= 0.0f)
+				*e = rand_element(&ctxt->til_module_context.seed);
+
+			if (e->position_b.x < -1.f || e->position_b.x > 1.f ||
+			    e->position_b.y < -1.f || e->position_b.y > 1.f ||
+			    e->position_b.z < -1.f || e->position_b.z > 1.f)
+				*e = rand_element(&ctxt->til_module_context.seed);
+
+			pos = e->position_a = e->position_b;
+
+			d = ff_get(ctxt->ff,
+				   &(v3f_t){ /* FIXME TODO: just make ff.[ch] use a -1..+1 coordinate system */
+					.x = pos.x * .5f + .5f,
+					.y = pos.y * .5f + .5f,
+					.z = pos.z * .5f + .5f,
+				   }, w);
+			e->color = d.color;
+			d.direction = v3f_mult_scalar(&d.direction, .001f); /* XXX FIXME: magic number alert! */
+			e->velocity = d.direction;
+
+			/* Compute the final position now for the next go-round.
+			 * The second pass can't just write it back willy-nilly while racing with others,
+			 * despite doing the same thing iteratively as it draws n_iters pixels.  Hence
+			 * this position_b becomes position_a situation above.
+			 */
+			d.direction = v3f_mult_scalar(&d.direction, (float)ctxt->n_iters);
+			e->position_b = v3f_add(&pos, &d.direction);
+		}
+
+		return;
+	}
 
-	til_fb_fragment_clear(fragment);
+	case 1: {
+		unsigned	ffw = fragment->frame_width,
+				ffh = fragment->frame_height;
+		unsigned	fx1 = fragment->x,
+				fy1 = fragment->y,
+				fx2 = fragment->x + fragment->width,
+				fy2 = fragment->y + fragment->height;
+
+		til_fb_fragment_clear(fragment);
+
+		/* render elements overlapping with this fragment's tile */
+		for (unsigned i = 0; i < ctxt->n_elements; i++) {
+			flow_element_t	*e = &ctxt->elements[i];
+			v3f_t		pos = e->position_a;
+			v3f_t		v = e->velocity;
+			unsigned	x1, y1, x2, y2;
+			uint32_t	pixel;
+
+			/* Perspective-project the endpoints of the element's travel, this is
+			 * the part we can't currently avoid doing per-element per-fragment.
+			 */
+#define ZCONST 1.0f
+			x1 = pos.x / (pos.z + ZCONST) * ffw + (ffw >> 1);
+			y1 = pos.y / (pos.z + ZCONST) * ffh + (ffh >> 1) ;
+			x2 = e->position_b.x / (e->position_b.z + ZCONST) * ffw + (ffw >> 1);
+			y2 = e->position_b.y / (e->position_b.z + ZCONST) * ffh + (ffh >> 1) ;
 
-	w = (M_2_PI * asinf(fabsf(sinf((ticks * .001f))))) * 2.f - 1.f;
-	/* ^^ this approximates a triangle wave,
-	 * a sine wave dwells too long for the illusion of continuously evolving
-	 */
+			/* for cases obviously outside the fragment, don't draw anything */
 
-	for (unsigned j = 0; j < ctxt->n_elements; j++) {
-		flow_element_t	*e = &ctxt->elements[j];
-		v3f_t		pos = e->position;
-		ff_data_t	d = ff_get(ctxt->ff, &pos, w * .5f + .5f);
+			/* totally outside (above) */
+			if (y1 < fy1 && y2 < fy1)
+				continue;
 
-		d.direction = v3f_mult_scalar(&d.direction, .001f);
+			/* totally outside (below) */
+			if (y1 > fy2 && y2 > fy2)
+				continue;
 
-		for (unsigned k = 0; k < ctxt->n_iters; k++) {
-			unsigned	x, y;
+			/* totally outside (left) */
+			if (x1 < fx1 && x2 < fx1)
+				continue;
 
-			pos = v3f_add(&pos, &d.direction);
-#define ZCONST 1.0f
-			x = (pos.x * 2.f - 1.f) / (pos.z + ZCONST) * fragment->width + (fragment->width >> 1);
-			y = (pos.y * 2.f - 1.f) / (pos.z + ZCONST) * fragment->height + (fragment->height >> 1) ;
+			/* totally outside (right) */
+			if (x1 > fx2 && x2 > fx2)
+				continue;
 
-			(void) til_fb_fragment_put_pixel_checked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x, y, color_to_uint32_rgb(d.color));
+			/* remaining cases draw something, get the pixel ready */
+			pixel = color_to_uint32_rgb(e->color);
 
-			if (pos.x < 0.f || pos.x > 1.f ||
-			    pos.y < 0.f || pos.y > 1.f ||
-			    pos.z < 0.f || pos.z > 1.f)
-				*e = rand_element(&ctxt->til_module_context.seed);
-			else
-				e->position = pos;
+			/* totally inside, render unchecked */
+			if (y1 >= fy1 && y1 < fy2 && y2 >= fy1 && y2 < fy2 &&
+			    x1 >= fx1 && x1 < fx2 && x2 >= fx1 && x2 < fx2) {
+
+				(void) til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x1, y1, pixel);
+				(void) til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x2, y2, pixel);
+
+				if (!ctxt->n_iters)
+					continue;
+
+				for (unsigned j = 1; j < ctxt->n_iters - 1; j++) {
+
+					pos = v3f_add(&pos, &v);
+
+					x1 = pos.x / (pos.z + ZCONST) * ffw + (ffw >> 1);
+					y1 = pos.y / (pos.z + ZCONST) * ffh + (ffh >> 1);
+
+					(void) til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x1, y1, pixel);
+				}
+
+				continue;
+			}
+
+			/* may partially overlap, do same as above but w/checking */
+			(void) til_fb_fragment_put_pixel_checked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x1, y1, pixel);
+			(void) til_fb_fragment_put_pixel_checked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x2, y2, pixel);
+
+			if (!ctxt->n_iters)
+				continue;
+
+			for (unsigned j = 1; j < ctxt->n_iters - 1; j++) {
+
+				pos = v3f_add(&pos, &v);
+
+				x1 = pos.x / (pos.z + ZCONST) * ffw + (ffw >> 1);
+				y1 = pos.y / (pos.z + ZCONST) * ffh + (ffh >> 1) ;
+
+				(void) til_fb_fragment_put_pixel_checked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x1, y1, pixel);
+			}
 		}
 
-		e->lifetime -= .1f;
-		if (e->lifetime <= 0.0f)
-			*e = rand_element(&ctxt->til_module_context.seed);
+		return;
 	}
 
-	/* Re-populate the other field before changing directions.
-	 * note if the frame rate is too low and we miss a >.95 sample
-	 * this will regress to just revisiting the previous field which
-	 * is relatively harmless.
-	 */
-	if (fabsf(w) > .95f) {
-		unsigned	other_idx;
-
-		other_idx = rintf(-w * .5f + .5f);
-		if (other_idx != ctxt->last_populate_idx) {
-			ff_populate(ctxt->ff, other_idx);
-			ctxt->last_populate_idx = other_idx;
+	default:
+		assert(0);
+	}
+}
+
+
+static int flow_finish_frame(til_module_context_t *context, til_stream_t *stream, unsigned int ticks, til_fb_fragment_t **fragment_ptr)
+{
+	flow_context_t		*ctxt = (flow_context_t *)context;
+
+	ctxt->pass = (ctxt->pass + 1) % 2;
+
+	if (!ctxt->pass) {
+		/* Re-populate the other field before changing directions.
+		 * note if the frame rate is too low and we miss a >.95 sample
+		 * this will regress to just revisiting the previous field which
+		 * is relatively harmless.
+		 */
+		if (fabsf(ctxt->w) > .95f) {
+			unsigned	other_idx;
+
+			other_idx = rintf(-ctxt->w * .5f + .5f);
+			if (other_idx != ctxt->last_populate_idx) {
+				ff_populate(ctxt->ff, other_idx);
+				ctxt->last_populate_idx = other_idx;
+			}
 		}
 	}
+
+	return ctxt->pass;
 }
 
 
@@ -237,7 +394,9 @@ static int flow_setup(const til_settings_t *settings, til_setting_t **res_settin
 til_module_t	flow_module = {
 	.create_context = flow_create_context,
 	.destroy_context = flow_destroy_context,
+	.prepare_frame = flow_prepare_frame,
 	.render_fragment = flow_render_fragment,
+	.finish_frame = flow_finish_frame,
 	.setup = flow_setup,
 	.name = "flow",
 	.description = "3D flow field",
author	Vito Caputo <vcaputo@pengaru.com>	2023-09-04 20:45:42 -0700
committer	Vito Caputo <vcaputo@pengaru.com>	2023-09-04 20:54:50 -0700
commit	6d6c14180096fc4417aeecd489146e0f6fe6f721 (patch)
tree	9dcc9ac0e90eb2b369a6651559b8a086203e480b
parent	bab16b070f95687ec11d43e55f3dd6e69f96a576 (diff)