From 6d6c14180096fc4417aeecd489146e0f6fe6f721 Mon Sep 17 00:00:00 2001 From: Vito Caputo Date: Mon, 4 Sep 2023 20:45:42 -0700 Subject: modules/flow: implement threaded rendering This exploits the just added multipass rendering support. In the first pass, the flow-field is sampled and applied to the elements, with every thread operating on its own subset of the elements list. Since the flow-field sampling is all read-only, it's perfectly safe too do in parallel. Nothing is drawn in the first pass, it's only the elements updating according to the flow-field which is performed. In the second pass, the elements are rendered in parallel using the slice_per_cpu fragmenter. Since the elements are kept on a simple array, with no spatial indexing, every thread must visit every element. Since the fragmenter used divides the frame into horizontal slices, every thread needing to reject elements not overlapping its region can take some shortcuts in easily identifying elements entirely outside its region. But the whole 3d->2d projection step must still be performed for every element's current position and +n_iters final position for the frame, which does have a divide unfortunately. Nonetheless, this change improves frame rates substantially on my 2c/4t i7 X230 as benchmarked w/--video=mem,1366x768: --seed=0x64fa9508 '--module=rtv,channels=flow,duration=3,context_duration=3,caption_duration=0,log_channels=on,snow_duration=0,snow_module=none' '--video=mem,size=1366x768' rtv channel settings: 'flow,size=4,count=40000,speed=.8' FPS: 261 FPS: 265 rtv channel settings: 'flow,size=4,count=1000,speed=.9' FPS: 1153 FPS: 3204 FPS: 2934 rtv channel settings: 'flow,size=8,count=5000,speed=.9' FPS: 2923 FPS: 1634 FPS: 1592 rtv channel settings: 'flow,size=2,count=50000,speed=.4' FPS: 1006 FPS: 219 FPS: 268 rtv channel settings: 'flow,size=16,count=30000,speed=.8' FPS: 304 FPS: 350 FPS: 343 rtv channel settings: 'flow,size=16,count=30000,speed=.02' FPS: 379 FPS: 503 FPS: 472 rtv channel settings: 'flow,size=8,count=1000,speed=.16' FPS: 1393 FPS: 3822 FPS: 3876 --- Prior to this commit: --seed=0x64fa9508 '--module=rtv,channels=flow,duration=3,context_duration=3,caption_duration=0,log_channels=on,snow_duration=0,snow_module=none' '--video=mem,size=1366x768' rtv channel settings: 'flow,size=4,count=40000,speed=.8' FPS: 53 FPS: 53 rtv channel settings: 'flow,size=4,count=1000,speed=.9' FPS: 426 FPS: 1366 FPS: 1335 rtv channel settings: 'flow,size=8,count=5000,speed=.9' FPS: 1097 FPS: 368 FPS: 367 rtv channel settings: 'flow,size=2,count=50000,speed=.4' FPS: 279 FPS: 73 FPS: 74 rtv channel settings: 'flow,size=16,count=30000,speed=.8' FPS: 71 FPS: 71 FPS: 70 rtv channel settings: 'flow,size=16,count=30000,speed=.02' FPS: 136 FPS: 305 FPS: 305 rtv channel settings: 'flow,size=8,count=1000,speed=.16' FPS: 972 FPS: 2593 FPS: 2634 --- src/modules/flow/flow.c | 263 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 211 insertions(+), 52 deletions(-) (limited to 'src') diff --git a/src/modules/flow/flow.c b/src/modules/flow/flow.c index 936bafe..ddbae52 100644 --- a/src/modules/flow/flow.c +++ b/src/modules/flow/flow.c @@ -17,8 +17,8 @@ /* Copyright (C) 2017 Vito Caputo */ /* TODO: - * - make threaded - * - make colorful + * - improve the second pass's element rejection efficiency, a spatial data structure + * could probably help here. */ #define FLOW_DEFAULT_SIZE "8" @@ -29,7 +29,9 @@ typedef struct flow_element_t { float lifetime; - v3f_t position; + v3f_t position_a, position_b; + v3f_t velocity; /* per-iter step + direction applicable directly to position_a */ + v3f_t color; } flow_element_t; typedef struct flow_context_t { @@ -50,6 +52,9 @@ typedef struct flow_context_t { unsigned last_populate_idx; unsigned n_iters; unsigned n_elements; + unsigned n_elements_per_cpu; + unsigned pass; + float w; flow_element_t elements[]; } flow_context_t; @@ -62,7 +67,7 @@ typedef struct flow_setup_t { } flow_setup_t; -static void populator(void *context, unsigned size, const ff_data_t *other, ff_data_t *field) +static void flow_ff_populator(void *context, unsigned size, const ff_data_t *other, ff_data_t *field) { flow_context_t *ctxt = context; unsigned *seedp = &ctxt->til_module_context.seed; @@ -91,10 +96,12 @@ static inline float rand_within_range(unsigned *seed, float min, float max) static inline flow_element_t rand_element(unsigned *seed) { - flow_element_t e; + flow_element_t e = { + .lifetime = rand_within_range(seed, .5f, 20.f), + .position_a = v3f_rand(seed, -1.f, 1.f), + }; - e.lifetime = rand_within_range(seed, 0.5f, 20.0f); - e.position = v3f_rand(seed, 0.0f, 1.0f); + e.position_b = e.position_a; return e; } @@ -121,20 +128,22 @@ static til_module_context_t * flow_create_context(const til_module_t *module, ti { flow_setup_t *s = (flow_setup_t *)setup; flow_context_t *ctxt; - unsigned i; + unsigned elements_per_cpu; - ctxt = til_module_context_new(module, sizeof(flow_context_t) + sizeof(ctxt->elements[0]) * s->count, stream, seed, ticks, n_cpus, setup); + elements_per_cpu = s->count / n_cpus; + ctxt = til_module_context_new(module, sizeof(flow_context_t) + sizeof(ctxt->elements[0]) * elements_per_cpu * n_cpus, stream, seed, ticks, n_cpus, setup); if (!ctxt) return NULL; - ctxt->ff = ff_new(s->size, populator, ctxt); + ctxt->ff = ff_new(s->size, flow_ff_populator, ctxt); if (!ctxt->ff) return til_module_context_free(&ctxt->til_module_context); - for (i = 0; i < s->count; i++) + for (unsigned i = 0; i < s->count; i++) ctxt->elements[i] = rand_element(&ctxt->til_module_context.seed); - ctxt->n_elements = s->count; + ctxt->n_elements_per_cpu = elements_per_cpu; + ctxt->n_elements = elements_per_cpu * n_cpus; ctxt->taps.speed = til_tap_init_float(ctxt, &ctxt->speed, 1, &ctxt->vars.speed, "speed"); flow_update_taps(ctxt, stream); @@ -169,65 +178,213 @@ static inline uint32_t color_to_uint32_rgb(v3f_t color) { } +static void flow_prepare_frame(til_module_context_t *context, til_stream_t *stream, unsigned ticks, til_fb_fragment_t **fragment_ptr, til_frame_plan_t *res_frame_plan) +{ + flow_context_t *ctxt = (flow_context_t *)context; + + switch (ctxt->pass) { + case 0: + flow_update_taps(ctxt, stream); + + ctxt->w = (M_2_PI * asinf(fabsf(sinf((ticks * .001f))))) * 2.f - 1.f; + /* ^^ this approximates a triangle wave, + * a sine wave dwells too long for the illusion of continuously evolving + */ + + *res_frame_plan = (til_frame_plan_t){ .fragmenter = til_fragmenter_noop_per_cpu }; + return; + + case 1: + *res_frame_plan = (til_frame_plan_t){ .fragmenter = til_fragmenter_slice_per_cpu }; + return; + + default: + assert(0); + } +} + + static void flow_render_fragment(til_module_context_t *context, til_stream_t *stream, unsigned ticks, unsigned cpu, til_fb_fragment_t **fragment_ptr) { flow_context_t *ctxt = (flow_context_t *)context; til_fb_fragment_t *fragment = *fragment_ptr; - float w; - flow_update_taps(ctxt, stream); + switch (ctxt->pass) { + case 0: { + flow_element_t *e = &ctxt->elements[fragment->number * ctxt->n_elements_per_cpu]; + unsigned n = ctxt->n_elements_per_cpu; + float w = ctxt->w * .5f + .5f; + + /* XXX: note the fragment->number is used above as the cpu number, this is to ensure all cpu #s + * are actually used. Since our noop_fragmenter_per_cpu always produces a fragment per cpu, + * the fragment->number should exhaust the cpu space. Relying on the actual cpu number could + * skip entire regions of the elements, since there's no guarantee we get scheduled on all CPUs + * in a given frame, despite having a fragment per cpu. An alternative would be to set the + * .cpu_affinity flag in the frame_plan, but that just slows things down pointlessly. + */ + + /* sample the flow-field and update the elements accordingly, splitting ctxt->elements + * into elements_per_cpu chunks indexed by cpu, only working on the chunk for this cpu + */ + for (unsigned i = 0; i < n; e++, i++) { + v3f_t pos; + ff_data_t d; + + e->lifetime -= .1f; + if (e->lifetime <= 0.0f) + *e = rand_element(&ctxt->til_module_context.seed); + + if (e->position_b.x < -1.f || e->position_b.x > 1.f || + e->position_b.y < -1.f || e->position_b.y > 1.f || + e->position_b.z < -1.f || e->position_b.z > 1.f) + *e = rand_element(&ctxt->til_module_context.seed); + + pos = e->position_a = e->position_b; + + d = ff_get(ctxt->ff, + &(v3f_t){ /* FIXME TODO: just make ff.[ch] use a -1..+1 coordinate system */ + .x = pos.x * .5f + .5f, + .y = pos.y * .5f + .5f, + .z = pos.z * .5f + .5f, + }, w); + e->color = d.color; + d.direction = v3f_mult_scalar(&d.direction, .001f); /* XXX FIXME: magic number alert! */ + e->velocity = d.direction; + + /* Compute the final position now for the next go-round. + * The second pass can't just write it back willy-nilly while racing with others, + * despite doing the same thing iteratively as it draws n_iters pixels. Hence + * this position_b becomes position_a situation above. + */ + d.direction = v3f_mult_scalar(&d.direction, (float)ctxt->n_iters); + e->position_b = v3f_add(&pos, &d.direction); + } + + return; + } - til_fb_fragment_clear(fragment); + case 1: { + unsigned ffw = fragment->frame_width, + ffh = fragment->frame_height; + unsigned fx1 = fragment->x, + fy1 = fragment->y, + fx2 = fragment->x + fragment->width, + fy2 = fragment->y + fragment->height; + + til_fb_fragment_clear(fragment); + + /* render elements overlapping with this fragment's tile */ + for (unsigned i = 0; i < ctxt->n_elements; i++) { + flow_element_t *e = &ctxt->elements[i]; + v3f_t pos = e->position_a; + v3f_t v = e->velocity; + unsigned x1, y1, x2, y2; + uint32_t pixel; + + /* Perspective-project the endpoints of the element's travel, this is + * the part we can't currently avoid doing per-element per-fragment. + */ +#define ZCONST 1.0f + x1 = pos.x / (pos.z + ZCONST) * ffw + (ffw >> 1); + y1 = pos.y / (pos.z + ZCONST) * ffh + (ffh >> 1) ; + x2 = e->position_b.x / (e->position_b.z + ZCONST) * ffw + (ffw >> 1); + y2 = e->position_b.y / (e->position_b.z + ZCONST) * ffh + (ffh >> 1) ; - w = (M_2_PI * asinf(fabsf(sinf((ticks * .001f))))) * 2.f - 1.f; - /* ^^ this approximates a triangle wave, - * a sine wave dwells too long for the illusion of continuously evolving - */ + /* for cases obviously outside the fragment, don't draw anything */ - for (unsigned j = 0; j < ctxt->n_elements; j++) { - flow_element_t *e = &ctxt->elements[j]; - v3f_t pos = e->position; - ff_data_t d = ff_get(ctxt->ff, &pos, w * .5f + .5f); + /* totally outside (above) */ + if (y1 < fy1 && y2 < fy1) + continue; - d.direction = v3f_mult_scalar(&d.direction, .001f); + /* totally outside (below) */ + if (y1 > fy2 && y2 > fy2) + continue; - for (unsigned k = 0; k < ctxt->n_iters; k++) { - unsigned x, y; + /* totally outside (left) */ + if (x1 < fx1 && x2 < fx1) + continue; - pos = v3f_add(&pos, &d.direction); -#define ZCONST 1.0f - x = (pos.x * 2.f - 1.f) / (pos.z + ZCONST) * fragment->width + (fragment->width >> 1); - y = (pos.y * 2.f - 1.f) / (pos.z + ZCONST) * fragment->height + (fragment->height >> 1) ; + /* totally outside (right) */ + if (x1 > fx2 && x2 > fx2) + continue; - (void) til_fb_fragment_put_pixel_checked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x, y, color_to_uint32_rgb(d.color)); + /* remaining cases draw something, get the pixel ready */ + pixel = color_to_uint32_rgb(e->color); - if (pos.x < 0.f || pos.x > 1.f || - pos.y < 0.f || pos.y > 1.f || - pos.z < 0.f || pos.z > 1.f) - *e = rand_element(&ctxt->til_module_context.seed); - else - e->position = pos; + /* totally inside, render unchecked */ + if (y1 >= fy1 && y1 < fy2 && y2 >= fy1 && y2 < fy2 && + x1 >= fx1 && x1 < fx2 && x2 >= fx1 && x2 < fx2) { + + (void) til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x1, y1, pixel); + (void) til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x2, y2, pixel); + + if (!ctxt->n_iters) + continue; + + for (unsigned j = 1; j < ctxt->n_iters - 1; j++) { + + pos = v3f_add(&pos, &v); + + x1 = pos.x / (pos.z + ZCONST) * ffw + (ffw >> 1); + y1 = pos.y / (pos.z + ZCONST) * ffh + (ffh >> 1); + + (void) til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x1, y1, pixel); + } + + continue; + } + + /* may partially overlap, do same as above but w/checking */ + (void) til_fb_fragment_put_pixel_checked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x1, y1, pixel); + (void) til_fb_fragment_put_pixel_checked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x2, y2, pixel); + + if (!ctxt->n_iters) + continue; + + for (unsigned j = 1; j < ctxt->n_iters - 1; j++) { + + pos = v3f_add(&pos, &v); + + x1 = pos.x / (pos.z + ZCONST) * ffw + (ffw >> 1); + y1 = pos.y / (pos.z + ZCONST) * ffh + (ffh >> 1) ; + + (void) til_fb_fragment_put_pixel_checked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x1, y1, pixel); + } } - e->lifetime -= .1f; - if (e->lifetime <= 0.0f) - *e = rand_element(&ctxt->til_module_context.seed); + return; } - /* Re-populate the other field before changing directions. - * note if the frame rate is too low and we miss a >.95 sample - * this will regress to just revisiting the previous field which - * is relatively harmless. - */ - if (fabsf(w) > .95f) { - unsigned other_idx; - - other_idx = rintf(-w * .5f + .5f); - if (other_idx != ctxt->last_populate_idx) { - ff_populate(ctxt->ff, other_idx); - ctxt->last_populate_idx = other_idx; + default: + assert(0); + } +} + + +static int flow_finish_frame(til_module_context_t *context, til_stream_t *stream, unsigned int ticks, til_fb_fragment_t **fragment_ptr) +{ + flow_context_t *ctxt = (flow_context_t *)context; + + ctxt->pass = (ctxt->pass + 1) % 2; + + if (!ctxt->pass) { + /* Re-populate the other field before changing directions. + * note if the frame rate is too low and we miss a >.95 sample + * this will regress to just revisiting the previous field which + * is relatively harmless. + */ + if (fabsf(ctxt->w) > .95f) { + unsigned other_idx; + + other_idx = rintf(-ctxt->w * .5f + .5f); + if (other_idx != ctxt->last_populate_idx) { + ff_populate(ctxt->ff, other_idx); + ctxt->last_populate_idx = other_idx; + } } } + + return ctxt->pass; } @@ -237,7 +394,9 @@ static int flow_setup(const til_settings_t *settings, til_setting_t **res_settin til_module_t flow_module = { .create_context = flow_create_context, .destroy_context = flow_destroy_context, + .prepare_frame = flow_prepare_frame, .render_fragment = flow_render_fragment, + .finish_frame = flow_finish_frame, .setup = flow_setup, .name = "flow", .description = "3D flow field", -- cgit v1.2.3