diff options
author | Vito Caputo <vcaputo@pengaru.com> | 2023-09-04 20:45:42 -0700 |
---|---|---|
committer | Vito Caputo <vcaputo@pengaru.com> | 2023-09-04 20:54:50 -0700 |
commit | 6d6c14180096fc4417aeecd489146e0f6fe6f721 (patch) | |
tree | 9dcc9ac0e90eb2b369a6651559b8a086203e480b | |
parent | bab16b070f95687ec11d43e55f3dd6e69f96a576 (diff) |
modules/flow: implement threaded rendering
This exploits the just added multipass rendering support.
In the first pass, the flow-field is sampled and applied to the
elements, with every thread operating on its own subset of the
elements list. Since the flow-field sampling is all read-only,
it's perfectly safe to do in parallel. Nothing is drawn in the
first pass; only the updating of the elements according to the
flow-field is performed.
In the second pass, the elements are rendered in parallel using
the slice_per_cpu fragmenter. Since the elements are kept on a
simple array, with no spatial indexing, every thread must visit
every element.
Since the fragmenter used divides the frame into horizontal
slices, every thread needing to reject elements not overlapping
its region can take some shortcuts in easily identifying elements
entirely outside its region. But the whole 3d->2d projection
step must still be performed for every element's current position
and its final (+n_iters) position for the frame, which
unfortunately involves a divide.
Nonetheless, this change improves frame rates substantially on my
2c/4t i7 X230 as benchmarked w/--video=mem,1366x768:
--seed=0x64fa9508 '--module=rtv,channels=flow,duration=3,context_duration=3,caption_duration=0,log_channels=on,snow_duration=0,snow_module=none' '--video=mem,size=1366x768'
rtv channel settings: 'flow,size=4,count=40000,speed=.8'
FPS: 261
FPS: 265
rtv channel settings: 'flow,size=4,count=1000,speed=.9'
FPS: 1153
FPS: 3204
FPS: 2934
rtv channel settings: 'flow,size=8,count=5000,speed=.9'
FPS: 2923
FPS: 1634
FPS: 1592
rtv channel settings: 'flow,size=2,count=50000,speed=.4'
FPS: 1006
FPS: 219
FPS: 268
rtv channel settings: 'flow,size=16,count=30000,speed=.8'
FPS: 304
FPS: 350
FPS: 343
rtv channel settings: 'flow,size=16,count=30000,speed=.02'
FPS: 379
FPS: 503
FPS: 472
rtv channel settings: 'flow,size=8,count=1000,speed=.16'
FPS: 1393
FPS: 3822
FPS: 3876
---
Prior to this commit:
--seed=0x64fa9508 '--module=rtv,channels=flow,duration=3,context_duration=3,caption_duration=0,log_channels=on,snow_duration=0,snow_module=none' '--video=mem,size=1366x768'
rtv channel settings: 'flow,size=4,count=40000,speed=.8'
FPS: 53
FPS: 53
rtv channel settings: 'flow,size=4,count=1000,speed=.9'
FPS: 426
FPS: 1366
FPS: 1335
rtv channel settings: 'flow,size=8,count=5000,speed=.9'
FPS: 1097
FPS: 368
FPS: 367
rtv channel settings: 'flow,size=2,count=50000,speed=.4'
FPS: 279
FPS: 73
FPS: 74
rtv channel settings: 'flow,size=16,count=30000,speed=.8'
FPS: 71
FPS: 71
FPS: 70
rtv channel settings: 'flow,size=16,count=30000,speed=.02'
FPS: 136
FPS: 305
FPS: 305
rtv channel settings: 'flow,size=8,count=1000,speed=.16'
FPS: 972
FPS: 2593
FPS: 2634
-rw-r--r-- | src/modules/flow/flow.c | 263 |
1 files changed, 211 insertions, 52 deletions
diff --git a/src/modules/flow/flow.c b/src/modules/flow/flow.c index 936bafe..ddbae52 100644 --- a/src/modules/flow/flow.c +++ b/src/modules/flow/flow.c @@ -17,8 +17,8 @@ /* Copyright (C) 2017 Vito Caputo <vcaputo@pengaru.com> */ /* TODO: - * - make threaded - * - make colorful + * - improve the second pass's element rejection efficiency, a spatial data structure + * could probably help here. */ #define FLOW_DEFAULT_SIZE "8" @@ -29,7 +29,9 @@ typedef struct flow_element_t { float lifetime; - v3f_t position; + v3f_t position_a, position_b; + v3f_t velocity; /* per-iter step + direction applicable directly to position_a */ + v3f_t color; } flow_element_t; typedef struct flow_context_t { @@ -50,6 +52,9 @@ typedef struct flow_context_t { unsigned last_populate_idx; unsigned n_iters; unsigned n_elements; + unsigned n_elements_per_cpu; + unsigned pass; + float w; flow_element_t elements[]; } flow_context_t; @@ -62,7 +67,7 @@ typedef struct flow_setup_t { } flow_setup_t; -static void populator(void *context, unsigned size, const ff_data_t *other, ff_data_t *field) +static void flow_ff_populator(void *context, unsigned size, const ff_data_t *other, ff_data_t *field) { flow_context_t *ctxt = context; unsigned *seedp = &ctxt->til_module_context.seed; @@ -91,10 +96,12 @@ static inline float rand_within_range(unsigned *seed, float min, float max) static inline flow_element_t rand_element(unsigned *seed) { - flow_element_t e; + flow_element_t e = { + .lifetime = rand_within_range(seed, .5f, 20.f), + .position_a = v3f_rand(seed, -1.f, 1.f), + }; - e.lifetime = rand_within_range(seed, 0.5f, 20.0f); - e.position = v3f_rand(seed, 0.0f, 1.0f); + e.position_b = e.position_a; return e; } @@ -121,20 +128,22 @@ static til_module_context_t * flow_create_context(const til_module_t *module, ti { flow_setup_t *s = (flow_setup_t *)setup; flow_context_t *ctxt; - unsigned i; + unsigned elements_per_cpu; - ctxt = til_module_context_new(module, sizeof(flow_context_t) + sizeof(ctxt->elements[0]) 
* s->count, stream, seed, ticks, n_cpus, setup); + elements_per_cpu = s->count / n_cpus; + ctxt = til_module_context_new(module, sizeof(flow_context_t) + sizeof(ctxt->elements[0]) * elements_per_cpu * n_cpus, stream, seed, ticks, n_cpus, setup); if (!ctxt) return NULL; - ctxt->ff = ff_new(s->size, populator, ctxt); + ctxt->ff = ff_new(s->size, flow_ff_populator, ctxt); if (!ctxt->ff) return til_module_context_free(&ctxt->til_module_context); - for (i = 0; i < s->count; i++) + for (unsigned i = 0; i < s->count; i++) ctxt->elements[i] = rand_element(&ctxt->til_module_context.seed); - ctxt->n_elements = s->count; + ctxt->n_elements_per_cpu = elements_per_cpu; + ctxt->n_elements = elements_per_cpu * n_cpus; ctxt->taps.speed = til_tap_init_float(ctxt, &ctxt->speed, 1, &ctxt->vars.speed, "speed"); flow_update_taps(ctxt, stream); @@ -169,65 +178,213 @@ static inline uint32_t color_to_uint32_rgb(v3f_t color) { } +static void flow_prepare_frame(til_module_context_t *context, til_stream_t *stream, unsigned ticks, til_fb_fragment_t **fragment_ptr, til_frame_plan_t *res_frame_plan) +{ + flow_context_t *ctxt = (flow_context_t *)context; + + switch (ctxt->pass) { + case 0: + flow_update_taps(ctxt, stream); + + ctxt->w = (M_2_PI * asinf(fabsf(sinf((ticks * .001f))))) * 2.f - 1.f; + /* ^^ this approximates a triangle wave, + * a sine wave dwells too long for the illusion of continuously evolving + */ + + *res_frame_plan = (til_frame_plan_t){ .fragmenter = til_fragmenter_noop_per_cpu }; + return; + + case 1: + *res_frame_plan = (til_frame_plan_t){ .fragmenter = til_fragmenter_slice_per_cpu }; + return; + + default: + assert(0); + } +} + + static void flow_render_fragment(til_module_context_t *context, til_stream_t *stream, unsigned ticks, unsigned cpu, til_fb_fragment_t **fragment_ptr) { flow_context_t *ctxt = (flow_context_t *)context; til_fb_fragment_t *fragment = *fragment_ptr; - float w; - flow_update_taps(ctxt, stream); + switch (ctxt->pass) { + case 0: { + flow_element_t *e = 
&ctxt->elements[fragment->number * ctxt->n_elements_per_cpu]; + unsigned n = ctxt->n_elements_per_cpu; + float w = ctxt->w * .5f + .5f; + + /* XXX: note the fragment->number is used above as the cpu number, this is to ensure all cpu #s + * are actually used. Since our noop_fragmenter_per_cpu always produces a fragment per cpu, + * the fragment->number should exhaust the cpu space. Relying on the actual cpu number could + * skip entire regions of the elements, since there's no guarantee we get scheduled on all CPUs + * in a given frame, despite having a fragment per cpu. An alternative would be to set the + * .cpu_affinity flag in the frame_plan, but that just slows things down pointlessly. + */ + + /* sample the flow-field and update the elements accordingly, splitting ctxt->elements + * into elements_per_cpu chunks indexed by cpu, only working on the chunk for this cpu + */ + for (unsigned i = 0; i < n; e++, i++) { + v3f_t pos; + ff_data_t d; + + e->lifetime -= .1f; + if (e->lifetime <= 0.0f) + *e = rand_element(&ctxt->til_module_context.seed); + + if (e->position_b.x < -1.f || e->position_b.x > 1.f || + e->position_b.y < -1.f || e->position_b.y > 1.f || + e->position_b.z < -1.f || e->position_b.z > 1.f) + *e = rand_element(&ctxt->til_module_context.seed); + + pos = e->position_a = e->position_b; + + d = ff_get(ctxt->ff, + &(v3f_t){ /* FIXME TODO: just make ff.[ch] use a -1..+1 coordinate system */ + .x = pos.x * .5f + .5f, + .y = pos.y * .5f + .5f, + .z = pos.z * .5f + .5f, + }, w); + e->color = d.color; + d.direction = v3f_mult_scalar(&d.direction, .001f); /* XXX FIXME: magic number alert! */ + e->velocity = d.direction; + + /* Compute the final position now for the next go-round. + * The second pass can't just write it back willy-nilly while racing with others, + * despite doing the same thing iteratively as it draws n_iters pixels. Hence + * this position_b becomes position_a situation above. 
+ */ + d.direction = v3f_mult_scalar(&d.direction, (float)ctxt->n_iters); + e->position_b = v3f_add(&pos, &d.direction); + } + + return; + } - til_fb_fragment_clear(fragment); + case 1: { + unsigned ffw = fragment->frame_width, + ffh = fragment->frame_height; + unsigned fx1 = fragment->x, + fy1 = fragment->y, + fx2 = fragment->x + fragment->width, + fy2 = fragment->y + fragment->height; + + til_fb_fragment_clear(fragment); + + /* render elements overlapping with this fragment's tile */ + for (unsigned i = 0; i < ctxt->n_elements; i++) { + flow_element_t *e = &ctxt->elements[i]; + v3f_t pos = e->position_a; + v3f_t v = e->velocity; + unsigned x1, y1, x2, y2; + uint32_t pixel; + + /* Perspective-project the endpoints of the element's travel, this is + * the part we can't currently avoid doing per-element per-fragment. + */ +#define ZCONST 1.0f + x1 = pos.x / (pos.z + ZCONST) * ffw + (ffw >> 1); + y1 = pos.y / (pos.z + ZCONST) * ffh + (ffh >> 1) ; + x2 = e->position_b.x / (e->position_b.z + ZCONST) * ffw + (ffw >> 1); + y2 = e->position_b.y / (e->position_b.z + ZCONST) * ffh + (ffh >> 1) ; - w = (M_2_PI * asinf(fabsf(sinf((ticks * .001f))))) * 2.f - 1.f; - /* ^^ this approximates a triangle wave, - * a sine wave dwells too long for the illusion of continuously evolving - */ + /* for cases obviously outside the fragment, don't draw anything */ - for (unsigned j = 0; j < ctxt->n_elements; j++) { - flow_element_t *e = &ctxt->elements[j]; - v3f_t pos = e->position; - ff_data_t d = ff_get(ctxt->ff, &pos, w * .5f + .5f); + /* totally outside (above) */ + if (y1 < fy1 && y2 < fy1) + continue; - d.direction = v3f_mult_scalar(&d.direction, .001f); + /* totally outside (below) */ + if (y1 > fy2 && y2 > fy2) + continue; - for (unsigned k = 0; k < ctxt->n_iters; k++) { - unsigned x, y; + /* totally outside (left) */ + if (x1 < fx1 && x2 < fx1) + continue; - pos = v3f_add(&pos, &d.direction); -#define ZCONST 1.0f - x = (pos.x * 2.f - 1.f) / (pos.z + ZCONST) * fragment->width + 
(fragment->width >> 1); - y = (pos.y * 2.f - 1.f) / (pos.z + ZCONST) * fragment->height + (fragment->height >> 1) ; + /* totally outside (right) */ + if (x1 > fx2 && x2 > fx2) + continue; - (void) til_fb_fragment_put_pixel_checked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x, y, color_to_uint32_rgb(d.color)); + /* remaining cases draw something, get the pixel ready */ + pixel = color_to_uint32_rgb(e->color); - if (pos.x < 0.f || pos.x > 1.f || - pos.y < 0.f || pos.y > 1.f || - pos.z < 0.f || pos.z > 1.f) - *e = rand_element(&ctxt->til_module_context.seed); - else - e->position = pos; + /* totally inside, render unchecked */ + if (y1 >= fy1 && y1 < fy2 && y2 >= fy1 && y2 < fy2 && + x1 >= fx1 && x1 < fx2 && x2 >= fx1 && x2 < fx2) { + + (void) til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x1, y1, pixel); + (void) til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x2, y2, pixel); + + if (!ctxt->n_iters) + continue; + + for (unsigned j = 1; j < ctxt->n_iters - 1; j++) { + + pos = v3f_add(&pos, &v); + + x1 = pos.x / (pos.z + ZCONST) * ffw + (ffw >> 1); + y1 = pos.y / (pos.z + ZCONST) * ffh + (ffh >> 1); + + (void) til_fb_fragment_put_pixel_unchecked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x1, y1, pixel); + } + + continue; + } + + /* may partially overlap, do same as above but w/checking */ + (void) til_fb_fragment_put_pixel_checked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x1, y1, pixel); + (void) til_fb_fragment_put_pixel_checked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x2, y2, pixel); + + if (!ctxt->n_iters) + continue; + + for (unsigned j = 1; j < ctxt->n_iters - 1; j++) { + + pos = v3f_add(&pos, &v); + + x1 = pos.x / (pos.z + ZCONST) * ffw + (ffw >> 1); + y1 = pos.y / (pos.z + ZCONST) * ffh + (ffh >> 1) ; + + (void) til_fb_fragment_put_pixel_checked(fragment, TIL_FB_DRAW_FLAG_TEXTURABLE, x1, y1, pixel); + } } - e->lifetime -= .1f; - if (e->lifetime <= 0.0f) - *e = rand_element(&ctxt->til_module_context.seed); + return; } - 
/* Re-populate the other field before changing directions. - * note if the frame rate is too low and we miss a >.95 sample - * this will regress to just revisiting the previous field which - * is relatively harmless. - */ - if (fabsf(w) > .95f) { - unsigned other_idx; - - other_idx = rintf(-w * .5f + .5f); - if (other_idx != ctxt->last_populate_idx) { - ff_populate(ctxt->ff, other_idx); - ctxt->last_populate_idx = other_idx; + default: + assert(0); + } +} + + +static int flow_finish_frame(til_module_context_t *context, til_stream_t *stream, unsigned int ticks, til_fb_fragment_t **fragment_ptr) +{ + flow_context_t *ctxt = (flow_context_t *)context; + + ctxt->pass = (ctxt->pass + 1) % 2; + + if (!ctxt->pass) { + /* Re-populate the other field before changing directions. + * note if the frame rate is too low and we miss a >.95 sample + * this will regress to just revisiting the previous field which + * is relatively harmless. + */ + if (fabsf(ctxt->w) > .95f) { + unsigned other_idx; + + other_idx = rintf(-ctxt->w * .5f + .5f); + if (other_idx != ctxt->last_populate_idx) { + ff_populate(ctxt->ff, other_idx); + ctxt->last_populate_idx = other_idx; + } } } + + return ctxt->pass; } @@ -237,7 +394,9 @@ static int flow_setup(const til_settings_t *settings, til_setting_t **res_settin til_module_t flow_module = { .create_context = flow_create_context, .destroy_context = flow_destroy_context, + .prepare_frame = flow_prepare_frame, .render_fragment = flow_render_fragment, + .finish_frame = flow_finish_frame, .setup = flow_setup, .name = "flow", .description = "3D flow field", |