threads: rework threaded fragment scheduling

Instead of creating fragment lists striped across available threads uniformly in a round-robin fashion, just have the render threads iterate across the shared fragments array using atomics. This way non-uniform cost of rendering can be adapted to, provided the module prepares the frame with sufficient fragment granularity. In the ray tracer, for example, it is quite common for some areas of the screen to have lower complexity/cost than others. The previous model distributed the fragments uniformly across the threads with no ability for underutilized threads to steal work from overutilized threads in the event of non-uniform cost distributions. Now no attempt to schedule work is made. The render threads simply race with each other on a per-frame basis, atomically incrementing a shared index into the frame's prepared fragments. The fragment size itself represents the atomic work unit. A later commit will change the various renderers to prepare more/smaller fragments where appropriate. The ray tracer in particular needs more and would probably further benefit from a tiling strategy, especially when an acceleration data structure is introduced.
author: Vito Caputo <vcaputo@pengaru.com> 2017-08-07 13:02:47 -0700
committer: Vito Caputo <vcaputo@pengaru.com> 2017-08-07 13:15:44 -0700
commit: f0a961c5672aaee05112d54555aec6b9b68416dc (patch)
tree: 86877ea048bc152e8b9d3169e402d977a2c1de8a /src
parent: 3206fc10cb2cf01e61232a04f68917b0c244e552 (diff)
1 files changed, 76 insertions, 98 deletions
diff --git a/src/threads.c b/src/threads.c
index 865fe21..f81563e 100644
--- a/src/threads.c
+++ b/src/threads.c
@@ -7,135 +7,107 @@
 #include "threads.h"
 #include "util.h"
 
-/* This is a very simple/naive implementation, there's certainly room for improvement.
- * Some things to explore:
- *  - switch to a single condition variable and broadcast to wake up the threads?
- *  - use lock-free algorithms?
- */
-
-typedef struct fragment_node_t fragment_node_t;
-
-struct fragment_node_t {
-	fragment_node_t	*next;
-	fb_fragment_t	*fragment;
-};
-
-typedef struct thread_t {
-	pthread_t	thread;
-	pthread_mutex_t	mutex;
-	pthread_cond_t	cond;
-	void		(*render_fragment_func)(void *context, fb_fragment_t *fragment);
-	void		*context;
-	fragment_node_t	*fragments;
-} thread_t;
-
 typedef struct threads_t {
-	unsigned	n_threads;
-	fragment_node_t	fragment_nodes[ROTOTILLER_FRAME_MAX_FRAGMENTS];
-	thread_t	threads[];
+	unsigned		n_threads;
+
+	pthread_mutex_t		idle_mutex;
+	pthread_cond_t		idle_cond;
+	unsigned		n_idle;
+
+	pthread_mutex_t		frame_mutex;
+	pthread_cond_t		frame_cond;
+	void			(*render_fragment_func)(void *context, fb_fragment_t *fragment);
+	void			*context;
+	rototiller_frame_t	*frame;
+
+	unsigned		next_fragment;
+	unsigned		frame_num;
+
+	pthread_t		threads[];
 } threads_t;
 
 
-/* render submitted fragments using the supplied render function */
-static void * thread_func(void *_thread)
+/* render fragments using the supplied render function */
+static void * thread_func(void *_threads)
 {
-	thread_t	*thread = _thread;
+	threads_t		*threads = _threads;
+	unsigned		prev_frame_num = 0;
 
 	for (;;) {
-		pthread_mutex_lock(&thread->mutex);
-		while (!thread->fragments)
-			pthread_cond_wait(&thread->cond, &thread->mutex);
-
-		do {
-			thread->render_fragment_func(thread->context, thread->fragments->fragment);
-			thread->fragments = thread->fragments->next;
-		} while (thread->fragments);
+		unsigned	frag_idx;
+
+		/* wait for a new frame */
+		pthread_mutex_lock(&threads->frame_mutex);
+		while (threads->frame_num == prev_frame_num)
+			pthread_cond_wait(&threads->frame_cond, &threads->frame_mutex);
+		prev_frame_num = threads->frame_num;
+		pthread_mutex_unlock(&threads->frame_mutex);
+
+		/* render fragments */
+		for (frag_idx = __sync_fetch_and_add(&threads->next_fragment, 1);
+		     frag_idx < threads->frame->n_fragments;
+		     frag_idx = __sync_fetch_and_add(&threads->next_fragment, 1)) {
+			threads->render_fragment_func(threads->context, &threads->frame->fragments[frag_idx]);
+		}
 
-		pthread_mutex_unlock(&thread->mutex);
-		pthread_cond_signal(&thread->cond);
+		/* report as idle */
+		pthread_mutex_lock(&threads->idle_mutex);
+		threads->n_idle++;
+		if (threads->n_idle == threads->n_threads)	/* Frame finished! Notify potential waiter. */
+			pthread_cond_signal(&threads->idle_cond);
+		pthread_mutex_unlock(&threads->idle_mutex);
 	}
 
 	return NULL;
 }
 
 
-/* submit a list of fragments to render using the specified thread and render_fragment_func */
-static void thread_fragments_submit(thread_t *thread, void (*render_fragment_func)(void *context, fb_fragment_t *fragment), void *context, fragment_node_t *fragments)
-{
-	pthread_mutex_lock(&thread->mutex);
-	while (thread->fragments != NULL)	/* XXX: never true due to thread_wait_idle() */
-		pthread_cond_wait(&thread->cond, &thread->mutex);
-
-	thread->render_fragment_func = render_fragment_func;
-	thread->context = context;
-	thread->fragments = fragments;
-
-	pthread_mutex_unlock(&thread->mutex);
-	pthread_cond_signal(&thread->cond);
-}
-
-
-/* wait for a thread to be idle */
-static void thread_wait_idle(thread_t *thread)
+/* wait for all threads to be idle */
+void threads_wait_idle(threads_t *threads)
 {
-	pthread_mutex_lock(&thread->mutex);
-	while (thread->fragments)
-		pthread_cond_wait(&thread->cond, &thread->mutex);
-	pthread_mutex_unlock(&thread->mutex);
+	pthread_mutex_lock(&threads->idle_mutex);
+	while (threads->n_idle < threads->n_threads)
+		pthread_cond_wait(&threads->idle_cond, &threads->idle_mutex);
+	pthread_mutex_unlock(&threads->idle_mutex);
 }
 
 
 /* submit a frame's fragments to the threads */
 void threads_frame_submit(threads_t *threads, rototiller_frame_t *frame, void (*render_fragment_func)(void *context, fb_fragment_t *fragment), void *context)
 {
-	unsigned	i, t;
-	fragment_node_t	*lists[threads->n_threads];
-
-	assert(frame->n_fragments <= ROTOTILLER_FRAME_MAX_FRAGMENTS);
-
-	for (i = 0; i < threads->n_threads; i++)
-		lists[i] = NULL;
-
-	for (i = 0; i < frame->n_fragments;) {
-		for (t = 0; i < frame->n_fragments && t < threads->n_threads; t++, i++) {
-			threads->fragment_nodes[i].next = lists[t];
-			lists[t] = &threads->fragment_nodes[i];
-			lists[t]->fragment = &frame->fragments[i];
-		}
-	}
-
-	for (i = 0; i < threads->n_threads; i++)
-		thread_fragments_submit(&threads->threads[i], render_fragment_func, context, lists[i]);
-}
-
-
-/* wait for all threads to drain their fragments list and become idle */
-void threads_wait_idle(threads_t *threads)
-{
-	unsigned	i;
-
-	for (i = 0; i < threads->n_threads; i++)
-		thread_wait_idle(&threads->threads[i]);
+	threads_wait_idle(threads);	/* XXX: likely non-blocking; already happens pre page flip */
+
+	pthread_mutex_lock(&threads->frame_mutex);
+	threads->frame = frame;
+	threads->render_fragment_func = render_fragment_func;
+	threads->context = context;
+	threads->frame_num++;
+	threads->n_idle = threads->next_fragment = 0;
+	pthread_cond_broadcast(&threads->frame_cond);
+	pthread_mutex_unlock(&threads->frame_mutex);
 }
 
 
 /* create threads instance, a thread per cpu is created */
 threads_t * threads_create(void)
 {
-	threads_t	*threads;
 	unsigned	i, num = get_ncpus();
+	threads_t	*threads;
 
-	threads = calloc(1, sizeof(threads_t) + sizeof(thread_t) * num);
+	threads = calloc(1, sizeof(threads_t) + sizeof(pthread_t) * num);
 	if (!threads)
 		return NULL;
 
-	for (i = 0; i < num; i++) {
-		pthread_mutex_init(&threads->threads[i].mutex, NULL);
-		pthread_cond_init(&threads->threads[i].cond, NULL);
-		pthread_create(&threads->threads[i].thread, NULL, thread_func, &threads->threads[i]);
-	}
+	threads->n_idle = threads->n_threads = num;
 
-	threads->n_threads = num;
+	pthread_mutex_init(&threads->idle_mutex, NULL);
+	pthread_cond_init(&threads->idle_cond, NULL);
+
+	pthread_mutex_init(&threads->frame_mutex, NULL);
+	pthread_cond_init(&threads->frame_cond, NULL);
+
+	for (i = 0; i < num; i++)
+		pthread_create(&threads->threads[i], NULL, thread_func, threads);
 
 	return threads;
 }
@@ -147,10 +119,16 @@ void threads_destroy(threads_t *threads)
 	unsigned	i;
 
 	for (i = 0; i < threads->n_threads; i++)
-		pthread_cancel(threads->threads[i].thread);
+		pthread_cancel(threads->threads[i]);
 
 	for (i = 0; i < threads->n_threads; i++)
-		pthread_join(threads->threads[i].thread, NULL);
+		pthread_join(threads->threads[i], NULL);
+
+	pthread_mutex_destroy(&threads->idle_mutex);
+	pthread_cond_destroy(&threads->idle_cond);
+
+	pthread_mutex_destroy(&threads->frame_mutex);
+	pthread_cond_destroy(&threads->frame_cond);
 
 	free(threads);
 }
author	Vito Caputo <vcaputo@pengaru.com>	2017-08-07 13:02:47 -0700
committer	Vito Caputo <vcaputo@pengaru.com>	2017-08-07 13:15:44 -0700
commit	f0a961c5672aaee05112d54555aec6b9b68416dc (patch)
tree	86877ea048bc152e8b9d3169e402d977a2c1de8a /src
parent	3206fc10cb2cf01e61232a04f68917b0c244e552 (diff)