/*
 *  Copyright 2018 - Vito Caputo -
 *
 *  This program is free software: you can redistribute it and/or modify it
 *  under the terms of the GNU General Public License version 3 as published
 *  by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include "container.h"
#include "list.h"
#include "pad.h"

/* This is adapted from the particle system in my rototiller project:
 * git://git.pengaru.com/rototiller
 *
 * This implements a very basic chunked allocator which prioritizes efficient
 * allocation and freeing over low waste of memory.  It mallocs a chunk at a
 * time, doling out elements from the chunk sequentially as requested until
 * the chunk cannot fulfill an allocation.  At that point, the current chunk
 * is retired, a new chunk is allocated, and the cycle repeats.
 *
 * When an allocation is freed, the refcount of its chunk is decremented,
 * leaving the chunk pinned with holes accumulating until the refcount reaches
 * zero, at which point the chunk is made available for allocations again.
 *
 * This requires a pointer to the chunk be returned with every allocation.  It
 * may be possible to reduce the footprint of this by using a relative offset
 * to the chunk start instead, but that would probably be more harmful to the
 * alignment.
 *
 * Note that the allocations may vary in size; they just can't exceed the
 * chunk size on a given pad instance.  A TODO item is to add a pad variant
 * with a fixed allocation size specified at pad creation, with the addition
 * of a free list enabling hole-filling and a fast cache-friendly iterator for
 * quickly visiting all the elements in pinned chunks.
 */

#define CHUNK_ALIGNMENT	8192	/* XXX: this may be unnecessary, callers should ideally be able to size their pads */
#define ALLOC_ALIGNMENT	8	/* allocations within the chunk need to be aligned, since their size affects subsequent allocation offsets */
#define ALIGN(_size, _alignment)	(((_size) + (_alignment) - 1) & ~((_alignment) - 1))

typedef struct chunk_t {
	pad_t		*pad;		/* pad this chunk belongs to */
	list_head_t	chunks;		/* node on free/pinned list */
	uint32_t	n_refs;		/* number of references (active allocations) to this chunk */
	unsigned	next_offset;	/* next available offset for allocation */
	uint8_t		mem[];		/* usable memory from this chunk */
} chunk_t;

typedef struct alloc_t {
	chunk_t		*chunk;		/* chunk this allocation came from */
	uint8_t		mem[];		/* usable memory from this allocation */
} alloc_t;

struct pad_t {
	chunk_t		*chunk;		/* current chunk allocations come from */
	unsigned	chunk_size;	/* size chunks are allocated in */
	unsigned	flags;		/* flags of this pad */
	list_head_t	free_chunks;	/* list of completely free chunks */
	list_head_t	pinned_chunks;	/* list of chunks pinned because they have an outstanding allocation */
};


/* Add a reference to a chunk. */
static inline void chunk_ref(chunk_t *chunk)
{
	assert(chunk);
	assert(chunk->pad);

	chunk->n_refs++;
	assert(chunk->n_refs != 0);
}


/* Remove a reference from a chunk, moving it to the free list if it was the
 * last reference.
 */
static inline void chunk_unref(chunk_t *chunk)
{
	assert(chunk);
	assert(chunk->pad);
	assert(chunk->n_refs > 0);

	if (chunk->n_refs == 1) {
		pad_t	*pad = chunk->pad;

		list_move(&chunk->chunks, &pad->free_chunks);

		if ((pad->flags & PAD_FLAGS_ZERO))
			memset(chunk->mem, 0, pad->chunk_size);

		return;
	}

	chunk->n_refs--;
}


/* Return the allocated size of a chunk for the given pad. */
static inline unsigned chunk_alloc_size(pad_t *pad)
{
	assert(pad);

	return (sizeof(chunk_t) + pad->chunk_size);
}


/* Get a new working chunk. */
static chunk_t * chunk_get(pad_t *pad)
{
	chunk_t	*chunk;

	assert(pad);

	if (!list_empty(&pad->free_chunks)) {
		chunk = list_entry(pad->free_chunks.next, chunk_t, chunks);
		list_del(&chunk->chunks);
	} else {
		/* No free chunks, must ask libc for memory */
		if ((pad->flags & PAD_FLAGS_ZERO))
			chunk = calloc(1, chunk_alloc_size(pad));
		else
			chunk = malloc(chunk_alloc_size(pad));

		if (!chunk)
			return NULL;
	}

	/* Note a chunk is pinned from the moment it's created, and a reference
	 * is added to represent pad->chunk, even though no allocations have
	 * occurred yet.
	 */
	chunk->n_refs = 1;
	chunk->next_offset = 0;
	chunk->pad = pad;
	list_add(&chunk->chunks, &pad->pinned_chunks);

	return chunk;
}


/* Create a new pad. */
pad_t * pad_new(unsigned chunk_size, unsigned flags)
{
	pad_t	*pad;

	pad = calloc(1, sizeof(pad_t));
	if (!pad)
		return NULL;

	INIT_LIST_HEAD(&pad->free_chunks);
	INIT_LIST_HEAD(&pad->pinned_chunks);

	/* XXX: pad->chunk_size does not include the size of the chunk_t container */
	pad->chunk_size = ALIGN(chunk_size, CHUNK_ALIGNMENT);
	pad->flags = flags;

	return pad;
}


/* Reset a pad, making all chunks free/available for reuse, deleting all
 * existing allocations in one swoop without giving the memory back to
 * libc.
 */
void pad_reset(pad_t *pad)
{
	assert(pad);

	pad->chunk = NULL;

	if ((pad->flags & PAD_FLAGS_ZERO)) {
		chunk_t	*c;

		list_for_each_entry(c, &pad->pinned_chunks, chunks)
			memset(c->mem, 0, pad->chunk_size);
	}

	list_splice_init(&pad->pinned_chunks, &pad->free_chunks);
}


/* Free a pad and its associated allocations. */
void pad_free(pad_t *pad)
{
	chunk_t	*chunk, *_chunk;

	assert(pad);

	pad_reset(pad);

	list_for_each_entry_safe(chunk, _chunk, &pad->free_chunks, chunks)
		free(chunk);

	free(pad);
}


/* Get uninitialized memory from a pad (the malloc() analog).
 * May return NULL if allocating a new chunk fails.
 */
void * pad_get(pad_t *pad, unsigned size)
{
	alloc_t	*alloc;

	assert(pad);
	assert(size <= pad->chunk_size);

	size = ALIGN(sizeof(alloc_t) + size, ALLOC_ALIGNMENT);

	if (!pad->chunk || size + pad->chunk->next_offset > pad->chunk_size) {
		/* Retire this chunk, time for a new one */
		if (pad->chunk)
			chunk_unref(pad->chunk);

		pad->chunk = chunk_get(pad);
	}

	if (!pad->chunk)
		return NULL;

	chunk_ref(pad->chunk);
	alloc = (alloc_t *)&pad->chunk->mem[pad->chunk->next_offset];
	pad->chunk->next_offset += size;
	alloc->chunk = pad->chunk;

	assert(pad->chunk->next_offset <= pad->chunk_size);

	return alloc->mem;
}


/* Put memory taken from a pad back (the free() analog). */
void pad_put(void *ptr)
{
	alloc_t	*alloc = container_of(ptr, alloc_t, mem);

	assert(ptr);

	chunk_unref(alloc->chunk);
}


/* TODO: add a chunk iterator interface for cache-friendly iterating across
 * chunk contents.  This is really only viable for uniformly sized chunks,
 * which the API doesn't currently enforce/differentiate.  So that would
 * probably be a special pad mode the iterator would only work with.
 */
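
/* Illustrative usage sketch, not part of the pad API proper: the guarded
 * example below shows how a caller might dole out fixed-size nodes from a pad
 * and release them.  Only pad_new()/pad_get()/pad_put()/pad_free() and
 * PAD_FLAGS_ZERO come from this file and pad.h; the node_t type, the 64KiB
 * chunk size, and the PAD_EXAMPLE_MAIN guard are hypothetical choices made
 * purely for illustration.
 */
#ifdef PAD_EXAMPLE_MAIN
#include <stdio.h>

typedef struct node_t {
	int	value;
} node_t;

int main(void)
{
	pad_t	*pad;
	node_t	*nodes[100];

	/* pad_new() rounds the chunk size up to a multiple of CHUNK_ALIGNMENT
	 * (8KiB); 64KiB is already a multiple, so it is used as-is.
	 */
	pad = pad_new(64 * 1024, PAD_FLAGS_ZERO);
	if (!pad)
		return 1;

	/* Allocations are doled out sequentially from the current chunk. */
	for (int i = 0; i < 100; i++) {
		nodes[i] = pad_get(pad, sizeof(node_t));
		if (!nodes[i]) {
			pad_free(pad);
			return 1;
		}

		nodes[i]->value = i;
	}

	/* Freeing only unpins chunks; their memory is reused once a chunk's
	 * refcount drops to zero, without being returned to libc.
	 */
	for (int i = 0; i < 100; i++)
		pad_put(nodes[i]);

	/* pad_reset() would also discard all outstanding allocations at once;
	 * pad_free() additionally returns the chunks to libc.
	 */
	pad_free(pad);

	printf("pad example done\n");

	return 0;
}
#endif /* PAD_EXAMPLE_MAIN */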