diff options
| author | Vito Caputo <vcaputo@pengaru.com> | 2021-08-23 15:34:12 -0700 | 
|---|---|---|
| committer | Vito Caputo <vcaputo@pengaru.com> | 2021-08-24 00:48:46 -0700 | 
| commit | db549aa7f63621e1b81d32e59456303c5003b4b9 (patch) | |
| tree | 277d51fca2c89e2fdf7002ffee051bcd70b21ef8 /src | |
| parent | b53cc8e61a27f948df5f11da07c7c395ebae1dd1 (diff) | |
verify-hashed-objects: add `jio verify hashed-objects`
This is currently very hacky and unfinished, but does enough for
some performance comparisons against a zstd-using journalctl --verify
that has been hacked to return early after the first pass.
It's currently rather leaky; the whole per-object-dispatch thingy
is exposing a thunk_h shortcoming, which forces addressing that
issue... soon.
Diffstat (limited to 'src')
| -rw-r--r-- | src/Makefile.am | 4 | ||||
| -rw-r--r-- | src/jio.c | 19 | ||||
| -rw-r--r-- | src/journals.c | 142 | ||||
| -rw-r--r-- | src/journals.h | 2 | ||||
| -rw-r--r-- | src/verify-hashed-objects.c | 269 | ||||
| -rw-r--r-- | src/verify-hashed-objects.h | 8 | 
6 files changed, 442 insertions, 2 deletions
| diff --git a/src/Makefile.am b/src/Makefile.am index 92f89b8..e0c25d1 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -22,7 +22,9 @@ jio_SOURCES = \  	report-tail-waste.c \  	report-tail-waste.h \  	report-usage.c \ -	report-usage.h +	report-usage.h \ +	verify-hashed-objects.c \ +	verify-hashed-objects.h  jio_CPPFLAGS = -I@top_srcdir@/thunk_h -I@top_srcdir@/libiou/src  jio_LDADD = @top_builddir@/libiou/src/libiou.a @top_builddir@/src/upstream/libupstream.a @@ -24,6 +24,7 @@  #include "report-layout.h"  #include "report-tail-waste.h"  #include "report-usage.h" +#include "verify-hashed-objects.h"  #include "upstream/journal-def.h" @@ -39,7 +40,7 @@ int main(int argc, char *argv[])  	int	r;  	if (argc < 2) { -		printf("Usage: %s {help,reclaim,report} [subcommand-args]\n", argv[0]); +		printf("Usage: %s {help,reclaim,report,verify} [subcommand-args]\n", argv[0]);  		return 0;  	} @@ -134,6 +135,22 @@ int main(int argc, char *argv[])  			fprintf(stderr, "Unsupported report subcommand: \"%s\"\n", argv[2]);  			return 1;  		} +	} else if (!strcmp(argv[1], "verify")) { +		if (argc < 3) { +			printf("Usage: %s verify {hashed-objects}\n", argv[0]); +			return 0; +		} + +		if (!strcmp(argv[2], "hashed-objects")) { +			r = jio_verify_hashed_objects(iou, argc, argv); +			if (r < 0) { +				fprintf(stderr, "failed to verify hashed objects: %s\n", strerror(-r)); +				return 1; +			} +		} else { +			fprintf(stderr, "Unsupported verify subcommand: \"%s\"\n", argv[2]); +			return 1; +		}  	} else if (!strcmp(argv[1], "version")) {  		puts("jio version " VERSION);  		return 0; diff --git a/src/journals.c b/src/journals.c index 8211b6f..05c14f0 100644 --- a/src/journals.c +++ b/src/journals.c @@ -18,6 +18,7 @@  #include <dirent.h>  #include <fcntl.h>  #include <liburing.h> +#include <stddef.h>  #include <stdio.h>  #include <string.h>  #include <sys/stat.h> @@ -689,6 +690,147 @@ THUNK_DEFINE(journal_get_object_header, iou_t *, iou, journal_t **, journal, uin  	return 0;  } 
+/* Count of items[] in variable-length object _o per its host-endian object.size (0 if too small). */
+#define OBJECT_N_ITEMS(_o)	\
+	((_o).object.size < offsetof(typeof(_o), items) ? 0 : ((_o).object.size - offsetof(typeof(_o), items)) / sizeof(*(_o).items))
+
+/* Completion for journal_get_object(): validate the read, byteswap the object @ object to host order, dispatch closure. */
+THUNK_DEFINE_STATIC(got_object, iou_t *, iou, iou_op_t *, op, uint64_t, size, Object *, object, thunk_t *, closure)
+{
+	assert(iou);
+	assert(op);
+	assert(object);
+	assert(closure);
+
+	if (op->result < 0)
+		return op->result;
+
+	if ((uint64_t)op->result != size || size < sizeof(ObjectHeader))	/* short read, or no room for a header */
+		return -EINVAL;
+
+	object->object.size = le64toh(object->object.size);
+	if (object->object.size != size)	/* items[] loops below trust this; TODO: more sanity checks? */
+		return -EBADMSG;
+
+	switch (object->object.type) {
+	case OBJECT_DATA:
+		object->data.hashed.hash = le64toh(object->data.hashed.hash);
+		object->data.hashed.next_hash_offset = le64toh(object->data.hashed.next_hash_offset);
+		object->data.next_field_offset = le64toh(object->data.next_field_offset);
+		object->data.entry_offset = le64toh(object->data.entry_offset);
+		object->data.entry_array_offset = le64toh(object->data.entry_array_offset);
+		object->data.n_entries = le64toh(object->data.n_entries);
+		break;
+
+	case OBJECT_FIELD:
+		object->field.hashed.hash = le64toh(object->field.hashed.hash);
+		object->field.hashed.next_hash_offset = le64toh(object->field.hashed.next_hash_offset);
+		object->field.head_data_offset = le64toh(object->field.head_data_offset);
+		break;
+
+	case OBJECT_ENTRY:
+		object->entry.seqnum = le64toh(object->entry.seqnum);
+		object->entry.realtime = le64toh(object->entry.realtime);
+		object->entry.monotonic = le64toh(object->entry.monotonic);
+		/* object->entry.boot_id is left as-is (presumably a byte array needing no swap - TODO confirm) */
+		object->entry.xor_hash = le64toh(object->entry.xor_hash);
+		for (uint64_t i = 0, n_items = OBJECT_N_ITEMS(object->entry); i < n_items; i++) {
+			object->entry.items[i].object_offset = le64toh(object->entry.items[i].object_offset);
+			object->entry.items[i].hash = le64toh(object->entry.items[i].hash);
+		}
+		break;
+
+	case OBJECT_DATA_HASH_TABLE:
+	case OBJECT_FIELD_HASH_TABLE:
+		for (uint64_t i = 0, n_items = OBJECT_N_ITEMS(object->hash_table); i < n_items; i++) {
+			object->hash_table.items[i].head_hash_offset = le64toh(object->hash_table.items[i].head_hash_offset);
+			object->hash_table.items[i].tail_hash_offset = le64toh(object->hash_table.items[i].tail_hash_offset);
+		}
+		break;
+
+	case OBJECT_ENTRY_ARRAY:
+		object->entry_array.next_entry_array_offset = le64toh(object->entry_array.next_entry_array_offset);
+		for (uint64_t i = 0, n_items = OBJECT_N_ITEMS(object->entry_array); i < n_items; i++)
+			object->entry_array.items[i] = le64toh(object->entry_array.items[i]);
+		break;
+
+	case OBJECT_TAG:
+		object->tag.seqnum = le64toh(object->tag.seqnum);
+		object->tag.epoch = le64toh(object->tag.epoch);
+		break;
+
+	default:
+		/* XXX: should probably just ignore unknown types instead, but the idea here is to let callers
+		 * safely assume loaded objects have been fully validated and byteswapped as needed. */
+		assert(0);
+	}
+
+	return thunk_dispatch(closure);
+}
+
+
+/* Queue IO on iou for loading an entire object of size *size from *journal @ offset *offset, into *object
+ * which must already be allocated.  *size must be the object's exact size; a mismatch fails with -EBADMSG.
+ * Registers closure for dispatch on the io when completed.
+ *
+ * Note this doesn't allocate space for the object and requires the size be already known, it is the bare
+ * minimum object loading into pre-allocated space when the size is known, which performs the necessary
+ * le64toh() swapping of object-specific members before calling the supplied closure.
+ *
+ * It's expected that the caller must already retrieve the object's header in a separate step before
+ * calling this to load the entirety of the object, since the header is needed first to know the size
+ * for allocating the full object and then loading its contents.  journal_get_object_full() is the heavier
+ * helper doing both the header load followed by the entire object load in one convenient step.
+ */
+THUNK_DEFINE(journal_get_object, iou_t *, iou, journal_t **, journal, uint64_t *, offset, uint64_t *, size, Object **, object, thunk_t *, closure)
+{
+	iou_op_t	*op;
+
+	assert(iou);
+	assert(journal);
+	assert(offset);
+	assert(size);
+	assert(object && *object);
+	assert(closure);
+
+	op = iou_op_new(iou);
+	if (!op)
+		return -ENOMEM;
+
+	io_uring_prep_read(op->sqe, (*journal)->idx, *object, *size, *offset);
+	op->sqe->flags = IOSQE_FIXED_FILE;
+	op_queue(iou, op, THUNK(got_object(iou, op, *size, *object, closure)));
+
+	return 0;
+}
+
+
+/* Header-loaded step of journal_get_object_full(): allocate *object per the header's size, then queue its load. */
+THUNK_DEFINE_STATIC(get_object_full_got_header, iou_t *, iou, journal_t **, journal, uint64_t *, offset, ObjectHeader *, object_header, Object **, object, thunk_t *, closure)
+{
+	Object	*o = malloc(object_header->size);
+
+	if (!o)
+		return -ENOMEM;
+
+	*object = o;
+
+	return journal_get_object(iou, journal, offset, &object_header->size, object, closure);
+}
+
+
+/* Queue IO on iou for loading an object header into *object_header, which must already be allocated,
+ * registering a closure to then allocate space for the full object @ *object and queueing IO for loading
+ * the full object into that space, with closure registered for dispatch once the full object is loaded.
+ *
+ * This will leave a newly allocated and populated object @ *object, ready for use.
+ */
+THUNK_DEFINE(journal_get_object_full, iou_t *, iou, journal_t **, journal, uint64_t *, offset, ObjectHeader *, object_header, Object **, object, thunk_t *, closure)
+{
+	return	journal_get_object_header(iou, journal, offset, object_header, THUNK(
+			get_object_full_got_header(iou, journal, offset, object_header, object, closure)));
+}
+
 
 /* for every open journal in *journals, store the journal in *journal_iter and dispatch closure */
 /* closure must expect to be dispatched multiple times; once per journal, and will be freed once at end */
diff --git a/src/journals.h b/src/journals.h
index ca23f77..84c7a0f 100644
--- a/src/journals.h
+++ b/src/journals.h
@@ -33,6 +33,8 @@ THUNK_DECLARE(journal_hash_table_iter_next_object, iou_t *, iou, journal_t **, j
 THUNK_DECLARE(journal_hash_table_for_each, iou_t *, iou, journal_t **, journal, HashItem **, hash_table, uint64_t *, hash_table_size, uint64_t *, iter_bucket, uint64_t *, iter_offset, HashedObjectHeader *, iter_object_header, size_t, iter_object_size, thunk_t *, closure);
 
 THUNK_DECLARE(journal_get_object_header, iou_t *, iou, journal_t **, journal, uint64_t *, offset, ObjectHeader *, object_header, thunk_t *, closure);
+THUNK_DECLARE(journal_get_object, iou_t *, iou, journal_t **, journal, uint64_t *, offset, uint64_t *, size, Object **, object, thunk_t *, closure);
+THUNK_DECLARE(journal_get_object_full, iou_t *, iou, journal_t **, journal, uint64_t *, offset, ObjectHeader *, object_header, Object **, object, thunk_t *, closure);
 THUNK_DECLARE(journals_for_each, journals_t **, journals, journal_t **, journal_iter, thunk_t *, closure);
 
 const char * journal_object_type_str(ObjectType type);
diff --git a/src/verify-hashed-objects.c b/src/verify-hashed-objects.c
new file mode 100644
index 0000000..8d79228
--- /dev/null
+++ b/src/verify-hashed-objects.c
@@ -0,0 +1,323 @@
+/*
+ *  Copyright (C) 2020 - Vito Caputo - <vcaputo@pengaru.com>
+ *
+ *  This program is free software: you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License version 3 as published
+ *  by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <malloc.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdio_ext.h>
+#include <stdlib.h>
+
+#include <zstd.h>
+#include <zstd_errors.h>
+
+#include <iou.h>
+#include <thunk.h>
+
+#include "journals.h"
+#include "machid.h"
+#include "verify-hashed-objects.h"
+
+#include "upstream/journal-def.h"
+#include "upstream/lookup3.h"
+#include "upstream/siphash24.h"
+
+/* This simply loads all hashed objects (field and data objects) and verifies their
+ * hashes against their contents.  It doesn't examine entry item hashes and verify
+ * they match the referenced objects, but maybe it should do that too.  If it adds
+ * that ability, it probably makes sense to rename to verify-hashes.
+ */
+
+/* Hash size bytes @ payload: siphash24 keyed with header->file_id when the journal
+ * sets HEADER_INCOMPATIBLE_KEYED_HASH, jenkins_hash64 otherwise.
+ * (borrowed from systemd)
+ */
+static uint64_t hash(Header *header, void *payload, uint64_t size)
+{
+	if (header->incompatible_flags & HEADER_INCOMPATIBLE_KEYED_HASH)
+		return siphash24(payload, size, header->file_id.bytes);
+
+	return jenkins_hash64(payload, size);
+}
+
+/* Map a zstd error return to a negative errno. (borrowed from systemd) */
+static int zstd_ret_to_errno(size_t ret) {
+	switch (ZSTD_getErrorCode(ret)) {
+	case ZSTD_error_dstSize_tooSmall:
+		return -ENOBUFS;
+	case ZSTD_error_memory_allocation:
+		return -ENOMEM;
+	default:
+		return -EBADMSG;
+	}
+}
+
+
+/* Decompress the zstd frame of src_size bytes @ src into the heap buffer @ *dest, which
+ * is replaced by a larger allocation when too small, storing the decompressed size
+ * @ *dest_size.  *dest remains owned by the caller on success and failure alike, so it
+ * can be reused across calls and freed exactly once.
+ * Returns 0 on success, -errno on failure.
+ */
+static int decompress(int compression, void *src, uint64_t src_size, void **dest, size_t *dest_size)
+{
+	uint64_t	size;
+	ZSTD_DCtx	*dctx;
+	size_t		k;
+
+	assert(src);
+	assert(src_size > 0);
+	assert(dest);
+	assert(dest_size);
+
+	/* only zstd is implemented, fail cleanly on anything else instead of aborting */
+	if (!(compression & OBJECT_COMPRESSED_ZSTD))
+		return -EOPNOTSUPP;
+
+	/* vaguely borrowed from systemd */
+	size = ZSTD_getFrameContentSize(src, src_size);
+	if (size == ZSTD_CONTENTSIZE_ERROR || size == ZSTD_CONTENTSIZE_UNKNOWN)
+		return -EBADMSG;
+
+	if (size > SIZE_MAX)
+		return -E2BIG;
+
+	if (malloc_usable_size(*dest) < size) {
+		free(*dest);
+		*dest = malloc(size);
+		if (!*dest)
+			return -ENOMEM;
+	}
+
+	dctx = ZSTD_createDCtx();
+	if (!dctx)
+		return -ENOMEM;
+
+	ZSTD_inBuffer input = {
+		.src = src,
+		.size = src_size,
+	};
+	ZSTD_outBuffer output = {
+		.dst = *dest,
+		.size = size,
+	};
+
+	k = ZSTD_decompressStream(dctx, &output, &input);
+	ZSTD_freeDCtx(dctx);	/* the context is per-call, release it on every path */
+	if (ZSTD_isError(k))
+		return zstd_ret_to_errno(k);
+
+	/* a frame that didn't yield its advertised content size is corrupt or truncated */
+	if (output.pos != size)
+		return -EBADMSG;
+
+	*dest_size = size;
+
+	return 0;
+}
+
+
+/* Verify the hash stored in the field or data object @ *iter_object against a hash of
+ * its payload, decompressing the payload into *decompressed first when the object is
+ * compressed, then dispatch closure.
+ * Returns -EBADMSG for a malformed object or a hash mismatch.
+ */
+THUNK_DEFINE_STATIC(per_hashed_object, journal_t *, journal, Header *, header, Object **, iter_object, void **, decompressed, thunk_t *, closure)
+{
+	uint64_t	payload_offset, payload_size, stored_hash, h;
+	void		*payload;
+	Object		*o;
+	int		compression;
+
+	assert(iter_object && *iter_object);
+
+	o = *iter_object;
+
+	switch (o->object.type) {
+	case OBJECT_FIELD:
+		payload_offset = offsetof(FieldObject, payload);
+		payload = o->field.payload;
+		stored_hash = o->field.hashed.hash;
+		break;
+	case OBJECT_DATA:
+		payload_offset = offsetof(DataObject, payload);
+		payload = o->data.payload;
+		stored_hash = o->data.hashed.hash;
+		break;
+	default:
+		assert(0);
+		return -EINVAL;	/* keeps NDEBUG builds off the uninitialized locals below */
+	}
+
+	/* an object too small to carry any payload is malformed */
+	if (o->object.size <= payload_offset)
+		return -EBADMSG;
+
+	payload_size = o->object.size - payload_offset;
+
+	/* TODO: this kind of cpu-bound work would benefit from a thread-pool, and it would be
+	 * neat if iou abstracted such a thing as if it were just another iou_op, except
+	 * for execution by worker threads it abstracted, which upon completion would get
+	 * their associated closures dispatched as if it were any other iou_op being completed.
+	 * as-is this work will delay iou_run() from getting called again until the hashing
+	 * and decompression if needed will complete, which may have a serializing effect on
+	 * the otherwise parallel-processed journals.
+	 */
+
+	compression = (o->object.flags & OBJECT_COMPRESSION_MASK);
+	if (compression) {
+		size_t	b_size;
+		int	r;
+
+		r = decompress(compression, payload, payload_size, decompressed, &b_size);
+		if (r < 0)
+			return r;
+
+		/* hash, and on mismatch report, the decompressed contents */
+		payload = *decompressed;
+		payload_size = b_size;
+	}
+
+	h = hash(header, payload, payload_size);
+	if (h != stored_hash) {
+		printf("mismatch %"PRIx64" != %"PRIx64"\ncontents=\"%.*s\"\n",
+			h, stored_hash,
+			(int)payload_size, (char *)payload);
+		return -EBADMSG;
+	}
+
+	return thunk_dispatch(closure);
+}
+
+/* Dispatch closure via thunk_dispatch_keep() while *iter_offset is non-zero, and via
+ * thunk_dispatch() once it reaches zero (presumably freeing the closure on that last dispatch).
+ *
+ * XXX TODO: this should prolly move into journals.[ch] now that it's
+ * in both here and report-entry-arrays.c
+ */
+THUNK_DEFINE_STATIC(per_object_dispatch, uint64_t *, iter_offset, thunk_t *, closure)
+{
+	if (!(*iter_offset))
+		return thunk_dispatch(closure);
+
+	return thunk_dispatch_keep(closure);
+}
+
+
+/* Per-object step of the journal walk: at end of iteration (*iter_offset == 0) release
+ * the reusable buffers, otherwise skip non-hashed objects, or load the hashed object
+ * into *iter_object (grown as needed) and verify it, then advance to the next object
+ * with self dispatched again for it.
+ */
+THUNK_DEFINE_STATIC(per_object, thunk_t *, self, iou_t *, iou, journal_t **, journal, Header *, header, uint64_t *, iter_offset, ObjectHeader *, iter_object_header, Object **, iter_object, void **, decompressed)
+{
+	assert(iter_offset);
+	assert(iter_object_header);
+	assert(iter_object);
+	assert(decompressed);
+
+	if (!*iter_offset) {
+		free(*iter_object);
+		free(*decompressed);
+		*iter_object = NULL;
+		*decompressed = NULL;
+		return 0;
+	}
+
+	/* skip non-hashed objects */
+	if (iter_object_header->type != OBJECT_FIELD && iter_object_header->type != OBJECT_DATA)
+		return	journal_iter_next_object(iou, journal, header, iter_offset, iter_object_header, THUNK(
+				per_object_dispatch(iter_offset, self)));
+
+	if (malloc_usable_size(*iter_object) < iter_object_header->size) {
+		free(*iter_object);
+
+		*iter_object = malloc(iter_object_header->size);
+		if (!*iter_object)
+			return -ENOMEM;
+	}
+
+	return	journal_get_object(iou, journal, iter_offset, &iter_object_header->size, iter_object, THUNK(
+			per_hashed_object(*journal, header, iter_object, decompressed, THUNK(
+				journal_iter_next_object(iou, journal, header, iter_offset, iter_object_header, THUNK(
+					per_object_dispatch(iter_offset, self)))))));
+}
+
+
+/* Start verifying the journal @ *journal_iter: allocate the per_object closure together
+ * with the per-journal iteration state (via THUNK_ALLOC), then load the journal header
+ * and begin iterating its objects, dispatching per_object for each.
+ */
+THUNK_DEFINE_STATIC(per_journal, iou_t *, iou, journal_t **, journal_iter)
+{
+	struct {
+		journal_t	*journal;
+		Header		header;
+		uint64_t	iter_offset;
+		ObjectHeader	iter_object_header;
+		Object		*iter_object;
+		void		*decompressed;
+	} *foo;
+
+	thunk_t		*closure;
+
+	assert(iou);
+	assert(journal_iter);
+
+	closure = THUNK_ALLOC(per_object, (void **)&foo, sizeof(*foo));
+	if (!closure)
+		return -ENOMEM;
+
+	foo->journal = *journal_iter;
+	foo->iter_offset = 0;	/* no indeterminate state for the iterator (assumes 0 means "start" - TODO confirm) */
+	foo->iter_object = NULL;
+	foo->decompressed = NULL;
+
+	return journal_get_header(iou, &foo->journal, &foo->header, THUNK(
+			journal_iter_next_object(iou, &foo->journal, &foo->header, &foo->iter_offset, &foo->iter_object_header, THUNK(
+				per_object_dispatch(&foo->iter_offset, THUNK_INIT(
+					per_object(closure, closure, iou, &foo->journal, &foo->header, &foo->iter_offset, &foo->iter_object_header, &foo->iter_object, &foo->decompressed)))))));
+}
+
+
+/* Verify the hashes of all "hashed objects" (field and data objects) in every journal
+ * opened via journals_open() for the machine id from machid_get().
+ * argc/argv are currently unused.  Returns 0 on success, -errno on failure.
+ */
+int jio_verify_hashed_objects(iou_t *iou, int argc, char *argv[])
+{
+	char		*machid;
+	journals_t	*journals;
+	journal_t	*journal_iter;
+	int		r;
+
+	r = machid_get(iou, &machid, THUNK(
+		journals_open(iou, &machid, O_RDONLY, &journals, THUNK(
+			journals_for_each(&journals, &journal_iter, THUNK(
+				per_journal(iou, &journal_iter)))))));
+	if (r < 0)
+		return r;
+
+	r = iou_run(iou);
+	if (r < 0)
+		return r;
+
+	return 0;
+}
diff --git a/src/verify-hashed-objects.h b/src/verify-hashed-objects.h
new file mode 100644
index 0000000..851e8c5
--- /dev/null
+++ b/src/verify-hashed-objects.h
@@ -0,0 +1,8 @@
+#ifndef _JIO_VERIFY_HASHED_OBJECTS
+#define _JIO_VERIFY_HASHED_OBJECTS
+
+typedef struct iou_t iou_t;
+
+int jio_verify_hashed_objects(iou_t *iou, int argc, char *argv[]);
+
+#endif |
