diff options
author | Vito Caputo <vcaputo@pengaru.com> | 2021-08-23 15:34:12 -0700 |
---|---|---|
committer | Vito Caputo <vcaputo@pengaru.com> | 2021-08-24 00:48:46 -0700 |
commit | db549aa7f63621e1b81d32e59456303c5003b4b9 (patch) | |
tree | 277d51fca2c89e2fdf7002ffee051bcd70b21ef8 /src | |
parent | b53cc8e61a27f948df5f11da07c7c395ebae1dd1 (diff) |
verify-hashed-objects: add `jio verify hashed-objects`
This is currently very hacky and unfinished, but does enough for
some performance comparisons against a zstd-using journalctl --verify
that has been hacked to return early after the first pass.
It's currently rather leaky; the whole per-object-dispatch thingy
is illuminating a thunk_h shortcoming and forcing that
issue to be addressed... soon.
Diffstat (limited to 'src')
-rw-r--r-- | src/Makefile.am | 4 | ||||
-rw-r--r-- | src/jio.c | 19 | ||||
-rw-r--r-- | src/journals.c | 142 | ||||
-rw-r--r-- | src/journals.h | 2 | ||||
-rw-r--r-- | src/verify-hashed-objects.c | 269 | ||||
-rw-r--r-- | src/verify-hashed-objects.h | 8 |
6 files changed, 442 insertions, 2 deletions
diff --git a/src/Makefile.am b/src/Makefile.am index 92f89b8..e0c25d1 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -22,7 +22,9 @@ jio_SOURCES = \ report-tail-waste.c \ report-tail-waste.h \ report-usage.c \ - report-usage.h + report-usage.h \ + verify-hashed-objects.c \ + verify-hashed-objects.h jio_CPPFLAGS = -I@top_srcdir@/thunk_h -I@top_srcdir@/libiou/src jio_LDADD = @top_builddir@/libiou/src/libiou.a @top_builddir@/src/upstream/libupstream.a @@ -24,6 +24,7 @@ #include "report-layout.h" #include "report-tail-waste.h" #include "report-usage.h" +#include "verify-hashed-objects.h" #include "upstream/journal-def.h" @@ -39,7 +40,7 @@ int main(int argc, char *argv[]) int r; if (argc < 2) { - printf("Usage: %s {help,reclaim,report} [subcommand-args]\n", argv[0]); + printf("Usage: %s {help,reclaim,report,verify} [subcommand-args]\n", argv[0]); return 0; } @@ -134,6 +135,22 @@ int main(int argc, char *argv[]) fprintf(stderr, "Unsupported report subcommand: \"%s\"\n", argv[2]); return 1; } + } else if (!strcmp(argv[1], "verify")) { + if (argc < 3) { + printf("Usage: %s verify {hashed-objects}\n", argv[0]); + return 0; + } + + if (!strcmp(argv[2], "hashed-objects")) { + r = jio_verify_hashed_objects(iou, argc, argv); + if (r < 0) { + fprintf(stderr, "failed to verify hashed objects: %s\n", strerror(-r)); + return 1; + } + } else { + fprintf(stderr, "Unsupported verify subcommand: \"%s\"\n", argv[2]); + return 1; + } } else if (!strcmp(argv[1], "version")) { puts("jio version " VERSION); return 0; diff --git a/src/journals.c b/src/journals.c index 8211b6f..05c14f0 100644 --- a/src/journals.c +++ b/src/journals.c @@ -18,6 +18,7 @@ #include <dirent.h> #include <fcntl.h> #include <liburing.h> +#include <stddef.h> #include <stdio.h> #include <string.h> #include <sys/stat.h> @@ -689,6 +690,147 @@ THUNK_DEFINE(journal_get_object_header, iou_t *, iou, journal_t **, journal, uin return 0; } +#define OBJECT_N_ITEMS(_o) \ + ((_o.object.size - offsetof(typeof(_o), items)) / 
sizeof(*_o.items)) + +/* Validate and prepare object loaded via journal_get_object @ object, dispatch closure. */ +THUNK_DEFINE_STATIC(got_object, iou_t *, iou, iou_op_t *, op, uint64_t, size, Object *, object, thunk_t *, closure) +{ + assert(iou); + assert(op); + assert(object); + assert(closure); + + if (op->result < 0) + return op->result; + + if (op->result != size) + return -EINVAL; + + object->object.size = le64toh(object->object.size); + + /* TODO: validation/sanity checks? */ + switch (object->object.type) { + case OBJECT_DATA: + object->data.hashed.hash = le64toh(object->data.hashed.hash); + object->data.hashed.next_hash_offset = le64toh(object->data.hashed.next_hash_offset); + object->data.next_field_offset = le64toh(object->data.next_field_offset); + object->data.entry_offset = le64toh(object->data.entry_offset); + object->data.entry_array_offset = le64toh(object->data.entry_array_offset); + object->data.n_entries = le64toh(object->data.n_entries); + break; + + case OBJECT_FIELD: + object->field.hashed.hash = le64toh(object->field.hashed.hash); + object->field.hashed.next_hash_offset = le64toh(object->field.hashed.next_hash_offset); + object->field.head_data_offset = le64toh(object->field.head_data_offset); + break; + + case OBJECT_ENTRY: + object->entry.seqnum = le64toh(object->entry.seqnum); + object->entry.realtime = le64toh(object->entry.realtime); + object->entry.monotonic = le64toh(object->entry.monotonic); + //object->entry.boot_id + object->entry.xor_hash = le64toh(object->entry.xor_hash); + for (uint64_t i = 0, n_items = OBJECT_N_ITEMS(object->entry); i < n_items; i++) { + object->entry.items[i].object_offset = le64toh(object->entry.items[i].object_offset); + object->entry.items[i].hash; + } + break; + + case OBJECT_DATA_HASH_TABLE: + case OBJECT_FIELD_HASH_TABLE: + for (uint64_t i = 0, n_items = OBJECT_N_ITEMS(object->hash_table); i < n_items; i++) { + object->hash_table.items[i].head_hash_offset = 
le64toh(object->hash_table.items[i].head_hash_offset); + object->hash_table.items[i].tail_hash_offset = le64toh(object->hash_table.items[i].tail_hash_offset); + } + break; + + case OBJECT_ENTRY_ARRAY: + object->entry_array.next_entry_array_offset = le64toh(object->entry_array.next_entry_array_offset); + for (uint64_t i = 0, n_items = OBJECT_N_ITEMS(object->entry_array); i < n_items; i++) + object->entry_array.items[i] = le64toh(object->entry_array.items[i]); + break; + + case OBJECT_TAG: + object->tag.seqnum = le64toh(object->tag.seqnum); + object->tag.epoch = le64toh(object->tag.epoch); + break; + + default: + /* XXX: should probably just ignore unknown types instead, + * but the idea here is to let callers safely assume loaded objects + * have been fully validated and byteswapped as needed. + */ + assert(0); + } + + return thunk_dispatch(closure); +} + + +/* Queue IO on iou for loading an entire object of size *size from *journal @ offset *offset, into *object + * which must already be allocated. + * Registers closure for dispatch on the io when completed. + * + * Note this doesn't allocate space for the object and requires the size be already known, it is the bare + * minimum object loading into pre-allocated space when the size is known, which performs the necessary + * le64toh() swapping of object-specific members before calling the supplied closure. + * + * It's expected that the caller must already retrieve the object's header in a separate step before + * calling this to load the entirety of the object, since the header is needed first to know the size + * for allocating the full object and then loading its contents. Another heavier helper will be provided + * for doing both the header load followed by the entire object load in one convenient step. 
+ */ +THUNK_DEFINE(journal_get_object, iou_t *, iou, journal_t **, journal, uint64_t *, offset, uint64_t *, size, Object **, object, thunk_t *, closure) +{ + iou_op_t *op; + + assert(iou); + assert(journal); + assert(offset); + assert(size); + assert(object && *object); + assert(closure); + + op = iou_op_new(iou); + if (!op) + return -ENOMEM; + + io_uring_prep_read(op->sqe, (*journal)->idx, *object, *size, *offset); + op->sqe->flags = IOSQE_FIXED_FILE; + op_queue(iou, op, THUNK(got_object(iou, op, *size, *object, closure))); + + return 0; +} + + +THUNK_DEFINE_STATIC(get_object_full_got_header, iou_t *, iou, journal_t **, journal, uint64_t *, offset, ObjectHeader *, object_header, Object **, object, thunk_t *, closure) +{ + Object *o; + + o = malloc(object_header->size); + if (!o) + return -ENOMEM; + + *object = o; + + return journal_get_object(iou, journal, offset, &object_header->size, object, closure); +} + + +/* Queue IO on iou for loading an object header into *object_header, which must already be allocated, + * registering a closure to then allocate space for the full object @ *object and queueing IO for loading + * the full object into that space, with closure registered for dispatch once the full object is loaded. + * + * This will leave a newly allocated and populated object @ *object, ready for use. 
+ */ +THUNK_DEFINE(journal_get_object_full, iou_t *, iou, journal_t **, journal, uint64_t *, offset, ObjectHeader *, object_header, Object **, object, thunk_t *, closure) +{ + return journal_get_object_header(iou, journal, offset, object_header, THUNK( + get_object_full_got_header(iou, journal, offset, object_header, object, closure))); +} + /* for every open journal in *journals, store the journal in *journal_iter and dispatch closure */ /* closure must expect to be dispatched multiple times; once per journal, and will be freed once at end */ diff --git a/src/journals.h b/src/journals.h index ca23f77..84c7a0f 100644 --- a/src/journals.h +++ b/src/journals.h @@ -33,6 +33,8 @@ THUNK_DECLARE(journal_hash_table_iter_next_object, iou_t *, iou, journal_t **, j THUNK_DECLARE(journal_hash_table_for_each, iou_t *, iou, journal_t **, journal, HashItem **, hash_table, uint64_t *, hash_table_size, uint64_t *, iter_bucket, uint64_t *, iter_offset, HashedObjectHeader *, iter_object_header, size_t, iter_object_size, thunk_t *, closure); THUNK_DECLARE(journal_get_object_header, iou_t *, iou, journal_t **, journal, uint64_t *, offset, ObjectHeader *, object_header, thunk_t *, closure); +THUNK_DECLARE(journal_get_object, iou_t *, iou, journal_t **, journal, uint64_t *, offset, uint64_t *, size, Object **, object, thunk_t *, closure); +THUNK_DECLARE(journal_get_object_full, iou_t *, iou, journal_t **, journal, uint64_t *, offset, ObjectHeader *, object_header, Object **, object, thunk_t *, closure); THUNK_DECLARE(journals_for_each, journals_t **, journals, journal_t **, journal_iter, thunk_t *, closure); const char * journal_object_type_str(ObjectType type); diff --git a/src/verify-hashed-objects.c b/src/verify-hashed-objects.c new file mode 100644 index 0000000..8d79228 --- /dev/null +++ b/src/verify-hashed-objects.c @@ -0,0 +1,269 @@ +/* + * Copyright (C) 2020 - Vito Caputo - <vcaputo@pengaru.com> + * + * This program is free software: you can redistribute it and/or modify it + * 
under the terms of the GNU General Public License version 3 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <assert.h> +#include <inttypes.h> +#include <malloc.h> +#include <stdint.h> +#include <stdio.h> +#include <stdio_ext.h> + +#include <zstd.h> +#include <zstd_errors.h> + +#include <iou.h> +#include <thunk.h> + +#include "journals.h" +#include "machid.h" +#include "verify-hashed-objects.h" + +#include "upstream/journal-def.h" +#include "upstream/lookup3.h" +#include "upstream/siphash24.h" + +/* This simply loads all hashed objects (field and data objects) and verifies their + * hashes against their contents. It doesn't examine entry item hashes and verify + * they match the referenced objects, but maybe it should do that too. If it adds + * that ability, it probably makes sense to rename to verify-hashes. 
+ */ + +/* borrowed from systemd */ +static uint64_t hash(Header *header, void *payload, uint64_t size) +{ + if (header->incompatible_flags & HEADER_INCOMPATIBLE_KEYED_HASH) + return siphash24(payload, size, header->file_id.bytes); + + return jenkins_hash64(payload, size); +} + +/* borrowed from systemd */ +static int zstd_ret_to_errno(size_t ret) { + switch (ZSTD_getErrorCode(ret)) { + case ZSTD_error_dstSize_tooSmall: + return -ENOBUFS; + case ZSTD_error_memory_allocation: + return -ENOMEM; + default: + return -EBADMSG; + } +} + + +static int decompress(int compression, void *src, uint64_t src_size, void **dest, size_t *dest_size) +{ + uint64_t size; + ZSTD_DCtx *dctx; + + assert(src); + assert(src_size > 0); + assert(dest); + assert(dest_size); + assert(compression & OBJECT_COMPRESSED_ZSTD); + +/* vaguely borrowed from systemd */ + size = ZSTD_getFrameContentSize(src, src_size); + if (size == ZSTD_CONTENTSIZE_ERROR || size == ZSTD_CONTENTSIZE_UNKNOWN) + return -EBADMSG; + + if (size > SIZE_MAX) + return -E2BIG; + + if (malloc_usable_size(*dest) < size) { + free(*dest); + *dest = malloc(size); + if (!*dest) + return -ENOMEM; + } + + dctx = ZSTD_createDCtx(); + if (!dctx) { + free(*dest); + return -ENOMEM; + } + + ZSTD_inBuffer input = { + .src = src, + .size = src_size, + }; + ZSTD_outBuffer output = { + .dst = *dest, + .size = size, + }; + + size_t k = ZSTD_decompressStream(dctx, &output, &input); + if (ZSTD_isError(k)) { + return zstd_ret_to_errno(k); + } + assert(output.pos >= size); + + *dest_size = size; + + return 0; +} + + +THUNK_DEFINE_STATIC(per_hashed_object, journal_t *, journal, Header *, header, Object **, iter_object, void **, decompressed, thunk_t *, closure) +{ + int compression; + uint64_t payload_size, h; + void *payload; + Object *o; + + assert(iter_object && *iter_object); + + o = *iter_object; + + switch (o->object.type) { + case OBJECT_FIELD: + payload_size = o->object.size - offsetof(FieldObject, payload), + payload = o->field.payload; + 
break; + case OBJECT_DATA: + payload_size = o->object.size - offsetof(DataObject, payload), + payload = o->data.payload; + break; + default: + assert(0); + } + + /* TODO: hash payload, compare to hash.. + * this kind of cpu-bound work would benefit from a thread-pool, and it would be + * neat if iou abstracted such a thing as if it were just another iou_op, except + * for execution by worker threads it abstracted, which upon completion would get + * their associated closures dispatched as if it were any other iou_op being completed. + * as-is this work will delay iou_run() from getting called again until the hashing + * and decompression if needed will complete, which may have a serializing effect on + * the otherwise parallel-processed journals. + */ + + compression = (o->object.flags & OBJECT_COMPRESSION_MASK); + if (compression) { + int r; + size_t b_size; + + r = decompress(compression, payload, payload_size, decompressed, &b_size); + if (r < 0) + return r; + + h = hash(header, *decompressed, b_size); + } else { + h = hash(header, payload, payload_size); + } + + if (h != o->data.hashed.hash) { + printf("mismatch %"PRIx64" != %"PRIx64"\ncontents=\"%.*s\"\n", + h, o->data.hashed.hash, + (int)payload_size, payload); + return -EBADMSG; + } + + return thunk_dispatch(closure); +} + +/* XXX TODO: this should prolly move into journals.[ch] now that it's + * in both here and report-entry-arrays.c + */ +THUNK_DEFINE_STATIC(per_object_dispatch, uint64_t *, iter_offset, thunk_t *, closure) +{ + if (!(*iter_offset)) + return thunk_dispatch(closure); + + return thunk_dispatch_keep(closure); +} + + +THUNK_DEFINE_STATIC(per_object, thunk_t *, self, iou_t *, iou, journal_t **, journal, Header *, header, uint64_t *, iter_offset, ObjectHeader *, iter_object_header, Object **, iter_object, void **, decompressed) +{ + assert(iter_offset); + assert(iter_object_header); + assert(iter_object); + + if (!*iter_offset) { + free(*iter_object); + free(*decompressed); + *iter_object = 
*decompressed = NULL; + return 0; + } + + /* skip non-hashed objects */ + if (iter_object_header->type != OBJECT_FIELD && iter_object_header->type != OBJECT_DATA) + return journal_iter_next_object(iou, journal, header, iter_offset, iter_object_header, THUNK( + per_object_dispatch(iter_offset, self))); + + if (malloc_usable_size(*iter_object) < iter_object_header->size) { + free(*iter_object); + + *iter_object = malloc(iter_object_header->size); + if (!*iter_object) + return -ENOMEM; + } + + return journal_get_object(iou, journal, iter_offset, &iter_object_header->size, iter_object, THUNK( + per_hashed_object(*journal, header, iter_object, decompressed, THUNK( + journal_iter_next_object(iou, journal, header, iter_offset, iter_object_header, THUNK( + per_object_dispatch(iter_offset, self))))))); +} + + +THUNK_DEFINE_STATIC(per_journal, iou_t *, iou, journal_t **, journal_iter) +{ + struct { + journal_t *journal; + Header header; + uint64_t iter_offset; + ObjectHeader iter_object_header; + Object *iter_object; + void *decompressed; + } *foo; + + thunk_t *closure; + + assert(iou); + assert(journal_iter); + + closure = THUNK_ALLOC(per_object, (void **)&foo, sizeof(*foo)); + foo->journal = *journal_iter; + foo->iter_object = foo->decompressed = NULL; + + return journal_get_header(iou, &foo->journal, &foo->header, THUNK( + journal_iter_next_object(iou, &foo->journal, &foo->header, &foo->iter_offset, &foo->iter_object_header, THUNK( + per_object_dispatch(&foo->iter_offset, THUNK_INIT( + per_object(closure, closure, iou, &foo->journal, &foo->header, &foo->iter_offset, &foo->iter_object_header, &foo->iter_object, &foo->decompressed))))))); +} + + +/* verify the hashes of all "hashed objects" (field and data objects) */ +int jio_verify_hashed_objects(iou_t *iou, int argc, char *argv[]) +{ + char *machid; + journals_t *journals; + journal_t *journal_iter; + int r; + + r = machid_get(iou, &machid, THUNK( + journals_open(iou, &machid, O_RDONLY, &journals, THUNK( + 
journals_for_each(&journals, &journal_iter, THUNK( + per_journal(iou, &journal_iter))))))); + if (r < 0) + return r; + + r = iou_run(iou); + if (r < 0) + return r; + + return 0; +} diff --git a/src/verify-hashed-objects.h b/src/verify-hashed-objects.h new file mode 100644 index 0000000..851e8c5 --- /dev/null +++ b/src/verify-hashed-objects.h @@ -0,0 +1,8 @@ +#ifndef _JIO_VERIFY_HASHED_OBJECTS +#define _JIO_VERIFY_HASHED_OBJECTS + +typedef struct iou_t iou_t; + +int jio_verify_hashed_objects(iou_t *iou, int argc, char *argv[]); + +#endif |