diff --git a/c/tests/test_core.c b/c/tests/test_core.c index 9d8d9f5a7d..761a0e8773 100644 --- a/c/tests/test_core.c +++ b/c/tests/test_core.c @@ -83,161 +83,6 @@ test_generate_uuid(void) CU_ASSERT_STRING_NOT_EQUAL(uuid, other_uuid); } -static void -set_u64_le(uint8_t *dest, uint64_t value) -{ - dest[0] = (uint8_t) (value & 0xFF); - dest[1] = (uint8_t) ((value >> 8) & 0xFF); - dest[2] = (uint8_t) ((value >> 16) & 0xFF); - dest[3] = (uint8_t) ((value >> 24) & 0xFF); - dest[4] = (uint8_t) ((value >> 32) & 0xFF); - dest[5] = (uint8_t) ((value >> 40) & 0xFF); - dest[6] = (uint8_t) ((value >> 48) & 0xFF); - dest[7] = (uint8_t) ((value >> 56) & 0xFF); -} - -static void -test_json_struct_metadata_get_blob(void) -{ - int ret; - char metadata[128]; - char *json; - tsk_size_t json_buffer_length; - char *blob; - tsk_size_t blob_length; - uint8_t *bytes; - tsk_size_t metadata_length; - size_t header_length; - size_t json_length; - size_t payload_length; - size_t total_length; - char json_payload[] = "{\"a\":1}"; - uint8_t binary_payload[] = { 0x01, 0x02, 0x03, 0x04 }; - uint8_t empty_payload[] = { 0 }; - - bytes = (uint8_t *) metadata; - header_length = 4 + 1 + 8 + 8; - json_length = strlen(json_payload); - payload_length = sizeof(binary_payload); - total_length = header_length + json_length + payload_length; - CU_ASSERT_FATAL(total_length <= sizeof(metadata)); - memset(metadata, 0, sizeof(metadata)); - bytes[0] = 'J'; - bytes[1] = 'B'; - bytes[2] = 'L'; - bytes[3] = 'B'; - bytes[4] = 1; - set_u64_le(bytes + 5, (uint64_t) json_length); - set_u64_le(bytes + 13, (uint64_t) payload_length); - memcpy(bytes + header_length, json_payload, json_length); - memcpy(bytes + header_length + json_length, binary_payload, payload_length); - metadata_length = (tsk_size_t) total_length; - ret = tsk_json_struct_metadata_get_blob( - metadata, metadata_length, &json, &json_buffer_length, &blob, &blob_length); - CU_ASSERT_EQUAL(ret, 0); - CU_ASSERT_PTR_EQUAL(json, (char *) bytes + header_length); - CU_ASSERT_EQUAL(json_buffer_length, (tsk_size_t) json_length); - if (json_length > 0) { - CU_ASSERT_EQUAL(memcmp(json, json_payload, json_length), 0); - } - CU_ASSERT_PTR_EQUAL(blob, bytes + header_length + json_length); - CU_ASSERT_EQUAL(blob_length, (tsk_size_t) payload_length); - CU_ASSERT_EQUAL(memcmp(blob, binary_payload, payload_length), 0); - - payload_length = 0; - total_length = header_length + json_length + payload_length; - CU_ASSERT_FATAL(total_length <= sizeof(metadata)); - set_u64_le(bytes + 13, (uint64_t) payload_length); - metadata_length = (tsk_size_t) total_length; - ret = tsk_json_struct_metadata_get_blob( - metadata, metadata_length, &json, &json_buffer_length, &blob, &blob_length); - CU_ASSERT_EQUAL(ret, 0); - CU_ASSERT_PTR_EQUAL(json, (char *) bytes + header_length); - CU_ASSERT_EQUAL(json_buffer_length, (tsk_size_t) json_length); - CU_ASSERT_EQUAL(blob_length, (tsk_size_t) payload_length); - CU_ASSERT_PTR_EQUAL(blob, bytes + header_length + json_length); - - json_length = 0; - payload_length = sizeof(empty_payload); - total_length = header_length + json_length + payload_length; - CU_ASSERT_FATAL(total_length <= sizeof(metadata)); - set_u64_le(bytes + 5, (uint64_t) json_length); - set_u64_le(bytes + 13, (uint64_t) payload_length); - memcpy(bytes + header_length + json_length, empty_payload, payload_length); - metadata_length = (tsk_size_t) total_length; - ret = tsk_json_struct_metadata_get_blob( - metadata, metadata_length, &json, &json_buffer_length, &blob, &blob_length); - CU_ASSERT_EQUAL(ret, 0); - CU_ASSERT_PTR_EQUAL(json, (char *) bytes + header_length); - CU_ASSERT_EQUAL(json_buffer_length, (tsk_size_t) json_length); - CU_ASSERT_EQUAL(blob_length, (tsk_size_t) payload_length); - CU_ASSERT_PTR_EQUAL(blob, bytes + header_length + json_length); - CU_ASSERT_EQUAL(memcmp(blob, empty_payload, payload_length), 0); - - blob = NULL; - blob_length = 0; - json = NULL; - json_buffer_length = 0; - metadata_length = header_length - 1; - ret = tsk_json_struct_metadata_get_blob( - metadata, metadata_length, &json, &json_buffer_length, &blob, &blob_length); - CU_ASSERT_EQUAL(ret, TSK_ERR_JSON_STRUCT_METADATA_TRUNCATED); - - metadata_length = (tsk_size_t) total_length; - bytes[0] = 'X'; - ret = tsk_json_struct_metadata_get_blob( - metadata, metadata_length, &json, &json_buffer_length, &blob, &blob_length); - CU_ASSERT_EQUAL(ret, TSK_ERR_JSON_STRUCT_METADATA_BAD_MAGIC); - bytes[0] = 'J'; - - bytes[4] = 2; - ret = tsk_json_struct_metadata_get_blob( - metadata, metadata_length, &json, &json_buffer_length, &blob, &blob_length); - CU_ASSERT_EQUAL(ret, TSK_ERR_JSON_STRUCT_METADATA_BAD_VERSION); - bytes[4] = 1; - - metadata_length = (tsk_size_t) (total_length - 1); - ret = tsk_json_struct_metadata_get_blob( - metadata, metadata_length, &json, &json_buffer_length, &blob, &blob_length); - CU_ASSERT_EQUAL(ret, TSK_ERR_JSON_STRUCT_METADATA_TRUNCATED); - - ret = tsk_json_struct_metadata_get_blob( - NULL, metadata_length, &json, &json_buffer_length, &blob, &blob_length); - CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); - ret = tsk_json_struct_metadata_get_blob( - metadata, metadata_length, NULL, &json_buffer_length, &blob, &blob_length); - CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); - ret = tsk_json_struct_metadata_get_blob( - metadata, metadata_length, &json, NULL, &blob, &blob_length); - CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); - ret = tsk_json_struct_metadata_get_blob( - metadata, metadata_length, &json, &json_buffer_length, NULL, &blob_length); - CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); - ret = tsk_json_struct_metadata_get_blob( - metadata, metadata_length, &json, &json_buffer_length, &blob, NULL); - CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); - - memset(metadata, 0, sizeof(metadata)); - bytes[0] = 'J'; - bytes[1] = 'B'; - bytes[2] = 'L'; - bytes[3] = 'B'; - bytes[4] = 1; - metadata_length = (tsk_size_t) header_length; - - set_u64_le(bytes + 5, UINT64_MAX - (uint64_t) header_length + 1); - set_u64_le(bytes + 13, 0); - ret = tsk_json_struct_metadata_get_blob( - metadata, metadata_length, &json, &json_buffer_length, &blob, &blob_length); - CU_ASSERT_EQUAL(ret, TSK_ERR_JSON_STRUCT_METADATA_INVALID_LENGTH); - - set_u64_le(bytes + 5, 8); - set_u64_le(bytes + 13, UINT64_MAX - (uint64_t) (header_length + 8) + 1); - ret = tsk_json_struct_metadata_get_blob( - metadata, metadata_length, &json, &json_buffer_length, &blob, &blob_length); - CU_ASSERT_EQUAL(ret, TSK_ERR_JSON_STRUCT_METADATA_INVALID_LENGTH); -} - static void test_double_round(void) { @@ -808,7 +653,6 @@ main(int argc, char **argv) { "test_strerror", test_strerror }, { "test_strerror_kastore", test_strerror_kastore }, { "test_generate_uuid", test_generate_uuid }, - { "test_json_struct_metadata_get_blob", test_json_struct_metadata_get_blob }, { "test_double_round", test_double_round }, { "test_blkalloc", test_blkalloc }, { "test_unknown_time", test_unknown_time }, diff --git a/c/tskit/core.c b/c/tskit/core.c index 66f8cbd0ef..91e684c52d 100644 --- a/c/tskit/core.c +++ b/c/tskit/core.c @@ -32,10 +32,7 @@ #include #include -#define UUID_NUM_BYTES 16 -#define TSK_JSON_BINARY_HEADER_SIZE 21 - -static const uint8_t _tsk_json_binary_magic[4] = { 'J', 'B', 'L', 'B' }; +#define UUID_NUM_BYTES 16 #if defined(_WIN32) @@ -98,22 +95,6 @@ get_random_bytes(uint8_t *buf) #endif -static uint64_t -tsk_load_u64_le(const uint8_t *p) -{ - uint64_t value; - - value = (uint64_t) p[0]; - value |= (uint64_t) p[1] << 8; - value |= (uint64_t) p[2] << 16; - value |= (uint64_t) p[3] << 24; - value |= (uint64_t) p[4] << 32; - value |= (uint64_t) p[5] << 40; - value |= (uint64_t) p[6] << 48; - value |= (uint64_t) p[7] << 56; - return value; -} - /* Generate a new UUID4 using a system-generated source of randomness. * Note that this function writes a NULL terminator to the end of this * string, so that the total length of the buffer must be 37 bytes. @@ -141,65 +122,6 @@ tsk_generate_uuid(char *dest, int TSK_UNUSED(flags)) return ret; } -int -tsk_json_struct_metadata_get_blob(char *metadata, tsk_size_t metadata_length, - char **json, tsk_size_t *json_length, char **blob, tsk_size_t *blob_length) -{ - int ret; - uint8_t version; - uint64_t json_length_u64; - uint64_t binary_length_u64; - uint64_t header_and_json_length; - uint64_t total_length; - uint8_t *bytes; - char *blob_start; - char *json_start; - - if (metadata == NULL || json == NULL || json_length == NULL || blob == NULL - || blob_length == NULL) { - ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); - goto out; - } - bytes = (uint8_t *) metadata; - if (metadata_length < TSK_JSON_BINARY_HEADER_SIZE) { - ret = tsk_trace_error(TSK_ERR_JSON_STRUCT_METADATA_TRUNCATED); - goto out; - } - if (memcmp(bytes, _tsk_json_binary_magic, sizeof(_tsk_json_binary_magic)) != 0) { - ret = tsk_trace_error(TSK_ERR_JSON_STRUCT_METADATA_BAD_MAGIC); - goto out; - } - version = bytes[4]; - if (version != 1) { - ret = tsk_trace_error(TSK_ERR_JSON_STRUCT_METADATA_BAD_VERSION); - goto out; - } - json_length_u64 = tsk_load_u64_le(bytes + 5); - binary_length_u64 = tsk_load_u64_le(bytes + 13); - if (json_length_u64 > UINT64_MAX - (uint64_t) TSK_JSON_BINARY_HEADER_SIZE) { - ret = tsk_trace_error(TSK_ERR_JSON_STRUCT_METADATA_INVALID_LENGTH); - goto out; - } - header_and_json_length = (uint64_t) TSK_JSON_BINARY_HEADER_SIZE + json_length_u64; - if (binary_length_u64 > UINT64_MAX - header_and_json_length) { - ret = tsk_trace_error(TSK_ERR_JSON_STRUCT_METADATA_INVALID_LENGTH); - goto out; - } - total_length = header_and_json_length + binary_length_u64; - if ((uint64_t) metadata_length < total_length) { - ret = tsk_trace_error(TSK_ERR_JSON_STRUCT_METADATA_TRUNCATED); - goto out; - } - json_start = (char *) bytes + TSK_JSON_BINARY_HEADER_SIZE; - blob_start = (char *) bytes + TSK_JSON_BINARY_HEADER_SIZE + json_length_u64; - *json = json_start; - *json_length = (tsk_size_t) json_length_u64; - *blob = blob_start; - *blob_length = (tsk_size_t) binary_length_u64; - ret = 0; -out: - return ret; -} static const char * tsk_strerror_internal(int err) { @@ -267,22 +189,6 @@ tsk_strerror_internal(int err) ret = "An incompatible type for a column was found in the file. " "(TSK_ERR_BAD_COLUMN_TYPE)"; break; - case TSK_ERR_JSON_STRUCT_METADATA_BAD_MAGIC: - ret = "JSON binary struct metadata does not begin with the expected " - "magic bytes. (TSK_ERR_JSON_STRUCT_METADATA_BAD_MAGIC)"; - break; - case TSK_ERR_JSON_STRUCT_METADATA_TRUNCATED: - ret = "JSON binary struct metadata is shorter than the expected size. " - "(TSK_ERR_JSON_STRUCT_METADATA_TRUNCATED)"; - break; - case TSK_ERR_JSON_STRUCT_METADATA_INVALID_LENGTH: - ret = "A length field in the JSON binary struct metadata header is invalid. " - "(TSK_ERR_JSON_STRUCT_METADATA_INVALID_LENGTH)"; - break; - case TSK_ERR_JSON_STRUCT_METADATA_BAD_VERSION: - ret = "JSON binary struct metadata uses an unsupported version number. " - "(TSK_ERR_JSON_STRUCT_METADATA_BAD_VERSION)"; - break; /* Out of bounds errors */ case TSK_ERR_BAD_OFFSET: diff --git a/c/tskit/core.h b/c/tskit/core.h index 2964e3d8f1..f751d58b42 100644 --- a/c/tskit/core.h +++ b/c/tskit/core.h @@ -309,26 +309,6 @@ not found in the file. An unsupported type was provided for a column in the file. */ #define TSK_ERR_BAD_COLUMN_TYPE -105 - -/** -The JSON binary struct metadata does not begin with the expected magic bytes. -*/ -#define TSK_ERR_JSON_STRUCT_METADATA_BAD_MAGIC -106 - -/** -The JSON binary struct metadata is shorter than the expected size. -*/ -#define TSK_ERR_JSON_STRUCT_METADATA_TRUNCATED -107 - -/** -A length field in the JSON binary struct metadata header is invalid. -*/ -#define TSK_ERR_JSON_STRUCT_METADATA_INVALID_LENGTH -108 - -/** -The JSON binary struct metadata uses an unsupported version number. -*/ -#define TSK_ERR_JSON_STRUCT_METADATA_BAD_VERSION -109 /** @} */ /** @@ -1136,10 +1116,11 @@ int tsk_generate_uuid(char *dest, int flags); @brief Extract the binary payload from ``json+struct`` encoded metadata. @rst -Metadata produced by the JSONStructCodec consists of a fixed-size -header followed by canonical JSON bytes and an optional binary payload. This helper -validates the framing, returning pointers to the embedded JSON and binary sections -without copying. +Metadata produced by the JSONStructCodec consists of a fixed-size header followed +by canonical JSON bytes, a variable number of padding bytes (which should be zero) +to bring the length to a multiple of 8 bytes for alignment, and finally an optional +binary payload. This helper validates the framing, returning pointers to the +embedded JSON and binary sections without copying. The output pointers reference memory owned by the caller and remain valid only while the original metadata buffer is alive. diff --git a/python/tests/test_metadata.py b/python/tests/test_metadata.py index 31adb2b1b3..80fd2e3ce4 100644 --- a/python/tests/test_metadata.py +++ b/python/tests/test_metadata.py @@ -649,7 +649,7 @@ def test_disallow_duplicate_keys(self): ): tskit.MetadataSchema(schema) - def test_round_trip_with_struct_and_json(self): + def schema_with_blobs(self, num_blobs): schema = { "codec": "json+struct", "json": { @@ -662,14 +662,50 @@ def test_round_trip_with_struct_and_json(self): }, "struct": { "type": "object", - "properties": {"blob": {"type": "integer", "binaryFormat": "i"}}, + "properties": {}, }, } + for j in range(num_blobs): + schema["struct"]["properties"][f"b{j}"] = { + "type": "integer", + "binaryFormat": "i", + } + return tskit.MetadataSchema(schema) + + def test_round_trip_with_struct_and_json(self): + schema = dict(self.schema_with_blobs(2).schema) + schema["struct"]["properties"]["numbers"] = { + "type": "array", + "arrayLengthFormat": "H", + "items": {"type": "number", "binaryFormat": "i"}, + } ms = tskit.MetadataSchema(schema) - row = {"label": "alpha", "count": 7, "blob": 5} - encoded = ms.validate_and_encode_row(row) - out = ms.decode_row(encoded) - assert out == row + for v in [[], [0, 2, 12], [5] * 1000]: + row = {"label": "abcdef xyz", "count": 7, "b0": 123, "b1": 0, "numbers": v} + encoded = ms.validate_and_encode_row(row) + out = ms.decode_row(encoded) + assert out == row + + @pytest.mark.parametrize("k", (0, 1, 5, 1001)) + def test_byte_alignment(self, k): + ms = self.schema_with_blobs(k) + ms0 = self.schema_with_blobs(0) + bytes_per_blob = len(struct.pack("i", 0)) + for s in [ + "", + "abc", + "superfragilisticexpialodocious", + " " * 1000 + "foo" + " " * 1000, + ]: + row = {"label": s, "count": 7} + encoded0 = ms0.validate_and_encode_row(row) + row.update({f"b{j}": j for j in range(k)}) + encoded = ms.validate_and_encode_row(row) + out = ms.decode_row(encoded) + assert out == row + # validate byte alignment + assert len(encoded) - len(encoded0) == k * bytes_per_blob + assert len(encoded0) % 8 == 0 def test_json_defaults_applied(self): schema = { diff --git a/python/tskit/metadata.py b/python/tskit/metadata.py index 73ecf37b82..0fdd315dfd 100644 --- a/python/tskit/metadata.py +++ b/python/tskit/metadata.py @@ -201,6 +201,12 @@ class JSONStructCodec(AbstractMetadataCodec): The codec expects a metadata schema with separate ``json`` and ``struct`` subschemas and produces a single dict containing the union of the keys from those subschemas after decoding. + + The structure of the encoded metadata is as follows: first, a fixed-size + header that contains the number of bytes for both ``json`` and ``struct`` + portions; next, the json; then a variable number of zeroed padding bytes + that brings the length to a multiple of 8 for alignment; and finally + the struct-encoded binary portion. """ MAGIC = b"JBLB" @@ -294,7 +300,8 @@ def encode(self, obj: Any) -> bytes: header = self._HDR.pack( self.MAGIC, self.VERSION, len(json_bytes), len(blob_bytes) ) - return header + json_bytes + blob_bytes + padding_bytes = bytes((-(len(header) + len(json_bytes))) % 8) + return header + json_bytes + padding_bytes + blob_bytes def decode(self, encoded: bytes) -> Any: if len(encoded) >= self._HDR.size and encoded[:4] == self.MAGIC: @@ -302,12 +309,18 @@ def decode(self, encoded: bytes) -> Any: if version != self.VERSION: raise ValueError("Unsupported json+struct version") start = self._HDR.size - if jlen > len(encoded) - start or blen > len(encoded) - start - jlen: + padding_length = (-(start + jlen)) % 8 + if ( + jlen > len(encoded) - start + or blen > len(encoded) - start - jlen - padding_length + ): raise ValueError( "Invalid json+struct payload: declared lengths exceed buffer size" ) json_bytes = encoded[start : start + jlen] - blob_bytes = encoded[start + jlen : start + jlen + blen] + blob_bytes = encoded[ + start + jlen + padding_length : start + jlen + padding_length + blen + ] json_data = self.json_codec.decode(json_bytes) struct_data = self.struct_codec.decode(blob_bytes) overlap = set(json_data).intersection(struct_data)