From 3d0f794b2fd7905070e01055525eaf221422f0ac Mon Sep 17 00:00:00 2001 From: peter Date: Fri, 13 Mar 2026 09:01:41 -0700 Subject: [PATCH 1/3] docs for json+struct codec padding bytes in json+struct codec --- c/tests/test_core.c | 156 ---------------------------------- c/tskit/core.c | 96 +-------------------- c/tskit/core.h | 29 ++----- docs/metadata.md | 69 ++++++++++++++- python/tests/test_metadata.py | 36 ++++++-- python/tskit/metadata.py | 19 ++++- 6 files changed, 119 insertions(+), 286 deletions(-) diff --git a/c/tests/test_core.c b/c/tests/test_core.c index 9d8d9f5a7d..761a0e8773 100644 --- a/c/tests/test_core.c +++ b/c/tests/test_core.c @@ -83,161 +83,6 @@ test_generate_uuid(void) CU_ASSERT_STRING_NOT_EQUAL(uuid, other_uuid); } -static void -set_u64_le(uint8_t *dest, uint64_t value) -{ - dest[0] = (uint8_t) (value & 0xFF); - dest[1] = (uint8_t) ((value >> 8) & 0xFF); - dest[2] = (uint8_t) ((value >> 16) & 0xFF); - dest[3] = (uint8_t) ((value >> 24) & 0xFF); - dest[4] = (uint8_t) ((value >> 32) & 0xFF); - dest[5] = (uint8_t) ((value >> 40) & 0xFF); - dest[6] = (uint8_t) ((value >> 48) & 0xFF); - dest[7] = (uint8_t) ((value >> 56) & 0xFF); -} - -static void -test_json_struct_metadata_get_blob(void) -{ - int ret; - char metadata[128]; - char *json; - tsk_size_t json_buffer_length; - char *blob; - tsk_size_t blob_length; - uint8_t *bytes; - tsk_size_t metadata_length; - size_t header_length; - size_t json_length; - size_t payload_length; - size_t total_length; - char json_payload[] = "{\"a\":1}"; - uint8_t binary_payload[] = { 0x01, 0x02, 0x03, 0x04 }; - uint8_t empty_payload[] = { 0 }; - - bytes = (uint8_t *) metadata; - header_length = 4 + 1 + 8 + 8; - json_length = strlen(json_payload); - payload_length = sizeof(binary_payload); - total_length = header_length + json_length + payload_length; - CU_ASSERT_FATAL(total_length <= sizeof(metadata)); - memset(metadata, 0, sizeof(metadata)); - bytes[0] = 'J'; - bytes[1] = 'B'; - bytes[2] = 'L'; - bytes[3] = 'B'; - bytes[4] = 1; - set_u64_le(bytes + 5, (uint64_t) json_length); - set_u64_le(bytes + 13, (uint64_t) payload_length); - memcpy(bytes + header_length, json_payload, json_length); - memcpy(bytes + header_length + json_length, binary_payload, payload_length); - metadata_length = (tsk_size_t) total_length; - ret = tsk_json_struct_metadata_get_blob( - metadata, metadata_length, &json, &json_buffer_length, &blob, &blob_length); - CU_ASSERT_EQUAL(ret, 0); - CU_ASSERT_PTR_EQUAL(json, (char *) bytes + header_length); - CU_ASSERT_EQUAL(json_buffer_length, (tsk_size_t) json_length); - if (json_length > 0) { - CU_ASSERT_EQUAL(memcmp(json, json_payload, json_length), 0); - } - CU_ASSERT_PTR_EQUAL(blob, bytes + header_length + json_length); - CU_ASSERT_EQUAL(blob_length, (tsk_size_t) payload_length); - CU_ASSERT_EQUAL(memcmp(blob, binary_payload, payload_length), 0); - - payload_length = 0; - total_length = header_length + json_length + payload_length; - CU_ASSERT_FATAL(total_length <= sizeof(metadata)); - set_u64_le(bytes + 13, (uint64_t) payload_length); - metadata_length = (tsk_size_t) total_length; - ret = tsk_json_struct_metadata_get_blob( - metadata, metadata_length, &json, &json_buffer_length, &blob, &blob_length); - CU_ASSERT_EQUAL(ret, 0); - CU_ASSERT_PTR_EQUAL(json, (char *) bytes + header_length); - CU_ASSERT_EQUAL(json_buffer_length, (tsk_size_t) json_length); - CU_ASSERT_EQUAL(blob_length, (tsk_size_t) payload_length); - CU_ASSERT_PTR_EQUAL(blob, bytes + header_length + json_length); - - json_length = 0; - payload_length = sizeof(empty_payload); - total_length = header_length + json_length + payload_length; - CU_ASSERT_FATAL(total_length <= sizeof(metadata)); - set_u64_le(bytes + 5, (uint64_t) json_length); - set_u64_le(bytes + 13, (uint64_t) payload_length); - memcpy(bytes + header_length + json_length, empty_payload, payload_length); - metadata_length = (tsk_size_t) total_length; - ret = tsk_json_struct_metadata_get_blob( - metadata, metadata_length, &json, &json_buffer_length, &blob, &blob_length); - CU_ASSERT_EQUAL(ret, 0); - CU_ASSERT_PTR_EQUAL(json, (char *) bytes + header_length); - CU_ASSERT_EQUAL(json_buffer_length, (tsk_size_t) json_length); - CU_ASSERT_EQUAL(blob_length, (tsk_size_t) payload_length); - CU_ASSERT_PTR_EQUAL(blob, bytes + header_length + json_length); - CU_ASSERT_EQUAL(memcmp(blob, empty_payload, payload_length), 0); - - blob = NULL; - blob_length = 0; - json = NULL; - json_buffer_length = 0; - metadata_length = header_length - 1; - ret = tsk_json_struct_metadata_get_blob( - metadata, metadata_length, &json, &json_buffer_length, &blob, &blob_length); - CU_ASSERT_EQUAL(ret, TSK_ERR_JSON_STRUCT_METADATA_TRUNCATED); - - metadata_length = (tsk_size_t) total_length; - bytes[0] = 'X'; - ret = tsk_json_struct_metadata_get_blob( - metadata, metadata_length, &json, &json_buffer_length, &blob, &blob_length); - CU_ASSERT_EQUAL(ret, TSK_ERR_JSON_STRUCT_METADATA_BAD_MAGIC); - bytes[0] = 'J'; - - bytes[4] = 2; - ret = tsk_json_struct_metadata_get_blob( - metadata, metadata_length, &json, &json_buffer_length, &blob, &blob_length); - CU_ASSERT_EQUAL(ret, TSK_ERR_JSON_STRUCT_METADATA_BAD_VERSION); - bytes[4] = 1; - - metadata_length = (tsk_size_t) (total_length - 1); - ret = tsk_json_struct_metadata_get_blob( - metadata, metadata_length, &json, &json_buffer_length, &blob, &blob_length); - CU_ASSERT_EQUAL(ret, TSK_ERR_JSON_STRUCT_METADATA_TRUNCATED); - - ret = tsk_json_struct_metadata_get_blob( - NULL, metadata_length, &json, &json_buffer_length, &blob, &blob_length); - CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); - ret = tsk_json_struct_metadata_get_blob( - metadata, metadata_length, NULL, &json_buffer_length, &blob, &blob_length); - CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); - ret = tsk_json_struct_metadata_get_blob( - metadata, metadata_length, &json, NULL, &blob, &blob_length); - CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); - ret = tsk_json_struct_metadata_get_blob( - metadata, metadata_length, &json, &json_buffer_length, NULL, &blob_length); - CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); - ret = tsk_json_struct_metadata_get_blob( - metadata, metadata_length, &json, &json_buffer_length, &blob, NULL); - CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); - - memset(metadata, 0, sizeof(metadata)); - bytes[0] = 'J'; - bytes[1] = 'B'; - bytes[2] = 'L'; - bytes[3] = 'B'; - bytes[4] = 1; - metadata_length = (tsk_size_t) header_length; - - set_u64_le(bytes + 5, UINT64_MAX - (uint64_t) header_length + 1); - set_u64_le(bytes + 13, 0); - ret = tsk_json_struct_metadata_get_blob( - metadata, metadata_length, &json, &json_buffer_length, &blob, &blob_length); - CU_ASSERT_EQUAL(ret, TSK_ERR_JSON_STRUCT_METADATA_INVALID_LENGTH); - - set_u64_le(bytes + 5, 8); - set_u64_le(bytes + 13, UINT64_MAX - (uint64_t) (header_length + 8) + 1); - ret = tsk_json_struct_metadata_get_blob( - metadata, metadata_length, &json, &json_buffer_length, &blob, &blob_length); - CU_ASSERT_EQUAL(ret, TSK_ERR_JSON_STRUCT_METADATA_INVALID_LENGTH); -} - static void test_double_round(void) { @@ -808,7 +653,6 @@ main(int argc, char **argv) { "test_strerror", test_strerror }, { "test_strerror_kastore", test_strerror_kastore }, { "test_generate_uuid", test_generate_uuid }, - { "test_json_struct_metadata_get_blob", test_json_struct_metadata_get_blob }, { "test_double_round", test_double_round }, { "test_blkalloc", test_blkalloc }, { "test_unknown_time", test_unknown_time }, diff --git a/c/tskit/core.c b/c/tskit/core.c index 66f8cbd0ef..91e684c52d 100644 --- a/c/tskit/core.c +++ b/c/tskit/core.c @@ -32,10 +32,7 @@ #include #include -#define UUID_NUM_BYTES 16 -#define TSK_JSON_BINARY_HEADER_SIZE 21 - -static const uint8_t _tsk_json_binary_magic[4] = { 'J', 'B', 'L', 'B' }; +#define UUID_NUM_BYTES 16 #if defined(_WIN32) @@ -98,22 +95,6 @@ get_random_bytes(uint8_t *buf) #endif -static uint64_t -tsk_load_u64_le(const uint8_t *p) -{ - uint64_t value; - - value = (uint64_t) p[0]; - value |= (uint64_t) p[1] << 8; - value |= (uint64_t) p[2] << 16; - value |= (uint64_t) p[3] << 24; - value |= (uint64_t) p[4] << 32; - value |= (uint64_t) p[5] << 40; - value |= (uint64_t) p[6] << 48; - value |= (uint64_t) p[7] << 56; - return value; -} - /* Generate a new UUID4 using a system-generated source of randomness. * Note that this function writes a NULL terminator to the end of this * string, so that the total length of the buffer must be 37 bytes. @@ -141,65 +122,6 @@ tsk_generate_uuid(char *dest, int TSK_UNUSED(flags)) return ret; } -int -tsk_json_struct_metadata_get_blob(char *metadata, tsk_size_t metadata_length, - char **json, tsk_size_t *json_length, char **blob, tsk_size_t *blob_length) -{ - int ret; - uint8_t version; - uint64_t json_length_u64; - uint64_t binary_length_u64; - uint64_t header_and_json_length; - uint64_t total_length; - uint8_t *bytes; - char *blob_start; - char *json_start; - - if (metadata == NULL || json == NULL || json_length == NULL || blob == NULL - || blob_length == NULL) { - ret = tsk_trace_error(TSK_ERR_BAD_PARAM_VALUE); - goto out; - } - bytes = (uint8_t *) metadata; - if (metadata_length < TSK_JSON_BINARY_HEADER_SIZE) { - ret = tsk_trace_error(TSK_ERR_JSON_STRUCT_METADATA_TRUNCATED); - goto out; - } - if (memcmp(bytes, _tsk_json_binary_magic, sizeof(_tsk_json_binary_magic)) != 0) { - ret = tsk_trace_error(TSK_ERR_JSON_STRUCT_METADATA_BAD_MAGIC); - goto out; - } - version = bytes[4]; - if (version != 1) { - ret = tsk_trace_error(TSK_ERR_JSON_STRUCT_METADATA_BAD_VERSION); - goto out; - } - json_length_u64 = tsk_load_u64_le(bytes + 5); - binary_length_u64 = tsk_load_u64_le(bytes + 13); - if (json_length_u64 > UINT64_MAX - (uint64_t) TSK_JSON_BINARY_HEADER_SIZE) { - ret = tsk_trace_error(TSK_ERR_JSON_STRUCT_METADATA_INVALID_LENGTH); - goto out; - } - header_and_json_length = (uint64_t) TSK_JSON_BINARY_HEADER_SIZE + json_length_u64; - if (binary_length_u64 > UINT64_MAX - header_and_json_length) { - ret = tsk_trace_error(TSK_ERR_JSON_STRUCT_METADATA_INVALID_LENGTH); - goto out; - } - total_length = header_and_json_length + binary_length_u64; - if ((uint64_t) metadata_length < total_length) { - ret = tsk_trace_error(TSK_ERR_JSON_STRUCT_METADATA_TRUNCATED); - goto out; - } - json_start = (char *) bytes + TSK_JSON_BINARY_HEADER_SIZE; - blob_start = (char *) bytes + TSK_JSON_BINARY_HEADER_SIZE + json_length_u64; - *json = json_start; - *json_length = (tsk_size_t) json_length_u64; - *blob = blob_start; - *blob_length = (tsk_size_t) binary_length_u64; - ret = 0; -out: - return ret; -} static const char * tsk_strerror_internal(int err) { @@ -267,22 +189,6 @@ tsk_strerror_internal(int err) ret = "An incompatible type for a column was found in the file. " "(TSK_ERR_BAD_COLUMN_TYPE)"; break; - case TSK_ERR_JSON_STRUCT_METADATA_BAD_MAGIC: - ret = "JSON binary struct metadata does not begin with the expected " - "magic bytes. (TSK_ERR_JSON_STRUCT_METADATA_BAD_MAGIC)"; - break; - case TSK_ERR_JSON_STRUCT_METADATA_TRUNCATED: - ret = "JSON binary struct metadata is shorter than the expected size. " - "(TSK_ERR_JSON_STRUCT_METADATA_TRUNCATED)"; - break; - case TSK_ERR_JSON_STRUCT_METADATA_INVALID_LENGTH: - ret = "A length field in the JSON binary struct metadata header is invalid. " - "(TSK_ERR_JSON_STRUCT_METADATA_INVALID_LENGTH)"; - break; - case TSK_ERR_JSON_STRUCT_METADATA_BAD_VERSION: - ret = "JSON binary struct metadata uses an unsupported version number. " - "(TSK_ERR_JSON_STRUCT_METADATA_BAD_VERSION)"; - break; /* Out of bounds errors */ case TSK_ERR_BAD_OFFSET: diff --git a/c/tskit/core.h b/c/tskit/core.h index 2964e3d8f1..f751d58b42 100644 --- a/c/tskit/core.h +++ b/c/tskit/core.h @@ -309,26 +309,6 @@ not found in the file. An unsupported type was provided for a column in the file. */ #define TSK_ERR_BAD_COLUMN_TYPE -105 - -/** -The JSON binary struct metadata does not begin with the expected magic bytes. -*/ -#define TSK_ERR_JSON_STRUCT_METADATA_BAD_MAGIC -106 - -/** -The JSON binary struct metadata is shorter than the expected size. -*/ -#define TSK_ERR_JSON_STRUCT_METADATA_TRUNCATED -107 - -/** -A length field in the JSON binary struct metadata header is invalid. -*/ -#define TSK_ERR_JSON_STRUCT_METADATA_INVALID_LENGTH -108 - -/** -The JSON binary struct metadata uses an unsupported version number. -*/ -#define TSK_ERR_JSON_STRUCT_METADATA_BAD_VERSION -109 /** @} */ /** @@ -1136,10 +1116,11 @@ int tsk_generate_uuid(char *dest, int flags); @brief Extract the binary payload from ``json+struct`` encoded metadata. @rst -Metadata produced by the JSONStructCodec consists of a fixed-size -header followed by canonical JSON bytes and an optional binary payload. This helper -validates the framing, returning pointers to the embedded JSON and binary sections -without copying. +Metadata produced by the JSONStructCodec consists of a fixed-size header followed +by canonical JSON bytes, a variable number of padding bytes (which should be zero) +to bring the length to a multiple of 8 bytes for alignment, and finally an optional +binary payload. This helper validates the framing, returning pointers to the +embedded JSON and binary sections without copying. The output pointers reference memory owned by the caller and remain valid only while the original metadata buffer is alive. diff --git a/docs/metadata.md b/docs/metadata.md index 915108c000..8fb5853f57 100644 --- a/docs/metadata.md +++ b/docs/metadata.md @@ -527,7 +527,7 @@ of `B`, `H`, `I`, `L` or `Q` which have the same meaning as in the numeric types above. `L` is the default. As an example: ``` -{"type": "array", {"items": {"type":"number", "binaryFormat":"h"}}, "arrayLengthFormat":"B"} +{"type": "array", "items": {"type":"number", "binaryFormat":"h"}, "arrayLengthFormat":"B"} ``` Will result in an array of 2 byte integers, prepended by a single-byte array-length. @@ -555,6 +555,73 @@ As a special case under the `struct` codec, the top-level type of metadata can b union of `object` and `null`. Set `"type": ["object", "null"]`. Properties should be defined as normal, and will be ignored if the metadata is `None`. +(sec_metadata_codecs_jsonstruct)= + +### `json+struct` + +An additional codec provides the ability to store *both* JSON and binary-encoded data. +This is provided for the case where we want to store some arbitrary metadata +(as JSON) along with a relatively large amount of data (as binary, for efficiency). +For instance, we might want to record a raster map of the sampled area +along with a few pieces of generic information (e.g., the name of the area). + +The metadata schema for "json+struct" metadata basically just specifies both +a JSON metadata schema and a struct metadata schema. +Each entry in the metadata is encoded with either the JSON or the struct codec. +Here is a simple example: + +```{code-cell} +schema = { + "codec": "json+struct", + "json": { + "type": "object", + "properties": { + "label": {"type": "string"}, + "id": {"type": "number"}, + }, + "required": ["label"], + }, + "struct": { + "type": "object", + "properties": { + "values": { + "type": "array", + "arrayLengthFormat": "B", + "items": {"type": "number", "binaryFormat": "i"}, + }, + }, + }, +} +ms = tskit.MetadataSchema(schema) +row = {"label": "alpha", "id": 7, "values": [5, 10, 2, 12]} +encoded = ms.validate_and_encode_row(row) +print("Encoded:", encoded) +print("Decoded:", ms.decode_row(encoded)) +``` + +This encodes two things in JSON: a label and an ID number, +and then an array of integers in binary (using the ``struct`` codec). +If the array of integers is large, this could result in +much better performance. + + +#### Binary representation + +The underlying structure of the JSON+struct codec is as follows. +(If you're not writing out data in this format, +you don't need to worry about this.) +(1) some magic bytes; +(2) a version number; +(3) the length of the JSON in bytes; +(4) the length of the binary (struct) data in bytes; +(5) the JSON data; +(6) zero-ed "padding" bytes to bring the start of the binary section + into 8-byte alignment; +(7) the binary data. +The structure of the binary data is specified using the "struct" portion +of the metadata schema. + + (sec_metadata_schema_examples)= ## Schema examples diff --git a/python/tests/test_metadata.py b/python/tests/test_metadata.py index 31adb2b1b3..e7ef4bcb69 100644 --- a/python/tests/test_metadata.py +++ b/python/tests/test_metadata.py @@ -649,7 +649,7 @@ def test_disallow_duplicate_keys(self): ): tskit.MetadataSchema(schema) - def test_round_trip_with_struct_and_json(self): + def schema_with_blobs(self, num_blobs): schema = { "codec": "json+struct", "json": { @@ -662,14 +662,36 @@ def test_round_trip_with_struct_and_json(self): }, "struct": { "type": "object", - "properties": {"blob": {"type": "integer", "binaryFormat": "i"}}, + "properties": {}, }, } - ms = tskit.MetadataSchema(schema) - row = {"label": "alpha", "count": 7, "blob": 5} - encoded = ms.validate_and_encode_row(row) - out = ms.decode_row(encoded) - assert out == row + for j in range(num_blobs): + schema["struct"]["properties"][f"b{j}"] = { + "type": "integer", + "binaryFormat": "i", + } + return tskit.MetadataSchema(schema) + + @pytest.mark.parametrize("k", (0, 1, 5, 1001)) + def test_round_trip_with_struct_and_json(self, k): + ms = self.schema_with_blobs(k) + ms0 = self.schema_with_blobs(0) + bytes_per_blob = len(struct.pack("i", 0)) + for s in [ + "", + "abc", + "superfragilisticexpialodocious", + " " * 1000 + "foo" + " " * 1000, + ]: + row = {"label": s, "count": 7} + encoded0 = ms0.validate_and_encode_row(row) + row.update({f"b{j}": j for j in range(k)}) + encoded = ms.validate_and_encode_row(row) + out = ms.decode_row(encoded) + assert out == row + # validate byte alignment + assert len(encoded) - len(encoded0) == k * bytes_per_blob + assert len(encoded0) % 8 == 0 def test_json_defaults_applied(self): schema = { diff --git a/python/tskit/metadata.py b/python/tskit/metadata.py index 73ecf37b82..0fdd315dfd 100644 --- a/python/tskit/metadata.py +++ b/python/tskit/metadata.py @@ -201,6 +201,12 @@ class JSONStructCodec(AbstractMetadataCodec): The codec expects a metadata schema with separate ``json`` and ``struct`` subschemas and produces a single dict containing the union of the keys from those subschemas after decoding. + + The structure of the encoded metadata is as follows: first, a fixed-size + header that contains the number of bytes for both ``json`` and ``struct`` + portions; next, the json; then a variable number of zeroed padding bytes + that brings the length to a multiple of 8 for alignment; and finally + the struct-encoded binary portion. """ MAGIC = b"JBLB" @@ -294,7 +300,8 @@ def encode(self, obj: Any) -> bytes: header = self._HDR.pack( self.MAGIC, self.VERSION, len(json_bytes), len(blob_bytes) ) - return header + json_bytes + blob_bytes + padding_bytes = bytes((-(len(header) + len(json_bytes))) % 8) + return header + json_bytes + padding_bytes + blob_bytes def decode(self, encoded: bytes) -> Any: if len(encoded) >= self._HDR.size and encoded[:4] == self.MAGIC: @@ -302,12 +309,18 @@ def decode(self, encoded: bytes) -> Any: if version != self.VERSION: raise ValueError("Unsupported json+struct version") start = self._HDR.size - if jlen > len(encoded) - start or blen > len(encoded) - start - jlen: + padding_length = (-(start + jlen)) % 8 + if ( + jlen > len(encoded) - start + or blen > len(encoded) - start - jlen - padding_length + ): raise ValueError( "Invalid json+struct payload: declared lengths exceed buffer size" ) json_bytes = encoded[start : start + jlen] - blob_bytes = encoded[start + jlen : start + jlen + blen] + blob_bytes = encoded[ + start + jlen + padding_length : start + jlen + padding_length + blen + ] json_data = self.json_codec.decode(json_bytes) struct_data = self.struct_codec.decode(blob_bytes) overlap = set(json_data).intersection(struct_data) From dcd00c1363b6085752950b9616929dd25f611772 Mon Sep 17 00:00:00 2001 From: peter Date: Sat, 21 Mar 2026 20:33:47 -0700 Subject: [PATCH 2/3] remove docs edits from this pr --- docs/metadata.md | 69 +----------------------------------------------- 1 file changed, 1 insertion(+), 68 deletions(-) diff --git a/docs/metadata.md b/docs/metadata.md index 8fb5853f57..915108c000 100644 --- a/docs/metadata.md +++ b/docs/metadata.md @@ -527,7 +527,7 @@ of `B`, `H`, `I`, `L` or `Q` which have the same meaning as in the numeric types above. `L` is the default. As an example: ``` -{"type": "array", "items": {"type":"number", "binaryFormat":"h"}, "arrayLengthFormat":"B"} +{"type": "array", {"items": {"type":"number", "binaryFormat":"h"}}, "arrayLengthFormat":"B"} ``` Will result in an array of 2 byte integers, prepended by a single-byte array-length. @@ -555,73 +555,6 @@ As a special case under the `struct` codec, the top-level type of metadata can b union of `object` and `null`. Set `"type": ["object", "null"]`. Properties should be defined as normal, and will be ignored if the metadata is `None`. -(sec_metadata_codecs_jsonstruct)= - -### `json+struct` - -An additional codec provides the ability to store *both* JSON and binary-encoded data. -This is provided for the case where we want to store some arbitrary metadata -(as JSON) along with a relatively large amount of data (as binary, for efficiency). -For instance, we might want to record a raster map of the sampled area -along with a few pieces of generic information (e.g., the name of the area). - -The metadata schema for "json+struct" metadata basically just specifies both -a JSON metadata schema and a struct metadata schema. -Each entry in the metadata is encoded with either the JSON or the struct codec. -Here is a simple example: - -```{code-cell} -schema = { - "codec": "json+struct", - "json": { - "type": "object", - "properties": { - "label": {"type": "string"}, - "id": {"type": "number"}, - }, - "required": ["label"], - }, - "struct": { - "type": "object", - "properties": { - "values": { - "type": "array", - "arrayLengthFormat": "B", - "items": {"type": "number", "binaryFormat": "i"}, - }, - }, - }, -} -ms = tskit.MetadataSchema(schema) -row = {"label": "alpha", "id": 7, "values": [5, 10, 2, 12]} -encoded = ms.validate_and_encode_row(row) -print("Encoded:", encoded) -print("Decoded:", ms.decode_row(encoded)) -``` - -This encodes two things in JSON: a label and an ID number, -and then an array of integers in binary (using the ``struct`` codec). -If the array of integers is large, this could result in -much better performance. - - -#### Binary representation - -The underlying structure of the JSON+struct codec is as follows. -(If you're not writing out data in this format, -you don't need to worry about this.) -(1) some magic bytes; -(2) a version number; -(3) the length of the JSON in bytes; -(4) the length of the binary (struct) data in bytes; -(5) the JSON data; -(6) zero-ed "padding" bytes to bring the start of the binary section - into 8-byte alignment; -(7) the binary data. -The structure of the binary data is specified using the "struct" portion -of the metadata schema. - - (sec_metadata_schema_examples)= ## Schema examples From 169f2aed2217f84637fc431e6a90d579ecef84d2 Mon Sep 17 00:00:00 2001 From: peter Date: Sat, 21 Mar 2026 21:20:39 -0700 Subject: [PATCH 3/3] array test --- python/tests/test_metadata.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/python/tests/test_metadata.py b/python/tests/test_metadata.py index e7ef4bcb69..80fd2e3ce4 100644 --- a/python/tests/test_metadata.py +++ b/python/tests/test_metadata.py @@ -672,8 +672,22 @@ def schema_with_blobs(self, num_blobs): } return tskit.MetadataSchema(schema) + def test_round_trip_with_struct_and_json(self): + schema = dict(self.schema_with_blobs(2).schema) + schema["struct"]["properties"]["numbers"] = { + "type": "array", + "arrayLengthFormat": "H", + "items": {"type": "number", "binaryFormat": "i"}, + } + ms = tskit.MetadataSchema(schema) + for v in [[], [0, 2, 12], [5] * 1000]: + row = {"label": "abcdef xyz", "count": 7, "b0": 123, "b1": 0, "numbers": v} + encoded = ms.validate_and_encode_row(row) + out = ms.decode_row(encoded) + assert out == row + @pytest.mark.parametrize("k", (0, 1, 5, 1001)) - def test_round_trip_with_struct_and_json(self, k): + def test_byte_alignment(self, k): ms = self.schema_with_blobs(k) ms0 = self.schema_with_blobs(0) bytes_per_blob = len(struct.pack("i", 0))