Skip to content

Commit f4ed750

Browse files
committed
doc edits
1 parent c912551 commit f4ed750

3 files changed

Lines changed: 104 additions & 27 deletions

File tree

c/examples/json_struct_metadata.c

Lines changed: 15 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
#include <tskit.h>
66

77
// these are properties of the ``json+struct`` codec, documented in tskit
8-
#define JSON_STRUCT_CODEC_HEADER_SIZE 21
8+
#define JSON_STRUCT_HEADER_SIZE 21
99

1010
const uint8_t json_struct_codec_magic[4] = { 'J', 'B', 'L', 'B' };
1111
const uint8_t json_struct_codec_version = 1;
@@ -50,7 +50,7 @@ json_struct_codec_get_components(uint8_t *metadata, tsk_size_t metadata_length,
5050
if (metadata == NULL || json == NULL || json_length == NULL || binary == NULL
5151
|| binary_length == NULL)
5252
errx(EXIT_FAILURE, "bad parameter value.");
53-
if (metadata_length < JSON_STRUCT_CODEC_HEADER_SIZE)
53+
if (metadata_length < JSON_STRUCT_HEADER_SIZE)
5454
errx(EXIT_FAILURE, "metadata truncated.");
5555
if (memcmp(metadata, json_struct_codec_magic, sizeof(json_struct_codec_magic)) != 0)
5656
errx(EXIT_FAILURE, "bad magic bytes.");
@@ -61,33 +61,30 @@ json_struct_codec_get_components(uint8_t *metadata, tsk_size_t metadata_length,
6161

6262
uint64_t json_length_u64 = load_u64_le(metadata + 5);
6363
uint64_t binary_length_u64 = load_u64_le(metadata + 13);
64-
if (json_length_u64 > UINT64_MAX - (uint64_t) JSON_STRUCT_CODEC_HEADER_SIZE)
64+
if (json_length_u64 > UINT64_MAX - (uint64_t) JSON_STRUCT_HEADER_SIZE)
6565
errx(EXIT_FAILURE, "invalid length.");
6666

6767
// determine the number of padding bytes and do more safety checks
68-
uint64_t header_and_json_length
69-
= (uint64_t) JSON_STRUCT_CODEC_HEADER_SIZE + json_length_u64;
70-
uint64_t padding_length = (8 - (header_and_json_length & 0x07)) % 8;
71-
uint64_t header_and_json_and_padding_length
72-
= header_and_json_length + padding_length;
73-
if (binary_length_u64 > UINT64_MAX - header_and_json_and_padding_length)
68+
uint64_t length = (uint64_t) JSON_STRUCT_HEADER_SIZE + json_length_u64;
69+
uint64_t padding_length = (8 - (length & 0x07)) % 8;
70+
length += padding_length;
71+
if (binary_length_u64 > UINT64_MAX - length)
7472
errx(EXIT_FAILURE, "invalid length.");
7573

76-
uint64_t total_length = header_and_json_and_padding_length + binary_length_u64;
77-
if ((uint64_t) metadata_length != total_length)
74+
length += binary_length_u64;
75+
if ((uint64_t) metadata_length != length)
7876
errx(EXIT_FAILURE, "unexpected size.");
7977

80-
uint8_t *padding_start = metadata + JSON_STRUCT_CODEC_HEADER_SIZE + json_length_u64;
81-
for (uint64_t padding_index = 0; padding_index < padding_length; ++padding_index)
82-
if (*(padding_start + padding_index) != 0)
78+
uint8_t *padding_start = metadata + JSON_STRUCT_HEADER_SIZE + json_length_u64;
79+
for (uint64_t j = 0; j < padding_length; ++j)
80+
if (*(padding_start + j) != 0)
8381
errx(EXIT_FAILURE, "padding bytes are nonzero.");
8482

8583
// the structure of the codec data seems valid; return components
86-
*json = metadata + JSON_STRUCT_CODEC_HEADER_SIZE;
84+
*json = metadata + JSON_STRUCT_HEADER_SIZE;
8785
*json_length = (tsk_size_t) json_length_u64;
8886

89-
*binary
90-
= metadata + JSON_STRUCT_CODEC_HEADER_SIZE + json_length_u64 + padding_length;
87+
*binary = metadata + JSON_STRUCT_HEADER_SIZE + json_length_u64 + padding_length;
9188
*binary_length = (tsk_size_t) binary_length_u64;
9289
}
9390

@@ -99,7 +96,7 @@ json_struct_codec_create_buffer(const uint8_t *json, tsk_size_t json_length,
9996
tsk_size_t *buffer_length)
10097
{
10198
// figure out the total length of the codec's data and allocate the buffer for it
102-
tsk_size_t header_length = JSON_STRUCT_CODEC_HEADER_SIZE;
99+
tsk_size_t header_length = JSON_STRUCT_HEADER_SIZE;
103100
tsk_size_t padding_length = (8 - ((header_length + json_length) & 0x07)) % 8;
104101
tsk_size_t total_length
105102
= header_length + json_length + padding_length + binary_length;

docs/c-api.rst

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -959,27 +959,27 @@ parse metadata using an external JSON library, and for
959959
struct-encoded metadata the values can be directly unpacked.
960960
Examples of both can be found in
961961
`the SLiM code <https://messerlab.github.com/slim/>`_.
962-
(In Python, tskit automatically decodes both JSON and binary
963-
metadata and provides it as Python-data-typed metadata,
964-
just as for other codecs.)
965962

966-
The :ref:`"json+struct" <sec_metadata_codecs_jsonstruct>`_
963+
The :ref:`"json+struct" <sec_metadata_codecs_jsonstruct>`
967964
metadata codec is a little less straightforward to use,
968965
so we provide here an example of how to write to it
969-
and read from it in C. See :ref:`sec_metadata_codes_jsonstruct`
966+
and read from it in C. See :ref:`sec_metadata_codecs_jsonstruct`
970967
for details of how the metadata is encoded.
968+
(In Python, tskit automatically decodes both JSON and binary
969+
metadata and provides it as Python-data-typed metadata,
970+
just as for other codecs.)
971971

972972
The structure of this example is as follows:
973973

974974
1. Values specific to the metadata's header (e.g., the magic bytes ``JBLB``).
975-
2. Functions that encode/decode `uint_64t`, used to store the lengths
975+
2. Functions that encode/decode ``uint64_t``, used to store the lengths
976976
of the two components in the header.
977977
3. A method to "read" the metadata: really, to get pointers to the
978978
json and struct components.
979-
4. A method to write the metadata, again just given pointers to
979+
4. A method to "write" the metadata, again just given pointers to
980980
and lengths of the two components.
981981
5. The program itself just round-trips a very simple chunk of metadata,
982-
consisting of the JSON "`{"a": 1}`" and some binary `uint_8t` bytes ("`1234`").
982+
consisting of the JSON "``{"a": 1}``" and some binary ``uint8_t`` bytes ("``1234``").
983983

984984
.. literalinclude:: ../c/examples/json_struct_metadata.c
985985
:language: c
@@ -993,3 +993,16 @@ into two buffers (to then be decoded, for instance).
993993
However, that would double the memory footprint,
994994
and since this codec is intended for large metadata,
995995
we did not use that approach in this example.
996+
997+
Along the same lines, it is worth noting that this example does make a copy of
998+
the JSON and binary data when writing, in ``json_struct_codec_create_buffer()``,
999+
which doubles the memory footprint at that point, and adds the
1000+
overhead of copying the data. A more efficient approach would be to calculate
1001+
the buffer length needed for the codec’s data, allocate the buffer with that
1002+
length, and then generate the necessary JSON and binary metadata directly into
1003+
that buffer. This would require the metadata-generating code to be more
1004+
closely entwined with the code for handling the json+struct codec header and
1005+
padding bytes, and so we have chosen not to adopt that approach here, for
1006+
pedagogical purposes; but if your use of this codec will involve large
1007+
metadata, such an approach is recommended.
1008+

docs/metadata.md

Lines changed: 68 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -527,7 +527,7 @@ of `B`, `H`, `I`, `L` or `Q` which have the same meaning as in the numeric
527527
types above. `L` is the default. As an example:
528528

529529
```
530-
{"type": "array", {"items": {"type":"number", "binaryFormat":"h"}}, "arrayLengthFormat":"B"}
530+
{"type": "array", "items": {"type":"number", "binaryFormat":"h"}, "arrayLengthFormat":"B"}
531531
```
532532

533533
This will result in an array of 2-byte integers, preceded by a single-byte array length.
@@ -555,6 +555,73 @@ As a special case under the `struct` codec, the top-level type of metadata can b
555555
union of `object` and `null`. Set `"type": ["object", "null"]`. Properties should
556556
be defined as normal, and will be ignored if the metadata is `None`.
557557

558+
(sec_metadata_codecs_jsonstruct)=
559+
560+
### `json+struct`
561+
562+
An additional codec provides the ability to store *both* JSON and binary-encoded data.
563+
This is provided for the case where we want to store some arbitrary metadata
564+
(as JSON) along with a relatively large amount of data (as binary, for efficiency).
565+
For instance, we might want to record a raster map of the sampled area
566+
along with a few pieces of generic information (e.g., the name of the area).
567+
568+
The metadata schema for "json+struct" metadata specifies both
569+
a JSON metadata schema and a struct metadata schema.
570+
Each entry in the metadata is encoded with either the JSON or the struct codec.
571+
Here is a simple example:
572+
573+
```{code-cell}
574+
schema = {
575+
"codec": "json+struct",
576+
"json": {
577+
"type": "object",
578+
"properties": {
579+
"label": {"type": "string"},
580+
"id": {"type": "number"},
581+
},
582+
"required": ["label"],
583+
},
584+
"struct": {
585+
"type": "object",
586+
"properties": {
587+
"values": {
588+
"type": "array",
589+
"arrayLengthFormat": "B",
590+
"items": {"type": "number", "binaryFormat": "i"},
591+
},
592+
},
593+
},
594+
}
595+
ms = tskit.MetadataSchema(schema)
596+
row = {"label": "alpha", "id": 7, "values": [5, 10, 2, 12]}
597+
encoded = ms.validate_and_encode_row(row)
598+
print("Encoded:", encoded)
599+
print("Decoded:", ms.decode_row(encoded))
600+
```
601+
602+
This encodes two things in JSON: a label and an ID number,
603+
and then an array of integers in binary (using the ``struct`` codec).
604+
If the array of integers is large, this could result in
605+
much better performance than storing the array as JSON.
606+
607+
608+
#### Binary representation
609+
610+
The underlying structure of the `json+struct` codec is as follows.
611+
(If you're not writing out data in this format,
612+
you don't need to worry about this.)
613+
(1) some magic bytes;
614+
(2) a version number;
615+
(3) the length of the JSON in bytes (a little-endian 64-bit unsigned integer);
616+
(4) the length of the binary (struct) data in bytes (a little-endian 64-bit unsigned integer);
617+
(5) the JSON data;
618+
(6) zeroed "padding" bytes to bring the start of the binary section
619+
into 8-byte alignment;
620+
(7) the binary data.
621+
The structure of the binary data is specified using the "struct" portion
622+
of the metadata schema.
623+
624+
558625
(sec_metadata_schema_examples)=
559626

560627
## Schema examples

0 commit comments

Comments
 (0)