Qcodes/src/qcodes/dataset/data_set.py at 8f4baa8089bf0abf5e6e93441790b1bdff94c966 · microsoft/Qcodes · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
from __future__ import annotations

import importlib
import json
import logging
import tempfile
import time
import uuid
from dataclasses import dataclass
from pathlib import Path
from queue import Queue
from threading import Thread
from typing import TYPE_CHECKING, Any, Literal

import numpy
from tqdm.auto import trange

import qcodes
from qcodes.dataset.data_set_protocol import (
    SPECS,
    BaseDataSet,
    CompletedError,
    DataSetProtocol,
    ParameterData,
    SpecsOrInterDeps,
    values_type,
)
from qcodes.dataset.descriptions.dependencies import InterDependencies_
from qcodes.dataset.descriptions.rundescriber import RunDescriber
from qcodes.dataset.descriptions.versioning.converters import new_to_old, old_to_new
from qcodes.dataset.descriptions.versioning.v0 import InterDependencies
from qcodes.dataset.experiment_settings import get_default_experiment_id
from qcodes.dataset.export_config import (
    DataExportType,
)
from qcodes.dataset.guids import filter_guids_by_parts, generate_guid, parse_guid
from qcodes.dataset.linked_datasets.links import Link, links_to_str, str_to_links
from qcodes.dataset.sqlite.connection import ConnectionPlus, atomic, atomic_transaction
from qcodes.dataset.sqlite.database import (
    conn_from_dbpath_or_conn,
    connect,
    get_DB_location,
)
from qcodes.dataset.sqlite.queries import (
    _check_if_table_found,
    _get_result_table_name_by_guid,
    _query_guids_from_run_spec,
    add_data_to_dynamic_columns,
    add_parameter,
    completed,
    create_run,
    get_completed_timestamp_from_run_id,
    get_data_by_tag_and_table_name,
    get_experiment_name_from_experiment_id,
    get_guid_from_expid_and_counter,
    get_guid_from_run_id,
    get_metadata_from_run_id,
    get_parameter_data,
    get_parent_dataset_links,
    get_run_description,
    get_run_timestamp_from_run_id,
    get_runid_from_guid,
    get_sample_name_from_experiment_id,
    mark_run_complete,
    remove_trigger,
    run_exists,
    set_run_timestamp,
    update_parent_datasets,
    update_run_description,
)
from qcodes.dataset.sqlite.query_helpers import (
    VALUE,
    VALUES,
    insert_many_values,
    length,
    one,
    select_one_where,
)
from qcodes.utils import (
    NumpyJSONEncoder,
)

from .data_set_cache import DataSetCacheWithDBBackend
from .data_set_in_memory import DataSetInMem, load_from_file
from .descriptions.versioning import serialization as serial
from .exporters.export_info import ExportInfo
from .exporters.export_to_csv import dataframe_to_csv
from .exporters.export_to_pandas import (
    load_to_concatenated_dataframe,
    load_to_dataframe_dict,
)
from .exporters.export_to_xarray import (
    load_to_xarray_dataarray_dict,
    load_to_xarray_dataset,
    xarray_to_h5netcdf_with_complex_numbers,
)
from .subscriber import _Subscriber

if TYPE_CHECKING:
    from collections.abc import Callable, Mapping, Sequence

    import pandas as pd
    import xarray as xr

    from qcodes.dataset.descriptions.param_spec import ParamSpec, ParamSpecBase
    from qcodes.dataset.descriptions.versioning.rundescribertypes import Shapes
    from qcodes.parameters import ParameterBase


log = logging.getLogger(__name__)


# TODO: storing parameters in separate table as an extension (dropping
# the column parametenrs would be much nicer

# TODO: metadata split between well known columns and maybe something else is
# not such a good idea. The problem is if we allow for specific columns then
# how do the user/us know which are metatadata?  I THINK the only sane solution
# is to store JSON in a column called metadata

# TODO: fixix  a subset of metadata that we define well known (and create them)
# i.e. no dynamic creation of metadata columns, but add stuff to
# a json inside a 'metadata' column


class _BackgroundWriter(Thread):
    """
    Write the results from the DataSet's dataqueue in a new thread
    """

    def __init__(self, queue: Queue[Any], conn: ConnectionPlus):
        super().__init__(daemon=True)
        self.queue = queue
        self.path = conn.path_to_dbfile
        self.keep_writing = True

    def run(self) -> None:
        self.conn = connect(self.path)

        while self.keep_writing:
            item = self.queue.get()
            if item["keys"] == "stop":
                self.keep_writing = False
                self.conn.close()
            elif item["keys"] == "finalize":
                _WRITERS[self.path].active_datasets.remove(item["values"])
            else:
                self.write_results(item["keys"], item["values"], item["table_name"])
            self.queue.task_done()

    def write_results(
        self, keys: Sequence[str], values: Sequence[list[Any]], table_name: str
    ) -> None:
        insert_many_values(self.conn, table_name, keys, values)

    def shutdown(self) -> None:
        """
        Send a termination signal to the data writing queue, wait for the
        queue to empty and the thread to join.

        If the background writing thread is not alive this will do nothing.
        """
        if self.is_alive():
            self.queue.put({"keys": "stop", "values": []})
            self.queue.join()
            self.join()


@dataclass
class _WriterStatus:
    bg_writer: _BackgroundWriter | None
    write_in_background: bool | None
    data_write_queue: Queue[Any]
    active_datasets: set[int]


_WRITERS: dict[str, _WriterStatus] = {}


class DataSet(BaseDataSet):
    # the "persistent traits" are the attributes/properties of the DataSet
    # that are NOT tied to the representation of the DataSet in any particular
    # database
    persistent_traits = (
        "name",
        "guid",
        "number_of_results",
        "parameters",
        "paramspecs",
        "exp_name",
        "sample_name",
        "completed",
        "snapshot",
        "run_timestamp_raw",
        "description",
        "completed_timestamp_raw",
        "metadata",
        "dependent_parameters",
        "parent_dataset_links",
        "captured_run_id",
        "captured_counter",
    )
    background_sleep_time = 1e-3

    def __init__(
        self,
        path_to_db: str | None = None,
        run_id: int | None = None,
        conn: ConnectionPlus | None = None,
        exp_id: int | None = None,
        name: str | None = None,
        specs: SpecsOrInterDeps | None = None,
        values: VALUES | None = None,
        metadata: Mapping[str, Any] | None = None,
        shapes: Shapes | None = None,
        in_memory_cache: bool = True,
    ) -> None:
        """
        Create a new :class:`.DataSet` object. The object can either hold a new run or
        an already existing run. If a ``run_id`` is provided, then an old run is
        looked up, else a new run is created.

        Args:
            path_to_db: path to the sqlite file on disk. If not provided, the
                path will be read from the config.
            run_id: provide this when loading an existing run, leave it
                as None when creating a new run
            conn: connection to the DB; if provided and ``path_to_db`` is
                provided as well, then a ``ValueError`` is raised (this is to
                prevent the possibility of providing a connection to a DB
                file that is different from ``path_to_db``)
            exp_id: the id of the experiment in which to create a new run.
                Ignored if ``run_id`` is provided.
            name: the name of the dataset. Ignored if ``run_id`` is provided.
            specs: paramspecs belonging to the dataset or an ``InterDependencies_``
                object that describes the dataset. Ignored if ``run_id``
                is provided.
            values: values to insert into the dataset. Ignored if ``run_id`` is
                provided.
            metadata: metadata to insert into the dataset. Ignored if ``run_id``
                is provided.
            shapes:
                An optional dict from names of dependent parameters to the shape
                of the data captured as a list of integers. The list is in the
                same order as the interdependencies or paramspecs provided.
                Ignored if ``run_id`` is provided.
            in_memory_cache: Should measured data be keep in memory
                and available as part of the `dataset.cache` object.

        """
        self.conn = conn_from_dbpath_or_conn(conn, path_to_db)

        self._debug = False
        self.subscribers: dict[str, _Subscriber] = {}
        self._parent_dataset_links: list[Link]
        #: In memory representation of the data in the dataset.
        self._cache: DataSetCacheWithDBBackend = DataSetCacheWithDBBackend(self)
        self._results: list[dict[str, VALUE]] = []
        self._in_memory_cache = in_memory_cache

        if run_id is not None:
            if not run_exists(self.conn, run_id):
                raise ValueError(
                    f"Run with run_id {run_id} does not exist in the database"
                )
            self._run_id = run_id
            self._completed = completed(self.conn, self.run_id)
            run_desc = self._get_run_description_from_db()
            self._rundescriber = run_desc
            self._metadata = get_metadata_from_run_id(self.conn, self.run_id)
            self._started = self.run_timestamp_raw is not None
            self._parent_dataset_links = str_to_links(
                get_parent_dataset_links(self.conn, self.run_id)
            )
            self._export_info = ExportInfo.from_str(
                self.metadata.get("export_info", "")
            )
        else:
            # Actually perform all the side effects needed for the creation
            # of a new dataset. Note that a dataset is created (in the DB)
            # with no parameters; they are written to disk when the dataset
            # is marked as started
            if exp_id is None:
                exp_id = get_default_experiment_id(self.conn)
            name = name or "dataset"
            _, run_id, __ = create_run(
                self.conn,
                exp_id,
                name,
                generate_guid(),
                parameters=None,
                values=values,
                metadata=metadata,
            )
            # this is really the UUID (an ever increasing count in the db)
            self._run_id = run_id
            self._completed = False
            self._started = False

            if isinstance(specs, InterDependencies_):
                interdeps = specs
            elif specs is not None:
                interdeps = old_to_new(InterDependencies(*specs))
            else:
                interdeps = InterDependencies_()

            self.set_interdependencies(interdeps=interdeps, shapes=shapes)

            self._metadata = get_metadata_from_run_id(self.conn, self.run_id)
            self._parent_dataset_links = []
            self._export_info = ExportInfo({})
        assert self.path_to_db is not None
        if _WRITERS.get(self.path_to_db) is None:
            queue: Queue[Any] = Queue()
            ws: _WriterStatus = _WriterStatus(
                bg_writer=None,
                write_in_background=None,
                data_write_queue=queue,
                active_datasets=set(),
            )
            _WRITERS[self.path_to_db] = ws

    def prepare(
        self,
        *,
        snapshot: Mapping[Any, Any],
        interdeps: InterDependencies_,
        shapes: Shapes | None = None,
        parent_datasets: Sequence[Mapping[Any, Any]] = (),
        write_in_background: bool = False,
    ) -> None:
        self.add_snapshot(json.dumps(snapshot, cls=NumpyJSONEncoder))

        if interdeps == InterDependencies_():
            raise RuntimeError("No parameters supplied")

        self.set_interdependencies(interdeps, shapes)
        links = [Link(head=self.guid, **pdict) for pdict in parent_datasets]
        self.parent_dataset_links = links
        self.mark_started(start_bg_writer=write_in_background)

    @property
    def cache(self) -> DataSetCacheWithDBBackend:
        return self._cache

    @property
    def run_id(self) -> int:
        return self._run_id

    @property
    def captured_run_id(self) -> int:
        run_id = select_one_where(
            self.conn, "runs", "captured_run_id", "run_id", self.run_id
        )
        assert isinstance(run_id, int)
        return run_id

    @property
    def path_to_db(self) -> str | None:
        return self.conn.path_to_dbfile

    @property
    def name(self) -> str:
        name = select_one_where(self.conn, "runs", "name", "run_id", self.run_id)
        assert isinstance(name, str)
        return name

    @property
    def table_name(self) -> str:
        table_name = select_one_where(
            self.conn, "runs", "result_table_name", "run_id", self.run_id
        )
        assert isinstance(table_name, str)
        return table_name

    @property
    def guid(self) -> str:
        guid = get_guid_from_run_id(self.conn, self.run_id)
        assert guid is not None
        return guid

    @property
    def snapshot(self) -> dict[str, Any] | None:
        """Snapshot of the run as dictionary (or None)"""
        snapshot_json = self.snapshot_raw
        if snapshot_json is not None:
            return json.loads(snapshot_json)
        else:
            return None

    @property
    def _snapshot_raw(self) -> str | None:
        """Snapshot of the run as a JSON-formatted string (or None)"""
        snapshot_raw = select_one_where(
            self.conn, "runs", "snapshot", "run_id", self.run_id
        )
        assert isinstance(snapshot_raw, (str, type(None)))
        return snapshot_raw

    @property
    def snapshot_raw(self) -> str | None:
        """Snapshot of the run as a JSON-formatted string (or None)"""
        return self._snapshot_raw

    @property
    def number_of_results(self) -> int:
        sql = f'SELECT COUNT(*) FROM "{self.table_name}"'
        cursor = atomic_transaction(self.conn, sql)
        return one(cursor, "COUNT(*)")

    @property
    def counter(self) -> int:
        counter = select_one_where(
            self.conn, "runs", "result_counter", "run_id", self.run_id
        )
        assert isinstance(counter, int)
        return counter

    @property
    def captured_counter(self) -> int:
        captured_counter = select_one_where(
            self.conn, "runs", "captured_counter", "run_id", self.run_id
        )
        assert isinstance(captured_counter, int)
        return captured_counter

    @property
    def _parameters(self) -> str | None:
        if self.pristine:
            psnames = [ps.name for ps in self.description.interdeps.paramspecs]
            if len(psnames) > 0:
                return ",".join(psnames)
            else:
                return None
        else:
            parameters = select_one_where(
                self.conn, "runs", "parameters", "run_id", self.run_id
            )
            assert isinstance(parameters, (str, type(None)))
            return parameters

    @property
    def parameters(self) -> str | None:
        return self._parameters

    @property
    def paramspecs(self) -> dict[str, ParamSpec]:
        return {ps.name: ps for ps in self.get_parameters()}

    @property
    def exp_id(self) -> int:
        exp_id = select_one_where(self.conn, "runs", "exp_id", "run_id", self.run_id)
        assert isinstance(exp_id, int)
        return exp_id

    @property
    def exp_name(self) -> str:
        return get_experiment_name_from_experiment_id(self.conn, self.exp_id)

    @property
    def sample_name(self) -> str:
        return get_sample_name_from_experiment_id(self.conn, self.exp_id)

    @property
    def run_timestamp_raw(self) -> float | None:
        """
        Returns run timestamp as number of seconds since the Epoch

        The run timestamp is the moment when the measurement for this run
        started.
        """
        return get_run_timestamp_from_run_id(self.conn, self.run_id)

    @property
    def description(self) -> RunDescriber:
        return self._rundescriber

    @property
    def metadata(self) -> dict[str, Any]:
        return self._metadata

    @property
    def parent_dataset_links(self) -> list[Link]:
        """
        Return a list of Link objects. Each Link object describes a link from
        this dataset to one of its parent datasets
        """
        return self._parent_dataset_links

    @parent_dataset_links.setter
    def parent_dataset_links(self, links: list[Link]) -> None:
        """
        Assign one or more links to parent datasets to this dataset. It is an
        error to assign links to a non-pristine dataset

        Args:
            links: The links to assign to this dataset

        """
        if not self.pristine:
            raise RuntimeError(
                "Can not set parent dataset links on a dataset "
                "that has been started."
            )

        if not all(isinstance(link, Link) for link in links):
            raise ValueError("Invalid input. Did not receive a list of Links")

        for link in links:
            if link.head != self.guid:
                raise ValueError(
                    "Invalid input. All links must point to this dataset. "
                    "Got link(s) with head(s) pointing to another dataset."
                )

        self._parent_dataset_links = links

    @property
    def _writer_status(self) -> _WriterStatus:
        assert self.path_to_db is not None
        return _WRITERS[self.path_to_db]

    @property
    def completed_timestamp_raw(self) -> float | None:
        """
        Returns timestamp when measurement run was completed
        as number of seconds since the Epoch

        If the run (or the dataset) is not completed, then returns None.
        """
        return get_completed_timestamp_from_run_id(self.conn, self.run_id)

    def _get_run_description_from_db(self) -> RunDescriber:
        """
        Look up the run_description from the database
        """
        desc_str = get_run_description(self.conn, self.run_id)
        return serial.from_json_to_current(desc_str)

    def toggle_debug(self) -> None:
        """
        Toggle debug mode, if debug mode is on all the queries made are
        echoed back.
        """
        path_to_db = self.path_to_db
        assert path_to_db is not None
        self._debug = not self._debug
        self.conn.close()
        self.conn = connect(path_to_db, self._debug)

    def set_interdependencies(
        self, interdeps: InterDependencies_, shapes: Shapes | None = None
    ) -> None:
        """
        Set the interdependencies object (which holds all added
        parameters and their relationships) of this dataset and
        optionally the shapes object that holds information about
        the shape of the data to be measured.
        """
        if not isinstance(interdeps, InterDependencies_):
            raise TypeError(
                f"Wrong input type. Expected InterDepencies_, got {type(interdeps)}"
            )

        if not self.pristine:
            mssg = "Can not set interdependencies on a DataSet that has been started."
            raise RuntimeError(mssg)
        self._rundescriber = RunDescriber(interdeps, shapes=shapes)

    def add_metadata(self, tag: str, metadata: Any) -> None:
        """
        Adds metadata to the :class:`.DataSet`. The metadata is stored under the
        provided tag. Note that None is not allowed as a metadata value, and the
        tag has to be a valid python identified (e.g. containing alphanumeric
        characters and underscores).

        Args:
            tag: represents the key in the metadata dictionary
            metadata: actual metadata

        """

        self._metadata[tag] = metadata

        # `add_data_to_dynamic_columns` is not atomic by itself, hence using `atomic`
        with atomic(self.conn) as conn:
            add_data_to_dynamic_columns(conn, self.run_id, {tag: metadata})

        self._add_metadata_to_netcdf_if_nc_exported(tag, metadata)

    def add_snapshot(self, snapshot: str, overwrite: bool = False) -> None:
        """
        Adds a snapshot to this run

        Args:
            snapshot: the raw JSON dump of the snapshot
            overwrite: force overwrite an existing snapshot

        """
        if self.snapshot is None or overwrite:
            with atomic(self.conn) as conn:
                add_data_to_dynamic_columns(conn, self.run_id, {"snapshot": snapshot})
        elif self.snapshot is not None and not overwrite:
            log.warning(
                "This dataset already has a snapshot. Use overwrite"
                "=True to overwrite that"
            )

    @property
    def pristine(self) -> bool:
        """
        Is this :class:`.DataSet` pristine? A pristine :class:`.DataSet` has not yet been started,
        meaning that parameters can still be added and removed, but results
        can not be added.
        """
        return not (self._started or self._completed)

    @property
    def running(self) -> bool:
        """
        Is this :class:`.DataSet` currently running? A running :class:`.DataSet` has been started,
        but not yet completed.
        """
        return self._started and not self._completed

    @property
    def started(self) -> bool:
        """
        Has this :class:`.DataSet` been started? A :class:`.DataSet` not started can not have any
        results added to it.
        """
        return self._started

    @property
    def completed(self) -> bool:
        """
        Is this :class:`.DataSet` completed? A completed :class:`.DataSet` may not be modified in
        any way.
        """
        return self._completed

    @completed.setter
    def completed(self, value: bool) -> None:
        self._completed = value
        if value:
            mark_run_complete(self.conn, self.run_id)

    def mark_started(self, start_bg_writer: bool = False) -> None:
        """
        Mark this :class:`.DataSet` as started. A :class:`.DataSet` that has been started can not
        have its parameters modified.

        Calling this on an already started :class:`.DataSet` is a NOOP.

        Args:
            start_bg_writer: If True, the add_results method will write to the
                database in a separate thread.

        """
        if not self._started:
            self._perform_start_actions(start_bg_writer=start_bg_writer)
            self._started = True

    def _perform_start_actions(self, start_bg_writer: bool) -> None:
        """
        Perform the actions that must take place once the run has been started
        """
        paramspecs = new_to_old(self._rundescriber.interdeps).paramspecs

        for spec in paramspecs:
            add_parameter(
                spec, conn=self.conn, run_id=self.run_id, insert_into_results_table=True
            )

        desc_str = serial.to_json_for_storage(self.description)

        update_run_description(self.conn, self.run_id, desc_str)

        set_run_timestamp(self.conn, self.run_id)

        pdl_str = links_to_str(self._parent_dataset_links)
        update_parent_datasets(self.conn, self.run_id, pdl_str)

        writer_status = self._writer_status

        write_in_background_status = writer_status.write_in_background
        if (
            write_in_background_status is not None
            and write_in_background_status != start_bg_writer
        ):
            raise RuntimeError(
                "All datasets written to the same database must "
                "be written either in the background or in the "
                "main thread. You cannot mix."
            )
        if start_bg_writer:
            writer_status.write_in_background = True
            if writer_status.bg_writer is None:
                writer_status.bg_writer = _BackgroundWriter(
                    writer_status.data_write_queue, self.conn
                )
            if not writer_status.bg_writer.is_alive():
                writer_status.bg_writer.start()
        else:
            writer_status.write_in_background = False

        writer_status.active_datasets.add(self.run_id)
        self.cache.prepare()

    def mark_completed(self) -> None:
        """
        Mark :class:`.DataSet` as complete and thus read only and notify the subscribers
        """
        if self.completed:
            return
        if self.pristine:
            raise RuntimeError(
                "Can not mark DataSet as complete before it "
                "has been marked as started."
            )

        self._perform_completion_actions()
        self.completed = True

    def _perform_completion_actions(self) -> None:
        """
        Perform the necessary clean-up
        """
        for sub in self.subscribers.values():
            sub.done_callback()
        self._ensure_dataset_written()

    def add_results(self, results: Sequence[Mapping[str, VALUE]]) -> None:
        """
        Adds a sequence of results to the :class:`.DataSet`.

        Args:
            results: list of name-value dictionaries where each dictionary
                provides the values for the parameters in that result. If some
                parameters are missing the corresponding values are assumed
                to be None

        It is an error to provide a value for a key or keyword that is not
        the name of a parameter in this :class:`.DataSet`.

        It is an error to add results to a completed :class:`.DataSet`.

        """

        self._raise_if_not_writable()

        expected_keys = frozenset.union(*(frozenset(d) for d in results))
        values = [[d.get(k, None) for k in expected_keys] for d in results]

        writer_status = self._writer_status

        if writer_status.write_in_background:
            item = {
                "keys": list(expected_keys),
                "values": values,
                "table_name": self.table_name,
            }
            writer_status.data_write_queue.put(item)
        else:
            insert_many_values(self.conn, self.table_name, list(expected_keys), values)

    def _raise_if_not_writable(self) -> None:
        if self.pristine:
            raise RuntimeError(
                "This DataSet has not been marked as started. "
                "Please mark the DataSet as started before "
                "adding results to it."
            )
        if self.completed:
            raise CompletedError(
                "This DataSet is complete, no further results can be added to it."
            )

    def _ensure_dataset_written(self) -> None:
        writer_status = self._writer_status

        if writer_status.write_in_background:
            writer_status.data_write_queue.put(
                {"keys": "finalize", "values": self.run_id}
            )
            while self.run_id in writer_status.active_datasets:
                time.sleep(self.background_sleep_time)
        elif self.run_id in writer_status.active_datasets:
            writer_status.active_datasets.remove(self.run_id)
        if len(writer_status.active_datasets) == 0:
            writer_status.write_in_background = None
            if writer_status.bg_writer is not None:
                writer_status.bg_writer.shutdown()
                writer_status.bg_writer = None

    def get_parameter_data(
        self,
        *params: str | ParamSpec | ParameterBase,
        start: int | None = None,
        end: int | None = None,
        callback: Callable[[float], None] | None = None,
    ) -> ParameterData:
        """
        Returns the values stored in the :class:`.DataSet` for the specified parameters
        and their dependencies. If no parameters are supplied the values will
        be returned for all parameters that are not them self dependencies.

        The values are returned as a dictionary with names of the requested
        parameters as keys and values consisting of dictionaries with the
        names of the parameters and its dependencies as keys and numpy arrays
        of the data as values. If the dataset has a shape recorded
        in its metadata and the number of datapoints recorded matches the
        expected number of points the data will be returned as numpy arrays
        in this shape. If there are less datapoints recorded than expected
        from the metadata the dataset will be returned as is. This could happen
        if you call `get_parameter_data` on an incomplete dataset. See
        :py:meth:`dataset.cache.data <.DataSetCache.data>` for an implementation that
        returns the data with the expected shape using `NaN` or zeros as
        placeholders.

        If there are more datapoints than expected the dataset will be returned
        as is and a warning raised.

        If some of the parameters are stored as arrays
        the remaining parameters are expanded to the same shape as these.

        If provided, the start and end arguments select a range of results
        by result count (index). If the range is empty - that is, if the end is
        less than or equal to the start, or if start is after the current end
        of the :class:`.DataSet` – then a list of empty arrays is returned.

        Args:
            *params: string parameter names, QCoDeS Parameter objects, and
                ParamSpec objects. If no parameters are supplied data for
                all parameters that are not a dependency of another
                parameter will be returned.
            start: start value of selection range (by result count); ignored
                if None
            end: end value of selection range (by results count); ignored if
                None
            callback: Function called during the data loading every
                config.dataset.callback_percent.

        Returns:
            Dictionary from requested parameters to Dict of parameter names
            to numpy arrays containing the data points of type numeric,
            array or string.

        """
        if len(params) == 0:
            valid_param_names = [
                ps.name for ps in self._rundescriber.interdeps.non_dependencies
            ]
        else:
            valid_param_names = self._validate_parameters(*params)
        return get_parameter_data(
            self.conn, self.table_name, valid_param_names, start, end, callback
        )

    def to_pandas_dataframe_dict(
        self,
        *params: str | ParamSpec | ParameterBase,
        start: int | None = None,
        end: int | None = None,
    ) -> dict[str, pd.DataFrame]:
        """
        Returns the values stored in the :class:`.DataSet` for the specified parameters
        and their dependencies as a dict of :py:class:`pandas.DataFrame` s
        Each element in the dict is indexed by the names of the requested
        parameters.

        Each DataFrame contains a column for the data and is indexed by a
        :py:class:`pandas.MultiIndex` formed from all the setpoints
        of the parameter.

        If no parameters are supplied data will be be
        returned for all parameters in the :class:`.DataSet` that are not them self
        dependencies of other parameters.

        If provided, the start and end arguments select a range of results
        by result count (index). If the range is empty - that is, if the end is
        less than or equal to the start, or if start is after the current end
        of the :class:`.DataSet` – then a dict of empty :py:class:`pandas.DataFrame` s is
        returned.

        Args:
            *params: string parameter names, QCoDeS Parameter objects, and
                ParamSpec objects. If no parameters are supplied data for
                all parameters that are not a dependency of another
                parameter will be returned.
            start: start value of selection range (by result count); ignored
                if None
            end: end value of selection range (by results count); ignored if
                None

        Returns:
            Dictionary from requested parameter names to
            :py:class:`pandas.DataFrame` s with the requested parameter as
            a column and a indexed by a :py:class:`pandas.MultiIndex` formed
            by the dependencies.

        """
        datadict = self.get_parameter_data(*params, start=start, end=end)
        dfs_dict = load_to_dataframe_dict(datadict)
        return dfs_dict

    def to_pandas_dataframe(
        self,
        *params: str | ParamSpec | ParameterBase,
        start: int | None = None,
        end: int | None = None,
    ) -> pd.DataFrame:
        """
        Returns the values stored in the :class:`.DataSet` for the specified parameters
        and their dependencies as a concatenated :py:class:`pandas.DataFrame` s

        The DataFrame contains a column for the data and is indexed by a
        :py:class:`pandas.MultiIndex` formed from all the setpoints
        of the parameter.

        If no parameters are supplied data will be be
        returned for all parameters in the :class:`.DataSet` that are not them self
        dependencies of other parameters.

        If provided, the start and end arguments select a range of results
        by result count (index). If the range is empty - that is, if the end is
        less than or equal to the start, or if start is after the current end
        of the :class:`.DataSet` – then a dict of empty :py:class:`pandas.DataFrame` s is
        returned.

        Args:
            *params: string parameter names, QCoDeS Parameter objects, and
                ParamSpec objects. If no parameters are supplied data for
                all parameters that are not a dependency of another
                parameter will be returned.
            start: start value of selection range (by result count); ignored
                if None
            end: end value of selection range (by results count); ignored if
                None

        Returns:
            :py:class:`pandas.DataFrame` s with the requested parameter as
            a column and a indexed by a :py:class:`pandas.MultiIndex` formed
            by the dependencies.

        Example:
            Return a pandas DataFrame with
                df = ds.to_pandas_dataframe()

        """
        datadict = self.get_parameter_data(*params, start=start, end=end)
        return load_to_concatenated_dataframe(datadict)

    def to_xarray_dataarray_dict(
        self,
        *params: str | ParamSpec | ParameterBase,
        start: int | None = None,
        end: int | None = None,
        use_multi_index: Literal["auto", "always", "never"] = "auto",
    ) -> dict[str, xr.DataArray]:
        """
        Returns the values stored in the :class:`.DataSet` for the specified parameters
        and their dependencies as a dict of :py:class:`xr.DataArray` s
        Each element in the dict is indexed by the names of the requested
        parameters.

        If no parameters are supplied data will be be
        returned for all parameters in the :class:`.DataSet` that are not them self
        dependencies of other parameters.

        If provided, the start and end arguments select a range of results
        by result count (index). If the range is empty - that is, if the end is
        less than or equal to the start, or if start is after the current end
        of the :class:`.DataSet` – then a dict of empty :py:class:`xr.DataArray` s is
        returned.

        The dependent parameters of the Dataset are normally used as coordinates of the
        XArray dataframe. However if non unique values are found for the dependent parameter
        values we will fall back to using an index as coordinates.

        Args:
            *params: string parameter names, QCoDeS Parameter objects, and
                ParamSpec objects. If no parameters are supplied data for
                all parameters that are not a dependency of another
                parameter will be returned.
            start: start value of selection range (by result count); ignored
                if None
            end: end value of selection range (by results count); ignored if
                None
            use_multi_index: Should the data be exported using a multi index
                rather than regular cartesian indexes. With regular cartesian
                coordinates, the xarray dimensions are calculated from the sets or all
                values along the setpoint axis of the QCoDeS dataset. Any position
                in this grid not corresponding to a measured value will be filled
                with a placeholder (typically NaN) potentially creating a sparse
                dataset with significant storage overhead.
                Multi index avoids this and is therefor better
                suited for data that is known to not be on a grid.
                If set to "auto" multi index will be used if projecting the data onto
                a grid requires filling non measured values with NaN  and the shapes