-
Notifications
You must be signed in to change notification settings - Fork 31
Expand file tree
/
Copy pathuse_huggingface_dataset.py
More file actions
50 lines (37 loc) · 1.2 KB
/
use_huggingface_dataset.py
File metadata and controls
50 lines (37 loc) · 1.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
"""
In this script, we use the Hugging Face dataset created by the script
examples/create_huggingface_dataset.py.
The dataset is available at:
https://huggingface.co/datasets/bezzam/dummy-dataset
```bash
# install
pip install datasets librosa soundfile
# run
python examples/use_huggingface_dataset.py
```
During the first run, the dataset will be downloaded and cached.
Subsequent runs will use the cached dataset.
"""
from datasets import load_dataset
import numpy as np
# Fetch the two splits of the dummy dataset, one call per split name
# (train first, then test).
ds_train, ds_test = [
    load_dataset("bezzam/dummy-dataset", split=split_name)
    for split_name in ("train", "test")
]
print(f"Number of training samples: {len(ds_train)}")
print(f"Number of test samples: {len(ds_test)}")

# Inspect the first training example, one field at a time.
print("\n---- First example:")
example = ds_train[0]

# -- audio: duration in seconds = number of samples / sampling rate
audio = example["audio"]
num_samples = len(audio["array"])
duration = num_samples / audio["sampling_rate"]
print(f"Duration of audio: {duration:.2f} seconds")

# -- image: convert to a NumPy array so that its shape can be reported
raw_image = example["image"]
image = np.array(raw_image)
print(f"Size of image: {image.shape}")

# -- text: printed as stored in the example
text = example["text"]
print(f"Text: {text}")

# -- label: translate the stored label value into its string form via
#    the "label" feature of the training split
label = example["label"]
label_feature = ds_train.features["label"]
label_str = label_feature.int2str(label)
print(f"Label: {label_str}")