Skip to content

Commit a9b5513

Browse files
committed
feat: overhaul UI/UX and update leaderboard results
1 parent 8eb655b commit a9b5513

22 files changed

Lines changed: 3518 additions & 2620 deletions

.gitignore

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,4 +27,8 @@ speechtokenizer_hubert_avg/
2727
check_all_codecs.py
2828
test_imports.py
2929
test_imports_detailed.py
30-
benchmarking.py
30+
benchmarking.py
31+
32+
# Data
33+
cached_datasets/
34+
datasets/

SoundCodec/base_codec/descript_audio_codec.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,9 @@ def synth(self, data, local_save=True):
2828
data['unit'] = extracted_unit.unit
2929
decompressed_audio = self.model.decompress(compressed_audio).audio_data.squeeze(0)
3030
if local_save:
31-
audio_path = f"dummy-descript-audio-codec-{self.model_type}/{data['id']}.wav"
31+
from SoundCodec.base_codec.general import uuid
32+
audio_id = data.get('id', str(uuid.uuid4()))
33+
audio_path = f"dummy-descript-audio-codec-{self.model_type}/{audio_id}.wav"
3234
save_audio(decompressed_audio, audio_path, self.sampling_rate)
3335
data['audio'] = audio_path
3436
else:

SoundCodec/base_codec/encodec_hf.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@ def synth(self, data, local_save=True):
2020
data['unit'] = extracted_unit.unit
2121
audio_values = self.decode_unit(extracted_unit.stuff_for_synth)
2222
if local_save:
23-
audio_path = f"dummy_{self.pretrained_model_name}/{data['id']}.wav"
23+
from SoundCodec.base_codec.general import uuid
24+
audio_id = data.get('id', str(uuid.uuid4()))
25+
audio_path = f"dummy_{self.pretrained_model_name.replace('/', '_')}/{audio_id}.wav"
2426
save_audio(audio_values, audio_path, self.sampling_rate)
2527
data['audio'] = audio_path
2628
else:

SoundCodec/base_codec/funcodec.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,9 @@ def synth(self, data, local_save=True):
4747
data['unit'] = extracted_unit.unit
4848
audio_array = self.decode_unit(extracted_unit.stuff_for_synth)
4949
if local_save:
50-
audio_path = f"dummy-funcodec-{self.model_name}/{data['id']}.wav"
50+
from SoundCodec.base_codec.general import uuid
51+
audio_id = data.get('id', str(uuid.uuid4()))
52+
audio_path = f"dummy-funcodec-{self.model_name}/{audio_id}.wav"
5153
save_audio(audio_array, audio_path, self.sampling_rate)
5254
data['audio'] = audio_path
5355
else:

SoundCodec/base_codec/general.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from dataclasses import dataclass
33
from typing import List, Union, Any
44
from abc import ABC, abstractmethod
5+
import uuid
56

67
import numpy as np
78
import torchaudio
@@ -46,10 +47,18 @@ def to_dict(self):
4647
}
4748

4849

49-
def save_audio(wav: torch.Tensor, path, sample_rate: int, rescale: bool = False):
50+
def save_audio(wav: Union[torch.Tensor, np.ndarray], path, sample_rate: int, rescale: bool = False):
51+
if sample_rate is None:
52+
raise ValueError(f"sample_rate cannot be None when saving audio to {path}")
53+
if isinstance(wav, np.ndarray):
54+
wav = torch.from_numpy(wav)
55+
if wav.ndim == 1:
56+
wav = wav.unsqueeze(0)
57+
5058
folder_path = os.path.dirname(path)
5159
if not os.path.exists(folder_path):
5260
os.makedirs(folder_path)
61+
print(f"Saving audio to {path} with sample_rate {sample_rate}")
5362
limit = 0.99
5463
max_val = wav.abs().max()
5564
wav = wav * min(limit / max_val, 1) if rescale else wav.clamp(-limit, limit)
@@ -86,7 +95,8 @@ def synth(self, data, local_save=True):
8695
data['unit'] = extracted_unit.unit
8796
audio_values = self.decode_unit(extracted_unit.stuff_for_synth)
8897
if local_save:
89-
audio_path = f"dummy_{self.setting}/{data['id']}.wav"
98+
audio_id = data.get('id', str(uuid.uuid4()))
99+
audio_path = f"dummy_{self.setting}/{audio_id}.wav"
90100
save_audio(audio_values, audio_path, self.sampling_rate)
91101
data['audio'] = audio_path
92102
else:
@@ -137,7 +147,8 @@ def batch_synth(self, data_list: List[dict], local_save=True) -> List[dict]:
137147
for i, (data, audio_values, unit) in enumerate(zip(data_list, batch_audio_values, batch_extracted_unit.units)):
138148
data['unit'] = unit
139149
if local_save:
140-
audio_path = f"dummy_{self.setting}/{data['id']}.wav"
150+
audio_id = data.get('id', str(uuid.uuid4()))
151+
audio_path = f"dummy_{self.setting}/{audio_id}.wav"
141152
save_audio(torch.tensor(audio_values), audio_path, self.sampling_rate)
142153
data['audio'] = audio_path
143154
else:

SoundCodec/base_codec/wavtokenizer.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@ def _setup_config_and_model(self):
2424
self.model = WavTokenizer.from_pretrained0802(self.config_path, self.ckpt_path)
2525
self.model.eval()
2626
self.model = self.model.to(self.device)
27-
self.sampling_rate = getattr(self, "sampling_rate", 24000)
27+
if self.sampling_rate is None:
28+
self.sampling_rate = 24000
2829

2930
def _download_resources(self):
3031
import os

SoundCodec/dataset/__init__.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,19 @@
"""Dataset loading entry point for SoundCodec.

Resolves a dataset name to a local ``SoundCodec.dataset.<name>`` module when
one exists, and otherwise loads the name with the Hugging Face ``datasets``
library.
"""
import importlib

# Split names tried, in order, when the Hugging Face loader returns a mapping.
_PREFERRED_SPLITS = ("test", "validation", "train")


def _select_split(ds):
    """Return one split from *ds* if it is a mapping of splits, else *ds* itself."""
    if not isinstance(ds, dict):
        return ds
    if not ds:
        raise ValueError("Loaded dataset contains no splits")
    for split_name in _PREFERRED_SPLITS:
        if split_name in ds:
            return ds[split_name]
    # None of the preferred names are present: use the first split.
    return ds[next(iter(ds))]


def load_dataset(dataset_name):
    """Load a dataset by name.

    The name is first resolved as the module ``SoundCodec.dataset.<name>``,
    whose ``load_data()`` result is returned. Only when that module (or one of
    its parent packages) cannot be found does the function fall back to the
    Hugging Face ``datasets.load_dataset`` loader.

    Args:
        dataset_name: Local dataset module name, or an identifier accepted by
            ``datasets.load_dataset``.

    Returns:
        Whatever the local module's ``load_data()`` returns, or, on the
        fallback path, the ``test`` / ``validation`` / ``train`` split (first
        one present, else the first split available), or the loaded object
        itself when it is not a mapping of splits.

    Raises:
        ImportError: If the local dataset module exists but fails to import
            one of its own dependencies, or if ``load_data()`` raises it.
            These are real errors and are not masked by the fallback.
        ValueError: If the fallback loader returns an empty mapping of splits.
    """
    module_path = f"SoundCodec.dataset.{dataset_name}"
    try:
        module = importlib.import_module(module_path)
    except ModuleNotFoundError as exc:
        missing = exc.name or ""
        target_is_missing = (
            missing == module_path
            or (missing != "" and module_path.startswith(missing + "."))
        )
        if not target_is_missing:
            # The dataset module exists but one of ITS imports is missing;
            # surface that instead of silently querying the Hub.
            raise
        module = None

    if module is not None:
        # Called outside the try block so errors raised by load_data()
        # propagate unchanged.
        return module.load_data()

    # Imported lazily: the dependency is only needed on the fallback path.
    from datasets import load_dataset as hf_load_dataset

    return _select_split(hf_load_dataset(dataset_name))

dataset_creator.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,22 @@
11
import argparse
22
from datasets import DatasetDict, Audio, load_from_disk
33
from SoundCodec.codec import load_codec, list_codec
4-
from SoundCodec.dataset import load_dataset, apply_audio_cast
5-
from SoundCodec.dataset.general import extract_unit
4+
from SoundCodec.dataset import load_dataset
5+
from SoundCodec.dataset.general import apply_audio_cast, extract_unit
66

77

88
def run_experiment(dataset_name):
99
cleaned_dataset = load_dataset(dataset_name)
10-
d_item = next(iter(cleaned_dataset))
11-
sampling_rate = d_item['audio']['sampling_rate']
12-
cleaned_dataset = load_dataset(dataset_name)
10+
if args.limit:
11+
cleaned_dataset = cleaned_dataset.select(range(min(args.limit, len(cleaned_dataset))))
12+
1313
print("before filter duration", cleaned_dataset)
1414
cleaned_dataset = cleaned_dataset.filter(
1515
lambda x: len(x['audio']['array']) / x['audio']['sampling_rate'] <= args.max_duration)
1616
print("after filter duration", cleaned_dataset)
17+
18+
d_item = next(iter(cleaned_dataset))
19+
sampling_rate = d_item['audio']['sampling_rate']
1720
cleaned_dataset = apply_audio_cast(cleaned_dataset, sampling_rate)
1821
if not args.extract_unit_only:
1922
datasets_dict = DatasetDict({'original': cleaned_dataset})
@@ -66,5 +69,6 @@ def run_experiment(dataset_name):
6669
parser.add_argument('--max_duration', required=False, type=int, default=120)
6770
parser.add_argument('--push_to_hub', required=False, action='store_true')
6871
parser.add_argument('--upload_name', required=False, default='Codec-SUPERB')
72+
parser.add_argument('--limit', required=False, type=int, default=None)
6973
args = parser.parse_args()
7074
run_experiment(args.dataset)

update_leaderboard.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
"""Regenerate ``web/src/results/data.js`` from a benchmark results JSON file.

Reads per-codec metrics, attaches a hard-coded bitrate to each codec, keeps a
fixed subset of metrics rounded to three decimals, and writes the result as a
JavaScript module with a default export.
"""
import json
import math
import sys

# Benchmark results consumed by this script and the file it regenerates.
RESULTS_PATH = '._datasets_voidful_codec-superb-tiny_synth_evaluation_results_20251218_204458.json'
OUTPUT_PATH = 'web/src/results/data.js'

# Hardcoded BPS mapping (bitrate in kbps or as used in data.js)
# NOTE(review): entries marked "Estimated" / "placeholder" are not measured
# values; confirm them before relying on the leaderboard ordering.
bps_mapping = {
    'academicodec_hifi_16k_320d': 2,
    'academicodec_hifi_16k_320d_large_uni': 2,
    'academicodec_hifi_24k_320d': 3,
    'audiodec_24k_320d': 6.4,
    'auv': 1,  # Estimated or placeholder
    'bigcodec_1k': 1,  # Estimated or placeholder
    'dac_16k': 6,
    'dac_24k': 24,
    'dac_44k': 8,
    'encodec_24k_12bps': 12,
    'encodec_24k_1_5bps': 1.5,
    'encodec_24k_24bps': 24,
    'encodec_24k_3bps': 3,
    'encodec_24k_6bps': 6,
    'funcodec_en_libritts_16k_gr1nq32ds320': 16,
    'funcodec_en_libritts_16k_gr8nq32ds320': 16,
    'funcodec_en_libritts_16k_nq32ds320': 16,
    'funcodec_en_libritts_16k_nq32ds640': 8,
    'funcodec_zh_en_16k_nq32ds320': 16,
    'funcodec_zh_en_16k_nq32ds640': 8,
    's3tokenizer_v1': 0.1,  # Semantic tokenizer
    'speech_tokenizer_16k': 4,
    'sqcodec_16k_0k75bps': 0.75,
    'sqcodec_16k_12kbps': 12,
    'sqcodec_16k_1k5bps': 1.5,
    'sqcodec_16k_3kbps': 3,
    'sqcodec_16k_6kbps': 6,
    'sqcodec_24k_12kbps': 12,
    'sqcodec_24k_24kbps': 24,
    'unicodec_24k': 12,  # Estimated
    'wavtokenizer_24k_small_600_4096': 0.1,
    'wavtokenizer_24k_medium_600_4096': 0.1,
    'wavtokenizer_24k_large_600_4096': 0.1,
    'wavtokenizer_24k_large_speech_75token': 0.1,
}

# Metrics to include
metrics_to_include = ['mel', 'pesq', 'stoi', 'f0corr']


def _clean_metric(value):
    """Return *value* as a float rounded to 3 decimals; 0.0 if missing, NaN or non-numeric."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        # e.g. a JSON null or a non-numeric string: treated like a missing metric.
        return 0.0
    if math.isnan(number):
        return 0.0
    return round(number, 3)


def build_results(benchmark_results, bps=None, metrics=None):
    """Build the leaderboard table from raw benchmark results.

    Args:
        benchmark_results: Mapping of codec name to a mapping of metric name
            to value.
        bps: Mapping of codec name to bitrate; defaults to ``bps_mapping``.
            Codecs absent from it get a bitrate of 0.
        metrics: Metric names to keep; defaults to ``metrics_to_include``.

    Returns:
        Dict mapping each codec name to ``{'bps': ..., <metric>: float, ...}``
        with every metric rounded to three decimals and missing or NaN
        values replaced by 0.0.
    """
    bps = bps_mapping if bps is None else bps
    metrics = metrics_to_include if metrics is None else metrics
    table = {}
    for model_name, model_metrics in benchmark_results.items():
        entry = {'bps': bps.get(model_name, 0)}
        for metric in metrics:
            entry[metric] = _clean_metric(model_metrics.get(metric, 0))
        table[model_name] = entry
    return table


def render_js(results):
    """Serialize *results* as a JavaScript module with a default export."""
    return "const results = " + json.dumps(results, indent=1) + ";\nexport default results;"


def main():
    """Read the benchmark JSON, rebuild the table and rewrite the data.js file."""
    with open(RESULTS_PATH, 'r', encoding='utf-8') as f:
        benchmark_results = json.load(f)

    new_results = build_results(benchmark_results)

    # A codec missing from the bitrate table is written with bps = 0; report
    # it so the placeholder does not go unnoticed on the leaderboard.
    unmapped = sorted(name for name in new_results if name not in bps_mapping)
    if unmapped:
        print(f"Warning: no bitrate for {', '.join(unmapped)}; using 0.", file=sys.stderr)

    with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        f.write(render_js(new_results))

    print(f"Updated web/src/results/data.js with {len(new_results)} codecs.")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)