Commit 0b14557

week 16-17-18
1 parent d033918 commit 0b14557

108 files changed

Lines changed: 10548 additions & 14 deletions

Some content is hidden: large commits have some content hidden by default.

datasets/dataset-6/datapreps-6/dataprep-6-2/data-dp-6-2/.gitkeep

Whitespace-only changes.
Binary file not shown.
Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
Initializing CodeLlama tokenizer:

Vocabulary size: 32,016:

Saving the data-preping-numpy-random state:
 --> saving it
 --> freeing its memory

Loading the dataset:

Save the meta information:
 --> freeing its memory

Split by examples using \n\n:
 --> splitting
 --> freeing data memory
 --> total number of examples: 30,000,000

Creating the train.txt, val.txt and test.txt:
 --> shuffling examples
 --> creating the train_examples
 --> train_examples has 24000000 examples
 --> creating the train_data
 --> writing the train_data to train.txt

Tokenizing and saving train data to binary:
Lines changed: 173 additions & 0 deletions
@@ -0,0 +1,173 @@
## Data prepping (on Kindi)


## Imports
from transformers import AutoTokenizer
import pickle
import numpy as np
import gc
import struct
import time
import math


## Logging boilerplate
# The first two lines of the log file are fixed-width placeholders that
# progress-bar messages (p_level 1 and 2) can overwrite in place; all other
# messages are appended at the end of the file.
log_file = open("data-preping.log", "w")
pbar_recept_string = " " * 200 + "\n"
log_file.write(pbar_recept_string)
log_file.write(pbar_recept_string)
log_file.flush()

def log(s: str, p_level=None):
    if p_level == 1:
        # blank out and rewrite the first placeholder line
        log_file.seek(0, 0)
        log_file.write(pbar_recept_string)
        log_file.seek(0, 0)
        log_file.write(s)
        log_file.seek(0, 2)
    elif p_level == 2:
        # blank out and rewrite the second placeholder line
        log_file.seek(len(pbar_recept_string), 0)
        log_file.write(pbar_recept_string)
        log_file.seek(len(pbar_recept_string), 0)
        log_file.write(s)
        log_file.seek(0, 2)
    else:
        # capitalized messages become section headings, the rest arrow bullets
        if s[0].upper() == s[0]:
            start = "\n"
            end = ":"
        else:
            start = " --> "
            end = ""
        log_file.write(start + s + end + "\n")
    log_file.flush()


## Convert seconds to days, hours, minutes, seconds
def convert_seconds(seconds: float):
    # ignoring the sub-seconds
    seconds = int(seconds)
    days, seconds = divmod(seconds, 86400)
    hours, seconds = divmod(seconds, 3600)
    minutes, seconds = divmod(seconds, 60)
    return (days, hours, minutes, seconds)


## Initialize CodeLlama tokenizer
log("Initializing CodeLlama tokenizer")
tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")
vocab_size = tokenizer.vocab_size
log(f"Vocabulary size: {vocab_size:,}")

## Saving the numpy random state (so the shuffle below is reproducible)
log("Saving the data-preping-numpy-random state")
log("saving it")
np_random_state = np.random.get_state()
with open("data-preping-np-random-state.bin", "wb") as f:
    pickle.dump(np_random_state, f)
log("freeing its memory")
del np_random_state
gc.collect()


## Loading the dataset
log("Loading the dataset")
with open("../../data-ds-6/data.txt", "r") as f:
    data = f.read()


## Save the meta information
log("Save the meta information")
meta = {
    'vocab_size': vocab_size,
    'tokenizer_name': "codellama/CodeLlama-7b-hf",
}

with open('data-dp-6-2/meta.pkl', 'wb') as f:
    pickle.dump(meta, f)
log("freeing its memory")
del meta
gc.collect()


## Split by examples using \n\n
log("Split by examples using \\n\\n")
log("splitting")
examples = data.split("\n\n")[:-1]  # drop the trailing empty chunk
log("freeing data memory")
del data
gc.collect()
n = len(examples)
log(f"total number of examples: {n:,}\n")


## Creating the train.txt, val.txt and test.txt (80/10/10 split)
log("Creating the train.txt, val.txt and test.txt")
log("shuffling examples")
np.random.shuffle(examples)

log("creating the train_examples")
train_examples = examples[:int(n * 0.8)]
log(f"train_examples has {len(train_examples)} examples")
log("creating the train_data")
train_data = "\n\n".join(train_examples)
del train_examples

log("writing the train_data to train.txt")
with open("data-dp-6-2/train.txt", 'w') as f:
    f.write(train_data)

# Tokenize and save train data to binary
log("Tokenizing and saving train data to binary")
tokens = tokenizer.encode(train_data)
del train_data
log(f"Encoded train data has {len(tokens)} tokens")
with open("data-dp-6-2/train.bin", "wb") as f:
    for token in tokens:
        f.write(struct.pack('H', token))  # 'H' stands for unsigned short (2 bytes)
del tokens

# Process validation split
log("Processing validation split")
val_examples = examples[int(n * 0.8):int(n * 0.9)]
log(f"val_examples has {len(val_examples)} examples")
val_data = "\n\n".join(val_examples)
del val_examples
log(f"val_data has {(val_chars := len(val_data))} characters")
log("writing the val_data to val.txt")
with open("data-dp-6-2/val.txt", 'w') as f:
    f.write(val_data)

# Tokenize and save validation data to binary
log("Tokenizing and saving validation data to binary")
tokens = tokenizer.encode(val_data)
del val_data
log(f"Encoded validation data has {len(tokens)} tokens")
with open("data-dp-6-2/val.bin", "wb") as f:
    for token in tokens:
        f.write(struct.pack('H', token))  # 'H' stands for unsigned short (2 bytes)
del tokens

# Process test split
log("Processing test split")
test_examples = examples[int(n * 0.9):]
log(f"test_examples has {len(test_examples)} examples")
test_data = "\n\n".join(test_examples)
del test_examples
log(f"test_data has {len(test_data)} characters")
log("writing the test_data to test.txt")
with open("data-dp-6-2/test.txt", 'w') as f:
    f.write(test_data)

# Tokenize and save test data to binary
log("Tokenizing and saving test data to binary")
tokens = tokenizer.encode(test_data)
del test_data
log(f"Encoded test data has {len(tokens)} tokens")
with open("data-dp-6-2/test.bin", "wb") as f:
    for token in tokens:
        f.write(struct.pack('H', token))  # 'H' stands for unsigned short (2 bytes)
del tokens

log("freeing examples memory")
del examples
gc.collect()

log("Data preparation complete!")
log_file.close()
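For reference, a minimal sketch, not part of this commit, of how the artifacts written above could be loaded back. It assumes the same native little-endian machine that wrote the tokens, since struct.pack('H') uses native byte order; the paths are the ones used in the script.

import pickle
import numpy as np

# Token ids as a memory-mapped uint16 array; CodeLlama's 32,016-entry
# vocabulary fits comfortably below the 65,536 limit of 2-byte tokens.
train_tokens = np.memmap("data-dp-6-2/train.bin", dtype=np.uint16, mode="r")

# Tokenizer metadata saved by the script
with open("data-dp-6-2/meta.pkl", "rb") as f:
    meta = pickle.load(f)

# Restore the numpy RNG state captured before the shuffle, which makes the
# 80/10/10 split reproducible
with open("data-preping-np-random-state.bin", "rb") as f:
    np.random.set_state(pickle.load(f))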
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
# DESCRIPTION

# OBTENTION

# META-DATA

# DATA-LOCATION
Lines changed: 2 additions & 1 deletion
@@ -1,3 +1,4 @@
- last-dataprep-id: 1
+ last-dataprep-id: 2
  datapreps:
      dataprep-6-1: CharacterLevelTokenizer 80%Train10%Test10%Val
+     dataprep-6-2: CodeLlamaTokenizer
Lines changed: 4 additions & 4 deletions
@@ -1,5 +1,5 @@
- iter : 210000
- epoch : 0.9435535597481272
+ iter : 220000
+ epoch : 0.9884846816408951
  losses:
- train 0.6387873291969299
- val 0.6393495798110962
- date-hour : 2024-10-29_13-40
+ train 0.6383318901062012
+ val 0.6380950808525085
+ date-hour : 2024-10-29_15-15
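The checkpoint-stats files above follow a simple line-oriented layout: an iter line, an epoch line, a losses: block with train and val values, and a date-hour stamp. A hedged parsing sketch under that assumption; parse_ckpt_stats is a hypothetical helper, not code from this repo.

from pathlib import Path

def parse_ckpt_stats(path):
    # Walk the file line by line and pick out the fields shown in the diffs.
    stats = {}
    for line in Path(path).read_text().splitlines():
        line = line.strip()
        if line.startswith("iter"):
            stats["iter"] = int(line.split(":")[1])
        elif line.startswith("epoch"):
            stats["epoch"] = float(line.split(":")[1])
        elif line.startswith("train"):
            stats["train_loss"] = float(line.split()[1])
        elif line.startswith("val"):
            stats["val_loss"] = float(line.split()[1])
        elif line.startswith("date-hour"):
            stats["date_hour"] = line.split(":", 1)[1].strip()
    return stats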
Lines changed: 4 additions & 4 deletions
@@ -1,5 +1,5 @@
- iter : 15000
- epoch : 0.06739668283915194
+ iter : 110000
+ epoch : 0.49424234082044755
  losses:
- train 0.6735616326332092
- val 0.6728384494781494
- date-hour : 2024-10-29_13-59
+ train 0.6484723091125488
+ val 0.648794949054718
+ date-hour : 2024-10-29_20-52
Lines changed: 4 additions & 4 deletions
@@ -1,5 +1,5 @@
- iter : 15000
- epoch : 0.06739668283915194
+ iter : 110000
+ epoch : 0.49424234082044755
  losses:
- train 0.6735616326332092
- val 0.6728384494781494
- date-hour : 2024-10-29_13-59
+ train 0.6484723091125488
+ val 0.648794949054718
+ date-hour : 2024-10-29_20-52
Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
# DESCRIPTION
After we got good results with the 30M-parameter model (12 attention heads, 12 attention blocks), we decided to scale it to 60M parameters, with RMSNorm and SiLU, trained on the 30M-example dataset.

# OBTENTION

# META-DATA

# MODELS-LOCATION
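Since the description names RMSNorm and SiLU, here is a minimal illustrative sketch of those two components in PyTorch; this is a generic reference implementation, not the model code from this repo.

import torch
import torch.nn as nn

class RMSNorm(nn.Module):
    """Root-mean-square layer norm: rescales by the RMS of the features,
    with a learned gain and no mean-centering (unlike LayerNorm)."""
    def __init__(self, dim: int, eps: float = 1e-5):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
        return x * rms * self.weight

# SiLU (a.k.a. swish) is x * sigmoid(x), available built-in as nn.SiLU()
act = nn.SiLU()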
