import pickle

import pandas as pd
import streamlit as st
from transformers import (
    GPT2Config,
    GPT2ForSequenceClassification,
    AutoModelForSequenceClassification,
    AutoModelForCausalLM,
)

from constants import *
from models import UMLGPT
from tokenization import get_pretrained_lm_tokenizer, get_word_tokenizer_tokenizer
from trainers.causal_lm import CausalLMTrainer
from trainers.umlgpt import UMLGPTTrainer
from uml_data_generation import (
    get_dataloaders,
    get_generative_uml_dataset,
    get_gpt2_dataset,
)


def get_tokenizer(tokenizer_name, data=None, special_tokens=SPECIAL_TOKENS):
    """
    Get a tokenizer: a pretrained LM tokenizer when no data is given,
    otherwise a word-level tokenizer built from the data.
    """
    if data is None:
        print("Creating pretrained LM tokenizer...", tokenizer_name)
        tokenizer = get_pretrained_lm_tokenizer(tokenizer_name, special_tokens=special_tokens)
        print("Done!")
    else:
        print("Creating word tokenizer...")
        tokenizer = get_word_tokenizer_tokenizer(data)
        print("Done!")

    return tokenizer
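

# Illustrative usage sketch for get_tokenizer (not called anywhere in this
# module). Assumes 'bert-base-cased' is an available Hugging Face checkpoint
# and `dataset` has the format expected by get_word_tokenizer_tokenizer.
def _example_get_tokenizer(dataset):
    # Pretrained-LM path: no data, the name resolves to a Hugging Face checkpoint.
    hf_tokenizer = get_tokenizer('bert-base-cased')
    # Word-level path: the vocabulary is built from the dataset itself.
    word_tokenizer = get_tokenizer(WORD_TOKENIZER, data=dataset)
    return hf_tokenizer, word_tokenizer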


def get_hf_classification_model(model_name, num_labels, tokenizer):
    """
    Get a Hugging Face sequence-classification model for the given
    checkpoint, resized to the tokenizer's vocabulary.
    """
    if 'gpt2' in model_name:
        model_config = GPT2Config.from_pretrained(pretrained_model_name_or_path=model_name, num_labels=num_labels)
        # GPT-2 has no pad token: pad on the left and reuse EOS for padding.
        tokenizer.padding_side = "left"
        model = GPT2ForSequenceClassification.from_pretrained(
            pretrained_model_name_or_path=model_name,
            config=model_config,
            ignore_mismatched_sizes=True
        )
        model.resize_token_embeddings(len(tokenizer))
        model.config.pad_token_id = model.config.eos_token_id
    else:
        print("Loading BERT from", model_name)
        model = AutoModelForSequenceClassification.from_pretrained(
            pretrained_model_name_or_path=model_name, num_labels=num_labels, ignore_mismatched_sizes=True)
        model.resize_token_embeddings(len(tokenizer))

    return model
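

# Illustrative sketch (not used by the pipeline): building a GPT-2 sequence
# classifier. num_labels=5 is an assumption for demonstration only, not a
# value taken from this repository.
def _example_classification_model():
    tokenizer = get_tokenizer('gpt2')
    model = get_hf_classification_model('gpt2', num_labels=5, tokenizer=tokenizer)
    # The tokenizer now pads on the left and the model reuses EOS as padding.
    return model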


def train_umlgpt(dataset, args):
    """
    Train or evaluate the UMLGPT model.

    Args:
        dataset: dict
            The dataset dictionary
        args: Namespace
            The arguments
    """
    # Resolve the tokenizer: a pickled one from disk, a word-level one built
    # from the dataset, or a pretrained LM tokenizer looked up by name.
    if args.tokenizer_file is not None:
        with open(args.tokenizer_file, 'rb') as f:
            tokenizer = pickle.load(f)
    elif args.tokenizer == WORD_TOKENIZER:
        tokenizer = get_tokenizer(WORD_TOKENIZER, dataset)
        tokenizer.save_pretrained(args.models_dir)
        print(f"Saved tokenizer at {args.models_dir}")
    else:
        tokenizer = get_tokenizer(args.tokenizer)

    print("Tokenizing dataset...")
    with st.spinner("Tokenizing dataset..."):
        tokenized_dataset = get_generative_uml_dataset(dataset, tokenizer)
    print("Done!")

    if args.from_pretrained is not None:
        uml_gpt = UMLGPT.from_pretrained(args.from_pretrained)
        print(f'Loaded pretrained UMLGPT model from {args.from_pretrained}')
    else:
        uml_gpt = UMLGPT(
            vocab_size=len(tokenizer),
            embed_dim=args.embed_dim,
            block_size=args.block_size,
            n_layer=args.num_layers,
            n_head=args.num_heads
        )
        print("Created UMLGPT model")

    print("Model initialized with batch size:", args.batch_size)

    print("Creating dataloaders and trainer...")
    dataloaders = get_dataloaders(tokenized_dataset, args.batch_size)
    trainer = UMLGPTTrainer(uml_gpt, tokenizer, dataloaders, args)
    print("Done!")

    if args.phase == TRAINING_PHASE:
        print("Training...")
        trainer.train(args.num_epochs)
        trainer.save_model()
    else:
        print("Evaluating on", len(dataloaders[TEST_LABEL].dataset), "samples")
        eval_results = trainer.evaluate()
        print(eval_results)
        st.dataframe(pd.DataFrame([eval_results]), hide_index=True)
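

# Illustrative sketch of the Namespace fields train_umlgpt reads directly.
# All values are assumptions for demonstration; UMLGPTTrainer may require
# additional fields (e.g. a learning rate) not shown here.
def _example_train_umlgpt(dataset):
    from argparse import Namespace
    args = Namespace(
        tokenizer_file=None,        # no pickled tokenizer: build one instead
        tokenizer=WORD_TOKENIZER,   # word-level tokenizer from the dataset
        models_dir='models',
        from_pretrained=None,       # train from scratch
        embed_dim=256,
        block_size=128,
        num_layers=4,
        num_heads=4,
        batch_size=32,
        phase=TRAINING_PHASE,
        num_epochs=3,
    )
    train_umlgpt(dataset, args)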


def train_hugging_face_gpt(data, args):
    """
    Train or evaluate a Hugging Face GPT model.

    Args:
        data: dict
            The data dictionary
        args: Namespace
            The arguments
    """
    model_name = args.gpt_model
    tokenizer = get_pretrained_lm_tokenizer(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    model.resize_token_embeddings(len(tokenizer))

    # GPT-2-style models have no pad token; fall back to EOS for padding.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id
        model.config.pad_token_id = model.config.eos_token_id

    print('Creating dataset...')
    dataset = get_gpt2_dataset(data, tokenizer)
    print('Done!')

    trainer = CausalLMTrainer(model, tokenizer, dataset, args)
    if args.phase == TRAINING_PHASE:
        trainer.train(args.num_epochs)
        trainer.save_model()
    else:
        results = trainer.evaluate()
        st.dataframe(pd.DataFrame([results]), hide_index=True)
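

# Illustrative sketch of the fields train_hugging_face_gpt reads directly.
# 'gpt2' is a standard Hugging Face checkpoint; CausalLMTrainer may expect
# further arguments (e.g. batch size) that are omitted here as unknowns.
def _example_train_hf_gpt(data):
    from argparse import Namespace
    args = Namespace(gpt_model='gpt2', phase=TRAINING_PHASE, num_epochs=3)
    train_hugging_face_gpt(data, args)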