llama3_r128.yaml
DATASET:
  TEST_SET: "eval/eval.json"
  METRIC: 'area'
  TRAIN_VAL_TEST_SPLIT: [0.99, 0.01, 0.0]
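  # note: the test fraction is 0.0, presumably because held-out evaluation uses the separate TEST_SET file above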

tokenizer:
  padding_side: 'right'
  use_fast: True

LLM:
  NAME: 'llama3:8b'
  TEMP: 0.4

lora:
  lora_r: 128
  lora_alpha: 64
  lora_dropout: 0.1
  use_dora: False
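  # note: PEFT scales LoRA updates by lora_alpha/lora_r, so with alpha 64 and r 128 the adapter output is down-weighted by 0.5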

bnb:
  load_in_4bit: True
  bnb_4bit_quant_type: "nf4"
  bnb_4bit_use_double_quant: True
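  # note: nf4 with double quantization follows the QLoRA recipe; the quantization constants are themselves quantized to save memory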

train:
  epochs: 10
  max_steps: -1  # set to -1 to train by epochs instead of a fixed step count
  max_seq_length: 1048
  batch_size: 32
  gradient_accumulation_steps: 2  # effective batch size = batch_size * gradient_accumulation_steps
  learning_rate: 0.00002
  report_to: "wandb"
  gradient_checkpointing: True  # reduces memory usage but slows training by about 20%
  dataloader_num_workers: 0
  evaluation_strategy: 'steps'
  logging_strategy: 'steps'
  save_strategy: 'steps'
  logging_steps: 1
  eval_steps: 100
  save_steps: 100
  load_best_model_at_end: True
  lr_scheduler_type: 'linear'
  run_name: llama3_r128
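  # note: with load_best_model_at_end, the HF Trainer requires save_steps to be a multiple of eval_steps; both are 100 here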

test:
  batch_size: 8
  num_samples: 10  # n for computing pass@
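
For reference, below is a minimal sketch of how the lora, bnb, and train blocks of a config like this could be consumed by a Hugging Face + PEFT fine-tuning script. This is an assumption about usage, not code from this repository; in particular, bnb_4bit_compute_dtype and the output directory are illustrative choices the config itself does not specify.

import yaml
import torch
from transformers import BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig

# Load the YAML config into a plain dict.
with open("llama3_r128.yaml") as f:
    cfg = yaml.safe_load(f)

# 4-bit quantization settings (QLoRA-style) from the bnb block.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=cfg["bnb"]["load_in_4bit"],
    bnb_4bit_quant_type=cfg["bnb"]["bnb_4bit_quant_type"],
    bnb_4bit_use_double_quant=cfg["bnb"]["bnb_4bit_use_double_quant"],
    bnb_4bit_compute_dtype=torch.bfloat16,  # assumption: not specified in the config
)

# LoRA adapter settings from the lora block.
lora_config = LoraConfig(
    r=cfg["lora"]["lora_r"],
    lora_alpha=cfg["lora"]["lora_alpha"],
    lora_dropout=cfg["lora"]["lora_dropout"],
    use_dora=cfg["lora"]["use_dora"],
    task_type="CAUSAL_LM",
)

# Trainer settings from the train block.
args = TrainingArguments(
    output_dir=cfg["train"]["run_name"],  # assumption: the repo may use a different output path
    num_train_epochs=cfg["train"]["epochs"],
    max_steps=cfg["train"]["max_steps"],  # -1 defers to num_train_epochs
    per_device_train_batch_size=cfg["train"]["batch_size"],
    gradient_accumulation_steps=cfg["train"]["gradient_accumulation_steps"],
    learning_rate=cfg["train"]["learning_rate"],
    report_to=cfg["train"]["report_to"],
    gradient_checkpointing=cfg["train"]["gradient_checkpointing"],
    dataloader_num_workers=cfg["train"]["dataloader_num_workers"],
    evaluation_strategy=cfg["train"]["evaluation_strategy"],
    logging_strategy=cfg["train"]["logging_strategy"],
    save_strategy=cfg["train"]["save_strategy"],
    logging_steps=cfg["train"]["logging_steps"],
    eval_steps=cfg["train"]["eval_steps"],
    save_steps=cfg["train"]["save_steps"],
    load_best_model_at_end=cfg["train"]["load_best_model_at_end"],
    lr_scheduler_type=cfg["train"]["lr_scheduler_type"],
    run_name=cfg["train"]["run_name"],
)

In a full pipeline, these three objects would typically be handed to a trainer (for example TRL's SFTTrainer) together with the model, the tokenizer configured per the tokenizer block, and the dataset splits from the DATASET block.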