Skip to content

Commit

Permalink
prepare reproducible run with validation
Browse files Browse the repository at this point in the history
  • Loading branch information
tigranfah committed Sep 28, 2024
1 parent 01e01f9 commit 9d929d1
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 11 deletions.
4 changes: 2 additions & 2 deletions submitit_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@

if __name__ == "__main__":
executor = submitit.AutoExecutor(folder="~/slurm_jobs/titan/job_%j")
n_gpus = 8
n_gpus = 4
executor.update_parameters(
name="titan", timeout_min=3 * 24 * 60,
gpus_per_node=n_gpus,
nodes=1, mem_gb=80, cpus_per_task=n_gpus * 4,
slurm_additional_parameters={
"partition": "h100"
"partition": "a100"
}
)

Expand Down
14 changes: 10 additions & 4 deletions train_configs/chemlactica_125m.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@ log_freq = 1
enable_color_printing = true
enable_aim = true
save_aim_folder = "aim"
#aim_hash = "c6b4d8b340f74287b82ef928"
#aim_experiment_name = "hello"

[model]
name = "opt"
Expand All @@ -32,8 +30,8 @@ name = "AdamW"
lr = 1.4e-3

[training]
batch_size = 20
gradient_accumulation_steps = 8
batch_size = 16
gradient_accumulation_steps = 16
seq_len = 2048
warmup_steps = 500 # lr scheduler warm up, normally 20% of the train steps
max_norm = 1.0 # grad norm clipping
Expand All @@ -46,6 +44,14 @@ compile = true
dataset = "chemlactica_train"
data_processing_style="chemlactica_style"

[validation]
valid_freq = 2000
enable_valid = true
dataset = "chemlactica_valid" # supported datasets: chemlactica_valid, chemlactica_valid_mini

[dataloader]
num_workers = 4

[experimental]
pipeline_parallel_degree = 1
enable_async_tensor_parallel = false
Expand Down
8 changes: 3 additions & 5 deletions train_configs/chemlactica_debug.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,8 @@ save_memory_snapshot_folder = "memory_snapshot"
[metrics]
log_freq = 1
enable_color_printing = true
enable_aim = false
enable_aim = true
save_aim_folder = "aim"
#aim_hash = "c6b4d8b340f74287b82ef928"
#aim_experiment_name = "hello"

[model]
name = "opt"
Expand Down Expand Up @@ -48,8 +46,8 @@ data_processing_style="chemlactica_style"

[validation]
valid_freq = 25
enable_valid = false
dataset = "chemlactica_valid_mini" # supported datasets: c4_test (2K), c4 (177M), chemlactica_train_mini (4K)
enable_valid = true
dataset = "chemlactica_valid_mini" # supported datasets: chemlactica_valid_mini

[dataloader]
num_workers = 1
Expand Down

0 comments on commit 9d929d1

Please sign in to comment.