From 9d929d110c7e8dc0c787b9db82797ca39bfa1ecf Mon Sep 17 00:00:00 2001 From: tigranfah Date: Sat, 28 Sep 2024 11:38:10 +0400 Subject: [PATCH] prepare reproducible run with validation --- submitit_train.py | 4 ++-- train_configs/chemlactica_125m.toml | 14 ++++++++++---- train_configs/chemlactica_debug.toml | 8 +++----- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/submitit_train.py b/submitit_train.py index 0b64937f..8c4c597b 100644 --- a/submitit_train.py +++ b/submitit_train.py @@ -6,13 +6,13 @@ if __name__ == "__main__": executor = submitit.AutoExecutor(folder="~/slurm_jobs/titan/job_%j") - n_gpus = 8 + n_gpus = 4 executor.update_parameters( name="titan", timeout_min=3 * 24 * 60, gpus_per_node=n_gpus, nodes=1, mem_gb=80, cpus_per_task=n_gpus * 4, slurm_additional_parameters={ - "partition": "h100" + "partition": "a100" } ) diff --git a/train_configs/chemlactica_125m.toml b/train_configs/chemlactica_125m.toml index caa224c4..6dd87259 100644 --- a/train_configs/chemlactica_125m.toml +++ b/train_configs/chemlactica_125m.toml @@ -17,8 +17,6 @@ log_freq = 1 enable_color_printing = true enable_aim = true save_aim_folder = "aim" -#aim_hash = "c6b4d8b340f74287b82ef928" -#aim_experiment_name = "hello" [model] name = "opt" @@ -32,8 +30,8 @@ name = "AdamW" lr = 1.4e-3 [training] -batch_size = 20 -gradient_accumulation_steps = 8 +batch_size = 16 +gradient_accumulation_steps = 16 seq_len = 2048 warmup_steps = 500 # lr scheduler warm up, normally 20% of the train steps max_norm = 1.0 # grad norm clipping @@ -46,6 +44,14 @@ compile = true dataset = "chemlactica_train" data_processing_style="chemlactica_style" +[validation] +valid_freq = 2000 +enable_valid = true +dataset = "chemlactica_valid" # supported datasets: chemlactica_valid_mini + +[dataloader] +num_workers = 4 + [experimental] pipeline_parallel_degree = 1 enable_async_tensor_parallel = false diff --git a/train_configs/chemlactica_debug.toml b/train_configs/chemlactica_debug.toml index 
77cd97a7..2939a0e9 100644 --- a/train_configs/chemlactica_debug.toml +++ b/train_configs/chemlactica_debug.toml @@ -15,10 +15,8 @@ save_memory_snapshot_folder = "memory_snapshot" [metrics] log_freq = 1 enable_color_printing = true -enable_aim = false +enable_aim = true save_aim_folder = "aim" -#aim_hash = "c6b4d8b340f74287b82ef928" -#aim_experiment_name = "hello" [model] name = "opt" @@ -48,8 +46,8 @@ data_processing_style="chemlactica_style" [validation] valid_freq = 25 -enable_valid = false -dataset = "chemlactica_valid_mini" # supported datasets: c4_test (2K), c4 (177M), chemlactica_train_mini (4K) +enable_valid = true +dataset = "chemlactica_valid_mini" # supported datasets: chemlactica_valid_mini [dataloader] num_workers = 1