Skip to content

Commit

Permalink
configs and unification
Browse files Browse the repository at this point in the history
  • Loading branch information
kshitijkg committed Mar 28, 2024
1 parent 7d370c7 commit 2a8c9d3
Show file tree
Hide file tree
Showing 7 changed files with 280 additions and 67 deletions.
160 changes: 160 additions & 0 deletions configs/hb-1-4B.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
{
"pipe_parallel_size": 1,
"model_parallel_size": 1,

"num_layers": 24,
"hidden_size": 2048,
"num_attention_heads": 16,
"seq_length": 2048,
"max_position_embeddings": 2048,
"pos_emb": "rotary",
"rotary_pct": 0.25,
"no_weight_tying": true,
"gpt_j_residual": true,
"output_layer_parallelism": "column",


"vision_encoder_args": {
"name": "openclip",
"arch": "ViT-B-32",
"modality": "vision",
"pretrained_data": "laion2b_s34b_b79k",
"cache_dir": '/p/scratch/ccstdl/gupta6/openclip/vitb32.pt',
"freeze_encoder": True,
"add_lora": True,
"pretrained": True,
"encoder_type": "openclip",
"embed_dropout_prob": 0.1,
"use_embed_layernorm": True,
"perceiver_seq_length": 64,
"num_layers_to_unfreeze": 2,
},

# "attention_config": [[["flash"], 24]],

"scaled_upper_triang_masked_softmax_fusion": true,
"bias_gelu_fusion": true,

"init_method": "small_init",
"output_layer_init_method": "wang_init",

"optimizer": {
"type": "Adam",
"params": {
"lr": 0.0002,
"betas": [0.9, 0.95],
"eps": 1.0e-8
}
},
"min_lr": 0.00002,


"lr_param_groups_config": {
"image_encoder.encoder":{
"decay_style" : "cosine",
"start_lr": 0.0002,
"min_lr": 0.00002,
"warmup_iter": 860,
"end_iter": 4300,
},
"input_layernorm":{
"decay_style" : "cosine",
"start_lr": 0.0002,
"min_lr": 0.00002,
"warmup_iter": 860,
"end_iter": 4300,
},
"attention":{
"decay_style" : "cosine",
"start_lr": 0.0002,
"min_lr": 0.00002,
"warmup_iter": 860,
"end_iter": 4300,
},
"post_attention_layernorm":{
"decay_style" : "cosine",
"start_lr": 0.0002,
"min_lr": 0.00002,
"warmup_iter": 860,
"end_iter": 4300,
},
"mlp.dense_h_to_4h":{
"decay_style" : "cosine",
"start_lr": 0.0002,
"min_lr": 0.00002,
"warmup_iter": 860,
"end_iter": 4300,
},
# "27.":{
# "decay_style" : "cosine",
# "start_lr": 0.0002,
# "min_lr": 0.00002,
# "warmup_iter": 860,
# "end_iter": 4300,
# },
},

"zero_optimization": {
"stage": 1,
"allgather_partitions": true,
"allgather_bucket_size": 500000000,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 500000000,
"contiguous_gradients": true,
"cpu_offload": false
},

"train_micro_batch_size_per_gpu": 16,
"gradient_accumulation_steps": 1,
"data_impl": "mmap",

"checkpoint_activations": true,
"checkpoint_num_layers": 1,
"partition_activations": true,
"synchronize_each_layer": true,

"gradient_clipping": 1.0,
"weight_decay": 0.1,
"hidden_dropout": 0,
"attention_dropout": 0,

# "fp16": {
# "fp16": true,
# "enabled": true,
# "loss_scale": 0,
# "loss_scale_window": 1000,
# "initial_scale_power": 12,
# "hysteresis": 2,
# "min_loss_scale": 1
# },

"bf16": {
"enabled": true
},
"precision": "bfloat16",
"fp32_allreduce": true,

"data_types": {
"grad_accum_dtype": "fp32"
},


"train_iters": 500,
"lr_decay_iters": 500,
"distributed_backend": "nccl",
"lr_decay_style": "cosine",
"warmup": 0.01,
"checkpoint_factor": 100,
"eval_interval": 20,
"eval_iters": 10,

"log_interval": 10,
"steps_per_print": 10,
"keep_last_n_checkpoints": 4,
"wall_clock_breakdown": true,

# "launcher": "slurm",
# "deepspeed_slurm": true

}
96 changes: 48 additions & 48 deletions configs/hummingbird_streaming.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,116 +4,116 @@
'dataset': {
'batching_method': 'stratified',
'streams':{
laion400M:{ # 154980
slimpajama:{
'local': "/p/fastdata/mmlaion/hummingbird/hummingbird_dataset_final",
'split': "laion_400m_train",
'split': "text_train_chunk1",
'keep_zip': True, # in case we need compressed files after testing
'proportion': 0.1,
'proportion': 0.5,
},
arxiv:{ # 610000
obelics:{
'local': "/p/fastdata/mmlaion/hummingbird/hummingbird_dataset_final",
'split': "text_train_chunk1",
'split': "obelics_train",
'keep_zip': True, # in case we need compressed files after testing
'proportion': 0.4,
'proportion': 0.2,
},
mmc4:{ # 3850
grit:{
'local': "/p/fastdata/mmlaion/hummingbird/hummingbird_dataset_final",
'split': "mmc4_train",
'split': "grit_train",
'keep_zip': True, # in case we need compressed files after testing
'proportion': 0.2,
'proportion': 0.1,
},
seed:{ # 79200
understanding:{
'local': "/p/fastdata/mmlaion/hummingbird/hummingbird_dataset_final",
'split': "seed_train",
'split': "datacomp_train_understanding",
'keep_zip': True, # in case we need compressed files after testing
'proportion': 0.2,
'proportion': 0.1,
},
grit:{ # 45360
generation:{
'local': "/p/fastdata/mmlaion/hummingbird/hummingbird_dataset_final",
'split': "grit_train",
'split': "datacomp_train_generation",
'keep_zip': True, # in case we need compressed files after testing
'proportion': 0.1,
},
}
},
'state_dict_path': "/p/fastdata/mmlaion/hummingbird/runs/checkpoints/data/train_hb_testing1",
'state_dict_path': "/p/fastdata/mmlaion/hummingbird/runs/checkpoints/data/train_hb_final_unified",
'drop_last': True,
},

valid_streaming_data_config: [
{
'name': 'text',
'dataset': {
'local': "/p/fastdata/mmlaion/hummingbird/hummingbird_dataset_final",
'split': "laion_400m_validation",
'local': "/p/fastdata/mmlaion/hummingbird/hummingbird_dataset_final/text_train_chunk1",
'split': "0",
'shuffle': False,
'keep_zip': True, # in case we need compressed files after testing
},
'state_dict_path': "/p/fastdata/mmlaion/hummingbird/runs/checkpoints/data/validation_hb_testing1",
'state_dict_path': "/p/fastdata/mmlaion/hummingbird/runs/checkpoints/data/train_hb_final_unified",
'drop_last': True,
'epoch_size': 40000,
'epoch_size': 200,
},
{
'name': 'text',
'name': 'obelics',
'dataset': {
'local': "/p/fastdata/mmlaion/hummingbird/hummingbird_dataset_final",
'split': "text_val_chunk1",
'local': "/p/fastdata/mmlaion/hummingbird/hummingbird_dataset_final/obelics_train/0",
'split': "0",
'shuffle': False,
'keep_zip': True, # in case we need compressed files after testing
},
'state_dict_path': "/p/fastdata/mmlaion/hummingbird/runs/checkpoints/data/validation_hb_testing1",
'state_dict_path': "/p/fastdata/mmlaion/hummingbird/runs/checkpoints/data/train_hb_final_unified",
'drop_last': True,
'epoch_size': 40000,
'epoch_size': 200,
},
{
'name': 'text',
'name': 'grit',
'dataset': {
'local': "/p/fastdata/mmlaion/hummingbird/hummingbird_dataset_final",
'split': "mmc4_val",
'local': "/p/fastdata/mmlaion/hummingbird/hummingbird_dataset_final/grit_train",
'split': "0",
'shuffle': False,
'keep_zip': True, # in case we need compressed files after testing
},
'state_dict_path': "/p/fastdata/mmlaion/hummingbird/runs/checkpoints/data/validation_hb_testing1",
'state_dict_path': "/p/fastdata/mmlaion/hummingbird/runs/checkpoints/data/train_hb_final_unified",
'drop_last': True,
'epoch_size': 40000,
'epoch_size': 200,
},
{
'name': 'text',
'name': 'understanding',
'dataset': {
'local': "/p/fastdata/mmlaion/hummingbird/hummingbird_dataset_final",
'split': "seed_val",
'local': "/p/fastdata/mmlaion/hummingbird/hummingbird_dataset_final/datacomp_train_understanding",
'split': "0",
'shuffle': False,
'keep_zip': True, # in case we need compressed files after testing
},
'state_dict_path': "/p/fastdata/mmlaion/hummingbird/runs/checkpoints/data/validation_hb_testing1",
'state_dict_path': "/p/fastdata/mmlaion/hummingbird/runs/checkpoints/data/train_hb_final_unified",
'drop_last': True,
'epoch_size': 40000,
'epoch_size': 200,
},
{
'name': 'text',
'name': 'generation',
'dataset': {
'local': "/p/fastdata/mmlaion/hummingbird/hummingbird_dataset_final",
'split': "grit_val",
'local': "/p/fastdata/mmlaion/hummingbird/hummingbird_dataset_final/datacomp_train_generation",
'split': "0",
'shuffle': False,
'keep_zip': True, # in case we need compressed files after testing
},
'state_dict_path': "/p/fastdata/mmlaion/hummingbird/runs/checkpoints/data/validation_hb_testing1",
'state_dict_path': "/p/fastdata/mmlaion/hummingbird/runs/checkpoints/data/train_hb_final_unified",
'drop_last': True,
'epoch_size': 40000,
'epoch_size': 200,
}

],

test_streaming_data_config: {
'name': 'text',
'dataset': {
'local': "/p/fastdata/mmlaion/hummingbird/hummingbird_dataset_final",
'split': "text_val_chunk1",
'local': "/p/fastdata/mmlaion/hummingbird/hummingbird_dataset_final/text_train_chunk1",
'split': "1",
'shuffle': False,
'keep_zip': True, # in case we need compressed files after testing
},
'drop_last': True,
'epoch_size': 40000,
'epoch_size': 200,
},

# If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group.
Expand All @@ -125,21 +125,21 @@
"vocab-file": "/p/project/ccstdl/gupta6/multimodal/20B_tokenizer.json",
"position_pad_id": -1,

"save": "/p/fastdata/mmlaion/hummingbird/runs/checkpoints/model/hb_testing1",
"load": "/p/fastdata/mmlaion/hummingbird/runs/checkpoints/model/hb_testing1", # "/p/fastdata/mmlaion/hummingbird/checkpoints/1_4_expanded_neox", #"/p/fastdata/mmlaion/hummingbird/checkpoints/1_neox",
"save": "/p/fastdata/mmlaion/hummingbird/runs/checkpoints/model/1_4_final",
"load": "/p/fastdata/mmlaion/hummingbird/checkpoints/1_4_final", # "/p/fastdata/mmlaion/hummingbird/checkpoints/1_4_expanded_neox", #"/p/fastdata/mmlaion/hummingbird/checkpoints/1_neox",
"checkpoint_validation_with_forward_pass": False,

"tensorboard-dir": "/p/fastdata/mmlaion/hummingbird/runs/tensorboard/hb_testing1",
"log-dir": "/p/fastdata/mmlaion/hummingbird/runs/logs/hb_testing1",
"tensorboard-dir": "/p/fastdata/mmlaion/hummingbird/runs/tensorboard/1_4_final",
"log-dir": "/p/fastdata/mmlaion/hummingbird/runs/logs/1_4_final",
"use_wandb": True,
"wandb_host": "https://api.wandb.ai",
"wandb_project": "hummingbird",
"wandb_dir": "/p/fastdata/mmlaion/hummingbird/runs/wandb/hb_testing1",
"wandb_dir": "/p/fastdata/mmlaion/hummingbird/runs/wandb/1_4_final",
"no_ssh_check": True,
"num_workers": 1,

"vision_seq_length": 64,
# "finetune": True,
"finetune": True,
# "concat"
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from megatron.tokenizer.tokenizer import build_tokenizer
from argparse import ArgumentParser, Namespace
from megatron.tokenizer.tokenizer import build_tokenizer

def main(args: Namespace) -> None:
    """Extend a tokenizer's vocabulary with multimodal special tokens and save it.

    Mutates *args* in place with the distributed-training fields that
    ``build_tokenizer`` requires even when run offline, registers grounding,
    bounding-box, image-delimiter, quantized-box, and SEED-codebook special
    tokens, then writes the extended tokenizer JSON to disk.

    Args:
        args (Namespace): Commandline arguments (``tokenizer_type``,
            ``vocab_file``, ``merge_file``).
    """
    # build_tokenizer expects these distributed-setup fields to exist.
    args.rank = 0
    args.model_parallel_size = 1
    args.make_vocab_size_divisible_by = 128
    tokenizer = build_tokenizer(args)

    # Fixed markers: grounding phrases, bounding boxes, and image delimiters.
    special_tokens = [
        "<|p|>",
        "<|/p|>",
        "<|box|>",
        "<|/box|>",
        "<|grounding|>",
        "<|image_start|>",
        "<|image_end|>",
    ]
    # 1024 quantized box-coordinate tokens.
    special_tokens += [f"<|box_{i}|>" for i in range(1024)]
    # 8192 SEED image-codebook tokens.
    special_tokens += [f"<|seed_{i}|>" for i in range(8192)]

    # Add one at a time to preserve the original insertion order / token ids.
    for token in special_tokens:
        tokenizer.tokenizer.add_special_tokens([token])

    tokenizer.tokenizer.save(
        "/p/project/ccstdl/gupta6/multimodal/20B_tokenizer_final.json"
    )


def parse_args() -> Namespace:
    """Build and run the CLI parser for the token-extension script.

    Returns:
        Namespace: Parsed arguments (``tokenizer_type``, ``vocab_file``,
        ``merge_file``), each an optional string defaulting to ``None``.
    """
    arg_parser = ArgumentParser(
        description="Add new tokens to the vocabulary of a tokenizer."
    )
    # All three flags share the same shape: optional string, default None.
    for flag in ("--tokenizer_type", "--vocab_file", "--merge_file"):
        arg_parser.add_argument(flag, type=str, required=False, default=None)
    return arg_parser.parse_args()


if __name__ == "__main__":
main(parse_args())
Loading

0 comments on commit 2a8c9d3

Please sign in to comment.