Merge branch 'trintamaki/nvlm-example-scripts' into 'main'
NVLM example scripts

See merge request ADLR/megatron-lm!2306
trintamaki committed Nov 16, 2024
2 parents 63b8520 + 4131b07 commit ce507ee
Showing 17 changed files with 1,395 additions and 28 deletions.
2 changes: 1 addition & 1 deletion examples/multimodal/README.md
@@ -31,7 +31,7 @@ python examples/multimodal/model_converter/clip_converter.py --download-root /so
Update the paths to point to the mcore-converted CLIP and Mistral models, then run the following script to combine the Mistral and CLIP models into a single multimodal checkpoint folder:

```
-examples/multimodal/combine_mistral_clip.sh /path/to/mistral/model /path/to/clip/model /output/dir
+examples/multimodal/combine_lm_vision_checkpoints.sh /path/to/mistral/model /path/to/clip/model /output/dir
```
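
Once the script finishes, it is worth a quick sanity check of the combined checkpoint. A minimal sketch, assuming Megatron's usual `model` key and the `language_model`/`vision_model` prefixes the combine script passes via `--prefixes`:

```
# Sanity-check sketch: the combined checkpoint should contain parameters from
# both sub-models. The "model" key and the prefix layout are assumptions based
# on the combine script's --prefixes arguments.
import torch

ckpt = torch.load(
    "/output/dir/iter_0000001/mp_rank_00/model_optim_rng.pt", map_location="cpu"
)
keys = ckpt["model"].keys()
print(any(k.startswith("language_model") for k in keys))  # expect True
print(any(k.startswith("vision_model") for k in keys))    # expect True
```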

## Training
57 changes: 57 additions & 0 deletions examples/multimodal/combine_lm_vision_checkpoints.sh
@@ -0,0 +1,57 @@
#!/bin/bash
MCORE_LM=$1 # <path_to_mcore_lm_model_folder>
MCORE_VISION=$2 # <path_to_mcore_vision_model_folder>
OUTPUT_DIR=$3 # <path_to_output_folder_for_combined_checkpoint>
MODEL_TYPE=$4 # Model type. Default: Mistral CLIP example.

if [[ $MODEL_TYPE == "nvlm" ]]; then
# NVLM TP=8
python examples/multimodal/combine_state_dicts.py \
--input \
${MCORE_LM}/iter_0000001/mp_rank_00/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_00/model_optim_rng.pt \
${MCORE_LM}/iter_0000001/mp_rank_01/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_01/model_optim_rng.pt \
${MCORE_LM}/iter_0000001/mp_rank_02/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_02/model_optim_rng.pt \
${MCORE_LM}/iter_0000001/mp_rank_03/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_03/model_optim_rng.pt \
${MCORE_LM}/iter_0000001/mp_rank_04/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_04/model_optim_rng.pt \
${MCORE_LM}/iter_0000001/mp_rank_05/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_05/model_optim_rng.pt \
${MCORE_LM}/iter_0000001/mp_rank_06/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_06/model_optim_rng.pt \
${MCORE_LM}/iter_0000001/mp_rank_07/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_07/model_optim_rng.pt \
--prefixes language_model vision_model language_model vision_model language_model vision_model language_model vision_model language_model vision_model language_model vision_model language_model vision_model language_model vision_model \
--output \
${OUTPUT_DIR}/iter_0000001/mp_rank_00/model_optim_rng.pt \
${OUTPUT_DIR}/iter_0000001/mp_rank_01/model_optim_rng.pt \
${OUTPUT_DIR}/iter_0000001/mp_rank_02/model_optim_rng.pt \
${OUTPUT_DIR}/iter_0000001/mp_rank_03/model_optim_rng.pt \
${OUTPUT_DIR}/iter_0000001/mp_rank_04/model_optim_rng.pt \
${OUTPUT_DIR}/iter_0000001/mp_rank_05/model_optim_rng.pt \
${OUTPUT_DIR}/iter_0000001/mp_rank_06/model_optim_rng.pt \
${OUTPUT_DIR}/iter_0000001/mp_rank_07/model_optim_rng.pt
else
# Mistral CLIP example TP=4.
python examples/multimodal/combine_state_dicts.py \
--input \
${MCORE_LM}/iter_0000001/mp_rank_00/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_00/model_optim_rng.pt \
${MCORE_LM}/iter_0000001/mp_rank_01/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_01/model_optim_rng.pt \
${MCORE_LM}/iter_0000001/mp_rank_02/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_02/model_optim_rng.pt \
${MCORE_LM}/iter_0000001/mp_rank_03/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_03/model_optim_rng.pt \
--prefixes language_model vision_model language_model vision_model language_model vision_model language_model vision_model \
--output \
${OUTPUT_DIR}/iter_0000001/mp_rank_00/model_optim_rng.pt \
${OUTPUT_DIR}/iter_0000001/mp_rank_01/model_optim_rng.pt \
${OUTPUT_DIR}/iter_0000001/mp_rank_02/model_optim_rng.pt \
${OUTPUT_DIR}/iter_0000001/mp_rank_03/model_optim_rng.pt
fi

echo 1 > ${OUTPUT_DIR}/latest_checkpointed_iteration.txt
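
The per-rank argument lists above are written out by hand for TP=8 and TP=4. For reference, a loop-based sketch of the same invocation that works for any tensor-parallel size (not part of the repository; assumes the same `iter_0000001/mp_rank_XX` layout):

```
# Sketch: build the --input/--prefixes/--output lists for an arbitrary TP size.
import subprocess

def combine(mcore_lm: str, mcore_vision: str, output_dir: str, tp_size: int = 8):
    inputs, prefixes, outputs = [], [], []
    for rank in range(tp_size):
        rank_path = f"iter_0000001/mp_rank_{rank:02d}/model_optim_rng.pt"
        inputs += [f"{mcore_lm}/{rank_path}", f"{mcore_vision}/{rank_path}"]
        prefixes += ["language_model", "vision_model"]
        outputs.append(f"{output_dir}/{rank_path}")
    subprocess.run(
        ["python", "examples/multimodal/combine_state_dicts.py",
         "--input", *inputs, "--prefixes", *prefixes, "--output", *outputs],
        check=True,
    )
```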
23 changes: 0 additions & 23 deletions examples/multimodal/combine_mistral_clip.sh

This file was deleted.

19 changes: 18 additions & 1 deletion examples/multimodal/config.py
@@ -73,6 +73,20 @@ def get_language_model_config(config):
config.apply_rope_fusion = False
config.attention_softmax_in_fp32 = True
config.ffn_hidden_size = 20480
elif config.language_model_type == "qwen2.0_72B":
config.activation_func = torch.nn.functional.silu
config.add_bias_linear = False
config.add_qkv_bias = True
config.bias_activation_fusion = False
config.gated_linear_unit = True
config.apply_query_key_layer_scaling = False
config.layernorm_zero_centered_gamma = (
False # Zero centered gamma not supported for RMSNorm
)
config.bias_dropout_fusion = False
config.apply_rope_fusion = False
config.attention_softmax_in_fp32 = True
config.ffn_hidden_size = 29568
else:
raise ValueError(f"unknown language model type {config.language_model_type}")

@@ -146,7 +160,6 @@ def get_vision_model_config(config, apply_query_key_layer_scaling):
else:
raise ValueError(f"unknown vision model type {config.vision_model_type}")


return config


@@ -171,6 +184,10 @@ def get_vision_projection_config(config, hidden_size):
config.ffn_hidden_size = 20480
config.normalization = 'LayerNorm'
config.activation_func = torch.nn.functional.gelu
elif config.language_model_type == "qwen2.0_72B":
config.ffn_hidden_size = 29568
config.normalization = 'LayerNorm'
config.activation_func = torch.nn.functional.gelu
else:
raise ValueError(f"unknown language model type {config.language_model_type}")

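
For orientation, a minimal usage sketch of the override functions above (the bare namespace stands in for the real Megatron config object, which carries many more fields; assumes the repo root is on `PYTHONPATH`):

```
# Usage sketch: the get_*_config helpers mutate the passed-in config object.
from types import SimpleNamespace

from examples.multimodal.config import get_language_model_config

config = SimpleNamespace(language_model_type="qwen2.0_72B")
get_language_model_config(config)  # applies the qwen2.0_72B overrides above
assert config.ffn_hidden_size == 29568
```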
Empty file modified: examples/multimodal/model_converter/internvit_converter.py (mode 100644 → 100755)
6 changes: 3 additions & 3 deletions examples/multimodal/model_converter/siglip_converter.py
@@ -61,9 +61,9 @@ def add_chunck_tensor(new_tensor, new_name, chunk_dim=None):
head_dim = 72
num_head = 16
for layer_idx in range(27):
origin_base = f"vision_tower.vision_model.encoder.layers.{layer_idx}"
target_base = f"decoder.layers.{layer_idx}"

for param_type in ["weight", "bias"]:
# QKV
q_proj_params = state_dict[f"{origin_base}.self_attn.q_proj.{param_type}"]
@@ -135,7 +135,7 @@ def add_chunck_tensor(new_tensor, new_name, chunk_dim=None):
Example usage:
python siglip_converter.py --tensor-parallel-size 4 --output google_paligemma_3b_pt_44_mcore_tp_4 --use-te
-examples/multimodal/combine_mistral_clip.sh /lustre/fsw/portfolios/llmservice/users/jbarker/workspace/checkpoints/Mistral-7B-Instruct-v0.3-mcore-tp4 google_paligemma_3b_pt_44_mcore_tp_4 mistral_7b_instruct_v0p3_google_paligemma_3b_pt_44_mcore_tp_4
+examples/multimodal/combine_mistral_clip.sh Mistral-7B-Instruct-v0.3-mcore-tp4 google_paligemma_3b_pt_44_mcore_tp_4 mistral_7b_instruct_v0p3_google_paligemma_3b_pt_44_mcore_tp_4
""",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
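
The converter's QKV handling follows the common Megatron conversion pattern of fusing the separate q/k/v projections into a single matrix. A generic sketch of that step, using the constants above (16 heads, head_dim 72); the per-head [q, k, v] interleaving is an assumption based on Megatron core's attention layout, not code from this converter:

```
# Sketch: fuse separate q/k/v tensors into one Megatron-style QKV tensor.
import torch

def fuse_qkv(q, k, v, num_heads=16, head_dim=72):
    # Split each projection into per-head blocks: [num_heads, head_dim, ...].
    q = q.reshape(num_heads, head_dim, -1)
    k = k.reshape(num_heads, head_dim, -1)
    v = v.reshape(num_heads, head_dim, -1)
    # Interleave [q, k, v] per head, then flatten; works for weights and biases.
    fused = torch.cat([q, k, v], dim=1)
    return fused.reshape(3 * num_heads * head_dim, -1).squeeze(-1)
```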
5 changes: 5 additions & 0 deletions examples/multimodal/nvlm/README.md
@@ -0,0 +1,5 @@
NVLM
====

Work in progress.
Please refer to the [NVLM paper](https://arxiv.org/pdf/2409.11402) for details.
165 changes: 165 additions & 0 deletions examples/multimodal/nvlm/nvlm_prompts.json
@@ -0,0 +1,165 @@
{
"COMMENT": "Mixture of our own custom prompts and some prompts from https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain/viewer and https://huggingface.co/datasets/HuggingFaceM4/M3IT",
"Captioning": {
"raw": [
"Can you briefly explain what you see in the image?",
"Describe what's happening in this image in one short sentence.",
"Write a short caption that accurately represents the content of this image.",
"Please generate a descriptive caption for the image provided.",
"How would you summarize the scene depicted in the picture in short?",
"Describe the image briefly.",
"Write a succinct description of the image, capturing its main components, the relationships between them, and any notable details.",
"Create a concise caption that accurately describes the main elements in the image provided.",
"Write a brief, yet comprehensive, description of the image.",
"Describe the image in a clear and concise manner.",
"For the given image, provide a one-sentence summary that captures the most important details.",
"Generate a short caption for the picture.",
"Write a short and informative description that highlights the primary subjects and actions occurring in the given image.",
"Provide a concise and informative caption for the image, focusing on the primary subjects.",
"Write a clear description of the image, make sure the key features are well covered.",
"Offer a succinct explanation of the picture presented."
]
},
"CaptioningPretraining": {
"raw": [
"Give a brief description of image.",
"Give a brief description of the image.",
"Provide a brief description of the given image.",
"Provide a one-sentence caption for the provided image.",
"Write a terse but informative summary of the picture.",
"Describe the image concisely.",
"Generate a clear and concise summary of the photo."
]
},
"CaptioningSFT": {
"raw": [
"Give a brief description of the image.",
"Give a short and clear explanation of the subsequent image.",
"Present a compact description of the photo's key features.",
"Provide a brief description of the given image.",
"Provide a one-sentence caption for the provided image.",
"Render a clear and concise summary of the photo.",
"Share a concise interpretation of the image provided.",
"Summarize the visual content of the image.",
"Write a terse but informative summary of the picture.",
"Describe the image concisely."
]
},
"VQAPretraining": {
"raw": [
"Question: {} Short answer:",
"Question: {} Answer:"
]
},
"VQASFT": {
"raw": [
"{}",
"{}\nAnswer the question using a single word or phrase."
],
"docvqa": [
"{}",
"{}\nAnswer this question using the text in the image directly."
]
},
"DocPretraining": {
"raw": [
"Retrieve the text from the given pdf image.",
"Extract the text from the provided document.",
"Transcribe the text displayed in the image."
],
"ocr_multi": [
"Apply grounded Optical Character Recognition (OCR) to the provided image.",
"Extract all texts and their bounding boxes from the given image using grounded OCR.",
"Extract and transcribe all visible text from the provided image, ensuring accurate spatial recognition.",
"Conduct a detailed optical character recognition analysis on this image, maintaining the text's original layout and positioning.",
"Execute a thorough text recognition procedure on this visual input, ensuring that the spatial arrangement of the text is accurately represented.",
"Perform an in-depth OCR scan of the image, capturing both the content and contextual positioning of all textual information.",
"OCR with grounding:"
],
"md": [
"Extract the text from the given image and format it in Markdown.",
"Convert the text from the provided image into Markdown format.",
"Transform the text from the given image into Markdown syntax.",
"Extract and convert the text from the image to Markdown.",
"Retrieve the text from the image and present it in Markdown format."
],
"grounded_ocr": [
"{}. Text:",
"Recognize the text in this region: {}.",
"Identify the text in this area: {}.",
"Detect the text within this section: {}."
],
"referring_grounding": [
"Region of \"{}\" is:",
"Locate the text \"{}\" in the image.",
"Identify the text \"{}\" in the image and provide the coordinates."
]
},
"CaptioningDetailed": {
"raw": [
"Create a comprehensive paragraph that captures the essence of the image while weaving a cohesive narrative around its elements.",
"Compose a paragraph that thoroughly describes the image's content, providing context and connections between different aspects of the scene.",
"Provide a detailed, paragraph-length description of the image that paints a vivid picture and tells a coherent story.",
"Write a rich and engaging paragraph that delves into the image's components, describing not only what is seen but also how the elements relate to one another.",
"Give a well-rounded, paragraph-length explanation of the image, describing the scene and its components while forming a complete and engaging narrative.",
"Produce a paragraph that not only describes the individual elements in the image but also weaves them together to form a cohesive, connected account.",
"Construct a paragraph that captures the image's details and context, offering a more in-depth and engaging story than a simple caption.",
"Compose a descriptive paragraph that brings the image to life through detailed storytelling, connecting the various visual elements into a unified narrative.",
"Create a paragraph that provides an extensive and interconnected description of the image, ensuring that the narrative is both detailed and cohesive.",
"Write a compelling and detailed paragraph that delves into the image's components, linking them together to create a unified and engaging story."
]
},
"OCR": {
"raw": [
"Can you read the text from image and output here?",
"Extract and document the text from the provided image.",
"Converting the text embedded in this image into a readable document.",
"Transcribe all the text you find.",
"Can you extract all visible text from the image here?"
],
"markdown": [
"Can you extract all visible text from the provided image?",
"Converting the text embedded in this image into a readable markdown document.",
"Can you read the text in the document as markdown?",
"Transcribe the document as markdown.",
"Extract and document the text from the provided image."
],
"table_markdown": [
"Can you extract all visible text from the provided table?",
"Can you read the text in the provided table as markdown?",
"Transcribe the table as markdown.",
"Extract and document the text from the provided table image."
],
"plain": [
"Transcribe the document as plain text.",
"Extract and document the text from the provided image.",
"Converting the text embedded in this image into a readable document.",
"Transcribe all the text you find.",
"Can you extract all visible text from the image here?"
],
"bbox_plain": [
"Transcribe the document as plain text along with bounding boxes.",
"Extract and document the text from the provided image along with bounding boxes.",
"Converting the text embedded in this image into a readable documen along with bounding boxes.",
"Can you extract all visible text with bounding boxes from the image here?"
]
},
"VQA": {
"raw": [
"Given the image, answer the following question with few words.",
"Answer the following question: ",
"What is the answer to this question?",
"Write the answer: ",
"Please answer this question: "
]
},
"Embedded": {
"raw": [
"Given the image, answer the following question with few words.",
"Answer the following question: ",
"What is the answer to this question?",
"Write the answer: ",
"Please answer this question: "
]
}
}
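
A minimal sketch of how these templates are plausibly consumed during data preparation (illustrative only, not the repository's actual loader; the `{}` placeholder holds the question text):

```
# Illustrative sketch: sample a prompt template and fill in a question.
import json
import random

with open("examples/multimodal/nvlm/nvlm_prompts.json") as f:
    prompts = json.load(f)

template = random.choice(prompts["VQAPretraining"]["raw"])
print(template.format("What color is the car?"))
# e.g. "Question: What color is the car? Short answer:"
```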