Commit

resolve conflicts with main
tigranfah committed Sep 28, 2024
2 parents (9d929d1 + be97fc3) · commit f9b1b33
Showing 7 changed files with 492 additions and 11 deletions.
5 changes: 5 additions & 0 deletions torchtitan/config_manager.py
@@ -266,6 +266,11 @@ def __init__(self):
             default=True,
             action="store_true",
             help="Whether to apply loss parallel when sequence parallel is enabled",
         )
+        self.parser.add_argument(
+            "--training.representation_type",
+            default="SMILES",
+            help="The representation type of the molecule for training the model.",
+        )
         self.parser.add_argument(
             "--experimental.enable_async_tensor_parallel",
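
For context, a minimal sketch (not part of this diff) of how the new flag would be read after parsing, assuming torchtitan's usual mapping of "--section.option" arguments onto config.section.option attributes; the "SELFIES" override value is purely hypothetical:

# Sketch only: reading the new training.representation_type option.
# Assumes JobConfig.parse_args accepts an argument list, as in upstream torchtitan.
from torchtitan.config_manager import JobConfig

config = JobConfig()
config.parse_args(["--training.representation_type", "SELFIES"])  # hypothetical value
print(config.training.representation_type)  # "SELFIES"; defaults to "SMILES" when omitted
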
9 changes: 5 additions & 4 deletions torchtitan/datasets/hf_datasets.py
@@ -6,7 +6,6 @@

 import pickle
 from typing import Any, Dict, List, Optional
-from pathlib import Path
 import glob
 import os

@@ -92,6 +91,7 @@ def __init__(
         dataset_path: Optional[str],
         data_processing_style: str,
         tokenizer: Tokenizer,
+        representation_type: str = "SMILES",
         seq_len: int = 2048,
         world_size: int = 1,
         rank: int = 0,
@@ -139,6 +139,7 @@ def __init__(
         self.infinite = infinite
         self.rank = rank
         self.world_size = world_size
+        self.representation_type = representation_type

         # variables for checkpointing
         self._sample_idx = 0
@@ -161,7 +162,7 @@ def __iter__(self):
                 continue

             for sample_json in self._get_data_iter():
-                sample_text = self.data_processing_fn(sample_json, self.rng)
+                sample_text = self.data_processing_fn(sample_json, self.rng, self.representation_type)
                 sample_tokens = self._tokenizer.encode(sample_text, bos=True, eos=True)
                 self._all_tokens.extend(sample_tokens)
                 self._sample_idx += 1
@@ -242,14 +243,14 @@ def build_hf_data_loader(
     seq_len: int,
     world_size,
     rank,
+    representation_type,
     infinite: bool = True,
     pin_memory: bool = False,
     num_workers: int = 2,
     special_mode = None,
 ):

     hf_ds = HuggingFaceDataset(
-        dataset_name, dataset_path, data_processing_style, tokenizer, seq_len, world_size, rank, infinite, special_mode
+        dataset_name, dataset_path, data_processing_style, tokenizer, representation_type, seq_len, world_size, rank, infinite, special_mode
     )

     return DPAwareDataLoader(rank, hf_ds, batch_size=batch_size, pin_memory=pin_memory, num_workers=num_workers)
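
A usage sketch of the updated loader construction (illustrative, not taken from the repository): keyword arguments avoid depending on the exact positional order of the earlier parameters, and tokenizer, world_size, rank, and job_config are assumed to be defined by the calling training script.

# Illustrative call only; dataset name and processing-style key are placeholders.
data_loader = build_hf_data_loader(
    dataset_name="chemlactica_train",            # hypothetical dataset key
    dataset_path=None,
    data_processing_style="chemlactica_style",   # hypothetical processing-style key
    tokenizer=tokenizer,
    batch_size=8,
    seq_len=2048,
    world_size=world_size,
    rank=rank,
    representation_type=job_config.training.representation_type,
)
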
4 changes: 2 additions & 2 deletions torchtitan/utils/dataset_utils.py
@@ -20,12 +20,12 @@ def load_jsonl_line(jsonl_line):
         raise ValueError(f"Error decoding JSON: {e}")


-def chemlactica_style_data_processing(sample_json, rng):
+def chemlactica_style_data_processing(sample_json, rng, representation_type):
     try:
         sample_json = json.loads(sample_json["text"])
         compound = delete_empty_tags(sample_json)
         sample_json = generate_formatted_string(
-            compound, rng
+            compound, rng, representation_type
         )
     except Exception as e:
         print(e)
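
A usage sketch of the updated processing function (illustrative only; the compound record and the random source are assumptions about the expected input format):

# Illustrative only: the function expects a record whose "text" field holds a
# JSON-encoded compound; it forwards rng and the chosen representation type to
# generate_formatted_string.
import random

from torchtitan.utils.dataset_utils import chemlactica_style_data_processing

rng = random.Random(0)  # assumed to match the rng used by HuggingFaceDataset
record = {"text": '{"SMILES": "CCO", "CID": 702}'}  # hypothetical compound record
formatted = chemlactica_style_data_processing(record, rng, "SMILES")
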
(Diffs for the remaining 4 changed files are not shown.)
