diff --git a/.github/workflows/test_torchchat_commands.yml b/.github/workflows/test_torchchat_commands.yml
new file mode 100644
index 000000000..86c4d8224
--- /dev/null
+++ b/.github/workflows/test_torchchat_commands.yml
@@ -0,0 +1,57 @@
+name: Run torchchat command tests
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+  workflow_dispatch:
+
+jobs:
+  torchchat-command-load-test:
+    strategy:
+      matrix:
+        runner: [macos-14]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v2
+      - name: Setup Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.11
+      - name: Print machine info
+        run: |
+          uname -a
+          if [ $(uname -s) == Darwin ]; then
+            sysctl machdep.cpu.brand_string
+            sysctl machdep.cpu.core_count
+          fi
+      - name: Install requirements
+        run: |
+          echo "Installing pip packages"
+          pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
+          pip install -r requirements.txt
+
+      - name: Download Stories files
+        run: |
+
+          mkdir -p checkpoints/stories15M
+          pushd checkpoints/stories15M
+          curl -fsSL -O https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
+          curl -fsSL -O https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
+          popd
+
+      - name: Test generate
+        run: |
+
+          export MODEL_PATH=checkpoints/stories15M/stories15M.pt
+          export MODEL_NAME=stories15M
+          export MODEL_DIR=/tmp
+
+          python generate.py --device cpu --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager1
+          python torchchat.py generate --device cpu --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager2
+          cat ./output_eager1
+          cat ./output_eager2
+          echo "Tests complete."
+
\ No newline at end of file
diff --git a/build/builder.py b/build/builder.py
index f558989b7..8c156a911 100644
--- a/build/builder.py
+++ b/build/builder.py
@@ -3,19 +3,19 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 
-import itertools
+
+import os
 import sys
 import time
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Optional, Tuple, Union
+from typing import Optional, Union
 
 import torch
 import torch._dynamo.config
 import torch._inductor.config
 
-from cli import cli_args
-from quantize import get_precision, name_to_dtype, quantize_model, set_precision
+from quantize import name_to_dtype, quantize_model
 from sentencepiece import SentencePieceProcessor
 
 
@@ -110,7 +110,7 @@ def from_args(cls, args):  # -> TokenizerArgs:
         elif args.checkpoint_dir:
             tokenizer_path = args.checkpoint_dir / "tokenizer.model"
         else:
-            raise RuntimeError(f"cannot find tokenizer model")
+            raise RuntimeError("cannot find tokenizer model")
 
         if not tokenizer_path.is_file():
             raise RuntimeError(f"did not find tokenizer at {tokenizer_path}")
@@ -243,7 +243,7 @@ def _initialize_model(
         # assert model_dtype == "float32", f"dtype setting not valid for a DSO model. Specify dtype during export."
         assert (
             quantize is None or quantize == "{ }"
-        ), f"quantize not valid for exported DSO model. Specify quantization during export."
+        ), "quantize not valid for exported DSO model. Specify quantization during export."
         try:
             model = model_
             # Replace model forward with the AOT-compiled forward
@@ -262,12 +262,12 @@ def _initialize_model(
         # assert model_dtype == "float32", f"dtype setting not valid for a DSO model. Specify dtype during export."
         assert (
             quantize is None or quantize == "{ }"
-        ), f"quantize not valid for exported PTE model. Specify quantization during export."
+ ), "quantize not valid for exported PTE model. Specify quantization during export." try: from build.model_et import PTEModel model = PTEModel(model_.config, builder_args.pte_path) - except Exception as e: + except Exception: raise RuntimeError(f"Failed to load ET compiled {builder_args.pte_path}") else: model = model_ diff --git a/cli.py b/cli.py index 3d6873b78..af2d81fb4 100644 --- a/cli.py +++ b/cli.py @@ -4,12 +4,11 @@ # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. -import os -import time +import json from pathlib import Path import torch -import torch.nn as nn + default_device = "cpu" # 'cuda' if torch.cuda.is_available() else 'cpu' @@ -41,11 +40,19 @@ def check_args(args, command_name: str): print(f"Warning: {text}") -def cli_args(): - import argparse +def add_arguments_for_generate(parser): + _add_arguments_common(parser) + + +def add_arguments_for_eval(parser): + _add_arguments_common(parser) + + +def add_arguments_for_export(parser): + _add_arguments_common(parser) - parser = argparse.ArgumentParser(description="Your CLI description.") +def _add_arguments_common(parser): parser.add_argument( "--seed", type=int, @@ -60,21 +67,6 @@ def cli_args(): action="store_true", help="Whether to use tiktoken tokenizer.", ) - parser.add_argument( - "--export", - action="store_true", - help="Use torchchat to export a model.", - ) - parser.add_argument( - "--eval", - action="store_true", - help="Use torchchat to eval a model.", - ) - parser.add_argument( - "--generate", - action="store_true", - help="Use torchchat to generate a sequence using a model.", - ) parser.add_argument( "--chat", action="store_true", @@ -162,10 +154,10 @@ def cli_args(): parser.add_argument( "--quantize", type=str, default="{ }", help="Quantization options." ) + parser.add_argument("--params-table", type=str, default=None, help="Device to use") parser.add_argument( "--device", type=str, default=default_device, help="Device to use" ) - parser.add_argument("--params-table", type=str, default=None, help="Device to use") parser.add_argument( "--tasks", nargs="+", @@ -183,7 +175,8 @@ def cli_args(): help="maximum length sequence to evaluate", ) - args = parser.parse_args() + +def arg_init(args): if Path(args.quantize).is_file(): with open(args.quantize, "r") as f: @@ -191,5 +184,4 @@ def cli_args(): if args.seed: torch.manual_seed(args.seed) - return args diff --git a/eval.py b/eval.py index 9ebd5337d..6d719d460 100644 --- a/eval.py +++ b/eval.py @@ -3,24 +3,33 @@ # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. 
-import sys
+import argparse
 import time
-from pathlib import Path
 from typing import Optional
 
 import torch
 import torch._dynamo.config
 import torch._inductor.config
 
+from build.builder import (
+    _initialize_model,
+    _initialize_tokenizer,
+    BuilderArgs,
+    TokenizerArgs,
+)
+
+from build.model import Transformer
+from cli import add_arguments_for_eval, arg_init
+from generate import encode_tokens, model_forward
+
+from quantize import set_precision
+
 torch._dynamo.config.automatic_dynamic_shapes = True
 torch._inductor.config.triton.unique_kernel_names = True
 torch._inductor.config.epilogue_fusion = False
 torch._inductor.config.triton.cudagraphs = True
 torch._dynamo.config.cache_size_limit = 100000
 
-from build.model import Transformer
-from cli import cli_args
-from quantize import name_to_dtype, set_precision
 
 try:
     import lm_eval
@@ -29,13 +38,6 @@ except:
     lm_eval_available = False
 
-from build.builder import (
-    _initialize_model,
-    _initialize_tokenizer,
-    BuilderArgs,
-    TokenizerArgs,
-)
-from generate import encode_tokens, model_forward
 
 if lm_eval_available:
     try:  # lm_eval version 0.4
@@ -218,30 +220,19 @@ def main(args) -> None:
     builder_args = BuilderArgs.from_args(args)
     tokenizer_args = TokenizerArgs.from_args(args)
-
-    checkpoint_path = args.checkpoint_path
-    checkpoint_dir = args.checkpoint_dir
-    params_path = args.params_path
-    params_table = args.params_table
-    gguf_path = args.gguf_path
-    tokenizer_path = args.tokenizer_path
-    dso_path = args.dso_path
-    pte_path = args.pte_path
     quantize = args.quantize
     device = args.device
-    model_dtype = args.dtype
     tasks = args.tasks
     limit = args.limit
     max_seq_length = args.max_seq_length
-    use_tiktoken = args.tiktoken
 
     print(f"Using device={device}")
-    set_precision(buildeer_args.precision)
+    set_precision(builder_args.precision)
 
     tokenizer = _initialize_tokenizer(tokenizer_args)
     builder_args.setup_caches = False
     model = _initialize_model(
-        buildeer_args,
+        builder_args,
         quantize,
     )
 
@@ -280,11 +271,8 @@ def main(args) -> None:
 
 
 if __name__ == "__main__":
-
-    def cli():
-        args = cli_args()
-        main(args)
-
-
-if __name__ == "__main__":
-    cli()
+    parser = argparse.ArgumentParser(description="Eval specific CLI.")
+    add_arguments_for_eval(parser)
+    args = parser.parse_args()
+    args = arg_init(args)
+    main(args)
diff --git a/export.py b/export.py
index f31af803a..45c61dbb4 100644
--- a/export.py
+++ b/export.py
@@ -4,16 +4,16 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
+import argparse
 import os
-import time
-from pathlib import Path
 
 import torch
-import torch.nn as nn
-from cli import cli_args
-from quantize import get_precision, name_to_dtype, quantize_model, set_precision
-from torch.export import Dim, export
 
+from build.builder import _initialize_model, BuilderArgs
+from cli import add_arguments_for_export, arg_init, check_args
+from export_aoti import export_model as export_model_aoti
+
+from quantize import set_precision
 
 try:
     executorch_export_available = True
@@ -22,13 +22,6 @@
     executorch_exception = f"ET EXPORT EXCEPTION: {e}"
     executorch_export_available = False
 
-from build.builder import _initialize_model, BuilderArgs, TokenizerArgs
-
-from build.model import Transformer
-from export_aoti import export_model as export_model_aoti
-from generate import decode_one_token
-from quantize import name_to_dtype, quantize_model
-from torch._export import capture_pre_autograd_graph
 
 default_device = "cpu"  # 'cuda' if torch.cuda.is_available() else 'cpu'
 
@@ -44,7 +37,6 @@ def device_sync(device):
 
 def main(args):
     builder_args = BuilderArgs.from_args(args)
-    tokenizer_args = TokenizerArgs.from_args(args)
     quantize = args.quantize
 
     print(f"Using device={builder_args.device}")
@@ -70,7 +62,7 @@ def main(args):
             export_model_et(model, builder_args.device, args.output_pte_path, args)
         else:
             print(
-                f"Export with executorch requested but Executorch could not be loaded"
+                "Export with ExecuTorch requested but ExecuTorch could not be loaded"
             )
             print(executorch_exception)
     if output_dso_path:
@@ -79,10 +71,10 @@ def main(args):
         export_model_aoti(model, builder_args.device, output_dso_path, args)
 
 
-def cli():
-    args = cli_args()
-    main(args)
-
-
 if __name__ == "__main__":
-    cli()
+    parser = argparse.ArgumentParser(description="Export specific CLI.")
+    add_arguments_for_export(parser)
+    args = parser.parse_args()
+    check_args(args, "export")
+    args = arg_init(args)
+    main(args)
diff --git a/generate.py b/generate.py
index d4b26a75b..4d52b4c8b 100644
--- a/generate.py
+++ b/generate.py
@@ -3,6 +3,7 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
+import argparse
 import itertools
 import os
 import sys
@@ -23,8 +24,8 @@ TokenizerArgs,
 )
 from build.model import Transformer
-from cli import cli_args
-from quantize import get_precision, name_to_dtype, quantize_model, set_precision
+from cli import add_arguments_for_generate, arg_init, check_args
+from quantize import set_precision
 
 
 @dataclass
@@ -137,7 +138,7 @@ def decode_n_tokens(
     **sampling_kwargs,
 ):
     new_tokens, new_probs = [], []
-    for i in range(num_new_tokens):
+    for _ in range(num_new_tokens):
         with torch.backends.cuda.sdp_kernel(
             enable_flash=False, enable_mem_efficient=False, enable_math=True
         ):  # Actually better for Inductor to codegen attention here
@@ -356,8 +357,6 @@ def _main(
     # will add a version of _initialize_model in future
     # (need additional args)
     if is_speculative:
-        from builder import _load_model
-
         speculative_builder_args = builder_args
 
         draft_model = _load_model(
@@ -496,8 +495,6 @@ def main(args):
     builder_args = BuilderArgs.from_args(args)
     speculative_builder_args = BuilderArgs.from_speculative_args(args)
     tokenizer_args = TokenizerArgs.from_args(args)
-    generator_args = GeneratorArgs.from_args(args)
-
     _main(
         builder_args,
         speculative_builder_args,
@@ -516,10 +513,10 @@ def main(args):
     )
 
 
-def cli():
-    args = cli_args()
-    main(args)
-
-
 if __name__ == "__main__":
-    cli()
+    parser = argparse.ArgumentParser(description="Generate specific CLI.")
+    add_arguments_for_generate(parser)
+    args = parser.parse_args()
+    check_args(args, "generate")
+    args = arg_init(args)
+    main(args)
diff --git a/torchat.py b/torchat.py
deleted file mode 100644
index d01b12f88..000000000
--- a/torchat.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-
-import os
-import time
-from pathlib import Path
-
-import torch
-import torch.nn as nn
-from cli import check_args, cli_args
-from eval import main as eval_main
-
-from export import main as export_main
-from generate import main as generate_main
-from torch.export import Dim, export
-
-default_device = "cpu"  # 'cuda' if torch.cuda.is_available() else 'cpu'
-
-
-def cli():
-    args = cli_args()
-
-    if args.generate or args.chat:
-        check_args(args, "generate")
-        generate_main(args)
-    elif args.eval:
-        eval_main(args)
-    elif args.export:
-        check_args(args, "export")
-        export_main(args)
-    else:
-        raise RuntimeError("must specify either --generate or --export")
-
-
-if __name__ == "__main__":
-    cli()
diff --git a/torchchat.py b/torchchat.py
new file mode 100644
index 000000000..8b5067cd0
--- /dev/null
+++ b/torchchat.py
@@ -0,0 +1,54 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+
+from cli import (
+    add_arguments_for_eval,
+    add_arguments_for_export,
+    add_arguments_for_generate,
+    arg_init,
+    check_args,
+)
+
+default_device = "cpu"  # 'cuda' if torch.cuda.is_available() else 'cpu'
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Top-level command")
+    subparsers = parser.add_subparsers(
+        dest="subcommand",
+        help="Use `generate`, `eval` or `export` followed by subcommand-specific options.",
+    )
+
+    parser_generate = subparsers.add_parser("generate")
+    add_arguments_for_generate(parser_generate)
+
+    parser_eval = subparsers.add_parser("eval")
+    add_arguments_for_eval(parser_eval)
+
+    parser_export = subparsers.add_parser("export")
+    add_arguments_for_export(parser_export)
+
+    args = parser.parse_args()
+    args = arg_init(args)
+
+    if args.subcommand == "generate":
+        check_args(args, "generate")
+        from generate import main as generate_main
+
+        generate_main(args)
+    elif args.subcommand == "eval":
+        from eval import main as eval_main
+
+        eval_main(args)
+    elif args.subcommand == "export":
+        check_args(args, "export")
+        from export import main as export_main
+
+        export_main(args)
+    else:
+        raise RuntimeError("Must specify a valid subcommand: generate, eval, or export")
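
Example invocations of the new subcommand CLI. The generate command is taken verbatim from the workflow above; the eval and export commands are sketches only — `--tasks` comes from `_add_arguments_common` in cli.py, while the task name and `--output-dso-path` (inferred from `output_dso_path` in export.py) are illustrative and may differ from the final flags:

    # Verbatim from .github/workflows/test_torchchat_commands.yml
    python torchchat.py generate --device cpu --checkpoint-path checkpoints/stories15M/stories15M.pt --temperature 0

    # Sketch: task name is illustrative
    python torchchat.py eval --checkpoint-path checkpoints/stories15M/stories15M.pt --tasks hellaswag

    # Sketch: output flag inferred from export.py
    python torchchat.py export --checkpoint-path checkpoints/stories15M/stories15M.pt --output-dso-path stories15M.so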