[BugFix] Use action_spec_unbatched whenever necessary

ghstack-source-id: 4168c6c8b6b5febd8db4fd43e71e46e9bfeb10cc
Pull Request resolved: #2592
pytorch · Nov 20, 2024 · 6dae0cf · 6dae0cf
1 parent a47b32c
commit 6dae0cf
Show file tree

Hide file tree

Showing 24 changed files with 141 additions and 106 deletions.
diff --git a/examples/distributed/collectors/multi_nodes/ray_train.py b/examples/distributed/collectors/multi_nodes/ray_train.py
@@ -85,8 +85,8 @@
         in_keys=["loc", "scale"],
         distribution_class=TanhNormal,
         distribution_kwargs={
-            "low": env.action_spec.space.low,
-            "high": env.action_spec.space.high,
+            "low": env.action_spec_unbatched.space.low,
+            "high": env.action_spec_unbatched.space.high,
         },
         return_log_prob=True,
     )

diff --git a/sota-implementations/a2c/utils_atari.py b/sota-implementations/a2c/utils_atari.py
@@ -101,8 +101,8 @@ def make_ppo_modules_pixels(proof_environment, device):
         num_outputs = proof_environment.action_spec.shape
         distribution_class = TanhNormal
         distribution_kwargs = {
-            "low": proof_environment.action_spec.space.low.to(device),
-            "high": proof_environment.action_spec.space.high.to(device),
+            "low": proof_environment.action_spec_unbatched.space.low.to(device),
+            "high": proof_environment.action_spec_unbatched.space.high.to(device),
         }
 
     # Define input keys

diff --git a/sota-implementations/a2c/utils_mujoco.py b/sota-implementations/a2c/utils_mujoco.py
@@ -57,8 +57,8 @@ def make_ppo_models_state(proof_environment, device, *, compile: bool = False):
     num_outputs = proof_environment.action_spec.shape[-1]
     distribution_class = TanhNormal
     distribution_kwargs = {
-        "low": proof_environment.action_spec.space.low.to(device),
-        "high": proof_environment.action_spec.space.high.to(device),
+        "low": proof_environment.action_spec_unbatched.space.low.to(device),
+        "high": proof_environment.action_spec_unbatched.space.high.to(device),
         "tanh_loc": False,
         "safe_tanh": True,
     }

diff --git a/sota-implementations/cql/utils.py b/sota-implementations/cql/utils.py
@@ -191,7 +191,7 @@ def make_offline_replay_buffer(rb_cfg):
 def make_cql_model(cfg, train_env, eval_env, device="cpu"):
     model_cfg = cfg.model
 
-    action_spec = train_env.action_spec
+    action_spec = train_env.action_spec_unbatched
 
     actor_net, q_net = make_cql_modules_state(model_cfg, eval_env)
     in_keys = ["observation"]

diff --git a/sota-implementations/crossq/utils.py b/sota-implementations/crossq/utils.py
@@ -147,9 +147,7 @@ def make_crossQ_agent(cfg, train_env, device):
     """Make CrossQ agent."""
     # Define Actor Network
     in_keys = ["observation"]
-    action_spec = train_env.action_spec
-    if train_env.batch_size:
-        action_spec = action_spec[(0,) * len(train_env.batch_size)]
+    action_spec = train_env.action_spec_unbatched
     actor_net_kwargs = {
         "num_cells": cfg.network.actor_hidden_sizes,
         "out_features": 2 * action_spec.shape[-1],

diff --git a/sota-implementations/decision_transformer/utils.py b/sota-implementations/decision_transformer/utils.py
@@ -393,7 +393,7 @@ def make_dt_model(cfg):
         make_base_env(env_cfg), env_cfg, obs_loc=0, obs_std=1
     )
 
-    action_spec = proof_environment.action_spec
+    action_spec = proof_environment.action_spec_unbatched
     for key, value in proof_environment.observation_spec.items():
         if key == "observation":
             state_dim = value.shape[-1]

diff --git a/sota-implementations/gail/ppo_utils.py b/sota-implementations/gail/ppo_utils.py
@@ -52,8 +52,8 @@ def make_ppo_models_state(proof_environment):
     num_outputs = proof_environment.action_spec.shape[-1]
     distribution_class = TanhNormal
     distribution_kwargs = {
-        "low": proof_environment.action_spec.space.low,
-        "high": proof_environment.action_spec.space.high,
+        "low": proof_environment.action_spec_unbatched.space.low,
+        "high": proof_environment.action_spec_unbatched.space.high,
         "tanh_loc": False,
     }
 

diff --git a/sota-implementations/iql/utils.py b/sota-implementations/iql/utils.py
@@ -195,9 +195,7 @@ def make_iql_model(cfg, train_env, eval_env, device="cpu"):
     model_cfg = cfg.model
 
     in_keys = ["observation"]
-    action_spec = train_env.action_spec
-    if train_env.batch_size:
-        action_spec = action_spec[(0,) * len(train_env.batch_size)]
+    action_spec = train_env.action_spec_unbatched
     actor_net, q_net, value_net = make_iql_modules_state(model_cfg, eval_env)
 
     out_keys = ["loc", "scale"]

diff --git a/sota-implementations/multiagent/iql.py b/sota-implementations/multiagent/iql.py
@@ -91,7 +91,7 @@ def train(cfg: "DictConfig"):  # noqa: F821
             ("agents", "action_value"),
             ("agents", "chosen_action_value"),
         ],
-        spec=env.unbatched_action_spec,
+        spec=env.action_spec_unbatched,
         action_space=None,
     )
     qnet = SafeSequential(module, value_module)
@@ -103,7 +103,7 @@ def train(cfg: "DictConfig"):  # noqa: F821
             eps_end=0,
             annealing_num_steps=int(cfg.collector.total_frames * (1 / 2)),
             action_key=env.action_key,
-            spec=env.unbatched_action_spec,
+            spec=env.action_spec_unbatched,
         ),
     )
 

diff --git a/sota-implementations/multiagent/maddpg_iddpg.py b/sota-implementations/multiagent/maddpg_iddpg.py
@@ -91,21 +91,21 @@ def train(cfg: "DictConfig"):  # noqa: F821
     )
     policy = ProbabilisticActor(
         module=policy_module,
-        spec=env.unbatched_action_spec,
+        spec=env.action_spec_unbatched,
         in_keys=[("agents", "param")],
         out_keys=[env.action_key],
         distribution_class=TanhDelta,
         distribution_kwargs={
-            "low": env.unbatched_action_spec[("agents", "action")].space.low,
-            "high": env.unbatched_action_spec[("agents", "action")].space.high,
+            "low": env.action_spec_unbatched[("agents", "action")].space.low,
+            "high": env.action_spec_unbatched[("agents", "action")].space.high,
         },
         return_log_prob=False,
     )
 
     policy_explore = TensorDictSequential(
         policy,
         AdditiveGaussianModule(
-            spec=env.unbatched_action_spec,
+            spec=env.action_spec_unbatched,
             annealing_num_steps=int(cfg.collector.total_frames * (1 / 2)),
             action_key=env.action_key,
             device=cfg.train.device,

diff --git a/sota-implementations/multiagent/mappo_ippo.py b/sota-implementations/multiagent/mappo_ippo.py
@@ -92,13 +92,13 @@ def train(cfg: "DictConfig"):  # noqa: F821
     )
     policy = ProbabilisticActor(
         module=policy_module,
-        spec=env.unbatched_action_spec,
+        spec=env.action_spec_unbatched,
         in_keys=[("agents", "loc"), ("agents", "scale")],
         out_keys=[env.action_key],
         distribution_class=TanhNormal,
         distribution_kwargs={
-            "low": env.unbatched_action_spec[("agents", "action")].space.low,
-            "high": env.unbatched_action_spec[("agents", "action")].space.high,
+            "low": env.action_spec_unbatched[("agents", "action")].space.low,
+            "high": env.action_spec_unbatched[("agents", "action")].space.high,
         },
         return_log_prob=True,
     )

diff --git a/sota-implementations/multiagent/qmix_vdn.py b/sota-implementations/multiagent/qmix_vdn.py
@@ -91,7 +91,7 @@ def train(cfg: "DictConfig"):  # noqa: F821
             ("agents", "action_value"),
             ("agents", "chosen_action_value"),
         ],
-        spec=env.unbatched_action_spec,
+        spec=env.action_spec_unbatched,
         action_space=None,
     )
     qnet = SafeSequential(module, value_module)
@@ -103,7 +103,7 @@ def train(cfg: "DictConfig"):  # noqa: F821
             eps_end=0,
             annealing_num_steps=int(cfg.collector.total_frames * (1 / 2)),
             action_key=env.action_key,
-            spec=env.unbatched_action_spec,
+            spec=env.action_spec_unbatched,
         ),
     )
 

diff --git a/sota-implementations/multiagent/sac.py b/sota-implementations/multiagent/sac.py
@@ -96,13 +96,13 @@ def train(cfg: "DictConfig"):  # noqa: F821
 
         policy = ProbabilisticActor(
             module=policy_module,
-            spec=env.unbatched_action_spec,
+            spec=env.action_spec_unbatched,
             in_keys=[("agents", "loc"), ("agents", "scale")],
             out_keys=[env.action_key],
             distribution_class=TanhNormal,
             distribution_kwargs={
-                "low": env.unbatched_action_spec[("agents", "action")].space.low,
-                "high": env.unbatched_action_spec[("agents", "action")].space.high,
+                "low": env.action_spec_unbatched[("agents", "action")].space.low,
+                "high": env.action_spec_unbatched[("agents", "action")].space.high,
             },
             return_log_prob=True,
         )
@@ -146,7 +146,7 @@ def train(cfg: "DictConfig"):  # noqa: F821
         )
         policy = ProbabilisticActor(
             module=policy_module,
-            spec=env.unbatched_action_spec,
+            spec=env.action_spec_unbatched,
             in_keys=[("agents", "logits")],
             out_keys=[env.action_key],
             distribution_class=OneHotCategorical
@@ -194,7 +194,7 @@ def train(cfg: "DictConfig"):  # noqa: F821
             actor_network=policy,
             qvalue_network=value_module,
             delay_qvalue=True,
-            action_spec=env.unbatched_action_spec,
+            action_spec=env.action_spec_unbatched,
         )
         loss_module.set_keys(
             state_action_value=("agents", "state_action_value"),
@@ -209,7 +209,7 @@ def train(cfg: "DictConfig"):  # noqa: F821
             qvalue_network=value_module,
             delay_qvalue=True,
             num_actions=env.action_spec.space.n,
-            action_space=env.unbatched_action_spec,
+            action_space=env.action_spec_unbatched,
         )
         loss_module.set_keys(
             action_value=("agents", "action_value"),

diff --git a/sota-implementations/ppo/utils_atari.py b/sota-implementations/ppo/utils_atari.py
@@ -100,8 +100,8 @@ def make_ppo_modules_pixels(proof_environment):
         num_outputs = proof_environment.action_spec.shape
         distribution_class = TanhNormal
         distribution_kwargs = {
-            "low": proof_environment.action_spec.space.low,
-            "high": proof_environment.action_spec.space.high,
+            "low": proof_environment.action_spec_unbatched.space.low,
+            "high": proof_environment.action_spec_unbatched.space.high,
         }
 
     # Define input keys

diff --git a/sota-implementations/ppo/utils_mujoco.py b/sota-implementations/ppo/utils_mujoco.py
@@ -52,8 +52,8 @@ def make_ppo_models_state(proof_environment):
     num_outputs = proof_environment.action_spec.shape[-1]
     distribution_class = TanhNormal
     distribution_kwargs = {
-        "low": proof_environment.action_spec.space.low,
-        "high": proof_environment.action_spec.space.high,
+        "low": proof_environment.action_spec_unbatched.space.low,
+        "high": proof_environment.action_spec_unbatched.space.high,
         "tanh_loc": False,
     }
 

diff --git a/sota-implementations/redq/utils.py b/sota-implementations/redq/utils.py
@@ -410,7 +410,7 @@ def make_redq_model(
     default_policy_scale = cfg.network.default_policy_scale
     gSDE = cfg.exploration.gSDE
 
-    action_spec = proof_environment.action_spec
+    action_spec = proof_environment.action_spec_unbatched
 
     if actor_net_kwargs is None:
         actor_net_kwargs = {}

diff --git a/sota-implementations/sac/utils.py b/sota-implementations/sac/utils.py
@@ -161,9 +161,7 @@ def make_sac_agent(cfg, train_env, eval_env, device):
     """Make SAC agent."""
     # Define Actor Network
     in_keys = ["observation"]
-    action_spec = train_env.action_spec
-    if train_env.batch_size:
-        action_spec = action_spec[(0,) * len(train_env.batch_size)]
+    action_spec = train_env.action_spec_unbatched
     actor_net_kwargs = {
         "num_cells": cfg.network.hidden_sizes,
         "out_features": 2 * action_spec.shape[-1],

diff --git a/test/mocking_classes.py b/test/mocking_classes.py
@@ -1388,17 +1388,17 @@ def _make_specs(self):
         obs_spec_unlazy = consolidate_spec(obs_specs)
         action_specs = torch.stack(action_specs, dim=0)
 
-        self.unbatched_observation_spec = Composite(
+        self.observation_spec_unbatched = Composite(
             lazy=obs_spec_unlazy,
             state=Unbounded(shape=(64, 64, 3)),
             device=self.device,
         )
 
-        self.unbatched_action_spec = Composite(
+        self.action_spec_unbatched = Composite(
             lazy=action_specs,
             device=self.device,
         )
-        self.unbatched_reward_spec = Composite(
+        self.reward_spec_unbatched = Composite(
             {
                 "lazy": Composite(
                     {"reward": Unbounded(shape=(self.n_nested_dim, 1))},
@@ -1407,7 +1407,7 @@ def _make_specs(self):
             },
             device=self.device,
         )
-        self.unbatched_done_spec = Composite(
+        self.done_spec_unbatched = Composite(
             {
                 "lazy": Composite(
                     {
@@ -1423,19 +1423,6 @@ def _make_specs(self):
             device=self.device,
         )
 
-        self.action_spec = self.unbatched_action_spec.expand(
-            *self.batch_size, *self.unbatched_action_spec.shape
-        )
-        self.observation_spec = self.unbatched_observation_spec.expand(
-            *self.batch_size, *self.unbatched_observation_spec.shape
-        )
-        self.reward_spec = self.unbatched_reward_spec.expand(
-            *self.batch_size, *self.unbatched_reward_spec.shape
-        )
-        self.done_spec = self.unbatched_done_spec.expand(
-            *self.batch_size, *self.unbatched_done_spec.shape
-        )
-
     def get_agent_obs_spec(self, i):
         camera = Bounded(low=0, high=200, shape=(7, 7, 3))
         vector_3d = Unbounded(shape=(3,))
@@ -1610,21 +1597,8 @@ def __init__(self, max_steps: int = 5, start_val: int = 0, **kwargs):
 
         self.make_specs()
 
-        self.action_spec = self.unbatched_action_spec.expand(
-            *self.batch_size, *self.unbatched_action_spec.shape
-        )
-        self.observation_spec = self.unbatched_observation_spec.expand(
-            *self.batch_size, *self.unbatched_observation_spec.shape
-        )
-        self.reward_spec = self.unbatched_reward_spec.expand(
-            *self.batch_size, *self.unbatched_reward_spec.shape
-        )
-        self.done_spec = self.unbatched_done_spec.expand(
-            *self.batch_size, *self.unbatched_done_spec.shape
-        )
-
     def make_specs(self):
-        self.unbatched_observation_spec = Composite(
+        self.observation_spec_unbatched = Composite(
             nested_1=Composite(
                 observation=Bounded(low=0, high=200, shape=(self.nested_dim_1, 3)),
                 shape=(self.nested_dim_1,),
@@ -1642,7 +1616,7 @@ def make_specs(self):
             ),
         )
 
-        self.unbatched_action_spec = Composite(
+        self.action_spec_unbatched = Composite(
             nested_1=Composite(
                 action=Categorical(n=2, shape=(self.nested_dim_1,)),
                 shape=(self.nested_dim_1,),
@@ -1654,7 +1628,7 @@ def make_specs(self):
             action=OneHot(n=2),
         )
 
-        self.unbatched_reward_spec = Composite(
+        self.reward_spec_unbatched = Composite(
             nested_1=Composite(
                 gift=Unbounded(shape=(self.nested_dim_1, 1)),
                 shape=(self.nested_dim_1,),
@@ -1666,7 +1640,7 @@ def make_specs(self):
             reward=Unbounded(shape=(1,)),
         )
 
-        self.unbatched_done_spec = Composite(
+        self.done_spec_unbatched = Composite(
             nested_1=Composite(
                 done=Categorical(
                     n=2,

diff --git a/test/test_env.py b/test/test_env.py
@@ -3512,18 +3512,18 @@ def test_serial_partial_step_and_maybe_reset(self, use_buffers, device, env_devi
 
 def test_single_env_spec():
     env = NestedCountingEnv(batch_size=[3, 1, 7])
-    assert not env.single_full_action_spec.shape
-    assert not env.single_full_done_spec.shape
-    assert not env.single_input_spec.shape
-    assert not env.single_full_observation_spec.shape
-    assert not env.single_output_spec.shape
-    assert not env.single_full_reward_spec.shape
-
-    assert env.single_action_spec.shape
-    assert env.single_reward_spec.shape
-
-    assert env.output_spec.is_in(env.single_output_spec.zeros(env.shape))
-    assert env.input_spec.is_in(env.single_input_spec.zeros(env.shape))
+    assert not env.full_action_spec_unbatched.shape
+    assert not env.full_done_spec_unbatched.shape
+    assert not env.input_spec_unbatched.shape
+    assert not env.full_observation_spec_unbatched.shape
+    assert not env.output_spec_unbatched.shape
+    assert not env.full_reward_spec_unbatched.shape
+
+    assert env.action_spec_unbatched.shape
+    assert env.reward_spec_unbatched.shape
+
+    assert env.output_spec.is_in(env.output_spec_unbatched.zeros(env.shape))
+    assert env.input_spec.is_in(env.input_spec_unbatched.zeros(env.shape))
 
 
 if __name__ == "__main__":

diff --git a/test/test_libs.py b/test/test_libs.py
@@ -2253,6 +2253,13 @@ def test_vmas_batch_size(self, scenario_name, num_envs, n_agents):
             max_steps=n_rollout_samples,
             return_contiguous=False if env.het_specs else True,
         )
+        assert (
+            env.full_action_spec_unbatched.shape == env.unbatched_action_spec.shape
+        ), (
+            env.action_spec,
+            env.batch_size,
+        )
+
         env.close()
 
         if env.het_specs: