I am currently using the multi-env framework with the skrl algorithm library. When clip_actions=False, the actions output by skrl sometimes exceed the action space I have defined; when clip_actions=True, the actions are clipped to the boundary values of -1 or 1. Could you please help me clarify this behavior? Thanks a lot! Here is the code:

# Instantiate VecEnvBase and create the task
from vec_env import VecEnvBase
env = VecEnvBase(headless=False)
from goal import T2320Task
task = T2320Task(name="T2320s")
env.set_task(task, backend="torch")
import gym
import numpy as np
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.sac import SAC, SAC_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import Wrapper
# Define the models (stochastic and deterministic models) for the agent using mixins.
# - Policy: takes as input the environment's observation/state and returns an action
# - Value: takes the state as input and provides a value to guide the policy
class Actor(GaussianMixin, Model):
    def __init__(self, observation_space, action_space, device, clip_actions=False,
                 clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum",
                 num_envs=1, num_layers=2, hidden_size=256, sequence_length=10):
        Model.__init__(self, observation_space, action_space, device)
        GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)

        self.net = nn.Sequential(nn.Linear(self.num_observations, 64),
                                 nn.ELU(),
                                 nn.Linear(64, self.num_actions),
                                 nn.Tanh())

        self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))

    def compute(self, inputs, role):
        # note: the network already ends with nn.Tanh(), so the mean is squashed twice here
        return torch.tanh(self.net(inputs["states"])), self.log_std_parameter, {}
class Critic(DeterministicMixin, Model):
    def __init__(self, observation_space, action_space, device, clip_actions=False):
        Model.__init__(self, observation_space, action_space, device)
        DeterministicMixin.__init__(self, clip_actions)

        self.net = nn.Sequential(nn.Linear(self.num_observations, 64),
                                 nn.ELU(),
                                 nn.Linear(64, 1))

    def compute(self, inputs, role):
        return self.net(inputs["states"]), {}
class CustomGymWrapper(Wrapper):
    def __init__(self, env):
        super().__init__(env)
        self._vectorized = True
        self._reset_once = True
        self._obs_tensor = None
        self._info_dict = None

    @property
    def state_space(self):
        return self._env.observation_space

    @property
    def observation_space(self):
        return self._env.observation_space

    @property
    def action_space(self):
        return self._env.action_space

    def _observation_to_tensor(self, observation, space=None):
        observation_space = self._env.observation_space if self._vectorized else self.observation_space
        space = space if space is not None else observation_space
        if isinstance(observation, np.ndarray):
            return torch.tensor(observation, device=self.device, dtype=torch.float32).view(self.num_envs, -1)
        return torch.tensor(observation, device=self.device, dtype=torch.float32).view(self.num_envs, -1)

    def _tensor_to_action(self, actions):
        space = self._env.action_space
        return np.array(actions.cpu().numpy(), dtype=space.dtype).reshape(self.num_envs, -1)

    def step(self, actions):
        observation, reward, terminated, info = self._env.step(self._tensor_to_action(actions))
        # convert the response to torch tensors
        observation = self._observation_to_tensor(observation)
        reward = torch.tensor(reward, device=self.device, dtype=torch.float32).view(self.num_envs, -1)
        terminated = torch.tensor(terminated, device=self.device, dtype=torch.bool).view(self.num_envs, -1)
        truncated = terminated.clone()
        # save observation and info for vectorized envs
        if self._vectorized:
            self._obs_tensor = observation
            self._info_dict = info
        return observation, reward, terminated, truncated, info

    def reset(self):
        if not self._reset_once:
            return self._obs_tensor, self._info_dict
        self._reset_once = False
        return self._observation_to_tensor(self._env.reset()), {}

    def render(self, *args, **kwargs):
        self._env.render(*args, **kwargs)

    def close(self):
        self._env.close()
# Load and wrap the environment
env = CustomGymWrapper(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=100000, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
models_sac = {}
models_sac["policy"] = Actor(env.observation_space, env.action_space, device,clip_actions=True)
models_sac["critic_1"] = Critic(env.observation_space, env.action_space, device)
models_sac["critic_2"] = Critic(env.observation_space, env.action_space, device)
models_sac["target_critic_1"] = Critic(env.observation_space, env.action_space, device)
models_sac["target_critic_2"] = Critic(env.observation_space, env.action_space, device)
# Initialize the models' parameters (weights and biases) using a Gaussian distribution
for model in models_sac.values():
    model.init_parameters(method_name="normal_", mean=0.0, std=0.1)
# Configure and instantiate the agent.
cfg_sac = SAC_DEFAULT_CONFIG.copy()
cfg_sac["gradient_steps"] = 1
cfg_sac["batch_size"] = 1000
cfg_sac["random_timesteps"] = 0
cfg_sac["learning_starts"] = 10
cfg_sac["learn_entropy"] = True
cfg_sac["actor_learning_rate"] = 0.001
cfg_sac["critic_learning_rate"] = 0.001
# logging to TensorBoard and write checkpoints each 1000 and 5000 timesteps respectively
cfg_sac["experiment"]["write_interval"] = 1000
cfg_sac["experiment"]["checkpoint_interval"] = 5000
agent = SAC(models=models_sac,
            memory=memory,
            cfg=cfg_sac,
            observation_space=env.observation_space,
            action_space=env.action_space,
            device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 100000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
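For reference, here is a minimal standalone sketch (plain PyTorch, not skrl; the mean and log_std values are made-up, illustrative numbers) of the behavior I am describing: sampling from a Gaussian whose mean is squashed by tanh can still land outside [-1, 1], while clamping trims it to the boundary values.

import torch

torch.manual_seed(0)
# illustrative policy outputs for a single 2-dimensional action
mean_action = torch.tanh(torch.tensor([0.9, -0.5]))   # the mean itself is bounded to (-1, 1)
log_std = torch.tensor([0.5, 0.5])                    # inside skrl's default [-20, 2] range
std = log_std.exp()
# stochastic action: sample from N(mean_action, std)
raw_action = torch.distributions.Normal(mean_action, std).sample()
# raw_action can exceed [-1, 1]; clamping to the space limits reproduces the clipped case
clipped_action = raw_action.clamp(-1.0, 1.0)
print("raw action:    ", raw_action)
print("clipped action:", clipped_action)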
Hi @HumbleLee

Both SAC and PPO, for example, use a stochastic policy.

In the current implementation (a Gaussian policy), the function approximator (an artificial neural network) returns deterministic values for the mean actions (mean_action) and the natural logarithm of the standard deviation (log_std). Those values are used to parametrize a Gaussian distribution (N) where std = e ^ log_std, as indicated in the concept image.

Then, the stochastic action is sampled as action ~ N(mean_action, std).

The Gaussian mixin provided by skrl clips the log_std (a parameter of the artificial neural network) to the range [-20, 2] by default. Then, the limits of the std are [2.06e-09, 7.389].

If we plot (using the following code) the Gaussian distributions for the minimum and maximum values of both the log_std and the action-space limits, we can see why sampled actions may fall outside the defined action space:

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
def gaussian(interval, log_std, space_limits=[-1, 1]):
    x = np.arange(-interval, interval, 0.001)

    distribution = norm.pdf(x, space_limits[0], np.exp(log_std))
    y = np.max(distribution)
    plt.plot(x, distribution, label=f"normal(mean={space_limits[0]}, std={np.round(np.exp(log_std), 4)})")

    distribution = norm.pdf(x, space_limits[1], np.exp(log_std))
    y = max(np.max(distribution), y)
    plt.plot(x, distribution, label=f"normal(mean={space_limits[1]}, std={np.round(np.exp(log_std), 4)})")

    # plot limits
    plt.plot([space_limits[0], space_limits[0]], [0.0, y], "k--", label="space limits")
    plt.plot([space_limits[1], space_limits[1]], [0.0, y], "k--")

    plt.grid(True)
    plt.legend()
    plt.title(f"log_std: {log_std}, space limits: {space_limits}")
    plt.show()
gaussian(interval=25, log_std=2.0, space_limits=[-1, 1])
gaussian(interval=1.1, log_std=-20.0, space_limits=[-1, 1])

Note that for the maximum log_std (2.0) the standard deviation is e^2 ≈ 7.389, so the sampled actions can fall far outside the [-1, 1] space limits, while for the minimum log_std (-20.0) the samples stay practically at the mean values.

Tips: you can try to reduce the maximum log_std value (the max_log_std argument of the policy), which reduces the maximum std and therefore how far the sampled actions can stray from the mean.
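As a minimal sketch of that tip, assuming the Actor class, env and device defined in the question above (max_log_std=0.0 is only an illustrative value, not a prescribed one):

# assumes the Actor class, env and device from the question's script
models_sac["policy"] = Actor(env.observation_space, env.action_space, device,
                             clip_actions=True,   # clamp sampled actions to the action space (as in the script above)
                             max_log_std=0.0)     # illustrative: caps std at e^0 = 1 instead of e^2 ≈ 7.389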