I am currently using the multi-env framework with the skrl algorithm library. When clip_actions=False, the actions output by skrl sometimes exceed the action space I have defined; when clip_actions=True, the actions are clipped to the boundary values of -1 or 1. Could you please help me clarify this behavior? Thanks a lot! Here is the code:

# Instantiate VecEnvBase and create the task
from vec_env import VecEnvBase
env = VecEnvBase(headless=False)
from goal import T2320Task
task = T2320Task(name="T2320s")
env.set_task(task, backend="torch")
import gym
import numpy as np
import torch
import torch.nn as nn
# Import the skrl components to build the RL system
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
from skrl.memories.torch import RandomMemory
from skrl.agents.torch.sac import SAC, SAC_DEFAULT_CONFIG
from skrl.resources.schedulers.torch import KLAdaptiveRL
from skrl.trainers.torch import SequentialTrainer
from skrl.envs.torch import Wrapper
# Define the models (stochastic and deterministic models) for the agent using mixins.
# - Policy: takes as input the environment's observation/state and returns an action
# - Value: takes the state as input and provides a value to guide the policy
class Actor(GaussianMixin, Model):
    def __init__(self, observation_space, action_space, device, clip_actions=False,
                 clip_log_std=True, min_log_std=-20, max_log_std=2, reduction="sum",
                 num_envs=1, num_layers=2, hidden_size=256, sequence_length=10):
        Model.__init__(self, observation_space, action_space, device)
        GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std, reduction)

        self.net = nn.Sequential(nn.Linear(self.num_observations, 64),
                                 nn.ELU(),
                                 nn.Linear(64, self.num_actions),
                                 nn.Tanh())

        self.log_std_parameter = nn.Parameter(torch.zeros(self.num_actions))

    def compute(self, inputs, role):
        # note: the network already ends with nn.Tanh(), so the mean is squashed twice here
        return torch.tanh(self.net(inputs["states"])), self.log_std_parameter, {}
class Critic(DeterministicMixin, Model):
    def __init__(self, observation_space, action_space, device, clip_actions=False):
        Model.__init__(self, observation_space, action_space, device)
        DeterministicMixin.__init__(self, clip_actions)

        self.net = nn.Sequential(nn.Linear(self.num_observations, 64),
                                 nn.ELU(),
                                 nn.Linear(64, 1))

    def compute(self, inputs, role):
        return self.net(inputs["states"]), {}
class CustomGymWrapper(Wrapper):
    def __init__(self, env):
        super().__init__(env)
        self._vectorized = True
        self._reset_once = True
        self._obs_tensor = None
        self._info_dict = None

    @property
    def state_space(self):
        return self._env.observation_space

    @property
    def observation_space(self):
        return self._env.observation_space

    @property
    def action_space(self):
        return self._env.action_space

    def _observation_to_tensor(self, observation, space=None):
        observation_space = self._env.observation_space if self._vectorized else self.observation_space
        space = space if space is not None else observation_space
        if isinstance(observation, np.ndarray):
            return torch.tensor(observation, device=self.device, dtype=torch.float32).view(self.num_envs, -1)
        return torch.tensor(observation, device=self.device, dtype=torch.float32).view(self.num_envs, -1)

    def _tensor_to_action(self, actions):
        space = self._env.action_space
        return np.array(actions.cpu().numpy(), dtype=space.dtype).reshape(self.num_envs, -1)

    def step(self, actions):
        observation, reward, terminated, info = self._env.step(self._tensor_to_action(actions))
        # convert the response to torch tensors
        observation = self._observation_to_tensor(observation)
        reward = torch.tensor(reward, device=self.device, dtype=torch.float32).view(self.num_envs, -1)
        terminated = torch.tensor(terminated, device=self.device, dtype=torch.bool).view(self.num_envs, -1)
        truncated = terminated.clone()
        # save observation and info for vectorized envs
        if self._vectorized:
            self._obs_tensor = observation
            self._info_dict = info
        return observation, reward, terminated, truncated, info

    def reset(self):
        if not self._reset_once:
            return self._obs_tensor, self._info_dict
        self._reset_once = False
        return self._observation_to_tensor(self._env.reset()), {}

    def render(self, *args, **kwargs):
        self._env.render(*args, **kwargs)

    def close(self):
        self._env.close()
# Load and wrap the environment
env = CustomGymWrapper(env)
device = env.device
# Instantiate a RandomMemory as rollout buffer (any memory can be used for this)
memory = RandomMemory(memory_size=100000, num_envs=env.num_envs, device=device)
# Instantiate the agent's models (function approximators).
models_sac = {}
models_sac["policy"] = Actor(env.observation_space, env.action_space, device,clip_actions=True)
models_sac["critic_1"] = Critic(env.observation_space, env.action_space, device)
models_sac["critic_2"] = Critic(env.observation_space, env.action_space, device)
models_sac["target_critic_1"] = Critic(env.observation_space, env.action_space, device)
models_sac["target_critic_2"] = Critic(env.observation_space, env.action_space, device)
# Initialize the models' parameters (weights and biases) using a Gaussian distribution
for model in models_sac.values():
    model.init_parameters(method_name="normal_", mean=0.0, std=0.1)
# Configure and instantiate the agent.
cfg_sac = SAC_DEFAULT_CONFIG.copy()
cfg_sac["gradient_steps"] = 1
cfg_sac["batch_size"] = 1000
cfg_sac["random_timesteps"] = 0
cfg_sac["learning_starts"] = 10
cfg_sac["learn_entropy"] = True
cfg_sac["actor_learning_rate"] = 0.001
cfg_sac["critic_learning_rate"] = 0.001
# logging to TensorBoard and write checkpoints each 1000 and 5000 timesteps respectively
cfg_sac["experiment"]["write_interval"] = 1000
cfg_sac["experiment"]["checkpoint_interval"] = 5000
agent = SAC(models=models_sac,
            memory=memory,
            cfg=cfg_sac,
            observation_space=env.observation_space,
            action_space=env.action_space,
            device=device)
# Configure and instantiate the RL trainer
cfg_trainer = {"timesteps": 100000, "headless": True}
trainer = SequentialTrainer(cfg=cfg_trainer, env=env, agents=agent)
# start training
trainer.train()
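For reference, here is a minimal standalone sketch (plain PyTorch, not skrl; the mean and log_std values are made-up, illustrative numbers) of the behavior I am describing: sampling from a Gaussian whose mean is squashed by tanh can still land outside [-1, 1], while clamping trims it to the boundary values.

import torch

torch.manual_seed(0)
# illustrative policy outputs for a single 2-dimensional action
mean_action = torch.tanh(torch.tensor([0.9, -0.5]))   # the mean itself is bounded to (-1, 1)
log_std = torch.tensor([0.5, 0.5])                    # inside skrl's default [-20, 2] range
std = log_std.exp()
# stochastic action: sample from N(mean_action, std)
raw_action = torch.distributions.Normal(mean_action, std).sample()
# raw_action can exceed [-1, 1]; clamping to the space limits reproduces the clipped case
clipped_action = raw_action.clamp(-1.0, 1.0)
print("raw action:    ", raw_action)
print("clipped action:", clipped_action)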
Hi @HumbleLee

Both SAC and PPO, for example, use a stochastic policy.

In the current implementation (a Gaussian policy), the function approximator (an artificial neural network) returns deterministic values for the mean actions (mean_action) and the natural logarithm of the standard deviation (log_std). Those values are used to parametrize a Gaussian distribution (N) where std = e ^ log_std, as indicated in the concept image.

Then, the stochastic action is sampled as action ~ N(mean_action, std).

The Gaussian mixin provided by skrl clips the log_std (a parameter of the artificial neural network) to the range [-20, 2] by default. Then, the limits of the std are [2.06e-09, 7.389].

If we plot (using the following code) the Gaussian distributions for the minimum and maximum values of both the log_std and the action-space limits, we can see why sampled actions may fall outside the defined action space:

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
def gaussian(interval, log_std, space_limits=[-1, 1]):
    x = np.arange(-interval, interval, 0.001)

    distribution = norm.pdf(x, space_limits[0], np.exp(log_std))
    y = np.max(distribution)
    plt.plot(x, distribution, label=f"normal(mean={space_limits[0]}, std={np.round(np.exp(log_std), 4)})")

    distribution = norm.pdf(x, space_limits[1], np.exp(log_std))
    y = max(np.max(distribution), y)
    plt.plot(x, distribution, label=f"normal(mean={space_limits[1]}, std={np.round(np.exp(log_std), 4)})")

    # plot limits
    plt.plot([space_limits[0], space_limits[0]], [0.0, y], "k--", label="space limits")
    plt.plot([space_limits[1], space_limits[1]], [0.0, y], "k--")

    plt.grid(True)
    plt.legend()
    plt.title(f"log_std: {log_std}, space limits: {space_limits}")
    plt.show()
gaussian(interval=25, log_std=2.0, space_limits=[-1, 1])
gaussian(interval=1.1, log_std=-20.0, space_limits=[-1, 1])

Note that for the maximum log_std (2.0) the standard deviation is e^2 ≈ 7.389, so the sampled actions can fall far outside the [-1, 1] space limits, while for the minimum log_std (-20.0) the samples stay practically at the mean values.

Tips: you can try to reduce the maximum log_std value (the max_log_std argument of the policy), which reduces the maximum std and therefore how far the sampled actions can stray from the mean.
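As a minimal sketch of that tip, assuming the Actor class, env and device defined in the question above (max_log_std=0.0 is only an illustrative value, not a prescribed one):

# assumes the Actor class, env and device from the question's script
models_sac["policy"] = Actor(env.observation_space, env.action_space, device,
                             clip_actions=True,   # clamp sampled actions to the action space (as in the script above)
                             max_log_std=0.0)     # illustrative: caps std at e^0 = 1 instead of e^2 ≈ 7.389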