Merge pull request #171 from cpnota/release/0.6.0
Release/0.6.0
cpnota authored Sep 29, 2020
2 parents 3ec67d5 + 67fcf2c commit 31e5aa9
Showing 90 changed files with 1,309 additions and 710 deletions.
33 changes: 24 additions & 9 deletions .pylintrc
@@ -423,23 +423,38 @@ function-naming-style=snake_case
#function-rgx=

# Good variable names which should always be accepted, separated by a comma.
good-names=i,
good-names=a,
b,
c,
d,
e,
f,
g,
h,
i,
j,
k,
ex,
Run,
l,
m,
n,
o,
p,
q,
r,
s,
t,
u,
v,
_,
w,
x,
y,
z
_,
lr,
n,
t,
e,
u,
kl,
ax
ax,
ex,
Run,

# Include a hint for the correct naming format with invalid-name.
include-naming-hint=no
5 changes: 2 additions & 3 deletions .travis.yml
@@ -1,13 +1,12 @@
 language: python
 python:
-  - "3.6"
+  - "3.7"
 branches:
   only:
   - master
   - develop
 install:
-  - pip install https://download.pytorch.org/whl/cpu/torch-1.0.1.post2-cp36-cp36m-linux_x86_64.whl
-  - pip install torchvision
+  - pip install torch==1.5.1+cpu torchvision==0.6.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
   - pip install -q -e .["dev"]
 script:
   - make lint
3 changes: 3 additions & 0 deletions all/__init__.py
@@ -1 +1,4 @@
 import all.nn
+from all.core import State, StateArray
+
+__all__ = ['nn', 'State', 'StateArray']
6 changes: 2 additions & 4 deletions all/agents/_agent.py
@@ -13,7 +13,7 @@ class Agent(ABC, Schedulable):
     """
 
     @abstractmethod
-    def act(self, state, reward):
+    def act(self, state):
         """
         Select an action for the current timestep and update internal parameters.
@@ -27,14 +27,14 @@ def act(self, state, reward):
         Args:
             state (all.environment.State): The environment state at the current timestep.
-            reward (torch.Tensor): The reward from the previous timestep.
         Returns:
             torch.Tensor: The action to take at the current timestep.
         """
 
     @abstractmethod
-    def eval(self, state, reward):
+    def eval(self, state):
         """
         Select an action for the current timestep in evaluation mode.
@@ -45,7 +44,6 @@ def eval(self, state, reward):
         Args:
             state (all.environment.State): The environment state at the current timestep.
-            reward (torch.Tensor): The reward from the previous timestep.
         Returns:
             torch.Tensor: The action to take at the current timestep.
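The interface change above moves the reward out of act() and eval(): agents now receive a single state object and read the previous reward from it. A minimal sketch of driving an agent written against the new signature, using hypothetical stand-in classes rather than the library's own State and Agent types:

from dataclasses import dataclass

# Stand-in for all.core.State (hypothetical; only the fields this sketch reads).
@dataclass
class FakeState:
    observation: int
    reward: float
    done: bool

class ConstantAgent:
    # Written against the new single-argument interface.
    def act(self, state):
        # The reward earned by the previous action arrives as state.reward.
        return 0

    def eval(self, state):
        # Evaluation mode: act greedily, no parameter updates.
        return 0

agent = ConstantAgent()
state = FakeState(observation=0, reward=0.0, done=False)
action = agent.act(state)   # no separate reward argument anymore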
6 changes: 3 additions & 3 deletions all/agents/a2c.py
@@ -53,14 +53,14 @@ def __init__(
         self._batch_size = n_envs * n_steps
         self._buffer = self._make_buffer()
 
-    def act(self, states, rewards):
-        self._buffer.store(self._states, self._actions, rewards)
+    def act(self, states):
+        self._buffer.store(self._states, self._actions, states.reward)
         self._train(states)
         self._states = states
         self._actions = self.policy.no_grad(self.features.no_grad(states)).sample()
         return self._actions
 
-    def eval(self, states, _):
+    def eval(self, states):
         return self.policy.eval(self.features.eval(states))
 
     def _train(self, next_states):
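A2C's act() now pulls the reward off the incoming states: the reward produced by the actions chosen at step t is delivered attached to the states observed at step t + 1, so the buffer is filled with the previous state/action pair plus states.reward. A rough, self-contained sketch of that bookkeeping with toy classes (hypothetical names, not the library's buffer or StateArray):

import torch
from types import SimpleNamespace

class ToyBuffer:
    # Records (previous states, previous actions, reward) triples.
    def __init__(self):
        self.transitions = []

    def store(self, states, actions, rewards):
        if states is not None:          # nothing to store on the very first call
            self.transitions.append((states, actions, rewards))

class ToyActor:
    def __init__(self, n_envs):
        self.n_envs = n_envs
        self._buffer = ToyBuffer()
        self._states = None
        self._actions = None

    def act(self, states):
        # Rewards for the previous actions ride in on the new states.
        self._buffer.store(self._states, self._actions, states.reward)
        self._states = states
        self._actions = torch.zeros(self.n_envs, dtype=torch.long)   # dummy policy
        return self._actions

actor = ToyActor(n_envs=2)
step0 = SimpleNamespace(observation=torch.zeros(2, 4), reward=torch.zeros(2))
step1 = SimpleNamespace(observation=torch.ones(2, 4), reward=torch.tensor([1.0, 0.5]))
actor.act(step0)                        # first call: nothing stored yet
actor.act(step1)                        # stores (step0, actions, step1.reward)
print(len(actor._buffer.transitions))   # 1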
18 changes: 8 additions & 10 deletions all/agents/c51.py
@@ -53,22 +53,20 @@ def __init__(
         self._action = None
         self._frames_seen = 0
 
-    def act(self, state, reward):
-        self.replay_buffer.store(self._state, self._action, reward, state)
+    def act(self, state):
+        self.replay_buffer.store(self._state, self._action, state)
         self._train()
         self._state = state
         self._action = self._choose_action(state)
         return self._action
 
-    def eval(self, state, _):
-        return self._best_actions(self.q_dist.eval(state))
+    def eval(self, state):
+        return self._best_actions(self.q_dist.eval(state)).item()
 
     def _choose_action(self, state):
         if self._should_explore():
-            return torch.randint(
-                self.q_dist.n_actions, (len(state),), device=self.q_dist.device
-            )
-        return self._best_actions(self.q_dist.no_grad(state))
+            return np.random.randint(0, self.q_dist.n_actions)
+        return self._best_actions(self.q_dist.no_grad(state)).item()
 
     def _should_explore(self):
         return (
@@ -77,8 +75,8 @@ def _should_explore(self):
         )
 
     def _best_actions(self, probs):
-        q_values = (probs * self.q_dist.atoms).sum(dim=2)
-        return torch.argmax(q_values, dim=1)
+        q_values = (probs * self.q_dist.atoms).sum(dim=-1)
+        return torch.argmax(q_values, dim=-1)
 
     def _train(self):
         if self._should_train():
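The updated _best_actions computes expected action values from a categorical (C51-style) return distribution before taking an argmax over the last dimension. A small self-contained illustration of that computation with toy numbers (not the library's q_dist object):

import torch

atoms = torch.tensor([-1.0, 0.0, 1.0])          # fixed support of the return distribution
probs = torch.tensor([
    [0.8, 0.1, 0.1],                             # action 0: most mass near -1
    [0.1, 0.1, 0.8],                             # action 1: most mass near +1
])
q_values = (probs * atoms).sum(dim=-1)           # expected return per action: [-0.7, 0.7]
greedy = torch.argmax(q_values, dim=-1).item()   # 1, returned as a plain int via .item()
print(q_values, greedy)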
6 changes: 3 additions & 3 deletions all/agents/ddpg.py
@@ -54,14 +54,14 @@ def __init__(self,
         self._action = None
         self._frames_seen = 0
 
-    def act(self, state, reward):
-        self.replay_buffer.store(self._state, self._action, reward, state)
+    def act(self, state):
+        self.replay_buffer.store(self._state, self._action, state)
         self._train()
         self._state = state
         self._action = self._choose_action(state)
         return self._action
 
-    def eval(self, state, _):
+    def eval(self, state):
         return self.policy.eval(state)
 
     def _choose_action(self, state):
8 changes: 4 additions & 4 deletions all/agents/ddqn.py
@@ -38,7 +38,7 @@ def __init__(self,
         self.q = q
         self.policy = policy
         self.replay_buffer = replay_buffer
-        self.loss = staticmethod(loss)
+        self.loss = loss
         # hyperparameters
         self.replay_start_size = replay_start_size
         self.update_frequency = update_frequency
@@ -49,14 +49,14 @@ def __init__(self,
         self._action = None
         self._frames_seen = 0
 
-    def act(self, state, reward):
-        self.replay_buffer.store(self._state, self._action, reward, state)
+    def act(self, state):
+        self.replay_buffer.store(self._state, self._action, state)
         self._train()
         self._state = state
         self._action = self.policy.no_grad(state)
         return self._action
 
-    def eval(self, state, _):
+    def eval(self, state):
         return self.policy.eval(state)
 
     def _train(self):
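Dropping the staticmethod wrapper here relies on general Python behaviour rather than anything library-specific: a plain function stored in an instance's __dict__ is returned as-is on attribute lookup, so it is never bound to self, while a staticmethod object held on an instance is not even callable before Python 3.10. A minimal sketch with a hypothetical class (not the library's DDQN):

import torch
from torch.nn.functional import mse_loss, smooth_l1_loss

class Learner:
    def __init__(self, loss=mse_loss):
        self.loss = loss                      # plain function; no self is prepended

learner = Learner(loss=smooth_l1_loss)
pred, target = torch.zeros(3), torch.ones(3)
print(learner.loss(pred, target))             # calls smooth_l1_loss(pred, target)

# By contrast, self.loss = staticmethod(loss) stores a staticmethod object in the
# instance dict; before Python 3.10 that object is not callable, so
# learner.loss(pred, target) would raise TypeError there.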
8 changes: 4 additions & 4 deletions all/agents/dqn.py
@@ -39,7 +39,7 @@ def __init__(self,
         self.q = q
         self.policy = policy
         self.replay_buffer = replay_buffer
-        self.loss = staticmethod(loss)
+        self.loss = loss
         # hyperparameters
         self.discount_factor = discount_factor
         self.minibatch_size = minibatch_size
@@ -50,14 +50,14 @@ def __init__(self,
         self._action = None
         self._frames_seen = 0
 
-    def act(self, state, reward):
-        self.replay_buffer.store(self._state, self._action, reward, state)
+    def act(self, state):
+        self.replay_buffer.store(self._state, self._action, state)
         self._train()
         self._state = state
         self._action = self.policy.no_grad(state)
         return self._action
 
-    def eval(self, state, _):
+    def eval(self, state):
         return self.policy.eval(state)
 
     def _train(self):
6 changes: 3 additions & 3 deletions all/agents/ppo.py
@@ -63,14 +63,14 @@ def __init__(
         self._batch_size = n_envs * n_steps
         self._buffer = self._make_buffer()
 
-    def act(self, states, rewards):
-        self._buffer.store(self._states, self._actions, rewards)
+    def act(self, states):
+        self._buffer.store(self._states, self._actions, states.reward)
         self._train(states)
         self._states = states
         self._actions = self.policy.no_grad(self.features.no_grad(states)).sample()
         return self._actions
 
-    def eval(self, states, _):
+    def eval(self, states):
         return self.policy.eval(self.features.eval(states))
 
     def _train(self, next_states):
8 changes: 4 additions & 4 deletions all/agents/sac.py
@@ -63,15 +63,15 @@ def __init__(self,
         self._action = None
         self._frames_seen = 0
 
-    def act(self, state, reward):
-        self.replay_buffer.store(self._state, self._action, reward, state)
+    def act(self, state):
+        self.replay_buffer.store(self._state, self._action, state)
         self._train()
         self._state = state
         self._action = self.policy.no_grad(state)[0]
         return self._action
 
-    def eval(self, state, _):
-        return self.policy.eval(state)[0]
+    def eval(self, state):
+        return self.policy.eval(state)
 
     def _train(self):
         if self._should_train():
6 changes: 3 additions & 3 deletions all/agents/vac.py
@@ -28,14 +28,14 @@ def __init__(self, features, v, policy, discount_factor=1):
         self._distribution = None
         self._action = None
 
-    def act(self, state, reward):
-        self._train(state, reward)
+    def act(self, state):
+        self._train(state, state.reward)
         self._features = self.features(state)
         self._distribution = self.policy(self._features)
         self._action = self._distribution.sample()
         return self._action
 
-    def eval(self, state, _):
+    def eval(self, state):
         return self.policy.eval(self.features.eval(state))
 
     def _train(self, state, reward):
20 changes: 10 additions & 10 deletions all/agents/vpg.py
@@ -1,6 +1,6 @@
 import torch
 from torch.nn.functional import mse_loss
-from all.environments import State
+from all.core import State
 from ._agent import Agent
 
 class VPG(Agent):
@@ -43,38 +43,38 @@ def __init__(
         self._log_pis = []
         self._rewards = []
 
-    def act(self, state, reward):
+    def act(self, state):
         if not self._features:
             return self._initial(state)
         if not state.done:
-            return self._act(state, reward)
-        return self._terminal(state, reward)
+            return self._act(state, state.reward)
+        return self._terminal(state, state.reward)
 
-    def eval(self, state, _):
+    def eval(self, state):
         return self.policy.eval(self.features.eval(state))
 
     def _initial(self, state):
         features = self.features(state)
         distribution = self.policy(features)
         action = distribution.sample()
-        self._features = [features.features]
+        self._features = [features]
         self._log_pis.append(distribution.log_prob(action))
         return action
 
     def _act(self, state, reward):
         features = self.features(state)
         distribution = self.policy(features)
         action = distribution.sample()
-        self._features.append(features.features)
+        self._features.append(features)
         self._rewards.append(reward)
         self._log_pis.append(distribution.log_prob(action))
         return action
 
     def _terminal(self, state, reward):
         self._rewards.append(reward)
-        features = torch.cat(self._features)
+        features = State.array(self._features)
         rewards = torch.tensor(self._rewards, device=features.device)
-        log_pis = torch.cat(self._log_pis)
+        log_pis = torch.stack(self._log_pis)
         self._trajectories.append((features, rewards, log_pis))
         self._current_batch_size += len(features)
         self._features = []
@@ -90,7 +90,7 @@ def _terminal(self, state, reward):
     def _train(self):
         # forward pass
         values = torch.cat([
-            self.v(State(features))
+            self.v(features)
             for (features, _, _)
             in self._trajectories
         ])
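VPG now keeps per-step log-probabilities as a list of 0-d tensors and stacks them at episode end, while features are collected into a State array. As a rough illustration of the episode bookkeeping that feeds the policy-gradient loss, here is a toy REINFORCE-style computation with made-up numbers (assumed discounting; not the library's _train()):

import torch

log_pis = [torch.tensor(-0.1), torch.tensor(-0.5), torch.tensor(-0.2)]
rewards = torch.tensor([1.0, 0.0, 1.0])
discount = 0.99

# Discounted returns G_t, computed backwards over the episode.
returns = torch.zeros_like(rewards)
g = 0.0
for t in reversed(range(len(rewards))):
    g = rewards[t] + discount * g
    returns[t] = g

log_pis = torch.stack(log_pis)            # shape (T,), as in _terminal() above
policy_loss = -(log_pis * returns).sum()  # REINFORCE objective: -sum_t log pi(a_t|s_t) * G_t
print(returns, policy_loss)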
6 changes: 3 additions & 3 deletions all/agents/vqn.py
@@ -25,14 +25,14 @@ def __init__(self, q, policy, discount_factor=0.99):
         self._state = None
         self._action = None
 
-    def act(self, state, reward):
-        self._train(reward, state)
+    def act(self, state):
+        self._train(state.reward, state)
         action = self.policy.no_grad(state)
         self._state = state
         self._action = action
         return action
 
-    def eval(self, state, _):
+    def eval(self, state):
         return self.policy.eval(state)
 
     def _train(self, reward, next_state):
6 changes: 3 additions & 3 deletions all/agents/vsarsa.py
@@ -22,14 +22,14 @@ def __init__(self, q, policy, discount_factor=0.99):
         self._state = None
         self._action = None
 
-    def act(self, state, reward):
+    def act(self, state):
         action = self.policy.no_grad(state)
-        self._train(reward, state, action)
+        self._train(state.reward, state, action)
         self._state = state
         self._action = action
         return action
 
-    def eval(self, state, _):
+    def eval(self, state):
         return self.policy.eval(state)
 
     def _train(self, reward, next_state, next_action):
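With the new signature, the reward attached to the incoming state belongs to the previous (state, action) pair, which is exactly what the Sarsa target pairs with the next state-action value. A toy illustration of that target (plain tensors; not the library's Q-function API):

import torch

def sarsa_target(reward, next_q, discount=0.99, done=False):
    # r_t + gamma * Q(s_{t+1}, a_{t+1}), with bootstrapping cut off at episode end.
    return reward + (0.0 if done else discount * next_q)

prev_q = torch.tensor(0.5)        # Q(s_t, a_t) under the current network
next_q = torch.tensor(1.0)        # Q(s_{t+1}, a_{t+1}) for the action chosen next
target = sarsa_target(reward=1.0, next_q=next_q)
td_error = target - prev_q        # this error drives the Q-function update
print(target, td_error)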