RL Bridge -- added baselines and corrected GridRowDoor and DoorKnob environments #1705

Open · wants to merge 9 commits into base: master
252 changes: 199 additions & 53 deletions predicators/approaches/bridge_policy_approach.py

Large diffs are not rendered by default.

100 changes: 92 additions & 8 deletions predicators/approaches/maple_q_approach.py
@@ -16,7 +16,7 @@
from predicators.approaches.online_nsrt_learning_approach import \
OnlineNSRTLearningApproach
from predicators.explorers import BaseExplorer, create_explorer
-from predicators.ml_models import MapleQFunction
+from predicators.ml_models import MapleQFunction, MPDQNFunction
from predicators.settings import CFG
from predicators.structs import Action, GroundAtom, InteractionRequest, \
LowLevelTrajectory, ParameterizedOption, Predicate, State, Task, Type, \
@@ -82,7 +82,10 @@ def _create_explorer(self) -> BaseExplorer:
"""Create a new explorer at the beginning of each interaction cycle."""
# Geometrically increase the length of exploration.
b = CFG.active_sampler_learning_explore_length_base
-max_steps = b**(1 + self._online_learning_cycle)
+if CFG.use_old_exploration:
+    max_steps = b**(1 + self._online_learning_cycle)
+else:
+    max_steps = CFG.max_num_steps_interaction_request
preds = self._get_current_predicates()
assert CFG.explorer == "maple_q"
explorer = create_explorer(CFG.explorer,
@@ -110,8 +113,11 @@ def load(self, online_learning_cycle: Optional[int]) -> None:

def _learn_nsrts(self, trajectories: List[LowLevelTrajectory],
online_learning_cycle: Optional[int],
-annotations: Optional[List[Any]]) -> None:
+annotations: Optional[List[Any]], reward_bonuses: \
+Optional[List[float]] = None) -> None:
# Start by learning NSRTs in the usual way.
+if reward_bonuses is None:
+    reward_bonuses = []
super()._learn_nsrts(trajectories, online_learning_cycle, annotations)
if CFG.approach == "active_sampler_learning":
# Check the assumption that operators and options are 1:1.
@@ -124,9 +130,8 @@ def _learn_nsrts(self, trajectories: List[LowLevelTrajectory],
assert nsrt.option_vars == nsrt.parameters # pragma: no cover.
# On the first cycle, we need to register the ground NSRTs, goals, and
# objects in the Q function so that it can define its inputs.
-# Do not set grounding for rl_bridge_policy since it was set already
-# in init_nsrts
-if not online_learning_cycle and CFG.approach != "rl_bridge_policy":
+if not online_learning_cycle and CFG.approach != \
+"rl_bridge_policy" and CFG.approach != "rl_first_bridge":
all_ground_nsrts: Set[_GroundNSRT] = set()
if CFG.sesame_grounder == "naive":
for nsrt in self._nsrts:
@@ -156,7 +161,10 @@ def _learn_nsrts(self, trajectories: List[LowLevelTrajectory],
self._q_function.set_grounding(all_objects, goals,
all_ground_nsrts)
# Update the data using the updated self._segmented_trajs.
-self._update_maple_data()
+if isinstance(self, MPDQNApproach):
+    MPDQNApproach._update_maple_data(self, reward_bonuses) # pylint: disable=protected-access
+else:
+    self._update_maple_data()
# Re-learn Q function.
self._q_function.train_q_function()
# Save the things we need other than the NSRTs, which were already
@@ -182,13 +190,19 @@ def _update_maple_data(self) -> None:

for traj_i, segmented_traj in enumerate(new_trajs):
self._last_seen_segment_traj_idx += 1
+already_terminal = False
for seg_i, segment in enumerate(segmented_traj):
s = segment.states[0]
goal = new_traj_goals[traj_i]
o = segment.get_option()
ns = segment.states[-1]
-reward = 1.0 if goal.issubset(segment.final_atoms) else 0.0
+if goal.issubset(segment.final_atoms) and not already_terminal:
+    reward = 1.0
+else:
+    reward = 0.0
terminal = reward > 0 or seg_i == len(segmented_traj) - 1
+if terminal:
+    already_terminal = terminal
self._q_function.add_datum_to_replay_buffer(
(s, goal, o, ns, reward, terminal))
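
The hunk above rewards only the first goal-reaching segment of each trajectory: once a terminal transition has been recorded, later segments get zero reward even if the goal atoms still hold. A minimal, self-contained sketch of that bookkeeping, with a hypothetical segment_rewards helper standing in for the loop body (illustrative only, not the repository code):

```python
# Illustrative sketch, not the repository code: how the new already_terminal
# flag limits the +1 goal reward to the first goal-reaching segment.
from typing import List, Tuple


def segment_rewards(goal_holds: List[bool]) -> List[Tuple[float, bool]]:
    """goal_holds[i] is True iff the goal atoms hold after segment i."""
    data = []
    already_terminal = False
    for seg_i, holds in enumerate(goal_holds):
        reward = 1.0 if holds and not already_terminal else 0.0
        terminal = reward > 0 or seg_i == len(goal_holds) - 1
        if terminal:
            already_terminal = terminal
        data.append((reward, terminal))
    return data


# The goal holds from segment 2 onward, but only segment 2 is rewarded.
assert segment_rewards([False, False, True, True]) == [
    (0.0, False), (0.0, False), (1.0, True), (0.0, True)]
```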

@@ -200,3 +214,73 @@ def get_interaction_requests(self) -> List[InteractionRequest]:
goal = self._train_tasks[request.train_task_idx].goal
self._interaction_goals.append(goal)
return requests


class MPDQNApproach(MapleQApproach):
"""DQN with target function."""
def __init__(self, initial_predicates: Set[Predicate],
initial_options: Set[ParameterizedOption], types: Set[Type],
action_space: Box, train_tasks: List[Task],
CallPlanner: Optional[\
utils.SingletonParameterizedOption]) -> None:
super().__init__(initial_predicates, initial_options, types,
action_space, train_tasks)

# The current implementation assumes that NSRTs are not changing.
assert CFG.strips_learner == "oracle"
# The base sampler should also be unchanging and from the oracle.
assert CFG.sampler_learner == "oracle"
self.CallPlanner = CallPlanner

# Log all transition data.
self._interaction_goals: List[Set[GroundAtom]] = []
self._last_seen_segment_traj_idx = -1

# Store the Q function. Note that this implicitly
# contains a replay buffer.
self._q_function = MPDQNFunction(
seed=CFG.seed,
hid_sizes=CFG.mlp_regressor_hid_sizes,
max_train_iters=CFG.mlp_regressor_max_itr,
clip_gradients=CFG.mlp_regressor_clip_gradients,
clip_value=CFG.mlp_regressor_gradient_clip_value,
learning_rate=CFG.learning_rate,
weight_decay=CFG.weight_decay,
use_torch_gpu=CFG.use_torch_gpu,
train_print_every=CFG.pytorch_train_print_every,
n_iter_no_change=CFG.active_sampler_learning_n_iter_no_change,
num_lookahead_samples=CFG.
active_sampler_learning_num_lookahead_samples)

def _update_maple_data(self, reward_bonuses: \
Optional[List[float]] = None) -> None:
if reward_bonuses is None:
reward_bonuses = []
start_idx = self._last_seen_segment_traj_idx + 1
new_trajs = self._segmented_trajs[start_idx:]

goal_offset = CFG.max_initial_demos
assert len(self._segmented_trajs) == goal_offset + \
len(self._interaction_goals)
new_traj_goals = self._interaction_goals[goal_offset + start_idx:]

for traj_i, segmented_traj in enumerate(new_trajs):
self._last_seen_segment_traj_idx += 1
for seg_i, segment in enumerate(segmented_traj):
s = segment.states[0]
goal = new_traj_goals[traj_i]
o = segment.get_option()
ns = segment.states[-1]
reward = 1.0 if goal.issubset(\
segment.final_atoms) else 0.0
if CFG.use_callplanner and o.parent == \
self.CallPlanner and reward == 1:
reward += 0.5

if CFG.rl_rwd_shape:
reward += reward_bonuses[0]
reward_bonuses.pop(0)

terminal = reward > 0 or seg_i == len(segmented_traj) - 1
self._q_function.add_datum_to_replay_buffer(
(s, goal, o, ns, reward, terminal))
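
MPDQNApproach._update_maple_data assembles each segment's reward from three parts: the base goal reward, an extra +0.5 when CFG.use_callplanner is set and the goal-reaching option came from CallPlanner, and one bonus popped from the reward_bonuses queue when CFG.rl_rwd_shape is enabled. Below is a hedged, self-contained sketch of that assembly; the shaped_reward helper and its boolean arguments are illustrative stand-ins for the CFG flags and the option-parent check, not the repository API.

```python
# Illustrative sketch (assumed standalone helper, not the repository API):
# per-segment reward assembly mirroring MPDQNApproach._update_maple_data.
from typing import List, Optional


def shaped_reward(goal_holds: bool,
                  option_is_call_planner: bool,
                  use_callplanner: bool,
                  use_reward_shaping: bool,
                  reward_bonuses: Optional[List[float]] = None) -> float:
    reward = 1.0 if goal_holds else 0.0
    # Bonus for reaching the goal via the CallPlanner option.
    if use_callplanner and option_is_call_planner and reward == 1:
        reward += 0.5
    # Consume one queued shaping bonus per segment, in order.
    if use_reward_shaping:
        assert reward_bonuses, "expected one queued bonus per segment"
        reward += reward_bonuses.pop(0)
    return reward


bonuses = [0.5, 0.0]
assert shaped_reward(True, True, True, True, bonuses) == 2.0
assert shaped_reward(False, False, True, True, bonuses) == 0.0
```

As in the class's own loop, no already_terminal flag is threaded through here, so this method rewards every goal-holding segment rather than only the first one.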