diff --git a/.gitignore b/.gitignore
index 49c6adc85..a0b2f4717 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+data/
 */*/mjkey.txt
 **/.DS_STORE
 **/*.pyc
diff --git a/docs/HER.md b/docs/HER.md
index 4abea5bdf..b4dfb9c24 100644
--- a/docs/HER.md
+++ b/docs/HER.md
@@ -2,9 +2,12 @@
 Some notes on the implementation of
 [Hindsight Experience Replay](https://arxiv.org/abs/1707.01495).

 ## Expected Results
-If you run the [Fetch example](examples/her/her_td3_gym_fetch_reach.py), then
+If you run the [Fetch reach example](examples/her/her_td3_gym_fetch_reach.py), then
 you should get results like this:
- ![Fetch HER results](images/FetchReach-v1_HER-TD3.png)
+ ![Fetch HER Reach results](images/FetchReach-v1_HER-TD3.png)
+
+If you run the [Fetch pick and place example](examples/her/her_td3_gym_fetch_pnp.py), then you should get results like this: ![Fetch HER PNP results](images/FetchPickAndPlace-v1_HER-TD3.png)
+
 If you run the [Sawyer example](examples/her/her_td3_multiworld_sawyer_reach.py)
 , then you should get results like this:
diff --git a/docs/images/FetchPickAndPlace-v1_HER-TD3.png b/docs/images/FetchPickAndPlace-v1_HER-TD3.png
new file mode 100644
index 000000000..dbe8657db
Binary files /dev/null and b/docs/images/FetchPickAndPlace-v1_HER-TD3.png differ
diff --git a/examples/her/her_td3_gym_fetch_pnp.py b/examples/her/her_td3_gym_fetch_pnp.py
new file mode 100644
index 000000000..f60cef3f8
--- /dev/null
+++ b/examples/her/her_td3_gym_fetch_pnp.py
@@ -0,0 +1,149 @@
+import gym
+
+import rlkit.torch.pytorch_util as ptu
+from rlkit.exploration_strategies.base import (
+    PolicyWrappedWithExplorationStrategy
+)
+from rlkit.exploration_strategies.gaussian_and_epsilon_strategy import (
+    GaussianAndEpsilonStrategy
+)
+from rlkit.torch.her.her import HerTd3
+import rlkit.samplers.rollout_functions as rf
+
+
+from rlkit.torch.networks import FlattenMlp, MlpPolicy, QNormalizedFlattenMlp, CompositeNormalizedMlpPolicy
+from rlkit.torch.data_management.normalizer import CompositeNormalizer
+
+
+def experiment(variant):
+    try:
+        import robotics_recorder
+    except ImportError as e:
+        print(e)
+
+    env = gym.make(variant['env_id'])
+    es = GaussianAndEpsilonStrategy(
+        action_space=env.action_space,
+        max_sigma=.2,
+        min_sigma=.2,  # constant sigma
+        epsilon=.3,
+    )
+    obs_dim = env.observation_space.spaces['observation'].low.size
+    goal_dim = env.observation_space.spaces['desired_goal'].low.size
+    action_dim = env.action_space.low.size
+
+    shared_normalizer = CompositeNormalizer(obs_dim + goal_dim, action_dim, obs_clip_range=5)
+
+    qf1 = QNormalizedFlattenMlp(
+        input_size=obs_dim + goal_dim + action_dim,
+        output_size=1,
+        hidden_sizes=[400, 300],
+        composite_normalizer=shared_normalizer
+    )
+    qf2 = QNormalizedFlattenMlp(
+        input_size=obs_dim + goal_dim + action_dim,
+        output_size=1,
+        hidden_sizes=[400, 300],
+        composite_normalizer=shared_normalizer
+    )
+    import torch
+    policy = CompositeNormalizedMlpPolicy(
+        input_size=obs_dim + goal_dim,
+        output_size=action_dim,
+        hidden_sizes=[400, 300],
+        composite_normalizer=shared_normalizer,
+        output_activation=torch.tanh
+    )
+    exploration_policy = PolicyWrappedWithExplorationStrategy(
+        exploration_strategy=es,
+        policy=policy,
+    )
+
+    from rlkit.data_management.obs_dict_replay_buffer import ObsDictRelabelingBuffer
+
+    observation_key = 'observation'
+    desired_goal_key = 'desired_goal'
+    achieved_goal_key = desired_goal_key.replace("desired", "achieved")
+
+    replay_buffer = ObsDictRelabelingBuffer(
+        env=env,
+        observation_key=observation_key,
+        desired_goal_key=desired_goal_key,
+        achieved_goal_key=achieved_goal_key,
+        **variant['replay_buffer_kwargs']
+    )
+
+    algorithm = HerTd3(
+        her_kwargs=dict(
+            observation_key='observation',
+            desired_goal_key='desired_goal'
+        ),
+        td3_kwargs=dict(
+            env=env,
+            qf1=qf1,
+            qf2=qf2,
+            policy=policy,
+            exploration_policy=exploration_policy
+        ),
+        replay_buffer=replay_buffer,
+        **variant['algo_kwargs']
+    )
+
+    if variant.get("save_video", True):
+        rollout_function = rf.create_rollout_function(
+            rf.multitask_rollout,
+            max_path_length=algorithm.max_path_length,
+            observation_key=algorithm.observation_key,
+            desired_goal_key=algorithm.desired_goal_key,
+        )
+        video_func = get_video_save_func(  # NOTE: get_video_save_func is not imported in this file
+            rollout_function,
+            env,
+            algorithm.eval_policy,
+            variant,
+        )
+        algorithm.post_epoch_funcs.append(video_func)
+
+    algorithm.to(ptu.device)
+    algorithm.train()
+
+
+if __name__ == "__main__":
+    variant = dict(
+        algo_kwargs=dict(
+            num_epochs=5000,
+            num_steps_per_epoch=1000,
+            num_steps_per_eval=500,
+            max_path_length=50,
+            batch_size=128,
+            discount=0.98,
+            save_algorithm=True,
+        ),
+        replay_buffer_kwargs=dict(
+            max_size=100000,
+            fraction_goals_rollout_goals=0.2,  # equal to k = 4 in HER paper
+            fraction_goals_env_goals=0.0,
+        ),
+        render=False,
+        env_id="FetchPickAndPlace-v1",
+        doodad_docker_image="",  # Set
+        gpu_doodad_docker_image="",  # Set
+        save_video=False,
+        save_video_period=50,
+    )
+
+    from rlkit.launchers.launcher_util import run_experiment
+
+    run_experiment(
+        experiment,
+        exp_prefix="her_td3_gym_fetch_pnp_test",  # Make sure no spaces...
+        region="us-east-2",
+        mode='here_no_doodad',
+        variant=variant,
+        use_gpu=True,  # Note: online normalization is very slow without GPU.
+        spot_price=.5,
+        snapshot_mode='gap_and_last',
+        snapshot_gap=100,
+        num_exps_per_instance=2
+    )
diff --git a/examples/her/her_td3_gym_fetch_reach.py b/examples/her/her_td3_gym_fetch_reach.py
index cbbc48bfa..d4356846a 100644
--- a/examples/her/her_td3_gym_fetch_reach.py
+++ b/examples/her/her_td3_gym_fetch_reach.py
@@ -14,16 +14,17 @@
     PolicyWrappedWithExplorationStrategy
 )
 from rlkit.exploration_strategies.gaussian_and_epsilon_strategy import (
-    GaussianAndEpislonStrategy
+    GaussianAndEpsilonStrategy
 )
 from rlkit.launchers.launcher_util import setup_logger
 from rlkit.torch.her.her import HerTd3
 from rlkit.torch.networks import FlattenMlp, TanhMlpPolicy
+from rlkit.launchers.launcher_util import run_experiment


 def experiment(variant):
     env = gym.make('FetchReach-v1')
-    es = GaussianAndEpislonStrategy(
+    es = GaussianAndEpsilonStrategy(
         action_space=env.action_space,
         max_sigma=.2,
         min_sigma=.2,  # constant sigma
@@ -91,4 +92,11 @@ def experiment(variant):
         ),
     )
     setup_logger('her-td3-fetch-experiment', variant=variant)
-    experiment(variant)
+    run_experiment(
+        experiment,
+        exp_prefix="rlkit-her_td3_gym_fetch",
+        mode='local_docker',
+        variant=variant,
+        use_gpu=False,
+        spot_price=.03
+    )
\ No newline at end of file
diff --git a/examples/her/her_td3_multiworld_sawyer_reach.py b/examples/her/her_td3_multiworld_sawyer_reach.py
index 5c1f2cdec..06f44d057 100644
--- a/examples/her/her_td3_multiworld_sawyer_reach.py
+++ b/examples/her/her_td3_multiworld_sawyer_reach.py
@@ -14,7 +14,7 @@
 from rlkit.exploration_strategies.base import \
     PolicyWrappedWithExplorationStrategy
 from rlkit.exploration_strategies.gaussian_and_epsilon_strategy import (
-    GaussianAndEpislonStrategy
+    GaussianAndEpsilonStrategy
 )
 from rlkit.launchers.launcher_util import setup_logger
 from rlkit.torch.her.her import HerTd3
@@ -23,7 +23,7 @@ def experiment(variant):
     env = gym.make('SawyerReachXYZEnv-v0')
-    es = GaussianAndEpislonStrategy(
+    es = GaussianAndEpsilonStrategy(
         action_space=env.action_space,
         max_sigma=.2,
         min_sigma=.2,  # constant sigma
diff --git a/examples/rig/pointmass/rig.py b/examples/rig/pointmass/rig.py
index c0681e3dc..1435c15ed 100644
--- a/examples/rig/pointmass/rig.py
+++ b/examples/rig/pointmass/rig.py
@@ -93,5 +93,5 @@
         exp_prefix='rlkit-pointmass-rig-example',
         mode='here_no_doodad',
         variant=variant,
-        # use_gpu=True,  # Turn on if you have a GPU
+        use_gpu=True,  # Turn on if you have a GPU
     )
diff --git a/rlkit/core/rl_algorithm.py b/rlkit/core/rl_algorithm.py
index 798a4e0b8..7c009630d 100644
--- a/rlkit/core/rl_algorithm.py
+++ b/rlkit/core/rl_algorithm.py
@@ -425,7 +425,7 @@ def get_extra_data_to_save(self, epoch):
         :return:
         """
         if self.render:
-            self.training_env.render(close=True)
+            self.training_env.close()
         data_to_save = dict(
             epoch=epoch,
         )
diff --git a/rlkit/exploration_strategies/gaussian_and_epsilon_strategy.py b/rlkit/exploration_strategies/gaussian_and_epsilon_strategy.py
index 4306cdfd1..0b7418ef1 100644
--- a/rlkit/exploration_strategies/gaussian_and_epsilon_strategy.py
+++ b/rlkit/exploration_strategies/gaussian_and_epsilon_strategy.py
@@ -4,7 +4,7 @@
 import numpy as np


-class GaussianAndEpislonStrategy(RawExplorationStrategy, Serializable):
+class GaussianAndEpsilonStrategy(RawExplorationStrategy, Serializable):
     """
     With probability epsilon, take a completely random action.
     with probability 1-epsilon, add Gaussian noise to the action taken by a
diff --git a/rlkit/torch/her/her.py b/rlkit/torch/her/her.py
index e7518037f..8fa8f398f 100644
--- a/rlkit/torch/her/her.py
+++ b/rlkit/torch/her/her.py
@@ -45,6 +45,7 @@ def __init__(
         self,
         observation_key=None,
         desired_goal_key=None,
+        render=False,
     ):
         self.observation_key = observation_key
         self.desired_goal_key = desired_goal_key
diff --git a/rlkit/torch/networks.py b/rlkit/torch/networks.py
index 9ce79b014..2facb072a 100644
--- a/rlkit/torch/networks.py
+++ b/rlkit/torch/networks.py
@@ -10,7 +10,7 @@
 from rlkit.policies.base import Policy
 from rlkit.torch import pytorch_util as ptu
 from rlkit.torch.core import PyTorchModule
-from rlkit.torch.data_management.normalizer import TorchFixedNormalizer
+from rlkit.torch.data_management.normalizer import TorchFixedNormalizer, TorchNormalizer, CompositeNormalizer
 from rlkit.torch.modules import LayerNorm


@@ -41,6 +41,8 @@ def __init__(
         self.input_size = input_size
         self.output_size = output_size
         self.hidden_activation = hidden_activation
+        if isinstance(output_activation, str):
+            output_activation = getattr(torch, output_activation)
         self.output_activation = output_activation
         self.layer_norm = layer_norm
         self.fcs = []
@@ -89,6 +91,92 @@ def forward(self, *inputs, **kwargs):
         return super().forward(flat_inputs, **kwargs)


+class CompositeNormalizedFlattenMlp(FlattenMlp):
+    def __init__(
+        self,
+        composite_normalizer,
+        *args,
+        **kwargs
+    ):
+        self.save_init_params(locals())
+        super().__init__(*args, **kwargs)
+        self.composite_normalizer = composite_normalizer
+
+    def forward(
+        self,
+        observations,
+        actions,
+        return_preactivations=False):
+        obs, _ = self.composite_normalizer.normalize_all(observations, None)
+        flat_input = torch.cat((obs, actions), dim=1)
+        return super().forward(flat_input, return_preactivations=return_preactivations)
+
+
+class QNormalizedFlattenMlp(FlattenMlp):
+    def __init__(
+        self,
+        composite_normalizer,
+        *args,
+        clip_high=float('inf'),
+        clip_low=float('-inf'),
+        **kwargs
+    ):
+        self.save_init_params(locals())
+        super().__init__(*args, **kwargs)
+        self.composite_normalizer = composite_normalizer
+        self.clip_low = clip_low
+        self.clip_high = clip_high
+
+    def forward(
+        self,
+        observations,
+        actions,
+        return_preactivations=False):
+        obs, _ = self.composite_normalizer.normalize_all(observations, None)
+        flat_input = torch.cat((obs, actions), dim=1)
+
+        if return_preactivations:
+            output, preactivation = super().forward(flat_input, return_preactivations=return_preactivations)
+            output = torch.clamp(output, self.clip_low, self.clip_high)
+            return output, preactivation
+        else:
+            output = super().forward(flat_input)
+            output = torch.clamp(output, self.clip_low, self.clip_high)
+            return output
+
+
+class VNormalizedFlattenMlp(FlattenMlp):
+    def __init__(
+        self,
+        composite_normalizer,
+        *args,
+        clip_high=float('inf'),
+        clip_low=float('-inf'),
+        **kwargs
+    ):
+        self.save_init_params(locals())
+        super().__init__(*args, **kwargs)
+        self.composite_normalizer = composite_normalizer
+        self.clip_low = clip_low
+        self.clip_high = clip_high
+
+    def forward(
+        self,
+        observations,
+        return_preactivations=False):
+        obs, _ = self.composite_normalizer.normalize_all(observations, None)
+        flat_input = obs
+
+        if return_preactivations:
+            output, preactivation = super().forward(flat_input, return_preactivations=return_preactivations)
+            output = torch.clamp(output, self.clip_low, self.clip_high)
+            return output, preactivation
+        else:
+            output = super().forward(flat_input)
+            output = torch.clamp(output, self.clip_low, self.clip_high)
+            return output
+
+
 class MlpPolicy(Mlp, Policy):
     """
     A simpler interface for creating policies.
@@ -117,6 +205,22 @@ def get_actions(self, obs):
         return self.eval_np(obs)


+class CompositeNormalizedMlpPolicy(MlpPolicy):
+    def __init__(
+        self,
+        composite_normalizer,
+        *args,
+        **kwargs
+    ):
+        self.save_init_params(locals())
+        super().__init__(*args, **kwargs)
+        self.composite_normalizer = composite_normalizer
+
+    def forward(self, obs, **kwargs):
+        obs, _ = self.composite_normalizer.normalize_all(obs, None)
+        return super().forward(obs, **kwargs)
+
+
 class TanhMlpPolicy(MlpPolicy):
     """
     A helper class since most policies have a tanh output activation.
@@ -124,3 +228,4 @@ class TanhMlpPolicy(MlpPolicy):
     def __init__(self, *args, **kwargs):
         self.save_init_params(locals())
         super().__init__(*args, output_activation=torch.tanh, **kwargs)
+
diff --git a/rlkit/torch/td3/td3.py b/rlkit/torch/td3/td3.py
index b4971160c..897ab2933 100644
--- a/rlkit/torch/td3/td3.py
+++ b/rlkit/torch/td3/td3.py
@@ -34,7 +34,8 @@ def __init__(
             tau=0.005,
             qf_criterion=None,
             optimizer_class=optim.Adam,
-
+            policy_preactivation_loss=True,
+            policy_preactivation_coefficient=1.0,
             **kwargs
     ):
         super().__init__(
@@ -71,6 +72,8 @@ def __init__(
             self.policy.parameters(),
             lr=policy_learning_rate,
         )
+        self.policy_preactivation_penalty = policy_preactivation_loss
+        self.policy_preactivation_coefficient = policy_preactivation_coefficient

     def _do_training(self):
         batch = self.get_batch()
@@ -99,6 +102,7 @@ def _do_training(self):
         target_q1_values = self.target_qf1(next_obs, noisy_next_actions)
         target_q2_values = self.target_qf2(next_obs, noisy_next_actions)
         target_q_values = torch.min(target_q1_values, target_q2_values)
+
         q_target = rewards + (1. - terminals) * self.discount * target_q_values
         q_target = q_target.detach()
@@ -123,9 +127,12 @@ def _do_training(self):
         policy_actions = policy_loss = None
         if self._n_train_steps_total % self.policy_and_target_update_period == 0:
-            policy_actions = self.policy(obs)
+            policy_actions, policy_preactivations = self.policy(obs, return_preactivations=True)
             q_output = self.qf1(obs, policy_actions)
+
             policy_loss = - q_output.mean()
+            if self.policy_preactivation_penalty:
+                policy_loss += self.policy_preactivation_coefficient * (policy_preactivations ** 2).mean()

             self.policy_optimizer.zero_grad()
             policy_loss.backward()
diff --git a/scripts/__init__.py b/scripts/__init__.py
new file mode 100644
index 000000000..e69de29bb
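
Note on the shared normalizer: CompositeNormalizer itself is not part of this diff. The new QNormalizedFlattenMlp / CompositeNormalizedMlpPolicy classes only call its normalize_all(observations, None) method before the forward pass, and the pick-and-place example passes the same shared_normalizer instance to qf1, qf2, and the policy so all three see identically normalized observation+goal inputs. The snippet below is a minimal, self-contained sketch of that idea (a running mean/std input normalizer with clipping) assuming plain PyTorch; the class name SharedInputNormalizer and its update/normalize API are illustrative assumptions, not rlkit's implementation.

import torch


class SharedInputNormalizer:
    """Running mean/std normalizer with clipping (illustration only, not rlkit's CompositeNormalizer)."""

    def __init__(self, size, clip_range=5.0, eps=1e-8):
        self.mean = torch.zeros(size)
        self.var = torch.ones(size)
        self.count = eps
        self.clip_range = clip_range

    def update(self, batch):
        # Parallel (Chan et al.) update of the running mean and variance from a batch.
        batch_mean = batch.mean(dim=0)
        batch_var = batch.var(dim=0, unbiased=False)
        n = batch.shape[0]
        total = self.count + n
        delta = batch_mean - self.mean
        self.mean = self.mean + delta * n / total
        self.var = (self.var * self.count + batch_var * n
                    + delta ** 2 * self.count * n / total) / total
        self.count = total

    def normalize(self, x):
        # Standardize, then clip to +/- clip_range (analogous to obs_clip_range=5 above).
        std = torch.sqrt(self.var + 1e-8)
        return torch.clamp((x - self.mean) / std, -self.clip_range, self.clip_range)


if __name__ == "__main__":
    obs_dim, goal_dim, action_dim = 25, 3, 4  # hypothetical dimensions
    normalizer = SharedInputNormalizer(obs_dim + goal_dim, clip_range=5.0)
    obs_goal = torch.randn(128, obs_dim + goal_dim)
    actions = torch.randn(128, action_dim)
    normalizer.update(obs_goal)  # e.g. updated online from replay data
    q_input = torch.cat([normalizer.normalize(obs_goal), actions], dim=1)
    print(q_input.shape)  # torch.Size([128, 32])

Because one normalizer instance is shared, the Q-functions and the policy stay consistent as the input statistics drift during training, which is the design choice the example above relies on.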