Commit b8fe4105 authored by Bryson Howell

Initial version of DeepRL agents, results from final project

parent e1472177
import numpy as np
import scipy.signal
from gym.spaces import Box, Discrete
import torch
import torch.nn as nn
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical
import torch.nn.functional as F
import copy
def combined_shape(length, shape=None):
if shape is None:
return (length,)
return (length, shape) if np.isscalar(shape) else (length, *shape)
def mlp(sizes, activation, output_activation=nn.Identity):
layers = []
for j in range(len(sizes)-1):
act = activation if j < len(sizes)-2 else output_activation
layers += [nn.Linear(sizes[j], sizes[j+1]), act()]
return nn.Sequential(*layers)
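# Example (added, not in the original commit):
# mlp([4, 64, 64, 2], nn.Tanh) builds Linear(4, 64) -> Tanh -> Linear(64, 64) -> Tanh -> Linear(64, 2) -> Identity.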
def count_vars(module):
return sum([np.prod(p.shape) for p in module.parameters()])
def discount_cumsum(x, discount):
"""
magic from rllab for computing discounted cumulative sums of vectors.
input:
vector x,
[x0,
x1,
x2]
output:
[x0 + discount * x1 + discount^2 * x2,
x1 + discount * x2,
x2]
"""
return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]
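# Quick check (added, not in the original commit), assuming discount = 0.5:
# discount_cumsum(np.array([1., 1., 1.]), 0.5) -> array([1.75, 1.5, 1.])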
class Actor(nn.Module):
def _distribution(self, obs):
raise NotImplementedError
def _log_prob_from_distribution(self, pi, act):
raise NotImplementedError
def forward(self, obs, act=None):
# Produce action distributions for given observations, and
# optionally compute the log likelihood of given actions under
# those distributions.
pi = self._distribution(obs)
logp_a = None
if act is not None:
logp_a = self._log_prob_from_distribution(pi, act)
return pi, logp_a
class MLPDeterministicActor(nn.Module):
def __init__(self, state_dim, action_dim, max_action,discount_factor=0.99):
super(MLPDeterministicActor, self).__init__()
self.l1 = nn.Linear(state_dim, 256)
self.l2 = nn.Linear(256, 256)
self.l3 = nn.Linear(256, action_dim)
self.max_action = max_action
self.action_dim=action_dim
self.start_state = None
def forward(self, state,safety_switch=False,debug = False, noisy=False):
a = F.relu(self.l1(state))
a = F.relu(self.l2(a))
return self.max_action * torch.tanh(self.l3(a))
class MLPCategoricalActor(Actor):
def __init__(self, obs_dim, act_dim, hidden_sizes, activation):
super().__init__()
self.logits_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation)
def _distribution(self, obs):
logits = self.logits_net(obs)
return Categorical(logits=logits)
def _log_prob_from_distribution(self, pi, act):
return pi.log_prob(act)
class MLPGaussianActor(Actor):
def __init__(self, obs_dim, act_dim, hidden_sizes, activation):
super().__init__()
log_std = -0.5 * np.ones(act_dim, dtype=np.float32)
self.log_std = torch.nn.Parameter(torch.as_tensor(log_std))
self.mu_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation)
def _distribution(self, obs):
mu = self.mu_net(obs)
std = torch.exp(self.log_std)
return Normal(mu, std)
def mean(self, obs):
mu = self.mu_net(obs)
return mu
def _log_prob_from_distribution(self, pi, act):
return pi.log_prob(act).sum(axis=-1) # Last axis sum needed for Torch Normal distribution
class MLPCritic(nn.Module):
def __init__(self, obs_dim, hidden_sizes, activation):
super().__init__()
self.v_net = mlp([obs_dim] + list(hidden_sizes) + [1], activation)
def forward(self, obs):
return torch.squeeze(self.v_net(obs), -1) # Critical to ensure v has right shape.
class MLPActorCriticTD3trust(nn.Module):
def __init__(self, observation_space, action_space,
hidden_sizes=(64,64), activation=nn.Tanh):
super().__init__()
obs_dim = observation_space.shape[0]
act_dim = action_space.shape[0]
self.obs_dim = obs_dim
self.act_dim = act_dim
# policy builder depends on action space
self.pi = MLPDeterministicActor(obs_dim, action_space.shape[0],action_space.high[0])
# build value function
self.Qv1 = MLPCritic(obs_dim+act_dim, hidden_sizes, activation)
self.Qv2 = MLPCritic(obs_dim+act_dim, hidden_sizes, activation)
self.Qj1 = MLPCritic(obs_dim+act_dim, hidden_sizes, activation)
self.Qj2 = MLPCritic(obs_dim+act_dim, hidden_sizes, activation)
self.baseline_Qj = copy.deepcopy(self.Qj1)
self.baseline_pi = copy.deepcopy(self.pi)
self.pi_mix = copy.deepcopy(self.pi)
self.epsilon = 0
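# Note (added comment): step() returns (action, reward Q-value, cost Q-value, 0); the trailing
# zero stands in for a log-prob, which the deterministic actor does not define.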
def step(self, obs):
a = self.pi(obs)
qv = self.Qv1(torch.cat((obs,a)))
qj = self.Qj1(torch.cat((obs,a)))
return a.detach().cpu().numpy(), qv.detach().cpu().numpy(), qj.detach().cpu().numpy(), 0
def act_pi(self, pi, obs):
a = pi(obs)
return a
def act(self, obs):
return self.step(obs)[0]
class MLPActorCritic(nn.Module):
def __init__(self, observation_space, action_space,
hidden_sizes=(64,64), activation=nn.Tanh):
super().__init__()
obs_dim = observation_space.shape[0]
# policy builder depends on action space
if isinstance(action_space, Box):
self.pi = MLPGaussianActor(obs_dim, action_space.shape[0], hidden_sizes, activation)
elif isinstance(action_space, Discrete):
self.pi = MLPCategoricalActor(obs_dim, action_space.n, hidden_sizes, activation)
# build value function
self.v = MLPCritic(obs_dim, hidden_sizes, activation)
def step(self, obs):
with torch.no_grad():
pi = self.pi._distribution(obs)
a = pi.sample()
logp_a = self.pi._log_prob_from_distribution(pi, a)
v = self.v(obs)
return a.numpy(), v.numpy(), logp_a.numpy()
def act(self, obs):
return self.step(obs)[0]
class MLPActorCriticCost(nn.Module):
def __init__(self, observation_space, action_space,
hidden_sizes=(64,64), activation=nn.Tanh):
super().__init__()
obs_dim = observation_space.shape[0]
# policy builder depends on action space
if isinstance(action_space, Box):
self.pi = MLPGaussianActor(obs_dim, action_space.shape[0], hidden_sizes, activation)
elif isinstance(action_space, Discrete):
self.pi = MLPCategoricalActor(obs_dim, action_space.n, hidden_sizes, activation)
# build value function
self.v = MLPCritic(obs_dim, hidden_sizes, activation)
self.j = MLPCritic(obs_dim, hidden_sizes, activation)
def step(self, obs):
with torch.no_grad():
pi = self.pi._distribution(obs)
a = pi.sample()
logp_a = self.pi._log_prob_from_distribution(pi, a)
v = self.v(obs)
j = self.j(obs)
return a.numpy(), v.numpy(), j.numpy(), logp_a.numpy()
def act(self, obs):
return self.step(obs)[0]
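# Minimal usage sketch (added, not in the original commit), assuming hypothetical small Box
# spaces with 4-d observations and 2-d actions:
if __name__ == "__main__":
    obs_space = Box(low=-1.0, high=1.0, shape=(4,), dtype=np.float32)
    act_space = Box(low=-1.0, high=1.0, shape=(2,), dtype=np.float32)
    ac = MLPActorCritic(obs_space, act_space)
    obs = torch.as_tensor(obs_space.sample(), dtype=torch.float32)
    a, v, logp = ac.step(obs)  # sampled action, value estimate, log-prob
    print(a, v, logp)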
import numpy as np
import torch
from torch.optim import Adam
import gym
import time
from . import core
import sys
#import safety_gym
import torch.nn.functional as F
from torch.autograd import Variable
from .ppo_utils.logx import EpochLogger
from .ppo_utils.mpi_pytorch import setup_pytorch_for_mpi, sync_params, mpi_avg_grads
from .ppo_utils.mpi_tools import mpi_fork, mpi_avg, proc_id, mpi_statistics_scalar, num_procs
import copy
from .trust_region_utils import *
from .replay_buffers import *
import os
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
import datetime
def cpo(env_fn, env_name = '', actor_critic=core.MLPActorCriticCost, ac_kwargs=dict(), seed=0,
steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4,
vf_lr=1e-3, jf_lr=1e-3, penalty_init=1., penalty_lr=5e-2, cost_lim=25, train_pi_iters=80, train_v_iters=80, lam=0.95, max_ep_len=1000,
target_kl=0.00, target_l2=0.012, logger_kwargs=dict(), save_freq=10):
"""
Constrained Policy Optimization (CPO): trust-region policy updates with separate
reward and cost critics, plus a backtracking line search on the KL constraint.
Args:
env_fn : A function which creates a copy of the environment.
The environment must satisfy the OpenAI Gym API.
actor_critic: The constructor method for a PyTorch Module with a
``step`` method, an ``act`` method, a ``pi`` module, and a ``v``
module. The ``step`` method should accept a batch of observations
and return:
=========== ================ ======================================
Symbol Shape Description
=========== ================ ======================================
``a`` (batch, act_dim) | Numpy array of actions for each
| observation.
``v`` (batch,) | Numpy array of reward value estimates
| for the provided observations.
``j`` (batch,) | Numpy array of cost value estimates
| for the provided observations.
``logp_a`` (batch,) | Numpy array of log probs for the
| actions in ``a``.
=========== ================ ======================================
The ``act`` method behaves the same as ``step`` but only returns ``a``.
The ``pi`` module's forward call should accept a batch of
observations and optionally a batch of actions, and return:
=========== ================ ======================================
Symbol Shape Description
=========== ================ ======================================
``pi`` N/A | Torch Distribution object, containing
| a batch of distributions describing
| the policy for the provided observations.
``logp_a`` (batch,) | Optional (only returned if batch of
| actions is given). Tensor containing
| the log probability, according to
| the policy, of the provided actions.
| If actions not given, will contain
| ``None``.
=========== ================ ======================================
The ``v`` module's forward call should accept a batch of observations
and return:
=========== ================ ======================================
Symbol Shape Description
=========== ================ ======================================
``v`` (batch,) | Tensor containing the value estimates
| for the provided observations. (Critical:
| make sure to flatten this!)
=========== ================ ======================================
ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object
you provided to CPO.
seed (int): Seed for random number generators.
steps_per_epoch (int): Number of steps of interaction (state-action pairs)
for the agent and the environment in each epoch.
epochs (int): Number of epochs of interaction (equivalent to
number of policy updates) to perform.
gamma (float): Discount factor. (Always between 0 and 1.)
clip_ratio (float): Hyperparameter for clipping in the policy objective.
Roughly: how far can the new policy go from the old policy while
still profiting (improving the objective function)? The new policy
can still go farther than the clip_ratio says, but it doesn't help
on the objective anymore. (Usually small, 0.1 to 0.3.) Typically
denoted by :math:`\epsilon`.
pi_lr (float): Learning rate for policy optimizer.
vf_lr (float): Learning rate for value function optimizer.
train_pi_iters (int): Maximum number of gradient descent steps to take
on policy loss per epoch. (Early stopping may cause optimizer
to take fewer than this.)
train_v_iters (int): Number of gradient descent steps to take on
value function per epoch.
lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
close to 1.)
max_ep_len (int): Maximum length of trajectory / episode / rollout.
target_kl (float): Roughly what KL divergence we think is appropriate
between new and old policies after an update. This will get used
for early stopping. (Usually small, 0.01 or 0.05.)
logger_kwargs (dict): Keyword args for EpochLogger.
save_freq (int): How often (in terms of gap between epochs) to save
the current policy and value function.
"""
# Special function to avoid certain slowdowns from PyTorch + MPI combo.
setup_pytorch_for_mpi()
# Set up logger and save configuration
logger = EpochLogger(**logger_kwargs)
logger.save_config(locals())
writer = SummaryWriter(
'logs4/' + str(int(time.time()))
)
# Random seed
seed += 10000 * proc_id()
torch.manual_seed(seed)
np.random.seed(seed)
# Instantiate environment
env = env_fn()
obs_dim = env.observation_space.shape
act_dim = env.action_space.shape
# Create actor-critic module
# if not os.path.exists('safe_initial_policies/'+env_name+'.pt'):
# ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
# else:
# Commented for LOOP experiments
# if 'Grid' in env_name:
# ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
# else:
# ac = torch.load('safe_initial_policies/'+env_name+'.pt')
ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
# # print("Looking for safe policy at: {}".format())
# if not os.path.exists('safe_initial_policies/'+env_name+'.pt'):
# raise NotImplementedError('A safe initial policy was not found at {}'.format('safe_initial_policies/'+env_name+'.pt'))
# ac.load_state_dict(ac.state_dict(),'safe_initial_policies/'+env_name+'.pt')
# Sync params across processes
sync_params(ac)
# Count variables
var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.Qv1])
logger.log('\nNumber of parameters: \t pi: %d, \t Qv1: %d\n'%var_counts)
# Set up experience buffer
local_steps_per_epoch = int(steps_per_epoch / num_procs())
buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)
# td3_buf = TD3Buffer(obs_dim[0], act_dim[0])
# Set up penalty params
soft_penalty = Variable(torch.exp(torch.Tensor([penalty_init]))-1, requires_grad=True)
penalty_optimizer = torch.optim.Adam([soft_penalty],lr=penalty_lr)
margin = 0
margin_lr = 0.05
learn_margin = False
constraint_violations = [0]
constraint_violations_count = [0]
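# Note (added, not in the original commit): conjugate_gradients below solves H x = b using only
# Hessian-vector products Avp(p) = H p, so the Fisher/Hessian matrix is never materialized.
# For a toy SPD system H = diag(2, 4) and b = (2, 4), ten iterations return x close to (1, 1):
#   conjugate_gradients(lambda p: torch.tensor([2., 4.]) * p, torch.tensor([2., 4.]), 10)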
def conjugate_gradients(Avp, b, nsteps, residual_tol=1e-10):
x = torch.zeros(b.size())
r = b.clone()
p = b.clone()
rdotr = torch.dot(r, r)
for i in range(nsteps):
_Avp = Avp(p)
alpha = rdotr / torch.dot(p, _Avp)
x += alpha * p
r -= alpha * _Avp
new_rdotr = torch.dot(r, r)
betta = new_rdotr / rdotr
p = r + betta * p
rdotr = new_rdotr
if rdotr < residual_tol:
break
return x
def linesearch(model,
f,
get_kl,
optim_case,
c,
x,
fullstep,
max_kl,
max_backtracks=10,
accept_ratio=.1):
fval1, fval2 = f()[0].data,f()[1].data
old_model = copy.deepcopy(model)
# print("fval before", fval.item())
for (_n_backtracks, stepfrac) in enumerate(.5**np.arange(max_backtracks)):
xnew = x + stepfrac * fullstep
set_flat_params_to(model, xnew)
newfval1, newfval2 = f()[0].data, f()[1].data
kl_change = get_kl(old_model=old_model).mean()
if (kl_change <= max_kl and
(newfval1 <= fval1 if optim_case > 1 else True) and
newfval2 - fval2 <= max(-c,0)):
return True, xnew
return False, x
def trust_region_step(model, get_loss, get_kl, max_kl, damping):
# global margin, margin_lr, learn_margin
loss_pi, loss_j = get_loss()
grads = torch.autograd.grad(loss_pi, model.parameters())
loss_grad = torch.cat([grad.view(-1) for grad in grads]).data
grads_j = torch.autograd.grad(loss_j, model.parameters())
loss_grad_j = torch.cat([grad.view(-1) for grad in grads_j]).data
c = ac.epsilon/(gamma-1)
rescale = 100
# TODO correct this
margin = 0
margin_lr = 0.05
learn_margin = False
if learn_margin:
margin+=margin_lr*c
margin = max(0,margin)
# TODO: mpi_Avg margin?
c += margin
c/= (rescale+1e-8)
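# Added comment: Fvp(v) returns an estimate of the Fisher/Hessian-vector product of the policy
# KL (via double backprop through the KL) plus a damping term; it is used by the
# conjugate_gradients calls below.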
def Fvp(v):
kl = get_kl()
kl = kl.mean()
grads = torch.autograd.grad(kl, model.parameters(), create_graph=True)
flat_grad_kl = torch.cat([grad.view(-1) for grad in grads])
kl_v = (flat_grad_kl * Variable(v)).sum()
grads = torch.autograd.grad(kl_v, model.parameters())
flat_grad_grad_kl = torch.cat([grad.contiguous().view(-1) for grad in grads]).data
return flat_grad_grad_kl + v * damping
v = conjugate_gradients(Fvp, -loss_grad, 10)
approx_g = Fvp(v)
q = np.dot(v, approx_g)
# Determine optim_case (switch condition for calculation,
# based on geometry of constrained optimization problem)
if np.dot(loss_grad_j,loss_grad_j) <= 1e-8 and c < 0:
# feasible and cost grad is zero---shortcut to pure TRPO update!
w, r, s, A, B = 0, 0, 0, 0, 0
optim_case = 4
else:
# cost grad is nonzero: CPO update!
w = conjugate_gradients(Fvp, -loss_grad_j, 10)
r = np.dot(w, approx_g) # b^T H^{-1} g
s = np.dot(w, Fvp(w)) # b^T H^{-1} b
A = q - r**2 / s # should always be positive (Cauchy-Schwarz)
B = 2*max_kl - c**2 / s # does safety boundary intersect trust region? (positive = yes)
if c < 0 and B < 0:
# point in trust region is feasible and safety boundary doesn't intersect
# ==> entire trust region is feasible
optim_case = 3
elif c < 0 and B >= 0:
# x = 0 is feasible and safety boundary intersects
# ==> most of trust region is feasible
optim_case = 2
elif c >= 0 and B >= 0:
# x = 0 is infeasible and safety boundary intersects
# ==> part of trust region is feasible, recovery possible
optim_case = 1
print("Alert! Attempting feasible recovery!")
# self.logger.log('Alert! Attempting feasible recovery!', 'yellow')
else:
# x = 0 infeasible, and safety halfspace is outside trust region
# ==> whole trust region is infeasible, try to fail gracefully
optim_case = 0
print("Alert! Attempting infeasible recovery!")
# self.logger.log('Alert! Attempting infeasible recovery!', 'red')
if optim_case in [3,4]:
lam = np.sqrt(q / (2*max_kl))
nu = 0
elif optim_case in [1,2]:
LA, LB = [0, r /c], [r/c, np.inf]
LA, LB = (LA, LB) if c < 0 else (LB, LA)
proj = lambda x, L : max(L[0], min(L[1], x))
lam_a = proj(np.sqrt(A/B), LA)
lam_b = proj(np.sqrt(q/(2*max_kl)), LB)
f_a = lambda lam : -0.5 * (A / (lam+1e-8) + B * lam) - r*c/(s+1e-8)
f_b = lambda lam : -0.5 * (q / (lam+1e-8) + 2 * max_kl * lam)  # use max_kl, matching lam_b above
lam = lam_a if f_a(lam_a) >= f_b(lam_b) else lam_b
nu = max(0, lam * c - r) / (s + 1e-8)
else:
lam = 0
nu = np.sqrt(2 * max_kl / (s+1e-8))
# normal step if optim_case > 0, but for optim_case =0,
# perform infeasible recovery: step to purely decrease cost
x = (1./(lam+1e-8)) * (v + nu * w) if optim_case > 0 else nu * w
if optim_case==0:
logger.store(Backtrack=-1)
else:
logger.store(Backtrack=1)
prev_params = get_flat_params_from(model)
success, new_params = linesearch(model, get_loss, get_kl, optim_case, c, prev_params, x, max_kl)
set_flat_params_to(model, new_params)
return loss_pi
# Set up function for computing PPO policy loss
def compute_loss_pi(data, epoch_no=1):
obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp']
def get_kl(old_mean=None, new_mean=None, old_model=None):
if old_mean is None:
mean1 = ac.pi(obs)
else:
mean1 = old_mean
if old_model is not None:
mean1 = old_model(obs)
log_std1, std1 = -2.99, 0.05
if new_mean is None:
mean0 = torch.autograd.Variable(mean1.data)
else:
mean0 = new_mean
log_std0 = -2.99
std0 = 0.05
kl = log_std1 - log_std0 + (std0**2 + (mean0 - mean1).pow(2)) / (2.0 * std1**2) - 0.5
return kl.sum(1, keepdim=True)
def get_loss_pi():
# if ac.epsilon<0:
# loss_pi = (ac.Qj1(torch.cat((obs, ac.pi(obs)),dim=1))).mean()
# else:
# loss_pi = - (ac.Qv1(torch.cat((obs, ac.pi(obs)),dim=1))).mean()
loss_pi = -(ac.Qv1(torch.cat((obs, ac.pi(obs)),dim=1))).mean()
loss_j = (ac.Qj1(torch.cat((obs, ac.pi(obs)),dim=1))).mean()
# loss_pi = (ac.Qj1(torch.cat((obs, ac.pi(obs)),dim=1))).mean()
return loss_pi, loss_j
old_mean = ac.pi(obs).detach().data
loss_pi = trust_region_step(ac.pi, get_loss_pi, get_kl, target_l2, 0.1)
# Useful extra info
approx_l2 = torch.sqrt(torch.mean((ac.pi(obs) - data['old_act'])**2)).item()
approx_kl = get_kl(old_mean = old_mean, new_mean=ac.pi(obs).detach()).mean().item()
# ent = pi.entropy().mean().item()
ent = 0
clipped = [0]
clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
pi_info = dict(kl=approx_kl,l2=approx_l2, ent=ent, cf=clipfrac)
return loss_pi, pi_info
# Set up function for computing value loss
def compute_loss_v(data):
obs, act, ret = data['obs'], data['act'], data['ret']
return ((ac.Qv1(torch.cat((obs,act),dim=1)) - ret)**2).mean(), ((ac.Qv2(torch.cat((obs,act),dim=1)) - ret)**2).mean()
# Set up function for computing value loss
def compute_loss_j(data):
obs, act, cost_ret = data['obs'], data['act'], data['cost_ret']
return ((ac.Qj1(torch.cat((obs,act),dim=1)) - cost_ret)**2).mean(), ((ac.Qj2(torch.cat((obs,act),dim=1)) - cost_ret)**2).mean()
# Set up optimizers for policy and value function
pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
pi_bc_optimizer = Adam(ac.pi.parameters(), lr=0.001)
vf1_optimizer = Adam(ac.Qv1.parameters(), lr=vf_lr)
vf2_optimizer = Adam(ac.Qv2.parameters(), lr=vf_lr)
jf1_optimizer = Adam(ac.Qj1.parameters(), lr=jf_lr)
jf2_optimizer = Adam(ac.Qj2.parameters(), lr=jf_lr)
# Set up model saving
logger.setup_pytorch_saver(ac)
def update(epoch_no, constraint_violations, constraint_violations_count):
# global soft_penalty, penalty_optimizer
data = buf.get()
# Update the penalty
curr_cost = logger.get_stats('EpCostRet')[0]
if curr_cost-cost_lim>0:
# constraint_violations_count.append(1)
# constraint_violations.append(curr_cost-cost_lim)
logger.log('Warning! Safety constraint is already violated.', 'red')
ac.epsilon = (1-gamma)*(cost_lim-curr_cost)
if epoch_no==0 or ac.epsilon>=0:
ac.baseline_pi = copy.deepcopy(ac.pi)
ac.baseline_Qj = copy.deepcopy(ac.Qj1)
# pi_l_old, pi_info_old = compute_loss_pi(data)
# pi_l_old = pi_l_old.item()
# v_l_old, _ = compute_loss_v(data)
# v_l_old = v_l_old.item()
# j_l_old, _ = compute_loss_j(data)
# j_l_old = j_l_old.item()
pi_l_old, v_l_old, j_l_old = 0, 0, 0
pi_info_old = dict(kl=0,l2=0, ent=0, cf=0)
if epoch_no==0:
for i in range(train_v_iters):
vf1_optimizer.zero_grad()
vf2_optimizer.zero_grad()
loss_v1, loss_v2 = compute_loss_v(data)
loss_v1.backward()
loss_v2.backward()
mpi_avg_grads(ac.Qv1) # average grads across MPI processes
mpi_avg_grads(ac.Qv2)
vf1_optimizer.step()
vf2_optimizer.step()
jf1_optimizer.zero_grad()
jf2_optimizer.zero_grad()
loss_j1, loss_j2 = compute_loss_j(data)
loss_j1.backward()
loss_j2.backward()
mpi_avg_grads(ac.Qj1) # average grads across MPI processes
mpi_avg_grads(ac.Qj2)
jf1_optimizer.step()
jf2_optimizer.step()
# Trust region update for policy
loss_pi, pi_info = compute_loss_pi(data, epoch_no = epoch_no)
logger.store(StopIter=0)
# Value and Cost Value function learning
for i in range(train_v_iters):
vf1_optimizer.zero_grad()
vf2_optimizer.zero_grad()
loss_v1, loss_v2 = compute_loss_v(data)
loss_v1.backward()
loss_v2.backward()
mpi_avg_grads(ac.Qv1) # average grads across MPI processes
mpi_avg_grads(ac.Qv2)
vf1_optimizer.step()
vf2_optimizer.step()
jf1_optimizer.zero_grad()
jf2_optimizer.zero_grad()
loss_j1, loss_j2 = compute_loss_j(data)
loss_j1.backward()
loss_j2.backward()
mpi_avg_grads(ac.Qj1) # average grads across MPI processes
mpi_avg_grads(ac.Qj2)
jf1_optimizer.step()
jf2_optimizer.step()
# Log changes from update
kl,l2, ent, cf = pi_info['kl'],pi_info['l2'], pi_info_old['ent'], pi_info['cf']
logger.store(LossPi=pi_l_old, LossV=v_l_old, LossJ= j_l_old,
KL=kl, L2=l2, Entropy=ent, ClipFrac=cf,
DeltaLossPi=(loss_pi.item() - pi_l_old),
DeltaLossV=(loss_v1.item() - v_l_old),
DeltaLossJ=(loss_j1.item() - j_l_old),
Penalty=torch.nn.functional.softplus(soft_penalty))
# Prepare for interaction with environment
start_time = time.time()
o, ep_ret,ep_cost_ret, ep_len = env.reset(), 0, 0, 0
# Main loop: collect experience in env and update/log each epoch
for epoch in tqdm(range(epochs)):
for t in range(local_steps_per_epoch):
a, v, j, logp = ac.step(torch.as_tensor(o, dtype=torch.float32))
noise = 0.05 * np.random.randn(*a.shape)
# print("Action: {}".format(a+noise))
a = a + noise
next_o, r, d, info = env.step(a)
ep_ret += r
ep_cost_ret += info.get('cost', 0)
ep_len += 1
# save and log
buf.store(o, a, r, info.get('cost', 0), v, j, logp, a)
# td3_buf.add(o, a,next_o, r, info.get('cost', 0),d)
logger.store(VVals=v, JVals = j)
# Update obs (critical!)
o = next_o
timeout = ep_len == max_ep_len
terminal = d or timeout
epoch_ended = t==local_steps_per_epoch-1
if terminal or epoch_ended:
if epoch_ended and not(terminal):
print('Warning: trajectory cut off by epoch at %d steps.'%ep_len, flush=True)
# if trajectory didn't reach terminal state, bootstrap value target
if timeout or epoch_ended:
_, v, j, _ = ac.step(torch.as_tensor(o, dtype=torch.float32))
else:
v, j = 0, 0
buf.finish_path(v, j)
if terminal:
# only save EpRet / EpLen if trajectory finished
logger.store(EpRet=ep_ret, EpCostRet=ep_cost_ret, EpLen=ep_len)
o, ep_ret , ep_cost_ret, ep_len = env.reset(), 0, 0, 0
# Save model
if (epoch % save_freq == 0) or (epoch == epochs-1):
logger.save_state({'env': env}, None)
# Perform CPO update!
update(epoch, constraint_violations, constraint_violations_count)
# Log info about epoch
logger.log_tabular('Epoch', epoch)
logger.log_tabular('EpRet', with_min_and_max=True)
logger.log_tabular('EpCostRet', with_min_and_max=True)
logger.log_tabular('EpLen', average_only=True)
logger.log_tabular('VVals', with_min_and_max=True)
logger.log_tabular('JVals', with_min_and_max=True)
logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch)
logger.log_tabular('LossPi', average_only=True)
logger.log_tabular('LossV', average_only=True)
logger.log_tabular('LossJ', average_only=True)
logger.log_tabular('DeltaLossPi', average_only=True)
logger.log_tabular('DeltaLossV', average_only=True)
logger.log_tabular('DeltaLossJ', average_only=True)
logger.log_tabular('Entropy', average_only=True)
logger.log_tabular('KL', average_only=True)
# logger.log_tabular('L2', average_only=True)
logger.log_tabular('ClipFrac', average_only=True)
logger.log_tabular('StopIter', average_only=True)
logger.log_tabular('Penalty', average_only=True)
logger.log_tabular('Backtrack', average_only=True)
logger.log_tabular('Time', time.time()-start_time)
#breakpoint()
# Log the average episode return and length using TensorBoard
rv = logger.dump_tabular(IO=False) #holy mother of monkeypatch
writer.add_scalar('AvgEpRet', rv['AverageEpRet'], epoch)
writer.add_scalar('AvgEpLen', rv['EpLen'], epoch)
writer.add_scalar('AvgEpCost', rv['AverageEpCostRet'], epoch)
#print(rv)
#TODO: get out the episode return and length (average)
#writer.add_scalar('EpRet', logger.epoch_dict['EpRet'][0], epoch)
#writer.add_scalar('EpLen', logger.epoch_dict['EpLen'], epoch)
writer.close()
print("Total constraint violations were: {} with total violation cost: {}".format(sum(constraint_violations_count), sum(constraint_violations)))
return ac
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--env', type=str, default='Safexp-PointGoal1-v0')
parser.add_argument('--hid', type=int, default=256)
parser.add_argument('--l', type=int, default=2)
parser.add_argument('--gamma', type=float, default=0.99)
parser.add_argument('--target_l2', type=float, default=0.012)
parser.add_argument('--cost_lim', type=float, default=25)
parser.add_argument('--seed', '-s', type=int, default=0)
parser.add_argument('--cpu', type=int, default=3)
parser.add_argument('--steps', type=int, default=30000)
parser.add_argument('--epochs', type=int, default=3000)
parser.add_argument('--exp_name', type=str, default='td3_dump')
args = parser.parse_args()
mpi_fork(args.cpu) # run parallel code with mpi
from ppo_utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed)
env_fn = lambda : gym.make(args.env)
cpo(env_fn, env_name= args.env, actor_critic=core.MLPActorCriticTD3trust,
ac_kwargs=dict(hidden_sizes=[args.hid]*args.l), gamma=args.gamma,
seed=args.seed, steps_per_epoch=args.steps, epochs=args.epochs, target_l2=args.target_l2,cost_lim=args.cost_lim,
logger_kwargs=logger_kwargs)
#!/usr/bin/env python
# coding: utf-8
# In[25]:
import numpy as np
from skimage.io import imshow
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from tqdm import tqdm
import torch
rp = np.load('test_prob.npy')
rp.shape
# In[2]:
imshow(rp)
# In[3]:
r0 = np.load('robot0_waypoints.npy')
r1 = np.load('robot1_waypoints.npy')
r2 = np.load('robot2_waypoints.npy')
plt.imshow(rp)
plt.scatter(*r0.T, color='r')
plt.scatter(*r1.T, color='r')
plt.scatter(*r2.T, color='r')
plt.vlines([400, 800], 0, 1200, color='pink')
# In[4]:
plt.plot(np.sum(1 - rp, axis=0))
plt.vlines([400, 800], 0, 1200, color='pink')
# In[5]:
#rew
def f(x, t=0.8):
if x < -t:
return -1
elif x > t:
return 1
return 1/t * x
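# Example (added, not in the original commit): f(0.3, t=0.6) == 0.5; inputs beyond +/-t saturate to +/-1.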
X = np.linspace(-1, 1, 64)
Y1 = [f(x, t=0.6) for x in X]
Y2 = [f(x, t=0.8) for x in X]
plt.plot(Y1)
plt.plot(Y2)
# In[6]:
import random
from gym import spaces, Env
CW = 400
CH = 1200
MAX_STEPS_PER_EPISODE = 100
MAX_FORWARD_REWARD_THRESHOLD = 0.70 #Give us the 'leeway' to make some horizontal variation
MAX_UNITS_TRAVELED_PER_STEP = CH / (MAX_STEPS_PER_EPISODE * MAX_FORWARD_REWARD_THRESHOLD)
H_REWARD_SCALE = 5 #TODO: tune this
R_REWARD_SCALE = 2 #TODO: tune this
#Normalize paths and regions to [-1, 1]
region0 = rp[:, 0: 400]
region1 = rp[:, 400:800]
region2 = rp[:, 800:1200]
def uv2xy(u, v):
# Map coordinates from the double unit square (-1, -1) to (1, 1)
# to the rectangle (0, 0), (CW, CH)
x = (u + 1) * CW / 2
y = (v + 1) * CH / 2
return [x, y]
def xy2uv(x, y):
# Map coordinates from the rectangle (0, 0), (CW, CH)
# to the double unit square (-1, -1) to (1, 1)
u = (x / CW) * 2 - 1
v = (y / CH) * 2 - 1
return [u, v]
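# Round-trip check (added, not in the original commit): with CW=400 and CH=1200,
# uv2xy(0, 0) -> [200.0, 600.0] and xy2uv(200, 600) -> [0.0, 0.0].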
class GEC(Env):
def __init__(self, track, new=True):
self.action_space = spaces.Box(low=-1, high=1, shape=(2,), dtype=float)
self.observation_space = spaces.Box(low=-1, high=1, shape=(2,), dtype=float)
self.track = np.array([xy2uv(*c) for c in track])
self.new = new
self.reset()
def reset(self, seed=None, options=None):
if seed:
np.random.seed(seed)
random.seed(seed)
self.pos_x = 0.0
self.pos_y = 1.0
self.current_step = 0
#print(self.new)
if self.new:
return self._get_obs(), {}
return self._get_obs()
def _get_obs(self):
return np.array([self.pos_x, self.pos_y])
def step(self, action):
action_scaled = action * (2 * MAX_UNITS_TRAVELED_PER_STEP / max(CW, CH)) #hacky but we'll take it
self.pos_x = np.clip(self.pos_x + action_scaled[0], -1, 1)
self.pos_y = np.clip(self.pos_y + action_scaled[1], -1, 1)
#get to the end
reward_vertical = -f(action[1], t=MAX_FORWARD_REWARD_THRESHOLD)
#follow the LPM track
'''
Example:
#self.track = [[ 0. -1. ]
[ 0.005 -0.97333333]
[ 0.01 -0.94833333]
[ 0.015 -0.92166667]
[ 0.02 -0.89666667]
[ 0.025 -0.87166667]...
self.pos_x = 0.3
self.pos_y = -0.96
calculate the indices of the track path closest to the y position (so, surrounding -0.96)
naturally these are -0.973 and -0.948
calculate the line between
[ 0.005 -0.97333333]
[ 0.01 -0.94833333]
reward (penalty) is the L2 distance from pos(x, y) to the line segment
'''
# Find indices of adjacent 'y'
idx = np.searchsorted(self.track[:, 1], self.pos_y, side='right') - 1
idx = np.clip(idx, 0, len(self.track) - 2)
# Compute line over adjacency
p1 = self.track[idx]
p2 = self.track[idx + 1]
line = p2 - p1
# Compute distance perpendicular to line
pos = np.array([self.pos_x, self.pos_y])
line_norm = line / np.linalg.norm(line)
proj = np.dot(pos - p1, line_norm) * line_norm
closest_point = p1 + proj
penalty_horizontal = np.linalg.norm(pos - closest_point)
penalty_ring = 1 - rp[
int(0.5 * (self.pos_x + 1) * 0.33 * (CW-1)),
int(0.5 * (self.pos_y + 1) * 1.00 * (CH-1))
]
reward = reward_vertical - H_REWARD_SCALE * penalty_horizontal #- R_REWARD_SCALE * penalty_ring
# Increment step counter
self.current_step += 1
# Calculate 'truncated'
truncated = self.current_step >= MAX_STEPS_PER_EPISODE
# Calculate 'terminated'
terminated = self.pos_y < -0.999
if self.new:
return self._get_obs(), reward, bool(terminated), bool(truncated), {}
return self._get_obs(), reward, bool(terminated) or bool(truncated), {}
def render(self, mode='human'):
pass
def close(self):
pass
# In[7]:
#Sanity / 'dumb' environment test using naive policy
env = GEC(r0)
obs, info = env.reset()
path = [obs]
rews = []
done = False
while not done:
obs, rew, term, trunc, info = env.step(np.array([0, -1]))
done = term or trunc
path.append(obs)
rews.append(rew)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
# Scatter plot of path and track in the first subplot
ax1.scatter(*(np.array(path)).T, label='Path')
ax1.scatter(*env.track.T, label='Track')
ax1.set_xlabel('X')
ax1.set_ylabel('Y')
ax1.set_title('Path and Track')
ax1.legend()
# Plot of rewards in the second subplot
ax2.plot(rews)
ax2.set_xlabel('Step')
ax2.set_ylabel('Reward')
ax2.set_title('Rewards')
plt.tight_layout()
plt.show()
# In[10]:
from stable_baselines3 import PPO
#Train PPO model
train = False
if train:
log_dir = "logs3/" #tensorboard --logdir logs3
model = PPO("MlpPolicy", env, verbose=0, tensorboard_log=log_dir, n_steps = 8192, batch_size = 2048)
model.learn(total_timesteps=500000, progress_bar=True)
model.save('final4')
else:
model = PPO.load('final4')
# In[16]:
obs, info = env.reset()
path = [obs]
rews = []
done = False
while not done:
actions, _states = model.predict(obs)
obs, rew, term, trunc, info = env.step(actions)
done = term or trunc
path.append(obs)
rews.append(rew)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
# Scatter plot of path and track in the first subplot
ax1.scatter(*(np.array(path)).T, label='Path')
ax1.scatter(*env.track.T, label='Track')
ax1.set_xlabel('X')
ax1.set_ylabel('Y')
ax1.set_title('Path and Track')
ax1.legend()
# Plot of rewards in the second subplot
ax2.plot(rews)
ax2.set_xlabel('Step')
ax2.set_ylabel('Reward')
ax2.set_title('Rewards')
plt.tight_layout()
plt.show()
# In[17]:
from gym import spaces, Env
CW = 400
CH = 1200
MAX_STEPS_PER_EPISODE = 100
MAX_FORWARD_REWARD_THRESHOLD = 0.70 #Give us the 'leeway' to make some horizontal variation
MAX_UNITS_TRAVELED_PER_STEP = CH / (MAX_STEPS_PER_EPISODE * MAX_FORWARD_REWARD_THRESHOLD)
H_REWARD_SCALE = 1.5 #TODO: tune this
#Normalize paths and regions to [-1, 1]
region0 = rp[:, 0: 400]
region1 = rp[:, 400:800]
region2 = rp[:, 800:1200]
def uv2xy(u, v):
# Map coordinates from the double unit square (-1, -1) to (1, 1)
# to the rectangle (0, 0), (CW, CH)
x = (u + 1) * CW / 2
y = (v + 1) * CH / 2
return [x, y]
def xy2uv(x, y):
# Map coordinates from the rectangle (0, 0), (CW, CH)
# to the double unit square (-1, -1) to (1, 1)
u = (x / CW) * 2 - 1
v = (y / CH) * 2 - 1
return [u, v]
class GEC2(Env):
def __init__(self, track):
self.action_space = spaces.Box(low=-1, high=1, shape=(2,), dtype=float)
self.observation_space = spaces.Box(low=-1, high=1, shape=(2,), dtype=float)
self.track = np.array([xy2uv(*c) for c in track])
self.reset()
def reset(self, seed=None, options=None):
self.pos_x = 0.0
self.pos_y = 1.0
self.current_step = 0
return self._get_obs()
def _get_obs(self):
return np.array([self.pos_x, self.pos_y])
def step(self, action):
action_scaled = action * (2 * MAX_UNITS_TRAVELED_PER_STEP / max(CW, CH)) #hacky but we'll take it
self.pos_x = np.clip(self.pos_x + action_scaled[0], -1, 1)
self.pos_y = np.clip(self.pos_y + action_scaled[1], -1, 1)
#get to the end
reward_vertical = -f(action[1], t=MAX_FORWARD_REWARD_THRESHOLD)
# Find indices of adjacent 'y'
idx = np.searchsorted(self.track[:, 1], self.pos_y, side='right') - 1
idx = np.clip(idx, 0, len(self.track) - 2)
# Compute line over adjacency
p1 = self.track[idx]
p2 = self.track[idx + 1]
line = p2 - p1
# Compute distance perpendicular to line
pos = np.array([self.pos_x, self.pos_y])
line_norm = line / np.linalg.norm(line)
proj = np.dot(pos - p1, line_norm) * line_norm
closest_point = p1 + proj
penalty_horizontal = np.linalg.norm(pos - closest_point)
penalty_ring = 1 - rp[
int(0.5 * (self.pos_x + 1) * 0.33 * (CW-1)),
int(0.5 * (self.pos_y + 1) * 1.00 * (CH-1))
]
info = {'cost':penalty_ring}
reward = reward_vertical - H_REWARD_SCALE * penalty_horizontal
# Increment step counter
self.current_step += 1
# Calculate 'terminated'
terminated = self.pos_y < -0.999
return self._get_obs(), reward, bool(terminated), info
def render(self, mode='human'):
pass
def close(self):
pass
# In[29]:
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
import cpo
import core
import cProfile
import pstats
import io
COST_THRESHOLD = 50 #sanity check to make sure things are working
def main():
J = cpo.cpo(
lambda: GEC2(r0),
actor_critic=core.MLPActorCriticTD3trust,
max_ep_len=MAX_STEPS_PER_EPISODE,
cost_lim=COST_THRESHOLD,
epochs=175,
steps_per_epoch=8000
)
return J
pr = cProfile.Profile()
pr.enable()
profile = False
if profile:
try:
main()
except KeyboardInterrupt:
print("\nKeyboard interrupt received. Printing stats...")
finally:
pr.disable()
s = io.StringIO()
ps = pstats.Stats(pr, stream=s).sort_stats("tottime")
ps.print_stats(32)
print(s.getvalue())
else:
J = main()
# In[37]:
env = GEC2(r0)
obs = env.reset()
path = [obs]
rews = []
done = False
steps = 0
while not done and steps <= MAX_STEPS_PER_EPISODE:
with torch.no_grad():
tt = torch.as_tensor(obs, dtype=torch.float32)
actions = J.pi(tt)
#print(actions)
obs, rew, term, info = env.step(actions.numpy())
done = term
path.append(obs)
rews.append(rew)
steps += 1
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
# Scatter plot of path and track in the first subplot
#ax1.scatter(*(np.array(path)).T, label='Path')
ax1.scatter(*env.track.T, label='Track')
path_array = np.array(path)
directions = np.diff(path_array, axis=0)
arrow_positions = path_array[:-1] + directions / 2
ax1.quiver(*arrow_positions.T, *directions.T, scale_units='xy', angles='xy', scale=1, color='r', width=0.005)
ax1.set_xlabel('X')
ax1.set_ylabel('Y')
ax1.set_title('Path and Track')
ax1.legend()
# Plot of rewards in the second subplot
ax2.plot(rews)
ax2.set_xlabel('Step')
ax2.set_ylabel('Reward')
ax2.set_title('Rewards')
plt.tight_layout()
plt.show()
"""
Some simple logging functionality, inspired by rllab's logging.
Logs to a tab-separated-values file (path/to/output_directory/progress.txt)
"""
import json
import joblib
import shutil
import numpy as np
#import tensorflow as tf #lmao
import torch
import os.path as osp, time, atexit, os
import warnings
from .mpi_tools import proc_id, mpi_statistics_scalar
from .serialization_utils import convert_json
color2num = dict(
gray=30,
red=31,
green=32,
yellow=33,
blue=34,
magenta=35,
cyan=36,
white=37,
crimson=38
)
def colorize(string, color, bold=False, highlight=False):
"""
Colorize a string.
This function was originally written by John Schulman.
"""
attr = []
num = color2num[color]
if highlight: num += 10
attr.append(str(num))
if bold: attr.append('1')
return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string)
def restore_tf_graph(sess, fpath):
"""
Loads graphs saved by Logger.
Will output a dictionary whose keys and values are from the 'inputs'
and 'outputs' dict you specified with logger.setup_tf_saver().
Args:
sess: A Tensorflow session.
fpath: Filepath to save directory.
Returns:
A dictionary mapping from keys to tensors in the computation graph
loaded from ``fpath``.
"""
tf.saved_model.loader.load(
sess,
[tf.saved_model.tag_constants.SERVING],
fpath
)
model_info = joblib.load(osp.join(fpath, 'model_info.pkl'))
graph = tf.get_default_graph()
model = dict()
model.update({k: graph.get_tensor_by_name(v) for k,v in model_info['inputs'].items()})
model.update({k: graph.get_tensor_by_name(v) for k,v in model_info['outputs'].items()})
return model
class Logger:
"""
A general-purpose logger.
Makes it easy to save diagnostics, hyperparameter configurations, the
state of a training run, and the trained model.
"""
def __init__(self, output_dir=None, output_fname='progress.txt', exp_name=None):
"""
Initialize a Logger.
Args:
output_dir (string): A directory for saving results to. If
``None``, defaults to a temp directory of the form
``/tmp/experiments/somerandomnumber``.
output_fname (string): Name for the tab-separated-value file
containing metrics logged throughout a training run.
Defaults to ``progress.txt``.
exp_name (string): Experiment name. If you run multiple training
runs and give them all the same ``exp_name``, the plotter
will know to group them. (Use case: if you run the same
hyperparameter configuration with multiple random seeds, you
should give them all the same ``exp_name``.)
"""
if proc_id()==0:
self.output_dir = output_dir or "/tmp/experiments/%i"%int(time.time())
if osp.exists(self.output_dir):
print("Warning: Log dir %s already exists! Storing info there anyway."%self.output_dir)
else:
os.makedirs(self.output_dir)
self.output_file = open(osp.join(self.output_dir, output_fname), 'w')
atexit.register(self.output_file.close)
print(colorize("Logging data to %s"%self.output_file.name, 'green', bold=True))
else:
self.output_dir = None
self.output_file = None
self.first_row=True
self.log_headers = []
self.log_current_row = {}
self.exp_name = exp_name
def log(self, msg, color='green'):
"""Print a colorized message to stdout."""
if proc_id()==0:
print(colorize(msg, color, bold=True))
def log_tabular(self, key, val):
"""
Log a value of some diagnostic.
Call this only once for each diagnostic quantity, each iteration.
After using ``log_tabular`` to store values for each diagnostic,
make sure to call ``dump_tabular`` to write them out to file and
stdout (otherwise they will not get saved anywhere).
"""
if self.first_row:
self.log_headers.append(key)
else:
assert key in self.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key
assert key not in self.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()"%key
self.log_current_row[key] = val
def save_config(self, config):
"""
Log an experiment configuration.
Call this once at the top of your experiment, passing in all important
config vars as a dict. This will serialize the config to JSON, while
handling anything which can't be serialized in a graceful way (writing
as informative a string as possible).
Example use:
.. code-block:: python
logger = EpochLogger(**logger_kwargs)
logger.save_config(locals())
"""
config_json = convert_json(config)
if self.exp_name is not None:
config_json['exp_name'] = self.exp_name
if proc_id()==0:
output = json.dumps(config_json, separators=(',',':\t'), indent=4, sort_keys=True)
print(colorize('Saving config:\n', color='cyan', bold=True))
print(output)
with open(osp.join(self.output_dir, "config.json"), 'w') as out:
out.write(output)
def save_state(self, state_dict, itr=None):
"""
Saves the state of an experiment.
To be clear: this is about saving *state*, not logging diagnostics.
All diagnostic logging is separate from this function. This function
will save whatever is in ``state_dict``---usually just a copy of the
environment---and the most recent parameters for the model you
previously set up saving for with ``setup_tf_saver``.
Call with any frequency you prefer. If you only want to maintain a
single state and overwrite it at each call with the most recent
version, leave ``itr=None``. If you want to keep all of the states you
save, provide unique (increasing) values for 'itr'.
Args:
state_dict (dict): Dictionary containing essential elements to
describe the current state of training.
itr: An int, or None. Current iteration of training.
"""
if proc_id()==0:
fname = 'vars.pkl' if itr is None else 'vars%d.pkl'%itr
try:
joblib.dump(state_dict, osp.join(self.output_dir, fname))
except:
self.log('Warning: could not pickle state_dict.', color='red')
if hasattr(self, 'tf_saver_elements'):
self._tf_simple_save(itr)
if hasattr(self, 'pytorch_saver_elements'):
self._pytorch_simple_save(itr)
def setup_tf_saver(self, sess, inputs, outputs):
"""
Set up easy model saving for tensorflow.
Call once, after defining your computation graph but before training.
Args:
sess: The Tensorflow session in which you train your computation
graph.
inputs (dict): A dictionary that maps from keys of your choice
to the tensorflow placeholders that serve as inputs to the
computation graph. Make sure that *all* of the placeholders
needed for your outputs are included!
outputs (dict): A dictionary that maps from keys of your choice
to the outputs from your computation graph.
"""
self.tf_saver_elements = dict(session=sess, inputs=inputs, outputs=outputs)
self.tf_saver_info = {'inputs': {k:v.name for k,v in inputs.items()},
'outputs': {k:v.name for k,v in outputs.items()}}
def _tf_simple_save(self, itr=None):
"""
Uses simple_save to save a trained model, plus info to make it easy
to associated tensors to variables after restore.
"""
if proc_id()==0:
assert hasattr(self, 'tf_saver_elements'), \
"First have to setup saving with self.setup_tf_saver"
fpath = 'tf1_save' + ('%d'%itr if itr is not None else '')
fpath = osp.join(self.output_dir, fpath)
if osp.exists(fpath):
# simple_save refuses to be useful if fpath already exists,
# so just delete fpath if it's there.
shutil.rmtree(fpath)
tf.saved_model.simple_save(export_dir=fpath, **self.tf_saver_elements)
joblib.dump(self.tf_saver_info, osp.join(fpath, 'model_info.pkl'))
def setup_pytorch_saver(self, what_to_save):
"""
Set up easy model saving for a single PyTorch model.
Because PyTorch saving and loading is especially painless, this is
very minimal; we just need references to whatever we would like to
pickle. This is integrated into the logger because the logger
knows where the user would like to save information about this
training run.
Args:
what_to_save: Any PyTorch model or serializable object containing
PyTorch models.
"""
self.pytorch_saver_elements = what_to_save
def _pytorch_simple_save(self, itr=None):
"""
Saves the PyTorch model (or models).
"""
if proc_id()==0:
assert hasattr(self, 'pytorch_saver_elements'), \
"First have to setup saving with self.setup_pytorch_saver"
fpath = 'pyt_save'
fpath = osp.join(self.output_dir, fpath)
fname = 'model' + ('%d'%itr if itr is not None else '') + '.pt'
fname = osp.join(fpath, fname)
os.makedirs(fpath, exist_ok=True)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
# We are using a non-recommended way of saving PyTorch models,
# by pickling whole objects (which are dependent on the exact
# directory structure at the time of saving) as opposed to
# just saving network weights. This works sufficiently well
# for the purposes of Spinning Up, but you may want to do
# something different for your personal PyTorch project.
# We use a catch_warnings() context to avoid the warnings about
# not being able to save the source code.
torch.save(self.pytorch_saver_elements, fname)
def dump_tabular(self, IO=True):
"""
Write all of the diagnostics from the current iteration.
Writes both to stdout, and to the output file.
"""
rv = {}
if proc_id()==0:
vals = []
key_lens = [len(key) for key in self.log_headers]
max_key_len = max(15,max(key_lens))
keystr = '%'+'%d'%max_key_len
fmt = "| " + keystr + "s | %15s |"
n_slashes = 22 + max_key_len
if IO:
print("-"*n_slashes)
for key in self.log_headers:
val = self.log_current_row.get(key, "")
rv[key] = val
valstr = "%8.3g"%val if hasattr(val, "__float__") else val
if IO:
print(fmt%(key, valstr))
vals.append(val)
if IO:
print("-"*n_slashes, flush=True)
if self.output_file is not None and IO:
if self.first_row:
self.output_file.write("\t".join(self.log_headers)+"\n")
self.output_file.write("\t".join(map(str,vals))+"\n")
self.output_file.flush()
self.log_current_row.clear()
self.first_row=False
return rv
class EpochLogger(Logger):
"""
A variant of Logger tailored for tracking average values over epochs.
Typical use case: there is some quantity which is calculated many times
throughout an epoch, and at the end of the epoch, you would like to
report the average / std / min / max value of that quantity.
With an EpochLogger, each time the quantity is calculated, you would
use
.. code-block:: python
epoch_logger.store(NameOfQuantity=quantity_value)
to load it into the EpochLogger's state. Then at the end of the epoch, you
would use
.. code-block:: python
epoch_logger.log_tabular(NameOfQuantity, **options)
to record the desired values.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.epoch_dict = dict()
def store(self, **kwargs):
"""
Save something into the epoch_logger's current state.
Provide an arbitrary number of keyword arguments with numerical
values.
"""
for k,v in kwargs.items():
if not(k in self.epoch_dict.keys()):
self.epoch_dict[k] = []
self.epoch_dict[k].append(v)
def log_tabular(self, key, val=None, with_min_and_max=False, average_only=False):
"""
Log a value or possibly the mean/std/min/max values of a diagnostic.
Args:
key (string): The name of the diagnostic. If you are logging a
diagnostic whose state has previously been saved with
``store``, the key here has to match the key you used there.
val: A value for the diagnostic. If you have previously saved
values for this key via ``store``, do *not* provide a ``val``
here.
with_min_and_max (bool): If true, log min and max values of the
diagnostic over the epoch.
average_only (bool): If true, do not log the standard deviation
of the diagnostic over the epoch.
"""
if val is not None:
super().log_tabular(key,val)
else:
v = self.epoch_dict[key]
vals = np.concatenate(v) if isinstance(v[0], np.ndarray) and len(v[0].shape)>0 else v
stats = mpi_statistics_scalar(vals, with_min_and_max=with_min_and_max)
super().log_tabular(key if average_only else 'Average' + key, stats[0])
if not(average_only):
super().log_tabular('Std'+key, stats[1])
if with_min_and_max:
super().log_tabular('Max'+key, stats[3])
super().log_tabular('Min'+key, stats[2])
self.epoch_dict[key] = []
def get_stats(self, key):
"""
Lets an algorithm ask the logger for mean/std/min/max of a diagnostic.
"""
v = self.epoch_dict[key]
vals = np.concatenate(v) if isinstance(v[0], np.ndarray) and len(v[0].shape)>0 else v
return mpi_statistics_scalar(vals)
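# Typical usage (added comment, mirroring the cpo loop above): call logger.store(EpRet=ep_ret)
# each time a value is produced, then logger.log_tabular('EpRet', with_min_and_max=True) and
# logger.dump_tabular() once per epoch to aggregate and write the statistics.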
import multiprocessing
import numpy as np
import os
import torch
from mpi4py import MPI
from .mpi_tools import broadcast, mpi_avg, num_procs, proc_id
def setup_pytorch_for_mpi():
"""
Avoid slowdowns caused by each separate process's PyTorch using
more than its fair share of CPU resources.
"""
#print('Proc %d: Reporting original number of Torch threads as %d.'%(proc_id(), torch.get_num_threads()), flush=True)
if torch.get_num_threads()==1:
return
fair_num_threads = max(int(torch.get_num_threads() / num_procs()), 1)
torch.set_num_threads(fair_num_threads)
#print('Proc %d: Reporting new number of Torch threads as %d.'%(proc_id(), torch.get_num_threads()), flush=True)
def mpi_avg_grads(module):
""" Average contents of gradient buffers across MPI processes. """
if num_procs()==1:
return
for p in module.parameters():
p_grad_numpy = p.grad.numpy() # numpy view of tensor data
avg_p_grad = mpi_avg(p.grad)
p_grad_numpy[:] = avg_p_grad[:]
def sync_params(module):
""" Sync all parameters of module across all MPI processes. """
if num_procs()==1:
return
for p in module.parameters():
p_numpy = p.data.numpy()
broadcast(p_numpy)
from mpi4py import MPI
import os, subprocess, sys
import numpy as np
import torch
def mpi_fork(n, bind_to_core=False):
"""
Re-launches the current script with workers linked by MPI.
Also, terminates the original process that launched it.
Taken almost without modification from the Baselines function of the
`same name`_.
.. _`same name`: https://github.com/openai/baselines/blob/master/baselines/common/mpi_fork.py
Args:
n (int): Number of process to split into.
bind_to_core (bool): Bind each MPI process to a core.
"""
if n<=1:
return
if os.getenv("IN_MPI") is None:
env = os.environ.copy()
env.update(
MKL_NUM_THREADS="1",
OMP_NUM_THREADS="1",
IN_MPI="1"
)
args = ["mpirun", "-np", str(n)]
if bind_to_core:
args += ["-bind-to", "core"]
args += [sys.executable] + sys.argv
subprocess.check_call(args, env=env)
sys.exit()
def msg(m, string=''):
print(('Message from %d: %s \t '%(MPI.COMM_WORLD.Get_rank(), string))+str(m))
def proc_id():
"""Get rank of calling process."""
return MPI.COMM_WORLD.Get_rank()
def allreduce(*args, **kwargs):
return MPI.COMM_WORLD.Allreduce(*args, **kwargs)
def num_procs():
"""Count active MPI processes."""
return MPI.COMM_WORLD.Get_size()
def broadcast(x, root=0):
MPI.COMM_WORLD.Bcast(x, root=root)
def mpi_op(x, op):
x, scalar = ([x], True) if np.isscalar(x) else (x, False)
x = np.asarray(x, dtype=np.float32)
buff = np.zeros_like(x, dtype=np.float32)
allreduce(x, buff, op=op)
return buff[0] if scalar else buff
def mpi_sum(x):
return mpi_op(x, MPI.SUM)
def mpi_avg(x):
"""Average a scalar or vector over MPI processes."""
return mpi_sum(x) / num_procs()
def mpi_statistics_scalar(x, with_min_and_max=False):
"""
Get mean/std and optional min/max of scalar x across MPI processes.
Args:
x: An array containing samples of the scalar to produce statistics
for.
with_min_and_max (bool): If true, return min and max of x in
addition to mean and std.
"""
if isinstance(x, list):
for i, xx in enumerate(x):
if torch.is_tensor(xx):
x[i] = xx.detach()
x = np.array(x, dtype=np.float32)
global_sum, global_n = mpi_sum([np.sum(x), len(x)])
mean = global_sum / global_n
global_sum_sq = mpi_sum(np.sum((x - mean)**2))
std = np.sqrt(global_sum_sq / global_n) # compute global std
if with_min_and_max:
global_min = mpi_op(np.min(x) if len(x) > 0 else np.inf, op=MPI.MIN)
global_max = mpi_op(np.max(x) if len(x) > 0 else -np.inf, op=MPI.MAX)
return mean, std, global_min, global_max
return mean, std
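# Example (added, not in the original commit): on a single process,
# mpi_statistics_scalar([1., 2., 3.]) returns (mean, std) = (2.0, ~0.816).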
from ppo_utils.user_config import DEFAULT_DATA_DIR, FORCE_DATESTAMP, \
DEFAULT_SHORTHAND, WAIT_BEFORE_LAUNCH
from ppo_utils.logx import colorize
from ppo_utils.mpi_tools import mpi_fork, msg
from ppo_utils.serialization_utils import convert_json
import base64
from copy import deepcopy
import cloudpickle
import json
import numpy as np
import os
import os.path as osp
import psutil
import string
import subprocess
from subprocess import CalledProcessError
import sys
from textwrap import dedent
import time
from tqdm import trange
import zlib
DIV_LINE_WIDTH = 80
def setup_logger_kwargs(exp_name, seed=None, data_dir=None, datestamp=False):
"""
Sets up the output_dir for a logger and returns a dict for logger kwargs.
If no seed is given and datestamp is false,
::
output_dir = data_dir/exp_name
If a seed is given and datestamp is false,
::
output_dir = data_dir/exp_name/exp_name_s[seed]
If datestamp is true, amend to
::
output_dir = data_dir/YY-MM-DD_exp_name/YY-MM-DD_HH-MM-SS_exp_name_s[seed]
You can force datestamp=True by setting ``FORCE_DATESTAMP=True`` in
``spinup/user_config.py``.
Args:
exp_name (string): Name for experiment.
seed (int): Seed for random number generators used by experiment.
data_dir (string): Path to folder where results should be saved.
Default is the ``DEFAULT_DATA_DIR`` in ``spinup/user_config.py``.
datestamp (bool): Whether to include a date and timestamp in the
name of the save directory.
Returns:
logger_kwargs, a dict containing output_dir and exp_name.
"""
# Datestamp forcing
datestamp = datestamp or FORCE_DATESTAMP
# Make base path
ymd_time = time.strftime("%Y-%m-%d_") if datestamp else ''
relpath = ''.join([ymd_time, exp_name])
if seed is not None:
# Make a seed-specific subfolder in the experiment directory.
if datestamp:
hms_time = time.strftime("%Y-%m-%d_%H-%M-%S")
subfolder = ''.join([hms_time, '-', exp_name, '_s', str(seed)])
else:
subfolder = ''.join([exp_name, '_s', str(seed)])
relpath = osp.join(relpath, subfolder)
data_dir = data_dir or DEFAULT_DATA_DIR
logger_kwargs = dict(output_dir=osp.join(data_dir, relpath),
exp_name=exp_name)
return logger_kwargs
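# Example (added, not in the original commit), assuming datestamp stays False:
# setup_logger_kwargs('td3_dump', seed=0) ->
# {'output_dir': osp.join(DEFAULT_DATA_DIR, 'td3_dump', 'td3_dump_s0'), 'exp_name': 'td3_dump'}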
def call_experiment(exp_name, thunk, seed=0, num_cpu=1, data_dir=None,
datestamp=False, **kwargs):
"""
Run a function (thunk) with hyperparameters (kwargs), plus configuration.
This wraps a few pieces of functionality which are useful when you want
to run many experiments in sequence, including logger configuration and
splitting into multiple processes for MPI.
There's also a SpinningUp-specific convenience added into executing the
thunk: if ``env_name`` is one of the kwargs passed to call_experiment, it's
assumed that the thunk accepts an argument called ``env_fn``, and that
the ``env_fn`` should make a gym environment with the given ``env_name``.
The way the experiment is actually executed is slightly complicated: the
function is serialized to a string, and then ``run_entrypoint.py`` is
executed in a subprocess call with the serialized string as an argument.
``run_entrypoint.py`` unserializes the function call and executes it.
We choose to do it this way---instead of just calling the function
directly here---to avoid leaking state between successive experiments.
Args:
exp_name (string): Name for experiment.
thunk (callable): A python function.
seed (int): Seed for random number generators.
num_cpu (int): Number of MPI processes to split into. Also accepts
'auto', which will set up as many procs as there are cpus on
the machine.
data_dir (string): Used in configuring the logger, to decide where
to store experiment results. Note: if left as None, data_dir will
default to ``DEFAULT_DATA_DIR`` from ``spinup/user_config.py``.
**kwargs: All kwargs to pass to thunk.
"""
# Determine number of CPU cores to run on
num_cpu = psutil.cpu_count(logical=False) if num_cpu=='auto' else num_cpu
# Send random seed to thunk
kwargs['seed'] = seed
# Be friendly and print out your kwargs, so we all know what's up
print(colorize('Running experiment:\n', color='cyan', bold=True))
print(exp_name + '\n')
print(colorize('with kwargs:\n', color='cyan', bold=True))
kwargs_json = convert_json(kwargs)
print(json.dumps(kwargs_json, separators=(',',':\t'), indent=4, sort_keys=True))
print('\n')
# Set up logger output directory
if 'logger_kwargs' not in kwargs:
kwargs['logger_kwargs'] = setup_logger_kwargs(exp_name, seed, data_dir, datestamp)
else:
print('Note: Call experiment is not handling logger_kwargs.\n')
def thunk_plus():
# Make 'env_fn' from 'env_name'
if 'env_name' in kwargs:
import gym
env_name = kwargs['env_name']
kwargs['env_fn'] = lambda : gym.make(env_name)
del kwargs['env_name']
# Fork into multiple processes
mpi_fork(num_cpu)
# Run thunk
thunk(**kwargs)
# Prepare to launch a script to run the experiment
pickled_thunk = cloudpickle.dumps(thunk_plus)
encoded_thunk = base64.b64encode(zlib.compress(pickled_thunk)).decode('utf-8')
entrypoint = osp.join(osp.abspath(osp.dirname(__file__)),'run_entrypoint.py')
cmd = [sys.executable if sys.executable else 'python', entrypoint, encoded_thunk]
try:
subprocess.check_call(cmd, env=os.environ)
except CalledProcessError:
err_msg = '\n'*3 + '='*DIV_LINE_WIDTH + '\n' + dedent("""
There appears to have been an error in your experiment.
Check the traceback above to see what actually went wrong. The
traceback below, included for completeness (but probably not useful
for diagnosing the error), shows the stack leading up to the
experiment launch.
""") + '='*DIV_LINE_WIDTH + '\n'*3
print(err_msg)
raise
# Tell the user about where results are, and how to check them
logger_kwargs = kwargs['logger_kwargs']
plot_cmd = 'python -m spinup.run plot '+logger_kwargs['output_dir']
plot_cmd = colorize(plot_cmd, 'green')
test_cmd = 'python -m spinup.run test_policy '+logger_kwargs['output_dir']
test_cmd = colorize(test_cmd, 'green')
output_msg = '\n'*5 + '='*DIV_LINE_WIDTH +'\n' + dedent("""\
End of experiment.
Plot results from this run with:
%s
Watch the trained agent with:
%s
"""%(plot_cmd,test_cmd)) + '='*DIV_LINE_WIDTH + '\n'*5
print(output_msg)
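# Minimal usage sketch (illustrative; `my_algo` is a hypothetical trainer that accepts
# env_fn, seed, logger_kwargs, and any extra kwargs):
#   call_experiment('sar_test', my_algo, seed=0, num_cpu=1,
#                   env_name='CartPole-v1', epochs=10)
# Because 'env_name' is among the kwargs, thunk_plus replaces it with
# env_fn=lambda: gym.make('CartPole-v1') before invoking my_algo in the subprocess.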
def all_bools(vals):
return all([isinstance(v,bool) for v in vals])
def valid_str(v):
"""
Convert a value or values to a string which could go in a filepath.
Partly based on `this gist`_.
.. _`this gist`: https://gist.github.com/seanh/93666
"""
if hasattr(v, '__name__'):
return valid_str(v.__name__)
if isinstance(v, tuple) or isinstance(v, list):
return '-'.join([valid_str(x) for x in v])
# Valid characters are '-', '_', and alphanumeric. Replace invalid chars
# with '-'.
str_v = str(v).lower()
valid_chars = "-_%s%s" % (string.ascii_letters, string.digits)
str_v = ''.join(c if c in valid_chars else '-' for c in str_v)
return str_v
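# Illustrative checks (added for clarity; `_valid_str_examples` is not part of the
# original module): valid_str lowercases values, joins sequences with '-', and maps
# unsupported characters to '-'.
def _valid_str_examples():
    assert valid_str('Steps_Per_Epoch') == 'steps_per_epoch'
    assert valid_str((64, 64)) == '64-64'
    assert valid_str([0.5, 'relu']) == '0-5-relu'
    assert valid_str(all_bools) == 'all_bools'  # callables contribute their __name__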
class ExperimentGrid:
"""
Tool for running many experiments given hyperparameter ranges.
"""
def __init__(self, name=''):
self.keys = []
self.vals = []
self.shs = []
self.in_names = []
self.name(name)
def name(self, _name):
assert isinstance(_name, str), "Name has to be a string."
self._name = _name
def print(self):
"""Print a helpful report about the experiment grid."""
print('='*DIV_LINE_WIDTH)
# Prepare announcement at top of printing. If the ExperimentGrid has a
# short name, write this as one line. If the name is long, break the
# announcement over two lines.
base_msg = 'ExperimentGrid %s runs over parameters:\n'
name_insert = '['+self._name+']'
if len(base_msg%name_insert) <= 80:
msg = base_msg%name_insert
else:
msg = base_msg%(name_insert+'\n')
print(colorize(msg, color='green', bold=True))
# List off parameters, shorthands, and possible values.
for k, v, sh in zip(self.keys, self.vals, self.shs):
color_k = colorize(k.ljust(40), color='cyan', bold=True)
print('', color_k, '['+sh+']' if sh is not None else '', '\n')
for i, val in enumerate(v):
print('\t' + str(convert_json(val)))
print()
# Count up the number of variants. The number counting seeds
# is the total number of experiments that will run; the number not
# counting seeds is the total number of otherwise-unique configs
# being investigated.
nvars_total = int(np.prod([len(v) for v in self.vals]))
if 'seed' in self.keys:
num_seeds = len(self.vals[self.keys.index('seed')])
nvars_seedless = int(nvars_total / num_seeds)
else:
nvars_seedless = nvars_total
print(' Variants, counting seeds: '.ljust(40), nvars_total)
print(' Variants, not counting seeds: '.ljust(40), nvars_seedless)
print()
print('='*DIV_LINE_WIDTH)
def _default_shorthand(self, key):
# Create a default shorthand for the key, built from the first
# three letters of each colon-separated part.
# But if the first three letters contains something which isn't
# alphanumeric, shear that off.
valid_chars = "%s%s" % (string.ascii_letters, string.digits)
def shear(x):
return ''.join(z for z in x[:3] if z in valid_chars)
sh = '-'.join([shear(x) for x in key.split(':')])
return sh
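# For example (illustrative): _default_shorthand('ac_kwargs:hidden_sizes') yields
# 'ac-hid', and _default_shorthand('pi_lr') yields 'pi'.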
def add(self, key, vals, shorthand=None, in_name=False):
"""
Add a parameter (key) to the grid config, with potential values (vals).
By default, if a shorthand isn't given, one is automatically generated
from the key using the first three letters of each colon-separated
term. To disable this behavior, change ``DEFAULT_SHORTHAND`` in the
``spinup/user_config.py`` file to ``False``.
Args:
key (string): Name of parameter.
vals (value or list of values): Allowed values of parameter.
shorthand (string): Optional, shortened name of parameter. For
example, maybe the parameter ``steps_per_epoch`` is shortened
to ``steps``.
in_name (bool): When constructing variant names, force the
inclusion of this parameter into the name.
"""
assert isinstance(key, str), "Key must be a string."
assert shorthand is None or isinstance(shorthand, str), \
"Shorthand must be a string."
if not isinstance(vals, list):
vals = [vals]
if DEFAULT_SHORTHAND and shorthand is None:
shorthand = self._default_shorthand(key)
self.keys.append(key)
self.vals.append(vals)
self.shs.append(shorthand)
self.in_names.append(in_name)
def variant_name(self, variant):
"""
Given a variant (dict of valid param/value pairs), make an exp_name.
A variant's name is constructed as the grid name (if you've given it
one), plus param names (or shorthands if available) and values
separated by underscores.
Note: if ``seed`` is a parameter, it is not included in the name.
"""
def get_val(v, k):
# Utility method for getting the correct value out of a variant
# given as a nested dict. Assumes that a parameter name, k,
# describes a path into the nested dict, such that k='a:b:c'
# corresponds to value=variant['a']['b']['c']. Uses recursion
# to get this.
if k in v:
return v[k]
else:
splits = k.split(':')
k0, k1 = splits[0], ':'.join(splits[1:])
return get_val(v[k0], k1)
# Start the name off with the name of the variant generator.
var_name = self._name
# Build the rest of the name by looping through all parameters,
# and deciding which ones need to go in there.
for k, v, sh, inn in zip(self.keys, self.vals, self.shs, self.in_names):
# Include a parameter in a name if either 1) it can take multiple
# values, or 2) the user specified that it must appear in the name.
# Except, however, when the parameter is 'seed'. Seed is handled
# differently so that runs of the same experiment, with different
# seeds, will be grouped by experiment name.
if (len(v)>1 or inn) and not(k=='seed'):
# Use the shorthand if available, otherwise the full name.
param_name = sh if sh is not None else k
param_name = valid_str(param_name)
# Get variant value for parameter k
variant_val = get_val(variant, k)
# Append to name
if all_bools(v):
# If this is a param which only takes boolean values,
# only include in the name if it's True for this variant.
var_name += ('_' + param_name) if variant_val else ''
else:
var_name += '_' + param_name + valid_str(variant_val)
return var_name.lstrip('_')
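# Illustrative naming: for a grid named 'sar' where 'ac_kwargs:hidden_sizes'
# (shorthand 'hid') takes values [(32,), (64, 64)] and 'seed' takes [0, 10],
# the variant with hidden_sizes=(64, 64) is named 'sar_hid64-64'; seeds never
# appear, so runs that differ only by seed share one experiment name.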
def _variants(self, keys, vals):
"""
Recursively builds list of valid variants.
"""
if len(keys)==1:
pre_variants = [dict()]
else:
pre_variants = self._variants(keys[1:], vals[1:])
variants = []
for val in vals[0]:
for pre_v in pre_variants:
v = {}
v[keys[0]] = val
v.update(pre_v)
variants.append(v)
return variants
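# e.g. (illustrative) _variants(['a', 'b'], [[1, 2], [3]]) returns the cross
# product [{'a': 1, 'b': 3}, {'a': 2, 'b': 3}].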
def variants(self):
"""
Makes a list of dicts, where each dict is a valid config in the grid.
There is special handling for variant parameters whose names take
the form
``'full:param:name'``.
The colons are taken to indicate that these parameters should
have a nested dict structure. eg, if there are two params,
==================== ===
Key Val
==================== ===
``'base:param:a'`` 1
``'base:param:b'`` 2
==================== ===
the variant dict will have the structure
.. parsed-literal::
variant = {
base: {
param : {
a : 1,
b : 2
}
}
}
"""
flat_variants = self._variants(self.keys, self.vals)
def unflatten_var(var):
"""
Build the full nested dict version of var, based on key names.
"""
new_var = dict()
unflatten_set = set()
for k,v in var.items():
if ':' in k:
splits = k.split(':')
k0 = splits[0]
assert k0 not in new_var or isinstance(new_var[k0], dict), \
"You can't assign multiple values to the same key."
if not(k0 in new_var):
new_var[k0] = dict()
sub_k = ':'.join(splits[1:])
new_var[k0][sub_k] = v
unflatten_set.add(k0)
else:
assert not(k in new_var), \
"You can't assign multiple values to the same key."
new_var[k] = v
# Make sure to fill out the nested dicts.
for k in unflatten_set:
new_var[k] = unflatten_var(new_var[k])
return new_var
new_variants = [unflatten_var(var) for var in flat_variants]
return new_variants
def run(self, thunk, num_cpu=1, data_dir=None, datestamp=False):
"""
Run each variant in the grid with function 'thunk'.
Note: 'thunk' must be either a callable function, or a string. If it is
a string, it must be the name of a parameter whose values are all
callable functions.
Uses ``call_experiment`` to actually launch each experiment, and gives
each variant a name using ``self.variant_name()``.
Maintenance note: the args for ExperimentGrid.run should track closely
to the args for call_experiment. However, ``seed`` is omitted because
we presume the user may add it as a parameter in the grid.
"""
# Print info about self.
self.print()
# Make the list of all variants.
variants = self.variants()
# Print variant names for the user.
var_names = set([self.variant_name(var) for var in variants])
var_names = sorted(list(var_names))
line = '='*DIV_LINE_WIDTH
preparing = colorize('Preparing to run the following experiments...',
color='green', bold=True)
joined_var_names = '\n'.join(var_names)
announcement = f"\n{preparing}\n\n{joined_var_names}\n\n{line}"
print(announcement)
if WAIT_BEFORE_LAUNCH > 0:
delay_msg = colorize(dedent("""
Launch delayed to give you a few seconds to review your experiments.
To customize or disable this behavior, change WAIT_BEFORE_LAUNCH in
spinup/user_config.py.
"""), color='cyan', bold=True)+line
print(delay_msg)
wait, steps = WAIT_BEFORE_LAUNCH, 100
prog_bar = trange(steps, desc='Launching in...',
leave=False, ncols=DIV_LINE_WIDTH,
mininterval=0.25,
bar_format='{desc}: {bar}| {remaining} {elapsed}')
for _ in prog_bar:
time.sleep(wait/steps)
# Run the variants.
for var in variants:
exp_name = self.variant_name(var)
# Figure out what the thunk is.
if isinstance(thunk, str):
# Assume one of the variant parameters has the same
# name as the string you passed for thunk, and that
# variant[thunk] is a valid callable function.
thunk_ = var[thunk]
del var[thunk]
else:
# Assume thunk is given as a function.
thunk_ = thunk
call_experiment(exp_name, thunk_, num_cpu=num_cpu,
data_dir=data_dir, datestamp=datestamp, **var)
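# Minimal usage sketch (illustrative; `my_algo` is a hypothetical trainer, not
# defined in this project file):
#   eg = ExperimentGrid(name='sar-bench')
#   eg.add('env_name', 'CartPole-v1', '', True)
#   eg.add('seed', [0, 10])
#   eg.add('ac_kwargs:hidden_sizes', [(32,), (64, 64)], 'hid')
#   eg.run(my_algo, num_cpu=1)
# This launches 2*2 = 4 experiments, one call_experiment per (seed, hidden_sizes) pair.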
def test_eg():
eg = ExperimentGrid()
eg.add('test:a', [1,2,3], 'ta', True)
eg.add('test:b', [1,2,3])
eg.add('some', [4,5])
eg.add('why', [True,False])
eg.add('huh', 5)
eg.add('no', 6, in_name=True)
return eg.variants()
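# Expected shape of test_eg's output (illustrative): 3*3*2*2*1*1 = 36 variants,
# each a nested dict such as
#   {'test': {'a': 1, 'b': 1}, 'some': 4, 'why': True, 'huh': 5, 'no': 6}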
import json
def convert_json(obj):
""" Convert obj to a version which can be serialized with JSON. """
if is_json_serializable(obj):
return obj
else:
if isinstance(obj, dict):
return {convert_json(k): convert_json(v)
for k,v in obj.items()}
elif isinstance(obj, tuple):
return tuple(convert_json(x) for x in obj)
elif isinstance(obj, list):
return [convert_json(x) for x in obj]
elif hasattr(obj,'__name__') and not('lambda' in obj.__name__):
return convert_json(obj.__name__)
elif hasattr(obj,'__dict__') and obj.__dict__:
obj_dict = {convert_json(k): convert_json(v)
for k,v in obj.__dict__.items()}
return {str(obj): obj_dict}
return str(obj)
def is_json_serializable(v):
try:
json.dumps(v)
return True
except:
return False
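# Illustrative behaviour: values json can already serialize pass through unchanged,
# callables are replaced by their __name__, and anything not otherwise handled
# falls back to str(), e.g.
#   convert_json({'act': len, 'sizes': (64, 64)}) == {'act': 'len', 'sizes': (64, 64)}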
import os
import os.path as osp
# Default neural network backend for each algo
# (Must be either 'tf1' or 'pytorch')
DEFAULT_BACKEND = {
'vpg': 'pytorch',
'trpo': 'tf1',
'ppo': 'pytorch',
'ddpg': 'pytorch',
'td3': 'pytorch',
'sac': 'pytorch'
}
# Where experiment outputs are saved by default:
DEFAULT_DATA_DIR = osp.join(osp.abspath(osp.dirname(osp.dirname(__file__))),'data')
# Whether to automatically insert a date and time stamp into the names of
# save directories:
FORCE_DATESTAMP = False
# Whether GridSearch provides automatically-generated default shorthands:
DEFAULT_SHORTHAND = True
# Tells the GridSearch how many seconds to pause for before launching
# experiments.
WAIT_BEFORE_LAUNCH = 5
import torch
import numpy as np
from . import core
from .ppo_utils.logx import EpochLogger
from .ppo_utils.mpi_pytorch import setup_pytorch_for_mpi, sync_params, mpi_avg_grads
from .ppo_utils.mpi_tools import mpi_fork, mpi_avg, proc_id, mpi_statistics_scalar, num_procs
class TD3Buffer(object):
def __init__(self, state_dim, action_dim, max_size=int(1e6)):
self.max_size = max_size
self.ptr = 0
self.size = 0
# print(state_dim,action_dim)
self.state = np.zeros((max_size, state_dim))
self.action = np.zeros((max_size, action_dim))
self.next_state = np.zeros((max_size, state_dim))
self.reward = np.zeros((max_size, 1))
self.cost = np.zeros((max_size, 1))
self.not_done = np.zeros((max_size, 1))
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def add(self, state, action, next_state, reward, cost, done):
self.state[self.ptr] = state
self.action[self.ptr] = action
self.next_state[self.ptr] = next_state
self.reward[self.ptr] = reward
self.cost[self.ptr] = cost
self.not_done[self.ptr] = 1. - done
self.ptr = (self.ptr + 1) % self.max_size
self.size = min(self.size + 1, self.max_size)
def sample(self, batch_size):
ind = np.random.randint(0, self.size, size=batch_size)
return (
torch.FloatTensor(self.state[ind]).to(self.device),
torch.FloatTensor(self.action[ind]).to(self.device),
torch.FloatTensor(self.next_state[ind]).to(self.device),
torch.FloatTensor(self.reward[ind]).to(self.device),
torch.FloatTensor(self.cost[ind]).to(self.device),
torch.FloatTensor(self.not_done[ind]).to(self.device)
)
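# Minimal usage sketch (illustrative shapes only):
#   buf = TD3Buffer(state_dim=2, action_dim=2, max_size=int(1e5))
#   buf.add(s, a, s2, r, c, done)                        # one transition at a time
#   s, a, s2, r, c, not_done = buf.sample(batch_size=256)
# sample() draws indices uniformly from the filled portion of the buffer and
# returns FloatTensors already placed on self.device.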
class PPOBuffer:
"""
A buffer for storing trajectories experienced by a PPO agent interacting
with the environment, and using Generalized Advantage Estimation (GAE-Lambda)
for calculating the advantages of state-action pairs.
"""
def __init__(self, obs_dim, act_dim, size, gamma=0.99, lam=0.95):
self.obs_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
self.act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=np.float32)
self.old_act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=np.float32)
self.adv_buf = np.zeros(size, dtype=np.float32)
self.rew_buf = np.zeros(size, dtype=np.float32)
self.ret_buf = np.zeros(size, dtype=np.float32)
self.val_buf = np.zeros(size, dtype=np.float32)
self.cost_adv_buf = np.zeros(size, dtype=np.float32)
self.cost_buf = np.zeros(size, dtype=np.float32)
self.cost_ret_buf = np.zeros(size, dtype=np.float32)
self.cost_val_buf = np.zeros(size, dtype=np.float32)
self.logp_buf = np.zeros(size, dtype=np.float32)
self.gamma, self.lam = gamma, lam
self.ptr, self.path_start_idx, self.max_size = 0, 0, size
def store(self, obs, act, rew, cost, val, cost_val, logp, old_act):
"""
Append one timestep of agent-environment interaction to the buffer.
"""
assert self.ptr < self.max_size # buffer has to have room so you can store
self.obs_buf[self.ptr] = obs
self.act_buf[self.ptr] = act
self.old_act_buf[self.ptr] = old_act
self.rew_buf[self.ptr] = rew
self.val_buf[self.ptr] = val
self.cost_buf[self.ptr] = cost
self.cost_val_buf[self.ptr] = cost_val
self.logp_buf[self.ptr] = logp
self.ptr += 1
def finish_path(self, last_val=0, last_cost_val=0):
"""
Call this at the end of a trajectory, or when one gets cut off
by an epoch ending. This looks back in the buffer to where the
trajectory started, and uses rewards and value estimates from
the whole trajectory to compute advantage estimates with GAE-Lambda,
as well as compute the rewards-to-go for each state, to use as
the targets for the value function.
The "last_val" argument should be 0 if the trajectory ended
because the agent reached a terminal state (died), and otherwise
should be V(s_T), the value function estimated for the last state.
This allows us to bootstrap the reward-to-go calculation to account
for timesteps beyond the arbitrary episode horizon (or epoch cutoff).
"""
path_slice = slice(self.path_start_idx, self.ptr)
rews = np.append(self.rew_buf[path_slice], last_val)
vals = np.append(self.val_buf[path_slice], last_val)
# the next two lines implement GAE-Lambda advantage calculation
deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
self.adv_buf[path_slice] = core.discount_cumsum(deltas, self.gamma * self.lam)
# the next line computes rewards-to-go, to be targets for the value function
self.ret_buf[path_slice] = core.discount_cumsum(rews, self.gamma)[:-1]
costs = np.append(self.cost_buf[path_slice], last_cost_val)
cost_vals = np.append(self.cost_val_buf[path_slice], last_cost_val)
# the next two lines implement GAE-Lambda advantage calculation
cost_deltas = costs[:-1] + self.gamma * cost_vals[1:] - cost_vals[:-1]
self.cost_adv_buf[path_slice] = core.discount_cumsum(cost_deltas, self.gamma * self.lam)
# the next line computes rewards-to-go, to be targets for the value function
self.cost_ret_buf[path_slice] = core.discount_cumsum(costs, self.gamma)[:-1]
self.path_start_idx = self.ptr
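# Worked sketch (illustrative numbers): with gamma=1.0, lam=1.0, rewards [1, 1],
# value estimates [0.5, 0.5], and last_val=0:
#   deltas = [1 + 0.5 - 0.5, 1 + 0 - 0.5] = [1.0, 0.5]
#   adv    = discount_cumsum(deltas, 1.0)         -> [1.5, 0.5]
#   ret    = discount_cumsum([1, 1, 0], 1.0)[:-1] -> [2.0, 1.0]
# The cost advantages and cost returns above follow the same recipe with the cost
# signal in place of the reward; normalization happens later, in get().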
def get(self):
"""
Call this at the end of an epoch to get all of the data from
the buffer, with advantages appropriately normalized (shifted to have
mean zero and std one). Also, resets some pointers in the buffer.
"""
assert self.ptr == self.max_size # buffer has to be full before you can get
self.ptr, self.path_start_idx = 0, 0
# the next two lines implement the advantage normalization trick
adv_mean, adv_std = mpi_statistics_scalar(self.adv_buf)
self.adv_buf = (self.adv_buf - adv_mean) / adv_std
cost_adv_mean, cost_adv_std = mpi_statistics_scalar(self.cost_adv_buf)
self.cost_adv_buf = (self.cost_adv_buf - cost_adv_mean) / cost_adv_std
data = dict(obs=self.obs_buf, act=self.act_buf,old_act=self.old_act_buf, ret=self.ret_buf,
adv=self.adv_buf, cost_ret = self.cost_ret_buf, cost_adv=self.cost_adv_buf, logp=self.logp_buf)
return {k: torch.as_tensor(v, dtype=torch.float32) for k,v in data.items()}
import gym
from gym import spaces, Env
import pygame
import numpy as np
import random
MAX_STEPS_PER_EPISODE = 100
MAX_FORWARD_REWARD_THRESHOLD = 0.5
#reward scaling
SEARCH_SCALE = 1.0
FORWARD_SCALE = 0.1
class GridWorldSAR(gym.Env):
metadata = {"render_modes": ["human", "rgb_array"], "render_fps": 4}
@@ -13,9 +20,14 @@ class GridWorldSAR(gym.Env):
self.window_size = 512 # The size of the PyGame window
#Values the agent will need
self.pos_x = 0.0 #Current position, update w/ velocity (Might need to make this the pos in 1200x1200 path coordinates?)
self.pos_y = 0.0
self.cell = [0,0]
self.visited = np.zeros((map_size,map_size),dtype=bool)
self.heatmap = np.load('./deeprl_data/lpm/lpm4_prob.npy')
self.ringmap = np.load('./deeprl_data/ring/ringlowres_prob.npy')
#self.heatmap = np.load('./deeprl_data/test/test_prob.npy')
#self.heatmap = np.load('./deeprl_data/ring/ringlowres_prop.npy')
# Observations are dictionaries with the agent's and the target's location.
# Each location is encoded as an element of {0, ..., `size`}^2, i.e. MultiDiscrete([size, size]).
@@ -23,14 +35,16 @@ class GridWorldSAR(gym.Env):
#Pos = agent's current coordinates
#risk = search risk. Agent can see in a square area around themselves
#visited = which spaces have been surveyed. Will be needed to get paths for multiple drones
#So this is kinda model based RL..?
self.observation_space = spaces.Dict(
{
"pos": spaces.Box(0, self.size -1, shape=(2,),dtype=float),
"risk": spaces.Box(0, 1, shape=(self.size*self.size,), dtype=float),
"visited": spaces.Box(0, 1, shape=(self.size*self.size,), dtype=bool),
}
)
#Might need to take the risk/visited obs out. Let's see if it runs
self.observation_space = spaces.Box(low=0, high=self.size-1, shape=(2,),dtype=int)
#Continuous velocity. Keep it at 1 so agent can't skip over cells to end episode early
self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(2,), dtype=float)
@@ -50,6 +64,146 @@ class GridWorldSAR(gym.Env):
self.window = None
self.clock = None
def reset(self, seed=None, options=None):
if seed is not None:
np.random.seed(seed)
random.seed(seed)
#Move agent back to start
self.pos_x = 0.0
self.pos_y = 23.0
self.cell = [0,23]
self.current_step = 0
self.done = False
self.visited = np.zeros((self.size,self.size),dtype=bool)
observation = self._get_obs()
info = self._get_info()
return observation, info
#let's see if we really need to observe risk and visited?
def _get_obs(self):
return np.array([self.pos_x,self.pos_y])
#return {'pos': self.cell, "risk": self.heatmap, "visited": self.visited}
def _get_info(self):
#print("Location = %.3f %.3f" %(self.pos_x,self.pos_y))
#Determine cost for CPO
#hacky but make sure indicies are okay
if(self.cell[0] < 0):
self.cell[0] = 0
if(self.cell[0] >= self.size):
self.cell[0] = self.size-1
if(self.cell[1] < 0):
self.cell[1] = 0
if(self.cell[1] >= self.size):
self.cell[1] = self.size-1
map_diff = abs(self.heatmap[self.cell[0],self.cell[1]] - self.ringmap[self.cell[0],self.cell[1]])
return {'loc': [self.pos_x,self.pos_y], 'cost': map_diff}
def step(self, action):
terminated = self.done
truncated = self.done
#Move agent to new position
self.pos_x = self.pos_x + action[0]
self.pos_y = self.pos_y + action[1]
self.cell = np.array([np.int32(self.pos_x),np.int32(self.pos_y)])
#Check if agent has reached edge
if(self.pos_x < 0 or self.pos_x >= self.size or self.pos_y < 0 or self.pos_y >= self.size):
self.done = True
terminated = True
reward_search = 0
#Add reward for reaching end??
#Calculate reward and update visit map; already-visited cells give no search reward
elif(not self.visited[self.cell[0],self.cell[1]]):
reward_search = self.heatmap[self.cell[0],self.cell[1]]
self.visited[self.cell[0],self.cell[1]] = True
else:
reward_search = 0
reward_forward = np.min([action[0], MAX_FORWARD_REWARD_THRESHOLD])
#End episode after step max or agent exits the environment
if(self.current_step == MAX_STEPS_PER_EPISODE):
self.done = True
truncated = True
#Update step counter
self.current_step += 1
reward = SEARCH_SCALE*reward_search + FORWARD_SCALE*reward_forward
#print(reward)
observation = self._get_obs()
info = self._get_info()
return observation, reward, terminated, info
def render(self):
if self.render_mode == "rgb_array":
return self._render_frame()
def _render_frame(self):
if self.window is None and self.render_mode == "human":
pygame.init()
pygame.display.init()
self.window = pygame.display.set_mode((self.window_size, self.window_size))
if self.clock is None and self.render_mode == "human":
self.clock = pygame.time.Clock()
canvas = pygame.Surface((self.window_size, self.window_size))
canvas.fill((255, 255, 255))
pix_square_size = (
self.window_size / self.size
) # The size of a single grid square in pixels
# Now we draw the agent
pygame.draw.circle(
canvas,
(0, 0, 255),
(self.cell + 0.5) * pix_square_size,
pix_square_size / 3,
)
# Finally, add some gridlines
for x in range(self.size + 1):
pygame.draw.line(
canvas,
0,
(0, pix_square_size * x),
(self.window_size, pix_square_size * x),
width=3,
)
pygame.draw.line(
canvas,
0,
(pix_square_size * x, 0),
(pix_square_size * x, self.window_size),
width=3,
)
if self.render_mode == "human":
# The following line copies our drawings from `canvas` to the visible window
self.window.blit(canvas, canvas.get_rect())
pygame.event.pump()
pygame.display.update()
# We need to ensure that human-rendering occurs at the predefined framerate.
# The following line will automatically add a delay to keep the framerate stable.
self.clock.tick(self.metadata["render_fps"])
else: # rgb_array
return np.transpose(
np.array(pygame.surfarray.pixels3d(canvas)), axes=(1, 0, 2)
)
def close(self):
if self.window is not None:
pygame.display.quit()
pygame.quit()
import math
import numpy as np
import torch
def normal_entropy(std):
var = std.pow(2)
entropy = 0.5 + 0.5 * torch.log(2 * var * math.pi)
return entropy.sum(1, keepdim=True)
def normal_log_density(x, mean, log_std, std):
var = std.pow(2)
log_density = -(x - mean).pow(2) / (
2 * var) - 0.5 * math.log(2 * math.pi) - log_std
return log_density.sum(1, keepdim=True)
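# Illustrative sanity check (added for clarity, not part of the original file):
# normal_log_density should agree with torch's Normal.log_prob summed over the
# action dimension.
def _check_normal_log_density():
    from torch.distributions.normal import Normal
    mean = torch.zeros(4, 2)
    log_std = torch.full((4, 2), -0.5)
    std = log_std.exp()
    x = torch.randn(4, 2)
    ours = normal_log_density(x, mean, log_std, std)
    ref = Normal(mean, std).log_prob(x).sum(1, keepdim=True)
    assert torch.allclose(ours, ref, atol=1e-5)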
def get_flat_params_from(model):
params = []
for param in model.parameters():
params.append(param.data.view(-1))
flat_params = torch.cat(params)
return flat_params
def set_flat_params_to(model, flat_params):
prev_ind = 0
for param in model.parameters():
flat_size = int(np.prod(list(param.size())))
param.data.copy_(
flat_params[prev_ind:prev_ind + flat_size].view(param.size()))
prev_ind += flat_size
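# Illustrative round-trip (hypothetical check, not in the original file): flattening
# a model's parameters and writing the same vector back should leave the model
# unchanged, which is how TRPO-style line searches use these helpers.
def _check_flat_param_roundtrip():
    import torch.nn as nn
    model = nn.Linear(3, 2)
    flat = get_flat_params_from(model)
    set_flat_params_to(model, flat.clone())
    assert torch.equal(get_flat_params_from(model), flat)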
def get_flat_grad_from(net, grad_grad=False):
grads = []
for param in net.parameters():
if grad_grad:
grads.append(param.grad.grad.view(-1))
else:
grads.append(param.grad.view(-1))
flat_grad = torch.cat(grads)
return flat_grad
@@ -22,6 +22,11 @@ import datetime as dt
import time, os
import pdb
from sargym import sar_gym, cpo, core
import cProfile, pstats, io
#Relative filepaths (Assumes this and ags_grabber projects are in same parent directory)
kentland_heatmap = './LP model/analysis/outputs/ic_1_con_hiker_t12_kentland.csv'
#kentland_heatmap = './LP model/analysis/outputs/ic_1_con_hiker_t4.csv' #I like 4?
@@ -159,26 +164,188 @@ def create_data():
del rgp, planner, mc
def testing():
#Run this function to create data
#create_data()
#Load robot paths from file
ring_prob = np.load('./deeprl_data/ring_prob.npy')
robot0_waypoints = np.load('./deeprl_data/robot0_waypoints.npy')
robot1_waypoints = np.load('./deeprl_data/robot1_waypoints.npy')
robot2_waypoints = np.load('./deeprl_data/robot2_waypoints.npy')
#Make Gym Environment
env = sar_gym.GridWorldSAR(render_mode='human')
obs, info = env.reset()
path = [info['loc']]
rews = []
done = False
while not done:
obs, reward, term, info = env.step([1,1])
done = term
path.append(info['loc'])
rews.append(reward)
fig, (ax1, ax2) = plt.subplots(1, 2)
# Scatter plot of path and track in the first subplot
ax1.scatter(x=np.array(path)[:,0], y=np.array(path)[:,1], label='Path')
ax1.set_xlim([0,48])
ax1.set_ylim([0,48])
ax1.set_xlabel('X')
ax1.set_ylabel('Y')
ax1.set_title('Path and Track')
ax1.legend()
# Plot of rewards in the second subplot
ax2.plot(rews)
ax2.set_xlabel('Step')
ax2.set_ylabel('Reward')
ax2.set_title('Rewards')
plt.tight_layout()
plt.show()
return
COST_THRESHOLD = 50 #sanity check to make sure things are working
MAX_STEPS_PER_EPISODE = 100
def train():
#Run this function to create data
#create_data()
#Training CPO, wrapped in cProfile so the hot spots of the run can be inspected
profile = True
pr = cProfile.Profile()
if profile:
pr.enable()
try:
J = cpo.cpo(
lambda: sar_gym.GridWorldSAR(render_mode=None),
actor_critic=core.MLPActorCriticTD3trust,
max_ep_len=MAX_STEPS_PER_EPISODE,
cost_lim=COST_THRESHOLD,
epochs=200,
steps_per_epoch=8000
)
#175
except KeyboardInterrupt:
print("\nKeyboard interrupt received. Printing stats...")
raise
finally:
if profile:
pr.disable()
s = io.StringIO()
ps = pstats.Stats(pr, stream=s).sort_stats("tottime")
ps.print_stats(32)
print(s.getvalue())
#Save policy
torch.save(J.pi, './deeprl_data/trained_model4.pt')
#Testing
#Make Gym Environment
env = sar_gym.GridWorldSAR(render_mode=None)
#Test one episode
obs, info = env.reset()
path = []
rews = []
done = False
while not done:
with torch.no_grad():
tt = torch.as_tensor(obs,dtype=torch.float32)
actions = J.pi(tt)
obs, reward, term, info = env.step(actions.numpy())
done = term
path.append(info['loc'])
rews.append(reward)
fig, (ax1, ax2) = plt.subplots(1, 2)
# Scatter plot of path and track in the first subplot
ax1.scatter(x=np.array(path)[:,0], y=np.array(path)[:,1], label='Path')
ax1.set_xlim([0,48])
ax1.set_ylim([0,48])
ax1.set_xlabel('X')
ax1.set_ylabel('Y')
ax1.set_title('Path and Track')
ax1.legend()
# Plot of rewards in the second subplot
ax2.plot(rews)
ax2.set_xlabel('Step')
ax2.set_ylabel('Reward')
ax2.set_title('Rewards')
plt.tight_layout()
plt.show()
return
def test_policy():
#Testing
policy = torch.load('./deeprl_data/trained_model2.pt')
heatmap = np.load('./deeprl_data/lpm/lpm4_prob.npy')
ringmap = np.load('./deeprl_data/ring/ringlowres_prob.npy')
#Make Gym Environment
env = sar_gym.GridWorldSAR(render_mode=None)
#Test one episode
obs, info = env.reset()
path = []
rews = []
done = False
while not done:
with torch.no_grad():
tt = torch.as_tensor(obs,dtype=torch.float32)
actions = policy(tt)
obs, reward, term, info = env.step(actions.numpy())
done = term
path.append(info['loc'])
rews.append(reward)
fig, (ax1, ax2) = plt.subplots(1, 2)
# Scatter plot of path and track in the first subplot
ax1.scatter(x=np.array(path)[:,0], y=np.array(path)[:,1], label='Path')
ax1.imshow(heatmap)
ax1.set_xlim([0,48])
ax1.set_ylim([0,48])
ax1.set_xlabel('X')
ax1.set_ylabel('Y')
ax1.set_title('Lost Person Model')
ax1.legend()
# Scatter plot of path and track in the first subplot
ax2.scatter(x=np.array(path)[:,0], y=np.array(path)[:,1], label='Path')
ax2.imshow(ringmap)
ax2.set_xlim([0,48])
ax2.set_ylim([0,48])
ax2.set_xlabel('X')
ax2.set_ylabel('Y')
ax2.set_title('Ring Model')
ax2.legend()
# Plot of rewards in the second subplot
#ax2.plot(rews)
#ax2.set_xlabel('Step')
#ax2.set_ylabel('Reward')
#ax2.set_title('Rewards')
plt.tight_layout()
plt.suptitle("LPM Reward Function with Ring Model Constraint")
plt.show()
def main():
#test_policy()
train()
return