Commit b8fe4105 authored by Bryson Howell

Initial version of DeepRL agents, results from final project

parent e1472177
import numpy as np
import scipy.signal
from gym.spaces import Box, Discrete
import torch
import torch.nn as nn
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical
import torch.nn.functional as F
import copy
def combined_shape(length, shape=None):
if shape is None:
return (length,)
return (length, shape) if np.isscalar(shape) else (length, *shape)
def mlp(sizes, activation, output_activation=nn.Identity):
layers = []
for j in range(len(sizes)-1):
act = activation if j < len(sizes)-2 else output_activation
layers += [nn.Linear(sizes[j], sizes[j+1]), act()]
return nn.Sequential(*layers)
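# Hedged example (illustrative; the sizes are arbitrary): mlp([4, 64, 64, 2], nn.Tanh)
# returns nn.Sequential(Linear(4,64), Tanh(), Linear(64,64), Tanh(), Linear(64,2), Identity()).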
def count_vars(module):
return sum([np.prod(p.shape) for p in module.parameters()])
def discount_cumsum(x, discount):
"""
magic from rllab for computing discounted cumulative sums of vectors.
input:
vector x,
[x0,
x1,
x2]
output:
[x0 + discount * x1 + discount^2 * x2,
x1 + discount * x2,
x2]
"""
return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]
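# Illustrative reference only (assumption: this helper is not used by the agents below).
# It spells out what the lfilter call computes for a 1-D input; for x = [x0, x1, x2] and
# discount d, both versions return [x0 + d*x1 + d^2*x2, x1 + d*x2, x2].
def _discount_cumsum_naive(x, discount):
    out = np.zeros(len(x), dtype=np.float64)
    running = 0.0
    for i in reversed(range(len(x))):
        running = x[i] + discount * running  # backward recursion: c_t = x_t + d * c_{t+1}
        out[i] = running
    return out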
class Actor(nn.Module):
def _distribution(self, obs):
raise NotImplementedError
def _log_prob_from_distribution(self, pi, act):
raise NotImplementedError
def forward(self, obs, act=None):
# Produce action distributions for given observations, and
# optionally compute the log likelihood of given actions under
# those distributions.
pi = self._distribution(obs)
logp_a = None
if act is not None:
logp_a = self._log_prob_from_distribution(pi, act)
return pi, logp_a
class MLPDeterministicActor(nn.Module):
def __init__(self, state_dim, action_dim, max_action,discount_factor=0.99):
super(MLPDeterministicActor, self).__init__()
self.l1 = nn.Linear(state_dim, 256)
self.l2 = nn.Linear(256, 256)
self.l3 = nn.Linear(256, action_dim)
self.max_action = max_action
self.action_dim=action_dim
self.start_state = None
def forward(self, state,safety_switch=False,debug = False, noisy=False):
a = F.relu(self.l1(state))
a = F.relu(self.l2(a))
return self.max_action * torch.tanh(self.l3(a))
class MLPCategoricalActor(Actor):
def __init__(self, obs_dim, act_dim, hidden_sizes, activation):
super().__init__()
self.logits_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation)
def _distribution(self, obs):
logits = self.logits_net(obs)
return Categorical(logits=logits)
def _log_prob_from_distribution(self, pi, act):
return pi.log_prob(act)
class MLPGaussianActor(Actor):
def __init__(self, obs_dim, act_dim, hidden_sizes, activation):
super().__init__()
log_std = -0.5 * np.ones(act_dim, dtype=np.float32)
self.log_std = torch.nn.Parameter(torch.as_tensor(log_std))
self.mu_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation)
def _distribution(self, obs):
mu = self.mu_net(obs)
std = torch.exp(self.log_std)
return Normal(mu, std)
def mean(self, obs):
mu = self.mu_net(obs)
return mu
def _log_prob_from_distribution(self, pi, act):
return pi.log_prob(act).sum(axis=-1) # Last axis sum needed for Torch Normal distribution
class MLPCritic(nn.Module):
def __init__(self, obs_dim, hidden_sizes, activation):
super().__init__()
self.v_net = mlp([obs_dim] + list(hidden_sizes) + [1], activation)
def forward(self, obs):
return torch.squeeze(self.v_net(obs), -1) # Critical to ensure v has right shape.
class MLPActorCriticTD3trust(nn.Module):
def __init__(self, observation_space, action_space,
hidden_sizes=(64,64), activation=nn.Tanh):
super().__init__()
obs_dim = observation_space.shape[0]
act_dim = action_space.shape[0]
self.obs_dim = obs_dim
self.act_dim = act_dim
# policy builder depends on action space
self.pi = MLPDeterministicActor(obs_dim, action_space.shape[0],action_space.high[0])
# build value function
self.Qv1 = MLPCritic(obs_dim+act_dim, hidden_sizes, activation)
self.Qv2 = MLPCritic(obs_dim+act_dim, hidden_sizes, activation)
self.Qj1 = MLPCritic(obs_dim+act_dim, hidden_sizes, activation)
self.Qj2 = MLPCritic(obs_dim+act_dim, hidden_sizes, activation)
self.baseline_Qj = copy.deepcopy(self.Qj1)
self.baseline_pi = copy.deepcopy(self.pi)
self.pi_mix = copy.deepcopy(self.pi)
self.epsilon = 0
def step(self, obs):
a = self.pi(obs)
qv = self.Qv1(torch.cat((obs,a)))
qj = self.Qj1(torch.cat((obs,a)))
return a.detach().cpu().numpy(), qv.detach().cpu().numpy(), qj.detach().cpu().numpy(), 0
def act_pi(self, pi, obs):
a = pi(obs)
return a
def act(self, obs):
return self.step(obs)[0]
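# Hedged usage sketch (illustrative; the Box spaces below are hypothetical, not part of the
# project): the TD3-style actor-critic takes a flat 1-D observation and returns the
# deterministic action plus the reward-critic (Qv1) and cost-critic (Qj1) estimates.
def _example_td3trust_step():
    obs_space = Box(low=-1.0, high=1.0, shape=(4,), dtype=np.float32)
    act_space = Box(low=-1.0, high=1.0, shape=(2,), dtype=np.float32)
    ac = MLPActorCriticTD3trust(obs_space, act_space)
    obs = torch.as_tensor(obs_space.sample(), dtype=torch.float32)
    a, qv, qj, logp = ac.step(obs)  # numpy arrays for a, qv, qj; logp is the placeholder 0
    return a, qv, qj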
class MLPActorCritic(nn.Module):
def __init__(self, observation_space, action_space,
hidden_sizes=(64,64), activation=nn.Tanh):
super().__init__()
obs_dim = observation_space.shape[0]
# policy builder depends on action space
if isinstance(action_space, Box):
self.pi = MLPGaussianActor(obs_dim, action_space.shape[0], hidden_sizes, activation)
elif isinstance(action_space, Discrete):
self.pi = MLPCategoricalActor(obs_dim, action_space.n, hidden_sizes, activation)
# build value function
self.v = MLPCritic(obs_dim, hidden_sizes, activation)
def step(self, obs):
with torch.no_grad():
pi = self.pi._distribution(obs)
a = pi.sample()
logp_a = self.pi._log_prob_from_distribution(pi, a)
v = self.v(obs)
return a.numpy(), v.numpy(), logp_a.numpy()
def act(self, obs):
return self.step(obs)[0]
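# Hedged usage sketch (illustrative; the spaces are hypothetical): with a Discrete action
# space the policy head is an MLPCategoricalActor, and step() samples an integer action
# along with the value estimate and the log-probability of that action.
def _example_categorical_actor_critic():
    obs_space = Box(low=-1.0, high=1.0, shape=(4,), dtype=np.float32)
    act_space = Discrete(3)
    ac = MLPActorCritic(obs_space, act_space)
    a, v, logp = ac.step(torch.as_tensor(obs_space.sample(), dtype=torch.float32))
    return a, v, logp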
class MLPActorCriticCost(nn.Module):
def __init__(self, observation_space, action_space,
hidden_sizes=(64,64), activation=nn.Tanh):
super().__init__()
obs_dim = observation_space.shape[0]
# policy builder depends on action space
if isinstance(action_space, Box):
self.pi = MLPGaussianActor(obs_dim, action_space.shape[0], hidden_sizes, activation)
elif isinstance(action_space, Discrete):
self.pi = MLPCategoricalActor(obs_dim, action_space.n, hidden_sizes, activation)
# build value function
self.v = MLPCritic(obs_dim, hidden_sizes, activation)
self.j = MLPCritic(obs_dim, hidden_sizes, activation)
def step(self, obs):
with torch.no_grad():
pi = self.pi._distribution(obs)
a = pi.sample()
logp_a = self.pi._log_prob_from_distribution(pi, a)
v = self.v(obs)
j = self.j(obs)
return a.numpy(), v.numpy(), j.numpy(), logp_a.numpy()
def act(self, obs):
return self.step(obs)[0]
import multiprocessing
import numpy as np
import os
import torch
from mpi4py import MPI
from .mpi_tools import broadcast, mpi_avg, num_procs, proc_id
def setup_pytorch_for_mpi():
"""
Avoid slowdowns caused by each separate process's PyTorch using
more than its fair share of CPU resources.
"""
#print('Proc %d: Reporting original number of Torch threads as %d.'%(proc_id(), torch.get_num_threads()), flush=True)
if torch.get_num_threads()==1:
return
fair_num_threads = max(int(torch.get_num_threads() / num_procs()), 1)
torch.set_num_threads(fair_num_threads)
#print('Proc %d: Reporting new number of Torch threads as %d.'%(proc_id(), torch.get_num_threads()), flush=True)
def mpi_avg_grads(module):
""" Average contents of gradient buffers across MPI processes. """
if num_procs()==1:
return
for p in module.parameters():
p_grad_numpy = p.grad.numpy() # numpy view of tensor data
avg_p_grad = mpi_avg(p.grad)
p_grad_numpy[:] = avg_p_grad[:]
def sync_params(module):
""" Sync all parameters of module across all MPI processes. """
if num_procs()==1:
return
for p in module.parameters():
p_numpy = p.data.numpy()
broadcast(p_numpy)
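# Hedged usage sketch (illustrative; model, optimizer, and loss are hypothetical): the
# intended call pattern is setup_pytorch_for_mpi() and sync_params(model) once at startup,
# then gradient averaging across processes right before every optimizer step.
def _example_mpi_update(model, optimizer, loss):
    loss.backward()
    mpi_avg_grads(model)   # average gradient buffers over all MPI processes
    optimizer.step()
    optimizer.zero_grad()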
from mpi4py import MPI
import os, subprocess, sys
import numpy as np
import torch
def mpi_fork(n, bind_to_core=False):
"""
Re-launches the current script with workers linked by MPI.
Also, terminates the original process that launched it.
Taken almost without modification from the Baselines function of the
`same name`_.
.. _`same name`: https://github.com/openai/baselines/blob/master/baselines/common/mpi_fork.py
Args:
n (int): Number of processes to split into.
bind_to_core (bool): Bind each MPI process to a core.
"""
if n<=1:
return
if os.getenv("IN_MPI") is None:
env = os.environ.copy()
env.update(
MKL_NUM_THREADS="1",
OMP_NUM_THREADS="1",
IN_MPI="1"
)
args = ["mpirun", "-np", str(n)]
if bind_to_core:
args += ["-bind-to", "core"]
args += [sys.executable] + sys.argv
subprocess.check_call(args, env=env)
sys.exit()
def msg(m, string=''):
print(('Message from %d: %s \t '%(MPI.COMM_WORLD.Get_rank(), string))+str(m))
def proc_id():
"""Get rank of calling process."""
return MPI.COMM_WORLD.Get_rank()
def allreduce(*args, **kwargs):
return MPI.COMM_WORLD.Allreduce(*args, **kwargs)
def num_procs():
"""Count active MPI processes."""
return MPI.COMM_WORLD.Get_size()
def broadcast(x, root=0):
MPI.COMM_WORLD.Bcast(x, root=root)
def mpi_op(x, op):
x, scalar = ([x], True) if np.isscalar(x) else (x, False)
x = np.asarray(x, dtype=np.float32)
buff = np.zeros_like(x, dtype=np.float32)
allreduce(x, buff, op=op)
return buff[0] if scalar else buff
def mpi_sum(x):
return mpi_op(x, MPI.SUM)
def mpi_avg(x):
"""Average a scalar or vector over MPI processes."""
return mpi_sum(x) / num_procs()
def mpi_statistics_scalar(x, with_min_and_max=False):
"""
Get mean/std and optional min/max of scalar x across MPI processes.
Args:
x: An array containing samples of the scalar to produce statistics
for.
with_min_and_max (bool): If true, return min and max of x in
addition to mean and std.
"""
if isinstance(x, list):
for i, xx in enumerate(x):
if torch.is_tensor(xx):
x[i] = xx.detach()
x = np.array(x, dtype=np.float32)
global_sum, global_n = mpi_sum([np.sum(x), len(x)])
mean = global_sum / global_n
global_sum_sq = mpi_sum(np.sum((x - mean)**2))
std = np.sqrt(global_sum_sq / global_n) # compute global std
if with_min_and_max:
global_min = mpi_op(np.min(x) if len(x) > 0 else np.inf, op=MPI.MIN)
global_max = mpi_op(np.max(x) if len(x) > 0 else -np.inf, op=MPI.MAX)
return mean, std, global_min, global_max
return mean, std
import json
def convert_json(obj):
""" Convert obj to a version which can be serialized with JSON. """
if is_json_serializable(obj):
return obj
else:
if isinstance(obj, dict):
return {convert_json(k): convert_json(v)
for k,v in obj.items()}
elif isinstance(obj, tuple):
return tuple(convert_json(x) for x in obj)
elif isinstance(obj, list):
return [convert_json(x) for x in obj]
elif hasattr(obj,'__name__') and not('lambda' in obj.__name__):
return convert_json(obj.__name__)
elif hasattr(obj,'__dict__') and obj.__dict__:
obj_dict = {convert_json(k): convert_json(v)
for k,v in obj.__dict__.items()}
return {str(obj): obj_dict}
return str(obj)
def is_json_serializable(v):
try:
json.dumps(v)
return True
except:
return False
import os
import os.path as osp
# Default neural network backend for each algo
# (Must be either 'tf1' or 'pytorch')
DEFAULT_BACKEND = {
'vpg': 'pytorch',
'trpo': 'tf1',
'ppo': 'pytorch',
'ddpg': 'pytorch',
'td3': 'pytorch',
'sac': 'pytorch'
}
# Where experiment outputs are saved by default:
DEFAULT_DATA_DIR = osp.join(osp.abspath(osp.dirname(osp.dirname(__file__))),'data')
# Whether to automatically insert a date and time stamp into the names of
# save directories:
FORCE_DATESTAMP = False
# Whether GridSearch provides automatically-generated default shorthands:
DEFAULT_SHORTHAND = True
# Tells the GridSearch how many seconds to pause for before launching
# experiments.
WAIT_BEFORE_LAUNCH = 5
import torch
import numpy as np
from . import core
from .ppo_utils.logx import EpochLogger
from .ppo_utils.mpi_pytorch import setup_pytorch_for_mpi, sync_params, mpi_avg_grads
from .ppo_utils.mpi_tools import mpi_fork, mpi_avg, proc_id, mpi_statistics_scalar, num_procs
class TD3Buffer(object):
def __init__(self, state_dim, action_dim, max_size=int(1e6)):
self.max_size = max_size
self.ptr = 0
self.size = 0
# print(state_dim,action_dim)
self.state = np.zeros((max_size, state_dim))
self.action = np.zeros((max_size, action_dim))
self.next_state = np.zeros((max_size, state_dim))
self.reward = np.zeros((max_size, 1))
self.cost = np.zeros((max_size, 1))
self.not_done = np.zeros((max_size, 1))
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def add(self, state, action, next_state, reward, cost, done):
self.state[self.ptr] = state
self.action[self.ptr] = action
self.next_state[self.ptr] = next_state
self.reward[self.ptr] = reward
self.cost[self.ptr] = cost
self.not_done[self.ptr] = 1. - done
self.ptr = (self.ptr + 1) % self.max_size
self.size = min(self.size + 1, self.max_size)
def sample(self, batch_size):
ind = np.random.randint(0, self.size, size=batch_size)
return (
torch.FloatTensor(self.state[ind]).to(self.device),
torch.FloatTensor(self.action[ind]).to(self.device),
torch.FloatTensor(self.next_state[ind]).to(self.device),
torch.FloatTensor(self.reward[ind]).to(self.device),
torch.FloatTensor(self.cost[ind]).to(self.device),
torch.FloatTensor(self.not_done[ind]).to(self.device)
)
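# Hedged usage sketch (illustrative; the dimensions are arbitrary): store one transition
# and sample a minibatch of tensors already moved to the buffer's device.
def _example_td3_buffer():
    buf = TD3Buffer(state_dim=4, action_dim=2, max_size=10)
    buf.add(np.zeros(4), np.zeros(2), np.ones(4), reward=1.0, cost=0.0, done=False)
    state, action, next_state, reward, cost, not_done = buf.sample(batch_size=1)
    return state.shape, action.shape  # torch.Size([1, 4]), torch.Size([1, 2])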
class PPOBuffer:
"""
A buffer for storing trajectories experienced by a PPO agent interacting
with the environment, and using Generalized Advantage Estimation (GAE-Lambda)
for calculating the advantages of state-action pairs.
"""
def __init__(self, obs_dim, act_dim, size, gamma=0.99, lam=0.95):
self.obs_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
self.act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=np.float32)
self.old_act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=np.float32)
self.adv_buf = np.zeros(size, dtype=np.float32)
self.rew_buf = np.zeros(size, dtype=np.float32)
self.ret_buf = np.zeros(size, dtype=np.float32)
self.val_buf = np.zeros(size, dtype=np.float32)
self.cost_adv_buf = np.zeros(size, dtype=np.float32)
self.cost_buf = np.zeros(size, dtype=np.float32)
self.cost_ret_buf = np.zeros(size, dtype=np.float32)
self.cost_val_buf = np.zeros(size, dtype=np.float32)
self.logp_buf = np.zeros(size, dtype=np.float32)
self.gamma, self.lam = gamma, lam
self.ptr, self.path_start_idx, self.max_size = 0, 0, size
def store(self, obs, act, rew, cost, val, cost_val, logp, old_act):
"""
Append one timestep of agent-environment interaction to the buffer.
"""
assert self.ptr < self.max_size # buffer has to have room so you can store
self.obs_buf[self.ptr] = obs
self.act_buf[self.ptr] = act
self.old_act_buf[self.ptr] = old_act
self.rew_buf[self.ptr] = rew
self.val_buf[self.ptr] = val
self.cost_buf[self.ptr] = cost
self.cost_val_buf[self.ptr] = cost_val
self.logp_buf[self.ptr] = logp
self.ptr += 1
def finish_path(self, last_val=0, last_cost_val=0):
"""
Call this at the end of a trajectory, or when one gets cut off
by an epoch ending. This looks back in the buffer to where the
trajectory started, and uses rewards and value estimates from
the whole trajectory to compute advantage estimates with GAE-Lambda,
as well as compute the rewards-to-go for each state, to use as
the targets for the value function.
The "last_val" argument should be 0 if the trajectory ended
because the agent reached a terminal state (died), and otherwise
should be V(s_T), the value function estimated for the last state.
This allows us to bootstrap the reward-to-go calculation to account
for timesteps beyond the arbitrary episode horizon (or epoch cutoff).
"""
path_slice = slice(self.path_start_idx, self.ptr)
rews = np.append(self.rew_buf[path_slice], last_val)
vals = np.append(self.val_buf[path_slice], last_val)
# the next two lines implement GAE-Lambda advantage calculation
deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
self.adv_buf[path_slice] = core.discount_cumsum(deltas, self.gamma * self.lam)
# the next line computes rewards-to-go, to be targets for the value function
self.ret_buf[path_slice] = core.discount_cumsum(rews, self.gamma)[:-1]
costs = np.append(self.cost_buf[path_slice], last_cost_val)
cost_vals = np.append(self.cost_val_buf[path_slice], last_cost_val)
# the next two lines implement GAE-Lambda advantage calculation
cost_deltas = costs[:-1] + self.gamma * cost_vals[1:] - cost_vals[:-1]
self.cost_adv_buf[path_slice] = core.discount_cumsum(cost_deltas, self.gamma * self.lam)
# the next line computes rewards-to-go, to be targets for the value function
self.cost_ret_buf[path_slice] = core.discount_cumsum(costs, self.gamma)[:-1]
self.path_start_idx = self.ptr
def get(self):
"""
Call this at the end of an epoch to get all of the data from
the buffer, with advantages appropriately normalized (shifted to have
mean zero and std one). Also, resets some pointers in the buffer.
"""
assert self.ptr == self.max_size # buffer has to be full before you can get
self.ptr, self.path_start_idx = 0, 0
# the next two lines implement the advantage normalization trick
adv_mean, adv_std = mpi_statistics_scalar(self.adv_buf)
self.adv_buf = (self.adv_buf - adv_mean) / adv_std
cost_adv_mean, cost_adv_std = mpi_statistics_scalar(self.cost_adv_buf)
self.cost_adv_buf = (self.cost_adv_buf - cost_adv_mean) / cost_adv_std
data = dict(obs=self.obs_buf, act=self.act_buf,old_act=self.old_act_buf, ret=self.ret_buf,
adv=self.adv_buf, cost_ret = self.cost_ret_buf, cost_adv=self.cost_adv_buf, logp=self.logp_buf)
return {k: torch.as_tensor(v, dtype=torch.float32) for k,v in data.items()}
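# Hedged usage sketch (illustrative; the numbers are arbitrary and nothing here is called by
# the training code): fill a tiny buffer with one 3-step episode, bootstrap the path with
# last_val=0, and pull the normalized batch as torch tensors.
def _example_ppo_buffer():
    buf = PPOBuffer(obs_dim=4, act_dim=2, size=3)
    for t in range(3):
        obs, act = np.zeros(4, dtype=np.float32), np.zeros(2, dtype=np.float32)
        buf.store(obs, act, rew=1.0, cost=0.1, val=0.5, cost_val=0.0, logp=-1.4, old_act=act)
    buf.finish_path(last_val=0, last_cost_val=0)
    data = buf.get()  # dict with obs, act, old_act, ret, adv, cost_ret, cost_adv, logp
    return data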
import gym
from gym import spaces, Env
import pygame
import numpy as np
import random
MAX_STEPS_PER_EPISODE = 100
MAX_FORWARD_REWARD_THRESHOLD = 0.5
#reward scaling
SEARCH_SCALE = 1.0
FORWARD_SCALE = 0.1
class GridWorldSAR(gym.Env):
    metadata = {"render_modes": ["human", "rgb_array"], "render_fps": 4}
@@ -13,9 +20,14 @@ class GridWorldSAR(gym.Env):
        self.window_size = 512  # The size of the PyGame window
        #Values the agent will need
        self.pos_x = 0.0 #Current position, update w/ velocity (Might need to make this the pos in 1200x1200 path coordinates?)
        self.pos_y = 0.0
        self.cell = [0,0]
self.visited = np.zeros((map_size,map_size),dtype=bool)
self.heatmap = np.load('./deeprl_data/lpm/lpm4_prob.npy')
self.ringmap = np.load('./deeprl_data/ring/ringlowres_prob.npy')
#self.heatmap = np.load('./deeprl_data/test/test_prob.npy')
#self.heatmap = np.load('./deeprl_data/ring/ringlowres_prop.npy')
        # Observations are dictionaries with the agent's and the target's location.
        # Each location is encoded as an element of {0, ..., `size`}^2, i.e. MultiDiscrete([size, size]).
@@ -23,14 +35,16 @@ class GridWorldSAR(gym.Env):
        #Pos = agent's current coordinates
        #risk = search risk. Agent can see in a square area around themselves
        #visited = which spaces have been surveyed. Will be needed to get paths for multiple drones
#So this is kinda model based RL..?
        self.observation_space = spaces.Dict(
            {
                "pos": spaces.Box(0, self.size - 1, shape=(2,), dtype=float),
                "risk": spaces.Box(0, 1, shape=(self.size*self.size,), dtype=float),
                "visited": spaces.Box(0, 1, shape=(self.size*self.size,), dtype=bool),
            }
        )
        #Might need to take the risk/visited obs out. Let's see if it runs
self.observation_space = spaces.Box(low=0, high=self.size-1, shape=(2,),dtype=int)
        #Continuous velocity. Keep it at 1 so agent can't skip over cells to end episode early
        self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(2,), dtype=float)
@@ -50,6 +64,146 @@ class GridWorldSAR(gym.Env):
        self.window = None
        self.clock = None
def reset(self, seed=None, options=None):
        if seed is not None:
np.random.seed(seed)
random.seed(seed)
#Move agent back to start
self.pos_x = 0.0
self.pos_y = 23.0
self.cell = [0,23]
self.current_step = 0
self.done = False
self.visited = np.zeros((self.size,self.size),dtype=bool)
observation = self._get_obs()
info = self._get_info()
        return observation, info
#let's see if we really need to observe risk and visited?
    def _get_obs(self):
        return np.array([self.pos_x, self.pos_y])
#return {'pos': self.cell, "risk": self.heatmap, "visited": self.visited}
def _get_info(self):
#print("Location = %.3f %.3f" %(self.pos_x,self.pos_y))
#Determine cost for CPO
        #Clamp the cell indices so they stay inside the map
        self.cell[0] = min(max(self.cell[0], 0), self.size - 1)
        self.cell[1] = min(max(self.cell[1], 0), self.size - 1)
map_diff = abs(self.heatmap[self.cell[0],self.cell[1]] - self.ringmap[self.cell[0],self.cell[1]])
return {'loc': [self.pos_x,self.pos_y], 'cost': map_diff}
def step(self, action):
terminated = self.done
truncated = self.done
#Move agent to new position
self.pos_x = self.pos_x + action[0]
self.pos_y = self.pos_y + action[1]
self.cell = np.array([np.int32(self.pos_x),np.int32(self.pos_y)])
        #Check if agent has reached the edge of the map
        reward_search = 0
        if(self.pos_x < 0 or self.pos_x >= self.size or self.pos_y < 0 or self.pos_y >= self.size):
            self.done = True
            terminated = True
            #Add reward for reaching end??
        #Calculate reward and update visit map (no search reward for revisited cells)
        elif(not self.visited[self.cell[0],self.cell[1]]):
            reward_search = self.heatmap[self.cell[0],self.cell[1]]
            self.visited[self.cell[0],self.cell[1]] = True
reward_forward = np.min([action[0], MAX_FORWARD_REWARD_THRESHOLD])
#End episode after step max or agent exits the environment
if(self.current_step == MAX_STEPS_PER_EPISODE):
self.done = True
truncated = True
#Update step counter
self.current_step += 1
reward = SEARCH_SCALE*reward_search + FORWARD_SCALE*reward_forward
#print(reward)
observation = self._get_obs()
info = self._get_info()
return observation, reward, terminated, info
def render(self):
if self.render_mode == "rgb_array":
return self._render_frame()
def _render_frame(self):
if self.window is None and self.render_mode == "human":
pygame.init()
pygame.display.init()
self.window = pygame.display.set_mode((self.window_size, self.window_size))
if self.clock is None and self.render_mode == "human":
self.clock = pygame.time.Clock()
canvas = pygame.Surface((self.window_size, self.window_size))
canvas.fill((255, 255, 255))
pix_square_size = (
self.window_size / self.size
) # The size of a single grid square in pixels
# Now we draw the agent
pygame.draw.circle(
canvas,
(0, 0, 255),
(self.cell + 0.5) * pix_square_size,
pix_square_size / 3,
)
# Finally, add some gridlines
for x in range(self.size + 1):
pygame.draw.line(
canvas,
0,
(0, pix_square_size * x),
(self.window_size, pix_square_size * x),
width=3,
)
pygame.draw.line(
canvas,
0,
(pix_square_size * x, 0),
(pix_square_size * x, self.window_size),
width=3,
)
if self.render_mode == "human":
# The following line copies our drawings from `canvas` to the visible window
self.window.blit(canvas, canvas.get_rect())
pygame.event.pump()
pygame.display.update()
# We need to ensure that human-rendering occurs at the predefined framerate.
# The following line will automatically add a delay to keep the framerate stable.
self.clock.tick(self.metadata["render_fps"])
else: # rgb_array
return np.transpose(
np.array(pygame.surfarray.pixels3d(canvas)), axes=(1, 0, 2)
)
def close(self):
if self.window is not None:
pygame.display.quit()
pygame.quit()
import math
import numpy as np
import torch
def normal_entropy(std):
var = std.pow(2)
entropy = 0.5 + 0.5 * torch.log(2 * var * math.pi)
return entropy.sum(1, keepdim=True)
def normal_log_density(x, mean, log_std, std):
var = std.pow(2)
log_density = -(x - mean).pow(2) / (
2 * var) - 0.5 * math.log(2 * math.pi) - log_std
return log_density.sum(1, keepdim=True)
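# Hedged sanity check (illustrative; not called anywhere in the project): the closed-form
# density above should agree with torch.distributions.Normal for a batch of diagonal Gaussians.
def _check_normal_log_density():
    from torch.distributions.normal import Normal
    mean = torch.zeros(5, 2)
    log_std = torch.full((5, 2), -0.5)
    std = log_std.exp()
    x = torch.randn(5, 2)
    ref = Normal(mean, std).log_prob(x).sum(1, keepdim=True)
    return torch.allclose(normal_log_density(x, mean, log_std, std), ref, atol=1e-5)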
def get_flat_params_from(model):
params = []
for param in model.parameters():
params.append(param.data.view(-1))
flat_params = torch.cat(params)
return flat_params
def set_flat_params_to(model, flat_params):
prev_ind = 0
for param in model.parameters():
flat_size = int(np.prod(list(param.size())))
param.data.copy_(
flat_params[prev_ind:prev_ind + flat_size].view(param.size()))
prev_ind += flat_size
def get_flat_grad_from(net, grad_grad=False):
grads = []
for param in net.parameters():
if grad_grad:
grads.append(param.grad.grad.view(-1))
else:
grads.append(param.grad.view(-1))
flat_grad = torch.cat(grads)
return flat_grad
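# Hedged usage sketch (illustrative; the Linear layers are hypothetical): parameters can be
# copied between identically-shaped models through the flat vector produced above.
def _example_flat_params_roundtrip():
    src, dst = torch.nn.Linear(3, 2), torch.nn.Linear(3, 2)
    flat = get_flat_params_from(src)   # 1-D tensor holding all weights and biases
    set_flat_params_to(dst, flat)      # dst now holds the same parameters as src
    return bool(torch.allclose(get_flat_params_from(dst), flat))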
@@ -22,6 +22,11 @@ import datetime as dt
import time, os
import pdb
from sargym import sar_gym, cpo, core
import cProfile, pstats, io
#Relative filepaths (Assumes this and ags_grabber projects are in same parent directory)
kentland_heatmap = './LP model/analysis/outputs/ic_1_con_hiker_t12_kentland.csv'
#kentland_heatmap = './LP model/analysis/outputs/ic_1_con_hiker_t4.csv' #I like 4?
@@ -159,26 +164,188 @@ def create_data():
    del rgp, planner, mc
def testing():
    #Run this function to create data
    #create_data()
#Load robot paths from file
ring_prob = np.load('./deeprl_data/ring_prob.npy')
robot0_waypoints = np.load('./deeprl_data/robot0_waypoints.npy')
robot1_waypoints = np.load('./deeprl_data/robot1_waypoints.npy')
robot2_waypoints = np.load('./deeprl_data/robot2_waypoints.npy')
    #Make Gym Environment
env = sar_gym.GridWorldSAR(render_mode='human')
obs, info = env.reset()
path = [info['loc']]
rews = []
done = False
while not done:
obs, reward, term, info = env.step([1,1])
done = term
path.append(info['loc'])
rews.append(reward)
fig, (ax1, ax2) = plt.subplots(1, 2)
# Scatter plot of path and track in the first subplot
ax1.scatter(x=np.array(path)[:,0], y=np.array(path)[:,1], label='Path')
ax1.set_xlim([0,48])
ax1.set_ylim([0,48])
ax1.set_xlabel('X')
ax1.set_ylabel('Y')
ax1.set_title('Path and Track')
ax1.legend()
# Plot of rewards in the second subplot
ax2.plot(rews)
ax2.set_xlabel('Step')
ax2.set_ylabel('Reward')
ax2.set_title('Rewards')
plt.tight_layout()
plt.show()
return
COST_THRESHOLD = 50 #sanity check to make sure things are working
MAX_STEPS_PER_EPISODE = 100
def train():
#Run this function to create data
#create_data()
    #Training CPO, profiled so stats are printed even if the run is interrupted
    pr = cProfile.Profile()
    pr.enable()
    try:
        J = cpo.cpo(
            lambda: sar_gym.GridWorldSAR(render_mode=None),
            actor_critic=core.MLPActorCriticTD3trust,
            max_ep_len=MAX_STEPS_PER_EPISODE,
            cost_lim=COST_THRESHOLD,
            epochs=200,
            steps_per_epoch=8000
        )
        #175
    except KeyboardInterrupt:
        print("\nKeyboard interrupt received. Printing stats...")
    finally:
        pr.disable()
        s = io.StringIO()
        ps = pstats.Stats(pr, stream=s).sort_stats("tottime")
        ps.print_stats(32)
        print(s.getvalue())
#Save policy
torch.save(J.pi, './deeprl_data/trained_model4.pt')
#Testing
#Make Gym Environment
env = sar_gym.GridWorldSAR(render_mode=None)
#Test one episode
    obs, info = env.reset()
path = []
rews = []
done = False
while not done:
with torch.no_grad():
tt = torch.as_tensor(obs,dtype=torch.float32)
actions = J.pi(tt)
obs, reward, term, info = env.step(actions.numpy())
done = term
path.append(info['loc'])
rews.append(reward)
fig, (ax1, ax2) = plt.subplots(1, 2)
# Scatter plot of path and track in the first subplot
ax1.scatter(x=np.array(path)[:,0], y=np.array(path)[:,1], label='Path')
ax1.set_xlim([0,48])
ax1.set_ylim([0,48])
ax1.set_xlabel('X')
ax1.set_ylabel('Y')
ax1.set_title('Path and Track')
ax1.legend()
# Plot of rewards in the second subplot
ax2.plot(rews)
ax2.set_xlabel('Step')
ax2.set_ylabel('Reward')
ax2.set_title('Rewards')
plt.tight_layout()
plt.show()
    return
def test_policy():
#Testing
policy = torch.load('./deeprl_data/trained_model2.pt')
heatmap = np.load('./deeprl_data/lpm/lpm4_prob.npy')
ringmap = np.load('./deeprl_data/ring/ringlowres_prob.npy')
#Make Gym Environment
env = sar_gym.GridWorldSAR(render_mode=None)
#Test one episode
    obs, info = env.reset()
path = []
rews = []
done = False
while not done:
with torch.no_grad():
tt = torch.as_tensor(obs,dtype=torch.float32)
actions = policy(tt)
obs, reward, term, info = env.step(actions.numpy())
done = term
path.append(info['loc'])
rews.append(reward)
fig, (ax1, ax2) = plt.subplots(1, 2)
# Scatter plot of path and track in the first subplot
ax1.scatter(x=np.array(path)[:,0], y=np.array(path)[:,1], label='Path')
ax1.imshow(heatmap)
ax1.set_xlim([0,48])
ax1.set_ylim([0,48])
ax1.set_xlabel('X')
ax1.set_ylabel('Y')
ax1.set_title('Lost Person Model')
ax1.legend()
# Scatter plot of path and track in the first subplot
ax2.scatter(x=np.array(path)[:,0], y=np.array(path)[:,1], label='Path')
ax2.imshow(ringmap)
ax2.set_xlim([0,48])
ax2.set_ylim([0,48])
ax2.set_xlabel('X')
ax2.set_ylabel('Y')
ax2.set_title('Ring Model')
ax2.legend()
# Plot of rewards in the second subplot
#ax2.plot(rews)
#ax2.set_xlabel('Step')
#ax2.set_ylabel('Reward')
#x2.set_title('Rewards')
plt.tight_layout()
plt.suptitle("LPM Reward Function with Ring Model Constraint")
plt.show()
def main():
#test_policy()
train()
return