Commit b8fe4105 authored by Bryson Howell

Initial version of DeepRL agents, results from final project

parent e1472177
import numpy as np
import scipy.signal
from gym.spaces import Box, Discrete
import torch
import torch.nn as nn
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical
import torch.nn.functional as F
import copy
def combined_shape(length, shape=None):
if shape is None:
return (length,)
return (length, shape) if np.isscalar(shape) else (length, *shape)
def mlp(sizes, activation, output_activation=nn.Identity):
layers = []
for j in range(len(sizes)-1):
act = activation if j < len(sizes)-2 else output_activation
layers += [nn.Linear(sizes[j], sizes[j+1]), act()]
return nn.Sequential(*layers)
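# Hedged example (illustrative; the sizes are arbitrary): mlp([4, 64, 64, 2], nn.Tanh)
# returns nn.Sequential(Linear(4,64), Tanh(), Linear(64,64), Tanh(), Linear(64,2), Identity()).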
def count_vars(module):
return sum([np.prod(p.shape) for p in module.parameters()])
def discount_cumsum(x, discount):
"""
magic from rllab for computing discounted cumulative sums of vectors.
input:
vector x,
[x0,
x1,
x2]
output:
[x0 + discount * x1 + discount^2 * x2,
x1 + discount * x2,
x2]
"""
return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]
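# Illustrative reference only (assumption: this helper is not used by the agents below).
# It spells out what the lfilter call computes for a 1-D input; for x = [x0, x1, x2] and
# discount d, both versions return [x0 + d*x1 + d^2*x2, x1 + d*x2, x2].
def _discount_cumsum_naive(x, discount):
    out = np.zeros(len(x), dtype=np.float64)
    running = 0.0
    for i in reversed(range(len(x))):
        running = x[i] + discount * running  # backward recursion: c_t = x_t + d * c_{t+1}
        out[i] = running
    return out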
class Actor(nn.Module):
def _distribution(self, obs):
raise NotImplementedError
def _log_prob_from_distribution(self, pi, act):
raise NotImplementedError
def forward(self, obs, act=None):
# Produce action distributions for given observations, and
# optionally compute the log likelihood of given actions under
# those distributions.
pi = self._distribution(obs)
logp_a = None
if act is not None:
logp_a = self._log_prob_from_distribution(pi, act)
return pi, logp_a
class MLPDeterministicActor(nn.Module):
def __init__(self, state_dim, action_dim, max_action,discount_factor=0.99):
super(MLPDeterministicActor, self).__init__()
self.l1 = nn.Linear(state_dim, 256)
self.l2 = nn.Linear(256, 256)
self.l3 = nn.Linear(256, action_dim)
self.max_action = max_action
self.action_dim=action_dim
self.start_state = None
def forward(self, state,safety_switch=False,debug = False, noisy=False):
a = F.relu(self.l1(state))
a = F.relu(self.l2(a))
return self.max_action * torch.tanh(self.l3(a))
class MLPCategoricalActor(Actor):
def __init__(self, obs_dim, act_dim, hidden_sizes, activation):
super().__init__()
self.logits_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation)
def _distribution(self, obs):
logits = self.logits_net(obs)
return Categorical(logits=logits)
def _log_prob_from_distribution(self, pi, act):
return pi.log_prob(act)
class MLPGaussianActor(Actor):
def __init__(self, obs_dim, act_dim, hidden_sizes, activation):
super().__init__()
log_std = -0.5 * np.ones(act_dim, dtype=np.float32)
self.log_std = torch.nn.Parameter(torch.as_tensor(log_std))
self.mu_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation)
def _distribution(self, obs):
mu = self.mu_net(obs)
std = torch.exp(self.log_std)
return Normal(mu, std)
def mean(self, obs):
mu = self.mu_net(obs)
return mu
def _log_prob_from_distribution(self, pi, act):
return pi.log_prob(act).sum(axis=-1) # Last axis sum needed for Torch Normal distribution
class MLPCritic(nn.Module):
def __init__(self, obs_dim, hidden_sizes, activation):
super().__init__()
self.v_net = mlp([obs_dim] + list(hidden_sizes) + [1], activation)
def forward(self, obs):
return torch.squeeze(self.v_net(obs), -1) # Critical to ensure v has right shape.
class MLPActorCriticTD3trust(nn.Module):
def __init__(self, observation_space, action_space,
hidden_sizes=(64,64), activation=nn.Tanh):
super().__init__()
obs_dim = observation_space.shape[0]
act_dim = action_space.shape[0]
self.obs_dim = obs_dim
self.act_dim = act_dim
# policy builder depends on action space
self.pi = MLPDeterministicActor(obs_dim, action_space.shape[0],action_space.high[0])
# build value function
self.Qv1 = MLPCritic(obs_dim+act_dim, hidden_sizes, activation)
self.Qv2 = MLPCritic(obs_dim+act_dim, hidden_sizes, activation)
self.Qj1 = MLPCritic(obs_dim+act_dim, hidden_sizes, activation)
self.Qj2 = MLPCritic(obs_dim+act_dim, hidden_sizes, activation)
self.baseline_Qj = copy.deepcopy(self.Qj1)
self.baseline_pi = copy.deepcopy(self.pi)
self.pi_mix = copy.deepcopy(self.pi)
self.epsilon = 0
def step(self, obs):
a = self.pi(obs)
qv = self.Qv1(torch.cat((obs,a)))
qj = self.Qj1(torch.cat((obs,a)))
return a.detach().cpu().numpy(), qv.detach().cpu().numpy(), qj.detach().cpu().numpy(), 0
def act_pi(self, pi, obs):
a = pi(obs)
return a
def act(self, obs):
return self.step(obs)[0]
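# Hedged usage sketch (illustrative; the Box spaces below are hypothetical, not part of the
# project): the TD3-style actor-critic takes a flat 1-D observation and returns the
# deterministic action plus the reward-critic (Qv1) and cost-critic (Qj1) estimates.
def _example_td3trust_step():
    obs_space = Box(low=-1.0, high=1.0, shape=(4,), dtype=np.float32)
    act_space = Box(low=-1.0, high=1.0, shape=(2,), dtype=np.float32)
    ac = MLPActorCriticTD3trust(obs_space, act_space)
    obs = torch.as_tensor(obs_space.sample(), dtype=torch.float32)
    a, qv, qj, logp = ac.step(obs)  # numpy arrays for a, qv, qj; logp is the placeholder 0
    return a, qv, qj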
class MLPActorCritic(nn.Module):
def __init__(self, observation_space, action_space,
hidden_sizes=(64,64), activation=nn.Tanh):
super().__init__()
obs_dim = observation_space.shape[0]
# policy builder depends on action space
if isinstance(action_space, Box):
self.pi = MLPGaussianActor(obs_dim, action_space.shape[0], hidden_sizes, activation)
elif isinstance(action_space, Discrete):
self.pi = MLPCategoricalActor(obs_dim, action_space.n, hidden_sizes, activation)
# build value function
self.v = MLPCritic(obs_dim, hidden_sizes, activation)
def step(self, obs):
with torch.no_grad():
pi = self.pi._distribution(obs)
a = pi.sample()
logp_a = self.pi._log_prob_from_distribution(pi, a)
v = self.v(obs)
return a.numpy(), v.numpy(), logp_a.numpy()
def act(self, obs):
return self.step(obs)[0]
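# Hedged usage sketch (illustrative; the spaces are hypothetical): with a Discrete action
# space the policy head is an MLPCategoricalActor, and step() samples an integer action
# along with the value estimate and the log-probability of that action.
def _example_categorical_actor_critic():
    obs_space = Box(low=-1.0, high=1.0, shape=(4,), dtype=np.float32)
    act_space = Discrete(3)
    ac = MLPActorCritic(obs_space, act_space)
    a, v, logp = ac.step(torch.as_tensor(obs_space.sample(), dtype=torch.float32))
    return a, v, logp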
class MLPActorCriticCost(nn.Module):
def __init__(self, observation_space, action_space,
hidden_sizes=(64,64), activation=nn.Tanh):
super().__init__()
obs_dim = observation_space.shape[0]
# policy builder depends on action space
if isinstance(action_space, Box):
self.pi = MLPGaussianActor(obs_dim, action_space.shape[0], hidden_sizes, activation)
elif isinstance(action_space, Discrete):
self.pi = MLPCategoricalActor(obs_dim, action_space.n, hidden_sizes, activation)
# build value function
self.v = MLPCritic(obs_dim, hidden_sizes, activation)
self.j = MLPCritic(obs_dim, hidden_sizes, activation)
def step(self, obs):
with torch.no_grad():
pi = self.pi._distribution(obs)
a = pi.sample()
logp_a = self.pi._log_prob_from_distribution(pi, a)
v = self.v(obs)
j = self.j(obs)
return a.numpy(), v.numpy(), j.numpy(), logp_a.numpy()
def act(self, obs):
return self.step(obs)[0]
import multiprocessing
import numpy as np
import os
import torch
from mpi4py import MPI
from .mpi_tools import broadcast, mpi_avg, num_procs, proc_id
def setup_pytorch_for_mpi():
"""
Avoid slowdowns caused by each separate process's PyTorch using
more than its fair share of CPU resources.
"""
#print('Proc %d: Reporting original number of Torch threads as %d.'%(proc_id(), torch.get_num_threads()), flush=True)
if torch.get_num_threads()==1:
return
fair_num_threads = max(int(torch.get_num_threads() / num_procs()), 1)
torch.set_num_threads(fair_num_threads)
#print('Proc %d: Reporting new number of Torch threads as %d.'%(proc_id(), torch.get_num_threads()), flush=True)
def mpi_avg_grads(module):
""" Average contents of gradient buffers across MPI processes. """
if num_procs()==1:
return
for p in module.parameters():
p_grad_numpy = p.grad.numpy() # numpy view of tensor data
avg_p_grad = mpi_avg(p.grad)
p_grad_numpy[:] = avg_p_grad[:]
def sync_params(module):
""" Sync all parameters of module across all MPI processes. """
if num_procs()==1:
return
for p in module.parameters():
p_numpy = p.data.numpy()
broadcast(p_numpy)
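# Hedged usage sketch (illustrative; model, optimizer, and loss are hypothetical): the
# intended call pattern is setup_pytorch_for_mpi() and sync_params(model) once at startup,
# then gradient averaging across processes right before every optimizer step.
def _example_mpi_update(model, optimizer, loss):
    loss.backward()
    mpi_avg_grads(model)   # average gradient buffers over all MPI processes
    optimizer.step()
    optimizer.zero_grad()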
from mpi4py import MPI
import os, subprocess, sys
import numpy as np
import torch
def mpi_fork(n, bind_to_core=False):
"""
Re-launches the current script with workers linked by MPI.
Also, terminates the original process that launched it.
Taken almost without modification from the Baselines function of the
`same name`_.
.. _`same name`: https://github.com/openai/baselines/blob/master/baselines/common/mpi_fork.py
Args:
n (int): Number of processes to split into.
bind_to_core (bool): Bind each MPI process to a core.
"""
if n<=1:
return
if os.getenv("IN_MPI") is None:
env = os.environ.copy()
env.update(
MKL_NUM_THREADS="1",
OMP_NUM_THREADS="1",
IN_MPI="1"
)
args = ["mpirun", "-np", str(n)]
if bind_to_core:
args += ["-bind-to", "core"]
args += [sys.executable] + sys.argv
subprocess.check_call(args, env=env)
sys.exit()
def msg(m, string=''):
print(('Message from %d: %s \t '%(MPI.COMM_WORLD.Get_rank(), string))+str(m))
def proc_id():
"""Get rank of calling process."""
return MPI.COMM_WORLD.Get_rank()
def allreduce(*args, **kwargs):
return MPI.COMM_WORLD.Allreduce(*args, **kwargs)
def num_procs():
"""Count active MPI processes."""
return MPI.COMM_WORLD.Get_size()
def broadcast(x, root=0):
MPI.COMM_WORLD.Bcast(x, root=root)
def mpi_op(x, op):
x, scalar = ([x], True) if np.isscalar(x) else (x, False)
x = np.asarray(x, dtype=np.float32)
buff = np.zeros_like(x, dtype=np.float32)
allreduce(x, buff, op=op)
return buff[0] if scalar else buff
def mpi_sum(x):
return mpi_op(x, MPI.SUM)
def mpi_avg(x):
"""Average a scalar or vector over MPI processes."""
return mpi_sum(x) / num_procs()
def mpi_statistics_scalar(x, with_min_and_max=False):
"""
Get mean/std and optional min/max of scalar x across MPI processes.
Args:
x: An array containing samples of the scalar to produce statistics
for.
with_min_and_max (bool): If true, return min and max of x in
addition to mean and std.
"""
if isinstance(x, list):
for i, xx in enumerate(x):
if torch.is_tensor(xx):
x[i] = xx.detach()
x = np.array(x, dtype=np.float32)
global_sum, global_n = mpi_sum([np.sum(x), len(x)])
mean = global_sum / global_n
global_sum_sq = mpi_sum(np.sum((x - mean)**2))
std = np.sqrt(global_sum_sq / global_n) # compute global std
if with_min_and_max:
global_min = mpi_op(np.min(x) if len(x) > 0 else np.inf, op=MPI.MIN)
global_max = mpi_op(np.max(x) if len(x) > 0 else -np.inf, op=MPI.MAX)
return mean, std, global_min, global_max
return mean, std
import json
def convert_json(obj):
""" Convert obj to a version which can be serialized with JSON. """
if is_json_serializable(obj):
return obj
else:
if isinstance(obj, dict):
return {convert_json(k): convert_json(v)
for k,v in obj.items()}
elif isinstance(obj, tuple):
return tuple(convert_json(x) for x in obj)
elif isinstance(obj, list):
return [convert_json(x) for x in obj]
elif hasattr(obj,'__name__') and not('lambda' in obj.__name__):
return convert_json(obj.__name__)
elif hasattr(obj,'__dict__') and obj.__dict__:
obj_dict = {convert_json(k): convert_json(v)
for k,v in obj.__dict__.items()}
return {str(obj): obj_dict}
return str(obj)
def is_json_serializable(v):
try:
json.dumps(v)
return True
except:
return False
import os
import os.path as osp
# Default neural network backend for each algo
# (Must be either 'tf1' or 'pytorch')
DEFAULT_BACKEND = {
'vpg': 'pytorch',
'trpo': 'tf1',
'ppo': 'pytorch',
'ddpg': 'pytorch',
'td3': 'pytorch',
'sac': 'pytorch'
}
# Where experiment outputs are saved by default:
DEFAULT_DATA_DIR = osp.join(osp.abspath(osp.dirname(osp.dirname(__file__))),'data')
# Whether to automatically insert a date and time stamp into the names of
# save directories:
FORCE_DATESTAMP = False
# Whether GridSearch provides automatically-generated default shorthands:
DEFAULT_SHORTHAND = True
# Tells the GridSearch how many seconds to pause for before launching
# experiments.
WAIT_BEFORE_LAUNCH = 5
import torch
import numpy as np
from . import core
from .ppo_utils.logx import EpochLogger
from .ppo_utils.mpi_pytorch import setup_pytorch_for_mpi, sync_params, mpi_avg_grads
from .ppo_utils.mpi_tools import mpi_fork, mpi_avg, proc_id, mpi_statistics_scalar, num_procs
class TD3Buffer(object):
def __init__(self, state_dim, action_dim, max_size=int(1e6)):
self.max_size = max_size
self.ptr = 0
self.size = 0
# print(state_dim,action_dim)
self.state = np.zeros((max_size, state_dim))
self.action = np.zeros((max_size, action_dim))
self.next_state = np.zeros((max_size, state_dim))
self.reward = np.zeros((max_size, 1))
self.cost = np.zeros((max_size, 1))
self.not_done = np.zeros((max_size, 1))
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def add(self, state, action, next_state, reward, cost, done):
self.state[self.ptr] = state
self.action[self.ptr] = action
self.next_state[self.ptr] = next_state
self.reward[self.ptr] = reward
self.cost[self.ptr] = cost
self.not_done[self.ptr] = 1. - done
self.ptr = (self.ptr + 1) % self.max_size
self.size = min(self.size + 1, self.max_size)
def sample(self, batch_size):
ind = np.random.randint(0, self.size, size=batch_size)
return (
torch.FloatTensor(self.state[ind]).to(self.device),
torch.FloatTensor(self.action[ind]).to(self.device),
torch.FloatTensor(self.next_state[ind]).to(self.device),
torch.FloatTensor(self.reward[ind]).to(self.device),
torch.FloatTensor(self.cost[ind]).to(self.device),
torch.FloatTensor(self.not_done[ind]).to(self.device)
)
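# Hedged usage sketch (illustrative; the dimensions are arbitrary): store one transition
# and sample a minibatch of tensors already moved to the buffer's device.
def _example_td3_buffer():
    buf = TD3Buffer(state_dim=4, action_dim=2, max_size=10)
    buf.add(np.zeros(4), np.zeros(2), np.ones(4), reward=1.0, cost=0.0, done=False)
    state, action, next_state, reward, cost, not_done = buf.sample(batch_size=1)
    return state.shape, action.shape  # torch.Size([1, 4]), torch.Size([1, 2])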
class PPOBuffer:
"""
A buffer for storing trajectories experienced by a PPO agent interacting
with the environment, and using Generalized Advantage Estimation (GAE-Lambda)
for calculating the advantages of state-action pairs.
"""
def __init__(self, obs_dim, act_dim, size, gamma=0.99, lam=0.95):
self.obs_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
self.act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=np.float32)
self.old_act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=np.float32)
self.adv_buf = np.zeros(size, dtype=np.float32)
self.rew_buf = np.zeros(size, dtype=np.float32)
self.ret_buf = np.zeros(size, dtype=np.float32)
self.val_buf = np.zeros(size, dtype=np.float32)
self.cost_adv_buf = np.zeros(size, dtype=np.float32)
self.cost_buf = np.zeros(size, dtype=np.float32)
self.cost_ret_buf = np.zeros(size, dtype=np.float32)
self.cost_val_buf = np.zeros(size, dtype=np.float32)
self.logp_buf = np.zeros(size, dtype=np.float32)
self.gamma, self.lam = gamma, lam
self.ptr, self.path_start_idx, self.max_size = 0, 0, size
def store(self, obs, act, rew, cost, val, cost_val, logp, old_act):
"""
Append one timestep of agent-environment interaction to the buffer.
"""
assert self.ptr < self.max_size # buffer has to have room so you can store
self.obs_buf[self.ptr] = obs
self.act_buf[self.ptr] = act
self.old_act_buf[self.ptr] = old_act
self.rew_buf[self.ptr] = rew
self.val_buf[self.ptr] = val
self.cost_buf[self.ptr] = cost
self.cost_val_buf[self.ptr] = cost_val
self.logp_buf[self.ptr] = logp
self.ptr += 1
def finish_path(self, last_val=0, last_cost_val=0):
"""
Call this at the end of a trajectory, or when one gets cut off
by an epoch ending. This looks back in the buffer to where the
trajectory started, and uses rewards and value estimates from
the whole trajectory to compute advantage estimates with GAE-Lambda,
as well as compute the rewards-to-go for each state, to use as
the targets for the value function.
The "last_val" argument should be 0 if the trajectory ended
because the agent reached a terminal state (died), and otherwise
should be V(s_T), the value function estimated for the last state.
This allows us to bootstrap the reward-to-go calculation to account
for timesteps beyond the arbitrary episode horizon (or epoch cutoff).
"""
path_slice = slice(self.path_start_idx, self.ptr)
rews = np.append(self.rew_buf[path_slice], last_val)
vals = np.append(self.val_buf[path_slice], last_val)
# the next two lines implement GAE-Lambda advantage calculation
deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
self.adv_buf[path_slice] = core.discount_cumsum(deltas, self.gamma * self.lam)
# the next line computes rewards-to-go, to be targets for the value function
self.ret_buf[path_slice] = core.discount_cumsum(rews, self.gamma)[:-1]
costs = np.append(self.cost_buf[path_slice], last_cost_val)
cost_vals = np.append(self.cost_val_buf[path_slice], last_cost_val)
# the next two lines implement GAE-Lambda advantage calculation
cost_deltas = costs[:-1] + self.gamma * cost_vals[1:] - cost_vals[:-1]
self.cost_adv_buf[path_slice] = core.discount_cumsum(cost_deltas, self.gamma * self.lam)
# the next line computes rewards-to-go, to be targets for the value function
self.cost_ret_buf[path_slice] = core.discount_cumsum(costs, self.gamma)[:-1]
self.path_start_idx = self.ptr
def get(self):
"""
Call this at the end of an epoch to get all of the data from
the buffer, with advantages appropriately normalized (shifted to have
mean zero and std one). Also, resets some pointers in the buffer.
"""
assert self.ptr == self.max_size # buffer has to be full before you can get
self.ptr, self.path_start_idx = 0, 0
# the next two lines implement the advantage normalization trick
adv_mean, adv_std = mpi_statistics_scalar(self.adv_buf)
self.adv_buf = (self.adv_buf - adv_mean) / adv_std
cost_adv_mean, cost_adv_std = mpi_statistics_scalar(self.cost_adv_buf)
self.cost_adv_buf = (self.cost_adv_buf - cost_adv_mean) / cost_adv_std
data = dict(obs=self.obs_buf, act=self.act_buf,old_act=self.old_act_buf, ret=self.ret_buf,
adv=self.adv_buf, cost_ret = self.cost_ret_buf, cost_adv=self.cost_adv_buf, logp=self.logp_buf)
return {k: torch.as_tensor(v, dtype=torch.float32) for k,v in data.items()}
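# Hedged usage sketch (illustrative; the numbers are arbitrary and nothing here is called by
# the training code): fill a tiny buffer with one 3-step episode, bootstrap the path with
# last_val=0, and pull the normalized batch as torch tensors.
def _example_ppo_buffer():
    buf = PPOBuffer(obs_dim=4, act_dim=2, size=3)
    for t in range(3):
        obs, act = np.zeros(4, dtype=np.float32), np.zeros(2, dtype=np.float32)
        buf.store(obs, act, rew=1.0, cost=0.1, val=0.5, cost_val=0.0, logp=-1.4, old_act=act)
    buf.finish_path(last_val=0, last_cost_val=0)
    data = buf.get()  # dict with obs, act, old_act, ret, adv, cost_ret, cost_adv, logp
    return data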
import gym
from gym import spaces, Env
import pygame
import numpy as np
import random
MAX_STEPS_PER_EPISODE = 100
MAX_FORWARD_REWARD_THRESHOLD = 0.5
#reward scaling
SEARCH_SCALE = 1.0
FORWARD_SCALE = 0.1
class GridWorldSAR(gym.Env):
    metadata = {"render_modes": ["human", "rgb_array"], "render_fps": 4}
@@ -13,9 +20,14 @@ class GridWorldSAR(gym.Env):
        self.window_size = 512  # The size of the PyGame window
        #Values the agent will need
        self.pos_x = 0.0 #Current position, update w/ velocity (Might need to make this the pos in 1200x1200 path coordinates?)
        self.pos_y = 0.0
        self.cell = [0,0]
self.visited = np.zeros((map_size,map_size),dtype=bool)
self.heatmap = np.load('./deeprl_data/lpm/lpm4_prob.npy')
self.ringmap = np.load('./deeprl_data/ring/ringlowres_prob.npy')
#self.heatmap = np.load('./deeprl_data/test/test_prob.npy')
#self.heatmap = np.load('./deeprl_data/ring/ringlowres_prop.npy')
        # Observations are dictionaries with the agent's and the target's location.
        # Each location is encoded as an element of {0, ..., `size`}^2, i.e. MultiDiscrete([size, size]).
@@ -23,14 +35,16 @@ class GridWorldSAR(gym.Env):
        #Pos = agent's current coordinates
        #risk = search risk. Agent can see in a square area around themselves
        #visited = which spaces have been surveyed. Will be needed to get paths for multiple drones
#So this is kinda model based RL..?
        self.observation_space = spaces.Dict(
            {
                "pos": spaces.Box(0, self.size - 1, shape=(2,), dtype=float),
                "risk": spaces.Box(0, 1, shape=(self.size*self.size,), dtype=float),
                "visited": spaces.Box(0, 1, shape=(self.size*self.size,), dtype=bool),
            }
        )
        #Might need to take the risk/visited obs out. Let's see if it runs
self.observation_space = spaces.Box(low=0, high=self.size-1, shape=(2,),dtype=int)
        #Continuous velocity. Keep it at 1 so agent can't skip over cells to end episode early
        self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(2,), dtype=float)
@@ -50,6 +64,146 @@ class GridWorldSAR(gym.Env):
        self.window = None
        self.clock = None
def reset(self, seed=None, options=None):
        if seed is not None:
np.random.seed(seed)
random.seed(seed)
#Move agent back to start
self.pos_x = 0.0
self.pos_y = 23.0
self.cell = [0,23]
self.current_step = 0
self.done = False
self.visited = np.zeros((self.size,self.size),dtype=bool)
observation = self._get_obs()
info = self._get_info()
        return observation, info
#let's see if we really need to observe risk and visited?
    def _get_obs(self):
        return np.array([self.pos_x, self.pos_y])
#return {'pos': self.cell, "risk": self.heatmap, "visited": self.visited}
def _get_info(self):
#print("Location = %.3f %.3f" %(self.pos_x,self.pos_y))
#Determine cost for CPO
        #Clamp the cell indices so they stay inside the map
        self.cell[0] = min(max(self.cell[0], 0), self.size - 1)
        self.cell[1] = min(max(self.cell[1], 0), self.size - 1)
map_diff = abs(self.heatmap[self.cell[0],self.cell[1]] - self.ringmap[self.cell[0],self.cell[1]])
return {'loc': [self.pos_x,self.pos_y], 'cost': map_diff}
def step(self, action):
terminated = self.done
truncated = self.done
#Move agent to new position
self.pos_x = self.pos_x + action[0]
self.pos_y = self.pos_y + action[1]
self.cell = np.array([np.int32(self.pos_x),np.int32(self.pos_y)])
        #Check if agent has reached the edge of the map
        reward_search = 0
        if(self.pos_x < 0 or self.pos_x >= self.size or self.pos_y < 0 or self.pos_y >= self.size):
            self.done = True
            terminated = True
            #Add reward for reaching end??
        #Calculate reward and update visit map (no search reward for revisited cells)
        elif(not self.visited[self.cell[0],self.cell[1]]):
            reward_search = self.heatmap[self.cell[0],self.cell[1]]
            self.visited[self.cell[0],self.cell[1]] = True
reward_forward = np.min([action[0], MAX_FORWARD_REWARD_THRESHOLD])
#End episode after step max or agent exits the environment
if(self.current_step == MAX_STEPS_PER_EPISODE):
self.done = True
truncated = True
#Update step counter
self.current_step += 1
reward = SEARCH_SCALE*reward_search + FORWARD_SCALE*reward_forward
#print(reward)
observation = self._get_obs()
info = self._get_info()
return observation, reward, terminated, info
def render(self):
if self.render_mode == "rgb_array":
return self._render_frame()
def _render_frame(self):
if self.window is None and self.render_mode == "human":
pygame.init()
pygame.display.init()
self.window = pygame.display.set_mode((self.window_size, self.window_size))
if self.clock is None and self.render_mode == "human":
self.clock = pygame.time.Clock()
canvas = pygame.Surface((self.window_size, self.window_size))
canvas.fill((255, 255, 255))
pix_square_size = (
self.window_size / self.size
) # The size of a single grid square in pixels
# Now we draw the agent
pygame.draw.circle(
canvas,
(0, 0, 255),
(self.cell + 0.5) * pix_square_size,
pix_square_size / 3,
)
# Finally, add some gridlines
for x in range(self.size + 1):
pygame.draw.line(
canvas,
0,
(0, pix_square_size * x),
(self.window_size, pix_square_size * x),
width=3,
)
pygame.draw.line(
canvas,
0,
(pix_square_size * x, 0),
(pix_square_size * x, self.window_size),
width=3,
)
if self.render_mode == "human":
# The following line copies our drawings from `canvas` to the visible window
self.window.blit(canvas, canvas.get_rect())
pygame.event.pump()
pygame.display.update()
# We need to ensure that human-rendering occurs at the predefined framerate.
# The following line will automatically add a delay to keep the framerate stable.
self.clock.tick(self.metadata["render_fps"])
else: # rgb_array
return np.transpose(
np.array(pygame.surfarray.pixels3d(canvas)), axes=(1, 0, 2)
)
def close(self):
if self.window is not None:
pygame.display.quit()
pygame.quit()
import math
import numpy as np
import torch
def normal_entropy(std):
var = std.pow(2)
entropy = 0.5 + 0.5 * torch.log(2 * var * math.pi)
return entropy.sum(1, keepdim=True)
def normal_log_density(x, mean, log_std, std):
var = std.pow(2)
log_density = -(x - mean).pow(2) / (
2 * var) - 0.5 * math.log(2 * math.pi) - log_std
return log_density.sum(1, keepdim=True)
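# Hedged sanity check (illustrative; not called anywhere in the project): the closed-form
# density above should agree with torch.distributions.Normal for a batch of diagonal Gaussians.
def _check_normal_log_density():
    from torch.distributions.normal import Normal
    mean = torch.zeros(5, 2)
    log_std = torch.full((5, 2), -0.5)
    std = log_std.exp()
    x = torch.randn(5, 2)
    ref = Normal(mean, std).log_prob(x).sum(1, keepdim=True)
    return torch.allclose(normal_log_density(x, mean, log_std, std), ref, atol=1e-5)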
def get_flat_params_from(model):
params = []
for param in model.parameters():
params.append(param.data.view(-1))
flat_params = torch.cat(params)
return flat_params
def set_flat_params_to(model, flat_params):
prev_ind = 0
for param in model.parameters():
flat_size = int(np.prod(list(param.size())))
param.data.copy_(
flat_params[prev_ind:prev_ind + flat_size].view(param.size()))
prev_ind += flat_size
def get_flat_grad_from(net, grad_grad=False):
grads = []
for param in net.parameters():
if grad_grad:
grads.append(param.grad.grad.view(-1))
else:
grads.append(param.grad.view(-1))
flat_grad = torch.cat(grads)
return flat_grad
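# Hedged usage sketch (illustrative; the Linear layers are hypothetical): parameters can be
# copied between identically-shaped models through the flat vector produced above.
def _example_flat_params_roundtrip():
    src, dst = torch.nn.Linear(3, 2), torch.nn.Linear(3, 2)
    flat = get_flat_params_from(src)   # 1-D tensor holding all weights and biases
    set_flat_params_to(dst, flat)      # dst now holds the same parameters as src
    return bool(torch.allclose(get_flat_params_from(dst), flat))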
@@ -22,6 +22,11 @@ import datetime as dt
import time, os
import pdb
from sargym import sar_gym, cpo, core
import cProfile, pstats, io
#Relative filepaths (Assumes this and ags_grabber projects are in same parent directory)
kentland_heatmap = './LP model/analysis/outputs/ic_1_con_hiker_t12_kentland.csv'
#kentland_heatmap = './LP model/analysis/outputs/ic_1_con_hiker_t4.csv' #I like 4?
@@ -159,26 +164,188 @@ def create_data():
    del rgp, planner, mc
def testing():
    #Run this function to create data
    #create_data()
#Load robot paths from file
ring_prob = np.load('./deeprl_data/ring_prob.npy')
robot0_waypoints = np.load('./deeprl_data/robot0_waypoints.npy')
robot1_waypoints = np.load('./deeprl_data/robot1_waypoints.npy')
robot2_waypoints = np.load('./deeprl_data/robot2_waypoints.npy')
    #Make Gym Environment
env = sar_gym.GridWorldSAR(render_mode='human')
obs, info = env.reset()
path = [info['loc']]
rews = []
done = False
while not done:
obs, reward, term, info = env.step([1,1])
done = term
path.append(info['loc'])
rews.append(reward)
fig, (ax1, ax2) = plt.subplots(1, 2)
# Scatter plot of path and track in the first subplot
ax1.scatter(x=np.array(path)[:,0], y=np.array(path)[:,1], label='Path')
ax1.set_xlim([0,48])
ax1.set_ylim([0,48])
ax1.set_xlabel('X')
ax1.set_ylabel('Y')
ax1.set_title('Path and Track')
ax1.legend()
# Plot of rewards in the second subplot
ax2.plot(rews)
ax2.set_xlabel('Step')
ax2.set_ylabel('Reward')
ax2.set_title('Rewards')
plt.tight_layout()
plt.show()
return
COST_THRESHOLD = 50 #sanity check to make sure things are working
MAX_STEPS_PER_EPISODE = 100
def train():
#Run this function to create data
#create_data()
    #Training CPO, profiled so stats are printed even if the run is interrupted
    pr = cProfile.Profile()
    pr.enable()
    try:
        J = cpo.cpo(
            lambda: sar_gym.GridWorldSAR(render_mode=None),
            actor_critic=core.MLPActorCriticTD3trust,
            max_ep_len=MAX_STEPS_PER_EPISODE,
            cost_lim=COST_THRESHOLD,
            epochs=200,
            steps_per_epoch=8000
        )
        #175
    except KeyboardInterrupt:
        print("\nKeyboard interrupt received. Printing stats...")
    finally:
        pr.disable()
        s = io.StringIO()
        ps = pstats.Stats(pr, stream=s).sort_stats("tottime")
        ps.print_stats(32)
        print(s.getvalue())
#Save policy
torch.save(J.pi, './deeprl_data/trained_model4.pt')
#Testing
#Make Gym Environment
env = sar_gym.GridWorldSAR(render_mode=None)
#Test one episode
    obs, info = env.reset()
path = []
rews = []
done = False
while not done:
with torch.no_grad():
tt = torch.as_tensor(obs,dtype=torch.float32)
actions = J.pi(tt)
obs, reward, term, info = env.step(actions.numpy())
done = term
path.append(info['loc'])
rews.append(reward)
fig, (ax1, ax2) = plt.subplots(1, 2)
# Scatter plot of path and track in the first subplot
ax1.scatter(x=np.array(path)[:,0], y=np.array(path)[:,1], label='Path')
ax1.set_xlim([0,48])
ax1.set_ylim([0,48])
ax1.set_xlabel('X')
ax1.set_ylabel('Y')
ax1.set_title('Path and Track')
ax1.legend()
# Plot of rewards in the second subplot
ax2.plot(rews)
ax2.set_xlabel('Step')
ax2.set_ylabel('Reward')
ax2.set_title('Rewards')
plt.tight_layout()
plt.show()
    return
def test_policy():
#Testing
policy = torch.load('./deeprl_data/trained_model2.pt')
heatmap = np.load('./deeprl_data/lpm/lpm4_prob.npy')
ringmap = np.load('./deeprl_data/ring/ringlowres_prob.npy')
#Make Gym Environment
env = sar_gym.GridWorldSAR(render_mode=None)
#Test one episode
    obs, info = env.reset()
path = []
rews = []
done = False
while not done:
with torch.no_grad():
tt = torch.as_tensor(obs,dtype=torch.float32)
actions = policy(tt)
obs, reward, term, info = env.step(actions.numpy())
done = term
path.append(info['loc'])
rews.append(reward)
fig, (ax1, ax2) = plt.subplots(1, 2)
# Scatter plot of path and track in the first subplot
ax1.scatter(x=np.array(path)[:,0], y=np.array(path)[:,1], label='Path')
ax1.imshow(heatmap)
ax1.set_xlim([0,48])
ax1.set_ylim([0,48])
ax1.set_xlabel('X')
ax1.set_ylabel('Y')
ax1.set_title('Lost Person Model')
ax1.legend()
# Scatter plot of path and track in the first subplot
ax2.scatter(x=np.array(path)[:,0], y=np.array(path)[:,1], label='Path')
ax2.imshow(ringmap)
ax2.set_xlim([0,48])
ax2.set_ylim([0,48])
ax2.set_xlabel('X')
ax2.set_ylabel('Y')
ax2.set_title('Ring Model')
ax2.legend()
# Plot of rewards in the second subplot
#ax2.plot(rews)
#ax2.set_xlabel('Step')
#ax2.set_ylabel('Reward')
#x2.set_title('Rewards')
plt.tight_layout()
plt.suptitle("LPM Reward Function with Ring Model Constraint")
plt.show()
def main():
#test_policy()
train()
return