Commit b8fe4105 authored by Bryson Howell

Initial version of DeepRL agents, results from final project

parent e1472177
import numpy as np
import scipy.signal
from gym.spaces import Box, Discrete
import torch
import torch.nn as nn
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical
import torch.nn.functional as F
import copy
def combined_shape(length, shape=None):
if shape is None:
return (length,)
return (length, shape) if np.isscalar(shape) else (length, *shape)
def mlp(sizes, activation, output_activation=nn.Identity):
layers = []
for j in range(len(sizes)-1):
act = activation if j < len(sizes)-2 else output_activation
layers += [nn.Linear(sizes[j], sizes[j+1]), act()]
return nn.Sequential(*layers)
def count_vars(module):
return sum([np.prod(p.shape) for p in module.parameters()])
def discount_cumsum(x, discount):
"""
magic from rllab for computing discounted cumulative sums of vectors.
input:
vector x,
[x0,
x1,
x2]
output:
[x0 + discount * x1 + discount^2 * x2,
x1 + discount * x2,
x2]
"""
return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]
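# Illustrative sketch (not in the original file): a quick sanity check of
# discount_cumsum against the closed form written out in the docstring above.
# The helper name is hypothetical.
def _example_discount_cumsum(discount=0.99):
    x = np.array([1.0, 2.0, 3.0])
    expected = np.array([x[0] + discount * x[1] + discount**2 * x[2],
                         x[1] + discount * x[2],
                         x[2]])
    assert np.allclose(discount_cumsum(x, discount), expected)
    return expected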
class Actor(nn.Module):
def _distribution(self, obs):
raise NotImplementedError
def _log_prob_from_distribution(self, pi, act):
raise NotImplementedError
def forward(self, obs, act=None):
# Produce action distributions for given observations, and
# optionally compute the log likelihood of given actions under
# those distributions.
pi = self._distribution(obs)
logp_a = None
if act is not None:
logp_a = self._log_prob_from_distribution(pi, act)
return pi, logp_a
class MLPDeterministicActor(nn.Module):
def __init__(self, state_dim, action_dim, max_action,discount_factor=0.99):
super(MLPDeterministicActor, self).__init__()
self.l1 = nn.Linear(state_dim, 256)
self.l2 = nn.Linear(256, 256)
self.l3 = nn.Linear(256, action_dim)
self.max_action = max_action
self.action_dim=action_dim
self.start_state = None
def forward(self, state,safety_switch=False,debug = False, noisy=False):
a = F.relu(self.l1(state))
a = F.relu(self.l2(a))
return self.max_action * torch.tanh(self.l3(a))
class MLPCategoricalActor(Actor):
def __init__(self, obs_dim, act_dim, hidden_sizes, activation):
super().__init__()
self.logits_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation)
def _distribution(self, obs):
logits = self.logits_net(obs)
return Categorical(logits=logits)
def _log_prob_from_distribution(self, pi, act):
return pi.log_prob(act)
class MLPGaussianActor(Actor):
def __init__(self, obs_dim, act_dim, hidden_sizes, activation):
super().__init__()
log_std = -0.5 * np.ones(act_dim, dtype=np.float32)
self.log_std = torch.nn.Parameter(torch.as_tensor(log_std))
self.mu_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation)
def _distribution(self, obs):
mu = self.mu_net(obs)
std = torch.exp(self.log_std)
return Normal(mu, std)
def mean(self, obs):
mu = self.mu_net(obs)
return mu
def _log_prob_from_distribution(self, pi, act):
return pi.log_prob(act).sum(axis=-1) # Last axis sum needed for Torch Normal distribution
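# Illustrative sketch (not in the original file): how the Actor interface is
# used. forward(obs, act) returns a torch Normal distribution plus the
# log-probability of act, summed over the action dimension as noted above.
def _example_gaussian_actor_forward():
    actor = MLPGaussianActor(obs_dim=3, act_dim=2, hidden_sizes=(32, 32), activation=nn.Tanh)
    obs, act = torch.randn(5, 3), torch.randn(5, 2)
    pi, logp_a = actor(obs, act)
    assert isinstance(pi, Normal) and logp_a.shape == (5,)
    return pi, logp_a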
class MLPCritic(nn.Module):
def __init__(self, obs_dim, hidden_sizes, activation):
super().__init__()
self.v_net = mlp([obs_dim] + list(hidden_sizes) + [1], activation)
def forward(self, obs):
return torch.squeeze(self.v_net(obs), -1) # Critical to ensure v has right shape.
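# Illustrative sketch (not in the original file): the squeeze above turns the
# (batch, 1) output of the value MLP into a flat (batch,) tensor, so losses
# like ((v - ret)**2).mean() broadcast as intended.
def _example_critic_output_shape():
    critic = MLPCritic(obs_dim=4, hidden_sizes=(32, 32), activation=nn.Tanh)
    v = critic(torch.randn(8, 4))
    assert v.shape == (8,)
    return v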
class MLPActorCriticTD3trust(nn.Module):
def __init__(self, observation_space, action_space,
hidden_sizes=(64,64), activation=nn.Tanh):
super().__init__()
obs_dim = observation_space.shape[0]
act_dim = action_space.shape[0]
self.obs_dim = obs_dim
self.act_dim = act_dim
# policy builder depends on action space
self.pi = MLPDeterministicActor(obs_dim, action_space.shape[0],action_space.high[0])
# build value function
self.Qv1 = MLPCritic(obs_dim+act_dim, hidden_sizes, activation)
self.Qv2 = MLPCritic(obs_dim+act_dim, hidden_sizes, activation)
self.Qj1 = MLPCritic(obs_dim+act_dim, hidden_sizes, activation)
self.Qj2 = MLPCritic(obs_dim+act_dim, hidden_sizes, activation)
self.baseline_Qj = copy.deepcopy(self.Qj1)
self.baseline_pi = copy.deepcopy(self.pi)
self.pi_mix = copy.deepcopy(self.pi)
self.epsilon = 0
def step(self, obs):
a = self.pi(obs)
qv = self.Qv1(torch.cat((obs,a)))
qj = self.Qj1(torch.cat((obs,a)))
return a.detach().cpu().numpy(), qv.detach().cpu().numpy(), qj.detach().cpu().numpy(), 0
def act_pi(self, pi, obs):
a = pi(obs)
return a
def act(self, obs):
return self.step(obs)[0]
class MLPActorCritic(nn.Module):
def __init__(self, observation_space, action_space,
hidden_sizes=(64,64), activation=nn.Tanh):
super().__init__()
obs_dim = observation_space.shape[0]
# policy builder depends on action space
if isinstance(action_space, Box):
self.pi = MLPGaussianActor(obs_dim, action_space.shape[0], hidden_sizes, activation)
elif isinstance(action_space, Discrete):
self.pi = MLPCategoricalActor(obs_dim, action_space.n, hidden_sizes, activation)
# build value function
self.v = MLPCritic(obs_dim, hidden_sizes, activation)
def step(self, obs):
with torch.no_grad():
pi = self.pi._distribution(obs)
a = pi.sample()
logp_a = self.pi._log_prob_from_distribution(pi, a)
v = self.v(obs)
return a.numpy(), v.numpy(), logp_a.numpy()
def act(self, obs):
return self.step(obs)[0]
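# Illustrative sketch (not in the original file): building the on-policy
# actor-critic for a continuous-control task and taking one no-grad step.
# The Box bounds below are placeholders.
def _example_actor_critic_step():
    obs_space = Box(low=-1.0, high=1.0, shape=(3,), dtype=np.float32)
    act_space = Box(low=-1.0, high=1.0, shape=(2,), dtype=np.float32)
    ac = MLPActorCritic(obs_space, act_space)
    a, v, logp = ac.step(torch.as_tensor(obs_space.sample(), dtype=torch.float32))
    return a, v, logp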
class MLPActorCriticCost(nn.Module):
def __init__(self, observation_space, action_space,
hidden_sizes=(64,64), activation=nn.Tanh):
super().__init__()
obs_dim = observation_space.shape[0]
# policy builder depends on action space
if isinstance(action_space, Box):
self.pi = MLPGaussianActor(obs_dim, action_space.shape[0], hidden_sizes, activation)
elif isinstance(action_space, Discrete):
self.pi = MLPCategoricalActor(obs_dim, action_space.n, hidden_sizes, activation)
# build value function
self.v = MLPCritic(obs_dim, hidden_sizes, activation)
self.j = MLPCritic(obs_dim, hidden_sizes, activation)
def step(self, obs):
with torch.no_grad():
pi = self.pi._distribution(obs)
a = pi.sample()
logp_a = self.pi._log_prob_from_distribution(pi, a)
v = self.v(obs)
j = self.j(obs)
return a.numpy(), v.numpy(), j.numpy(), logp_a.numpy()
def act(self, obs):
return self.step(obs)[0]
import numpy as np
import torch
from torch.optim import Adam
import gym
import time
from . import core
import sys
#import safety_gym
import torch.nn.functional as F
from torch.autograd import Variable
from .ppo_utils.logx import EpochLogger
from .ppo_utils.mpi_pytorch import setup_pytorch_for_mpi, sync_params, mpi_avg_grads
from .ppo_utils.mpi_tools import mpi_fork, mpi_avg, proc_id, mpi_statistics_scalar, num_procs
import copy
from .trust_region_utils import *
from .replay_buffers import *
import os
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
import datetime
def cpo(env_fn, env_name = '', actor_critic=core.MLPActorCriticCost, ac_kwargs=dict(), seed=0,
steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4,
vf_lr=1e-3, jf_lr=1e-3, penalty_init=1., penalty_lr=5e-2, cost_lim=25, train_pi_iters=80, train_v_iters=80, lam=0.95, max_ep_len=1000,
target_kl=0.00, target_l2=0.012, logger_kwargs=dict(), save_freq=10):
"""
Constrained Policy Optimization (CPO): a trust-region policy update with a
cost constraint, built on the Spinning Up PPO/TRPO scaffolding.
Args:
env_fn : A function which creates a copy of the environment.
The environment must satisfy the OpenAI Gym API.
actor_critic: The constructor method for a PyTorch Module with a
``step`` method, an ``act`` method, a ``pi`` module, and a ``v``
module. The ``step`` method should accept a batch of observations
and return:
=========== ================ ======================================
Symbol Shape Description
=========== ================ ======================================
``a`` (batch, act_dim) | Numpy array of actions for each
| observation.
``v`` (batch,) | Numpy array of value estimates
| for the provided observations.
``logp_a`` (batch,) | Numpy array of log probs for the
| actions in ``a``.
=========== ================ ======================================
The ``act`` method behaves the same as ``step`` but only returns ``a``.
The ``pi`` module's forward call should accept a batch of
observations and optionally a batch of actions, and return:
=========== ================ ======================================
Symbol Shape Description
=========== ================ ======================================
``pi`` N/A | Torch Distribution object, containing
| a batch of distributions describing
| the policy for the provided observations.
``logp_a`` (batch,) | Optional (only returned if batch of
| actions is given). Tensor containing
| the log probability, according to
| the policy, of the provided actions.
| If actions not given, will contain
| ``None``.
=========== ================ ======================================
The ``v`` module's forward call should accept a batch of observations
and return:
=========== ================ ======================================
Symbol Shape Description
=========== ================ ======================================
``v`` (batch,) | Tensor containing the value estimates
| for the provided observations. (Critical:
| make sure to flatten this!)
=========== ================ ======================================
ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object
you provided to CPO.
seed (int): Seed for random number generators.
steps_per_epoch (int): Number of steps of interaction (state-action pairs)
for the agent and the environment in each epoch.
epochs (int): Number of epochs of interaction (equivalent to
number of policy updates) to perform.
gamma (float): Discount factor. (Always between 0 and 1.)
clip_ratio (float): Hyperparameter for clipping in the policy objective.
Roughly: how far can the new policy go from the old policy while
still profiting (improving the objective function)? The new policy
can still go farther than the clip_ratio says, but it doesn't help
on the objective anymore. (Usually small, 0.1 to 0.3.) Typically
denoted by :math:`\epsilon`.
pi_lr (float): Learning rate for policy optimizer.
vf_lr (float): Learning rate for value function optimizer.
train_pi_iters (int): Maximum number of gradient descent steps to take
on policy loss per epoch. (Early stopping may cause optimizer
to take fewer than this.)
train_v_iters (int): Number of gradient descent steps to take on
value function per epoch.
lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
close to 1.)
max_ep_len (int): Maximum length of trajectory / episode / rollout.
target_kl (float): Roughly what KL divergence we think is appropriate
between new and old policies after an update. (Usually small, 0.01 or 0.05.)
target_l2 (float): Trust-region radius passed to the CPO policy step.
cost_lim (float): Limit on the expected episode cost return that the
constrained policy update tries to respect.
logger_kwargs (dict): Keyword args for EpochLogger.
save_freq (int): How often (in terms of gap between epochs) to save
the current policy and value function.
"""
# Special function to avoid certain slowdowns from PyTorch + MPI combo.
setup_pytorch_for_mpi()
# Set up logger and save configuration
logger = EpochLogger(**logger_kwargs)
logger.save_config(locals())
writer = SummaryWriter(
'logs4/' + str(int(time.time()))
)
# Random seed
seed += 10000 * proc_id()
torch.manual_seed(seed)
np.random.seed(seed)
# Instantiate environment
env = env_fn()
obs_dim = env.observation_space.shape
act_dim = env.action_space.shape
# Create actor-critic module
# if not os.path.exists('safe_initial_policies/'+env_name+'.pt'):
# ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
# else:
# Commented for LOOP experiments
# if 'Grid' in env_name:
# ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
# else:
# ac = torch.load('safe_initial_policies/'+env_name+'.pt')
ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
# # print("Looking for safe policy at: {}".format())
# if not os.path.exists('safe_initial_policies/'+env_name+'.pt'):
# raise NotImplementedError('A safe initial policy was not found at {}'.format('safe_initial_policies/'+env_name+'.pt'))
# ac.load_state_dict(ac.state_dict(),'safe_initial_policies/'+env_name+'.pt')
# Sync params across processes
sync_params(ac)
# Count variables
var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.Qv1])
logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts)
# Set up experience buffer
local_steps_per_epoch = int(steps_per_epoch / num_procs())
buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)
# td3_buf = TD3Buffer(obs_dim[0], act_dim[0])
# Set up penalty params
soft_penalty = Variable(torch.exp(torch.Tensor([penalty_init]))-1, requires_grad=True)
penalty_optimizer = torch.optim.Adam([soft_penalty],lr=penalty_lr)
margin = 0
margin_lr = 0.05
learn_margin = False
constraint_violations = [0]
constraint_violations_count = [0]
def conjugate_gradients(Avp, b, nsteps, residual_tol=1e-10):
x = torch.zeros(b.size())
r = b.clone()
p = b.clone()
rdotr = torch.dot(r, r)
for i in range(nsteps):
_Avp = Avp(p)
alpha = rdotr / torch.dot(p, _Avp)
x += alpha * p
r -= alpha * _Avp
new_rdotr = torch.dot(r, r)
betta = new_rdotr / rdotr
p = r + betta * p
rdotr = new_rdotr
if rdotr < residual_tol:
break
return x
def linesearch(model,
f,
get_kl,
optim_case,
c,
x,
fullstep,
max_kl,
max_backtracks=10,
accept_ratio=.1):
fval1, fval2 = f()[0].data,f()[1].data
old_model = copy.deepcopy(model)
# print("fval before", fval.item())
for (_n_backtracks, stepfrac) in enumerate(.5**np.arange(max_backtracks)):
xnew = x + stepfrac * fullstep
set_flat_params_to(model, xnew)
newfval1, newfval2 = f()[0].data, f()[1].data
kl_change = get_kl(old_model=old_model).mean()
if (kl_change <= max_kl and
(newfval1 <= fval1 if optim_case > 1 else True) and
newfval2 - fval2 <= max(-c,0)):
return True, xnew
return False, x
def trust_region_step(model, get_loss, get_kl, max_kl, damping):
# global margin, margin_lr, learn_margin
loss_pi, loss_j = get_loss()
grads = torch.autograd.grad(loss_pi, model.parameters())
loss_grad = torch.cat([grad.view(-1) for grad in grads]).data
grads_j = torch.autograd.grad(loss_j, model.parameters())
loss_grad_j = torch.cat([grad.view(-1) for grad in grads_j]).data
c = ac.epsilon/(gamma-1) # equals curr_cost - cost_lim; c > 0 means the cost constraint is violated
rescale = 100
# TODO correct this
margin = 0
margin_lr = 0.05
learn_margin = False
if learn_margin:
margin+=margin_lr*c
margin = max(0,margin)
# TODO: mpi_Avg margin?
c += margin
c/= (rescale+1e-8)
def Fvp(v):
kl = get_kl()
kl = kl.mean()
grads = torch.autograd.grad(kl, model.parameters(), create_graph=True)
flat_grad_kl = torch.cat([grad.view(-1) for grad in grads])
kl_v = (flat_grad_kl * Variable(v)).sum()
grads = torch.autograd.grad(kl_v, model.parameters())
flat_grad_grad_kl = torch.cat([grad.contiguous().view(-1) for grad in grads]).data
return flat_grad_grad_kl + v * damping
v = conjugate_gradients(Fvp, -loss_grad, 10)
approx_g = Fvp(v)
q = np.dot(v, approx_g)
# Determine optim_case (switch condition for calculation,
# based on geometry of constrained optimization problem)
if np.dot(loss_grad_j,loss_grad_j) <= 1e-8 and c < 0:
# feasible and cost grad is zero---shortcut to pure TRPO update!
w, r, s, A, B = 0, 0, 0, 0, 0
optim_case = 4
else:
# cost grad is nonzero: CPO update!
w = conjugate_gradients(Fvp, -loss_grad_j, 10)
r = np.dot(w, approx_g) # b^T H^{-1} g
s = np.dot(w, Fvp(w)) # b^T H^{-1} b
A = q - r**2 / s # should always be positive (Cauchy-Schwarz)
B = 2*max_kl - c**2 / s # does safety boundary intersect trust region? (positive = yes)
if c < 0 and B < 0:
# point in trust region is feasible and safety boundary doesn't intersect
# ==> entire trust region is feasible
optim_case = 3
elif c < 0 and B >= 0:
# x = 0 is feasible and safety boundary intersects
# ==> most of trust region is feasible
optim_case = 2
elif c >= 0 and B >= 0:
# x = 0 is infeasible and safety boundary intersects
# ==> part of trust region is feasible, recovery possible
optim_case = 1
print("Alert! Attempting feasible recovery!")
# self.logger.log('Alert! Attempting feasible recovery!', 'yellow')
else:
# x = 0 infeasible, and safety halfspace is outside trust region
# ==> whole trust region is infeasible, try to fail gracefully
optim_case = 0
print("Alert! Attempting infeasible recovery!")
# self.logger.log('Alert! Attempting infeasible recovery!', 'red')
if optim_case in [3,4]:
lam = np.sqrt(q / (2*max_kl))
nu = 0
elif optim_case in [1,2]:
LA, LB = [0, r /c], [r/c, np.inf]
LA, LB = (LA, LB) if c < 0 else (LB, LA)
proj = lambda x, L : max(L[0], min(L[1], x))
lam_a = proj(np.sqrt(A/B), LA)
lam_b = proj(np.sqrt(q/(2*max_kl)), LB)
f_a = lambda lam : -0.5 * (A / (lam+1e-8) + B * lam) - r*c/(s+1e-8)
f_b = lambda lam : -0.5 * (q / (lam+1e-8) + 2 * max_kl * lam) # use the same trust-region radius as lam_b and B above
lam = lam_a if f_a(lam_a) >= f_b(lam_b) else lam_b
nu = max(0, lam * c - r) / (s + 1e-8)
else:
lam = 0
nu = np.sqrt(2 * max_kl / (s+1e-8))
# normal step if optim_case > 0, but for optim_case =0,
# perform infeasible recovery: step to purely decrease cost
x = (1./(lam+1e-8)) * (v + nu * w) if optim_case > 0 else nu * w
if optim_case==0:
logger.store(Backtrack=-1)
else:
logger.store(Backtrack=1)
prev_params = get_flat_params_from(model)
success, new_params = linesearch(model, get_loss, get_kl, optim_case, c, prev_params, x, max_kl)
set_flat_params_to(model, new_params)
return loss_pi
# Set up function for computing PPO policy loss
def compute_loss_pi(data, epoch_no=1):
obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp']
def get_kl(old_mean=None, new_mean=None, old_model=None):
if old_mean is None:
mean1 = ac.pi(obs)
else:
mean1 = old_mean
if old_model is not None:
mean1 = old_model(obs)
log_std1, std1 = -2.99, 0.05 # fixed std used for the KL proxy of the deterministic policy (exp(-2.99) ~= 0.05)
if new_mean is None:
mean0 = torch.autograd.Variable(mean1.data)
else:
mean0 = new_mean
log_std0 = -2.99
std0 = 0.05
kl = log_std1 - log_std0 + (std0**2 + (mean0 - mean1).pow(2)) / (2.0 * std1**2) - 0.5
return kl.sum(1, keepdim=True)
def get_loss_pi():
# if ac.epsilon<0:
# loss_pi = (ac.Qj1(torch.cat((obs, ac.pi(obs)),dim=1))).mean()
# else:
# loss_pi = - (ac.Qv1(torch.cat((obs, ac.pi(obs)),dim=1))).mean()
loss_pi = -(ac.Qv1(torch.cat((obs, ac.pi(obs)),dim=1))).mean()
loss_j = (ac.Qj1(torch.cat((obs, ac.pi(obs)),dim=1))).mean()
# loss_pi = (ac.Qj1(torch.cat((obs, ac.pi(obs)),dim=1))).mean()
return loss_pi, loss_j
old_mean = ac.pi(obs).detach().data
loss_pi = trust_region_step(ac.pi, get_loss_pi, get_kl, target_l2, 0.1)
# Useful extra info
approx_l2 = torch.sqrt(torch.mean((ac.pi(obs) - data['old_act'])**2)).item()
approx_kl = get_kl(old_mean = old_mean, new_mean=ac.pi(obs).detach()).mean().item()
# ent = pi.entropy().mean().item()
ent = 0
clipped = [0]
clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
pi_info = dict(kl=approx_kl,l2=approx_l2, ent=ent, cf=clipfrac)
return loss_pi, pi_info
# Set up function for computing value loss
def compute_loss_v(data):
obs, act, ret = data['obs'], data['act'], data['ret']
return ((ac.Qv1(torch.cat((obs,act),dim=1)) - ret)**2).mean(), ((ac.Qv2(torch.cat((obs,act),dim=1)) - ret)**2).mean()
# Set up function for computing value loss
def compute_loss_j(data):
obs, act, cost_ret = data['obs'], data['act'], data['cost_ret']
return ((ac.Qj1(torch.cat((obs,act),dim=1)) - cost_ret)**2).mean(), ((ac.Qj2(torch.cat((obs,act),dim=1)) - cost_ret)**2).mean()
# Set up optimizers for policy and value function
pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
pi_bc_optimizer = Adam(ac.pi.parameters(), lr=0.001)
vf1_optimizer = Adam(ac.Qv1.parameters(), lr=vf_lr)
vf2_optimizer = Adam(ac.Qv2.parameters(), lr=vf_lr)
jf1_optimizer = Adam(ac.Qj1.parameters(), lr=jf_lr)
jf2_optimizer = Adam(ac.Qj2.parameters(), lr=jf_lr)
# Set up model saving
logger.setup_pytorch_saver(ac)
def update(epoch_no, constraint_violations, constraint_violations_count):
# global soft_penalty, penalty_optimizer
data = buf.get()
# Update the penalty
curr_cost = logger.get_stats('EpCostRet')[0]
if curr_cost-cost_lim>0:
# constraint_violations_count.append(1)
# constraint_violations.append(curr_cost-cost_lim)
logger.log('Warning! Safety constraint is already violated.', 'red')
ac.epsilon = (1-gamma)*(cost_lim-curr_cost)
if epoch_no==0 or ac.epsilon>=0:
ac.baseline_pi = copy.deepcopy(ac.pi)
ac.baseline_Qj = copy.deepcopy(ac.Qj1)
# pi_l_old, pi_info_old = compute_loss_pi(data)
# pi_l_old = pi_l_old.item()
# v_l_old, _ = compute_loss_v(data)
# v_l_old = v_l_old.item()
# j_l_old, _ = compute_loss_j(data)
# j_l_old = j_l_old.item()
pi_l_old, v_l_old, j_l_old = 0, 0, 0
pi_info_old = dict(kl=0,l2=0, ent=0, cf=0)
if epoch_no==0:
for i in range(train_v_iters):
vf1_optimizer.zero_grad()
vf2_optimizer.zero_grad()
loss_v1, loss_v2 = compute_loss_v(data)
loss_v1.backward()
loss_v2.backward()
mpi_avg_grads(ac.Qv1) # average grads across MPI processes
mpi_avg_grads(ac.Qv2)
vf1_optimizer.step()
vf2_optimizer.step()
jf1_optimizer.zero_grad()
jf2_optimizer.zero_grad()
loss_j1, loss_j2 = compute_loss_j(data)
loss_j1.backward()
loss_j2.backward()
mpi_avg_grads(ac.Qj1) # average grads across MPI processes
mpi_avg_grads(ac.Qj2)
jf1_optimizer.step()
jf2_optimizer.step()
# Trust region update for policy
loss_pi, pi_info = compute_loss_pi(data, epoch_no = epoch_no)
logger.store(StopIter=0)
# Value and Cost Value function learning
for i in range(train_v_iters):
vf1_optimizer.zero_grad()
vf2_optimizer.zero_grad()
loss_v1, loss_v2 = compute_loss_v(data)
loss_v1.backward()
loss_v2.backward()
mpi_avg_grads(ac.Qv1) # average grads across MPI processes
mpi_avg_grads(ac.Qv2)
vf1_optimizer.step()
vf2_optimizer.step()
jf1_optimizer.zero_grad()
jf2_optimizer.zero_grad()
loss_j1, loss_j2 = compute_loss_j(data)
loss_j1.backward()
loss_j2.backward()
mpi_avg_grads(ac.Qj1) # average grads across MPI processes
mpi_avg_grads(ac.Qj2)
jf1_optimizer.step()
jf2_optimizer.step()
# Log changes from update
kl,l2, ent, cf = pi_info['kl'],pi_info['l2'], pi_info_old['ent'], pi_info['cf']
logger.store(LossPi=pi_l_old, LossV=v_l_old, LossJ= j_l_old,
KL=kl, L2=l2, Entropy=ent, ClipFrac=cf,
DeltaLossPi=(loss_pi.item() - pi_l_old),
DeltaLossV=(loss_v1.item() - v_l_old),
DeltaLossJ=(loss_j1.item() - j_l_old),
Penalty=torch.nn.functional.softplus(soft_penalty))
# Prepare for interaction with environment
start_time = time.time()
o, ep_ret,ep_cost_ret, ep_len = env.reset(), 0, 0, 0
# Main loop: collect experience in env and update/log each epoch
for epoch in tqdm(range(epochs)):
for t in range(local_steps_per_epoch):
a, v, j, logp = ac.step(torch.as_tensor(o, dtype=torch.float32))
noise = 0.05 * np.random.randn(*a.shape)
# print("Action: {}".format(a+noise))
a = a + noise
next_o, r, d, info = env.step(a)
ep_ret += r
ep_cost_ret += info.get('cost', 0)
ep_len += 1
# save and log
buf.store(o, a, r, info.get('cost', 0), v, j, logp, a)
# td3_buf.add(o, a,next_o, r, info.get('cost', 0),d)
logger.store(VVals=v, JVals = j)
# Update obs (critical!)
o = next_o
timeout = ep_len == max_ep_len
terminal = d or timeout
epoch_ended = t==local_steps_per_epoch-1
if terminal or epoch_ended:
if epoch_ended and not(terminal):
print('Warning: trajectory cut off by epoch at %d steps.'%ep_len, flush=True)
# if trajectory didn't reach terminal state, bootstrap value target
if timeout or epoch_ended:
_, v, j, _ = ac.step(torch.as_tensor(o, dtype=torch.float32))
else:
v, j = 0, 0
buf.finish_path(v, j)
if terminal:
# only save EpRet / EpLen if trajectory finished
logger.store(EpRet=ep_ret, EpCostRet=ep_cost_ret, EpLen=ep_len)
o, ep_ret , ep_cost_ret, ep_len = env.reset(), 0, 0, 0
# Save model
if (epoch % save_freq == 0) or (epoch == epochs-1):
logger.save_state({'env': env}, None)
# Perform PPO update!
update(epoch, constraint_violations, constraint_violations_count)
# Log info about epoch
logger.log_tabular('Epoch', epoch)
logger.log_tabular('EpRet', with_min_and_max=True)
logger.log_tabular('EpCostRet', with_min_and_max=True)
logger.log_tabular('EpLen', average_only=True)
logger.log_tabular('VVals', with_min_and_max=True)
logger.log_tabular('JVals', with_min_and_max=True)
logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch)
logger.log_tabular('LossPi', average_only=True)
logger.log_tabular('LossV', average_only=True)
logger.log_tabular('LossJ', average_only=True)
logger.log_tabular('DeltaLossPi', average_only=True)
logger.log_tabular('DeltaLossV', average_only=True)
logger.log_tabular('DeltaLossJ', average_only=True)
logger.log_tabular('Entropy', average_only=True)
logger.log_tabular('KL', average_only=True)
# logger.log_tabular('L2', average_only=True)
logger.log_tabular('ClipFrac', average_only=True)
logger.log_tabular('StopIter', average_only=True)
logger.log_tabular('Penalty', average_only=True)
logger.log_tabular('Backtrack', average_only=True)
logger.log_tabular('Time', time.time()-start_time)
#breakpoint()
# Log the average episode return and length using TensorBoard
rv = logger.dump_tabular(IO=False) # local tweak to Logger.dump_tabular: IO=False returns the row as a dict without printing
writer.add_scalar('AvgEpRet', rv['AverageEpRet'], epoch)
writer.add_scalar('AvgEpLen', rv['EpLen'], epoch)
writer.add_scalar('AvgEpCost', rv['AverageEpCostRet'], epoch)
#print(rv)
#TODO: get out the episode return and length (average)
#writer.add_scalar('EpRet', logger.epoch_dict['EpRet'][0], epoch)
#writer.add_scalar('EpLen', logger.epoch_dict['EpLen'], epoch)
writer.close()
print("Total constraint violations were: {} with total violation cost: {}".format(sum(constraint_violations_count), sum(constraint_violations)))
return ac
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--env', type=str, default='Safexp-PointGoal1-v0')
parser.add_argument('--hid', type=int, default=256)
parser.add_argument('--l', type=int, default=2)
parser.add_argument('--gamma', type=float, default=0.99)
parser.add_argument('--target_l2', type=float, default=0.012)
parser.add_argument('--cost_lim', type=float, default=25)
parser.add_argument('--seed', '-s', type=int, default=0)
parser.add_argument('--cpu', type=int, default=3)
parser.add_argument('--steps', type=int, default=30000)
parser.add_argument('--epochs', type=int, default=3000)
parser.add_argument('--exp_name', type=str, default='td3_dump')
args = parser.parse_args()
mpi_fork(args.cpu) # run parallel code with mpi
from ppo_utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed)
env_fn = lambda : gym.make(args.env)
cpo(env_fn, env_name= args.env, actor_critic=core.MLPActorCriticTD3trust,
ac_kwargs=dict(hidden_sizes=[args.hid]*args.l), gamma=args.gamma,
seed=args.seed, steps_per_epoch=args.steps, epochs=args.epochs, target_l2=args.target_l2,cost_lim=args.cost_lim,
logger_kwargs=logger_kwargs)
#!/usr/bin/env python
# coding: utf-8
# In[25]:
import numpy as np
from skimage.io import imshow
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from tqdm import tqdm
import torch
import random  # needed for random.seed() in GEC.reset()
rp = np.load('test_prob.npy')
rp.shape
# In[2]:
imshow(rp)
# In[3]:
r0 = np.load('robot0_waypoints.npy')
r1 = np.load('robot1_waypoints.npy')
r2 = np.load('robot2_waypoints.npy')
plt.imshow(rp)
plt.scatter(*r0.T, color='r')
plt.scatter(*r1.T, color='r')
plt.scatter(*r2.T, color='r')
plt.vlines([400, 800], 0, 1200, color='pink')
# In[4]:
plt.plot(np.sum(1 - rp, axis=0))
plt.vlines([400, 800], 0, 1200, color='pink')
# In[5]:
#rew
def f(x, t=0.8):
if x < -t:
return -1
elif x > t:
return 1
return 1/t * x
X = np.linspace(-1, 1, 64)
Y1 = [f(x, t=0.6) for x in X]
Y2 = [f(x, t=0.8) for x in X]
plt.plot(Y1)
plt.plot(Y2)
# In[6]:
from gym import spaces, Env
CW = 400
CH = 1200
MAX_STEPS_PER_EPISODE = 100
MAX_FORWARD_REWARD_THRESHOLD = 0.70 #Give us the 'leeway' to make some horizontal variation
MAX_UNITS_TRAVELED_PER_STEP = CH / (MAX_STEPS_PER_EPISODE * MAX_FORWARD_REWARD_THRESHOLD)
H_REWARD_SCALE = 5 #TODO: tune this
R_REWARD_SCALE = 2 #TODO: tune this
#Normalize paths and regions to [-1, 1]
region0 = rp[:, 0: 400]
region1 = rp[:, 400:800]
region2 = rp[:, 800:1200]
def uv2xy(u, v):
# Map coordinates from the double unit square (-1, -1) to (1, 1)
# to the rectangle (0, 0), (CW, CH)
x = (u + 1) * CW / 2
y = (v + 1) * CH / 2
return [x, y]
def xy2uv(x, y):
# Map coordinates from the rectangle (0, 0), (CW, CH)
# to the double unit square (-1, -1) to (1, 1)
u = (x / CW) * 2 - 1
v = (y / CH) * 2 - 1
return [u, v]
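# Illustrative check (not in the original notebook): the two mappings above are
# inverses, and the centre of the (CW, CH) rectangle corresponds to (0, 0) in
# normalized coordinates.
assert np.allclose(uv2xy(0.0, 0.0), [CW / 2, CH / 2])
assert np.allclose(xy2uv(*uv2xy(0.3, -0.7)), [0.3, -0.7])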
class GEC(Env):
def __init__(self, track, new=True):
self.action_space = spaces.Box(low=-1, high=1, shape=(2,), dtype=float)
self.observation_space = spaces.Box(low=-1, high=1, shape=(2,), dtype=float)
self.track = np.array([xy2uv(*c) for c in track])
self.new = new
self.reset()
def reset(self, seed=None, options=None):
if seed:
np.random.seed(seed)
random.seed(seed)
self.pos_x = 0.0
self.pos_y = 1.0
self.current_step = 0
#print(self.new)
if self.new:
return self._get_obs(), {}
return self._get_obs()
def _get_obs(self):
return np.array([self.pos_x, self.pos_y])
def step(self, action):
action_scaled = action * (2 * MAX_UNITS_TRAVELED_PER_STEP / max(CW, CH)) #hacky but we'll take it
self.pos_x = np.clip(self.pos_x + action_scaled[0], -1, 1)
self.pos_y = np.clip(self.pos_y + action_scaled[1], -1, 1)
#get to the end
reward_vertical = -f(action[1], t=MAX_FORWARD_REWARD_THRESHOLD)
#follow the LPM track
'''
Example:
#self.track = [[ 0. -1. ]
[ 0.005 -0.97333333]
[ 0.01 -0.94833333]
[ 0.015 -0.92166667]
[ 0.02 -0.89666667]
[ 0.025 -0.87166667]...
self.pos_x = 0.3
self.pos_y = -0.96
calculate the indices of the track path closest to the y position (so, surrounding -0.96)
naturally these are -0.97 and -0.94
calculate the line between
[ 0.005 -0.97333333]
[ 0.01 -0.94833333]
reward (penalty) is the L2 distance from pos(x, y) to the line segment
'''
# Find indices of adjacent 'y'
idx = np.searchsorted(self.track[:, 1], self.pos_y, side='right') - 1
idx = np.clip(idx, 0, len(self.track) - 2)
# Compute line over adjacency
p1 = self.track[idx]
p2 = self.track[idx + 1]
line = p2 - p1
# Compute distance perpendicular to line
pos = np.array([self.pos_x, self.pos_y])
line_norm = line / np.linalg.norm(line)
proj = np.dot(pos - p1, line_norm) * line_norm
closest_point = p1 + proj
penalty_horizontal = np.linalg.norm(pos - closest_point)
penalty_ring = 1 - rp[
int(0.5 * (self.pos_x + 1) * 0.33 * (CW-1)),
int(0.5 * (self.pos_y + 1) * 1.00 * (CH-1))
]
reward = reward_vertical - H_REWARD_SCALE * penalty_horizontal #- R_REWARD_SCALE * penalty_ring
# Increment step counter
self.current_step += 1
# Calculate 'truncated'
truncated = self.current_step >= MAX_STEPS_PER_EPISODE
# Calculate 'terminated'
terminated = self.pos_y < -0.999
if self.new:
return self._get_obs(), reward, bool(terminated), bool(truncated), {}
return self._get_obs(), reward, bool(terminated) or bool(truncated), {}
def render(self, mode='human'):
pass
def close(self):
pass
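# Illustrative sketch (not in the original notebook): the track-following
# penalty inside GEC.step() projects the agent position onto the local track
# segment and takes the Euclidean distance to the projection. The standalone
# helper below (hypothetical name) repeats that computation.
def _distance_to_track(track, pos_x, pos_y):
    idx = np.searchsorted(track[:, 1], pos_y, side='right') - 1
    idx = np.clip(idx, 0, len(track) - 2)
    p1, p2 = track[idx], track[idx + 1]
    line = p2 - p1
    line_norm = line / np.linalg.norm(line)
    pos = np.array([pos_x, pos_y])
    closest_point = p1 + np.dot(pos - p1, line_norm) * line_norm
    return np.linalg.norm(pos - closest_point)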
# In[7]:
#Sanity / 'dumb' environment test using naive policy
env = GEC(r0)
obs, info = env.reset()
path = [obs]
rews = []
done = False
while not done:
obs, rew, term, trunc, info = env.step(np.array([0, -1]))
done = term or trunc
path.append(obs)
rews.append(rew)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
# Scatter plot of path and track in the first subplot
ax1.scatter(*(np.array(path)).T, label='Path')
ax1.scatter(*env.track.T, label='Track')
ax1.set_xlabel('X')
ax1.set_ylabel('Y')
ax1.set_title('Path and Track')
ax1.legend()
# Plot of rewards in the second subplot
ax2.plot(rews)
ax2.set_xlabel('Step')
ax2.set_ylabel('Reward')
ax2.set_title('Rewards')
plt.tight_layout()
plt.show()
# In[10]:
from stable_baselines3 import PPO
#Train PPO model
train = False
if train:
log_dir = "logs3/" #tensorboard --logdir logs3
model = PPO("MlpPolicy", env, verbose=0, tensorboard_log=log_dir, n_steps = 8192, batch_size = 2048)
model.learn(total_timesteps=500000, progress_bar=True)
model.save('final4')
else:
model = PPO.load('final4')
# In[16]:
obs, info = env.reset()
path = [obs]
rews = []
done = False
while not done:
actions, _states = model.predict(obs)
obs, rew, term, trunc, info = env.step(actions)
done = term or trunc
path.append(obs)
rews.append(rew)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
# Scatter plot of path and track in the first subplot
ax1.scatter(*(np.array(path)).T, label='Path')
ax1.scatter(*env.track.T, label='Track')
ax1.set_xlabel('X')
ax1.set_ylabel('Y')
ax1.set_title('Path and Track')
ax1.legend()
# Plot of rewards in the second subplot
ax2.plot(rews)
ax2.set_xlabel('Step')
ax2.set_ylabel('Reward')
ax2.set_title('Rewards')
plt.tight_layout()
plt.show()
# In[17]:
from gym import spaces, Env
CW = 400
CH = 1200
MAX_STEPS_PER_EPISODE = 100
MAX_FORWARD_REWARD_THRESHOLD = 0.70 #Give us the 'leeway' to make some horizontal variation
MAX_UNITS_TRAVELED_PER_STEP = CH / (MAX_STEPS_PER_EPISODE * MAX_FORWARD_REWARD_THRESHOLD)
H_REWARD_SCALE = 1.5 #TODO: tune this
#Normalize paths and regions to [-1, 1]
region0 = rp[:, 0: 400]
region1 = rp[:, 400:800]
region2 = rp[:, 800:1200]
def uv2xy(u, v):
# Map coordinates from the double unit square (-1, -1) to (1, 1)
# to the rectangle (0, 0), (CW, CH)
x = (u + 1) * CW / 2
y = (v + 1) * CH / 2
return [x, y]
def xy2uv(x, y):
# Map coordinates from the rectangle (0, 0), (CW, CH)
# to the double unit square (-1, -1) to (1, 1)
u = (x / CW) * 2 - 1
v = (y / CH) * 2 - 1
return [u, v]
class GEC2(Env):
def __init__(self, track):
self.action_space = spaces.Box(low=-1, high=1, shape=(2,), dtype=float)
self.observation_space = spaces.Box(low=-1, high=1, shape=(2,), dtype=float)
self.track = np.array([xy2uv(*c) for c in track])
self.reset()
def reset(self, seed=None, options=None):
self.pos_x = 0.0
self.pos_y = 1.0
self.current_step = 0
return self._get_obs()
def _get_obs(self):
return np.array([self.pos_x, self.pos_y])
def step(self, action):
action_scaled = action * (2 * MAX_UNITS_TRAVELED_PER_STEP / max(CW, CH)) #hacky but we'll take it
self.pos_x = np.clip(self.pos_x + action_scaled[0], -1, 1)
self.pos_y = np.clip(self.pos_y + action_scaled[1], -1, 1)
#get to the end
reward_vertical = -f(action[1], t=MAX_FORWARD_REWARD_THRESHOLD)
# Find indices of adjacent 'y'
idx = np.searchsorted(self.track[:, 1], self.pos_y, side='right') - 1
idx = np.clip(idx, 0, len(self.track) - 2)
# Compute line over adjacency
p1 = self.track[idx]
p2 = self.track[idx + 1]
line = p2 - p1
# Compute distance perpendicular to line
pos = np.array([self.pos_x, self.pos_y])
line_norm = line / np.linalg.norm(line)
proj = np.dot(pos - p1, line_norm) * line_norm
closest_point = p1 + proj
penalty_horizontal = np.linalg.norm(pos - closest_point)
penalty_ring = 1 - rp[
int(0.5 * (self.pos_x + 1) * 0.33 * (CW-1)),
int(0.5 * (self.pos_y + 1) * 1.00 * (CH-1))
]
info = {'cost':penalty_ring}
reward = reward_vertical - H_REWARD_SCALE * penalty_horizontal
# Increment step counter
self.current_step += 1
# Calculate 'terminated'
terminated = self.pos_y < -0.999
return self._get_obs(), reward, bool(terminated), info
def render(self, mode='human'):
pass
def close(self):
pass
# In[29]:
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
import cpo
import core
import cProfile
import pstats
import io
COST_THRESHOLD = 50 #sanity check to make sure things are working
def main():
J = cpo.cpo(
lambda: GEC2(r0),
actor_critic=core.MLPActorCriticTD3trust,
max_ep_len=MAX_STEPS_PER_EPISODE,
cost_lim=COST_THRESHOLD,
epochs=175,
steps_per_epoch=8000
)
return J
pr = cProfile.Profile()
pr.enable()
profile = False
if profile:
try:
main()
except KeyboardInterrupt:
print("\nKeyboard interrupt received. Printing stats...")
finally:
pr.disable()
s = io.StringIO()
ps = pstats.Stats(pr, stream=s).sort_stats("tottime")
ps.print_stats(32)
print(s.getvalue())
else:
J = main()
# In[37]:
env = GEC2(r0)
obs = env.reset()
path = [obs]
rews = []
done = False
steps = 0
while not done and steps <= MAX_STEPS_PER_EPISODE:
with torch.no_grad():
tt = torch.as_tensor(obs, dtype=torch.float32)
actions = J.pi(tt)
#print(actions)
obs, rew, term, info = env.step(actions.numpy())
done = term
path.append(obs)
rews.append(rew)
steps += 1
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
# Scatter plot of path and track in the first subplot
#ax1.scatter(*(np.array(path)).T, label='Path')
ax1.scatter(*env.track.T, label='Track')
path_array = np.array(path)
directions = np.diff(path_array, axis=0)
arrow_positions = path_array[:-1] + directions / 2
ax1.quiver(*arrow_positions.T, *directions.T, scale_units='xy', angles='xy', scale=1, color='r', width=0.005)
ax1.set_xlabel('X')
ax1.set_ylabel('Y')
ax1.set_title('Path and Track')
ax1.legend()
# Plot of rewards in the second subplot
ax2.plot(rews)
ax2.set_xlabel('Step')
ax2.set_ylabel('Reward')
ax2.set_title('Rewards')
plt.tight_layout()
plt.show()
"""
Some simple logging functionality, inspired by rllab's logging.
Logs to a tab-separated-values file (path/to/output_directory/progress.txt)
"""
import json
import joblib
import shutil
import numpy as np
#import tensorflow as tf #lmao -- left disabled, so the tf-based save/restore helpers below will raise NameError if called
import torch
import os.path as osp, time, atexit, os
import warnings
from .mpi_tools import proc_id, mpi_statistics_scalar
from .serialization_utils import convert_json
color2num = dict(
gray=30,
red=31,
green=32,
yellow=33,
blue=34,
magenta=35,
cyan=36,
white=37,
crimson=38
)
def colorize(string, color, bold=False, highlight=False):
"""
Colorize a string.
This function was originally written by John Schulman.
"""
attr = []
num = color2num[color]
if highlight: num += 10
attr.append(str(num))
if bold: attr.append('1')
return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string)
def restore_tf_graph(sess, fpath):
"""
Loads graphs saved by Logger.
Will output a dictionary whose keys and values are from the 'inputs'
and 'outputs' dict you specified with logger.setup_tf_saver().
Args:
sess: A Tensorflow session.
fpath: Filepath to save directory.
Returns:
A dictionary mapping from keys to tensors in the computation graph
loaded from ``fpath``.
"""
tf.saved_model.loader.load(
sess,
[tf.saved_model.tag_constants.SERVING],
fpath
)
model_info = joblib.load(osp.join(fpath, 'model_info.pkl'))
graph = tf.get_default_graph()
model = dict()
model.update({k: graph.get_tensor_by_name(v) for k,v in model_info['inputs'].items()})
model.update({k: graph.get_tensor_by_name(v) for k,v in model_info['outputs'].items()})
return model
class Logger:
"""
A general-purpose logger.
Makes it easy to save diagnostics, hyperparameter configurations, the
state of a training run, and the trained model.
"""
def __init__(self, output_dir=None, output_fname='progress.txt', exp_name=None):
"""
Initialize a Logger.
Args:
output_dir (string): A directory for saving results to. If
``None``, defaults to a temp directory of the form
``/tmp/experiments/somerandomnumber``.
output_fname (string): Name for the tab-separated-value file
containing metrics logged throughout a training run.
Defaults to ``progress.txt``.
exp_name (string): Experiment name. If you run multiple training
runs and give them all the same ``exp_name``, the plotter
will know to group them. (Use case: if you run the same
hyperparameter configuration with multiple random seeds, you
should give them all the same ``exp_name``.)
"""
if proc_id()==0:
self.output_dir = output_dir or "/tmp/experiments/%i"%int(time.time())
if osp.exists(self.output_dir):
print("Warning: Log dir %s already exists! Storing info there anyway."%self.output_dir)
else:
os.makedirs(self.output_dir)
self.output_file = open(osp.join(self.output_dir, output_fname), 'w')
atexit.register(self.output_file.close)
print(colorize("Logging data to %s"%self.output_file.name, 'green', bold=True))
else:
self.output_dir = None
self.output_file = None
self.first_row=True
self.log_headers = []
self.log_current_row = {}
self.exp_name = exp_name
def log(self, msg, color='green'):
"""Print a colorized message to stdout."""
if proc_id()==0:
print(colorize(msg, color, bold=True))
def log_tabular(self, key, val):
"""
Log a value of some diagnostic.
Call this only once for each diagnostic quantity, each iteration.
After using ``log_tabular`` to store values for each diagnostic,
make sure to call ``dump_tabular`` to write them out to file and
stdout (otherwise they will not get saved anywhere).
"""
if self.first_row:
self.log_headers.append(key)
else:
assert key in self.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key
assert key not in self.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()"%key
self.log_current_row[key] = val
def save_config(self, config):
"""
Log an experiment configuration.
Call this once at the top of your experiment, passing in all important
config vars as a dict. This will serialize the config to JSON, while
handling anything which can't be serialized in a graceful way (writing
as informative a string as possible).
Example use:
.. code-block:: python
logger = EpochLogger(**logger_kwargs)
logger.save_config(locals())
"""
config_json = convert_json(config)
if self.exp_name is not None:
config_json['exp_name'] = self.exp_name
if proc_id()==0:
output = json.dumps(config_json, separators=(',',':\t'), indent=4, sort_keys=True)
print(colorize('Saving config:\n', color='cyan', bold=True))
print(output)
with open(osp.join(self.output_dir, "config.json"), 'w') as out:
out.write(output)
def save_state(self, state_dict, itr=None):
"""
Saves the state of an experiment.
To be clear: this is about saving *state*, not logging diagnostics.
All diagnostic logging is separate from this function. This function
will save whatever is in ``state_dict``---usually just a copy of the
environment---and the most recent parameters for the model you
previously set up saving for with ``setup_tf_saver``.
Call with any frequency you prefer. If you only want to maintain a
single state and overwrite it at each call with the most recent
version, leave ``itr=None``. If you want to keep all of the states you
save, provide unique (increasing) values for 'itr'.
Args:
state_dict (dict): Dictionary containing essential elements to
describe the current state of training.
itr: An int, or None. Current iteration of training.
"""
if proc_id()==0:
fname = 'vars.pkl' if itr is None else 'vars%d.pkl'%itr
try:
joblib.dump(state_dict, osp.join(self.output_dir, fname))
except:
self.log('Warning: could not pickle state_dict.', color='red')
if hasattr(self, 'tf_saver_elements'):
self._tf_simple_save(itr)
if hasattr(self, 'pytorch_saver_elements'):
self._pytorch_simple_save(itr)
def setup_tf_saver(self, sess, inputs, outputs):
"""
Set up easy model saving for tensorflow.
Call once, after defining your computation graph but before training.
Args:
sess: The Tensorflow session in which you train your computation
graph.
inputs (dict): A dictionary that maps from keys of your choice
to the tensorflow placeholders that serve as inputs to the
computation graph. Make sure that *all* of the placeholders
needed for your outputs are included!
outputs (dict): A dictionary that maps from keys of your choice
to the outputs from your computation graph.
"""
self.tf_saver_elements = dict(session=sess, inputs=inputs, outputs=outputs)
self.tf_saver_info = {'inputs': {k:v.name for k,v in inputs.items()},
'outputs': {k:v.name for k,v in outputs.items()}}
def _tf_simple_save(self, itr=None):
"""
Uses simple_save to save a trained model, plus info to make it easy
to associate tensors to variables after restore.
"""
if proc_id()==0:
assert hasattr(self, 'tf_saver_elements'), \
"First have to setup saving with self.setup_tf_saver"
fpath = 'tf1_save' + ('%d'%itr if itr is not None else '')
fpath = osp.join(self.output_dir, fpath)
if osp.exists(fpath):
# simple_save refuses to be useful if fpath already exists,
# so just delete fpath if it's there.
shutil.rmtree(fpath)
tf.saved_model.simple_save(export_dir=fpath, **self.tf_saver_elements)
joblib.dump(self.tf_saver_info, osp.join(fpath, 'model_info.pkl'))
def setup_pytorch_saver(self, what_to_save):
"""
Set up easy model saving for a single PyTorch model.
Because PyTorch saving and loading is especially painless, this is
very minimal; we just need references to whatever we would like to
pickle. This is integrated into the logger because the logger
knows where the user would like to save information about this
training run.
Args:
what_to_save: Any PyTorch model or serializable object containing
PyTorch models.
"""
self.pytorch_saver_elements = what_to_save
def _pytorch_simple_save(self, itr=None):
"""
Saves the PyTorch model (or models).
"""
if proc_id()==0:
assert hasattr(self, 'pytorch_saver_elements'), \
"First have to setup saving with self.setup_pytorch_saver"
fpath = 'pyt_save'
fpath = osp.join(self.output_dir, fpath)
fname = 'model' + ('%d'%itr if itr is not None else '') + '.pt'
fname = osp.join(fpath, fname)
os.makedirs(fpath, exist_ok=True)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
# We are using a non-recommended way of saving PyTorch models,
# by pickling whole objects (which are dependent on the exact
# directory structure at the time of saving) as opposed to
# just saving network weights. This works sufficiently well
# for the purposes of Spinning Up, but you may want to do
# something different for your personal PyTorch project.
# We use a catch_warnings() context to avoid the warnings about
# not being able to save the source code.
torch.save(self.pytorch_saver_elements, fname)
def dump_tabular(self, IO=True):
"""
Write all of the diagnostics from the current iteration.
Writes both to stdout, and to the output file.
"""
rv = {}
if proc_id()==0:
vals = []
key_lens = [len(key) for key in self.log_headers]
max_key_len = max(15,max(key_lens))
keystr = '%'+'%d'%max_key_len
fmt = "| " + keystr + "s | %15s |"
n_slashes = 22 + max_key_len
if IO:
print("-"*n_slashes)
for key in self.log_headers:
val = self.log_current_row.get(key, "")
rv[key] = val
valstr = "%8.3g"%val if hasattr(val, "__float__") else val
if IO:
print(fmt%(key, valstr))
vals.append(val)
if IO:
print("-"*n_slashes, flush=True)
if self.output_file is not None and IO:
if self.first_row:
self.output_file.write("\t".join(self.log_headers)+"\n")
self.output_file.write("\t".join(map(str,vals))+"\n")
self.output_file.flush()
self.log_current_row.clear()
self.first_row=False
return rv
class EpochLogger(Logger):
"""
A variant of Logger tailored for tracking average values over epochs.
Typical use case: there is some quantity which is calculated many times
throughout an epoch, and at the end of the epoch, you would like to
report the average / std / min / max value of that quantity.
With an EpochLogger, each time the quantity is calculated, you would
use
.. code-block:: python
epoch_logger.store(NameOfQuantity=quantity_value)
to load it into the EpochLogger's state. Then at the end of the epoch, you
would use
.. code-block:: python
epoch_logger.log_tabular(NameOfQuantity, **options)
to record the desired values.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.epoch_dict = dict()
def store(self, **kwargs):
"""
Save something into the epoch_logger's current state.
Provide an arbitrary number of keyword arguments with numerical
values.
"""
for k,v in kwargs.items():
if not(k in self.epoch_dict.keys()):
self.epoch_dict[k] = []
self.epoch_dict[k].append(v)
def log_tabular(self, key, val=None, with_min_and_max=False, average_only=False):
"""
Log a value or possibly the mean/std/min/max values of a diagnostic.
Args:
key (string): The name of the diagnostic. If you are logging a
diagnostic whose state has previously been saved with
``store``, the key here has to match the key you used there.
val: A value for the diagnostic. If you have previously saved
values for this key via ``store``, do *not* provide a ``val``
here.
with_min_and_max (bool): If true, log min and max values of the
diagnostic over the epoch.
average_only (bool): If true, do not log the standard deviation
of the diagnostic over the epoch.
"""
if val is not None:
super().log_tabular(key,val)
else:
v = self.epoch_dict[key]
vals = np.concatenate(v) if isinstance(v[0], np.ndarray) and len(v[0].shape)>0 else v
stats = mpi_statistics_scalar(vals, with_min_and_max=with_min_and_max)
super().log_tabular(key if average_only else 'Average' + key, stats[0])
if not(average_only):
super().log_tabular('Std'+key, stats[1])
if with_min_and_max:
super().log_tabular('Max'+key, stats[3])
super().log_tabular('Min'+key, stats[2])
self.epoch_dict[key] = []
def get_stats(self, key):
"""
Lets an algorithm ask the logger for mean/std/min/max of a diagnostic.
"""
v = self.epoch_dict[key]
vals = np.concatenate(v) if isinstance(v[0], np.ndarray) and len(v[0].shape)>0 else v
return mpi_statistics_scalar(vals)
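# Illustrative sketch (not in the original file): the typical EpochLogger
# pattern described in the class docstring above. The output directory is a
# placeholder; dump_tabular() writes to stdout and progress.txt.
def _example_epoch_logger():
    logger = EpochLogger(output_dir='/tmp/epoch_logger_example')
    for ret in [1.0, 2.0, 3.0]:
        logger.store(EpRet=ret)                          # accumulate values during the epoch
    logger.log_tabular('Epoch', 0)                       # log a raw value
    logger.log_tabular('EpRet', with_min_and_max=True)   # log mean/std/min/max of stored values
    logger.dump_tabular()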
import multiprocessing
import numpy as np
import os
import torch
from mpi4py import MPI
from .mpi_tools import broadcast, mpi_avg, num_procs, proc_id
def setup_pytorch_for_mpi():
"""
Avoid slowdowns caused by each separate process's PyTorch using
more than its fair share of CPU resources.
"""
#print('Proc %d: Reporting original number of Torch threads as %d.'%(proc_id(), torch.get_num_threads()), flush=True)
if torch.get_num_threads()==1:
return
fair_num_threads = max(int(torch.get_num_threads() / num_procs()), 1)
torch.set_num_threads(fair_num_threads)
#print('Proc %d: Reporting new number of Torch threads as %d.'%(proc_id(), torch.get_num_threads()), flush=True)
def mpi_avg_grads(module):
""" Average contents of gradient buffers across MPI processes. """
if num_procs()==1:
return
for p in module.parameters():
p_grad_numpy = p.grad.numpy() # numpy view of tensor data
avg_p_grad = mpi_avg(p.grad)
p_grad_numpy[:] = avg_p_grad[:]
def sync_params(module):
""" Sync all parameters of module across all MPI processes. """
if num_procs()==1:
return
for p in module.parameters():
p_numpy = p.data.numpy()
broadcast(p_numpy)
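# Illustrative sketch (not in the original file): how these helpers are used in
# cpo.py. sync_params(model) is called once after construction; the update
# below then averages gradient buffers across workers before every optimizer step.
def _example_mpi_update(model, optimizer, loss):
    optimizer.zero_grad()
    loss.backward()
    mpi_avg_grads(model)   # average .grad buffers across MPI processes
    optimizer.step()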
from mpi4py import MPI
import os, subprocess, sys
import numpy as np
import torch
def mpi_fork(n, bind_to_core=False):
"""
Re-launches the current script with workers linked by MPI.
Also, terminates the original process that launched it.
Taken almost without modification from the Baselines function of the
`same name`_.
.. _`same name`: https://github.com/openai/baselines/blob/master/baselines/common/mpi_fork.py
Args:
n (int): Number of processes to split into.
bind_to_core (bool): Bind each MPI process to a core.
"""
if n<=1:
return
if os.getenv("IN_MPI") is None:
env = os.environ.copy()
env.update(
MKL_NUM_THREADS="1",
OMP_NUM_THREADS="1",
IN_MPI="1"
)
args = ["mpirun", "-np", str(n)]
if bind_to_core:
args += ["-bind-to", "core"]
args += [sys.executable] + sys.argv
subprocess.check_call(args, env=env)
sys.exit()
def msg(m, string=''):
print(('Message from %d: %s \t '%(MPI.COMM_WORLD.Get_rank(), string))+str(m))
def proc_id():
"""Get rank of calling process."""
return MPI.COMM_WORLD.Get_rank()
def allreduce(*args, **kwargs):
return MPI.COMM_WORLD.Allreduce(*args, **kwargs)
def num_procs():
"""Count active MPI processes."""
return MPI.COMM_WORLD.Get_size()
def broadcast(x, root=0):
MPI.COMM_WORLD.Bcast(x, root=root)
def mpi_op(x, op):
x, scalar = ([x], True) if np.isscalar(x) else (x, False)
x = np.asarray(x, dtype=np.float32)
buff = np.zeros_like(x, dtype=np.float32)
allreduce(x, buff, op=op)
return buff[0] if scalar else buff
def mpi_sum(x):
return mpi_op(x, MPI.SUM)
def mpi_avg(x):
"""Average a scalar or vector over MPI processes."""
return mpi_sum(x) / num_procs()
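# Illustrative sketch (not in the original file): the reduction helpers accept
# scalars or arrays, e.g. averaging a per-rank value across all MPI workers.
def _example_mpi_avg():
    local_value = float(proc_id())   # something different on each rank
    return mpi_avg(local_value)      # mean over ranks 0 .. num_procs()-1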
def mpi_statistics_scalar(x, with_min_and_max=False):
"""
Get mean/std and optional min/max of scalar x across MPI processes.
Args:
x: An array containing samples of the scalar to produce statistics
for.
with_min_and_max (bool): If true, return min and max of x in
addition to mean and std.
"""
if isinstance(x, list):
for i, xx in enumerate(x):
if torch.is_tensor(xx):
x[i] = xx.detach()
x = np.array(x, dtype=np.float32)
global_sum, global_n = mpi_sum([np.sum(x), len(x)])
mean = global_sum / global_n
global_sum_sq = mpi_sum(np.sum((x - mean)**2))
std = np.sqrt(global_sum_sq / global_n) # compute global std
if with_min_and_max:
global_min = mpi_op(np.min(x) if len(x) > 0 else np.inf, op=MPI.MIN)
global_max = mpi_op(np.max(x) if len(x) > 0 else -np.inf, op=MPI.MAX)
return mean, std, global_min, global_max
return mean, std
\ No newline at end of file
from ppo_utils.user_config import DEFAULT_DATA_DIR, FORCE_DATESTAMP, \
DEFAULT_SHORTHAND, WAIT_BEFORE_LAUNCH
from ppo_utils.logx import colorize
from ppo_utils.mpi_tools import mpi_fork, msg
from ppo_utils.serialization_utils import convert_json
import base64
from copy import deepcopy
import cloudpickle
import json
import numpy as np
import os
import os.path as osp
import psutil
import string
import subprocess
from subprocess import CalledProcessError
import sys
from textwrap import dedent
import time
from tqdm import trange
import zlib
DIV_LINE_WIDTH = 80
def setup_logger_kwargs(exp_name, seed=None, data_dir=None, datestamp=False):
"""
Sets up the output_dir for a logger and returns a dict for logger kwargs.
If no seed is given and datestamp is false,
::
output_dir = data_dir/exp_name
If a seed is given and datestamp is false,
::
output_dir = data_dir/exp_name/exp_name_s[seed]
If datestamp is true, amend to
::
output_dir = data_dir/YY-MM-DD_exp_name/YY-MM-DD_HH-MM-SS_exp_name_s[seed]
You can force datestamp=True by setting ``FORCE_DATESTAMP=True`` in
``spinup/user_config.py``.
Args:
exp_name (string): Name for experiment.
seed (int): Seed for random number generators used by experiment.
data_dir (string): Path to folder where results should be saved.
Default is the ``DEFAULT_DATA_DIR`` in ``spinup/user_config.py``.
datestamp (bool): Whether to include a date and timestamp in the
name of the save directory.
Returns:
logger_kwargs, a dict containing output_dir and exp_name.
"""
# Datestamp forcing
datestamp = datestamp or FORCE_DATESTAMP
# Make base path
ymd_time = time.strftime("%Y-%m-%d_") if datestamp else ''
relpath = ''.join([ymd_time, exp_name])
if seed is not None:
# Make a seed-specific subfolder in the experiment directory.
if datestamp:
hms_time = time.strftime("%Y-%m-%d_%H-%M-%S")
subfolder = ''.join([hms_time, '-', exp_name, '_s', str(seed)])
else:
subfolder = ''.join([exp_name, '_s', str(seed)])
relpath = osp.join(relpath, subfolder)
data_dir = data_dir or DEFAULT_DATA_DIR
logger_kwargs = dict(output_dir=osp.join(data_dir, relpath),
exp_name=exp_name)
return logger_kwargs
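# Illustrative sketch (not in the original file): with the defaults used by
# cpo.py's __main__ block (exp_name='td3_dump', seed=0) and no datestamp, the
# function above resolves to
#   {'output_dir': osp.join(DEFAULT_DATA_DIR, 'td3_dump', 'td3_dump_s0'),
#    'exp_name': 'td3_dump'}
def _example_setup_logger_kwargs():
    return setup_logger_kwargs('td3_dump', seed=0)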
def call_experiment(exp_name, thunk, seed=0, num_cpu=1, data_dir=None,
datestamp=False, **kwargs):
"""
Run a function (thunk) with hyperparameters (kwargs), plus configuration.
This wraps a few pieces of functionality which are useful when you want
to run many experiments in sequence, including logger configuration and
splitting into multiple processes for MPI.
There's also a SpinningUp-specific convenience added into executing the
thunk: if ``env_name`` is one of the kwargs passed to call_experiment, it's
assumed that the thunk accepts an argument called ``env_fn``, and that
the ``env_fn`` should make a gym environment with the given ``env_name``.
The way the experiment is actually executed is slightly complicated: the
function is serialized to a string, and then ``run_entrypoint.py`` is
executed in a subprocess call with the serialized string as an argument.
``run_entrypoint.py`` unserializes the function call and executes it.
We choose to do it this way---instead of just calling the function
directly here---to avoid leaking state between successive experiments.
Args:
exp_name (string): Name for experiment.
thunk (callable): A python function.
seed (int): Seed for random number generators.
num_cpu (int): Number of MPI processes to split into. Also accepts
'auto', which will set up as many procs as there are cpus on
the machine.
data_dir (string): Used in configuring the logger, to decide where
to store experiment results. Note: if left as None, data_dir will
default to ``DEFAULT_DATA_DIR`` from ``spinup/user_config.py``.
**kwargs: All kwargs to pass to thunk.
"""
# Determine number of CPU cores to run on
num_cpu = psutil.cpu_count(logical=False) if num_cpu=='auto' else num_cpu
# Send random seed to thunk
kwargs['seed'] = seed
# Be friendly and print out your kwargs, so we all know what's up
print(colorize('Running experiment:\n', color='cyan', bold=True))
print(exp_name + '\n')
print(colorize('with kwargs:\n', color='cyan', bold=True))
kwargs_json = convert_json(kwargs)
print(json.dumps(kwargs_json, separators=(',',':\t'), indent=4, sort_keys=True))
print('\n')
# Set up logger output directory
if 'logger_kwargs' not in kwargs:
kwargs['logger_kwargs'] = setup_logger_kwargs(exp_name, seed, data_dir, datestamp)
else:
print('Note: Call experiment is not handling logger_kwargs.\n')
def thunk_plus():
# Make 'env_fn' from 'env_name'
if 'env_name' in kwargs:
import gym
env_name = kwargs['env_name']
kwargs['env_fn'] = lambda : gym.make(env_name)
del kwargs['env_name']
# Fork into multiple processes
mpi_fork(num_cpu)
# Run thunk
thunk(**kwargs)
# Prepare to launch a script to run the experiment
pickled_thunk = cloudpickle.dumps(thunk_plus)
encoded_thunk = base64.b64encode(zlib.compress(pickled_thunk)).decode('utf-8')
entrypoint = osp.join(osp.abspath(osp.dirname(__file__)),'run_entrypoint.py')
cmd = [sys.executable if sys.executable else 'python', entrypoint, encoded_thunk]
try:
subprocess.check_call(cmd, env=os.environ)
except CalledProcessError:
err_msg = '\n'*3 + '='*DIV_LINE_WIDTH + '\n' + dedent("""
There appears to have been an error in your experiment.
Check the traceback above to see what actually went wrong. The
traceback below, included for completeness (but probably not useful
for diagnosing the error), shows the stack leading up to the
experiment launch.
""") + '='*DIV_LINE_WIDTH + '\n'*3
print(err_msg)
raise
# Tell the user about where results are, and how to check them
logger_kwargs = kwargs['logger_kwargs']
plot_cmd = 'python -m spinup.run plot '+logger_kwargs['output_dir']
plot_cmd = colorize(plot_cmd, 'green')
test_cmd = 'python -m spinup.run test_policy '+logger_kwargs['output_dir']
test_cmd = colorize(test_cmd, 'green')
output_msg = '\n'*5 + '='*DIV_LINE_WIDTH +'\n' + dedent("""\
End of experiment.
Plot results from this run with:
%s
Watch the trained agent with:
%s
"""%(plot_cmd,test_cmd)) + '='*DIV_LINE_WIDTH + '\n'*5
print(output_msg)
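# Usage sketch (hypothetical hyperparameters; the ``from spinup import ppo_pytorch``
# import is only illustrative -- any thunk accepting ``env_fn``, ``seed``, and
# ``logger_kwargs`` works):
#
#   from spinup import ppo_pytorch
#   call_experiment('ppo_test', ppo_pytorch, seed=0, num_cpu=2,
#                   env_name='CartPole-v1', steps_per_epoch=4000)
#
# Because ``env_name`` is one of the kwargs, thunk_plus replaces it with
# ``env_fn=lambda: gym.make('CartPole-v1')`` before calling the thunk.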
def all_bools(vals):
return all([isinstance(v,bool) for v in vals])
def valid_str(v):
"""
Convert a value or values to a string which could go in a filepath.
Partly based on `this gist`_.
.. _`this gist`: https://gist.github.com/seanh/93666
"""
if hasattr(v, '__name__'):
return valid_str(v.__name__)
if isinstance(v, tuple) or isinstance(v, list):
return '-'.join([valid_str(x) for x in v])
# Valid characters are '-', '_', and alphanumeric. Replace invalid chars
# with '-'.
str_v = str(v).lower()
valid_chars = "-_%s%s" % (string.ascii_letters, string.digits)
str_v = ''.join(c if c in valid_chars else '-' for c in str_v)
return str_v
class ExperimentGrid:
"""
Tool for running many experiments given hyperparameter ranges.
"""
def __init__(self, name=''):
self.keys = []
self.vals = []
self.shs = []
self.in_names = []
self.name(name)
def name(self, _name):
assert isinstance(_name, str), "Name has to be a string."
self._name = _name
def print(self):
"""Print a helpful report about the experiment grid."""
print('='*DIV_LINE_WIDTH)
# Prepare announcement at top of printing. If the ExperimentGrid has a
# short name, write this as one line. If the name is long, break the
# announcement over two lines.
base_msg = 'ExperimentGrid %s runs over parameters:\n'
name_insert = '['+self._name+']'
if len(base_msg%name_insert) <= 80:
msg = base_msg%name_insert
else:
msg = base_msg%(name_insert+'\n')
print(colorize(msg, color='green', bold=True))
# List off parameters, shorthands, and possible values.
for k, v, sh in zip(self.keys, self.vals, self.shs):
color_k = colorize(k.ljust(40), color='cyan', bold=True)
print('', color_k, '['+sh+']' if sh is not None else '', '\n')
for i, val in enumerate(v):
print('\t' + str(convert_json(val)))
print()
# Count up the number of variants. The number counting seeds
# is the total number of experiments that will run; the number not
# counting seeds is the total number of otherwise-unique configs
# being investigated.
nvars_total = int(np.prod([len(v) for v in self.vals]))
if 'seed' in self.keys:
num_seeds = len(self.vals[self.keys.index('seed')])
nvars_seedless = int(nvars_total / num_seeds)
else:
nvars_seedless = nvars_total
print(' Variants, counting seeds: '.ljust(40), nvars_total)
print(' Variants, not counting seeds: '.ljust(40), nvars_seedless)
print()
print('='*DIV_LINE_WIDTH)
def _default_shorthand(self, key):
# Create a default shorthand for the key, built from the first
# three letters of each colon-separated part.
# But if the first three letters contains something which isn't
# alphanumeric, shear that off.
valid_chars = "%s%s" % (string.ascii_letters, string.digits)
def shear(x):
return ''.join(z for z in x[:3] if z in valid_chars)
sh = '-'.join([shear(x) for x in key.split(':')])
return sh
def add(self, key, vals, shorthand=None, in_name=False):
"""
Add a parameter (key) to the grid config, with potential values (vals).
By default, if a shorthand isn't given, one is automatically generated
from the key using the first three letters of each colon-separated
term. To disable this behavior, change ``DEFAULT_SHORTHAND`` in the
``spinup/user_config.py`` file to ``False``.
Args:
key (string): Name of parameter.
vals (value or list of values): Allowed values of parameter.
shorthand (string): Optional, shortened name of parameter. For
example, maybe the parameter ``steps_per_epoch`` is shortened
to ``steps``.
in_name (bool): When constructing variant names, force the
inclusion of this parameter into the name.
"""
assert isinstance(key, str), "Key must be a string."
assert shorthand is None or isinstance(shorthand, str), \
"Shorthand must be a string."
if not isinstance(vals, list):
vals = [vals]
if DEFAULT_SHORTHAND and shorthand is None:
shorthand = self._default_shorthand(key)
self.keys.append(key)
self.vals.append(vals)
self.shs.append(shorthand)
self.in_names.append(in_name)
def variant_name(self, variant):
"""
Given a variant (dict of valid param/value pairs), make an exp_name.
A variant's name is constructed as the grid name (if you've given it
one), plus param names (or shorthands if available) and values
separated by underscores.
Note: if ``seed`` is a parameter, it is not included in the name.
"""
def get_val(v, k):
# Utility method for getting the correct value out of a variant
# given as a nested dict. Assumes that a parameter name, k,
# describes a path into the nested dict, such that k='a:b:c'
# corresponds to value=variant['a']['b']['c']. Uses recursion
# to get this.
if k in v:
return v[k]
else:
splits = k.split(':')
k0, k1 = splits[0], ':'.join(splits[1:])
return get_val(v[k0], k1)
# Start the name off with the name of the variant generator.
var_name = self._name
# Build the rest of the name by looping through all parameters,
# and deciding which ones need to go in there.
for k, v, sh, inn in zip(self.keys, self.vals, self.shs, self.in_names):
# Include a parameter in a name if either 1) it can take multiple
# values, or 2) the user specified that it must appear in the name.
# Except, however, when the parameter is 'seed'. Seed is handled
# differently so that runs of the same experiment, with different
# seeds, will be grouped by experiment name.
if (len(v)>1 or inn) and not(k=='seed'):
# Use the shorthand if available, otherwise the full name.
param_name = sh if sh is not None else k
param_name = valid_str(param_name)
# Get variant value for parameter k
variant_val = get_val(variant, k)
# Append to name
if all_bools(v):
# If this is a param which only takes boolean values,
# only include in the name if it's True for this variant.
var_name += ('_' + param_name) if variant_val else ''
else:
var_name += '_' + param_name + valid_str(variant_val)
return var_name.lstrip('_')
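    # Illustrative example: for a grid named 'sar' where 'pi_lr' (shorthand 'pi')
    # ranges over [1e-3, 3e-4] and 'seed' over [0, 10], the variant
    # {'pi_lr': 3e-4, 'seed': 10} is named 'sar_pi0-0003' -- the seed is left out
    # so runs that differ only by seed share an experiment name.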
def _variants(self, keys, vals):
"""
Recursively builds list of valid variants.
"""
if len(keys)==1:
pre_variants = [dict()]
else:
pre_variants = self._variants(keys[1:], vals[1:])
variants = []
for val in vals[0]:
for pre_v in pre_variants:
v = {}
v[keys[0]] = val
v.update(pre_v)
variants.append(v)
return variants
def variants(self):
"""
Makes a list of dicts, where each dict is a valid config in the grid.
There is special handling for variant parameters whose names take
the form
``'full:param:name'``.
The colons are taken to indicate that these parameters should
have a nested dict structure. eg, if there are two params,
==================== ===
Key Val
==================== ===
``'base:param:a'`` 1
``'base:param:b'`` 2
==================== ===
the variant dict will have the structure
.. parsed-literal::
variant = {
base: {
param : {
a : 1,
b : 2
}
}
}
"""
flat_variants = self._variants(self.keys, self.vals)
def unflatten_var(var):
"""
Build the full nested dict version of var, based on key names.
"""
new_var = dict()
unflatten_set = set()
for k,v in var.items():
if ':' in k:
splits = k.split(':')
k0 = splits[0]
assert k0 not in new_var or isinstance(new_var[k0], dict), \
"You can't assign multiple values to the same key."
if not(k0 in new_var):
new_var[k0] = dict()
sub_k = ':'.join(splits[1:])
new_var[k0][sub_k] = v
unflatten_set.add(k0)
else:
assert not(k in new_var), \
"You can't assign multiple values to the same key."
new_var[k] = v
# Make sure to fill out the nested dicts.
for k in unflatten_set:
new_var[k] = unflatten_var(new_var[k])
return new_var
new_variants = [unflatten_var(var) for var in flat_variants]
return new_variants
def run(self, thunk, num_cpu=1, data_dir=None, datestamp=False):
"""
Run each variant in the grid with function 'thunk'.
Note: 'thunk' must be either a callable function, or a string. If it is
a string, it must be the name of a parameter whose values are all
callable functions.
Uses ``call_experiment`` to actually launch each experiment, and gives
each variant a name using ``self.variant_name()``.
Maintenance note: the args for ExperimentGrid.run should track closely
to the args for call_experiment. However, ``seed`` is omitted because
we presume the user may add it as a parameter in the grid.
"""
# Print info about self.
self.print()
# Make the list of all variants.
variants = self.variants()
# Print variant names for the user.
var_names = set([self.variant_name(var) for var in variants])
var_names = sorted(list(var_names))
line = '='*DIV_LINE_WIDTH
preparing = colorize('Preparing to run the following experiments...',
color='green', bold=True)
joined_var_names = '\n'.join(var_names)
announcement = f"\n{preparing}\n\n{joined_var_names}\n\n{line}"
print(announcement)
if WAIT_BEFORE_LAUNCH > 0:
delay_msg = colorize(dedent("""
Launch delayed to give you a few seconds to review your experiments.
To customize or disable this behavior, change WAIT_BEFORE_LAUNCH in
spinup/user_config.py.
"""), color='cyan', bold=True)+line
print(delay_msg)
wait, steps = WAIT_BEFORE_LAUNCH, 100
prog_bar = trange(steps, desc='Launching in...',
leave=False, ncols=DIV_LINE_WIDTH,
mininterval=0.25,
bar_format='{desc}: {bar}| {remaining} {elapsed}')
for _ in prog_bar:
time.sleep(wait/steps)
# Run the variants.
for var in variants:
exp_name = self.variant_name(var)
# Figure out what the thunk is.
if isinstance(thunk, str):
# Assume one of the variant parameters has the same
# name as the string you passed for thunk, and that
# variant[thunk] is a valid callable function.
thunk_ = var[thunk]
del var[thunk]
else:
# Assume thunk is given as a function.
thunk_ = thunk
call_experiment(exp_name, thunk_, num_cpu=num_cpu,
data_dir=data_dir, datestamp=datestamp, **var)
def test_eg():
eg = ExperimentGrid()
eg.add('test:a', [1,2,3], 'ta', True)
eg.add('test:b', [1,2,3])
eg.add('some', [4,5])
eg.add('why', [True,False])
eg.add('huh', 5)
eg.add('no', 6, in_name=True)
return eg.variants()
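# For reference (with DEFAULT_SHORTHAND left at its default of True): this grid
# yields 3*3*2*2*1*1 = 36 variants. Variant names use 'ta', the auto-shorthand
# for 'test:b', 'som', and 'no'; 'why' appears only in the variants where it is
# True, and 'huh' never appears because it takes a single value and in_name is False.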
\ No newline at end of file
import json
def convert_json(obj):
""" Convert obj to a version which can be serialized with JSON. """
if is_json_serializable(obj):
return obj
else:
if isinstance(obj, dict):
return {convert_json(k): convert_json(v)
for k,v in obj.items()}
elif isinstance(obj, tuple):
            return tuple(convert_json(x) for x in obj)
elif isinstance(obj, list):
return [convert_json(x) for x in obj]
elif hasattr(obj,'__name__') and not('lambda' in obj.__name__):
return convert_json(obj.__name__)
elif hasattr(obj,'__dict__') and obj.__dict__:
obj_dict = {convert_json(k): convert_json(v)
for k,v in obj.__dict__.items()}
return {str(obj): obj_dict}
return str(obj)
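# Illustrative example: convert_json({'act_fn': torch.nn.Tanh, 'hid': (64, 64)})
# collapses the class to its __name__ and recurses into the tuple, giving
# {'act_fn': 'Tanh', 'hid': (64, 64)} -- something json.dumps can serialize.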
def is_json_serializable(v):
try:
json.dumps(v)
return True
    except Exception:
return False
\ No newline at end of file
import os
import os.path as osp
# Default neural network backend for each algo
# (Must be either 'tf1' or 'pytorch')
DEFAULT_BACKEND = {
'vpg': 'pytorch',
'trpo': 'tf1',
'ppo': 'pytorch',
'ddpg': 'pytorch',
'td3': 'pytorch',
'sac': 'pytorch'
}
# Where experiment outputs are saved by default:
DEFAULT_DATA_DIR = osp.join(osp.abspath(osp.dirname(osp.dirname(__file__))),'data')
# Whether to automatically insert a date and time stamp into the names of
# save directories:
FORCE_DATESTAMP = False
# Whether GridSearch provides automatically-generated default shorthands:
DEFAULT_SHORTHAND = True
# Tells the GridSearch how many seconds to pause for before launching
# experiments.
WAIT_BEFORE_LAUNCH = 5
\ No newline at end of file
import torch
import numpy as np
from . import core
from .ppo_utils.logx import EpochLogger
from .ppo_utils.mpi_pytorch import setup_pytorch_for_mpi, sync_params, mpi_avg_grads
from .ppo_utils.mpi_tools import mpi_fork, mpi_avg, proc_id, mpi_statistics_scalar, num_procs
class TD3Buffer(object):
def __init__(self, state_dim, action_dim, max_size=int(1e6)):
self.max_size = max_size
self.ptr = 0
self.size = 0
# print(state_dim,action_dim)
self.state = np.zeros((max_size, state_dim))
self.action = np.zeros((max_size, action_dim))
self.next_state = np.zeros((max_size, state_dim))
self.reward = np.zeros((max_size, 1))
self.cost = np.zeros((max_size, 1))
self.not_done = np.zeros((max_size, 1))
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def add(self, state, action, next_state, reward, cost, done):
self.state[self.ptr] = state
self.action[self.ptr] = action
self.next_state[self.ptr] = next_state
self.reward[self.ptr] = reward
self.cost[self.ptr] = cost
self.not_done[self.ptr] = 1. - done
self.ptr = (self.ptr + 1) % self.max_size
self.size = min(self.size + 1, self.max_size)
def sample(self, batch_size):
ind = np.random.randint(0, self.size, size=batch_size)
return (
torch.FloatTensor(self.state[ind]).to(self.device),
torch.FloatTensor(self.action[ind]).to(self.device),
torch.FloatTensor(self.next_state[ind]).to(self.device),
torch.FloatTensor(self.reward[ind]).to(self.device),
torch.FloatTensor(self.cost[ind]).to(self.device),
torch.FloatTensor(self.not_done[ind]).to(self.device)
)
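# Usage sketch (hypothetical dimensions and placeholder variables; standard
# replay-buffer pattern):
#   buf = TD3Buffer(state_dim=2, action_dim=2)
#   buf.add(s, a, s2, r, c, done)                            # store one transition
#   state, action, next_state, reward, cost, not_done = buf.sample(256)
#   # sampled tensors already live on buf.device (GPU if available)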
class PPOBuffer:
"""
A buffer for storing trajectories experienced by a PPO agent interacting
with the environment, and using Generalized Advantage Estimation (GAE-Lambda)
for calculating the advantages of state-action pairs.
"""
def __init__(self, obs_dim, act_dim, size, gamma=0.99, lam=0.95):
self.obs_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
self.act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=np.float32)
self.old_act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=np.float32)
self.adv_buf = np.zeros(size, dtype=np.float32)
self.rew_buf = np.zeros(size, dtype=np.float32)
self.ret_buf = np.zeros(size, dtype=np.float32)
self.val_buf = np.zeros(size, dtype=np.float32)
self.cost_adv_buf = np.zeros(size, dtype=np.float32)
self.cost_buf = np.zeros(size, dtype=np.float32)
self.cost_ret_buf = np.zeros(size, dtype=np.float32)
self.cost_val_buf = np.zeros(size, dtype=np.float32)
self.logp_buf = np.zeros(size, dtype=np.float32)
self.gamma, self.lam = gamma, lam
self.ptr, self.path_start_idx, self.max_size = 0, 0, size
def store(self, obs, act, rew, cost, val, cost_val, logp, old_act):
"""
Append one timestep of agent-environment interaction to the buffer.
"""
assert self.ptr < self.max_size # buffer has to have room so you can store
self.obs_buf[self.ptr] = obs
self.act_buf[self.ptr] = act
self.old_act_buf[self.ptr] = old_act
self.rew_buf[self.ptr] = rew
self.val_buf[self.ptr] = val
self.cost_buf[self.ptr] = cost
self.cost_val_buf[self.ptr] = cost_val
self.logp_buf[self.ptr] = logp
self.ptr += 1
def finish_path(self, last_val=0, last_cost_val=0):
"""
Call this at the end of a trajectory, or when one gets cut off
by an epoch ending. This looks back in the buffer to where the
trajectory started, and uses rewards and value estimates from
the whole trajectory to compute advantage estimates with GAE-Lambda,
as well as compute the rewards-to-go for each state, to use as
the targets for the value function.
The "last_val" argument should be 0 if the trajectory ended
because the agent reached a terminal state (died), and otherwise
should be V(s_T), the value function estimated for the last state.
This allows us to bootstrap the reward-to-go calculation to account
for timesteps beyond the arbitrary episode horizon (or epoch cutoff).
"""
path_slice = slice(self.path_start_idx, self.ptr)
rews = np.append(self.rew_buf[path_slice], last_val)
vals = np.append(self.val_buf[path_slice], last_val)
# the next two lines implement GAE-Lambda advantage calculation
deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
self.adv_buf[path_slice] = core.discount_cumsum(deltas, self.gamma * self.lam)
# the next line computes rewards-to-go, to be targets for the value function
self.ret_buf[path_slice] = core.discount_cumsum(rews, self.gamma)[:-1]
costs = np.append(self.cost_buf[path_slice], last_cost_val)
cost_vals = np.append(self.cost_val_buf[path_slice], last_cost_val)
# the next two lines implement GAE-Lambda advantage calculation
cost_deltas = costs[:-1] + self.gamma * cost_vals[1:] - cost_vals[:-1]
self.cost_adv_buf[path_slice] = core.discount_cumsum(cost_deltas, self.gamma * self.lam)
# the next line computes rewards-to-go, to be targets for the value function
self.cost_ret_buf[path_slice] = core.discount_cumsum(costs, self.gamma)[:-1]
self.path_start_idx = self.ptr
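    # In symbols (matching the code above): delta_t = r_t + gamma*V(s_{t+1}) - V(s_t),
    # A_t = sum_{l>=0} (gamma*lam)^l * delta_{t+l}, and the value target is the
    # discounted reward-to-go R_t = sum_{l>=0} gamma^l * r_{t+l}, bootstrapped with
    # last_val at epoch cutoffs. The cost stream gets identical treatment using its
    # own value estimates and last_cost_val.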
def get(self):
"""
Call this at the end of an epoch to get all of the data from
the buffer, with advantages appropriately normalized (shifted to have
mean zero and std one). Also, resets some pointers in the buffer.
"""
assert self.ptr == self.max_size # buffer has to be full before you can get
self.ptr, self.path_start_idx = 0, 0
# the next two lines implement the advantage normalization trick
adv_mean, adv_std = mpi_statistics_scalar(self.adv_buf)
self.adv_buf = (self.adv_buf - adv_mean) / adv_std
cost_adv_mean, cost_adv_std = mpi_statistics_scalar(self.cost_adv_buf)
self.cost_adv_buf = (self.cost_adv_buf - cost_adv_mean) / cost_adv_std
data = dict(obs=self.obs_buf, act=self.act_buf,old_act=self.old_act_buf, ret=self.ret_buf,
adv=self.adv_buf, cost_ret = self.cost_ret_buf, cost_adv=self.cost_adv_buf, logp=self.logp_buf)
return {k: torch.as_tensor(v, dtype=torch.float32) for k,v in data.items()}
import gym
from gym import spaces, Env
import pygame
import numpy as np
MAX_STEPS_PER_EPISODE = 100
MAX_FORWARD_REWARD_THRESHOLD = 0.5
#reward scaling
SEARCH_SCALE = 1.0
FORWARD_SCALE = 0.1
class GridWorldSAR(gym.Env):
metadata = {"render_modes": ["human", "rgb_array"], "render_fps": 4}
@@ -13,9 +20,14 @@ class GridWorldSAR(gym.Env):
self.window_size = 512 # The size of the PyGame window
#Values the agent will need
        self.pos_x = 0.0 #Current position, update w/ velocity (might need to make this the position in 1200x1200 path coordinates?)
self.pos_y = 0.0
self.cell = [0,0]
self.visited = np.zeros((map_size,map_size),dtype=bool)
self.heatmap = np.load('./deeprl_data/lpm/lpm4_prob.npy')
self.ringmap = np.load('./deeprl_data/ring/ringlowres_prob.npy')
#self.heatmap = np.load('./deeprl_data/test/test_prob.npy')
#self.heatmap = np.load('./deeprl_data/ring/ringlowres_prop.npy')
# Observations are dictionaries with the agent's and the target's location.
# Each location is encoded as an element of {0, ..., `size`}^2, i.e. MultiDiscrete([size, size]).
@@ -23,14 +35,16 @@
#Pos = agent's current coordinates
#risk = search risk. Agent can see in a square area around themselves
#visited = which spaces have been surveyed. Will be needed to get paths for multiple drones
#So this is kinda model based RL..?
self.observation_space = spaces.Dict(
{
"pos": spaces.Box(0, size -1, shape=(2,),dtype=int),
"risk": spaces.Box(0, 1, shape=(map_size*map_size,), dtype=float),
"visited": spaces.Box(0, 1, shape=(map_size*map_size,), dtype=bool),
"pos": spaces.Box(0, self.size -1, shape=(2,),dtype=float),
"risk": spaces.Box(0, 1, shape=(self.size*self.size,), dtype=float),
"visited": spaces.Box(0, 1, shape=(self.size*self.size,), dtype=bool),
}
)
#Might need to take the risk/visited obs out. Let's see if it runs
self.observation_space = spaces.Box(low=0, high=self.size-1, shape=(2,),dtype=int)
#Continuous velocity. Keep it at 1 so agent can't skip over cells to end episode early
self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(2,), dtype=float)
@@ -50,6 +64,146 @@
self.window = None
self.clock = None
    def reset(self, seed=None, options=None):
        if seed is not None:
            np.random.seed(seed)
        #Move agent back to start
        self.pos_x = 0.0
        self.pos_y = 23.0
        self.cell = np.array([0, 23])
        self.current_step = 0
        self.done = False
        self.visited = np.zeros((self.size, self.size), dtype=bool)
        #Return only the observation; step() hands back info separately.
        return self._get_obs()
    #let's see if we really need to observe risk and visited? For now the Box
    #observation space above is the one in use, so observe just the (x, y) position.
    def _get_obs(self):
        return np.array([self.pos_x, self.pos_y])
        #return {'pos': np.array([np.int32(self.pos_x), np.int32(self.pos_y)])}
        #return {'pos': self.cell, "risk": self.heatmap, "visited": self.visited}
def _get_info(self):
#print("Location = %.3f %.3f" %(self.pos_x,self.pos_y))
#Determine cost for CPO
#hacky but make sure indicies are okay
if(self.cell[0] < 0):
self.cell[0] = 0
if(self.cell[0] >= self.size):
self.cell[0] = self.size-1
if(self.cell[1] < 0):
self.cell[1] = 0
if(self.cell[1] >= self.size):
self.cell[1] = self.size-1
map_diff = abs(self.heatmap[self.cell[0],self.cell[1]] - self.ringmap[self.cell[0],self.cell[1]])
return {'loc': [self.pos_x,self.pos_y], 'cost': map_diff}
    def step(self, action):
        #Move agent to new position
        self.pos_x = self.pos_x + action[0]
        self.pos_y = self.pos_y + action[1]
        self.cell = np.array([np.int32(self.pos_x), np.int32(self.pos_y)])
        terminated = False
        truncated = False
        reward_search = 0.0
        #Check if agent has reached the edge of the map
        if(self.pos_x < 0 or self.pos_x >= self.size or self.pos_y < 0 or self.pos_y >= self.size):
            self.done = True
            terminated = True
            #Add reward for reaching end??
        #Calculate search reward and update visit map (only unvisited cells pay out)
        elif(not self.visited[self.cell[0], self.cell[1]]):
            reward_search = self.heatmap[self.cell[0], self.cell[1]]
            self.visited[self.cell[0], self.cell[1]] = True
        #Small reward for moving in the +x direction, capped so it can't dominate
        reward_forward = np.min([action[0], MAX_FORWARD_REWARD_THRESHOLD])
        #End episode after the step limit is reached
        if(self.current_step >= MAX_STEPS_PER_EPISODE):
            self.done = True
            truncated = True
        #Update step counter
        self.current_step += 1
        reward = SEARCH_SCALE*reward_search + FORWARD_SCALE*reward_forward
        observation = self._get_obs()
        info = self._get_info()
        #Single done flag keeps the 4-tuple return the rest of the code expects
        return observation, reward, terminated or truncated, info
def render(self):
if self.render_mode == "rgb_array":
return self._render_frame()
def _render_frame(self):
if self.window is None and self.render_mode == "human":
pygame.init()
pygame.display.init()
self.window = pygame.display.set_mode((self.window_size, self.window_size))
if self.clock is None and self.render_mode == "human":
self.clock = pygame.time.Clock()
canvas = pygame.Surface((self.window_size, self.window_size))
canvas.fill((255, 255, 255))
pix_square_size = (
self.window_size / self.size
) # The size of a single grid square in pixels
# Now we draw the agent
pygame.draw.circle(
canvas,
(0, 0, 255),
(self.cell + 0.5) * pix_square_size,
pix_square_size / 3,
)
# Finally, add some gridlines
for x in range(self.size + 1):
pygame.draw.line(
canvas,
0,
(0, pix_square_size * x),
(self.window_size, pix_square_size * x),
width=3,
)
pygame.draw.line(
canvas,
0,
(pix_square_size * x, 0),
(pix_square_size * x, self.window_size),
width=3,
)
if self.render_mode == "human":
# The following line copies our drawings from `canvas` to the visible window
self.window.blit(canvas, canvas.get_rect())
pygame.event.pump()
pygame.display.update()
# We need to ensure that human-rendering occurs at the predefined framerate.
# The following line will automatically add a delay to keep the framerate stable.
self.clock.tick(self.metadata["render_fps"])
else: # rgb_array
return np.transpose(
np.array(pygame.surfarray.pixels3d(canvas)), axes=(1, 0, 2)
)
def close(self):
if self.window is not None:
pygame.display.quit()
pygame.quit()
import math
import numpy as np
import torch
def normal_entropy(std):
var = std.pow(2)
entropy = 0.5 + 0.5 * torch.log(2 * var * math.pi)
return entropy.sum(1, keepdim=True)
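# Equivalent closed form: H = 0.5 * log(2*pi*e*sigma^2) per action dimension,
# summed over the action dimensions by the call above.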
def normal_log_density(x, mean, log_std, std):
var = std.pow(2)
log_density = -(x - mean).pow(2) / (
2 * var) - 0.5 * math.log(2 * math.pi) - log_std
return log_density.sum(1, keepdim=True)
def get_flat_params_from(model):
params = []
for param in model.parameters():
params.append(param.data.view(-1))
flat_params = torch.cat(params)
return flat_params
def set_flat_params_to(model, flat_params):
prev_ind = 0
for param in model.parameters():
flat_size = int(np.prod(list(param.size())))
param.data.copy_(
flat_params[prev_ind:prev_ind + flat_size].view(param.size()))
prev_ind += flat_size
def get_flat_grad_from(net, grad_grad=False):
grads = []
for param in net.parameters():
if grad_grad:
grads.append(param.grad.grad.view(-1))
else:
grads.append(param.grad.view(-1))
flat_grad = torch.cat(grads)
return flat_grad
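# Usage sketch (illustrative): flat parameter/gradient vectors are what
# trust-region-style updates operate on, e.g.
#   theta = get_flat_params_from(policy)
#   set_flat_params_to(policy, theta + step_size * search_direction)
# where policy is any nn.Module and search_direction has the same length as theta.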
\ No newline at end of file
@@ -22,6 +22,11 @@ import datetime as dt
import time, os
import pdb
from sargym import sar_gym, cpo, core
import cProfile, pstats, io
#Relative filepaths (Assumes this and ags_grabber projects are in same parent directory)
kentland_heatmap = './LP model/analysis/outputs/ic_1_con_hiker_t12_kentland.csv'
#kentland_heatmap = './LP model/analysis/outputs/ic_1_con_hiker_t4.csv' #I like 4?
@@ -159,26 +164,188 @@ def create_data():
del rgp, planner, mc
def testing():
#Run this function to create data
#Load robot paths from file
ring_prob = np.load('./deeprl_data/ring_prob.npy')
robot0_waypoints = np.load('./deeprl_data/robot0_waypoints.npy')
robot1_waypoints = np.load('./deeprl_data/robot1_waypoints.npy')
robot2_waypoints = np.load('./deeprl_data/robot2_waypoints.npy')
#create_data()
#Make Gym Environment
env = sar_gym.GridWorldSAR(render_mode='human')
    obs = env.reset()
    path = []
rews = []
done = False
while not done:
obs, reward, term, info = env.step([1,1])
done = term
path.append(info['loc'])
rews.append(reward)
fig, (ax1, ax2) = plt.subplots(1, 2)
# Scatter plot of path and track in the first subplot
ax1.scatter(x=np.array(path)[:,0], y=np.array(path)[:,1], label='Path')
ax1.set_xlim([0,48])
ax1.set_ylim([0,48])
ax1.set_xlabel('X')
ax1.set_ylabel('Y')
ax1.set_title('Path and Track')
ax1.legend()
# Plot of rewards in the second subplot
ax2.plot(rews)
ax2.set_xlabel('Step')
ax2.set_ylabel('Reward')
ax2.set_title('Rewards')
plt.tight_layout()
plt.show()
return
COST_THRESHOLD = 50 #sanity check to make sure things are working
MAX_STEPS_PER_EPISODE = 100
def train():
#Run this function to create data
#create_data()
    #Training CPO (wrap the call in a profiler so hot spots still print even if
    #the run is cut short with Ctrl-C)
    profile = True
    pr = cProfile.Profile()
    if profile:
        pr.enable()
    try:
        J = cpo.cpo(
            lambda: sar_gym.GridWorldSAR(render_mode=None),
            actor_critic=core.MLPActorCriticTD3trust,
            max_ep_len=MAX_STEPS_PER_EPISODE,
            cost_lim=COST_THRESHOLD,
            epochs=200,
            steps_per_epoch=8000
        )
        #175
    except KeyboardInterrupt:
        print("\nKeyboard interrupt received. Printing stats...")
        J = None
    finally:
        if profile:
            pr.disable()
            s = io.StringIO()
            ps = pstats.Stats(pr, stream=s).sort_stats("tottime")
            ps.print_stats(32)
            print(s.getvalue())
    if J is None:
        return
#Save policy
torch.save(J.pi, './deeprl_data/trained_model4.pt')
#Testing
#Make Gym Environment
env = sar_gym.GridWorldSAR(render_mode=None)
#Test one episode
obs = env.reset()
path = []
rews = []
done = False
while not done:
with torch.no_grad():
tt = torch.as_tensor(obs,dtype=torch.float32)
actions = J.pi(tt)
obs, reward, term, info = env.step(actions.numpy())
done = term
path.append(info['loc'])
rews.append(reward)
fig, (ax1, ax2) = plt.subplots(1, 2)
# Scatter plot of path and track in the first subplot
ax1.scatter(x=np.array(path)[:,0], y=np.array(path)[:,1], label='Path')
ax1.set_xlim([0,48])
ax1.set_ylim([0,48])
ax1.set_xlabel('X')
ax1.set_ylabel('Y')
ax1.set_title('Path and Track')
ax1.legend()
# Plot of rewards in the second subplot
ax2.plot(rews)
ax2.set_xlabel('Step')
ax2.set_ylabel('Reward')
ax2.set_title('Rewards')
plt.tight_layout()
plt.show()
return
def test_policy():
#Testing
policy = torch.load('./deeprl_data/trained_model2.pt')
heatmap = np.load('./deeprl_data/lpm/lpm4_prob.npy')
ringmap = np.load('./deeprl_data/ring/ringlowres_prob.npy')
#Make Gym Environment
env = sar_gym.GridWorldSAR(render_mode=None)
#Test one episode
obs = env.reset()
path = []
rews = []
done = False
while not done:
with torch.no_grad():
tt = torch.as_tensor(obs,dtype=torch.float32)
actions = policy(tt)
obs, reward, term, info = env.step(actions.numpy())
done = term
path.append(info['loc'])
rews.append(reward)
fig, (ax1, ax2) = plt.subplots(1, 2)
# Scatter plot of path and track in the first subplot
ax1.scatter(x=np.array(path)[:,0], y=np.array(path)[:,1], label='Path')
ax1.imshow(heatmap)
ax1.set_xlim([0,48])
ax1.set_ylim([0,48])
ax1.set_xlabel('X')
ax1.set_ylabel('Y')
ax1.set_title('Lost Person Model')
ax1.legend()
# Scatter plot of path and track in the first subplot
ax2.scatter(x=np.array(path)[:,0], y=np.array(path)[:,1], label='Path')
ax2.imshow(ringmap)
ax2.set_xlim([0,48])
ax2.set_ylim([0,48])
ax2.set_xlabel('X')
ax2.set_ylabel('Y')
ax2.set_title('Ring Model')
ax2.legend()
# Plot of rewards in the second subplot
#ax2.plot(rews)
#ax2.set_xlabel('Step')
#ax2.set_ylabel('Reward')
    #ax2.set_title('Rewards')
plt.tight_layout()
plt.suptitle("LPM Reward Function with Ring Model Constraint")
plt.show()
def main():
#test_policy()
train()
return