Source code for rlzoo.algorithms.dqn.default

import tensorflow as tf
from gym.spaces import Discrete

from rlzoo.common.utils import set_seed
from rlzoo.common.value_networks import *

""" 
full list of algorithm parameters (alg_params)
-----------------------------------------------
-----------------------------------------------

full list of learning parameters (learn_params)
-----------------------------------------------
double_q (bool): if True double DQN will be used
dueling (bool): if True dueling value estimation will be used
exploration_rate (float): fraction of entire training period over
    which the exploration rate is annealed
exploration_final_eps (float): final value of random action probability
batch_size (int): size of a batched sampled from replay buffer for training
train_freq (int): update the model every `train_freq` steps
learning_starts (int): how many steps of the model to collect transitions
                        for before learning starts
target_network_update_freq (int): update the target network every
                                    `target_network_update_freq` steps
buffer_size (int): size of the replay buffer
prioritized_replay (bool): if True prioritized replay buffer will be used.
prioritized_alpha (float): alpha parameter for prioritized replay
prioritized_beta0 (float): beta parameter for prioritized replay
mode (str): train or test
-----------------------------------------------
"""


def atari(env, default_seed=False, **kwargs):
    if default_seed:
        seed = 2
        set_seed(seed, env)  # reproducible

    assert isinstance(env.action_space, Discrete)

    alg_params = dict(
        dueling=True,
        double_q=True,
        buffer_size=1000,
        prioritized_replay=True,
        prioritized_alpha=0.6,
        prioritized_beta0=0.4,
    )
    alg_params.update(kwargs)
    if alg_params.get('net_list') is None:
        alg_params['net_list'] = [QNetwork(env.observation_space, env.action_space, [64],
                                           state_only=True, dueling=alg_params['dueling'])]
    if alg_params.get('optimizers_list') is None:
        alg_params['optimizers_list'] = (tf.optimizers.Adam(1e-4, epsilon=1e-5, clipnorm=10),)

    learn_params = dict(
        train_episodes=int(1e5),
        test_episodes=10,
        max_steps=200,
        save_interval=1e4,
        batch_size=32,
        exploration_rate=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
    )
    return alg_params, learn_params
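
# A custom Q network or optimizer can also be injected through the same kwargs:
# net_list and optimizers_list are only filled with the defaults when the caller does
# not supply them. Sketch (commented out), assuming QNetwork's third positional
# argument is the list of hidden-layer sizes, as in the defaults above:
#
# custom_net = QNetwork(env.observation_space, env.action_space, [128, 128],
#                       state_only=True, dueling=True)
# alg_params, learn_params = atari(
#     env,
#     net_list=[custom_net],
#     optimizers_list=(tf.optimizers.Adam(2.5e-4, epsilon=1e-5, clipnorm=10),),
# )
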
def classic_control(env, default_seed=False, **kwargs):
    if default_seed:
        seed = 2
        set_seed(seed, env)  # reproducible

    assert isinstance(env.action_space, Discrete)

    alg_params = dict(
        dueling=True,
        double_q=True,
        buffer_size=1000,
        prioritized_replay=False,
        prioritized_alpha=0.6,
        prioritized_beta0=0.4,
    )
    alg_params.update(kwargs)
    if alg_params.get('net_list') is None:
        alg_params['net_list'] = [QNetwork(env.observation_space, env.action_space, [64],
                                           activation=tf.nn.tanh, state_only=True,
                                           dueling=alg_params['dueling'])]
    if alg_params.get('optimizers_list') is None:
        alg_params['optimizers_list'] = (tf.optimizers.Adam(5e-3, epsilon=1e-5),)

    learn_params = dict(
        train_episodes=int(1e3),
        test_episodes=10,
        max_steps=200,
        save_interval=1e3,
        batch_size=32,
        exploration_rate=0.2,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=200,
        target_network_update_freq=50,
        gamma=0.99,
    )
    return alg_params, learn_params
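
# Usage sketch (commented out, not executed on import): how these default-parameter
# helpers are typically consumed. `build_env` and the `DQN` class are assumed to be
# the usual RLzoo entry points, and the exact keyword arguments accepted by
# DQN.learn() are an assumption here; check rlzoo.algorithms.dqn before relying on
# this verbatim.
#
# from rlzoo.common.env_wrappers import build_env
# from rlzoo.algorithms import DQN
#
# env = build_env('CartPole-v0', 'classic_control')
# alg_params, learn_params = classic_control(env, default_seed=True)
# alg = DQN(**alg_params)                           # networks and optimizers come from alg_params
# alg.learn(env=env, mode='train', **learn_params)  # training schedule comes from learn_params
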
# class CNNQNet(tl.models.Model):
#     def __init__(self, in_dim, act_dim, dueling):
#         super().__init__()
#         self._state_shape = in_dim
#         self._action_shape = act_dim,
#         self.dueling = dueling
#         with tf.name_scope('DQN'):
#             with tf.name_scope('CNN'):
#                 self.cnn = basic_nets.CNNModel(in_dim)
#             mlp_in_shape = self.cnn.outputs[0].shape[0]
#             with tf.name_scope('QValue'):
#                 hidden_dim = 256
#                 self.preq = tl.layers.Dense(
#                     hidden_dim, tf.nn.relu,
#                     tf.initializers.Orthogonal(1.0),
#                     in_channels=mlp_in_shape
#                 )
#                 self.qout = tl.layers.Dense(
#                     act_dim, None,
#                     tf.initializers.Orthogonal(1.0),
#                     in_channels=hidden_dim
#                 )
#             if dueling:
#                 with tf.name_scope('Value'):
#                     hidden_dim = 256
#                     self.prev = tl.layers.Dense(
#                         hidden_dim, tf.nn.relu,
#                         tf.initializers.Orthogonal(1.0),
#                         in_channels=mlp_in_shape
#                     )
#                     self.vout = tl.layers.Dense(
#                         1, None,
#                         tf.initializers.Orthogonal(1.0),
#                         in_channels=hidden_dim
#                     )
#
#     def forward(self, obv):
#         obv = tf.cast(obv, tf.float32) / 255.0
#         mlp_in = tl.layers.flatten_reshape(self.cnn(obv))
#         q_out = self.qout(self.preq(mlp_in))
#         if self.dueling:
#             v_out = self.vout(self.prev(mlp_in))
#             q_out = v_out + q_out - tf.reduce_mean(q_out, 1, True)
#         return q_out
#
#     @property
#     def state_shape(self):
#         return copy.deepcopy(self._state_shape)
#
#     @property
#     def action_shape(self):
#         return copy.deepcopy(self._action_shape)
#
#
# class MLPQNet(tl.models.Model):
#     def __init__(self, in_dim, act_dim, dueling):
#         super().__init__()
#         self._state_shape = in_dim,
#         self._action_shape = act_dim,
#         self.dueling = dueling
#         hidden_dim = 64
#         with tf.name_scope('DQN'):
#             with tf.name_scope('MLP'):
#                 self.mlp = tl.layers.Dense(
#                     hidden_dim, tf.nn.tanh,
#                     tf.initializers.Orthogonal(1.0),
#                     in_channels=in_dim
#                 )
#             with tf.name_scope('QValue'):
#                 self.qmlp = tl.layers.Dense(
#                     act_dim, None,
#                     tf.initializers.Orthogonal(1.0),
#                     in_channels=hidden_dim
#                 )
#             if dueling:
#                 with tf.name_scope('Value'):
#                     self.vmlp = tl.layers.Dense(
#                         1, None,
#                         tf.initializers.Orthogonal(1.0),
#                         in_channels=hidden_dim
#                     )
#
#     def forward(self, obv):
#         obv = tf.cast(obv, tf.float32)
#         latent = self.mlp(obv)
#         q_out = self.qmlp(latent)
#         if self.dueling:
#             v_out = self.vmlp(latent)
#             q_out = v_out + q_out - tf.reduce_mean(q_out, 1, True)
#         return q_out
#
#     @property
#     def state_shape(self):
#         return copy.deepcopy(self._state_shape)
#
#     @property
#     def action_shape(self):
#         return copy.deepcopy(self._action_shape)