# Q-Learning Example

Andrew H. Fagg


In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import fnmatch
import matplotlib.pyplot as plt
import tensorflow.keras as keras

#from tensorflow import keras
from tensorflow.keras.layers import LeakyReLU, UpSampling1D, Input, InputLayer, Reshape, Activation, Lambda, AveragePooling1D
from tensorflow.keras.layers import Convolution2D, Dense, MaxPooling2D, Flatten, BatchNormalization, Dropout, Conv2DTranspose, Concatenate
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import RMSprop
import random
#import skimage.transform as sktr
import gym
from mpl_toolkits.mplot3d import Axes3D
import re
 

#from sklearn.p
import sklearn.metrics

from sklearn.utils.extmath import cartesian

####################################

# Shared plotting constants
FONTSIZE = 18
FIGURE_SIZE = (10,4)
FIGURE_SIZE2 = (10,10)

# Apply the font size, default figure size and tick-label sizes in one
#  rcParams update
plt.rcParams.update({
    'font.size': FONTSIZE,
    'figure.figsize': FIGURE_SIZE,
    'xtick.labelsize': FONTSIZE,
    'ytick.labelsize': FONTSIZE,
})

In [None]:
class numpyBuffer:
    '''
    Fixed-capacity ring buffer stored in a 2D numpy array
    
    Rows are only ever appended.  Once maxsize rows have been written, the
    write position wraps around to row 0 and older rows are overwritten.
    '''
    def __init__(self, maxsize=100, ndims=1, dtype=np.float32):
        '''
        Allocate the (zero-filled) storage and reset the write position
        
        :param maxsize: Capacity of the buffer (number of rows)
        :param ndims: Number of columns in each row
        :param dtype: Numpy dtype of the stored values
        '''
        self.buffer = np.zeros((maxsize, ndims), dtype=dtype)
        self.maxsize = maxsize
        self.ndims = ndims
        # Index of the next row to be written
        self.back = 0
        # Becomes True the first time the write position wraps around
        self.full = False

    def size(self):
        '''
        :return: Number of rows currently holding appended data
        '''
        # Before the first wrap-around, only rows [0, back) have been written
        return self.maxsize if self.full else self.back

    def append(self, rowvec):
        '''
        Write one row at the current position and advance (with wrap-around)
        
        :param rowvec: Values for the row; must be assignable to a row of
                length ndims
        '''
        self.buffer[self.back, :] = rowvec
        next_slot = self.back + 1
        if next_slot >= self.maxsize:
            # Ran off the end: restart at the top and remember that every
            #  row now holds data
            self.back = 0
            self.full = True
        else:
            self.back = next_slot

    def getrows(self, row_indices):
        '''
        Fetch a set of rows by index
        
        :param row_indices: Array of row indices into the buffer
        :return: len(row_indices) x ndims numpy array
        '''
        return self.buffer[row_indices, :]
    

In [None]:
class myAgent:
    '''
    Epsilon-greedy Q-learning agent
    
    A Keras network maps a state vector to one Q-value per discrete action.
    Every executed step is logged in circular buffers; learning_step() samples
    a minibatch from that log and fits the network to the Q-learning targets.
    '''
    def __init__(self, state_size, action_size, action_continuous, epsilon=.01, gamma=0.99, 
                 lrate=.001, action_discrete=True, maxlen=10000):
        '''
        :param state_size: Number of state variables
        :param action_size: Number of discrete actions (one network output per action)
        :param action_continuous: List of continuous actions that correspond to the discrete choices
        :param epsilon: Constant exploration rate
        :param gamma: Constant discount rate
        :param lrate: Learning rate
        :param action_discrete: If True, the environment is stepped with the discrete 
                action index; otherwise with the matching entry of action_continuous
        :param maxlen: Maximum length of the circular experience buffer
        
        Experience buffer is designed for quick access to prior experience
        '''
        self.state_size = state_size
        self.action_size = action_size
        self.action_continuous = action_continuous
        self.epsilon = epsilon
        self.gamma = gamma
        self.lrate = lrate
        self.action_discrete = action_discrete
        
        # One accumulated-reward entry per executed trial
        self.reward_log = []
        
        # Debugging output switches
        self.verbose = False
        self.verbose_execute = False
        
        # Parallel circular buffers: row k of each buffer describes the same step
        self.log_observation = numpyBuffer(maxlen, state_size)
        self.log_observation_new = numpyBuffer(maxlen, state_size)
        self.log_action = numpyBuffer(maxlen, 1, dtype=np.int16)
        self.log_reward = numpyBuffer(maxlen, 1)
        # np.bool_ rather than the np.bool alias, which several NumPy releases do not provide
        self.log_done = numpyBuffer(maxlen, 1, dtype=np.bool_)
        
    def _make_optimizer(self):
        '''
        :return: Adam optimizer configured with the agent's learning rate
        '''
        # learning_rate replaces the deprecated lr argument.  The epsilon=None
        #  and decay=0.0 arguments were dropped: they only requested the default
        #  behavior and newer Keras optimizers reject them.
        return keras.optimizers.Adam(learning_rate=self.lrate, beta_1=0.9, beta_2=0.999, 
                                     amsgrad=False)
        
    def build_model(self, n_units, activation='elu', lambda_regularization=None):
        '''
        Simple sequential model.  The result is stored in self.model
        
        :param n_units: Number of units in each hidden layer (a list)
        :param activation: Activation function for the hidden units
        :param lambda_regularization: None or a continuous value (currently not used)
        '''
        model = Sequential()
        self.model = model
        
        # Input layer
        model.add(InputLayer(input_shape=(self.state_size,)))
        
        # Hidden layers are named D0, D1, ...
        for i, n in enumerate(n_units):
            model.add(Dense(n, 
                        activation=activation,
                        use_bias=True,
                        kernel_initializer='truncated_normal', 
                        bias_initializer='zeros', 
                        name = "D"+str(i)))
            
        # Output layer: one linear unit (Q-value) per discrete action
        model.add(Dense(self.action_size, 
                        activation=None,
                        use_bias=True,
                        kernel_initializer='truncated_normal', 
                        bias_initializer='zeros',  
                        name = "D"+str(len(n_units))))
        
        # Configure model
        model.compile(loss='mse', optimizer=self._make_optimizer())
        
        # summary() prints the table itself (its return value is not printed)
        model.summary()
        
    def build_model2(self, n_units, activation='elu', lambda_regularization=0.0):
        '''
        Model with an independent branch for each action.  (not used right now)
        The result is stored in self.model
        
        Uses the Model API
        
        Regularization has been turned off
        
        :param n_units: Number of units in each hidden layer of every branch (a list)
        :param activation: Activation function for the hidden units
        :param lambda_regularization: Currently not used
        '''
        
        # Input layer (shared by all branches)
        input_tensor = Input(shape=(self.state_size,), name='input')
        
        output_tensors = []
        
        # Loop over actions: each one gets its own stack of hidden layers
        for a in range(self.action_size):
            
            dense_tensor = input_tensor
            # Loop over hidden layers
            for i, n in enumerate(n_units):
                dense_tensor = Dense(n, 
                        activation=activation,
                        use_bias=True,
                        kernel_initializer='truncated_normal', 
                        bias_initializer='zeros', 
                        name = "D_Br"+str(a)+"_L"+str(i))(dense_tensor)
            
            # Output layer: a single linear unit holding this action's Q-value
            output_tensor = Dense(1, 
                        activation=None,
                        use_bias=True,
                        kernel_initializer='truncated_normal', 
                        bias_initializer='zeros', 
                        name = "O"+str(a))(dense_tensor)
            
            output_tensors.append(output_tensor)
        
        # Concatenate the actions together
        output_tensor = Concatenate()(output_tensors)
        
        # Configure model
        model = Model(inputs=input_tensor, outputs=output_tensor)
        model.compile(loss='mse', optimizer=self._make_optimizer())
        
        self.model = model
        
        # summary() prints the table itself (its return value is not printed)
        model.summary()
        
        
    def choose_action(self, observation, verbose=False):
        '''
        epsilon-greedy choice of discrete action
        
        :param observation: Observation in the form accepted by self.model.predict
                (choose_action_continuous passes a 1 x state_size array)
        :param verbose: If True, print the predicted Q-values for greedy choices
        :returns: (discrete_action, explore_bit)

        '''
        if np.random.rand() <= self.epsilon:
            # Explore: uniformly random action
            return np.random.randint(self.action_size), True
        
        # Exploit: action with the largest predicted Q-value
        pred = self.model.predict(observation)[0]
        if verbose:
            print(pred)
        return np.argmax(pred), False
    
    def choose_action_continuous(self, observation, verbose=False):
        '''
        epsilon-greedy choice of continuous action
        
        :param observation: State observation (converted to a 2D array here)
        :param verbose: Passed through to choose_action
        :returns: (discrete_action, continuous_action, explore_bit)
        '''
        observation = np.array(observation, ndmin=2)
        action_index, explore = self.choose_action(observation, verbose)
        return action_index, self.action_continuous[action_index], explore
    
    def log_experience(self, observation, action_index, reward, observation_new, done):
        ''' 
        Store the last step in the circular buffers
        
        :param observation: Observation before the action was taken
        :param action_index: Discrete index of the executed action
        :param reward: Reward received for the step
        :param observation_new: Observation after the action was taken
        :param done: True if this step ended the episode
        '''
        # Convert to numpy arrays
        observation = np.array(observation, ndmin=2)
        observation_new = np.array(observation_new, ndmin=2)
        
        self.log_observation.append(observation)
        self.log_observation_new.append(observation_new)
        self.log_action.append(action_index)
        self.log_reward.append(reward)
        self.log_done.append(done)
        
    @staticmethod
    def _compute_targets(targets, q_next, rewards, dones, actions, gamma):
        '''
        Build the Q-learning training targets for a minibatch
        
        For each example only the entry of the executed action is changed:
            terminal step:      reward
            non-terminal step:  reward + gamma * max_a' Q(s', a')
        
        :param targets: batch x action_size array of current Q(s, .) estimates
        :param q_next: batch x action_size array of Q(s', .) estimates
        :param rewards: Length-batch array of rewards
        :param dones: Length-batch boolean array (True = last step of an episode)
        :param actions: Length-batch array of executed discrete action indices
        :param gamma: Discount rate
        :return: New batch x action_size array of targets (the input is not modified)
        '''
        targets = np.array(targets)
        rewards = np.asarray(rewards)
        dones = np.asarray(dones, dtype=bool)
        actions = np.asarray(actions, dtype=np.intp)
        
        # Value of the best action available from each successor state
        q_next_max = np.max(q_next, axis=1)
        
        done_list = np.flatnonzero(dones)
        done_not_list = np.flatnonzero(~dones)
        
        # Last step in the episodes: no future reward to bootstrap from
        targets[done_list, actions[done_list]] = rewards[done_list]
        # Other steps
        targets[done_not_list, actions[done_not_list]] = \
            rewards[done_not_list] + gamma * q_next_max[done_not_list]
        
        return targets
                
    def learning_step(self, batch_size=200):
        '''
        Sample a minibatch of the stored experience & fit the Q-function to
        the corresponding Q-learning targets (one epoch)

        :param batch_size: Size of the batch to do learning with
        
        '''
        n_stored = self.log_observation.size()
        if n_stored == 0:
            # No experience has been logged yet: nothing to learn from
            return
        
        # Sample from the prior experience.  How we do this depends on how much
        #  experience that we have accumulated so far
        if n_stored < batch_size:
            # Not enough for a full batch: use everything
            minibatch_inds = np.arange(n_stored)
        else:
            # Random sample (without replacement) from the buffer
            minibatch_inds = random.sample(range(n_stored), batch_size)
        
        print("Creating batch:", len(minibatch_inds))
        observations = self.log_observation.getrows(minibatch_inds)
        observations_new = self.log_observation_new.getrows(minibatch_inds)
        rewards = self.log_reward.getrows(minibatch_inds)[:,0]
        dones = self.log_done.getrows(minibatch_inds)[:,0]
        actions = self.log_action.getrows(minibatch_inds)[:,0]
        
        # Current estimates for the sampled states and their successors
        targets = self.model.predict(observations)
        q_next = self.model.predict(observations_new)
        
        # Update targets: for each example, only one action is updated
        #  (the one that was actually executed)
        targets = self._compute_targets(targets, q_next, rewards, dones, 
                                        actions, self.gamma)
        
        # Update the Q-function
        self.model.fit(observations, targets, epochs=1, verbose=0)

        if self.verbose:
            print(observations, targets)
    
    def execute_trial(self, env, nsteps, render_flag=False, batch_size=100):
        '''
        A trial terminates at nsteps or when the environment says we must stop.
        One learning step is taken at the end of the trial, and the accumulated
        reward is printed and appended to self.reward_log.
        
        :param env: Environment; reset() must return an observation and step()
                a (observation, reward, done, info) tuple
        :param nsteps: Maximum number of steps in the trial
        :param render_flag: If True, render every step and close the environment 
                at the end of the trial
        :param batch_size: Minibatch size for the learning step
        '''
        observation = env.reset()
        
        # Accumulator for total reward
        reward_total = 0
        
        # Loop over each step
        for _ in range(nsteps):
            if render_flag:
                env.render()
            
            # Figure out which action to execute
            action_index, action_continuous, explore = self.choose_action_continuous(
                observation, verbose=self.verbose_execute)
            
            # Some environments require discrete actions, while others require continuous actions
            if self.action_discrete:
                observation_new, reward, done, info = env.step(action_index)
            else:
                observation_new, reward, done, info = env.step(action_continuous)
                
            # Remember reward
            reward_total = reward_total + reward
            if self.verbose_execute:
                print(observation, action_index, reward, observation_new, done)
                
            # Log this step 
            self.log_experience(observation, action_index, reward, 
                                    observation_new, done)
                
            if done:
                # Environment says we are done
                break
                
            # Prepare for the next step
            observation = observation_new
            
        # Learning
        self.learning_step(batch_size=batch_size)
        if render_flag:
            env.close()
        print(reward_total)
        
        # Log accumulated reward for this trial
        self.reward_log.append(reward_total)
        
    def execute_ntrials(self, env, ntrials, nsteps, render_flag=False, batch_size=100):
        '''
        Execute the specified number of trials
        
        :param env: Environment passed to execute_trial
        :param ntrials: Number of trials to run
        :param nsteps: Maximum number of steps in each trial
        :param render_flag: Passed to execute_trial
        :param batch_size: Passed to execute_trial
        '''
        for _ in range(ntrials):
            self.execute_trial(env, nsteps, render_flag, batch_size)
        
            

## Cart-Pole example

In [None]:
# Instantiate the Gym environment registered under the id 'CartPole-v1'
env = gym.make('CartPole-v1')

In [None]:
# Cart-pole is a discrete action environment, so the agent keeps the default
#  action_discrete=True (the provided continuous values are dummies)
agent = myAgent(state_size=4,
                action_size=2,
                action_continuous=[[-1], [1]],
                epsilon=0.1,
                gamma=0.99,
                lrate=.001)

# Three hidden layers (20, 10 and 5 units) ahead of the Q-value output layer
agent.build_model(n_units=[20, 10, 5])

In [None]:
# Show accumulated reward as a function of trial
#  (execute_trial appends one total-reward entry to reward_log per trial)
# NOTE(review): no visible cell runs trials with `agent`, so this log may
#  still be empty at this point -- confirm
plt.plot(agent.reward_log)


## Pendulum


In [None]:
# Instantiate the Gym environment registered under the id 'Pendulum-v0'
env2 = gym.make('Pendulum-v0')

In [None]:
# Learning
# Pendulum has a 3-element observation and a single continuous torque action,
#  so the torque range is discretized into a small set of candidate values and
#  the environment is stepped with the continuous value (action_discrete=False).
# The number of torque levels, network size and trial counts are starting
#  points to be tuned.
agent2 = myAgent(3, 5, [[-2.0], [-1.0], [0.0], [1.0], [2.0]],
                 gamma=0.99, epsilon=0.1, lrate=.001, action_discrete=False)
agent2.build_model([20, 10, 5])

# Learn without rendering
agent2.execute_ntrials(env2, 100, 200)


In [None]:
# Show accumulated reward as a function of trial
#  (execute_trial appends one total-reward entry to reward_log per trial)
# NOTE(review): `agent2` has to be created and trained by an earlier cell
#  before this one is run
plt.plot(agent2.reward_log)

In [None]:
# Keep learning for 10 more trials (up to 1000 steps each), drawing every step
agent2.execute_ntrials(env2, ntrials=10, nsteps=1000, render_flag=True)