# ddqn_Agent.py

import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU')
from keras.layers import Dense, Activation
from keras.models import Sequential, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model
import numpy as np
import time

# Replay buffer: lets the agent sample (state, action, reward, ...) transitions
# drawn from many different episodes, which also helps keep it from getting stuck.
class ReplayBuffer:
    """Fixed-capacity ring buffer of transitions backed by NumPy arrays.

    Each slot holds a state, the action taken, the reward received, the
    resulting state and a "not terminal" flag (``1 - done``). Once
    ``max_size`` transitions have been written, new ones overwrite the
    oldest slots.

    Args:
        max_size: number of slots in the buffer.
        input_shape: length of a flat state vector.
        n_actions: width of the stored action vector.
        discrete: when true, ``action`` is treated as an index and stored
            one-hot as ``int8``; otherwise it is stored as given as ``float32``.
    """

    def __init__(self, max_size, input_shape, n_actions, discrete=False):
        self.mem_size = max_size
        # Total number of writes so far; it is never wrapped, only its
        # remainder modulo ``mem_size`` is used as the slot index.
        self.mem_cntr = 0
        self.discrete = discrete

        state_dims = (self.mem_size, input_shape)
        self.state_memory = np.zeros(state_dims)
        # State observed after the action was taken.
        self.new_state_memory = np.zeros(state_dims)

        action_dtype = np.float32
        if self.discrete:
            action_dtype = np.int8
        self.action_memory = np.zeros((self.mem_size, n_actions),
                                      dtype=action_dtype)

        self.reward_memory = np.zeros(self.mem_size)
        # Holds 1 - done, so multiplying by it zeroes the bootstrapped value
        # of terminal transitions.
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.float32)

    def store_transition(self, state, action, reward, state_, done):
        """Write one transition into the next slot, overwriting the oldest."""
        slot = self.mem_cntr % self.mem_size

        self.state_memory[slot] = state
        self.new_state_memory[slot] = state_

        if not self.discrete:
            self.action_memory[slot] = action
        else:
            # Discrete actions are kept as a one-hot row.
            one_hot = np.zeros(self.action_memory.shape[1])
            one_hot[action] = 1.0
            self.action_memory[slot] = one_hot

        self.reward_memory[slot] = reward
        self.terminal_memory[slot] = 1 - done
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        """Draw ``batch_size`` random transitions (with replacement).

        Returns:
            Tuple ``(states, actions, rewards, new_states, not_terminal)`` of
            arrays whose first axis has length ``batch_size``.
        """
        # Only slots that have been written are eligible, so the zero-filled
        # remainder of a partially filled buffer is never sampled.
        filled = min(self.mem_cntr, self.mem_size)
        picks = np.random.choice(filled, batch_size)

        return (
            self.state_memory[picks],
            self.action_memory[picks],
            self.reward_memory[picks],
            self.new_state_memory[picks],
            self.terminal_memory[picks],
        )

def Model(lr, n_actions, input_dims, fc_dims):
    """Build and compile the Q-value network.

    The network is a stack of three ReLU ``Dense`` layers of width
    ``fc_dims`` followed by a ``Dense`` output layer with one unit per action
    and no activation. It is compiled with Adam and a mean-squared-error loss.

    Args:
        lr: learning rate for the Adam optimizer.
        n_actions: number of output units.
        input_dims: length of the flat input vector.
        fc_dims: width of every hidden layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    network = Sequential()
    # Only the first layer declares the input shape.
    network.add(Dense(fc_dims, input_shape=(input_dims,), activation='relu'))
    for _ in range(2):
        network.add(Dense(fc_dims, activation='relu'))
    network.add(Dense(n_actions))

    network.compile(optimizer=Adam(learning_rate=lr), loss='mse')
    return network

class DDQNAgent(object):
    """Double DQN agent with an epsilon-greedy policy and a replay buffer.

    Two identically built networks are kept. ``q_eval`` is fitted on every
    ``learn`` call and is used to pick actions; ``q_target`` supplies the
    bootstrapped next-state values and is overwritten with ``q_eval``'s
    weights whenever the replay buffer's write counter is a multiple of
    ``replace_target`` at the time ``learn`` runs.

    Args:
        alpha: learning rate for both networks.
        gamma: discount factor applied to the bootstrapped next-state value.
        n_actions: number of discrete actions.
        epsilon: initial probability of taking a random action.
        batch_size: number of transitions sampled per ``learn`` call.
        input_dims: length of the flat state vector.
        epsilon_dec: factor epsilon is multiplied by after each update.
        epsilon_end: lower bound for epsilon.
        mem_size: replay buffer capacity.
        fname: base file name used by ``save_model``.
        replace_target: period (in stored transitions) of the target sync.
    """

    def __init__(self, alpha, gamma, n_actions, epsilon, batch_size,
                 input_dims, epsilon_dec=0.9995,  epsilon_end=0.01,
                 mem_size=1000000, fname='Model',
                 replace_target=100):
        self.action_space = [i for i in range(n_actions)]
        self.n_actions = n_actions
        # Discounts future value relative to the immediate reward.
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_dec = epsilon_dec
        self.epsilon_min = epsilon_end
        self.batch_size = batch_size
        self.model_file = fname
        self.replace_target = replace_target
        self.memory = ReplayBuffer(mem_size, input_dims, n_actions,
                                   discrete=True)
        self.q_eval = Model(alpha, n_actions, input_dims, 32)
        self.q_target = Model(alpha, n_actions, input_dims, 32)

    def remember(self, state, action, reward, new_state, done):
        """Store one transition in the replay buffer."""
        self.memory.store_transition(state, action, reward, new_state, done)

    def choose_action(self, state):
        """Return an action index for ``state`` using epsilon-greedy selection.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest ``q_eval`` output.
        """
        # Add the batch axis the network expects.
        state = np.array(state)[np.newaxis, :]
        if np.random.random() < self.epsilon:
            return np.random.choice(self.action_space)

        actions = self.q_eval.predict(state, verbose=0)
        return np.argmax(actions)

    def learn(self):
        """Run one Double DQN update on a sampled batch.

        Does nothing until the buffer has received more than ``batch_size``
        transitions. After fitting ``q_eval`` it decays epsilon and, when due,
        copies the weights into ``q_target``.
        """
        if self.memory.mem_cntr <= self.batch_size:
            return

        # ``not_done`` is the buffer's terminal mask (1 - done).
        state, action, reward, new_state, not_done = \
            self.memory.sample_buffer(self.batch_size)

        # Actions are stored one-hot; argmax recovers the index without the
        # int8 arithmetic a dot product with the index vector would need.
        action_indices = np.argmax(action, axis=1)

        q_next = self.q_target.predict(new_state, verbose=0)
        q_eval_next = self.q_eval.predict(new_state, verbose=0)
        # Start from the current predictions so only the taken action's
        # entry contributes to the loss.
        q_target = self.q_eval.predict(state, verbose=0)

        # Double DQN: q_eval selects the next action, q_target evaluates it.
        max_actions = np.argmax(q_eval_next, axis=1)
        batch_index = np.arange(self.batch_size, dtype=np.int32)

        q_target[batch_index, action_indices] = reward + \
            self.gamma * q_next[batch_index, max_actions] * not_done

        self.q_eval.fit(state, q_target, verbose=0)

        self._decay_epsilon()
        if self.memory.mem_cntr % self.replace_target == 0:
            self.update_network_parameters()

    def _decay_epsilon(self):
        """Multiply epsilon by ``epsilon_dec`` without going below ``epsilon_min``."""
        self.epsilon = max(self.epsilon * self.epsilon_dec, self.epsilon_min)

    def update_network_parameters(self):
        """Copy the weights of ``q_eval`` into ``q_target``."""
        self.q_target.set_weights(self.q_eval.get_weights())

    def save_model(self):
        """Save ``q_eval`` to ``Models/<fname>-<dd-mm-YYYY-HH-MM>.h5``."""
        import os

        timestr = time.strftime("-%d-%m-%Y-%H-%M")
        path = "Models/"+self.model_file+timestr+".h5"
        # Saving fails if the target directory is missing, so create it first.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        self.q_eval.save(path)

    def load_model(self,path):
        """Load both networks from the model file at ``path``."""
        self.q_eval = load_model(path)
        self.q_eval.summary()
        self.q_target = load_model(path)
        # In evaluation mode (epsilon == 0) make sure q_target carries the
        # same weights as q_eval.
        if self.epsilon == 0.0:
            self.update_network_parameters()

    def Plotit(self):
        """Write a diagram of ``q_eval`` to ``dot_img_file.png``."""
        plot_model(self.q_eval, to_file="dot_img_file.png", show_shapes=True)
        
  
import asyncio
import websockets
import json
import numpy as np
from threading import Thread
from asyncio import Lock

class WS_DDQN:
    """Synchronous proxy that forwards DDQN agent calls over a WebSocket.

    A WebSocket server is hosted on a private asyncio event loop running in a
    daemon thread. Each command is sent to the connected client as the JSON
    object ``{"command": <name>, ...arguments}``; the calling thread then
    blocks until the client's next message arrives, which is parsed as JSON
    and treated as the reply.

    Only the most recently connected client is used. When no client is
    connected, or the client goes away before replying, the command yields
    ``None`` instead of blocking.

    Args:
        host: interface the server binds to.
        port: TCP port the server listens on.
    """

    def __init__(self, host='localhost', port=8765):
        self.host = host
        self.port = port
        self.server = None
        self.ws_connection = None
        self.currentMsg = None
        self.lock = Lock()
        self.loop = asyncio.new_event_loop()
        # Start the loop thread last so every attribute it may touch exists.
        self.thread = Thread(target=self._run_event_loop, daemon=True)
        self.thread.start()

    def _run_event_loop(self):
        """Thread target: run the private event loop until it is stopped."""
        asyncio.set_event_loop(self.loop)
        self.loop.run_forever()

    def start_ws(self):
        """Start the WebSocket server and block until it is listening."""
        future = asyncio.run_coroutine_threadsafe(self._start_ws(), self.loop)
        future.result()  # Wait for the server to start

    async def _start_ws(self):
        self.server = await websockets.serve(self._handle_connection, self.host, self.port)
        print(f"WebSocket server started on ws://{self.host}:{self.port}")

    def stop_ws(self):
        """Close the WebSocket server and block until it has shut down."""
        future = asyncio.run_coroutine_threadsafe(self._stop_ws(), self.loop)
        future.result()  # Wait for the server to stop

    async def _stop_ws(self):
        if self.server:
            self.server.close()
            await self.server.wait_closed()
            print("WebSocket server stopped")

    async def _handle_connection(self, websocket, path=None):
        """Record the client connection and keep its latest message.

        ``path`` is optional because newer ``websockets`` releases call the
        handler with the connection only.
        """
        self.ws_connection = websocket
        print("Colab client connected")
        try:
            async for message in websocket:
                print(f"Received from Colab: {message}")
                self.currentMsg = message
        except websockets.exceptions.ConnectionClosed:
            print("Colab client disconnected")
        finally:
            self.ws_connection = None

    @staticmethod
    def _json_default(value):
        """Convert NumPy arrays/scalars (anything with ``tolist``) for JSON."""
        if hasattr(value, "tolist"):
            return value.tolist()
        raise TypeError(
            f"Object of type {type(value).__name__} is not JSON serializable")

    async def _send_command(self, command, **kwargs):
        """Send one command and wait for the client's next message.

        Returns:
            The reply parsed from JSON, or ``None`` when there is no client
            or the client disconnects before replying.
        """
        # The lock keeps commands strictly one-at-a-time so a reply cannot be
        # attributed to the wrong command.
        async with self.lock:
            connection = self.ws_connection
            if connection is None:
                print("No connection to Colab")
                return None

            # Discard any earlier, unclaimed message so that whatever arrives
            # next is recognised as the reply even if its text is identical.
            self.currentMsg = None
            message = {"command": command, **kwargs}
            await connection.send(
                json.dumps(message, default=self._json_default))
            print(f"Sent to Colab: {message}")

            while self.currentMsg is None:
                # Stop waiting if the client we sent to is gone; otherwise
                # this loop would never finish.
                if self.ws_connection is not connection:
                    print("No reply from Colab: connection lost")
                    return None
                await asyncio.sleep(0.1)

            result = json.loads(self.currentMsg)
            self.currentMsg = None
            return result

    def _run_command(self, command, **kwargs):
        """Run ``_send_command`` on the loop thread and return its result.

        ``Future.result()`` already blocks until the coroutine finishes, so a
        ``None`` result is final and is returned as-is rather than retried.
        """
        future = asyncio.run_coroutine_threadsafe(self._send_command(command, **kwargs), self.loop)
        return future.result()

    # DDQN-related commands (now synchronous)
    def choose_action(self, observation):
        """Ask the client for an action; ``None`` when no reply is available."""
        result = self._run_command("choose_action", observation=observation)
        return result['action'] if result else None

    def remember(self, state, action, reward, new_state, done):
        """Send one transition to the client."""
        self._run_command("remember", 
                          state=state, 
                          action=action, 
                          reward=reward, 
                          new_state=new_state, 
                          done=done)

    def learn(self):
        """Send the ``learn`` command."""
        self._run_command("learn")

    def save_model(self):
        """Send the ``save_model`` command."""
        self._run_command("save_model")

    def load_model(self, filepath):
        """Send the ``load_model`` command with ``filepath``."""
        self._run_command("load_model", filepath=filepath)

    def update_network_parameters(self):
        """Send the ``update_network_parameters`` command."""
        self._run_command("update_network_parameters")

    def get_epsilon(self):
        """Return the ``epsilon`` field of the client's reply, or ``None``."""
        result = self._run_command("get_epsilon")
        return result['epsilon'] if result else None

    def get_memory_counter(self):
        """Return the ``memory_counter`` field of the reply, or ``None``."""
        result = self._run_command("get_memory_counter")
        return result['memory_counter'] if result else None