
I'm trying to understand how to use @tf.function properly in an A2C problem.

I constantly get the following error:

Cannot convert a symbolic Keras input/output to a numpy array. This error may indicate that you're trying to pass a symbolic value to a NumPy call, which is not supported. Or, you may be trying to pass Keras symbolic inputs/outputs to a TF API that does not register dispatching, preventing Keras from automatically converting the API call to a lambda layer in the Functional Model.

The agent is built as follows:

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K


class Agent():
    learning_rate = 0.0001
    CLIP_EDGE = 1e-8
    entropy = 0.0001
    critic_weight = 0.95
    def __init__(self,state_shape,action_size,hidden_neurons,memory,learning_rate = learning_rate, CLIP_EDGE = CLIP_EDGE, entropy = entropy, 
                 critic_weight = critic_weight, actor_name = "actor",critic_name = "critic", policy_name = "policy",main_folder = "main_folder"):
        
        self.state_shape = state_shape
        self.action_size = action_size
        self.hidden_neurons = hidden_neurons
        self.memory = memory
        self.learning_rate = learning_rate
        self.CLIP_EDGE = CLIP_EDGE
        self.entropy = entropy
        self.critic_weight = critic_weight
        self.actor_name = actor_name
        self.critic_name = critic_name
        self.policy_name = policy_name
        self.main_folder = main_folder
        
        self.actor, self.critic, self.policy = self.build_networks()
        
     
            
    def act(self, state):
        """Selects an action for the agent to take given a game state.

        Args:
            state (list of numbers): The state of the environment to act on.
            training (bool): True if the agent is training.

        Returns:
            (int) The index of the action to take.
        """
        # If not acting randomly, take action with highest predicted value.
        state_batch = np.expand_dims(state, axis=0)
        probabilities = self.policy.predict(state_batch)[0]
        action = np.random.choice(self.action_size, p=probabilities)
        return action
    
    
    def learn(self, print_variables=False):
        """Trains the Deep Q Network based on stored experiences."""
        gamma = self.memory.gamma
        experiences = self.memory.sample()
        state_mb, action_mb, reward_mb, dones_mb, next_value = experiences
        
        # One-hot encode actions
        actions = np.zeros([len(action_mb), self.action_size])
        actions[np.arange(len(action_mb)), action_mb] = 1

        #Apply TD(0)
        discount_mb = reward_mb + next_value * gamma * (1 - dones_mb)
        state_values = self.critic.predict([state_mb])
        advantages = discount_mb - np.squeeze(state_values)
        
        
        if print_variables:
            print("discount_mb", discount_mb)
            print("next_value", next_value)
            print("state_values", state_values)
            print("advantages", advantages)
        else:
            self.actor.train_on_batch(
                [state_mb, advantages], [actions, discount_mb])
        
            
    def build_networks(self):
        """Creates Actor Critic Neural Networks.

        Creates a hidden-layer Policy Gradient Neural Network. The loss
        function is altered to be a log-likelihood function weighted
        by an action's advantage.

        """

        state_input = Input(shape=self.state_shape, name='frames')
        advantages = Input((1,), name='advantages')  # PG, A instead of G

        # PG
        actor_1 = Dense(units=self.hidden_neurons, activation="relu",name='actor1')(state_input)
        actor_3 = Dense(units=int(self.hidden_neurons), activation="relu",name='actor3')(actor_1)
        adrop_1 = Dropout(0.2,name='actor_drop_1')(actor_3)
        actor_4 = Dense(units = self.hidden_neurons, activation="relu")(adrop_1)
        probabilities = Dense(self.action_size, activation='softmax',name='actor_output')(actor_4)

        # DQN
        critic_1 = Dense(units = self.hidden_neurons,activation="relu",name='critic1')(state_input)
        critic_3 = Dense(units = int(self.hidden_neurons), activation="relu",name='critic3')(critic_1)
        cdrop_1 = Dropout(0.2,name='critic_drop_1')(critic_3)
        critic_4 = Dense(units = self.hidden_neurons, activation="relu")(cdrop_1)  # activation was relu by mistake... changed to elu, MONITOR
        values = Dense(1, activation='linear',name='critic_output')(critic_4)

        def actor_loss(y_true, y_pred):  # PG
            y_pred_clipped = K.clip(y_pred, self.CLIP_EDGE, 1-self.CLIP_EDGE)
            log_lik = y_true*K.log(y_pred_clipped)
            entropy_loss = y_pred * K.log(K.clip(y_pred, self.CLIP_EDGE, 1-self.CLIP_EDGE))  # New
            return K.sum(-log_lik * advantages) - (self.entropy * K.sum(entropy_loss))

        # Train both actor and critic at the same time.
        actor = Model(
            inputs=[state_input, advantages], outputs=[probabilities, values])
        actor.compile(
            loss=[actor_loss, 'mean_squared_error'],  # [PG, DQN]
            loss_weights=[1, self.critic_weight],  # [PG, DQN]
            optimizer=Adam(learning_rate=self.learning_rate))#,clipnorm=1.0))

        critic = Model(inputs=[state_input], outputs=[values])
        policy = Model(inputs=[state_input], outputs=[probabilities])

        tf.keras.utils.plot_model(actor,f"{self.main_folder}/Agents/{self.actor_name}.png",show_shapes=True)
        tf.keras.utils.plot_model(critic,f"{self.main_folder}/Agents/{self.critic_name}.png",show_shapes=True)
        tf.keras.utils.plot_model(policy,f"{self.main_folder}/Agents/{self.policy_name}.png",show_shapes=True)
        
        return actor, critic, policy
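
For context, in case the loss setup matters: as far as I understand, the targets computed in learn() together with actor_loss above implement the usual advantage actor-critic objective, roughly:

G_t = r_t + \gamma \, V(s_{t+1}) \, (1 - \mathrm{done}_t)    \quad\text{(discount_mb)}
A_t = G_t - V(s_t)    \quad\text{(advantages)}
L_{\text{actor}} = -\sum_t A_t \, y_t \cdot \log \pi_\theta(s_t) \;-\; \beta \sum_t \pi_\theta(s_t) \cdot \log \pi_\theta(s_t)

where y_t is the one-hot encoded action, \beta = self.entropy, and the probabilities are clipped to [CLIP_EDGE, 1 - CLIP_EDGE] before taking logs.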

The loop where the agent interacts with the environment is this:

with tf.Graph().as_default():
    agent = Agent()
    environment = Environment()
    state = environment.reset()
    done = False
    while not done:
        action = agent.act(state)
        state,reward,done,info = environment.step(action)
        next_value = agent.critic.predict([[state]])
        agent.memory.add((state,action,reward,done,next_value))
        if agent.memory.full():
            agent.learn()

This works fine. My problem comes when I try to switch to @tf.function, because it seems (as far as I know) to increase training speed (I also ran a small benchmark in a Jupyter notebook and the wrapped call is indeed faster).
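
The benchmark was roughly along these lines (a minimal sketch with a small stand-in model, not my actual networks, just comparing the eager call with the @tf.function-wrapped one):

import timeit

import numpy as np
import tensorflow as tf

# Small stand-in model just for timing; hypothetical sizes, not my real networks.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation="relu", input_shape=(8,)),
    tf.keras.layers.Dense(4, activation="softmax"),
])

@tf.function
def fast_predict(x):
    return model(x)

x = np.random.rand(1, 8).astype(np.float32)

fast_predict(x)  # warm-up call so the one-time tracing cost is not measured

print("eager call:      ", timeit.timeit(lambda: model(x), number=1000))
print("tf.function call:", timeit.timeit(lambda: fast_predict(x), number=1000))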

The "refactored" code is this:

The main loop:

agent = Agent()
environment = Environment()
state = environment.reset()
done = False
while not done:
    action = agent.act(state)
    state,reward,done,info = environment.step(action)
    next_value = agent.model_predict(agent.critic,[[state]]).numpy() #REMOVED .predict FROM MODEL
    agent.memory.add((state,action,reward,done,next_value))
    if agent.memory.full():
        agent.learn()

The modified functions in the Agent class:

@tf.function #NEW FUNCTION ADDED USING @tf.function
def model_predict(self,model,x):
    return model(x)

def act(self, state): #MODIFIED FUNCTION, NOW USES self.model_predict
    """Selects an action for the agent to take given a game state.
    
    Args:
        state (list of numbers): The state of the environment to act on.
        training (bool): True if the agent is training.
    
    Returns:
        (int) The index of the action to take.
    """
    # If not acting randomly, take action with highest predicted value.
    state_batch = np.expand_dims(state, axis=0)
    probabilities = self.model_predict(self.policy,state_batch).numpy()[0]
    action = np.random.choice(self.action_size, p=probabilities)
    return action


def learn(self, print_variables=False): #MODIFIED FUNCTION, NOW USES self.model_predict
    """Trains the Deep Q Network based on stored experiences."""
    gamma = self.memory.gamma
    experiences = self.memory.sample()
    state_mb, action_mb, reward_mb, dones_mb, next_value = experiences
            
    # One-hot encode actions
    actions = np.zeros([len(action_mb), self.action_size])
    actions[np.arange(len(action_mb)), action_mb] = 1
    
    #Apply TD(0)
    discount_mb = reward_mb + next_value * gamma * (1 - dones_mb)
    state_values = self.model_predict(self.critic,[state_mb]).numpy()
    advantages = discount_mb - np.squeeze(state_values)
            
            
    if print_variables:
        print("discount_mb", discount_mb)
        print("next_value", next_value)
        print("state_values", state_values)
        print("advantages", advantages)
    else:
        self.actor.train_on_batch(
            [state_mb, advantages], [actions, discount_mb])

The error is triggered when self.actor.train_on_batch is executed, giving me the error mentioned above. Why does this happen, and what am I doing wrong?
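
In case it helps narrow it down, here is a stripped-down toy version of the pattern I'm using (hypothetical shapes and layer sizes, not my real networks; I haven't isolated the error outside my project, so treat this as a sketch of the pattern rather than a verified reproduction):

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K

# Toy shapes and sizes, just to show the pattern.
state_input = Input(shape=(4,), name='frames')
advantages = Input((1,), name='advantages')

hidden = Dense(16, activation="relu")(state_input)
probabilities = Dense(2, activation='softmax', name='actor_output')(hidden)
values = Dense(1, activation='linear', name='critic_output')(hidden)

def actor_loss(y_true, y_pred):
    # Same idea as above: the custom loss closes over the symbolic `advantages` input.
    y_pred_clipped = K.clip(y_pred, 1e-8, 1 - 1e-8)
    log_lik = y_true * K.log(y_pred_clipped)
    return K.sum(-log_lik * advantages)

actor = Model(inputs=[state_input, advantages], outputs=[probabilities, values])
actor.compile(loss=[actor_loss, 'mean_squared_error'],
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001))

critic = Model(inputs=[state_input], outputs=[values])

@tf.function
def model_predict(model, x):
    return model(x)

state_mb = np.random.rand(8, 4).astype(np.float32)
actions = np.eye(2)[np.random.randint(0, 2, size=8)]
discount_mb = np.random.rand(8).astype(np.float32)

state_values = model_predict(critic, state_mb).numpy()  # this part works on its own
advantages_mb = discount_mb - np.squeeze(state_values)

# In my real code, this is the call that raises the symbolic input/output error.
actor.train_on_batch([state_mb, advantages_mb], [actions, discount_mb])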
