2017-09-27 106 views
1

我正在開發自駕車計劃的代碼。我在我的choose_action函數中遇到問題。代理應該從選擇的有以下步驟最高Q值的動作選擇隨機行動:隨機優化行動選擇

「其他: 行動= maxQaction」

但我現在寫的方式,每次只會選擇相同的動作。任何人都可以建議如何隨機選擇最高的Q值,也許我可以使用列表。

Code: 
import random 
import math 
from environment import Agent, Environment 
from planner import RoutePlanner 
from simulator import Simulator 
import itertools 

class LearningAgent(Agent): 
    """ An agent that learns to drive in the Smartcab world. 
     This is the object you will be modifying. """ 

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5): 
     super(LearningAgent, self).__init__(env)  # Set the agent in the evironment 
     self.planner = RoutePlanner(self.env, self) # Create a route planner 
     self.valid_actions = self.env.valid_actions # The set of valid actions 

     # Set parameters of the learning agent 
     self.learning = learning # Whether the agent is expected to learn 
     self.Q = dict()   # Create a Q-table which will be a dictionary of tuples 
     self.epsilon = epsilon # Random exploration factor 
     self.alpha = alpha  # Learning factor 

     ########### 
     ## TO DO ## 
     ########### 
     # Set any additional class parameters as needed 

     self.states = [    
      ['red', 'green'],     #light 
      ['left', 'right', 'forward', None], #vehicleleft 
      ['left', 'right', 'forward', None], #vehicleright 
      ['left', 'right', 'forward', None], #vehicleoncoming 
      ['left', 'right', 'forward']  #waypoint 
     ] 

     self.x = 0 
     random.seed(42) 

     self.q_maker = dict((k, 0.0) for k in self.valid_actions) 

     for prod_state in itertools.product(*self.states): 
      self.Q[prod_state] = self.q_maker.copy() 



    def reset(self, destination=None, testing=False): 
     """ The reset function is called at the beginning of each trial. 
      'testing' is set to True if testing trials are being used 
      once training trials have completed. """ 

     # Select the destination as the new location to route to 
     self.planner.route_to(destination) 

     ########### 
     ## TO DO ## 
     ########### 
     # Update epsilon using a decay function of your choice 
     # Update additional class parameters as needed 
     # If 'testing' is True, set epsilon and alpha to 0 

     #Added for Question 6 
     #self.x = self.x + 1 
     if testing: 
      self.epsilon = 0.0 
      self.alpha = 0.0 
     else: 
      #self.epsilon = self.epsilon - 0.05 for question 6 

      self.x += 1 
      self.epsilon = math.exp(-self.alpha*self.x) 
      #self.epsilon = math.fabs(math.cos(self.alpha*self.x)) 
      # self.epsilon = 1.0/(self.x**2) 
      # self.epsilon = self.alpha**self.x 


     return None 

    def build_state(self): 
     """ The build_state function is called when the agent requests data from the 
      environment. The next waypoint, the intersection inputs, and the deadline 
      are all features available to the agent. """ 

     # Collect data about the environment 
     waypoint = self.planner.next_waypoint() # The next waypoint 
     inputs = self.env.sense(self)   # Visual input - intersection light and traffic 
     deadline = self.env.get_deadline(self) # Remaining deadline 

     ########### 
     ## TO DO ## 
     ########### 
     # Set 'state' as a tuple of relevant data for the agent   
     #state = (waypoint, inputs['light'], inputs['left'], inputs['right'], inputs['oncoming']) #None modified for "Update the Driving Agent State" 

     state = (inputs['light'], inputs['left'], inputs['right'], inputs['oncoming'],waypoint) 

     return state 


    def get_maxQ(self, state): 
     """ The get_max_Q function is called when the agent is asked to find the 
      maximum Q-value of all actions based on the 'state' the smartcab is in. """ 

     ########### 
     ## TO DO ## 
     ########### 
     # Calculate the maximum Q-value of all actions for a given state 


     action_selections = self.Q[state] 

     maxQ = max(action_selections.items(), key=lambda x: x[1])[1] 

     return maxQ 



    def createQ(self, state): 
     """ The createQ function is called when a state is generated by the agent. """ 

     ########### 
     ## TO DO ## 
     ########### 
     # When learning, check if the 'state' is not in the Q-table 
     # If it is not, create a new dictionary for that state 
     # Then, for each action available, set the initial Q-value to 0.0 

     if not self.learning: 
      return 

     if not state in self.Q: 
      self.Q[state] = self.q_maker.copy() 

     return 


    def choose_action(self, state): 
     """ The choose_action function is called when the agent is asked to choose 
      which action to take, based on the 'state' the smartcab is in. """ 

     # Set the agent state and default action 
     self.state = state 
     self.next_waypoint = self.planner.next_waypoint() 
     action = random.choice([None, 'forward', 'left', 'right']) ##None ##Modified from None for question 3 
     #action = None # added after first submission 

     ########### 
     ## TO DO ## 
     ########### 
     # When not learning, choose a random action 
     # When learning, choose a random action with 'epsilon' probability 
     # Otherwise, choose an action with the highest Q-value for the current state 



     action_selections = self.Q[state] 
     maxQaction = max(action_selections.items(), key=lambda x: x[1])[0] 
     if self.learning: 
      choose_using_epsilon = random.random() < 1 - self.epsilon 
      if not choose_using_epsilon: 
       valid_actions = filter(lambda x: x != maxQaction, 
        Environment.valid_actions) 
       action = random.choice(valid_actions) 
      else: 
       action = maxQaction 
     else: 
      action = random.choice(Environment.valid_actions) 
     return action 

回答

1
if not self.learning or random.random() < self.epsilon: 
      action = random.choice(self.valid_actions) 
     else: 
      maxQaction= self.get_maxQ(state) 
      maxQaction= [] # build list of actions that match the max Q value 
      for act in self.Q[state]: 
       if self.Q[state][act] == maxQ: 
        maxQaction.append(act) 
      action = random.choice(maxQaction) # choose one randomly 
+0

由於該訣竅。 – user3476463