def act(self, data, t):
    """Choose an action for the current input with an epsilon-greedy rule.

    With probability ``rate`` a uniformly random action index is returned;
    otherwise the index of the largest model output is returned.

    Args:
        data: Model input for the current step; passed unchanged to
            ``self.model.predict``.
        t: Step index, forwarded to ``self.get_exploration_rate``.

    Returns:
        A tuple ``(action, options, rate)``: the chosen action index, the
        squeezed model output, and the exploration rate used for this call.
    """
    rate = self.get_exploration_rate(t)
    # Decide before predicting so the ``random`` module is consumed in the
    # same order on both paths (random() first, then randrange()).
    explore = random.random() < rate
    # The model output is needed on both paths because it is returned.
    options = np.squeeze(self.model.predict(data))
    if explore:
        action = random.randrange(self.action_size)
    else:
        action = options.argmax()
    return action, options, rate


def train(self):
    """Run the episode loop and periodically fit ``self.model``.

    Each step asks ``act`` for an action, applies it through
    ``eval(command[action])``, records the model input, the reward and the
    one-hot action vector multiplied by the model outputs. An episode ends
    after ``max_steps`` steps or as soon as the new state is close to
    ``self.final_state``. At the end of an episode the recorded rewards are
    discounted and normalised, and every ``batch_size`` episodes the model is
    fitted on ``prob_actions - self.learning_rate * ep_dlogp``.

    Returns:
        None. The loop only ends once ``max_episodes`` episodes have run.
    """
    batch_size = 200
    max_steps = 5            # an episode is cut off after this many steps
    max_episodes = 3000000
    t = 0  # step index inside the current episode

    # NOTE(review): several locals below (proj_data, reward_data, tr_y,
    # avg_reward, first_step, prev_data, prev_state) are never read here.
    # They are kept because the strings in ``command`` are evaluated in this
    # local scope and may refer to them -- confirm before removing any.
    states, prob_actions, dlogps, drs, proj_data, reward_data = [], [], [], [], [], []
    tr_x, tr_y = [], []
    avg_reward = []
    reward_sum = 0
    ep_number = 0
    prev_state = None
    first_step = True
    new_state = self.value
    data_inp = self.data

    while ep_number < max_episodes:
        prev_data = data_inp
        prev_state = new_state
        states.append(new_state)

        action, probs, rate = self.act(data_inp, t)
        prob_actions.append(probs)

        # One-hot encoding of the chosen action.
        y = np.zeros([self.action_size])
        y[action] = 1

        # SECURITY NOTE(review): eval() executes whatever expression is stored
        # in ``command``; that table must never be built from untrusted
        # input. Left as-is because ``command`` is defined outside this block.
        new_state = eval(command[action])

        proj = projection(new_state, self.final_state)
        data_inp = [proj, action]
        data_inp = np.reshape(data_inp, (1, 1, len(data_inp)))
        tr_x.append(data_inp)

        # Reward depends on the position inside the episode: the first step
        # scores the projection, intermediate steps score the state, and the
        # last allowed step gets a fixed +1/-1 for reaching the goal or not.
        if t == 0:
            rw = reward(proj, 0)
            drs.append(rw)
            reward_sum += rw
        elif t < max_steps - 1:
            rw = reward(new_state, self.final_state)
            drs.append(rw)
            print("present reward: ", rw)
            reward_sum += rw
        elif t == max_steps - 1:
            if not np.allclose(new_state, self.final_state):
                rw = -1
            else:
                rw = 1
            drs.append(rw)
            reward_sum += rw
        print("reward till now: ", reward_sum)

        # Keeps only the model output of the action that was taken.
        dlogps.append(np.array(y).astype('float32') * probs)
        print("dlogps before time step: ", len(dlogps))
        print("time step: ", t)
        del (probs, action)
        t += 1

        if t == max_steps or np.allclose(new_state, self.final_state):
            # Done state: close the episode.
            ep_number += 1
            ep_x = np.vstack(tr_x)  # recorded model inputs
            ep_dlogp = np.vstack(dlogps)
            ep_reward = np.vstack(drs)

            disc_rw = discounted_reward(ep_reward, self.gamma)
            disc_rw = disc_rw.astype('float32')
            disc_rw -= np.mean(disc_rw)
            # Equal returns (e.g. a one-step episode) give a zero standard
            # deviation; dividing would fill the targets with NaN/inf, so the
            # scaling is skipped in that case.
            spread = np.std(disc_rw)
            if spread > 0:
                disc_rw /= spread

            tr_y_len = len(ep_dlogp)
            ep_dlogp *= disc_rw

            if ep_number % batch_size == 0:
                input_tr_y = prob_actions - self.learning_rate * ep_dlogp
                # Last axis is the number of actions (was hard-coded to 6).
                input_tr_y = np.reshape(
                    input_tr_y, (tr_y_len, 1, self.action_size))
                self.model.train_on_batch(ep_x, input_tr_y)
                # NOTE(review): the buffers are cleared only after a model
                # update, so they span all episodes since the previous one.
                # The original nesting of this line was lost -- confirm.
                tr_x, dlogps, drs, states, prob_actions, reward_data = [], [], [], [], [], []

            # Start the next episode from a fresh environment.
            env = Environment()
            new_state = env.reset()
            # Was ``projection(state, ...)``: ``state`` is never bound in this
            # method, the freshly reset state is what has to be projected.
            proj = projection(new_state, self.final_state)
            # 5 is the action slot fed to the model at the start of an
            # episode. NOTE(review): presumably a "no previous action"
            # placeholder -- confirm.
            data_inp = [proj, 5]
            data_inp = np.reshape(data_inp, (1, 1, len(data_inp)))
            print("State after resetting: ", new_state)
            t = 0