#
# Deep Q-Learning Agent
#
# (c) Dr. Yves J. Hilpisch
# Reinforcement Learning for Finance
#
import os
import random
import warnings
import numpy as np
from tensorflow import keras
from collections import deque
from keras.layers import Dense
from keras.models import Sequential

warnings.simplefilter('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# graph mode speeds up the many small predict()/fit() calls during replay
from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()

opt = keras.optimizers.legacy.Adam


class DQLAgent:
    def __init__(self, symbol, feature, n_features, env, hu=24, lr=0.001):
        self.epsilon = 1.0  # initial exploration rate
        self.epsilon_decay = 0.9975  # multiplicative decay per replay
        self.epsilon_min = 0.1  # floor for the exploration rate
        self.memory = deque(maxlen=2000)  # replay buffer
        self.batch_size = 32
        self.gamma = 0.5  # discount factor
        self.trewards = list()
        self.max_treward = -np.inf
        self.n_features = n_features
        self.env = env
        self.episodes = 0
        self._create_model(hu, lr)

    def _create_model(self, hu, lr):
        # small dense network mapping a state to Q-values for two actions
        self.model = Sequential()
        self.model.add(Dense(hu, activation='relu',
                             input_dim=self.n_features))
        self.model.add(Dense(hu, activation='relu'))
        self.model.add(Dense(2, activation='linear'))
        self.model.compile(loss='mse', optimizer=opt(learning_rate=lr))

    def _reshape(self, state):
        # flatten the state and add a batch dimension for the network
        state = state.flatten()
        return np.reshape(state, [1, len(state)])

    def act(self, state):
        # epsilon-greedy policy: explore with probability epsilon,
        # otherwise act greedily on the predicted Q-values
        if random.random() < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.model.predict(state)[0])

    def replay(self):
        # train on a random mini-batch drawn from the replay buffer
        batch = random.sample(self.memory, self.batch_size)
        for state, action, next_state, reward, done in batch:
            if not done:
                # Q-learning target: r + gamma * max_a' Q(s', a')
                reward += self.gamma * np.amax(
                    self.model.predict(next_state)[0])
            target = self.model.predict(state)
            target[0, action] = reward
            self.model.fit(state, target, epochs=1, verbose=False)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def learn(self, episodes):
        for e in range(1, episodes + 1):
            self.episodes += 1
            state, _ = self.env.reset()
            state = self._reshape(state)
            treward = 0
            for f in range(1, 5000):
                self.f = f
                action = self.act(state)
                next_state, reward, done, trunc, _ = self.env.step(action)
                treward += reward
                next_state = self._reshape(next_state)
                # store the transition for experience replay
                self.memory.append(
                    [state, action, next_state, reward, done])
                state = next_state
                if done:
                    self.trewards.append(treward)
                    self.max_treward = max(self.max_treward, treward)
                    templ = f'episode={self.episodes:4d} | '
                    templ += f'treward={treward:7.3f}'
                    templ += f' | max={self.max_treward:7.3f}'
                    print(templ, end='\r')
                    break
            if len(self.memory) > self.batch_size:
                self.replay()
        print()

    def test(self, episodes, min_accuracy=0.0,
             min_performance=0.0, verbose=True, full=True):
        # temporarily relax the environment's stopping thresholds
        ma = self.env.min_accuracy
        self.env.min_accuracy = min_accuracy
        if hasattr(self.env, 'min_performance'):
            mp = self.env.min_performance
            self.env.min_performance = min_performance
            self.performances = list()
        for e in range(1, episodes + 1):
            state, _ = self.env.reset()
            state = self._reshape(state)
            for f in range(1, 5001):
                # act greedily (no exploration) during testing
                action = np.argmax(self.model.predict(state)[0])
                state, reward, done, trunc, _ = self.env.step(action)
                state = self._reshape(state)
                if done:
                    # f counts steps; with a +1 reward per correct step
                    # it serves as the episode's total reward
                    templ = f'total reward={f:4d} | '
                    templ += f'accuracy={self.env.accuracy:.3f}'
                    if hasattr(self.env, 'min_performance'):
                        self.performances.append(self.env.performance)
                        templ += f' | performance={self.env.performance:.3f}'
                    if verbose:
                        if full:
                            print(templ)
                        else:
                            print(templ, end='\r')
                    break
        # restore the original thresholds
        self.env.min_accuracy = ma
        if hasattr(self.env, 'min_performance'):
            self.env.min_performance = mp
        print()