Forked from fergusmcdonald/reinforcement-learning-policy-gradient-reinforce-algorithm.js
Created
March 12, 2019 02:08
-
-
Save pebsconsulting/a1bcb1d16317d7d24098d3ebcb149378 to your computer and use it in GitHub Desktop.
Reinforcement Learning - Policy Gradient - REINFORCE algorithm
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Reinforcement Learning
 * Policy Gradient
 * REINFORCE Algorithm
 *
 * Fergus McDonald
 *
 * Environment: 2D Grid
 * Actions: Up, Right, Down & Left
 *
 * The gradient of the softmax policy [ denoted: ∇θ log πθ(s,a) ] for a feature's corresponding weight at a time step is
 * the value of the feature for the action taken minus the average value of the feature over all available actions.
 *
 * Note: The features used here can only differentiate the best action when the agent is beside a wall.
 *       The policy will only learn best actions for wall states if the corners along that wall are differentiable.
 *       Place rewards in the corners to see the policy improve to find the best action along each wall.
 *
 * Note: A version that differentiates all states and actions is here: https://gist.github.com/fergusmcdonald/74e2446200911256d2382f169a136e3f
 *
 */
// Softmax function - maps an array of raw scores to a probability
// distribution (positive values summing to 1).
// Fixes over the original: the max score is subtracted before
// exponentiating so large scores cannot overflow to Infinity/NaN,
// each exponential is computed once instead of twice, and the
// reduce carries an initial value so an empty input returns []
// instead of throwing.
const softmax = input => {
  const maxScore = Math.max(...input)
  const exponentials = input.map(value => Math.exp(value - maxScore))
  const sumOfExponentials = exponentials.reduce((a, b) => a + b, 0)
  return exponentials.map(value => value / sumOfExponentials)
}
// Setup: Learning parameters
const learningRate = 0.000025 // gradient-ascent step size for weight updates
const discountFactor = 0.99 // per-step discounting applied when computing returns
const initialActionWeight = 0.1 // starting value for every policy weight
// Setup: 2D array of grid rewards, indexed gridRewards[x][y].
// Non-zero cells (+1 / -1, placed in the corners) are terminal states.
const gridRewards = [
[1,0,0,0,-1],
[0,0,0,0,0],
[0,0,0,0,0],
[0,0,0,0,0],
[-1,0,0,0,1]
]
// Setup: Agent = array containing x and y positions
// Agent always starts in the middle of the grid
const agent = [
2, // agent[0] = agent's x position in the grid
2 // agent[1] = agent's y position in the grid
]
// Setup: Actions
// actions[x][y] lists the action codes that are legal in state (x, y):
// 0 = Up, 1 = Right, 2 = Down, 3 = Left.
// A move is omitted whenever it would carry the agent off the 5x5 grid.
const actions = Array.from({ length: 5 }, (_, x) =>
  Array.from({ length: 5 }, (_, y) => {
    const legalMoves = []
    if (y > 0) legalMoves.push(0) // Up is legal except on the top row
    if (x < 4) legalMoves.push(1) // Right is legal except on the right column
    if (y < 4) legalMoves.push(2) // Down is legal except on the bottom row
    if (x > 0) legalMoves.push(3) // Left is legal except on the left column
    return legalMoves
  })
)
// Setup: Features
// One weight per feature, all starting at initialActionWeight.
// The 13 slots follow the ordering produced by getStateActionFeatures:
//  0-2:  Move Up    & Wall Right / Down / Left
//  3-5:  Move Right & Wall Up / Down / Left
//  6-8:  Move Down  & Wall Up / Right / Left
//  9-11: Move Left  & Wall Up / Right / Down
//  12:   Bias
const NUMBER_OF_FEATURES = 13
const weights = Array.from({ length: NUMBER_OF_FEATURES }, () => initialActionWeight)
// Setup: Score Function
// Linear score w · φ(s, a): the dot product of the weight vector with the
// binary feature vector for state (x, y) and the given action.
// Only active (value 1) features contribute their weight to the score.
const scoreFunction = (x, y, action) =>
  getStateActionFeatures(x, y, action)
    .reduce((total, feature, index) => total + weights[index] * feature, 0)
// Util: Feature vector φ(s, a) for a state-action pair.
// Twelve binary wall-adjacency indicators — 1 when the given move is taken
// while the agent stands against the given wall, 0 otherwise — followed by
// a constant bias feature that is always 1.
const getStateActionFeatures = (x, y, action) => {
  const wallUp = y === 0
  const wallRight = x === 4
  const wallDown = y === 4
  const wallLeft = x === 0
  const indicators = [
    action === 0 && wallRight, // Move Up & Wall Right
    action === 0 && wallDown,  // Move Up & Wall Down
    action === 0 && wallLeft,  // Move Up & Wall Left
    action === 1 && wallUp,    // Move Right & Wall Up
    action === 1 && wallDown,  // Move Right & Wall Down
    action === 1 && wallLeft,  // Move Right & Wall Left
    action === 2 && wallUp,    // Move Down & Wall Up
    action === 2 && wallRight, // Move Down & Wall Right
    action === 2 && wallLeft,  // Move Down & Wall Left
    action === 3 && wallUp,    // Move Left & Wall Up
    action === 3 && wallRight, // Move Left & Wall Right
    action === 3 && wallDown   // Move Left & Wall Down
  ]
  return [...indicators.map(active => (active ? 1 : 0)), 1]
}
// Generate: Run one episode — the agent walks the grid from the centre
// until it reaches a state with a non-zero reward.
// Returns { states, actions, rewards }; states holds one more entry than
// actions/rewards because it includes the start state.
const generateEpisode = () => {
  // Initialise: agent always starts in the middle of the grid
  agent[0] = 2
  agent[1] = 2
  const episodeStates = [[agent[0], agent[1]]]
  const episodeRewards = []
  const episodeActions = []
  let finished = false
  // While agent has not reached an end state (reward = non-zero)
  while (!finished) {
    const x = agent[0]
    const y = agent[1]
    // Softmax over the scores of the actions legal in this state
    const availableActions = actions[x][y]
    const availableActionsScores = availableActions.map(action => scoreFunction(x, y, action))
    const stateActionProbabilities = softmax(availableActionsScores)
    // Sample an action from the softmax distribution via cumulative sums.
    // Bug fix: the original defaulted to action 0 (Up) when floating-point
    // rounding left the cumulative probabilities just below Math.random(),
    // which could take an illegal Up on the top row and step off the grid.
    // Defaulting to the LAST available action keeps the move always legal.
    let selectedAction = availableActions[availableActions.length - 1]
    const actionSelector = Math.random()
    let cumulativeProbability = 0
    for (let i = 0; i < availableActions.length; i++) {
      cumulativeProbability += stateActionProbabilities[i]
      if (cumulativeProbability > actionSelector) {
        selectedAction = availableActions[i]
        break
      }
    }
    // Perform selected action
    if (selectedAction === 0) { // Up
      agent[1]--
    } else if (selectedAction === 1) { // Right
      agent[0]++
    } else if (selectedAction === 2) { // Down
      agent[1]++
    } else if (selectedAction === 3) { // Left
      agent[0]--
    }
    // Record action, reward & resulting state
    episodeActions.push(selectedAction)
    episodeRewards.push(gridRewards[agent[0]][agent[1]])
    episodeStates.push([agent[0], agent[1]])
    // A positive or negative reward marks the episode as finished
    if (gridRewards[agent[0]][agent[1]] !== 0) {
      finished = true
    }
  }
  // Return the finished episode
  return {
    states: episodeStates,
    actions: episodeActions,
    rewards: episodeRewards
  }
}
// Learn - Process a finished episode: REINFORCE weight update.
// For every time step t, each weight moves by
//   learningRate * G_t * gradient
// where G_t is the discounted return from step t onwards and the gradient
// approximates ∇θ log πθ(s,a): the feature value for the taken action minus
// the mean feature value over all actions available in that state.
const processEpisode = episode => {
  // For every time step (using actions as the count)
  for (let i = 0; i < episode.actions.length; i++) {
    // Agent position and decision at this step
    const [x, y] = episode.states[i]
    const availableActions = actions[x][y]
    const selectedAction = episode.actions[i]
    // Discounted return G from this step onwards.
    // reduce without an initial value leaves the step-i reward undiscounted
    // and applies discount^1, discount^2, ... to the later rewards — i.e.
    // sooner rewards count for more than distant ones.
    let exponentialDiscount = 1
    const G = episode.rewards.slice(i).reduce((a, b) => {
      exponentialDiscount = exponentialDiscount * discountFactor
      return a + (b * exponentialDiscount)
    })
    // Feature vector of the action actually taken — invariant across the
    // weight loop, so computed once (the original recomputed it per weight).
    const takenFeatures = getStateActionFeatures(x, y, selectedAction)
    // Update each weight toward the policy-gradient direction
    for (let w = 0; w < weights.length; w++) {
      // Sum weights[w] * features[w] over every legal action in this state.
      // (Collapses the original's four identical per-action branches.)
      let featureValueForAllActions = 0
      for (const action of availableActions) {
        featureValueForAllActions += weights[w] * getStateActionFeatures(x, y, action)[w]
      }
      // Gradient: value for the taken action minus the average over actions
      const featureValueForTakenAction = weights[w] * takenFeatures[w]
      const gradient = featureValueForTakenAction - (featureValueForAllActions / availableActions.length)
      weights[w] += learningRate * G * gradient
    }
  }
}
// Run: repeatedly generate an episode and learn from it,
// improving the policy weights a little each time.
const numberOfEpisodesToProcess = 10000
let numberOfEpisodesProcessed = 0
while (numberOfEpisodesProcessed < numberOfEpisodesToProcess) {
  processEpisode(generateEpisode())
  numberOfEpisodesProcessed += 1
}
// Print: after training, derive the learned policy's most likely
// action symbol for every state.
const actionSymbols = ['U', 'R', 'D', 'L']
const bestActionTable = [[], [], [], [], []]
for (let x = 0; x < 5; x++) {
  for (let y = 0; y < 5; y++) {
    const availableActions = actions[x][y]
    // Softmax probabilities of the legal actions in this state
    const stateActionProbabilities = softmax(
      availableActions.map(action => scoreFunction(x, y, action))
    )
    // Argmax over the probabilities (first maximum wins on ties)
    let bestIndex = 0
    for (let i = 1; i < stateActionProbabilities.length; i++) {
      if (stateActionProbabilities[i] > stateActionProbabilities[bestIndex]) {
        bestIndex = i
      }
    }
    bestActionTable[x][y] = actionSymbols[availableActions[bestIndex]]
  }
}
// Transpose both grids so the x axis prints horizontally and y vertically.
const gridRewardsPrintable = [[], [], [], [], []]
const bestActionTablePrintable = [[], [], [], [], []]
for (let x = 0; x < 5; x++) {
  for (let y = 0; y < 5; y++) {
    gridRewardsPrintable[y][x] = gridRewards[x][y]
    const reward = gridRewards[x][y]
    if (reward) {
      // Reward states terminate the episode (no action is taken there),
      // so show the reward value itself.
      bestActionTablePrintable[y][x] = reward
    } else if (x > 0 && x < 4 && y > 0 && y < 4) {
      // Interior states are indistinguishable to the wall features — '?'
      bestActionTablePrintable[y][x] = '?'
    } else {
      bestActionTablePrintable[y][x] = bestActionTable[x][y]
    }
  }
}
// Print the rewards and the policy's best action for each state
console.table(gridRewardsPrintable)
console.table(bestActionTablePrintable)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment