Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save pebsconsulting/a1bcb1d16317d7d24098d3ebcb149378 to your computer and use it in GitHub Desktop.

Select an option

Save pebsconsulting/a1bcb1d16317d7d24098d3ebcb149378 to your computer and use it in GitHub Desktop.
Reinforcement Learning - Policy Gradient - REINFORCE algorithm
/*
* Reinforcement Learning
* Policy Gradient
* REINFORCE Algorithm
*
* Fergus McDonald
*
* Environment: 2D Grid
* Actions: Up, Right, Down & Left
*
* The gradient of softmax policy [ denoted as: ∇θlogπθ(s,a) ] for a feature's corresponding weight at a time step is
* the feature's value for the action taken minus the average value of that feature over all available actions
*
* Note: The features in the following code can only differentiate the best action when the agent is beside a wall
* The policy will only learn the best actions for states along walls if the states at the walls' corners are differentiable
* Place rewards in the corners to see the policy improve and find the best action for each wall
*
* Note: Version which differentiates for all states and actions is here: https://gist.github.com/fergusmcdonald/74e2446200911256d2382f169a136e3f
*
*/
// Softmax function - maps input values to probabilities
// Subtracts the maximum input before exponentiating (numerical stability:
// prevents Math.exp overflow to Infinity for large scores, which would
// yield NaN probabilities) and computes each exponential exactly once
// instead of twice per element.
const softmax = input => {
const maxValue = Math.max(...input)
const exponentials = input.map(value => Math.exp(value - maxValue))
const sumOfExponentials = exponentials.reduce((a, b) => a + b, 0)
return exponentials.map(value => value / sumOfExponentials)
}
// Setup: Learning parameters
const learningRate = 0.000025 // step size for each gradient-ascent weight update
const discountFactor = 0.99 // gamma: discounts future rewards when computing the return G
const initialActionWeight = 0.1 // starting value for every feature weight
// Setup: 2D array of grid rewards
// Indexed as gridRewards[x][y]; non-zero cells are terminal states
// (+1 = goal, -1 = penalty) — entering one ends the episode
const gridRewards = [
[1,0,0,0,-1],
[0,0,0,0,0],
[0,0,0,0,0],
[0,0,0,0,0],
[-1,0,0,0,1]
]
// Setup: Agent = array containing x and y positions (mutated in place
// by generateEpisode as the agent moves)
// Agent always starts in the middle of the grid
const agent = [
2, // agent[0] = agent's x position in the grid
2 // agent[1] = agent's y position in the grid
]
// Setup: Actions
// Build, for every grid cell, the list of legal moves — any move that
// would carry the agent off the 5x5 grid is omitted.
// 0 = Up
// 1 = Right
// 2 = Down
// 3 = Left
const actions = []
for (let x = 0; x < 5; x++) {
actions[x] = []
for (let y = 0; y < 5; y++) {
const legalMoves = []
// Up is legal everywhere except the top row
if (y > 0) legalMoves.push(0)
// Right is legal everywhere except the rightmost column
if (x < 4) legalMoves.push(1)
// Down is legal everywhere except the bottom row
if (y < 4) legalMoves.push(2)
// Left is legal everywhere except the leftmost column
if (x > 0) legalMoves.push(3)
actions[x][y] = legalMoves
}
}
// Setup: Features
// One weight per binary feature; the index order here must match the
// feature order produced by getStateActionFeatures — 12 (move, wall)
// combinations plus a bias term shared by every action.
const weights = [
initialActionWeight, // Move Up & Wall Right
initialActionWeight, // Move Up & Wall Down
initialActionWeight, // Move Up & Wall Left
initialActionWeight, // Move Right & Wall Up
initialActionWeight, // Move Right & Wall Down
initialActionWeight, // Move Right & Wall Left
initialActionWeight, // Move Down & Wall Up
initialActionWeight, // Move Down & Wall Right
initialActionWeight, // Move Down & Wall Left
initialActionWeight, // Move Left & Wall Up
initialActionWeight, // Move Left & Wall Right
initialActionWeight, // Move Left & Wall Down
initialActionWeight // Bias
]
// Setup: Score Function
// Returns the linear score for taking `action` in state (x, y): the dot
// product of the weight vector and the binary feature vector. A higher
// score means the policy currently prefers this action.
const scoreFunction = (x, y, action) => {
const features = getStateActionFeatures(x, y, action)
// Only features with value 1 contribute their weight to the score
return features.reduce((total, feature, index) => total + weights[index] * feature, 0)
}
// Util: Get features for a state action pair
// Produces 13 binary features: for each move (Up/Right/Down/Left), three
// indicators marking which wall the state (x, y) touches, plus a constant
// bias feature that is always 1.
const getStateActionFeatures = (x, y, action) => {
// Which walls does state (x, y) touch?
const wallUp = y === 0
const wallRight = x === 4
const wallDown = y === 4
const wallLeft = x === 0
// Convert a boolean condition into a 0/1 feature value
const asBit = condition => condition ? 1 : 0
return [
asBit(action === 0 && wallRight), // Move Up & Wall Right
asBit(action === 0 && wallDown), // Move Up & Wall Down
asBit(action === 0 && wallLeft), // Move Up & Wall Left
asBit(action === 1 && wallUp), // Move Right & Wall Up
asBit(action === 1 && wallDown), // Move Right & Wall Down
asBit(action === 1 && wallLeft), // Move Right & Wall Left
asBit(action === 2 && wallUp), // Move Down & Wall Up
asBit(action === 2 && wallRight), // Move Down & Wall Right
asBit(action === 2 && wallLeft), // Move Down & Wall Left
asBit(action === 3 && wallUp), // Move Left & Wall Up
asBit(action === 3 && wallRight), // Move Left & Wall Right
asBit(action === 3 && wallDown), // Move Left & Wall Down
1 // Bias
]
}
// Generate: Run grid until agent gets a non-zero reward
// Plays one episode: starting from the middle of the grid, repeatedly
// samples an action from the softmax policy and moves the agent until a
// terminal (non-zero reward) cell is entered.
// Returns { states, actions, rewards } describing the whole trajectory.
const generateEpisode = () => {
// Initialise
// Agent always starts in the middle of the grid
agent[0] = 2
agent[1] = 2
// Add initial state to episode states
const episodeStates = [[agent[0], agent[1]]]
const episodeRewards = []
const episodeActions = []
let finished = false
// While agent has not reached an end state (value = non-zero)
while (!finished) {
// Get values for current state, actions
const x = agent[0]
const y = agent[1]
// Get available actions for current state (x, y)
const availableActions = actions[x][y]
const availableActionsScores = availableActions.map(action => scoreFunction(x, y, action))
// Use softmax function to get probabilities of available actions using score values
const stateActionProbabilities = softmax(availableActionsScores)
// Sample an action by walking the cumulative probability distribution.
// FIX: the previous hard-coded chain defaulted to action 0 (Up), which
// can be illegal in the current state (e.g. on the top row) if a
// floating-point shortfall left the cumulative sum below the random
// draw; falling back to the LAST available action keeps the sample
// legal in every state.
let selectedAction = availableActions[availableActions.length - 1]
const actionSelector = Math.random()
let cumulativeProbability = 0
for (let i = 0; i < stateActionProbabilities.length; i++) {
cumulativeProbability += stateActionProbabilities[i]
if (cumulativeProbability > actionSelector) {
selectedAction = availableActions[i]
break
}
}
// Perform selected action
if (selectedAction === 0) { // Up
agent[1]--
} else if (selectedAction === 1) { // Right
agent[0]++
} else if (selectedAction === 2) { // Down
agent[1]++
} else if (selectedAction === 3) { // Left
agent[0]--
}
// Record action, reward & state
episodeActions.push(selectedAction)
episodeRewards.push(gridRewards[agent[0]][agent[1]])
episodeStates.push([agent[0], agent[1]])
// If there was a positive or negative reward mark the episode as finished
if (gridRewards[agent[0]][agent[1]] !== 0) {
finished = true
}
}
// Return the finished episode
return {
states: episodeStates,
actions: episodeActions,
rewards: episodeRewards
}
}
// Learn - Process the finished episode = Improves policy by updating weights
// REINFORCE update: for each visited (state, action) pair, nudge every
// weight in the direction of the softmax-policy gradient scaled by the
// discounted return G from that time step onwards.
const processEpisode = episode => {
// For every time step (using actions as count)
for (let i = 0; i < episode.actions.length; i++) {
// Agent position and the action it took at this step
const x = episode.states[i][0]
const y = episode.states[i][1]
// Get available actions at state
const availableActions = actions[x][y]
// Get selected action
const selectedAction = episode.actions[i]
// G = discounted return from this time step onwards:
// sum over k >= i of rewards[k] * discountFactor^(k - i).
// (Explicit loop replaces a no-initial-value reduce that only worked
// because the slice is never empty here.)
let G = 0
let exponentialDiscount = 1
for (let k = i; k < episode.rewards.length; k++) {
G += episode.rewards[k] * exponentialDiscount
exponentialDiscount *= discountFactor
}
// getStateActionFeatures is a pure function of (x, y, action), so
// compute the feature vectors once per time step instead of once per
// weight index (the original recomputed them inside the w loop and
// duplicated identical code across four if/else branches).
const selectedActionFeatures = getStateActionFeatures(x, y, selectedAction)
const allActionFeatures = availableActions.map(action => getStateActionFeatures(x, y, action))
// The gradient of softmax policy [ ∇θlogπθ(s,a) ] for each weight is
// the feature value for the action taken minus the average feature
// value over all available actions
for (let w = 0; w < weights.length; w++) {
let featureValueForAllActions = 0
for (const features of allActionFeatures) {
featureValueForAllActions += weights[w] * features[w]
}
const featureValueForTakenAction = weights[w] * selectedActionFeatures[w]
const gradient = featureValueForTakenAction - (featureValueForAllActions / availableActions.length)
weights[w] += learningRate * G * gradient
}
}
}
// Run: Generate episodes and process
// Train the policy by repeatedly playing an episode and applying the
// REINFORCE weight update to it
const numberOfEpisodesToProcess = 10000
let numberOfEpisodesProcessed = 0
while (numberOfEpisodesProcessed < numberOfEpisodesToProcess) {
processEpisode(generateEpisode())
numberOfEpisodesProcessed++
}
// Print: derive the learned policy's favourite move for every state
// After training, take the argmax of the softmax action probabilities
// in each state and record its symbol
const actionSymbols = ['U', 'R', 'D', 'L']
const bestActionTable = [[],[],[],[],[]]
for (let x = 0; x < 5; x++) {
for (let y = 0; y < 5; y++) {
const availableActions = actions[x][y]
const probabilities = softmax(availableActions.map(action => scoreFunction(x, y, action)))
// Argmax over probabilities; ties keep the earliest action,
// matching strict > comparison
let bestIndex = 0
for (let i = 1; i < probabilities.length; i++) {
if (probabilities[i] > probabilities[bestIndex]) {
bestIndex = i
}
}
bestActionTable[x][y] = actionSymbols[availableActions[bestIndex]]
}
}
// Do some mapping to print grids in correct orientation for x and y axis
// (console.table prints row-major, so transpose [x][y] -> [y][x])
const gridRewardsPrintable = [[], [], [], [], []]
const bestActionTablePrintable = [[], [], [], [], []]
for (let x = 0; x < 5; x++) {
for (let y = 0; y < 5; y++) {
gridRewardsPrintable[y][x] = gridRewards[x][y]
if (x > 0 && x < 4 && y > 0 && y < 4) {
// Interior states cannot be differentiated by the wall features
bestActionTablePrintable[y][x] = '?'
} else if (gridRewards[x][y]) {
// Terminal states: show the reward itself, since episodes end there
// and no action is ever taken from them
bestActionTablePrintable[y][x] = gridRewards[x][y]
} else {
bestActionTablePrintable[y][x] = bestActionTable[x][y]
}
}
}
// Print the rewards and the policy's best action for each state
console.table(gridRewardsPrintable)
console.table(bestActionTablePrintable)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment