Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save pebsconsulting/a1bcb1d16317d7d24098d3ebcb149378 to your computer and use it in GitHub Desktop.

Select an option

Save pebsconsulting/a1bcb1d16317d7d24098d3ebcb149378 to your computer and use it in GitHub Desktop.
Reinforcement Learning - Policy Gradient - REINFORCE algorithm
/*
* Reinforcement Learning
* Policy Gradient
* REINFORCE Algorithm
*
* Fergus McDonald
*
* Environment: 2D Grid
* Actions: Up, Right, Down & Left
*
* The gradient of softmax policy [ denoted as: ∇θlogπθ(s,a) ] for a feature's corresponding weight at a time step is
* the feature's value for the action taken minus the average value of that feature over all available actions
*
* Note: The features in the following code can only differentiate the best action when the agent is beside a wall
* The policy will only learn the best actions for states along walls if the states at the walls' corners are differentiable
* Place rewards in the corners to see the policy improve and find the best action for each wall
*
* Note: Version which differentiates for all states and actions is here: https://gist.github.com/fergusmcdonald/74e2446200911256d2382f169a136e3f
*
*/
// Softmax function - maps input values to probabilities
// Subtracts the maximum input before exponentiating (numerical stability:
// prevents Math.exp overflow to Infinity for large scores, which would
// yield NaN probabilities) and computes each exponential exactly once
// instead of twice per element.
const softmax = input => {
const maxValue = Math.max(...input)
const exponentials = input.map(value => Math.exp(value - maxValue))
const sumOfExponentials = exponentials.reduce((a, b) => a + b, 0)
return exponentials.map(value => value / sumOfExponentials)
}
// Setup: Learning parameters
const learningRate = 0.000025 // step size for each gradient-ascent weight update
const discountFactor = 0.99 // gamma: discounts future rewards when computing the return G
const initialActionWeight = 0.1 // starting value for every feature weight
// Setup: 2D array of grid rewards
// Indexed as gridRewards[x][y]; non-zero cells are terminal states
// (+1 = goal, -1 = penalty) — entering one ends the episode
const gridRewards = [
[1,0,0,0,-1],
[0,0,0,0,0],
[0,0,0,0,0],
[0,0,0,0,0],
[-1,0,0,0,1]
]
// Setup: Agent = array containing x and y positions (mutated in place
// by generateEpisode as the agent moves)
// Agent always starts in the middle of the grid
const agent = [
2, // agent[0] = agent's x position in the grid
2 // agent[1] = agent's y position in the grid
]
// Setup: Actions
// Build, for every grid cell, the list of legal moves — any move that
// would carry the agent off the 5x5 grid is omitted.
// 0 = Up
// 1 = Right
// 2 = Down
// 3 = Left
const actions = []
for (let x = 0; x < 5; x++) {
actions[x] = []
for (let y = 0; y < 5; y++) {
const legalMoves = []
// Up is legal everywhere except the top row
if (y > 0) legalMoves.push(0)
// Right is legal everywhere except the rightmost column
if (x < 4) legalMoves.push(1)
// Down is legal everywhere except the bottom row
if (y < 4) legalMoves.push(2)
// Left is legal everywhere except the leftmost column
if (x > 0) legalMoves.push(3)
actions[x][y] = legalMoves
}
}
// Setup: Features
// One weight per binary feature; the index order here must match the
// feature order produced by getStateActionFeatures — 12 (move, wall)
// combinations plus a bias term shared by every action.
const weights = [
initialActionWeight, // Move Up & Wall Right
initialActionWeight, // Move Up & Wall Down
initialActionWeight, // Move Up & Wall Left
initialActionWeight, // Move Right & Wall Up
initialActionWeight, // Move Right & Wall Down
initialActionWeight, // Move Right & Wall Left
initialActionWeight, // Move Down & Wall Up
initialActionWeight, // Move Down & Wall Right
initialActionWeight, // Move Down & Wall Left
initialActionWeight, // Move Left & Wall Up
initialActionWeight, // Move Left & Wall Right
initialActionWeight, // Move Left & Wall Down
initialActionWeight // Bias
]
// Setup: Score Function
// Returns the linear score for taking `action` in state (x, y): the dot
// product of the weight vector and the binary feature vector. A higher
// score means the policy currently prefers this action.
const scoreFunction = (x, y, action) => {
const features = getStateActionFeatures(x, y, action)
// Only features with value 1 contribute their weight to the score
return features.reduce((total, feature, index) => total + weights[index] * feature, 0)
}
// Util: Get features for a state action pair
// Produces 13 binary features: for each move (Up/Right/Down/Left), three
// indicators marking which wall the state (x, y) touches, plus a constant
// bias feature that is always 1.
const getStateActionFeatures = (x, y, action) => {
// Which walls does state (x, y) touch?
const wallUp = y === 0
const wallRight = x === 4
const wallDown = y === 4
const wallLeft = x === 0
// Convert a boolean condition into a 0/1 feature value
const asBit = condition => condition ? 1 : 0
return [
asBit(action === 0 && wallRight), // Move Up & Wall Right
asBit(action === 0 && wallDown), // Move Up & Wall Down
asBit(action === 0 && wallLeft), // Move Up & Wall Left
asBit(action === 1 && wallUp), // Move Right & Wall Up
asBit(action === 1 && wallDown), // Move Right & Wall Down
asBit(action === 1 && wallLeft), // Move Right & Wall Left
asBit(action === 2 && wallUp), // Move Down & Wall Up
asBit(action === 2 && wallRight), // Move Down & Wall Right
asBit(action === 2 && wallLeft), // Move Down & Wall Left
asBit(action === 3 && wallUp), // Move Left & Wall Up
asBit(action === 3 && wallRight), // Move Left & Wall Right
asBit(action === 3 && wallDown), // Move Left & Wall Down
1 // Bias
]
}
// Generate: Run grid until agent gets a non-zero reward
// Plays one episode: starting from the middle of the grid, repeatedly
// samples an action from the softmax policy and moves the agent until a
// terminal (non-zero reward) cell is entered.
// Returns { states, actions, rewards } describing the whole trajectory.
const generateEpisode = () => {
// Initialise
// Agent always starts in the middle of the grid
agent[0] = 2
agent[1] = 2
// Add initial state to episode states
const episodeStates = [[agent[0], agent[1]]]
const episodeRewards = []
const episodeActions = []
let finished = false
// While agent has not reached an end state (value = non-zero)
while (!finished) {
// Get values for current state, actions
const x = agent[0]
const y = agent[1]
// Get available actions for current state (x, y)
const availableActions = actions[x][y]
const availableActionsScores = availableActions.map(action => scoreFunction(x, y, action))
// Use softmax function to get probabilities of available actions using score values
const stateActionProbabilities = softmax(availableActionsScores)
// Sample an action by walking the cumulative probability distribution.
// FIX: the previous hard-coded chain defaulted to action 0 (Up), which
// can be illegal in the current state (e.g. on the top row) if a
// floating-point shortfall left the cumulative sum below the random
// draw; falling back to the LAST available action keeps the sample
// legal in every state.
let selectedAction = availableActions[availableActions.length - 1]
const actionSelector = Math.random()
let cumulativeProbability = 0
for (let i = 0; i < stateActionProbabilities.length; i++) {
cumulativeProbability += stateActionProbabilities[i]
if (cumulativeProbability > actionSelector) {
selectedAction = availableActions[i]
break
}
}
// Perform selected action
if (selectedAction === 0) { // Up
agent[1]--
} else if (selectedAction === 1) { // Right
agent[0]++
} else if (selectedAction === 2) { // Down
agent[1]++
} else if (selectedAction === 3) { // Left
agent[0]--
}
// Record action, reward & state
episodeActions.push(selectedAction)
episodeRewards.push(gridRewards[agent[0]][agent[1]])
episodeStates.push([agent[0], agent[1]])
// If there was a positive or negative reward mark the episode as finished
if (gridRewards[agent[0]][agent[1]] !== 0) {
finished = true
}
}
// Return the finished episode
return {
states: episodeStates,
actions: episodeActions,
rewards: episodeRewards
}
}
// Learn - Process the finished episode = Improves policy by updating weights
// REINFORCE update: for each visited (state, action) pair, nudge every
// weight in the direction of the softmax-policy gradient scaled by the
// discounted return G from that time step onwards.
const processEpisode = episode => {
// For every time step (using actions as count)
for (let i = 0; i < episode.actions.length; i++) {
// Agent position and the action it took at this step
const x = episode.states[i][0]
const y = episode.states[i][1]
// Get available actions at state
const availableActions = actions[x][y]
// Get selected action
const selectedAction = episode.actions[i]
// G = discounted return from this time step onwards:
// sum over k >= i of rewards[k] * discountFactor^(k - i).
// (Explicit loop replaces a no-initial-value reduce that only worked
// because the slice is never empty here.)
let G = 0
let exponentialDiscount = 1
for (let k = i; k < episode.rewards.length; k++) {
G += episode.rewards[k] * exponentialDiscount
exponentialDiscount *= discountFactor
}
// getStateActionFeatures is a pure function of (x, y, action), so
// compute the feature vectors once per time step instead of once per
// weight index (the original recomputed them inside the w loop and
// duplicated identical code across four if/else branches).
const selectedActionFeatures = getStateActionFeatures(x, y, selectedAction)
const allActionFeatures = availableActions.map(action => getStateActionFeatures(x, y, action))
// The gradient of softmax policy [ ∇θlogπθ(s,a) ] for each weight is
// the feature value for the action taken minus the average feature
// value over all available actions
for (let w = 0; w < weights.length; w++) {
let featureValueForAllActions = 0
for (const features of allActionFeatures) {
featureValueForAllActions += weights[w] * features[w]
}
const featureValueForTakenAction = weights[w] * selectedActionFeatures[w]
const gradient = featureValueForTakenAction - (featureValueForAllActions / availableActions.length)
weights[w] += learningRate * G * gradient
}
}
}
// Run: Generate episodes and process
// Train the policy by repeatedly playing an episode and applying the
// REINFORCE weight update to it
const numberOfEpisodesToProcess = 10000
let numberOfEpisodesProcessed = 0
while (numberOfEpisodesProcessed < numberOfEpisodesToProcess) {
processEpisode(generateEpisode())
numberOfEpisodesProcessed++
}
// Print: derive the learned policy's favourite move for every state
// After training, take the argmax of the softmax action probabilities
// in each state and record its symbol
const actionSymbols = ['U', 'R', 'D', 'L']
const bestActionTable = [[],[],[],[],[]]
for (let x = 0; x < 5; x++) {
for (let y = 0; y < 5; y++) {
const availableActions = actions[x][y]
const probabilities = softmax(availableActions.map(action => scoreFunction(x, y, action)))
// Argmax over probabilities; ties keep the earliest action,
// matching strict > comparison
let bestIndex = 0
for (let i = 1; i < probabilities.length; i++) {
if (probabilities[i] > probabilities[bestIndex]) {
bestIndex = i
}
}
bestActionTable[x][y] = actionSymbols[availableActions[bestIndex]]
}
}
// Do some mapping to print grids in correct orientation for x and y axis
// (console.table prints row-major, so transpose [x][y] -> [y][x])
const gridRewardsPrintable = [[], [], [], [], []]
const bestActionTablePrintable = [[], [], [], [], []]
for (let x = 0; x < 5; x++) {
for (let y = 0; y < 5; y++) {
gridRewardsPrintable[y][x] = gridRewards[x][y]
if (x > 0 && x < 4 && y > 0 && y < 4) {
// Interior states cannot be differentiated by the wall features
bestActionTablePrintable[y][x] = '?'
} else if (gridRewards[x][y]) {
// Terminal states: show the reward itself, since episodes end there
// and no action is ever taken from them
bestActionTablePrintable[y][x] = gridRewards[x][y]
} else {
bestActionTablePrintable[y][x] = bestActionTable[x][y]
}
}
}
// Print the rewards and the policy's best action for each state
console.table(gridRewardsPrintable)
console.table(bestActionTablePrintable)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment