Skip to content

Instantly share code, notes, and snippets.

@evanthebouncy
Last active April 22, 2026 07:57
Show Gist options
  • Select an option

  • Save evanthebouncy/22e115c660bb90bd5ef0a0a8db72c07d to your computer and use it in GitHub Desktop.

Select an option

Save evanthebouncy/22e115c660bb90bd5ef0a0a8db72c07d to your computer and use it in GitHub Desktop.
q-learning html
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Q-Learning GridWorld</title>
<style>
* { box-sizing: border-box; margin: 0; padding: 0; }
body {
font-family: 'Segoe UI', Arial, sans-serif;
background: #12192b;
color: #e0e0e0;
min-height: 100vh;
display: flex;
flex-direction: column;
align-items: center;
padding: 28px 16px 32px;
gap: 14px;
}
h1 { color: #7ec8f8; font-size: 22px; font-weight: 700; letter-spacing: 0.5px; }
.subtitle { font-size: 13px; color: #8899aa; }
canvas {
border-radius: 8px;
box-shadow: 0 6px 32px rgba(0,0,0,0.7);
}
.controls {
display: flex;
gap: 8px;
flex-wrap: wrap;
justify-content: center;
align-items: center;
}
button {
padding: 8px 18px;
border: 1px solid #334;
border-radius: 6px;
background: #1c2e4a;
color: #b8d4f0;
font-size: 13px;
font-weight: 500;
cursor: pointer;
transition: background 0.15s, border-color 0.15s;
white-space: nowrap;
}
button:hover { background: #263d60; border-color: #4a6a9a; }
button.playing { background: #1e4228; border-color: #4a8a5a; color: #9fdf9f; }
button.reset { background: #2e1c1c; border-color: #6a3434; color: #f0a0a0; }
button.reset:hover { background: #3e2424; }
.row {
display: flex;
align-items: center;
gap: 16px;
font-size: 13px;
color: #8899aa;
flex-wrap: wrap;
justify-content: center;
}
.stat { color: #7ec8f8; font-weight: 700; font-size: 17px; }
.param { color: #aaccee; font-size: 13px; }
input[type=range] { width: 110px; accent-color: #5a9ad5; }
.legend {
display: flex;
gap: 14px;
font-size: 11px;
color: #778;
align-items: center;
flex-wrap: wrap;
justify-content: center;
}
.sw {
width: 13px; height: 13px;
display: inline-block;
border: 1px solid #445;
vertical-align: middle;
margin-right: 3px;
border-radius: 2px;
}
.speed-row { display: flex; align-items: center; gap: 8px; font-size: 12px; color: #778; }
</style>
</head>
<body>
<h1>Q-Learning: GridWorld</h1>
<div class="subtitle">Trajectory-based Q-learning &nbsp;|&nbsp; reward on entering state</div>
<div class="row">
<span>Episodes: <span class="stat" id="epCount">0</span></span>
<span class="param">
α: <span id="alphaVal">0.10</span>
<input type="range" id="alphaSlider" min="0" max="1" step="0.05" value="0.1">
</span>
<span class="param">
γ: <span id="gammaVal">0.90</span>
<input type="range" id="gammaSlider" min="0" max="1" step="0.05" value="0.9">
</span>
<span class="param">
ε: <span id="epsVal">0.10</span>
<input type="range" id="epsSlider" min="0" max="1" step="0.05" value="0.1">
</span>
</div>
<div class="row">
<span class="param">
p(intended): <span id="pIntVal">0.80</span>
<input type="range" id="pIntSlider" min="0" max="1" step="0.05" value="0.8">
</span>
<span style="font-size:12px;color:#667">→ noise split equally: <span id="pNoise">0.10</span> each side</span>
<span class="speed-row">Speed:
<input type="range" id="speedSlider" min="1" max="200" step="1" value="1">
<span id="speedVal">1 ep/frame</span>
</span>
</div>
<canvas id="canvas"></canvas>
<div class="controls">
<button onclick="doStep(1)">+1 Episode</button>
<button onclick="doStep(10)">+10</button>
<button onclick="doStep(100)">+100</button>
<button onclick="doStep(1000)">+1 000</button>
<button onclick="doStep(10000)">+10 000</button>
<button id="playBtn" onclick="togglePlay()">▶ Play</button>
<button class="reset" onclick="resetSim()">↺ Reset</button>
</div>
<div class="legend">
<span><span class="sw" style="background:#88dd88"></span>Positive Q (green → +1)</span>
<span><span class="sw" style="background:#dd8888"></span>Negative Q (red → −1)</span>
<span><span class="sw" style="background:#fff"></span>Zero</span>
<span><span class="sw" style="background:#b0b0b0"></span>Wall</span>
<span>▲ = greedy action per cell</span>
</div>
<script>
// ── Environment ──────────────────────────────────────────────
// Grid size. Cells are addressed as (column, row), both counted from 1.
const NCOLS = 4;
const NROWS = 3;
// Learning rate and discount factor used by the Q update in runEpisode.
let ALPHA = 0.1;
let GAMMA = 0.9;
// Probability that chooseAction picks a uniformly random action.
let epsilon = 0.1;
// Probability that a move goes in the intended direction.
let pIntended = 0.8;
// Number of episodes run since the last reset.
let epCount = 0;
// True only for cell (2, 2), the single wall cell of the grid.
function isWall(c, r) {
  if (c !== 2) return false;
  return r === 2;
}
// True for the two episode-ending cells, both in column 4: rows 2 and 3.
function isTerminal(c, r) {
  if (c !== 4) return false;
  return r === 2 || r === 3;
}
// Reward received on entering cell (c, r): +1 at (4, 3), -1 at (4, 2), otherwise 0.
function stateReward(c, r) {
  if (c !== 4) return 0;
  switch (r) {
    case 3:
      return 1;
    case 2:
      return -1;
    default:
      return 0;
  }
}
// Action encoding: 0 = up (row + 1), 1 = down (row - 1),
// 2 = left (col - 1), 3 = right (col + 1).
// DC[a] is the column delta of action a, DR[a] its row delta.
const DC = [
  0, // up
  0, // down
  -1, // left
  1, // right
];
const DR = [
  1, // up
  -1, // down
  0, // left
  0, // right
];
// PERP[a] holds the two actions perpendicular to a, as [left-perp, right-perp].
const PERP = [
  [2, 3], // up:    left, right
  [3, 2], // down:  right, left
  [1, 0], // left:  down, up
  [0, 1], // right: up, down
];
// Deterministically move one cell from (c, r) in direction a.
// Returns the new [col, row]; the agent stays at [c, r] when the target cell
// lies outside the grid or is the wall.
function applyAction(c, r, a) {
  const targetCol = c + DC[a];
  const targetRow = r + DR[a];
  const outOfBounds =
    targetCol < 1 || targetCol > NCOLS || targetRow < 1 || targetRow > NROWS;
  const blocked = outOfBounds || isWall(targetCol, targetRow);
  return blocked ? [c, r] : [targetCol, targetRow];
}
// Stochastic environment step from (c, r) with intended action a.
// The intended action is executed with probability pIntended; otherwise one of
// the two perpendicular actions is executed, each with (1 - pIntended) / 2.
// Returns [newCol, newRow, reward], where reward is that of the entered cell.
function transition(c, r, a) {
  const roll = Math.random();
  const sideProbability = (1 - pIntended) / 2;
  let executed = a;
  if (!(roll < pIntended)) {
    // Slipped: the lower part of the remaining mass maps to the first
    // perpendicular, the rest to the second.
    const slipsFirst = roll < pIntended + sideProbability;
    executed = PERP[a][slipsFirst ? 0 : 1];
  }
  const [nextCol, nextRow] = applyAction(c, r, executed);
  return [nextCol, nextRow, stateReward(nextCol, nextRow)];
}
// ── Q Table ───────────────────────────────────────────────────
// Q[c][r] is a Float32Array of four action values, ordered [up, down, left, right].
let Q;
// (Re)build Q with a zero-filled entry for every cell of the grid.
function initQ() {
  const table = {};
  for (let col = 1; col <= NCOLS; col++) {
    const column = {};
    for (let row = 1; row <= NROWS; row++) {
      column[row] = new Float32Array(4);
    }
    table[col] = column;
  }
  Q = table;
}
// Largest action value stored for cell (c, r).
// Wall and terminal cells always yield 0, without consulting Q.
function maxQval(c, r) {
  const hasNoValue = isWall(c, r) || isTerminal(c, r);
  if (hasNoValue) return 0;
  const [up, down, left, right] = Q[c][r];
  return Math.max(up, down, left, right);
}
// Index of the highest-valued action in cell (c, r).
// Ties go to the lowest action index, so an all-equal cell yields 0 (up).
function greedyAction(c, r) {
  const values = Q[c][r];
  let bestAction = 0;
  let bestValue = values[0];
  for (let action = 1; action < 4; action++) {
    if (values[action] > bestValue) {
      bestValue = values[action];
      bestAction = action;
    }
  }
  return bestAction;
}
// Epsilon-greedy action selection for cell (c, r): with probability epsilon
// return a uniformly random action in 0..3, otherwise the greedy action.
function chooseAction(c, r) {
  const explore = Math.random() < epsilon;
  return explore ? Math.floor(Math.random() * 4) : greedyAction(c, r);
}
// Starting cell of an episode as [col, row].
// Despite the name this is fixed: every episode begins at the bottom-left cell.
function randomStart() {
  const startCol = 1;
  const startRow = 1;
  return [startCol, startRow];
}
// ── Trajectory storage (last episode, for overlay) ───────────
// Steps of the most recent episode, kept for the overlay: [{c,r,nc,nr,rew}]
let lastTraj = [];
// ── Run One Episode (collect trajectory, then update) ────────
// Rolls out one episode with the current Q (Q is not modified during the
// rollout), stores it in lastTraj, then applies the Q-learning update to each
// recorded step in the order the steps were taken, and bumps epCount.
function runEpisode() {
  const maxSteps = 500; // hard cap so an episode always ends
  const steps = [];
  let [col, row] = randomStart();
  // (1) collect trajectory
  while (!isTerminal(col, row) && steps.length < maxSteps) {
    const a = chooseAction(col, row);
    const [nc, nr, rew] = transition(col, row, a);
    steps.push({ c: col, r: row, a, nc, nr, rew });
    col = nc;
    row = nr;
  }
  // save for overlay (each step carries its arrival cell, so the terminal
  // arrival is included); the chosen action is not part of the overlay record
  lastTraj = steps.map(({ c, r, nc, nr, rew }) => ({ c, r, nc, nr, rew }));
  // (2) update along trajectory
  for (const { c, r, a, nc, nr, rew } of steps) {
    const futQ = isTerminal(nc, nr) ? 0 : maxQval(nc, nr);
    const cellValues = Q[c][r];
    cellValues[a] = (1 - ALPHA) * cellValues[a] + ALPHA * (rew + GAMMA * futQ);
  }
  epCount++;
}
// ── Canvas Setup ──────────────────────────────────────────────
const canvas = document.getElementById('canvas');
const ctx = canvas.getContext('2d');
// Side length of one grid cell, in pixels.
const CELL = 100;
// Padding around the grid in pixels: left, top, bottom, right.
const PL = 42;
const PT = 22;
const PB = 42;
const PR = 22;
// Size the canvas to fit the grid plus its padding.
canvas.width = PL + NCOLS * CELL + PR;
canvas.height = PT + NROWS * CELL + PB;
// Canvas pixel position [x, y] of the top-left corner of cell (c, r).
// Row 1 is drawn at the bottom, so higher rows map to smaller y.
function cellXY(c, r) {
  const left = PL + (c - 1) * CELL;
  const top = PT + (NROWS - r) * CELL;
  return [left, top];
}
// CSS colour for Q value v, normalised by scale.
// v / scale is clamped to [-1, 1]; positive values fade from white towards
// green, negative ones from white towards red. A scale below 1e-9 yields white.
function qColor(v, scale) {
  if (scale < 1e-9) return '#ffffff';
  const t = Math.max(-1, Math.min(1, v / scale));
  const isNonNegative = t >= 0;
  const k = Math.round((isNonNegative ? t : -t) * 130);
  const faded = 255 - k;
  return isNonNegative
    ? `rgb(${faded},255,${faded})`
    : `rgb(255,${faded},${faded})`;
}
// Largest absolute Q value over all cells that are neither wall nor terminal,
// never less than 1e-9. render uses it to normalise the cell colours.
function globalScale() {
  let largest = 1e-9;
  for (let c = 1; c <= NCOLS; c++) {
    for (let r = 1; r <= NROWS; r++) {
      if (isWall(c, r) || isTerminal(c, r)) continue;
      for (const value of Q[c][r]) {
        const magnitude = Math.abs(value);
        if (magnitude > largest) largest = magnitude;
      }
    }
  }
  return largest;
}
// Arrow pointing in canvas direction for action a
// Unit direction of each action in canvas coordinates (canvas y grows
// downwards, so action 0 = up has dy = -1).
const ARROW_CDX = [0, 0, -1, 1];
const ARROW_CDY = [-1, 1, 0, 0];
// Draw a small dark arrow centred near (cx, cy), pointing in the canvas
// direction of `action`. The shaft runs from half * 0.5 behind the centre to
// `half` ahead of it, and a filled triangular head sits on the tip.
function drawArrow(cx, cy, action, half) {
  const ux = ARROW_CDX[action];
  const uy = ARROW_CDY[action];
  const tailX = cx - ux * half * 0.5;
  const tailY = cy - uy * half * 0.5;
  const tipX = cx + ux * half;
  const tipY = cy + uy * half;
  const ink = 'rgba(20,20,20,0.85)';

  ctx.save();

  // Shaft
  ctx.strokeStyle = ink;
  ctx.lineWidth = 2.2;
  ctx.lineCap = 'round';
  ctx.beginPath();
  ctx.moveTo(tailX, tailY);
  ctx.lineTo(tipX, tipY);
  ctx.stroke();

  // Head: two edges of length 9, swept 0.45 rad either side of the heading.
  const heading = Math.atan2(uy, ux);
  ctx.fillStyle = ink;
  ctx.beginPath();
  ctx.moveTo(tipX, tipY);
  ctx.lineTo(
    tipX - 9 * Math.cos(heading - 0.45),
    tipY - 9 * Math.sin(heading - 0.45),
  );
  ctx.lineTo(
    tipX - 9 * Math.cos(heading + 0.45),
    tipY - 9 * Math.sin(heading + 0.45),
  );
  ctx.closePath();
  ctx.fill();

  ctx.restore();
}
// Paint the wall cell whose top-left corner is (x, y): grey fill, diagonal
// hatching confined to the cell, then a dark border on top.
function drawWallCell(x, y, W, H) {
  ctx.fillStyle = '#b4b4b4';
  ctx.fillRect(x, y, W, H);
  // Hatching. clip() intersects the clip region with the *current path*, and
  // fillRect/strokeRect never touch that path, so the cell rectangle has to be
  // built explicitly. Without it the clip region is whatever shape was traced
  // last (a triangle or arrowhead of another cell) and the hatching vanishes.
  ctx.save();
  ctx.beginPath();
  ctx.rect(x, y, W, H);
  ctx.clip();
  ctx.strokeStyle = '#999';
  ctx.lineWidth = 1;
  // The diagonals start left of the cell and run past its right edge; the clip
  // trims them to the cell.
  for (let d = -W; d < W * 2; d += 14) {
    ctx.beginPath();
    ctx.moveTo(x + d, y);
    ctx.lineTo(x + d + H, y + H);
    ctx.stroke();
  }
  ctx.restore(); // drops the clip region again
  ctx.strokeStyle = '#777';
  ctx.lineWidth = 2;
  ctx.strokeRect(x, y, W, H);
}

// Paint terminal cell (c, r) at (x, y): tinted green for a positive reward,
// red otherwise, with the signed reward printed in the centre.
function drawTerminalCell(c, r, x, y, W, H) {
  const rew = stateReward(c, r);
  const positive = rew > 0;
  ctx.fillStyle = positive ? '#b6f0b6' : '#f0b6b6';
  ctx.fillRect(x, y, W, H);
  ctx.strokeStyle = positive ? '#2a8a2a' : '#8a2a2a';
  ctx.lineWidth = 3;
  ctx.strokeRect(x + 1, y + 1, W - 2, H - 2);
  ctx.fillStyle = positive ? '#1a6a1a' : '#6a1a1a';
  ctx.font = 'bold 30px Arial';
  ctx.textAlign = 'center';
  ctx.textBaseline = 'middle';
  ctx.fillText((positive ? '+' : '') + rew, x + W / 2, y + H / 2);
}

// Paint ordinary cell (c, r) at (x, y): four triangles meeting at the centre,
// one per action and coloured by that action's Q value relative to `scale`,
// plus the numeric values and, once any value is non-zero, the greedy arrow.
function drawQCell(c, r, x, y, W, H, scale) {
  const cx = x + W / 2;
  const cy = y + H / 2;
  const qv = Q[c][r];
  const tris = [
    { pts: [[x, y], [x + W, y], [cx, cy]], a: 0 }, // top = up
    { pts: [[x, y + H], [x + W, y + H], [cx, cy]], a: 1 }, // bottom = down
    { pts: [[x, y], [x, y + H], [cx, cy]], a: 2 }, // left
    { pts: [[x + W, y], [x + W, y + H], [cx, cy]], a: 3 }, // right
  ];
  for (const { pts, a } of tris) {
    ctx.beginPath();
    ctx.moveTo(pts[0][0], pts[0][1]);
    ctx.lineTo(pts[1][0], pts[1][1]);
    ctx.lineTo(pts[2][0], pts[2][1]);
    ctx.closePath();
    ctx.fillStyle = qColor(qv[a], scale);
    ctx.fill();
    ctx.strokeStyle = '#ccc';
    ctx.lineWidth = 0.6;
    ctx.stroke();
  }
  // Cell border
  ctx.strokeStyle = '#555';
  ctx.lineWidth = 1.8;
  ctx.strokeRect(x, y, W, H);
  // Q value text, one number inside each triangle
  ctx.fillStyle = '#111';
  ctx.font = '11px monospace';
  ctx.textAlign = 'center';
  ctx.textBaseline = 'middle';
  ctx.fillText(qv[0].toFixed(3), cx, y + H * 0.23); // up
  ctx.fillText(qv[1].toFixed(3), cx, y + H * 0.77); // down
  ctx.fillText(qv[2].toFixed(3), x + W * 0.22, cy); // left
  ctx.fillText(qv[3].toFixed(3), x + W * 0.78, cy); // right
  // Greedy action arrow (only when not all zero)
  const allZero = qv[0] === 0 && qv[1] === 0 && qv[2] === 0 && qv[3] === 0;
  if (!allZero) {
    drawArrow(cx, cy, greedyAction(c, r), 14);
  }
}

// Redraw the whole canvas from the current state: background, every grid cell,
// the row/column labels, the overlay of the last trajectory and the episode
// label. Also refreshes the episode counter element in the page.
function render() {
  const scale = globalScale();
  ctx.clearRect(0, 0, canvas.width, canvas.height);
  // Canvas background
  ctx.fillStyle = '#f2f4f6';
  ctx.fillRect(0, 0, canvas.width, canvas.height);
  for (let c = 1; c <= NCOLS; c++) {
    for (let r = 1; r <= NROWS; r++) {
      const [x, y] = cellXY(c, r);
      if (isWall(c, r)) {
        drawWallCell(x, y, CELL, CELL);
      } else if (isTerminal(c, r)) {
        drawTerminalCell(c, r, x, y, CELL, CELL);
      } else {
        drawQCell(c, r, x, y, CELL, CELL, scale);
      }
    }
  }
  // Row labels, left of the grid
  ctx.fillStyle = '#445';
  ctx.font = 'bold 15px Arial';
  ctx.textAlign = 'right';
  ctx.textBaseline = 'middle';
  for (let r = 1; r <= NROWS; r++) {
    const [, y] = cellXY(1, r);
    ctx.fillText(r, PL - 10, y + CELL / 2);
  }
  // Col labels, below the grid
  ctx.textAlign = 'center';
  ctx.textBaseline = 'top';
  for (let c = 1; c <= NCOLS; c++) {
    const [x] = cellXY(c, 1);
    ctx.fillText(c, x + CELL / 2, PT + NROWS * CELL + 10);
  }
  // ── Trajectory overlay ──
  drawTrajectory();
  // Episode label (like "Q0" in the reference image)
  ctx.fillStyle = '#667';
  ctx.font = '14px Arial';
  ctx.textAlign = 'center';
  ctx.textBaseline = 'alphabetic';
  ctx.fillText(`Q${epCount}`, canvas.width / 2, canvas.height - 8);
  document.getElementById('epCount').textContent = epCount.toLocaleString();
}
// Overlay the most recent episode (lastTraj) on the grid: one arrow per move
// that changed cell, a dot on every visited cell, a highlighted dot on the
// final arrival cell, and a step-count label. Draws nothing when lastTraj is
// empty.
function drawTrajectory() {
  if (!lastTraj.length) return;
  const n = lastTraj.length;
  // Hue by outcome: green for a gain, red for a loss, blue when no reward was
  // received. The final step and the arrival dot share this mapping, so an
  // episode that was cut off without reaching a terminal cell is not painted
  // as a loss.
  const rewardHue = (rew) => (rew > 0 ? '120' : rew < 0 ? '0' : '200');
  ctx.save();
  ctx.lineCap = 'round';
  ctx.lineJoin = 'round';
  for (let i = 0; i < n; i++) {
    const { c, r, nc, nr, rew } = lastTraj[i];
    // recent steps fully opaque, older steps 60% — never too dim
    const fade = 0.6 + 0.4 * (i / (n - 1 || 1));
    const [x1, y1] = cellXY(c, r);
    const [x2, y2] = cellXY(nc, nr);
    const cx1 = x1 + CELL / 2, cy1 = y1 + CELL / 2;
    const cx2 = x2 + CELL / 2, cy2 = y2 + CELL / 2;
    const isLast = i === n - 1;
    const hue = isLast ? rewardHue(rew) : '200';
    const lineColor = `hsla(${hue},100%,60%,${fade.toFixed(2)})`;
    const shadowColor = `hsla(${hue},100%,35%,0.7)`;
    // Moves that bumped into a wall or the grid edge leave the agent in place
    // and get no arrow.
    if (c !== nc || r !== nr) {
      // Shift the segment sideways (perpendicular to its direction) so that
      // moves in opposite directions between the same cells do not coincide.
      const ox = (cy2 - cy1) * 0.08, oy = -(cx2 - cx1) * 0.08;
      ctx.shadowColor = shadowColor;
      ctx.shadowBlur = 6;
      ctx.strokeStyle = lineColor;
      ctx.lineWidth = 5;
      ctx.beginPath();
      ctx.moveTo(cx1 + ox, cy1 + oy);
      ctx.lineTo(cx2 + ox, cy2 + oy);
      ctx.stroke();
      const ang = Math.atan2(cy2 - cy1, cx2 - cx1);
      const ax = cx2 + ox, ay = cy2 + oy;
      ctx.fillStyle = lineColor;
      ctx.shadowBlur = 0;
      ctx.beginPath();
      ctx.moveTo(ax, ay);
      ctx.lineTo(ax - 13 * Math.cos(ang - 0.4), ay - 13 * Math.sin(ang - 0.4));
      ctx.lineTo(ax - 13 * Math.cos(ang + 0.4), ay - 13 * Math.sin(ang + 0.4));
      ctx.closePath();
      ctx.fill();
    }
    // Circle at each visited state; the start state gets a larger white one
    ctx.shadowBlur = 0;
    ctx.strokeStyle = `hsla(${hue},100%,30%,${fade.toFixed(2)})`;
    ctx.lineWidth = 2;
    const isStart = i === 0;
    ctx.fillStyle = isStart ? 'rgba(255,255,255,0.95)' : lineColor;
    ctx.beginPath();
    ctx.arc(cx1, cy1, isStart ? 8 : 5, 0, Math.PI * 2);
    ctx.fill();
    ctx.stroke();
  }
  // Arrival dot on the cell the last step ended in
  const last = lastTraj[n - 1];
  const [xf, yf] = cellXY(last.nc, last.nr);
  const endHue = rewardHue(last.rew);
  ctx.fillStyle = `hsla(${endHue},100%,65%,1.0)`;
  ctx.strokeStyle = `hsla(${endHue},100%,25%,1.0)`;
  ctx.lineWidth = 2.5;
  ctx.shadowColor = `hsla(${endHue},100%,50%,0.8)`;
  ctx.shadowBlur = 12;
  ctx.beginPath();
  ctx.arc(xf + CELL / 2, yf + CELL / 2, 8, 0, Math.PI * 2);
  ctx.fill();
  ctx.stroke();
  ctx.shadowBlur = 0;
  ctx.restore();
  // Step count label, centred vertically in the padding strip above the grid.
  // The strip below the grid already carries the column labels at x = PL +
  // CELL / 2, which this text would run across if it were drawn there.
  ctx.fillStyle = 'rgba(100,160,255,0.85)';
  ctx.font = 'bold 12px Arial';
  ctx.textAlign = 'left';
  ctx.textBaseline = 'middle';
  ctx.fillText(`last traj: ${n} step${n !== 1 ? 's' : ''}`, PL, PT / 2);
}
// ── Controls ──────────────────────────────────────────────────
// Button handler: pause any running animation, run n episodes back to back,
// then repaint once.
function doStep(n) {
  stopPlay();
  let remaining = n;
  while (remaining > 0) {
    runEpisode();
    remaining -= 1;
  }
  render();
}
// Button handler: pause the animation, discard everything learned so far
// (Q table, last trajectory, episode counter) and repaint the empty grid.
function resetSim() {
  stopPlay();
  initQ();
  lastTraj = [];
  epCount = 0;
  render();
}
// Animation state: whether the play loop is active, the id of the pending
// animation frame, and how many episodes each frame runs.
let playing = false;
let raf = null;
let epsPerFrame = 1;
// Play/Pause button handler. When already playing it simply stops; otherwise
// it switches the button to its "pause" look and starts a frame loop that runs
// epsPerFrame episodes, repaints, and reschedules itself while still playing.
function togglePlay() {
  if (playing) {
    stopPlay();
    return;
  }
  playing = true;
  const playButton = document.getElementById('playBtn');
  playButton.textContent = '⏸ Pause';
  playButton.classList.add('playing');
  const tick = () => {
    for (let done = 0; done < epsPerFrame; done++) runEpisode();
    render();
    if (playing) raf = requestAnimationFrame(tick);
  };
  raf = requestAnimationFrame(tick);
}
// Stop the play loop: clear the playing flag, cancel the pending animation
// frame, and put the Play button back into its idle look.
function stopPlay() {
  playing = false;
  cancelAnimationFrame(raf);
  const playButton = document.getElementById('playBtn');
  playButton.classList.remove('playing');
  playButton.textContent = '▶ Play';
}
// Connect the range input with id `sliderId` to `apply`, which receives the
// slider's current value (a string) and must update both the parameter and its
// read-out. `apply` runs on every 'input' event and once immediately: a
// browser may restore a slider to a previously chosen position when the page
// is reloaded without firing 'input', and the immediate call keeps the
// parameters and read-outs in step with what the sliders actually show.
function bindSlider(sliderId, apply) {
  const slider = document.getElementById(sliderId);
  const sync = () => apply(slider.value);
  slider.addEventListener('input', sync);
  sync();
}
// Learning rate α
bindSlider('alphaSlider', (raw) => {
  ALPHA = Number.parseFloat(raw);
  document.getElementById('alphaVal').textContent = ALPHA.toFixed(2);
});
// Discount factor γ
bindSlider('gammaSlider', (raw) => {
  GAMMA = Number.parseFloat(raw);
  document.getElementById('gammaVal').textContent = GAMMA.toFixed(2);
});
// Exploration rate ε
bindSlider('epsSlider', (raw) => {
  epsilon = Number.parseFloat(raw);
  document.getElementById('epsVal').textContent = epsilon.toFixed(2);
});
// Probability of moving as intended; the remainder is split over both sides
bindSlider('pIntSlider', (raw) => {
  pIntended = Number.parseFloat(raw);
  document.getElementById('pIntVal').textContent = pIntended.toFixed(2);
  const each = ((1 - pIntended) / 2).toFixed(2);
  document.getElementById('pNoise').textContent = each;
});
// Episodes per animation frame; always parsed as a base-10 integer
bindSlider('speedSlider', (raw) => {
  epsPerFrame = Number.parseInt(raw, 10);
  document.getElementById('speedVal').textContent = `${epsPerFrame} ep/frame`;
});
// ── Init ──────────────────────────────────────────────────────
// Build the zero-filled Q table first (render reads Q), then paint the
// initial, untrained grid.
initQ();
render();
</script>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment