@timzhang642
Forked from karpathy/min-char-rnn.py
Last active October 19, 2016 05:31
Minimal character-level language model with a Vanilla Recurrent Neural Network, in Python/numpy
"""
Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy)
BSD License
"""
import numpy as np
# data I/O
data = open('input.txt', 'r').read() # should be a simple plain text file
chars = list(set(data)) # find all unique characters
data_size, vocab_size = len(data), len(chars)
print 'data has %d characters, %d unique.' % (data_size, vocab_size)
char_to_ix = { ch:i for i,ch in enumerate(chars) } # dictionary mapping characters to indices
ix_to_char = { i:ch for i,ch in enumerate(chars) } # dictionary mapping indices to characters
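# illustrative sanity check: the two tables are inverses of each other, so mapping a
# character to its index and back recovers the same character
assert all(ix_to_char[char_to_ix[ch]] == ch for ch in chars)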
# hyperparameters
hidden_size = 100 # size of hidden layer of neurons
seq_length = 25 # number of steps to unroll the RNN for, so each training batch covers 25 consecutive characters
learning_rate = 1e-1
# model parameters
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias
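# together these parameters define the vanilla RNN used below:
#   h[t] = tanh(Wxh.x[t] + Whh.h[t-1] + bh)  -- hidden state update
#   y[t] = Why.h[t] + by                     -- unnormalized scores for the next char
#   p[t] = softmax(y[t])                     -- probability distribution over the next char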
def lossFun(inputs, targets, hprev):
"""
inputs,targets are both list of integers.
hprev is Hx1 array of initial hidden state
returns the loss, gradients on model parameters, and last hidden state
the following are dictionaries to record at each time step
xs is a dic of the RNN input vector
hs is a dic of the hidden state
ys is a dic of the unnormalized log probabilities for next char
ps is a dic of normalized probabilities for next char
"""
  xs, hs, ys, ps = {}, {}, {}, {}
  hs[-1] = np.copy(hprev)
  loss = 0
  # forward pass
  for t in xrange(len(inputs)): # t is the time step, indicating the index of the char in the current batch
    xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
    xs[t][inputs[t]] = 1 # one-hot representation
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
    ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars
    ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars
    loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)
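    # note: np.exp(ys[t]) can overflow for large scores; a standard, mathematically
    # equivalent fix (not used in this gist) is to subtract np.max(ys[t]) before exponentiating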
  # backward pass: compute gradients going backwards
  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why) # initialize gradients as zeros
  dbh, dby = np.zeros_like(bh), np.zeros_like(by)
  dhnext = np.zeros_like(hs[0])
  for t in reversed(xrange(len(inputs))): # walk back in time: t goes from 24 down to 0
    dy = np.copy(ps[t]) # start from the predicted probabilities; dy will become the gradient on the scores ys[t]
    # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
    # targets[t] is the index of the true next char; for softmax + cross-entropy the
    # gradient on the true char's score is its predicted probability minus 1
    dy[targets[t]] -= 1
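    # derivation: with p = softmax(y) and loss = -log(p[target]),
    # dloss/dy[j] = p[j] - 1{j == target}, so dy now holds the gradient dloss/dys[t]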
    dWhy += np.dot(dy, hs[t].T) # += because we accumulate gradients: the same weights are used at every one of the 25 time steps
    dby += dy
    dh = np.dot(Why.T, dy) + dhnext # backprop into h
    dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
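    # since hs[t] = tanh(raw), d(tanh(raw))/d(raw) = 1 - tanh(raw)**2 = 1 - hs[t]**2,
    # so dhraw is the gradient with respect to the pre-tanh activation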
    dbh += dhraw
    dWxh += np.dot(dhraw, xs[t].T)
    dWhh += np.dot(dhraw, hs[t-1].T)
    dhnext = np.dot(Whh.T, dhraw)
  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]
# use the RNN to generate new text one character at a time, feeding each generated char back in
def sample(h, seed_ix, n):
"""
sample a sequence of integers from the model
h is memory state, seed_ix is seed letter for first time step
n is how many chars to be generated
"""
  x = np.zeros((vocab_size, 1))
  x[seed_ix] = 1
  ixes = [] # collects the indices of the generated chars
  for t in xrange(n):
    # just do forward propagation
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh) # update the hidden state
    y = np.dot(Why, h) + by # unnormalized log probabilities for the next char
    p = np.exp(y) / np.sum(np.exp(y)) # normalized probabilities for the next char
    ix = np.random.choice(range(vocab_size), p=p.ravel()) # sample the next char index from p
    x = np.zeros((vocab_size, 1))
    x[ix] = 1 # one-hot representation, fed back in at the next step
    ixes.append(ix)
  return ixes
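# example usage (illustrative only, not called by the training loop below):
# generate 200 characters from a zero hidden state, seeded with the first char of the data
#   ixes = sample(np.zeros((hidden_size, 1)), char_to_ix[data[0]], 200)
#   print ''.join(ix_to_char[ix] for ix in ixes)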
n, p = 0, 0 # n is iteration counter, p is character index in the data sequence
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
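# -np.log(1.0/vocab_size) is the cross-entropy of a uniform guess over the vocabulary,
# so this is the expected loss of an untrained model on a seq_length-character batch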
while True:
  # prepare inputs (we're sweeping from left to right in steps seq_length long)
  if p+seq_length+1 >= len(data) or n == 0:
    hprev = np.zeros((hidden_size,1)) # reset RNN memory. hprev carries the hidden state from one batch into the next
    p = 0 # go from start of data
  # inputs and targets are both lists of seq_length character indices
  inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
  targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]] # note targets are offset from inputs by one character
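  # e.g. with seq_length 5 and the data slice "hello!", inputs would index "hello"
  # and targets "ello!": the target at each step is the very next character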
  # every 100 iterations, sample from the model to see how training is progressing
  if n % 100 == 0:
    sample_ix = sample(hprev, inputs[0], 200)
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    print '----\n %s \n----' % (txt, )
  # forward seq_length characters through the net and fetch gradient.
  # the hprev returned here is the hidden state after the last of the 25 characters; we feed it
  # back in to initialize the next batch, so the hidden state flows from batch to batch,
  # but gradients are only back-propagated through these 25 time steps (truncated BPTT)
  loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
  smooth_loss = smooth_loss * 0.999 + loss * 0.001
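  # smooth_loss is an exponential moving average of the raw batch loss (decay 0.999),
  # which makes the printed learning curve much less noisy than the per-batch loss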
  if n % 100 == 0: print 'iter %d, loss: %f' % (n, smooth_loss) # print progress
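  # Adagrad keeps a running sum of squared gradients per parameter (the m* "memory"
  # variables) and divides each update by the square root of that sum, so parameters
  # that have received large gradients so far get smaller effective learning rates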
  # perform parameter update with Adagrad
  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                [dWxh, dWhh, dWhy, dbh, dby],
                                [mWxh, mWhh, mWhy, mbh, mby]):
    mem += dparam * dparam
    param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update
  p += seq_length # move data pointer
  n += 1 # iteration counter