Minimal character-level language model with a Vanilla Recurrent Neural Network, in Python/numpy
| """ | |
| Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy) | |
| BSD License | |
| """ | |
| import numpy as np | |
| # data I/O | |
| data = open('input.txt', 'r').read() # should be simple plain text file | |
| chars = list(set(data)) # find all unique characters | |
| data_size, vocab_size = len(data), len(chars) | |
| print 'data has %d characters, %d unique.' % (data_size, vocab_size) | |
| char_to_ix = { ch:i for i,ch in enumerate(chars) } # mapping dic from characters to indices | |
| ix_to_char = { i:ch for i,ch in enumerate(chars) } # mapping dic from indices to characters | |
| # hyperparameters | |
| hidden_size = 100 # size of hidden layer of neurons | |
| seq_length = 25 # number of steps to unroll the RNN for. so each batch contains 25 characters | |
| learning_rate = 1e-1 | |
| # model parameters | |
| Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden | |
| Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden | |
| Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output | |
| bh = np.zeros((hidden_size, 1)) # hidden bias | |
| by = np.zeros((vocab_size, 1)) # output bias | |
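# Shape reference (added annotation): Wxh is (hidden_size x vocab_size), Whh is
# (hidden_size x hidden_size), Why is (vocab_size x hidden_size). The recurrence
# implemented below is h[t] = tanh(Wxh.x[t] + Whh.h[t-1] + bh) and y[t] = Why.h[t] + by,
# where x[t] is a one-hot (vocab_size x 1) column and h[t] is a (hidden_size x 1) column.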

def lossFun(inputs, targets, hprev):
  """
  inputs, targets are both lists of integers.
  hprev is an Hx1 array holding the initial hidden state
  returns the loss, gradients on model parameters, and last hidden state
  the following are dictionaries keyed by time step:
  xs is a dict of the RNN input vectors
  hs is a dict of the hidden states
  ys is a dict of the unnormalized log probabilities for the next char
  ps is a dict of the normalized probabilities for the next char
  """
  xs, hs, ys, ps = {}, {}, {}, {}
  hs[-1] = np.copy(hprev)
  loss = 0
  # forward pass
  for t in xrange(len(inputs)): # t is the time step, i.e. the index of the char within the current batch
    xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
    xs[t][inputs[t]] = 1 # one-hot representation
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
    ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars
    ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars
    loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy) loss
  # backward pass: compute gradients going backwards
  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why) # initialize gradients as zeros
  dbh, dby = np.zeros_like(bh), np.zeros_like(by)
  dhnext = np.zeros_like(hs[0])
  for t in reversed(xrange(len(inputs))): # go backwards in time, from t=24 down to t=0
    dy = np.copy(ps[t]) # dy is the gradient of the loss w.r.t. the output scores ys[t]
    # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
    # targets[t] is the index of the true char at time step t; the gradient of the
    # cross-entropy loss w.r.t. the score of the true char is (prediction - 1)
    dy[targets[t]] -= 1
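    # illustrative example (added annotation): with vocab_size = 3, if ps[t] = [0.2, 0.5, 0.3]
    # and targets[t] = 1, then dy = [0.2, -0.5, 0.3], i.e. the predicted probabilities with
    # 1 subtracted at the true char's index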
    dWhy += np.dot(dy, hs[t].T) # += because we accumulate gradients: the same weights are used at all 25 time steps
    dby += dy
    dh = np.dot(Why.T, dy) + dhnext # backprop into h
    dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
    dbh += dhraw
    dWxh += np.dot(dhraw, xs[t].T)
    dWhh += np.dot(dhraw, hs[t-1].T)
    dhnext = np.dot(Whh.T, dhraw)
  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]
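
# Optional sanity check (added sketch, not part of the original gist): compare the analytic
# gradients from lossFun against numerical gradients from centered finite differences.
# The helper name gradCheck and its arguments are illustrative assumptions; call it manually
# on one batch, e.g. gradCheck(inputs, targets, np.zeros((hidden_size,1))).
# Note that lossFun clips gradients to [-5, 5], so very large gradients will show a mismatch here.
def gradCheck(inputs, targets, hprev):
  num_checks, delta = 10, 1e-5
  _, dWxh, dWhh, dWhy, dbh, dby, _ = lossFun(inputs, targets, hprev)
  for param, dparam, name in zip([Wxh, Whh, Why, bh, by],
                                 [dWxh, dWhh, dWhy, dbh, dby],
                                 ['Wxh', 'Whh', 'Why', 'bh', 'by']):
    for i in xrange(num_checks):
      ri = int(np.random.randint(param.size)) # pick a random element of this parameter
      old_val = param.flat[ri]
      param.flat[ri] = old_val + delta
      loss_plus = lossFun(inputs, targets, hprev)[0]
      param.flat[ri] = old_val - delta
      loss_minus = lossFun(inputs, targets, hprev)[0]
      param.flat[ri] = old_val # restore the original value
      grad_numerical = (loss_plus - loss_minus) / (2 * delta)
      grad_analytic = dparam.flat[ri]
      rel_error = abs(grad_analytic - grad_numerical) / max(abs(grad_numerical) + abs(grad_analytic), 1e-10)
      print '%s: numerical %f, analytic %f, relative error %e' % (name, grad_numerical, grad_analytic, rel_error)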

# get the RNN to generate new text based on the chars previously fed in
def sample(h, seed_ix, n):
  """
  sample a sequence of integers from the model
  h is the memory state, seed_ix is the seed letter for the first time step
  n is how many chars to generate
  """
  x = np.zeros((vocab_size, 1))
  x[seed_ix] = 1
  ixes = [] # holds the indices of the generated chars
  for t in xrange(n):
    # just do forward propagation
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh) # hidden state for the next time step
    y = np.dot(Why, h) + by # unnormalized log probabilities of the next char
    p = np.exp(y) / np.sum(np.exp(y)) # normalized probabilities of the next char
    ix = np.random.choice(range(vocab_size), p=p.ravel()) # sample the next char from this distribution
    x = np.zeros((vocab_size, 1))
    x[ix] = 1 # one-hot representation
    ixes.append(ix)
  return ixes
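
# Illustrative usage of sample() (added annotation, mirroring what the training loop below does):
#   sample_ix = sample(np.zeros((hidden_size,1)), char_to_ix[data[0]], 200)
#   print ''.join(ix_to_char[ix] for ix in sample_ix)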

n, p = 0, 0 # n is iteration counter, p is character index in the data sequence
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
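# (added annotation) -np.log(1.0/vocab_size)*seq_length = seq_length*log(vocab_size) is the summed
# cross-entropy over one batch when the model predicts a uniform distribution over all chars,
# which is roughly where an untrained model starts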

while True:
  # prepare inputs (we're sweeping from left to right in steps seq_length long)
  if p+seq_length+1 >= len(data) or n == 0:
    hprev = np.zeros((hidden_size,1)) # reset RNN memory; hprev is the hidden state carried over from the previous batch
    p = 0 # go from start of data
  # inputs and targets are both lists of seq_length character indices
  inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
  targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]] # note targets are offset by one character from inputs

  # every 100 iterations, sample from the model just to see how training is going
  if n % 100 == 0:
    sample_ix = sample(hprev, inputs[0], 200)
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    print '----\n %s \n----' % (txt, )

  # forward seq_length characters through the net and fetch gradients
  # the hprev returned here is the hidden state after the last of the 25 characters; we keep it
  # and feed it back in to initialize the next batch, so the hidden state is carried from
  # batch to batch even though we only backpropagate through these 25 time steps
  loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
  smooth_loss = smooth_loss * 0.999 + loss * 0.001
  if n % 100 == 0: print 'iter %d, loss: %f' % (n, smooth_loss) # print progress

  # perform parameter update with Adagrad
  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                [dWxh, dWhh, dWhy, dbh, dby],
                                [mWxh, mWhh, mWhy, mbh, mby]):
    mem += dparam * dparam
    param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update
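    # (added annotation) mem accumulates the squared gradients over time, so each parameter gets
    # its own effective step size learning_rate/sqrt(mem); the 1e-8 term avoids division by zero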

  p += seq_length # move data pointer
  n += 1 # iteration counter