Minimal character-level language model with a Vanilla Recurrent Neural Network, in Python/numpy
| """ | |
| Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy) | |
| BSD License | |
| """ | |
| import numpy as np | |
| # data I/O | |
| data = open('input.txt', 'r').read() # should be simple plain text file | |
| chars = list(set(data)) # find all unique characters | |
| data_size, vocab_size = len(data), len(chars) | |
| print 'data has %d characters, %d unique.' % (data_size, vocab_size) | |
| char_to_ix = { ch:i for i,ch in enumerate(chars) } # mapping dic from characters to indices | |
| ix_to_char = { i:ch for i,ch in enumerate(chars) } # mapping dic from indices to characters | |
| # hyperparameters | |
| hidden_size = 100 # size of hidden layer of neurons | |
| seq_length = 25 # number of steps to unroll the RNN for. so each batch contains 25 characters | |
| learning_rate = 1e-1 | |
| # model parameters | |
| Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden | |
| Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden | |
| Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output | |
| bh = np.zeros((hidden_size, 1)) # hidden bias | |
| by = np.zeros((vocab_size, 1)) # output bias | |
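# Shape reference (added annotation): Wxh is (hidden_size x vocab_size), Whh is
# (hidden_size x hidden_size), Why is (vocab_size x hidden_size). The recurrence
# implemented below is h[t] = tanh(Wxh.x[t] + Whh.h[t-1] + bh) and y[t] = Why.h[t] + by,
# where x[t] is a one-hot (vocab_size x 1) column and h[t] is a (hidden_size x 1) column.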

def lossFun(inputs, targets, hprev):
  """
  inputs, targets are both lists of integers.
  hprev is an Hx1 array holding the initial hidden state
  returns the loss, gradients on model parameters, and last hidden state
  the following are dictionaries keyed by time step:
  xs is a dict of the RNN input vectors
  hs is a dict of the hidden states
  ys is a dict of the unnormalized log probabilities for the next char
  ps is a dict of the normalized probabilities for the next char
  """
  xs, hs, ys, ps = {}, {}, {}, {}
  hs[-1] = np.copy(hprev)
  loss = 0
  # forward pass
  for t in xrange(len(inputs)): # t is the time step, i.e. the index of the char within the current batch
    xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
    xs[t][inputs[t]] = 1 # one-hot representation
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
    ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars
    ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars
    loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy) loss
  # backward pass: compute gradients going backwards
  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why) # initialize gradients as zeros
  dbh, dby = np.zeros_like(bh), np.zeros_like(by)
  dhnext = np.zeros_like(hs[0])
  for t in reversed(xrange(len(inputs))): # go backwards in time, from t=24 down to t=0
    dy = np.copy(ps[t]) # dy is the gradient of the loss w.r.t. the output scores ys[t]
    # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
    # targets[t] is the index of the true char at time step t; the gradient of the
    # cross-entropy loss w.r.t. the score of the true char is (prediction - 1)
    dy[targets[t]] -= 1
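    # illustrative example (added annotation): with vocab_size = 3, if ps[t] = [0.2, 0.5, 0.3]
    # and targets[t] = 1, then dy = [0.2, -0.5, 0.3], i.e. the predicted probabilities with
    # 1 subtracted at the true char's index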
    dWhy += np.dot(dy, hs[t].T) # += because we accumulate gradients: the same weights are used at all 25 time steps
    dby += dy
    dh = np.dot(Why.T, dy) + dhnext # backprop into h
    dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
    dbh += dhraw
    dWxh += np.dot(dhraw, xs[t].T)
    dWhh += np.dot(dhraw, hs[t-1].T)
    dhnext = np.dot(Whh.T, dhraw)
  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]
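
# Optional sanity check (added sketch, not part of the original gist): compare the analytic
# gradients from lossFun against numerical gradients from centered finite differences.
# The helper name gradCheck and its arguments are illustrative assumptions; call it manually
# on one batch, e.g. gradCheck(inputs, targets, np.zeros((hidden_size,1))).
# Note that lossFun clips gradients to [-5, 5], so very large gradients will show a mismatch here.
def gradCheck(inputs, targets, hprev):
  num_checks, delta = 10, 1e-5
  _, dWxh, dWhh, dWhy, dbh, dby, _ = lossFun(inputs, targets, hprev)
  for param, dparam, name in zip([Wxh, Whh, Why, bh, by],
                                 [dWxh, dWhh, dWhy, dbh, dby],
                                 ['Wxh', 'Whh', 'Why', 'bh', 'by']):
    for i in xrange(num_checks):
      ri = int(np.random.randint(param.size)) # pick a random element of this parameter
      old_val = param.flat[ri]
      param.flat[ri] = old_val + delta
      loss_plus = lossFun(inputs, targets, hprev)[0]
      param.flat[ri] = old_val - delta
      loss_minus = lossFun(inputs, targets, hprev)[0]
      param.flat[ri] = old_val # restore the original value
      grad_numerical = (loss_plus - loss_minus) / (2 * delta)
      grad_analytic = dparam.flat[ri]
      rel_error = abs(grad_analytic - grad_numerical) / max(abs(grad_numerical) + abs(grad_analytic), 1e-10)
      print '%s: numerical %f, analytic %f, relative error %e' % (name, grad_numerical, grad_analytic, rel_error)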

# get the RNN to generate new text based on the chars previously fed in
def sample(h, seed_ix, n):
  """
  sample a sequence of integers from the model
  h is the memory state, seed_ix is the seed letter for the first time step
  n is how many chars to generate
  """
  x = np.zeros((vocab_size, 1))
  x[seed_ix] = 1
  ixes = [] # holds the indices of the generated chars
  for t in xrange(n):
    # just do forward propagation
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh) # hidden state for the next time step
    y = np.dot(Why, h) + by # unnormalized log probabilities of the next char
    p = np.exp(y) / np.sum(np.exp(y)) # normalized probabilities of the next char
    ix = np.random.choice(range(vocab_size), p=p.ravel()) # sample the next char from this distribution
    x = np.zeros((vocab_size, 1))
    x[ix] = 1 # one-hot representation
    ixes.append(ix)
  return ixes
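
# Illustrative usage of sample() (added annotation, mirroring what the training loop below does):
#   sample_ix = sample(np.zeros((hidden_size,1)), char_to_ix[data[0]], 200)
#   print ''.join(ix_to_char[ix] for ix in sample_ix)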

n, p = 0, 0 # n is iteration counter, p is character index in the data sequence
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
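# (added annotation) -np.log(1.0/vocab_size)*seq_length = seq_length*log(vocab_size) is the summed
# cross-entropy over one batch when the model predicts a uniform distribution over all chars,
# which is roughly where an untrained model starts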

while True:
  # prepare inputs (we're sweeping from left to right in steps seq_length long)
  if p+seq_length+1 >= len(data) or n == 0:
    hprev = np.zeros((hidden_size,1)) # reset RNN memory; hprev is the hidden state carried over from the previous batch
    p = 0 # go from start of data
  # inputs and targets are both lists of seq_length character indices
  inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
  targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]] # note targets are offset by one character from inputs

  # every 100 iterations, sample from the model just to see how training is going
  if n % 100 == 0:
    sample_ix = sample(hprev, inputs[0], 200)
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    print '----\n %s \n----' % (txt, )

  # forward seq_length characters through the net and fetch gradients
  # the hprev returned here is the hidden state after the last of the 25 characters; we keep it
  # and feed it back in to initialize the next batch, so the hidden state is carried from
  # batch to batch even though we only backpropagate through these 25 time steps
  loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
  smooth_loss = smooth_loss * 0.999 + loss * 0.001
  if n % 100 == 0: print 'iter %d, loss: %f' % (n, smooth_loss) # print progress

  # perform parameter update with Adagrad
  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                [dWxh, dWhh, dWhy, dbh, dby],
                                [mWxh, mWhh, mWhy, mbh, mby]):
    mem += dparam * dparam
    param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update
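    # (added annotation) mem accumulates the squared gradients over time, so each parameter gets
    # its own effective step size learning_rate/sqrt(mem); the 1e-8 term avoids division by zero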

  p += seq_length # move data pointer
  n += 1 # iteration counter