# Load the required Keras libraries (plus re, numpy and time, which are used further down):
import re
import numpy as np
from time import time

from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Merge
import keras.backend as K
from keras.optimizers import Adadelta
from keras.callbacks import ModelCheckpoint
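The snippets below refer to several objects (train_df, test_df, questions_cols, word2vec, stops, vocabulary, inverse_vocabulary, and a handful of hyperparameters) that are never defined in the gist. Here is a minimal setup sketch, assuming the Quora question-pair CSVs, the Google News word2vec vectors loaded via gensim, and NLTK stopwords; the file names and hyperparameter values are illustrative assumptions, not the original settings.
# Minimal setup sketch (assumed, not part of the original gist)
import pandas as pd
from gensim.models import KeyedVectors
from nltk.corpus import stopwords

train_df = pd.read_csv('train.csv')   # Quora question pairs; file names are assumptions
test_df = pd.read_csv('test.csv')
questions_cols = ['question1', 'question2']

# Pre-trained Google News vectors loaded with gensim (path is an assumption)
word2vec = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
stops = set(stopwords.words('english'))

vocabulary = dict()             # word -> integer index
inverse_vocabulary = ['<unk>']  # index 0 is reserved for the padding value

# Illustrative hyperparameters (not necessarily the original values)
max_seq_length = 20
n_hidden = 50
gradient_clipping_norm = 1.25
batch_size = 64
n_epoch = 25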
I wrote a routine to clean the text data:
def text_to_word_list(text):
    ''' Pre-process and convert texts to a list of words '''
    text = str(text)
    text = text.lower()
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = text.split()
    return text
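For a quick sanity check, this is roughly what the routine produces on a made-up question:
# Quick illustration with a made-up question
print(text_to_word_list("What's the best way to learn ML in 2 weeks?"))
# -> ['what', 'is', 'the', 'best', 'way', 'to', 'learn', 'ml', 'in', '2', 'weeks']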
# Prepare the embedding of the data - I am using the Quora question pairs dataset
for dataset in [train_df, test_df]:
    for index, row in dataset.iterrows():
        # Iterate through the text of both questions of the row
        for question in questions_cols:
            q2n = []  # q2n -> question numbers representation
            for word in text_to_word_list(row[question]):
                # Check for unwanted words
                if word in stops and word not in word2vec.vocab:
                    continue
                if word not in vocabulary:
                    vocabulary[word] = len(inverse_vocabulary)
                    q2n.append(len(inverse_vocabulary))
                    inverse_vocabulary.append(word)
                else:
                    q2n.append(vocabulary[word])
            # Replace each question's words with its number representation
            dataset.set_value(index, question, q2n)
I am using 300 dimensions for my embedding, i.e. each word in the corpus is represented to the neural network model by a 300-dimensional vector.
embedding_dim = 300
embeddings = 1 * np.random.randn(len(vocabulary) + 1, embedding_dim)  # embedding matrix
embeddings[0] = 0  # padding will be ignored

# Build the embedding matrix
for word, index in vocabulary.items():
    if word in word2vec.vocab:
        embeddings[index] = word2vec.word_vec(word)
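The gist imports pad_sequences but never shows the padding step, and the training call below expects X_train / X_validation dictionaries with 'left' and 'right' sides. Here is a minimal sketch of how that could look; the train/validation split, the 'is_duplicate' label column, and the variable names are assumptions to match the fit() call further down.
# Padding sketch (assumed): build left/right inputs and pad to max_seq_length
from sklearn.model_selection import train_test_split

X = train_df[questions_cols]
Y = train_df['is_duplicate']  # label column name is an assumption
X_tr, X_val, Y_train, Y_validation = train_test_split(X, Y, test_size=0.1)

X_train = {'left': X_tr['question1'], 'right': X_tr['question2']}
X_validation = {'left': X_val['question1'], 'right': X_val['question2']}

for split in [X_train, X_validation]:
    for side in ['left', 'right']:
        split[side] = pad_sequences(split[side], maxlen=max_seq_length)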
Keras doesn't come with a Manhattan distance calculation, so we need to write a small routine to do that for us.
def exponent_neg_manhattan_distance(left, right):
    ''' Helper function for the similarity estimate of the LSTMs' outputs '''
    return K.exp(-K.sum(K.abs(left - right), axis=1, keepdims=True))
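This maps the Manhattan (L1) distance between the two sentence vectors into a similarity score in (0, 1]: identical vectors give exp(0) = 1, and very distant vectors give a score close to 0. A quick NumPy check of the same formula, just for intuition (not part of the model):
# NumPy sanity check of the similarity formula (illustration only)
a = np.array([0.2, 0.5, -0.1])
b = np.array([0.2, 0.5, -0.1])
c = np.array([5.0, -3.0, 2.0])
print(np.exp(-np.sum(np.abs(a - b))))  # 1.0  -> identical vectors
print(np.exp(-np.sum(np.abs(a - c))))  # ~0.0 -> very different vectors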
Let's build the model now:
# The visible layer
left_input = Input(shape=(max_seq_length,), dtype='int32')
right_input = Input(shape=(max_seq_length,), dtype='int32')
embedding_layer = Embedding(len(embeddings), embedding_dim, weights=[embeddings], input_length=max_seq_length, trainable=False)
# Embedded version of the inputs
encoded_left = embedding_layer(left_input)
encoded_right = embedding_layer(right_input)
# Since this is a siamese network, both sides share the same LSTM
shared_lstm = LSTM(n_hidden)
left_output = shared_lstm(encoded_left)
right_output = shared_lstm(encoded_right)
# Calculates the distance as defined by the MaLSTM model
malstm_distance = Merge(mode=lambda x: exponent_neg_manhattan_distance(x[0], x[1]), output_shape=lambda x: (x[0][0], 1))([left_output, right_output])
# Pack it all up into a model
malstm = Model([left_input, right_input], [malstm_distance])
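Note that the Merge layer used above only exists in Keras 1.x; it was removed in Keras 2. If you are on a newer Keras, the same MaLSTM distance can be expressed with a Lambda layer. A sketch of that alternative:
# Keras 2 equivalent of the Merge call above (sketch)
from keras.layers import Lambda

malstm_distance = Lambda(
    lambda x: exponent_neg_manhattan_distance(x[0], x[1]),
    output_shape=lambda x: (x[0][0], 1)
)([left_output, right_output])

malstm = Model([left_input, right_input], [malstm_distance])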
We need to set an optimizer. I am using Adadelta, but any other popular optimizer such as RMSProp, Adam, or even SGD could be tested to see whether it increases accuracy or decreases training time by finding a better local minimum (yes, the global minimum is still an elusive goal).
# Adadelta optimizer, with gradient clipping by norm
optimizer = Adadelta(clipnorm=gradient_clipping_norm)
Now we will compile and train the model.
malstm.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])
# Start training
training_start_time = time()
malstm_trained = malstm.fit([X_train['left'], X_train['right']], Y_train,
                            batch_size=batch_size, nb_epoch=n_epoch,
                            validation_data=([X_validation['left'], X_validation['right']], Y_validation))
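Once training finishes, the elapsed time and the learning curves recorded in the returned History object can be inspected. A short sketch; the history keys follow the compile() call above ('acc'/'val_acc' in Keras 1.x):
# Inspect training time and the recorded learning curves (sketch)
training_end_time = time()
print("Training finished in %.2f seconds" % (training_end_time - training_start_time))
print(malstm_trained.history['acc'][-1], malstm_trained.history['val_acc'][-1])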