-
-
Save ssarkar445/26bfa80d79610acae7ab1d72fcf5594d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Load required Keras libraries:
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
# NOTE: Merge was removed in Keras 2; Lambda (used below) is the portable replacement.
from keras.layers import Input, Embedding, LSTM, Lambda, Merge
import keras.backend as K
from keras.optimizers import Adadelta
from keras.callbacks import ModelCheckpoint
I wrote a routine to clean the text data:
def text_to_word_list(text):
    '''Pre-process *text* and return it as a list of lower-case word tokens.

    Expands common English contractions, turns selected punctuation into
    standalone tokens, applies a few domain-specific normalisations
    (e.g. "50k" -> "50000"), collapses whitespace, and splits on spaces.
    '''
    text = str(text)
    text = text.lower()

    # Replace every character outside the allowed set with a space.
    # Fixes vs. the widely-copied original: the digit range used a Unicode
    # en dash ("0–9"), which made the class span up to U+2013 and silently
    # kept most non-ASCII characters; the unescaped "+-=" also formed an
    # accidental character range ('+' .. '=') that kept ';', '<', etc.
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+\-=]", " ", text)

    # Expand contractions (order matters: specific forms before generic ones).
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)

    # Punctuation: drop it or isolate it as its own token.
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)

    # Domain-specific normalisations.
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)  # "50k" -> "50000"
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    # NOTE(review): r"\0s" matches a NUL byte followed by "s" — almost
    # certainly a no-op carried over from the original script; kept as-is
    # to avoid changing behaviour without knowing the intent.
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)

    # Collapse runs of whitespace, then tokenize.
    text = re.sub(r"\s{2,}", " ", text)
    return text.split()
# Prepare the embedding inputs (Quora question pairs): replace each
# question's text with a list of vocabulary indices, growing `vocabulary`
# and `inverse_vocabulary` as new words appear.
for dataset in [train_df, test_df]:
    for index, row in dataset.iterrows():
        # Iterate through the text of both questions of the row.
        for question in questions_cols:
            q2n = []  # q2n -> question-as-numbers representation
            for word in text_to_word_list(row[question]):
                # Skip stopwords that have no pre-trained vector anyway.
                if word in stops and word not in word2vec.vocab:
                    continue
                if word not in vocabulary:
                    # New word: assign the next free index.
                    vocabulary[word] = len(inverse_vocabulary)
                    q2n.append(len(inverse_vocabulary))
                    inverse_vocabulary.append(word)
                else:
                    q2n.append(vocabulary[word])
            # Replace the question text with its numeric representation.
            # DataFrame.set_value was deprecated in pandas 0.21 and removed
            # in 1.0; .at is the supported scalar accessor/setter.
            dataset.at[index, question] = q2n
I am using a 300-dimensional embedding, i.e. each word in the corpus is represented by a 300-element vector in the neural network model.
# Build the embedding matrix: random-normal initialisation for every word,
# with row 0 reserved as the all-zero padding vector.
embedding_dim = 300  # vector size per word
embeddings = np.random.randn(len(vocabulary) + 1, embedding_dim)
embeddings[0] = 0  # index 0 is padding and must contribute nothing

# Overwrite rows with pre-trained word2vec vectors wherever we have coverage.
for token, idx in vocabulary.items():
    if token in word2vec.vocab:
        embeddings[idx] = word2vec.word_vec(token)
| Keras doesn't come with Manhattan distance calculation, hence we need to write a routine to do that for us. | |
def exponent_neg_manhattan_distance(left, right):
    '''Similarity estimate for the two LSTM outputs: exp(-L1 distance), in (0, 1].'''
    manhattan = K.sum(K.abs(left - right), axis=1, keepdims=True)
    return K.exp(-manhattan)
| Let's build the model now: | |
| The visible layer | |
# The visible layer: each side of the siamese network receives a padded
# sequence of word indices.
left_input = Input(shape=(max_seq_length,), dtype='int32')
right_input = Input(shape=(max_seq_length,), dtype='int32')

# Frozen embedding layer initialised with the pre-built word2vec matrix.
embedding_layer = Embedding(len(embeddings), embedding_dim,
                            weights=[embeddings],
                            input_length=max_seq_length,
                            trainable=False)

# Embedded version of the inputs.
encoded_left = embedding_layer(left_input)
encoded_right = embedding_layer(right_input)

# Since this is a siamese network, both sides share the same LSTM.
shared_lstm = LSTM(n_hidden)
left_output = shared_lstm(encoded_left)
right_output = shared_lstm(encoded_right)

# Calculates the distance as defined by the MaLSTM model.
# Fix: the `Merge` layer was removed in Keras 2; a `Lambda` layer applied
# to the pair of outputs is the equivalent construct and also works in
# Keras 1, so this stays backward-compatible.
malstm_distance = Lambda(
    lambda x: exponent_neg_manhattan_distance(x[0], x[1]),
    output_shape=lambda x: (x[0][0], 1))([left_output, right_output])

# Pack it all up into a model.
malstm = Model([left_input, right_input], [malstm_distance])
We need to set an optimizer. I am using Adadelta, but any other popular optimizer such as RMSProp, Adam, or even SGD could be tested to see whether it increases accuracy or decreases training time by finding better local minima (yes, the global minimum is still an elusive goal).
# Adadelta optimizer, with gradient clipping by norm to keep the LSTM
# gradients from exploding (gradient_clipping_norm is defined elsewhere
# in the script — TODO confirm its value).
optimizer = Adadelta(clipnorm=gradient_clipping_norm)
| Now we will compile and train the model. | |
# Compile with MSE against the 0/1 duplicate label; accuracy is tracked as
# a secondary metric.
malstm.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])

# Start training; record wall-clock start so total time can be reported.
training_start_time = time()
malstm_trained = malstm.fit(
    [X_train['left'], X_train['right']],
    Y_train,
    batch_size=batch_size,
    nb_epoch=n_epoch,  # Keras 1 spelling — Keras 2 renamed this to `epochs`
    validation_data=([X_validation['left'], X_validation['right']], Y_validation))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment