-
-
Save ssarkar445/26bfa80d79610acae7ab1d72fcf5594d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Load required Keras libraries:
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
# NOTE: Merge was removed in Keras 2; Lambda (used below) is the portable replacement.
from keras.layers import Input, Embedding, LSTM, Lambda, Merge
import keras.backend as K
from keras.optimizers import Adadelta
from keras.callbacks import ModelCheckpoint
I wrote a routine to clean the text data:
def text_to_word_list(text):
    '''Pre-process *text* and return it as a list of lower-case word tokens.

    Expands common English contractions, turns selected punctuation into
    standalone tokens, applies a few domain-specific normalisations
    (e.g. "50k" -> "50000"), collapses whitespace, and splits on spaces.
    '''
    text = str(text)
    text = text.lower()

    # Replace every character outside the allowed set with a space.
    # Fixes vs. the widely-copied original: the digit range used a Unicode
    # en dash ("0–9"), which made the class span up to U+2013 and silently
    # kept most non-ASCII characters; the unescaped "+-=" also formed an
    # accidental character range ('+' .. '=') that kept ';', '<', etc.
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+\-=]", " ", text)

    # Expand contractions (order matters: specific forms before generic ones).
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)

    # Punctuation: drop it or isolate it as its own token.
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)

    # Domain-specific normalisations.
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)  # "50k" -> "50000"
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    # NOTE(review): r"\0s" matches a NUL byte followed by "s" — almost
    # certainly a no-op carried over from the original script; kept as-is
    # to avoid changing behaviour without knowing the intent.
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)

    # Collapse runs of whitespace, then tokenize.
    text = re.sub(r"\s{2,}", " ", text)
    return text.split()
# Prepare the embedding inputs (Quora question pairs): replace each
# question's text with a list of vocabulary indices, growing `vocabulary`
# and `inverse_vocabulary` as new words appear.
for dataset in [train_df, test_df]:
    for index, row in dataset.iterrows():
        # Iterate through the text of both questions of the row.
        for question in questions_cols:
            q2n = []  # q2n -> question-as-numbers representation
            for word in text_to_word_list(row[question]):
                # Skip stopwords that have no pre-trained vector anyway.
                if word in stops and word not in word2vec.vocab:
                    continue
                if word not in vocabulary:
                    # New word: assign the next free index.
                    vocabulary[word] = len(inverse_vocabulary)
                    q2n.append(len(inverse_vocabulary))
                    inverse_vocabulary.append(word)
                else:
                    q2n.append(vocabulary[word])
            # Replace the question text with its numeric representation.
            # DataFrame.set_value was deprecated in pandas 0.21 and removed
            # in 1.0; .at is the supported scalar accessor/setter.
            dataset.at[index, question] = q2n
I am using a 300-dimensional embedding, i.e. each word in the corpus is represented by a 300-element vector in the neural network model.
# Build the embedding matrix: random-normal initialisation for every word,
# with row 0 reserved as the all-zero padding vector.
embedding_dim = 300  # vector size per word
embeddings = np.random.randn(len(vocabulary) + 1, embedding_dim)
embeddings[0] = 0  # index 0 is padding and must contribute nothing

# Overwrite rows with pre-trained word2vec vectors wherever we have coverage.
for token, idx in vocabulary.items():
    if token in word2vec.vocab:
        embeddings[idx] = word2vec.word_vec(token)
| Keras doesn't come with Manhattan distance calculation, hence we need to write a routine to do that for us. | |
def exponent_neg_manhattan_distance(left, right):
    '''Similarity estimate for the two LSTM outputs: exp(-L1 distance), in (0, 1].'''
    manhattan = K.sum(K.abs(left - right), axis=1, keepdims=True)
    return K.exp(-manhattan)
| Let's build the model now: | |
| The visible layer | |
# The visible layer: each side of the siamese network receives a padded
# sequence of word indices.
left_input = Input(shape=(max_seq_length,), dtype='int32')
right_input = Input(shape=(max_seq_length,), dtype='int32')

# Frozen embedding layer initialised with the pre-built word2vec matrix.
embedding_layer = Embedding(len(embeddings), embedding_dim,
                            weights=[embeddings],
                            input_length=max_seq_length,
                            trainable=False)

# Embedded version of the inputs.
encoded_left = embedding_layer(left_input)
encoded_right = embedding_layer(right_input)

# Since this is a siamese network, both sides share the same LSTM.
shared_lstm = LSTM(n_hidden)
left_output = shared_lstm(encoded_left)
right_output = shared_lstm(encoded_right)

# Calculates the distance as defined by the MaLSTM model.
# Fix: the `Merge` layer was removed in Keras 2; a `Lambda` layer applied
# to the pair of outputs is the equivalent construct and also works in
# Keras 1, so this stays backward-compatible.
malstm_distance = Lambda(
    lambda x: exponent_neg_manhattan_distance(x[0], x[1]),
    output_shape=lambda x: (x[0][0], 1))([left_output, right_output])

# Pack it all up into a model.
malstm = Model([left_input, right_input], [malstm_distance])
We need to set an optimizer. I am using Adadelta, but any other popular optimizer such as RMSProp, Adam, or even SGD could be tested to see whether it increases accuracy or decreases training time by finding better local minima (yes, the global minimum is still an elusive goal).
# Adadelta optimizer, with gradient clipping by norm to keep the LSTM
# gradients from exploding (gradient_clipping_norm is defined elsewhere
# in the script — TODO confirm its value).
optimizer = Adadelta(clipnorm=gradient_clipping_norm)
| Now we will compile and train the model. | |
# Compile with MSE against the 0/1 duplicate label; accuracy is tracked as
# a secondary metric.
malstm.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])

# Start training; record wall-clock start so total time can be reported.
training_start_time = time()
malstm_trained = malstm.fit(
    [X_train['left'], X_train['right']],
    Y_train,
    batch_size=batch_size,
    nb_epoch=n_epoch,  # Keras 1 spelling — Keras 2 renamed this to `epochs`
    validation_data=([X_validation['left'], X_validation['right']], Y_validation))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment