@@ -0,0 +1,222 @@
import os
import pickle
import warnings

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.layers import LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from bad_content import config
from bad_content.utils import show_plot_confusion_matrix, show_classification_report

warnings.filterwarnings("ignore")  # We're outlaws!


def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build a (vocab_size, embedding_dim) matrix of pretrained GloVe vectors.

    Words that are missing from the GloVe file keep all-zero rows.
    """
    print('Creating embedding matrix from the GloVe file.')
    vocab_size = len(word_index) + 1  # Add 1 because index 0 is reserved for padding.
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath, encoding='utf8') as f:
        for line in f:
            # Each GloVe line is "<word> <v1> <v2> ...", so a whitespace split works.
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word]
                embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix
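
# For illustration (hypothetical numbers): with word_index = {'free': 1, 'win': 2}
# and the 50-d GloVe file, the returned matrix has shape (3, 50), with row 0 left
# as zeros for the reserved padding index.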


def train(classification_report: bool = False, plot_confusion_matrix_report: bool = False) -> None:
    """Train the spam classifier and optionally show evaluation reports.

    For better results while training, play https://www.youtube.com/watch?v=_YYmfM2TfUA as loud as possible.
    """
    df = pd.read_csv('data/bad_content_clean.csv', encoding='utf-8')
    print(df.head())

    data = df.copy()  # Work on a copy so the raw frame stays untouched.

    # The class balance matters for reading the accuracy numbers later on.
    print(f'Value Count: {data.spam.value_counts()}')

    # sns.countplot(data['spam'])
    # plt.show()

    X = data['content'].values
    y = data['spam'].values

    X_train: np.ndarray
    X_test: np.ndarray
    y_train: np.ndarray
    y_test: np.ndarray

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
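    # random_state=42 pins the shuffle, so reruns reproduce the same 80/20 split.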

    # Prepare the tokenizer on the training texts only, so the test set stays unseen.
    t = Tokenizer()
    t.fit_on_texts(X_train)

    # Integer-encode the documents.
    encoded_train = t.texts_to_sequences(X_train)
    encoded_test = t.texts_to_sequences(X_test)
    print(f'encoded_train[0:2]: {encoded_train[0:2]}')
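    # Each word is now an integer frequency rank: a hypothetical "win a free prize"
    # might become [87, 5, 212, 964]. Words never seen in X_train are simply dropped.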

    # Pad (or truncate) every document to exactly 50 tokens.
    max_length = 50
    padded_train = pad_sequences(encoded_train, maxlen=max_length, padding='post')
    padded_test = pad_sequences(encoded_test, maxlen=max_length, padding='post')

    print(f'padded_train: {padded_train}')
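    # With padding='post' the zeros go after the tokens: the hypothetical
    # [87, 5, 212, 964] becomes [87, 5, 212, 964, 0, ..., 0] with 46 trailing zeros.
    # Longer texts are truncated from the front, the pad_sequences default.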

    vocab_size = len(t.word_index) + 1

    # glove.6B ships 50-, 100-, 200- and 300-dimensional vectors; the 50-d file is
    # used here. That it equals max_length is a coincidence, not a requirement.
    embedding_dim = 50
    embedding_matrix = create_embedding_matrix(
        f'data/glove.6B/glove.6B.{embedding_dim}d.txt',
        t.word_index,
        embedding_dim
    )
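    # Words absent from GloVe keep all-zero rows, so with the frozen embedding in
    # my_model below they all collapse onto the same null vector.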

    def my_model():
        # Define the model as Sequential.
        model = Sequential()

        # The model trains for up to 100 epochs and stops once validation loss is no
        # longer improving, via the early stopping callback
        # (https://keras.io/api/callbacks/early_stopping/). Training usually runs for
        # about 11 or 12 epochs; this varies because of the stochastic nature of the
        # model and of the data splitting
        # (https://machinelearningmastery.com/stochastic-in-machine-learning/).
        model.add(Embedding(vocab_size, embedding_dim, input_length=max_length,
                            weights=[embedding_matrix], trainable=False))

        # model.add(Flatten())
        model.add(GlobalAveragePooling1D())
        # Dense layer sizes must be integers, hence the floor divisions below.
        model.add(Dense(X_train.shape[0] // 4, activation='relu'))
        model.add(Dropout(0.2))

        model.add(Dense(X_train.shape[0] // 6, activation='relu'))
        model.add(Dropout(0.2))

        model.add(Dense(X_train.shape[0] // 8, activation='relu'))
        model.add(Dropout(0.2))

        model.add(Dense(X_train.shape[0] // 10, activation='relu'))
        model.add(Dropout(0.2))

        model.add(Dense(1, activation='sigmoid'))

        # Compile the model.
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

        # Summarize the model; summary() prints directly and returns None.
        model.summary()

        early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
        # checkpoint = ModelCheckpoint(
        #     'models/weights.{epoch:02d}-{val_loss:.2f}.hdf5',
        #     monitor='val_accuracy',
        #     save_best_only=True,
        #     verbose=1,
        # )

        # Fit the model.
        model.fit(
            x=padded_train,
            y=y_train,
            epochs=100,
            # batch_size=20,
            validation_data=(padded_test, y_test),
            verbose=1,
            # callbacks=[checkpoint, early_stop],
            callbacks=[early_stop],
        )

        return model

    def lstm_model():
        # LSTM hyperparameters.
        n_lstm = 20
        drop_lstm = 0.2

        model = Sequential()
        model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
        model.add(LSTM(n_lstm, dropout=drop_lstm, return_sequences=True))
        # The final LSTM must not return sequences, so the Dense head receives a
        # single vector per sample instead of one per timestep.
        model.add(LSTM(n_lstm, dropout=drop_lstm))
        model.add(Dense(1, activation='sigmoid'))

        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

        # Summarize the model.
        model.summary()

        num_epochs = 30
        early_stop = EarlyStopping(monitor='val_loss', patience=2)
        model.fit(
            padded_train,
            y_train,
            epochs=num_epochs,
            validation_data=(padded_test, y_test),
            callbacks=[early_stop],
            verbose=1,
        )

        return model

    def bi_lstm_model():
        # LSTM hyperparameters.
        n_lstm = 20
        drop_lstm = 0.2

        model = Sequential()
        model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
        # return_sequences stays False (the default) so the Dense head receives a
        # single vector per sample.
        model.add(Bidirectional(LSTM(n_lstm, dropout=drop_lstm)))
        # model.add(Bidirectional(CuDNNLSTM(
        #     units=n_lstm,
        #     dropout=drop_lstm,
        #     recurrent_activation='sigmoid',
        # )))
        model.add(Dense(1, activation='sigmoid'))

        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

        # Summarize the model.
        model.summary()

        num_epochs = 30
        early_stop = EarlyStopping(monitor='val_loss', patience=2)

        model.fit(
            padded_train, y_train, epochs=num_epochs,
            validation_data=(padded_test, y_test),
            callbacks=[early_stop],
            verbose=1,
        )

        return model
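
    # my_model and lstm_model are kept as alternatives; only the bidirectional
    # variant below is actually trained, evaluated and saved.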
    model = bi_lstm_model()

    # Threshold the sigmoid probabilities at 0.5 to get hard 0/1 labels.
    preds = (model.predict(padded_test) > 0.5).astype("int32")

    if classification_report:
        show_classification_report(y_test, preds)

    if plot_confusion_matrix_report:
        show_plot_confusion_matrix(y_test, preds)

    if not os.path.exists(config.__MODEL_SAVE_PATH):
        os.makedirs(config.__MODEL_SAVE_PATH)

    print(f'Saving model to {config.__MODEL_SAVE_PATH}')

    model.save(config.__MODEL_SAVE_PATH)

    # Persist the fitted tokenizer alongside the model; inference must reuse the
    # exact same word index to encode incoming text.
    with open(f'{config.__MODEL_SAVE_PATH}/tokenizer.pkl', 'wb') as output:
        pickle.dump(t, output, pickle.HIGHEST_PROTOCOL)