Skip to content

Instantly share code, notes, and snippets.

@miles-igd
Last active February 25, 2019 09:18
Show Gist options
  • Select an option

  • Save miles-igd/d8e57a6eaa6e2690c656fb937dffe77e to your computer and use it in GitHub Desktop.

Select an option

Save miles-igd/d8e57a6eaa6e2690c656fb937dffe77e to your computer and use it in GitHub Desktop.
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
import gensim
from keras.layers import Input, Dense, BatchNormalization
from keras.models import Model
from keras.callbacks import ModelCheckpoint
class ClassifierCategorical:
    """Feed-forward Keras classifier that is built and trained on construction.

    The network maps an input vector to one probability per category through
    two stacked hidden ReLU layers (each as wide as the input) followed by a
    softmax output layer.

    Attributes:
        model: the compiled ``keras.models.Model``; already trained by the
            time ``__init__`` returns.
    """

    def __init__(self, doc_vecs, tag_vecs):
        """Build, compile and immediately train the model.

        Side effect: during training a checkpoint is written to
        'classifier_categorical.h5' in the working directory.

        Args:
            doc_vecs: indexable collection of equal-length input vectors;
                ``len(doc_vecs[0])`` fixes the input and hidden layer width.
            tag_vecs: indexable collection of target vectors aligned with
                ``doc_vecs``; ``len(tag_vecs[0])`` fixes the number of output
                units. Presumably one-hot rows, which is what the
                categorical cross-entropy loss expects -- confirm with caller.
        """
        # save_best_only=True means the file is only rewritten when the
        # monitored quantity (Keras' default: validation loss) improves,
        # not after every epoch.
        callbacks = [
            ModelCheckpoint('classifier_categorical.h5', save_best_only=True)
        ]

        inputlen = len(doc_vecs[0])
        outputlen = len(tag_vecs[0])

        # Create a model using the Keras functional API.
        inputs = Input(shape=(inputlen,))
        x = Dense(inputlen, activation='relu')(inputs)
        # Chain the second hidden layer onto the first one. Feeding it
        # `inputs` again would silently drop the first layer from the graph.
        x = Dense(inputlen, activation='relu')(x)
        # Softmax yields a probability distribution over the categories,
        # which is the output form categorical cross-entropy is defined for.
        outputs = Dense(outputlen, activation='softmax')(x)

        # Compile model.
        self.model = Model(inputs=inputs, outputs=outputs)
        self.model.compile(optimizer='SGD',
                           loss='categorical_crossentropy',
                           metrics=['accuracy'])

        # Train the model immediately; 30% of the samples are held out for
        # validation, which is what drives the checkpoint above.
        self.model.fit(doc_vecs,
                       tag_vecs,
                       batch_size=8,
                       epochs=25,
                       verbose=1,
                       validation_split=0.3,
                       callbacks=callbacks)

    def save(self, name):
        """Save the current model to the file path ``name``."""
        self.model.save(name)
#inputs: create docvecs list that matches with clustertag genre list
def get_docvecs_list(df, docvecs):
    """Look up the document vector for every entry of ``df['name']``.

    Args:
        df: pandas.DataFrame with a 'name' column; each value is used as a
            key into ``docvecs``.
        docvecs: gensim.models.keyedvectors.Doc2VecKeyedVectors (or any
            object indexable by those names).

    Returns:
        A list with one looked-up vector per name, in the same order as
        ``df['name']``.
    """
    vectors = []
    for doc_name in df['name']:
        vectors.append(docvecs[doc_name])
    return vectors
#outputs: encode categories
def encode_data(df):
    """One-hot encode the 'clustercat' column of ``df``.

    Args:
        df: pandas.DataFrame with a 'clustercat' column.

    Returns:
        A tuple ``(enc_vals, enc)``. ``enc_vals`` is the dense 2-D array
        produced by the encoder, one row per row of ``df``; ``enc`` is the
        fitted sklearn.preprocessing.OneHotEncoder, returned so callers can
        reuse the same category mapping.
    """
    # The encoder wants a 2-D (n_samples, n_features) input; the single
    # category column is the only feature.
    vals = df['clustercat'].values.reshape(-1, 1)

    # scikit-learn renamed `sparse` to `sparse_output` (added in 1.2, old
    # name removed in 1.4). Try the current spelling first and fall back so
    # the function works on both sides of the rename.
    try:
        enc = OneHotEncoder(sparse_output=False, categories='auto')
    except TypeError:
        enc = OneHotEncoder(sparse=False, categories='auto')

    enc_vals = enc.fit_transform(vals)
    return enc_vals, enc
if __name__ == '__main__':
    # Load the Doc2Vec model and the table of labelled documents.
    model = gensim.models.Doc2Vec.load("model/doc2vec.model")
    df = pd.read_csv('clustercat.csv', index_col=0)

    # Outputs: encoded categories. Inputs: the document vector of each row.
    outs, enc = encode_data(df)
    ins = np.asarray(get_docvecs_list(df, model.docvecs))

    # Shuffle into a random order. One shared permutation is applied to both
    # arrays so that row i of `ins` still belongs to row i of `outs`.
    order = np.random.permutation(len(outs))
    ins = ins[order]
    outs = outs[order]

    # Train the model (training happens inside the constructor).
    classifier = ClassifierCategorical(ins, outs)

    # NOTE(review): this is the same path the ModelCheckpoint callback in
    # ClassifierCategorical writes to with save_best_only=True, so the
    # final-epoch model saved here replaces the best checkpoint. Confirm
    # that keeping the final model is the intent.
    classifier.save('classifier_categorical.h5')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment