Skip to content

Instantly share code, notes, and snippets.

@mvsusp
Created February 28, 2018 04:19
Show Gist options
  • Select an option

  • Save mvsusp/fee588819dd73300123e209298d8ee2c to your computer and use it in GitHub Desktop.

Select an option

Save mvsusp/fee588819dd73300123e209298d8ee2c to your computer and use it in GitHub Desktop.
keras_embeddings
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import print_function
import sys
import numpy as np
import os
import tensorflow as tf
from tensorflow.contrib.learn import Experiment
from tensorflow.contrib.learn.python.learn import learn_runner
from tensorflow.contrib.learn.python.learn.utils import saved_model_export_utils
from tensorflow.python.estimator.export.export import build_raw_serving_input_receiver_fn
from tensorflow.python.estimator.inputs.numpy_io import numpy_input_fn
from tensorflow.python.keras._impl.keras.engine.topology import Input
from tensorflow.python.keras._impl.keras.layers import Dense, Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.python.keras._impl.keras.models import Model
from tensorflow.python.keras._impl.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras._impl.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras._impl.keras.utils import to_categorical
from tensorflow.python.saved_model.signature_constants import PREDICT_INPUTS
# Data inside the container is mounted under this folder:
# https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-training-algo.html#your-algorithms-training-algo-running-container-trainingdata
SAGEMAKER_DATA_PATH = '/opt/ml/input/data'

# Detect whether we are running inside the SageMaker container so the same
# script also runs locally (reading from a relative 'data' folder instead).
inside_sagemaker_container = os.path.exists(SAGEMAKER_DATA_PATH)
base_dir = SAGEMAKER_DATA_PATH if inside_sagemaker_container else 'data'

GLOVE_DIR = os.path.join(base_dir, 'embedding', 'glove.6B')
TEXT_DATA_DIR = os.path.join(base_dir, 'training', '20_newsgroup')

MAX_SEQUENCE_LENGTH = 1000   # pad/truncate every sample to this many tokens
MAX_NUM_WORDS = 20000        # vocabulary cap for the tokenizer
EMBEDDING_DIM = 100          # matches the glove.6B.100d vectors loaded below
VALIDATION_SPLIT = 0.2       # fraction of samples held out for validation
# First, build the index mapping each word in the GloVe embedding set to its
# pre-trained embedding vector.
print('Indexing word vectors.')
embeddings_index = {}
glove_path = os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')
with open(glove_path) as glove_file:
    for entry in glove_file:
        tokens = entry.split()
        # first token is the word; the remainder is its 100-d float vector
        embeddings_index[tokens[0]] = np.asarray(tokens[1:], dtype='float32')
print('Found %s word vectors.' % len(embeddings_index))
# Second, prepare the text samples and their labels from the 20 Newsgroups
# layout: one sub-directory per category, one digit-named file per message.
print('Processing text dataset')
texts = []         # list of raw text samples
labels_index = {}  # category name -> numeric label id
labels = []        # label id for each entry in `texts`
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if not os.path.isdir(path):
        continue
    label_id = len(labels_index)
    labels_index[name] = label_id
    for fname in sorted(os.listdir(path)):
        if not fname.isdigit():
            continue
        fpath = os.path.join(path, fname)
        # Python 3 needs an explicit encoding for these latin-1 files.
        open_kwargs = {} if sys.version_info < (3,) else {'encoding': 'latin-1'}
        with open(fpath, **open_kwargs) as message_file:
            body = message_file.read()
        header_end = body.find('\n\n')  # first blank line ends the header
        if 0 < header_end:
            body = body[header_end:]
        texts.append(body)
        labels.append(label_id)
print('Found %s texts.' % len(texts))
# finally, vectorize the text samples into a 2D integer tensor
# (rows = samples, columns = word indices padded/truncated to MAX_SEQUENCE_LENGTH)
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
# one-hot encode the integer label ids
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
# split the data into a training set and a validation set
# (shuffle first so the split is not biased by directory/category order;
# note np.random is not seeded, so the split differs between runs)
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices].astype(np.float32)
labels = labels[indices].astype(np.float32)
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
# last VALIDATION_SPLIT fraction becomes the validation set
x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]
print('Preparing embedding matrix.')
# Build the embedding matrix: row i holds the GloVe vector of the word whose
# tokenizer index is i. Words missing from GloVe keep an all-zero row.
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for token, idx in word_index.items():
    if idx < MAX_NUM_WORDS:
        vector = embeddings_index.get(token)
        if vector is not None:
            embedding_matrix[idx] = vector

# Load the pre-trained word embeddings into an Embedding layer;
# trainable=False keeps the embeddings frozen during training.
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
print('Training model.')
# A 1-D convnet with global max pooling over the padded word-index sequences.
# The input tensor is named PREDICT_INPUTS so the serving signature created
# later can address it by the same key.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32',
                       name=PREDICT_INPUTS)
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
# one softmax unit per newsgroup category
preds = Dense(len(labels_index), activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop',
              metrics=['acc'])
def keras_model_fn(hyperparameters):
    """Return the compiled Keras model (SageMaker model-building hook).

    The model is built once at module import time; ``hyperparameters`` is
    part of the SageMaker contract but is not used here.
    """
    return model
def serving_input_fn(hyperpameters):
    """Build the raw serving input receiver for SavedModel export.

    NOTE(review): the parameter name carries the original 'hyperpameters'
    typo; it is kept so the externally-visible signature is unchanged, and
    it is unused inside the function.
    """
    placeholder = tf.placeholder(tf.int32, shape=[None, MAX_SEQUENCE_LENGTH])
    return build_raw_serving_input_receiver_fn({PREDICT_INPUTS: placeholder})
def train_input_fn(training_dir, hyperpameters):
    """Return an input_fn that feeds the in-memory training split."""
    features = {PREDICT_INPUTS: x_train}
    return numpy_input_fn(x=features, y=y_train, shuffle=False)
def eval_input_fn(training_dir, hyperpameters):
    """Return an input_fn that feeds the in-memory validation split."""
    features = {PREDICT_INPUTS: x_val}
    return numpy_input_fn(x=features, y=y_val, shuffle=False)
if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.INFO)

    # Wrap the compiled Keras model as a tf.estimator.Estimator.
    estimator = tf.keras.estimator.model_to_estimator(keras_model=model)
    export_strategy = saved_model_export_utils.make_export_strategy(
        serving_input_fn=serving_input_fn(None),
        default_output_alternative_key=None,
        exports_to_keep=1)

    def experiment_fn(output_dir):
        # Ties the estimator together with the train/eval input fns and the
        # SavedModel export strategy for learn_runner.
        return Experiment(estimator=estimator,
                          train_input_fn=train_input_fn(None, None),
                          eval_input_fn=eval_input_fn(None, None),
                          export_strategies=[export_strategy])

    learn_runner.run(experiment_fn, '.')
import tensorflow as tf
from sagemaker import RealTimePredictor
from sagemaker.tensorflow.predictor import tf_deserializer, tf_serializer
from tensorflow.python.saved_model.signature_constants import DEFAULT_SERVING_SIGNATURE_DEF_KEY
from tensorflow_serving.apis import predict_pb2
def _create_feature(v):
    """Wrap a plain Python value in the matching ``tf.train.Feature``.

    Supports exactly ``int``, ``str`` and ``float`` (strict ``type`` checks,
    so subclasses such as ``bool`` are rejected, mirroring the original
    behavior); raises ``ValueError`` for anything else.
    """
    if type(v) == int:
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[v]))
    if type(v) == str:
        # BytesList requires bytes: encode so this also works on Python 3,
        # where passing a str would raise a TypeError.
        return tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[v.encode('utf-8')]))
    if type(v) == float:
        return tf.train.Feature(float_list=tf.train.FloatList(value=[v]))
    raise ValueError('invalid type: %r' % type(v))
def _create_tensor_proto():
    """Serialize a hard-coded ``tf.train.Example`` into a TensorProto.

    Returns a 1-element string TensorProto wrapping the serialized Example,
    suitable for the 'examples' input of a TF Serving PredictRequest.
    """
    data = {'account_age': 12.0, 'average_usage': 4.923076923,
            'n_regions': 2.0, 'n_success_bills': 0.0,
            'total_success_billing': 0.0}
    features = {k: _create_feature(v) for k, v in data.items()}
    # NOTE: a PredictRequest was previously built and populated here but
    # never used (the __main__ block constructs its own); removed as dead code.
    example = tf.train.Example(features=tf.train.Features(feature=features))
    return tf.make_tensor_proto(values=example.SerializeToString(),
                                shape=[1], dtype=tf.string)
if __name__ == '__main__':
    tensor_proto = _create_tensor_proto()

    endpoint = 'sagemaker-tensorflow-py2-cpu-2018-01-25-19-56-02-379'
    predictor = RealTimePredictor(endpoint_name=endpoint,
                                  deserializer=tf_deserializer,
                                  serializer=tf_serializer,
                                  content_type='application/octet-stream')

    # Build a PredictRequest against the default serving signature and attach
    # the serialized Example tensor under the 'examples' input key.
    request = predict_pb2.PredictRequest()
    request.model_spec.name = "generic_model"
    request.model_spec.signature_name = DEFAULT_SERVING_SIGNATURE_DEF_KEY
    request.inputs['examples'].CopyFrom(tensor_proto)

    result = predictor.predict(request)
    print(result)
import os
from sagemaker import s3_input
from sagemaker.tensorflow import TensorFlow
if __name__ == '__main__':
    # SageMaker TensorFlow estimator running the keras_embeddings.py script
    # in Pipe input mode on a single ml.c4.xlarge instance.
    estimator = TensorFlow(entry_point='keras_embeddings.py',
                           input_mode='Pipe',
                           role='SageMakerRole',
                           training_steps=10000,
                           evaluation_steps=100,
                           train_instance_count=1,
                           train_instance_type='ml.c4.xlarge')

    bucket_folder = 'my-bucket-name/data/keras-embeddings'
    # Two channels, 'training' and 'embedding', will be mounted in the
    # container under '/opt/ml/input/data':
    # https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-training-algo.html
    # #your-algorithms-training-algo-running-container-trainingdata
    channel = {'training': s3_input(os.path.join(bucket_folder, 'training')),
               'embedding': s3_input(os.path.join(bucket_folder, 'embedding'))}
    estimator.fit(channel)
@Harathi123
Copy link

Hi @mvsusp, it's a very nice implementation. I am searching for some examples for training and deploying a Keras model in SageMaker. I understood the 'keras_embeddings.py' and 'train_and_deploy.py' files. But I am unable to understand the 'predictor.py' file. If my understanding is correct, it is for deploying the fitted model.

My question is, can't we run the estimator.deploy() method after fitting the model and use the predictor.predict() method for inference?

And one more thing: here I didn't see where you deployed the model and created the endpoint, but it is used in the predictor method.

If you don't mind, can you please clear up my confusion... It would be really helpful...

Thanks so much,
Harathi

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment