Created
February 28, 2018 04:19
-
-
Save mvsusp/fee588819dd73300123e209298d8ee2c to your computer and use it in GitHub Desktop.
keras_embeddings
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from __future__ import absolute_import | |
| from __future__ import division | |
| from __future__ import print_function | |
| from __future__ import print_function | |
| import sys | |
| import numpy as np | |
| import os | |
| import tensorflow as tf | |
| from tensorflow.contrib.learn import Experiment | |
| from tensorflow.contrib.learn.python.learn import learn_runner | |
| from tensorflow.contrib.learn.python.learn.utils import saved_model_export_utils | |
| from tensorflow.python.estimator.export.export import build_raw_serving_input_receiver_fn | |
| from tensorflow.python.estimator.inputs.numpy_io import numpy_input_fn | |
| from tensorflow.python.keras._impl.keras.engine.topology import Input | |
| from tensorflow.python.keras._impl.keras.layers import Dense, Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D | |
| from tensorflow.python.keras._impl.keras.models import Model | |
| from tensorflow.python.keras._impl.keras.preprocessing.sequence import pad_sequences | |
| from tensorflow.python.keras._impl.keras.preprocessing.text import Tokenizer | |
| from tensorflow.python.keras._impl.keras.utils import to_categorical | |
| from tensorflow.python.saved_model.signature_constants import PREDICT_INPUTS | |
# Data inside the container is stored under this folder.
# https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-training-algo.html#your-algorithms-training-algo-running-container-trainingdata
SAGEMAKER_DATA_PATH = '/opt/ml/input/data'
# Presence of the SageMaker input path tells us whether we are running inside the container.
inside_sagemaker_container = os.path.exists(SAGEMAKER_DATA_PATH)
# allows the script to be executed both inside and outside the container
base_dir = SAGEMAKER_DATA_PATH if inside_sagemaker_container else 'data'
GLOVE_DIR = os.path.join(base_dir, 'embedding', 'glove.6B')  # pre-trained GloVe vectors channel
TEXT_DATA_DIR = os.path.join(base_dir, 'training', '20_newsgroup')  # 20 Newsgroups corpus channel
MAX_SEQUENCE_LENGTH = 1000  # tokens kept per document (pad/truncate target)
MAX_NUM_WORDS = 20000  # vocabulary size cap for the tokenizer
EMBEDDING_DIM = 100  # must match the 100-d GloVe file loaded below
VALIDATION_SPLIT = 0.2  # fraction of samples held out for validation
# first, build index mapping words in the embeddings set
# to their embedding vector
print('Indexing word vectors.')
embeddings_index = {}
# GloVe files are distributed as UTF-8 text; be explicit so the load does not
# depend on the platform's default locale encoding. Follows the same
# py2/py3-compatible pattern used for the corpus files below.
glove_open_args = {} if sys.version_info < (3,) else {'encoding': 'utf-8'}
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), **glove_open_args) as f:
    for line in f:
        # each line: "<word> <v1> <v2> ... <v100>"
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))
# second, prepare text samples and their labels
print('Processing text dataset')
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
# Each sub-directory of TEXT_DATA_DIR is one newsgroup, i.e. one class label.
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if os.path.isdir(path):
        label_id = len(labels_index)
        labels_index[name] = label_id
        for fname in sorted(os.listdir(path)):
            # message files in the 20-newsgroup corpus are named by numeric id
            if fname.isdigit():
                fpath = os.path.join(path, fname)
                # Python 3 needs an explicit encoding for these latin-1 files
                args = {} if sys.version_info < (3,) else {'encoding': 'latin-1'}
                with open(fpath, **args) as f:
                    t = f.read()
                    i = t.find('\n\n')  # skip header
                    if 0 < i:
                        t = t[i:]
                    texts.append(t)
                labels.append(label_id)
print('Found %s texts.' % len(texts))
# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
# cast up front so the estimator input fns below feed a uniform float32 dtype
data = data[indices].astype(np.float32)
labels = labels[indices].astype(np.float32)
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
# Use an explicit split index instead of negative slicing: with a tiny dataset
# num_validation_samples can be 0, and data[:-0] would be an EMPTY training set.
split_at = data.shape[0] - num_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]
print('Preparing embedding matrix.')
# prepare embedding matrix
# Row i holds the GloVe vector of the word with tokenizer index i
# (index 0 is reserved by the Keras tokenizer, hence the +1).
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    # indices beyond the vocabulary cap have no row in the matrix
    if i >= MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
print('Training model.')
# train a 1D convnet with global maxpooling
# The input layer is named PREDICT_INPUTS so the exported SavedModel's serving
# signature matches the tensor name fed by serving_input_fn below.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32', name=PREDICT_INPUTS)
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)  # collapse the sequence axis to a fixed-size vector
x = Dense(128, activation='relu')(x)
preds = Dense(len(labels_index), activation='softmax')(x)  # one unit per newsgroup class
model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
def keras_model_fn(hyperparameters):
    """SageMaker entry-point hook: return the compiled Keras model built above.

    `hyperparameters` is part of the SageMaker contract but is not used here.
    """
    return model
def serving_input_fn(hyperparameters):
    """SageMaker entry-point hook: build the serving-input receiver fn.

    Exposes a single int32 placeholder of shape [batch, MAX_SEQUENCE_LENGTH]
    under the PREDICT_INPUTS key, matching the model's input layer name.
    (Fixes the misspelled `hyperpameters` parameter; the argument itself is
    unused, per the SageMaker contract.)
    """
    tensor = tf.placeholder(tf.int32, shape=[None, MAX_SEQUENCE_LENGTH])
    return build_raw_serving_input_receiver_fn({PREDICT_INPUTS: tensor})
def train_input_fn(training_dir, hyperparameters):
    """SageMaker entry-point hook: input fn feeding the pre-split training set.

    Fixes the misspelled `hyperpameters` parameter; both arguments are unused.
    NOTE(review): the data was shuffled once at load time above, but
    shuffle=False here means no per-epoch reshuffling during training —
    confirm this is intentional.
    """
    return numpy_input_fn(x={PREDICT_INPUTS: x_train}, y=y_train, shuffle=False)
def eval_input_fn(training_dir, hyperparameters):
    """SageMaker entry-point hook: input fn feeding the held-out validation set.

    Fixes the misspelled `hyperpameters` parameter; both arguments are unused.
    shuffle=False is correct for evaluation (deterministic order).
    """
    return numpy_input_fn(x={PREDICT_INPUTS: x_val}, y=y_val, shuffle=False)
if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.INFO)
    # Wrap the compiled Keras model as a tf.estimator.Estimator.
    estimator = tf.keras.estimator.model_to_estimator(keras_model=model)
    # NOTE(review): Experiment / learn_runner / make_export_strategy are
    # deprecated tf.contrib.learn APIs (TF 1.x only).
    # serving_input_fn(None) is invoked eagerly: its return value (a receiver
    # fn) is what make_export_strategy expects.
    export_strategy = saved_model_export_utils.make_export_strategy(serving_input_fn=serving_input_fn(None),
                                                                    default_output_alternative_key=None,
                                                                    exports_to_keep=1)

    def experiment_fn(output_dir):
        # The train/eval fns are also invoked eagerly here; numpy_input_fn
        # returns the callable input_fn that Experiment expects.
        return Experiment(estimator=estimator, train_input_fn=train_input_fn(None, None),
                          eval_input_fn=eval_input_fn(None, None), export_strategies=[export_strategy])

    # '.' is the output directory for checkpoints and exports.
    learn_runner.run(experiment_fn, '.')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import tensorflow as tf | |
| from sagemaker import RealTimePredictor | |
| from sagemaker.tensorflow.predictor import tf_deserializer, tf_serializer | |
| from tensorflow.python.saved_model.signature_constants import DEFAULT_SERVING_SIGNATURE_DEF_KEY | |
| from tensorflow_serving.apis import predict_pb2 | |
def _create_feature(v):
    """Convert a scalar Python value into the matching tf.train.Feature.

    int -> Int64List, str -> BytesList (UTF-8 encoded), float -> FloatList.

    Raises:
        ValueError: for any unsupported type.
    """
    # isinstance (not `type(v) ==`) so subclasses are accepted; note that
    # bool is an int subclass and is therefore stored as an int64 feature.
    if isinstance(v, int):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[v]))
    if isinstance(v, str):
        # BytesList requires bytes in Python 3; encode the text explicitly
        # (passing a str would raise a TypeError).
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[v.encode('utf-8')]))
    if isinstance(v, float):
        return tf.train.Feature(float_list=tf.train.FloatList(value=[v]))
    raise ValueError('invalid type')
def _create_tensor_proto():
    """Serialize one hard-coded example record into a 1-element string TensorProto.

    Builds a tf.train.Example from a fixed feature dict and returns it as a
    rank-1 string tensor proto, ready to be attached to a PredictRequest's
    'examples' input by the caller.
    """
    data = {'account_age': 12.0, 'average_usage': 4.923076923, 'n_regions': 2.0, 'n_success_bills': 0.0,
            'total_success_billing': 0.0}
    features = {k: _create_feature(v) for k, v in data.items()}
    # Dead code removed: the original also built a PredictRequest here that
    # was never used — the caller constructs its own request.
    example = tf.train.Example(features=tf.train.Features(feature=features))
    return tf.make_tensor_proto(values=example.SerializeToString(), shape=[1], dtype=tf.string)
if __name__ == '__main__':
    # Build the serialized Example tensor once, then send it to the endpoint.
    tensor_proto = _create_tensor_proto()
    # NOTE(review): hard-coded endpoint name — must match an endpoint already
    # deployed in the account/region where this script runs.
    endpoint = 'sagemaker-tensorflow-py2-cpu-2018-01-25-19-56-02-379'
    predictor = RealTimePredictor(endpoint_name=endpoint,
                                  deserializer=tf_deserializer,
                                  serializer=tf_serializer,
                                  content_type='application/octet-stream')
    # Assemble the PredictRequest that TensorFlow Serving expects and attach
    # the serialized example under the 'examples' input key.
    request = predict_pb2.PredictRequest()
    request.model_spec.name = "generic_model"
    request.model_spec.signature_name = DEFAULT_SERVING_SIGNATURE_DEF_KEY
    request.inputs['examples'].CopyFrom(tensor_proto)
    result = predictor.predict(request)
    print(result)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| from sagemaker import s3_input | |
| from sagemaker.tensorflow import TensorFlow | |
if __name__ == '__main__':
    # Launch a SageMaker training job that runs keras_embeddings.py inside
    # the TensorFlow container.
    estimator = TensorFlow(entry_point='keras_embeddings.py',
                           input_mode='Pipe',
                           role='SageMakerRole',
                           training_steps=10000,
                           evaluation_steps=100,
                           train_instance_count=1,
                           train_instance_type='ml.c4.xlarge')
    bucket_folder = 'my-bucket-name/data/keras-embeddings'
    # two folders, training and embedding, will be created in the container in
    # '/opt/ml/input/data'
    # https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-training-algo.html
    # #your-algorithms-training-algo-running-container-trainingdata
    # S3 keys always use '/', so join explicitly instead of os.path.join,
    # which would produce '\' separators on Windows.
    # NOTE(review): s3_input normally expects a full 's3://bucket/prefix' URI;
    # confirm the missing scheme here is intentional.
    channel = {'training': s3_input(bucket_folder + '/training'),
               'embedding': s3_input(bucket_folder + '/embedding')}
    estimator.fit(channel)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi @mvsusp, it's a very nice implementation. I am searching for examples of training and deploying a Keras model in SageMaker. I understood the 'keras_embeddings.py' and 'train_and_deploy.py' files, but I am unable to understand the 'predictor.py' file. If my understanding is correct, it is for making predictions against the deployed model.
My question is: can't we run the estimator.deploy() method after fitting the model and then use the predictor.predict() method for inference?
One more thing: I didn't see where you deployed the model and created the endpoint, yet an endpoint name is used in the predictor.
If you don't mind, could you please clear up my confusion? It would be really helpful.
Thanks so much,
Harathi
Thanks so much,
Harathi