Created
February 28, 2018 04:19
-
-
Save mvsusp/fee588819dd73300123e209298d8ee2c to your computer and use it in GitHub Desktop.
keras_embeddings
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from __future__ import absolute_import | |
| from __future__ import division | |
| from __future__ import print_function | |
| from __future__ import print_function | |
| import sys | |
| import numpy as np | |
| import os | |
| import tensorflow as tf | |
| from tensorflow.contrib.learn import Experiment | |
| from tensorflow.contrib.learn.python.learn import learn_runner | |
| from tensorflow.contrib.learn.python.learn.utils import saved_model_export_utils | |
| from tensorflow.python.estimator.export.export import build_raw_serving_input_receiver_fn | |
| from tensorflow.python.estimator.inputs.numpy_io import numpy_input_fn | |
| from tensorflow.python.keras._impl.keras.engine.topology import Input | |
| from tensorflow.python.keras._impl.keras.layers import Dense, Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D | |
| from tensorflow.python.keras._impl.keras.models import Model | |
| from tensorflow.python.keras._impl.keras.preprocessing.sequence import pad_sequences | |
| from tensorflow.python.keras._impl.keras.preprocessing.text import Tokenizer | |
| from tensorflow.python.keras._impl.keras.utils import to_categorical | |
| from tensorflow.python.saved_model.signature_constants import PREDICT_INPUTS | |
# Data inside the container is stored under this folder.
# https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-training-algo.html#your-algorithms-training-algo-running-container-trainingdata
SAGEMAKER_DATA_PATH = '/opt/ml/input/data'
# Presence of the SageMaker input path tells us whether we are running inside the container.
inside_sagemaker_container = os.path.exists(SAGEMAKER_DATA_PATH)
# allows the script to be executed both inside and outside the container
base_dir = SAGEMAKER_DATA_PATH if inside_sagemaker_container else 'data'
GLOVE_DIR = os.path.join(base_dir, 'embedding', 'glove.6B')  # pre-trained GloVe vectors channel
TEXT_DATA_DIR = os.path.join(base_dir, 'training', '20_newsgroup')  # 20 Newsgroups corpus channel
MAX_SEQUENCE_LENGTH = 1000  # tokens kept per document (pad/truncate target)
MAX_NUM_WORDS = 20000  # vocabulary size cap for the tokenizer
EMBEDDING_DIM = 100  # must match the 100-d GloVe file loaded below
VALIDATION_SPLIT = 0.2  # fraction of samples held out for validation
# first, build index mapping words in the embeddings set
# to their embedding vector
print('Indexing word vectors.')
embeddings_index = {}
# GloVe files are distributed as UTF-8 text; be explicit so the load does not
# depend on the platform's default locale encoding. Follows the same
# py2/py3-compatible pattern used for the corpus files below.
glove_open_args = {} if sys.version_info < (3,) else {'encoding': 'utf-8'}
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), **glove_open_args) as f:
    for line in f:
        # each line: "<word> <v1> <v2> ... <v100>"
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))
# second, prepare text samples and their labels
print('Processing text dataset')
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
# Each sub-directory of TEXT_DATA_DIR is one newsgroup, i.e. one class label.
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if os.path.isdir(path):
        label_id = len(labels_index)
        labels_index[name] = label_id
        for fname in sorted(os.listdir(path)):
            # message files in the 20-newsgroup corpus are named by numeric id
            if fname.isdigit():
                fpath = os.path.join(path, fname)
                # Python 3 needs an explicit encoding for these latin-1 files
                args = {} if sys.version_info < (3,) else {'encoding': 'latin-1'}
                with open(fpath, **args) as f:
                    t = f.read()
                    i = t.find('\n\n')  # skip header
                    if 0 < i:
                        t = t[i:]
                    texts.append(t)
                labels.append(label_id)
print('Found %s texts.' % len(texts))
# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
# cast up front so the estimator input fns below feed a uniform float32 dtype
data = data[indices].astype(np.float32)
labels = labels[indices].astype(np.float32)
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
# Use an explicit split index instead of negative slicing: with a tiny dataset
# num_validation_samples can be 0, and data[:-0] would be an EMPTY training set.
split_at = data.shape[0] - num_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]
print('Preparing embedding matrix.')
# prepare embedding matrix
# Row i holds the GloVe vector of the word with tokenizer index i
# (index 0 is reserved by the Keras tokenizer, hence the +1).
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    # indices beyond the vocabulary cap have no row in the matrix
    if i >= MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
print('Training model.')
# train a 1D convnet with global maxpooling
# The input layer is named PREDICT_INPUTS so the exported SavedModel's serving
# signature matches the tensor name fed by serving_input_fn below.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32', name=PREDICT_INPUTS)
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)  # collapse the sequence axis to a fixed-size vector
x = Dense(128, activation='relu')(x)
preds = Dense(len(labels_index), activation='softmax')(x)  # one unit per newsgroup class
model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
def keras_model_fn(hyperparameters):
    """SageMaker entry-point hook: return the compiled Keras model built above.

    `hyperparameters` is part of the SageMaker contract but is not used here.
    """
    return model
def serving_input_fn(hyperparameters):
    """SageMaker entry-point hook: build the serving-input receiver fn.

    Exposes a single int32 placeholder of shape [batch, MAX_SEQUENCE_LENGTH]
    under the PREDICT_INPUTS key, matching the model's input layer name.
    (Fixes the misspelled `hyperpameters` parameter; the argument itself is
    unused, per the SageMaker contract.)
    """
    tensor = tf.placeholder(tf.int32, shape=[None, MAX_SEQUENCE_LENGTH])
    return build_raw_serving_input_receiver_fn({PREDICT_INPUTS: tensor})
def train_input_fn(training_dir, hyperparameters):
    """SageMaker entry-point hook: input fn feeding the pre-split training set.

    Fixes the misspelled `hyperpameters` parameter; both arguments are unused.
    NOTE(review): the data was shuffled once at load time above, but
    shuffle=False here means no per-epoch reshuffling during training —
    confirm this is intentional.
    """
    return numpy_input_fn(x={PREDICT_INPUTS: x_train}, y=y_train, shuffle=False)
def eval_input_fn(training_dir, hyperparameters):
    """SageMaker entry-point hook: input fn feeding the held-out validation set.

    Fixes the misspelled `hyperpameters` parameter; both arguments are unused.
    shuffle=False is correct for evaluation (deterministic order).
    """
    return numpy_input_fn(x={PREDICT_INPUTS: x_val}, y=y_val, shuffle=False)
if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.INFO)
    # Wrap the compiled Keras model as a tf.estimator.Estimator.
    estimator = tf.keras.estimator.model_to_estimator(keras_model=model)
    # NOTE(review): Experiment / learn_runner / make_export_strategy are
    # deprecated tf.contrib.learn APIs (TF 1.x only).
    # serving_input_fn(None) is invoked eagerly: its return value (a receiver
    # fn) is what make_export_strategy expects.
    export_strategy = saved_model_export_utils.make_export_strategy(serving_input_fn=serving_input_fn(None),
                                                                    default_output_alternative_key=None,
                                                                    exports_to_keep=1)

    def experiment_fn(output_dir):
        # The train/eval fns are also invoked eagerly here; numpy_input_fn
        # returns the callable input_fn that Experiment expects.
        return Experiment(estimator=estimator, train_input_fn=train_input_fn(None, None),
                          eval_input_fn=eval_input_fn(None, None), export_strategies=[export_strategy])

    # '.' is the output directory for checkpoints and exports.
    learn_runner.run(experiment_fn, '.')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import tensorflow as tf | |
| from sagemaker import RealTimePredictor | |
| from sagemaker.tensorflow.predictor import tf_deserializer, tf_serializer | |
| from tensorflow.python.saved_model.signature_constants import DEFAULT_SERVING_SIGNATURE_DEF_KEY | |
| from tensorflow_serving.apis import predict_pb2 | |
def _create_feature(v):
    """Convert a scalar Python value into the matching tf.train.Feature.

    int -> Int64List, str -> BytesList (UTF-8 encoded), float -> FloatList.

    Raises:
        ValueError: for any unsupported type.
    """
    # isinstance (not `type(v) ==`) so subclasses are accepted; note that
    # bool is an int subclass and is therefore stored as an int64 feature.
    if isinstance(v, int):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[v]))
    if isinstance(v, str):
        # BytesList requires bytes in Python 3; encode the text explicitly
        # (passing a str would raise a TypeError).
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[v.encode('utf-8')]))
    if isinstance(v, float):
        return tf.train.Feature(float_list=tf.train.FloatList(value=[v]))
    raise ValueError('invalid type')
def _create_tensor_proto():
    """Serialize one hard-coded example record into a 1-element string TensorProto.

    Builds a tf.train.Example from a fixed feature dict and returns it as a
    rank-1 string tensor proto, ready to be attached to a PredictRequest's
    'examples' input by the caller.
    """
    data = {'account_age': 12.0, 'average_usage': 4.923076923, 'n_regions': 2.0, 'n_success_bills': 0.0,
            'total_success_billing': 0.0}
    features = {k: _create_feature(v) for k, v in data.items()}
    # Dead code removed: the original also built a PredictRequest here that
    # was never used — the caller constructs its own request.
    example = tf.train.Example(features=tf.train.Features(feature=features))
    return tf.make_tensor_proto(values=example.SerializeToString(), shape=[1], dtype=tf.string)
if __name__ == '__main__':
    # Build the serialized Example tensor once, then send it to the endpoint.
    tensor_proto = _create_tensor_proto()
    # NOTE(review): hard-coded endpoint name — must match an endpoint already
    # deployed in the account/region where this script runs.
    endpoint = 'sagemaker-tensorflow-py2-cpu-2018-01-25-19-56-02-379'
    predictor = RealTimePredictor(endpoint_name=endpoint,
                                  deserializer=tf_deserializer,
                                  serializer=tf_serializer,
                                  content_type='application/octet-stream')
    # Assemble the PredictRequest that TensorFlow Serving expects and attach
    # the serialized example under the 'examples' input key.
    request = predict_pb2.PredictRequest()
    request.model_spec.name = "generic_model"
    request.model_spec.signature_name = DEFAULT_SERVING_SIGNATURE_DEF_KEY
    request.inputs['examples'].CopyFrom(tensor_proto)
    result = predictor.predict(request)
    print(result)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| from sagemaker import s3_input | |
| from sagemaker.tensorflow import TensorFlow | |
if __name__ == '__main__':
    # Launch a SageMaker training job that runs keras_embeddings.py inside
    # the TensorFlow container.
    estimator = TensorFlow(entry_point='keras_embeddings.py',
                           input_mode='Pipe',
                           role='SageMakerRole',
                           training_steps=10000,
                           evaluation_steps=100,
                           train_instance_count=1,
                           train_instance_type='ml.c4.xlarge')
    bucket_folder = 'my-bucket-name/data/keras-embeddings'
    # two folders, training and embedding, will be created in the container in
    # '/opt/ml/input/data'
    # https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-training-algo.html
    # #your-algorithms-training-algo-running-container-trainingdata
    # S3 keys always use '/', so join explicitly instead of os.path.join,
    # which would produce '\' separators on Windows.
    # NOTE(review): s3_input normally expects a full 's3://bucket/prefix' URI;
    # confirm the missing scheme here is intentional.
    channel = {'training': s3_input(bucket_folder + '/training'),
               'embedding': s3_input(bucket_folder + '/embedding')}
    estimator.fit(channel)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi @mvsusp, it's a very nice implementation. I am searching for examples of training and deploying a Keras model in SageMaker. I understood the 'keras_embeddings.py' and 'train_and_deploy.py' files, but I am unable to understand the 'predictor.py' file. If my understanding is correct, it is for making predictions against the deployed model.
My question is: can't we run the estimator.deploy() method after fitting the model and then use the predictor.predict() method for inference?
One more thing: I didn't see where you deployed the model and created the endpoint, yet an endpoint name is used in the predictor.
If you don't mind, could you please clear up my confusion? It would be really helpful.
Thanks so much,
Harathi
Thanks so much,
Harathi