This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def predict_glove_images(headline, k=2): | |
| """ | |
| Predicts the closest matching image caption given an article headline | |
| Returns a list of image ids | |
| """ | |
| # pre-processes the headline | |
| text_prep = preprocessing(headline) | |
| # finding the average embedding for the headline | |
| emb = average_embedding(text_prep) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
start_time = time.time()
# saving the use embeddings for all the image captions to a numpy array
# (one row per caption, 512 columns)
n_captions = len(image_df.caption.values)
use_img_embedding = np.zeros((len(image_df), 512))
for i, text in enumerate(image_df.caption.values):
    # periodic progress report, every 100000 captions
    if i % 100000 == 0 and i > 0:
        print(f'{i} out of {n_captions} done in {time.time() - start_time:.2f}s')
    # the encoder is called with a one-element batch holding this caption;
    # presumably it yields a single 512-d embedding -- confirm against use_encoder
    emb = use_encoder([text])
    use_img_embedding[i] = emb
# report the number of captions processed; the loop index `i` would be one
# short of the total here and undefined when there are no captions at all
print(f'{n_captions} out of {n_captions} done')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # image embeddings | |
| start_time = time.time() | |
| # saving the embeddings for all the image captions to a numpy array | |
| image_embeddings = np.zeros(shape=(len(image_df), D)) | |
| for i, text in enumerate(image_df.caption.values): | |
| if i % 100000 == 0 and i > 0: | |
| print(f'{i} out of {len(image_df.caption.values)} done in {time.time() - start_time:.2f}s') | |
| text_prep = preprocessing(text) | |
| emb = average_embedding(text_prep) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def vec(w, D=50):
    """
    Converts a word to an embedding vector.

    Looks `w` up in the module-level `words_dict` and returns the stored
    embedding as a numpy array. If `w` is not in the vocabulary, a zero
    vector of length `D` is returned instead.
    """
    try:
        return np.array(words_dict[w])
    # if the word is not in our vocabulary (or cannot be used as a key),
    # we return zeros; any other error, e.g. a missing `words_dict` or a
    # KeyboardInterrupt, is no longer silently swallowed
    except (KeyError, TypeError):
        return np.zeros(D)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# loading the dataset
# the encoding is pinned to UTF-8 (the standard JSON encoding) so the result
# does not depend on the platform's default text encoding
with open('data/captioning_dataset.json', encoding='utf-8') as json_file:
    data = json.load(json_file)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# loading glove data file
# URL to download the GloVe embedding: https://nlp.stanford.edu/projects/glove/
D = 50
glove_data_file = f'data/glove.6B.{D}d.txt'
# each line is a token followed by its space-separated vector components;
# QUOTE_NONE keeps quote characters as ordinary tokens, and
# keep_default_na=False stops tokens such as "null", "nan" or "n/a" from
# being parsed as missing values and collapsing into a NaN index entry
words = pd.read_csv(
    glove_data_file,
    sep=" ",
    index_col=0,
    header=None,
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
)
# creating a dictionary for accessing words quickly
words_dict = dict(zip(words.index, words.values.tolist()))
print(f'Loaded {len(words_dict)} words from the GloVe file')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Placeholder |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # import dependencies | |
| import numpy as np | |
| from sklearn.preprocessing import Normalizer | |
| # normalize the vector | |
| # flatten for matrix multiplication | |
| normalized_headline = Normalizer().fit_transform(article_headline).flatten() | |
| # compute cosine similarities between input article headline and all image captions | |
| img_scores = np.asarray(np.dot(normalized_img_embedding_matrix, normalized_headline.T)).flatten() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # import dependencies | |
| # tensorflow>=2.0.0 | |
| # tensorflow_hub>=0.6.0 | |
| import tensorflow as tf | |
| print(f'Tensorflow version {tf.__version__}') # should be 2.0.0 or greater | |
| import tensorflow_hub as hub | |
| # load pretrained USE | |
| try: | |
| # if hub.load() fails, download is available directly from url |