Skip to content

Instantly share code, notes, and snippets.

View phoebewong's full-sized avatar

Phoebe Wong phoebewong

View GitHub Profile
@phoebewong
phoebewong / predict.py
Created December 12, 2019 21:06
Recommend images using GloVe and USE embedding
def predict_glove_images(headline, k=2):
"""
Predicts the closest matching image caption given an article headline
Returns a list of image ids
"""
# pre-processes the headline
text_prep = preprocessing(headline)
# finding the average embedding for the headline
emb = average_embedding(text_prep)
@phoebewong
phoebewong / embedding_matrix_use.py
Created December 12, 2019 21:05
Create embedding matrix for all images using USE embedding
start_time = time.time()
# Encode every image caption with the USE encoder and store one 512-wide row
# per caption in a NumPy array.
captions = image_df.caption.values
n_captions = len(captions)  # hoisted: invariant across the loop
use_img_embedding = np.zeros((len(image_df), 512))
for i, text in enumerate(captions):
    # Periodic progress report; when this prints, captions 0..i-1 are done,
    # so `i` is the number of captions already encoded.
    if i % 100000 == 0 and i > 0:
        print(f'{i} out of {n_captions} done in {time.time() - start_time:.2f}s')
    # The caption is passed to the encoder as a one-element list.
    emb = use_encoder([text])
    use_img_embedding[i] = emb
# Report the number of captions processed rather than the last loop index:
# the index is one short of the total and is undefined when there are no captions.
print(f'{n_captions} out of {n_captions} done')
@phoebewong
phoebewong / embedding_matrix_glove.py
Created December 12, 2019 21:00
Create embedding matrix with GloVe embeddings
# image embeddings
start_time = time.time()
# saving the embeddings for all the image captions to a numpy array
image_embeddings = np.zeros(shape=(len(image_df), D))
for i, text in enumerate(image_df.caption.values):
if i % 100000 == 0 and i > 0:
print(f'{i} out of {len(image_df.caption.values)} done in {time.time() - start_time:.2f}s')
text_prep = preprocessing(text)
emb = average_embedding(text_prep)
@phoebewong
phoebewong / helper_glove.py
Created December 12, 2019 21:00
Helper functions for GloVe embeddings model
def vec(w, D=50):
    """
    Converts a word to an embedding vector.

    Looks the word up in the module-level ``words_dict`` and returns its
    embedding as a NumPy array.

    Args:
        w: the word (dictionary key) to look up.
        D: length of the zero vector returned for out-of-vocabulary words.

    Returns:
        ``np.array(words_dict[w])`` when the word is known, otherwise
        ``np.zeros(D)``.
    """
    try:
        return np.array(words_dict[w])
    # if the word is not in our vocabulary, we return zeros; only KeyError is
    # caught so that real problems (e.g. words_dict not loaded) are not hidden
    except KeyError:
        return np.zeros(D)
@phoebewong
phoebewong / load_data.py
Created December 12, 2019 20:58
Load news article headline and caption dataset
# loading the dataset
# The encoding is given explicitly so the file is decoded the same way on
# every platform instead of relying on the locale default.
with open('data/captioning_dataset.json', encoding='utf-8') as json_file:
    data = json.load(json_file)
@phoebewong
phoebewong / load_glove.py
Last active December 12, 2019 20:59
Load GloVe embeddings
# loading glove data file
# URL to download the GloVe embedding: https://nlp.stanford.edu/projects/glove/
D = 50  # embedding dimensionality; selects which glove.6B.<D>d.txt file is read
glove_data_file = f'data/glove.6B.{D}d.txt'
# keep_default_na=False stops pandas from converting vocabulary tokens such as
# "null", "nan" or "NA" into missing values, which would lose those words
# from the index. QUOTE_NONE keeps quote characters as ordinary tokens.
words = pd.read_csv(
    glove_data_file,
    sep=" ",
    index_col=0,
    header=None,
    quoting=csv.QUOTE_NONE,
    na_values=None,
    keep_default_na=False,
)
# creating a dictionary (word -> list of floats) for accessing words quickly
words_dict = dict(zip(words.index, words.values.tolist()))
print(f'Loaded {len(words_dict)} words from the GloVe file')
@phoebewong
phoebewong / data_cleaning.py
Created December 12, 2019 06:48
Data Cleaning
# Placeholder
@phoebewong
phoebewong / cosine_distance.py
Last active December 12, 2019 06:59
Calculate cosine similarity of embeddings
# import dependencies
import numpy as np
from sklearn.preprocessing import Normalizer
# Scale the headline embedding to unit norm (Normalizer defaults to L2), then
# flatten it to a vector for the matrix product below.
scaler = Normalizer()
normalized_headline = scaler.fit_transform(article_headline).flatten()
# Dot product of every image-caption embedding with the headline vector.
# With unit-norm inputs this is the cosine similarity (higher = closer).
# NOTE(review): the caption matrix is presumably already row-normalised,
# judging by its name - confirm where it is built.
raw_scores = np.dot(normalized_img_embedding_matrix, normalized_headline.T)
img_scores = np.asarray(raw_scores).flatten()
@phoebewong
phoebewong / load_use.py
Last active December 12, 2019 21:10
Load USE model
# import dependencies
# tensorflow>=2.0.0
# tensorflow_hub>=0.6.0
import tensorflow as tf
print(f'Tensorflow version {tf.__version__}') # should be 2.0.0 or greater
import tensorflow_hub as hub
# load pretrained USE
try:
# if hub.load() fails, download is available directly from url