Phoebe Wong phoebewong

Data Scientist | Contributor to qualtRics R pkg #Harvard #UCBerkeley #rstats #Python #NLP #Statistics

phoebewong / predict.py

Created December 12, 2019 21:06

Recommend images using GloVe and USE embedding

	def predict_glove_images(headline, k=2):
	"""
	Predicts the closest matching image caption given an article headline
	Returns a list of image ids
	"""
	# pre-processes the headline
	text_prep = preprocessing(headline)

	# finding the average embedding for the headline
	emb = average_embedding(text_prep)

phoebewong / embedding_matrix_use.py

Created December 12, 2019 21:05

Create embedding matrix for all images using USE embedding

	start_time = time.time()

	# saving the use embeddings for all the image captions to a numpy array
	# (one 512-wide row per caption, filled in by the loop below)
	n_captions = len(image_df)
	use_img_embedding = np.zeros((n_captions, 512))
	for i, text in enumerate(image_df.caption.values):
		# progress report every 100,000 captions (skipping the very first one)
		if i % 100000 == 0 and i > 0:
			print(f'{i} out of {n_captions} done in {time.time() - start_time:.2f}s')
		# the encoder is called with a single-element batch; its output fills row i
		emb = use_encoder([text])
		use_img_embedding[i] = emb
	# report the number of captions processed rather than the last loop index,
	# which was off by one and undefined when there are no captions at all
	print(f'{n_captions} out of {n_captions} done')

phoebewong / embedding_matrix_glove.py

Created December 12, 2019 21:00

Create embedding matrix with GloVe embeddings

	# image embeddings
	start_time = time.time()

	# saving the embeddings for all the image captions to a numpy array
	image_embeddings = np.zeros(shape=(len(image_df), D))
	for i, text in enumerate(image_df.caption.values):
	if i % 100000 == 0 and i > 0:
	print(f'{i} out of {len(image_df.caption.values)} done in {time.time() - start_time:.2f}s')
	text_prep = preprocessing(text)
	emb = average_embedding(text_prep)

phoebewong / helper_glove.py

Created December 12, 2019 21:00

Helper functions for GloVe embeddings model

	def vec(w, D=50):
		"""
		Converts a word to an embedding vector

		Looks `w` up in the module-level `words_dict` mapping and returns the
		stored values as a numpy array.

		w: the word to look up
		D: length of the zero vector returned when `w` has no entry
		   NOTE(review): presumably must match the dimensionality of the
		   loaded embeddings - confirm against the loading code

		Returns np.array(words_dict[w]), or np.zeros(D) when the lookup fails
		"""
		try:
			return np.array(words_dict[w])
		# if the word is not in our vocabulary (KeyError) or cannot be used as a
		# dictionary key at all (TypeError), we return zeros; anything else
		# (e.g. KeyboardInterrupt) is no longer silently swallowed
		except (KeyError, TypeError):
			return np.zeros(D)

phoebewong / load_data.py

Created December 12, 2019 20:58

Load news article headline and caption dataset

	# loading the dataset
	# the encoding is set explicitly so that decoding the JSON text does not
	# depend on the platform's locale default
	with open('data/captioning_dataset.json', encoding='utf-8') as json_file:
		data = json.load(json_file)

phoebewong / load_glove.py

Last active December 12, 2019 20:59

Load GloVe embeddings

	# loading glove data file
	# URL to download the GloVe embeddings: https://nlp.stanford.edu/projects/glove/
	D = 50
	glove_data_file = f'data/glove.6B.{D}d.txt'
	# each line is "<word> <v1> ... <vD>" separated by single spaces:
	#   - index_col=0 makes the word the row label
	#   - QUOTE_NONE treats quote characters as ordinary text
	#   - keep_default_na=False stops pandas from turning words that look like
	#     missing-value markers (e.g. "null", "nan") into NaN row labels
	words = pd.read_csv(
		glove_data_file,
		sep=" ",
		index_col=0,
		header=None,
		quoting=csv.QUOTE_NONE,
		keep_default_na=False,
	)

	# creating a dictionary for accessing words quickly
	# (word -> list of the values on that word's row)
	words_dict = dict(zip(words.index, words.values.tolist()))
	print(f'Loaded {len(words_dict)} words from the GloVe file')

phoebewong / data_cleaning.py

Created December 12, 2019 06:48

Data Cleaning

# Placeholder

phoebewong / cosine_distance.py

Last active December 12, 2019 06:59

Calculate cosine similarity scores between embeddings

	# import dependencies
	import numpy as np
	from sklearn.preprocessing import Normalizer


	# normalize the headline embedding with sklearn's Normalizer (default settings)
	# and flatten the result to 1-D for the matrix-vector product below
	# NOTE(review): assumes article_headline is a 2-D array holding a single row - confirm
	normalized_headline = Normalizer().fit_transform(article_headline).flatten()
	# dot product between the input article headline and every row of the image
	# caption embedding matrix; for unit-length vectors this is the cosine
	# *similarity* (higher = closer match), not a distance
	# NOTE(review): presumably normalized_img_embedding_matrix rows are already
	# normalized the same way - verify where the matrix is built
	img_scores = np.asarray(np.dot(normalized_img_embedding_matrix, normalized_headline.T)).flatten()

phoebewong / load_use.py

Last active December 12, 2019 21:10

Load USE model

	# import dependencies
	# tensorflow>=2.0.0
	# tensorflow_hub>=0.6.0
	import tensorflow as tf
	print(f'Tensorflow version {tf.__version__}') # should be 2.0.0 or greater
	import tensorflow_hub as hub

	# load pretrained USE
	try:
	# if hub.load() fails, download is available directly from url