Skip to content

Instantly share code, notes, and snippets.

@phoebewong
Created December 12, 2019 21:00
Show Gist options
  • Select an option

  • Save phoebewong/dfa3f90960b4074d7c820c5f7598f944 to your computer and use it in GitHub Desktop.

Select an option

Save phoebewong/dfa3f90960b4074d7c820c5f7598f944 to your computer and use it in GitHub Desktop.
Helper functions for GloVe embeddings model
def vec(w, D=50):
    """
    Converts a word to an embedding vector.

    The word is looked up in the module-level ``words_dict`` mapping
    (presumably word -> GloVe vector; confirm against the loader) and the
    stored value is returned as a NumPy array.

    Parameters
    ----------
    w : the word to look up.
    D : length of the zero vector returned for an unknown word. It is only
        used for the fallback; a word that is found is returned with
        whatever length ``words_dict`` stores for it.

    Returns
    -------
    ``np.array(words_dict[w])`` if ``w`` is a key of ``words_dict``,
    otherwise ``np.zeros(D)``.
    """
    try:
        return np.array(words_dict[w])
    # Only an out-of-vocabulary word maps to zeros. Any other failure (for
    # example ``words_dict`` never having been loaded) is allowed to surface
    # instead of silently producing all-zero embeddings.
    except KeyError:
        return np.zeros(D)
def average_embedding(sentence, D=50):
    """
    Computes the average embedding of a sentence.

    The sentence is split on whitespace, every token is converted with
    ``vec`` and the resulting vectors are averaged. The average is then
    scaled to unit Euclidean length.

    Parameters
    ----------
    sentence : whitespace-separated string of words.
    D : dimensionality of the embeddings. It sizes the accumulator and is
        forwarded to ``vec`` so that the zero vector used for unknown words
        has the same length.

    Returns
    -------
    A NumPy array of length ``D``:
      * all zeros when the sentence contains no tokens;
      * the un-normalised average when its norm is not above ``1e-10``
        (e.g. every token was unknown);
      * otherwise the average divided by its norm.
    """
    words = sentence.split()
    total = np.zeros(D)
    # a sanity check: nothing to average
    if not words:
        return total
    # summing the embedding of each word
    for word in words:
        # D must be passed on: without it an unknown word yields a 50-dim
        # zero vector and the addition fails whenever D != 50.
        total += vec(word, D)
    # averaging the embeddings (unknown words still count in the divisor)
    avg = total / len(words)
    # normalise only when the norm is large enough to divide by safely
    norm = np.linalg.norm(avg)
    if norm > 1e-10:
        return avg / norm
    return avg
# Characters treated as punctuation. Each one is turned into a space (not
# deleted) so the words on either side remain separate tokens. Both halves of
# the curly quote pairs are listed so that “word” and ‘word’ are fully cleaned.
_BAD_CHARS = '-.?;,!@#$%^&*()+/{}[]\\":\'“’”‘'
_BAD_CHARS_TO_SPACE = str.maketrans(_BAD_CHARS, ' ' * len(_BAD_CHARS))


def preprocessing(sentence):
    """
    Preprocessing. Removes punctuation and stop words.

    The sentence is lower-cased, every character in ``_BAD_CHARS`` is
    replaced by a space, the result is split on whitespace, and tokens that
    are members of the module-level ``stopwords`` collection are dropped.

    Parameters
    ----------
    sentence : the raw input string.

    Returns
    -------
    The remaining tokens joined by single spaces (an empty string when
    nothing is left).
    """
    # lower-casing and blanking out punctuation in a single pass; surrounding
    # and repeated whitespace is discarded by split() below
    cleaned = sentence.lower().translate(_BAD_CHARS_TO_SPACE)
    # removing stop words
    return ' '.join(w for w in cleaned.split() if w not in stopwords)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment