Created
December 12, 2019 21:00
-
-
Save phoebewong/dfa3f90960b4074d7c820c5f7598f944 to your computer and use it in GitHub Desktop.
Helper functions for GloVe embeddings model
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def vec(w, D=50):
    """
    Converts a word to an embedding vector.

    Looks ``w`` up in the module-level ``words_dict`` (defined elsewhere;
    presumably a mapping from word to its embedding values -- confirm
    against the loader) and returns the stored values as a NumPy array.

    Parameters
    ----------
    w : hashable
        The word to look up.
    D : int, optional
        Length of the zero vector returned for an unknown word. It does
        not affect the vector returned for a known word, whose length is
        whatever ``words_dict`` stores.

    Returns
    -------
    numpy.ndarray
        The embedding of ``w``, or ``np.zeros(D)`` if ``w`` is not a key
        of ``words_dict``.
    """
    try:
        return np.array(words_dict[w])
    except KeyError:
        # Only the out-of-vocabulary case falls back to zeros; any other
        # failure (e.g. ``words_dict`` not loaded) should surface rather
        # than be silently turned into a zero vector.
        return np.zeros(D)
def average_embedding(sentence, D=50):
    """
    Computes the average embedding of a sentence.

    The sentence is split on whitespace, each word is embedded with
    ``vec``, and the embeddings are averaged. The average is then scaled
    to unit Euclidean norm unless its norm is (near) zero.

    Parameters
    ----------
    sentence : str
        Whitespace-separated words.
    D : int, optional
        Embedding dimension. Used for the accumulator and forwarded to
        ``vec`` so that the zero vector returned for unknown words has a
        matching length.

    Returns
    -------
    numpy.ndarray
        A vector of length ``D``: all zeros for a sentence with no words,
        otherwise the averaged embedding, normalised when its norm
        exceeds ``1e-10``.
    """
    words = sentence.split()

    # a sanity check: nothing to average
    if not words:
        return np.zeros(D)

    # summing the embedding of each word
    total_embeddings = np.zeros(D)
    for word in words:
        # D must be passed through, otherwise an unknown word yields a
        # 50-long zero vector that cannot be added when D != 50.
        total_embeddings += vec(word, D)

    # averaging the embeddings
    avg_embeddings = total_embeddings / len(words)

    # normalising, but only when that does not divide by (almost) zero
    norm = np.linalg.norm(avg_embeddings)
    if norm > 1e-10:
        return avg_embeddings / norm
    return avg_embeddings
def preprocessing(sentence):
    """
    Preprocessing. Removes punctuation and stop words.

    The sentence is lower-cased, every punctuation character listed
    below is replaced by a space, the result is split on whitespace, and
    words found in the module-level ``stopwords`` collection (defined
    elsewhere) are dropped.

    Parameters
    ----------
    sentence : str
        Raw input text.

    Returns
    -------
    str
        The remaining words joined by single spaces; an empty string if
        no word survives.
    """
    # Punctuation treated as word separators. The trailing escapes are
    # the curly quotes: U+201C and U+201D (double), U+2018 and U+2019
    # (single). Both halves of each pair are listed so that a closing or
    # opening quote is never left attached to a word.
    bad_chars = '-.?;,!@#$%^&*()+/{}[]\\":\'\u201c\u201d\u2018\u2019'

    # One translation pass replaces every listed character by a space;
    # split() then discards leading, trailing and repeated whitespace.
    to_space = str.maketrans(bad_chars, ' ' * len(bad_chars))
    all_words = sentence.lower().translate(to_space).split()

    # removing stop words
    filtered_sentence = [w for w in all_words if w not in stopwords]
    return ' '.join(filtered_sentence)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment