def vec(w, D=50):
    """
    Converts a word to an embedding vector.

    Looks the word up in the module-level ``words_dict`` mapping and returns
    its entry as a NumPy array.

    Args:
        w: The word to look up.
        D: Length of the zero vector returned for an unknown word.

    Returns:
        ``np.array(words_dict[w])`` when ``w`` is a key of ``words_dict``,
        otherwise ``np.zeros(D)``.
    """
    try:
        return np.array(words_dict[w])
    except KeyError:
        # Only a missing vocabulary entry maps to zeros; any other failure
        # (e.g. ``words_dict`` not being loaded) should surface to the caller.
        return np.zeros(D)


def average_embedding(sentence, D=50):
    """
    Computes the unit-normalised average embedding of a sentence.

    The sentence is split on whitespace, each word is embedded with ``vec``,
    the vectors are averaged, and the average is scaled to unit L2 norm.

    Args:
        sentence: Whitespace-separated words.
        D: Embedding dimensionality; forwarded to ``vec`` so that the zero
            vectors used for unknown words have the same length as the
            accumulator.

    Returns:
        A NumPy array of length ``D``. It is all zeros for a sentence with no
        words, and left un-normalised when the average has (near-)zero norm.
    """
    words = sentence.split()

    # a sanity check: nothing to average
    if not words:
        return np.zeros(D)

    total_embeddings = np.zeros(D)
    for word in words:
        total_embeddings += vec(word, D)

    avg_embeddings = total_embeddings / len(words)

    # so that we are not dividing by zero
    norm = np.linalg.norm(avg_embeddings)
    if norm > 1e-10:
        return avg_embeddings / norm
    return avg_embeddings


# Characters that ``preprocessing`` replaces with a space.
# NOTE(review): the curly quotes “ and ’ are listed but their partners ” and ‘
# are not -- confirm whether that asymmetry is intentional.
_BAD_CHARS = '-.?;,!@#$%^&*()+/{}[]\\":\'“’'

# Translation table built once: every bad character maps to a single space.
_BAD_CHARS_TO_SPACE = str.maketrans(_BAD_CHARS, ' ' * len(_BAD_CHARS))


def preprocessing(sentence):
    """
    Preprocessing. Removes punctuation and stop words.

    Lower-cases the sentence, replaces each character of ``_BAD_CHARS`` with a
    space, splits on whitespace, and drops every word found in the
    module-level ``stopwords`` collection.

    Args:
        sentence: The raw input string.

    Returns:
        The remaining words joined by single spaces (possibly an empty
        string).
    """
    # One pass over the string instead of one ``replace`` per character;
    # ``split()`` already discards leading/trailing/repeated whitespace.
    all_words = sentence.lower().translate(_BAD_CHARS_TO_SPACE).split()

    # removing stop words
    filtered_sentence = [w for w in all_words if w not in stopwords]
    return ' '.join(filtered_sentence)