phoebewong · December 12, 2019 21:00
diff --git a/helper_glove.py b/helper_glove.py
 def vec(w, D=50):
     """
     Converts a word to an embedding vector.

     Looks `w` up in the module-level `words_dict` mapping and returns the
     stored value wrapped in a NumPy array. When the lookup fails, a zero
     vector of length `D` is returned instead.

     Parameters
     ----------
     w : the word used as the key into `words_dict`
     D : length of the zero vector returned when the lookup fails
         (default 50); it does not resize vectors found in `words_dict`

     Returns
     -------
     `np.array(words_dict[w])` on a hit, `np.zeros(D)` on a miss
     """
     try:
         return np.array(words_dict[w])
     # if the word is not in our vocabulary (or cannot be used as a key),
     # we return zeros; only lookup failures are caught so that unrelated
     # errors are not silently turned into zero vectors
     except (KeyError, TypeError):
         return np.zeros(D)

 def average_embedding(sentence, D=50):
     """
     Computes the average embedding of a sentence.

     The sentence is split on whitespace, every token is embedded with
     `vec`, the vectors are averaged, and the mean is rescaled to unit
     Euclidean norm.

     Parameters
     ----------
     sentence : string whose whitespace-separated tokens are embedded
     D : length of the accumulator vector (default 50); it is also passed
         to `vec` so that its zero-vector fallback has the same length

     Returns
     -------
     NumPy array of length `D`: all zeros when the sentence has no tokens,
     the unit-norm mean embedding otherwise, or the unscaled mean when its
     norm is not above 1e-10
     """
     words = sentence.split()
     total_embeddings = np.zeros(D)

     # a sanity check: nothing to average
     if not words:
         return total_embeddings

     # getting the embedding for each word
     for word in words:
         # D is forwarded so the fallback zero vector matches the
         # accumulator's length when a non-default D is used
         total_embeddings += vec(word, D)

     # averaging the embeddings
     avg_embeddings = total_embeddings / len(words)

     # so that we are not dividing by zero
     norm = np.linalg.norm(avg_embeddings)
     if norm > 1e-10:
         return avg_embeddings / norm
     return avg_embeddings

 def preprocessing(sentence):
     """
     Preprocessing. Removes punctuation and stop words.

     The sentence is lower-cased, every character listed in `bad_chars` is
     replaced by a space, the result is split on whitespace, and tokens
     found in the module-level `stopwords` collection are dropped.

     Parameters
     ----------
     sentence : the raw input string

     Returns
     -------
     The surviving tokens joined by single spaces (an empty string when
     nothing survives)
     """
     # making the sentence lower case; surrounding/extra whitespace is
     # discarded by split() below
     sentence = sentence.lower()

     # removing punctuation; both opening and closing curly quotes are
     # listed so neither stays attached to a word
     bad_chars = '-.?;,!@#$%^&*()+/{}[]\\":\'“”‘’'
     # one translate pass maps every listed character to a space
     sentence = sentence.translate(str.maketrans(dict.fromkeys(bad_chars, ' ')))
     all_words = sentence.split()

     # removing stop words
     filtered_sentence = [w for w in all_words if w not in stopwords]
     return ' '.join(filtered_sentence)
	def vec(w, D=50):
	    """
	    Converts a word to an embedding vector.

	    Looks `w` up in the module-level `words_dict` mapping and returns the
	    stored value wrapped in a NumPy array. When the lookup fails, a zero
	    vector of length `D` is returned instead.

	    Parameters
	    ----------
	    w : the word used as the key into `words_dict`
	    D : length of the zero vector returned when the lookup fails
	        (default 50); it does not resize vectors found in `words_dict`

	    Returns
	    -------
	    `np.array(words_dict[w])` on a hit, `np.zeros(D)` on a miss
	    """
	    try:
	        return np.array(words_dict[w])
	    # if the word is not in our vocabulary (or cannot be used as a key),
	    # we return zeros; only lookup failures are caught so that unrelated
	    # errors are not silently turned into zero vectors
	    except (KeyError, TypeError):
	        return np.zeros(D)

	def average_embedding(sentence, D=50):
	    """
	    Computes the average embedding of a sentence.

	    The sentence is split on whitespace, every token is embedded with
	    `vec`, the vectors are averaged, and the mean is rescaled to unit
	    Euclidean norm.

	    Parameters
	    ----------
	    sentence : string whose whitespace-separated tokens are embedded
	    D : length of the accumulator vector (default 50); it is also passed
	        to `vec` so that its zero-vector fallback has the same length

	    Returns
	    -------
	    NumPy array of length `D`: all zeros when the sentence has no tokens,
	    the unit-norm mean embedding otherwise, or the unscaled mean when its
	    norm is not above 1e-10
	    """
	    words = sentence.split()
	    total_embeddings = np.zeros(D)

	    # a sanity check: nothing to average
	    if not words:
	        return total_embeddings

	    # getting the embedding for each word
	    for word in words:
	        # D is forwarded so the fallback zero vector matches the
	        # accumulator's length when a non-default D is used
	        total_embeddings += vec(word, D)

	    # averaging the embeddings
	    avg_embeddings = total_embeddings / len(words)

	    # so that we are not dividing by zero
	    norm = np.linalg.norm(avg_embeddings)
	    if norm > 1e-10:
	        return avg_embeddings / norm
	    return avg_embeddings

	def preprocessing(sentence):
	    """
	    Preprocessing. Removes punctuation and stop words.

	    The sentence is lower-cased, every character listed in `bad_chars` is
	    replaced by a space, the result is split on whitespace, and tokens
	    found in the module-level `stopwords` collection are dropped.

	    Parameters
	    ----------
	    sentence : the raw input string

	    Returns
	    -------
	    The surviving tokens joined by single spaces (an empty string when
	    nothing survives)
	    """
	    # making the sentence lower case; surrounding/extra whitespace is
	    # discarded by split() below
	    sentence = sentence.lower()

	    # removing punctuation; both opening and closing curly quotes are
	    # listed so neither stays attached to a word
	    bad_chars = '-.?;,!@#$%^&*()+/{}[]\\":\'“”‘’'
	    # one translate pass maps every listed character to a space
	    sentence = sentence.translate(str.maketrans(dict.fromkeys(bad_chars, ' ')))
	    all_words = sentence.split()

	    # removing stop words
	    filtered_sentence = [w for w in all_words if w not in stopwords]
	    return ' '.join(filtered_sentence)
No results found