Skip to content

Instantly share code, notes, and snippets.

@smrati
Created October 24, 2024 17:09
Show Gist options
  • Select an option

  • Save smrati/e0423b75d3966d49801e61e800a0e724 to your computer and use it in GitHub Desktop.

Select an option

Save smrati/e0423b75d3966d49801e61e800a0e724 to your computer and use it in GitHub Desktop.
GloVe embeddings text retrieval
import numpy as np
import re
from sklearn.metrics.pairwise import cosine_similarity
# Load GloVe embeddings
def load_glove_embeddings(glove_file):
    """Parse a GloVe text file into a dict mapping word -> float32 vector.

    Each line of the file is expected to be: `<word> <v1> <v2> ... <vN>`.
    Prints the vocabulary size once loading finishes.
    """
    vectors = {}
    with open(glove_file, 'r', encoding='utf-8') as fh:
        for row in fh:
            parts = row.split()
            # First token is the word; the rest are its vector components.
            vectors[parts[0]] = np.asarray(parts[1:], dtype='float32')
    print(f'Loaded {len(vectors)} word vectors.')
    return vectors
# Preprocess the text (basic tokenization and lowercasing)
def preprocess_text(text):
    """Lowercase *text*, collapse runs of non-word characters to a single
    space, and strip leading/trailing whitespace."""
    cleaned = re.sub(r'\W+', ' ', text.lower())
    return cleaned.strip()
# Convert a sentence into a vector by averaging word embeddings
def text_to_embedding(text, embeddings_index, embedding_dim=100):
    """Embed *text* as the mean of the GloVe vectors of its in-vocabulary words.

    Parameters
    ----------
    text : str
        Whitespace-tokenizable text (assumed already preprocessed).
    embeddings_index : dict[str, np.ndarray]
        word -> float32 vector mapping (as built by `load_glove_embeddings`).
    embedding_dim : int
        Dimensionality of the zero vector returned when no word matches.

    Returns
    -------
    np.ndarray of shape (embedding_dim,), dtype float32.
    """
    word_embeddings = [embeddings_index[w] for w in text.split() if w in embeddings_index]
    if not word_embeddings:
        # Fix: specify float32 so the out-of-vocabulary fallback has the same
        # dtype as the averaged GloVe vectors (was float64 by default).
        return np.zeros(embedding_dim, dtype='float32')
    # Average the word vectors to get a sentence/paragraph vector
    return np.mean(word_embeddings, axis=0)
# Process the text file and create embeddings for each line/sentence
def create_embeddings_for_text_file(text_file, glove_file, embedding_dim=100):
    """Embed every non-empty line of *text_file* using GloVe vectors.

    Loads the GloVe file from *glove_file*, preprocesses each line, and
    averages word vectors per line.

    Returns
    -------
    (lines, embeddings) : list[str], np.ndarray
        The kept (preprocessed) lines and a matrix with one row per line.
    """
    vocab = load_glove_embeddings(glove_file)
    kept_lines = []
    vectors = []
    with open(text_file, 'r', encoding='utf-8') as fh:
        for raw in fh:
            cleaned = preprocess_text(raw)
            if not cleaned:
                # Skip lines that are empty after preprocessing.
                continue
            kept_lines.append(cleaned)
            vectors.append(text_to_embedding(cleaned, vocab, embedding_dim))
    return kept_lines, np.array(vectors)
# Example usage:
glove_file_path = 'glove.6B.100d.txt' # Path to your GloVe embeddings
text_file_path = 'placeholder.txt' # Path to your large text file
# NOTE(review): this runs at import time and reads both files from disk; it
# will raise FileNotFoundError if they are missing — consider guarding the
# example under `if __name__ == "__main__":`.
lines, text_embeddings = create_embeddings_for_text_file(text_file_path, glove_file_path)
# Convert query into embedding
def query_to_embedding(query, embeddings_index, embedding_dim=100):
    """Preprocess *query* and embed it the same way as the corpus lines."""
    normalized = preprocess_text(query)
    return text_to_embedding(normalized, embeddings_index, embedding_dim)
# Search for the most similar line in the text file
def search_similar_text(query, lines, text_embeddings, glove_file, embedding_dim=100,
                        embeddings_index=None):
    """Return the corpus line most similar to *query* by cosine similarity.

    Parameters
    ----------
    query : str
        Free-text query; preprocessed and embedded internally.
    lines : list[str]
        Corpus lines aligned row-for-row with *text_embeddings*.
    text_embeddings : np.ndarray
        Matrix of line embeddings (one row per entry in *lines*).
    glove_file : str
        Path to the GloVe file; read only when *embeddings_index* is None.
    embedding_dim : int
        Dimensionality used for the query embedding.
    embeddings_index : dict[str, np.ndarray] | None
        Preloaded word-vector index. Fix: the original unconditionally
        re-read the entire GloVe file on every call despite its
        "(if not loaded)" comment; callers can now pass the index once
        and skip the reload. Default None preserves old behavior.

    Returns
    -------
    (str, float) : the best-matching line and its similarity score.
    """
    if embeddings_index is None:
        # Fall back to loading from disk (original behavior).
        embeddings_index = load_glove_embeddings(glove_file)
    # Convert the query to an embedding
    query_embedding = query_to_embedding(query, embeddings_index, embedding_dim)
    # Compute cosine similarity between query and text embeddings
    similarities = cosine_similarity([query_embedding], text_embeddings)[0]
    # Get the index of the most similar text
    most_similar_index = np.argmax(similarities)
    # Return the most similar text and its similarity score
    return lines[most_similar_index], similarities[most_similar_index]
# Example usage:
query = "How presidential election take place?"
# NOTE(review): passing only glove_file_path makes search_similar_text reload
# the full GloVe file here even though it was already loaded above when
# building text_embeddings — expensive for large embedding files.
most_similar_text, similarity_score = search_similar_text(query, lines, text_embeddings, glove_file_path)
print(f'Most similar text: {most_similar_text}')
print(f'Similarity score: {similarity_score}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment