Created
October 24, 2024 17:09
-
-
Save smrati/e0423b75d3966d49801e61e800a0e724 to your computer and use it in GitHub Desktop.
GloVe embeddings text retrieval
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import numpy as np | |
| import re | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| # Load GloVe embeddings | |
# Load GloVe embeddings
def load_glove_embeddings(glove_file):
    """Parse a GloVe text file into a dict mapping word -> float32 vector.

    Each line of the file is expected to be: `<word> <v1> <v2> ... <vN>`.
    """
    embeddings_index = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            # First token is the word; the rest are its vector components.
            token, *coeffs = line.split()
            embeddings_index[token] = np.asarray(coeffs, dtype='float32')
    print(f'Loaded {len(embeddings_index)} word vectors.')
    return embeddings_index
| # Preprocess the text (basic tokenization and lowercasing) | |
# Preprocess the text (basic tokenization and lowercasing)
def preprocess_text(text):
    """Lowercase *text*, collapse runs of non-word characters to single
    spaces, and trim surrounding whitespace."""
    normalized = re.sub(r'\W+', ' ', text.lower())
    return normalized.strip()
| # Convert a sentence into a vector by averaging word embeddings | |
# Convert a sentence into a vector by averaging word embeddings
def text_to_embedding(text, embeddings_index, embedding_dim=100):
    """Embed *text* as the mean of its in-vocabulary GloVe word vectors.

    Parameters
    ----------
    text : str
        Whitespace-tokenizable text (already preprocessed).
    embeddings_index : dict
        Mapping of word -> vector as built by ``load_glove_embeddings``.
    embedding_dim : int
        Dimensionality of the fallback zero vector when no word matches.

    Returns
    -------
    numpy.ndarray
        A float32 vector of length ``embedding_dim``.
    """
    word_embeddings = [
        embeddings_index[word]
        for word in text.split()
        if word in embeddings_index
    ]
    if not word_embeddings:
        # Fix: use float32 to match the GloVe vectors' dtype; a float64
        # zero vector would upcast the whole corpus matrix downstream.
        return np.zeros(embedding_dim, dtype='float32')
    # Average the word vectors to get a sentence/paragraph vector
    return np.mean(word_embeddings, axis=0)
| # Process the text file and create embeddings for each line/sentence | |
# Process the text file and create embeddings for each line/sentence
def create_embeddings_for_text_file(text_file, glove_file, embedding_dim=100):
    """Embed every non-empty line of *text_file* using GloVe vectors.

    Returns a pair ``(lines, embeddings)`` where ``lines`` is the list of
    preprocessed lines and ``embeddings`` is a matrix with one row per line.
    """
    embeddings_index = load_glove_embeddings(glove_file)
    lines, embeddings = [], []
    # Read the text file line by line, skipping lines that are empty
    # after preprocessing.
    with open(text_file, 'r', encoding='utf-8') as f:
        for raw_line in f:
            cleaned = preprocess_text(raw_line)
            if not cleaned:
                continue
            lines.append(cleaned)
            embeddings.append(
                text_to_embedding(cleaned, embeddings_index, embedding_dim)
            )
    return lines, np.array(embeddings)
| # Example usage: | |
# Example usage:
# Build the corpus index once at startup; `lines` and `text_embeddings`
# are reused by the query code below.
glove_file_path = 'glove.6B.100d.txt'  # Path to your GloVe embeddings
text_file_path = 'placeholder.txt'  # Path to your large text file
lines, text_embeddings = create_embeddings_for_text_file(text_file_path, glove_file_path)
| # Convert query into embedding | |
# Convert query into embedding
def query_to_embedding(query, embeddings_index, embedding_dim=100):
    """Embed a search query with the same preprocessing pipeline used
    for the corpus, so query and corpus vectors are comparable."""
    return text_to_embedding(preprocess_text(query), embeddings_index, embedding_dim)
| # Search for the most similar line in the text file | |
# Search for the most similar line in the text file
def search_similar_text(query, lines, text_embeddings, glove_file,
                        embedding_dim=100, embeddings_index=None):
    """Return the corpus line most similar to *query* under cosine similarity.

    Parameters
    ----------
    query : str
        Free-text query.
    lines : list[str]
        Preprocessed corpus lines, aligned row-for-row with *text_embeddings*.
    text_embeddings : numpy.ndarray
        Matrix of corpus embeddings (one row per line).
    glove_file : str
        Path to the GloVe file; only read when *embeddings_index* is None.
    embedding_dim : int
        Dimensionality passed through to the embedding fallback.
    embeddings_index : dict or None
        Pre-loaded word-vector dict. Pass this to avoid re-parsing the
        entire GloVe file on every query (the original code reloaded it
        each call, which is very expensive).

    Returns
    -------
    tuple
        ``(most_similar_line, similarity_score)``.
    """
    if embeddings_index is None:
        # Fallback for backward compatibility: load on demand.
        embeddings_index = load_glove_embeddings(glove_file)
    # Convert the query to an embedding
    query_embedding = query_to_embedding(query, embeddings_index, embedding_dim)
    # Compute cosine similarity between query and all corpus embeddings
    similarities = cosine_similarity([query_embedding], text_embeddings)[0]
    # Index of the best-matching corpus line
    most_similar_index = int(np.argmax(similarities))
    # Return the most similar text and its similarity score
    return lines[most_similar_index], similarities[most_similar_index]
| # Example usage: | |
| query = "How presidential election take place?" | |
| most_similar_text, similarity_score = search_similar_text(query, lines, text_embeddings, glove_file_path) | |
| print(f'Most similar text: {most_similar_text}') | |
| print(f'Similarity score: {similarity_score}') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment