Created
October 24, 2024 17:09
-
-
Save smrati/e0423b75d3966d49801e61e800a0e724 to your computer and use it in GitHub Desktop.
GloVe embeddings text retrieval
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import numpy as np | |
| import re | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| # Load GloVe embeddings | |
# Load GloVe embeddings
def load_glove_embeddings(glove_file):
    """Parse a GloVe text file into a dict mapping word -> float32 vector.

    Each line of the file is expected to be: `<word> <v1> <v2> ... <vN>`.
    """
    embeddings_index = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            # First token is the word; the rest are its vector components.
            token, *coeffs = line.split()
            embeddings_index[token] = np.asarray(coeffs, dtype='float32')
    print(f'Loaded {len(embeddings_index)} word vectors.')
    return embeddings_index
| # Preprocess the text (basic tokenization and lowercasing) | |
# Preprocess the text (basic tokenization and lowercasing)
def preprocess_text(text):
    """Lowercase *text*, collapse runs of non-word characters to single
    spaces, and trim surrounding whitespace."""
    normalized = re.sub(r'\W+', ' ', text.lower())
    return normalized.strip()
| # Convert a sentence into a vector by averaging word embeddings | |
# Convert a sentence into a vector by averaging word embeddings
def text_to_embedding(text, embeddings_index, embedding_dim=100):
    """Embed *text* as the mean of its in-vocabulary GloVe word vectors.

    Parameters
    ----------
    text : str
        Whitespace-tokenizable text (already preprocessed).
    embeddings_index : dict
        Mapping of word -> vector as built by ``load_glove_embeddings``.
    embedding_dim : int
        Dimensionality of the fallback zero vector when no word matches.

    Returns
    -------
    numpy.ndarray
        A float32 vector of length ``embedding_dim``.
    """
    word_embeddings = [
        embeddings_index[word]
        for word in text.split()
        if word in embeddings_index
    ]
    if not word_embeddings:
        # Fix: use float32 to match the GloVe vectors' dtype; a float64
        # zero vector would upcast the whole corpus matrix downstream.
        return np.zeros(embedding_dim, dtype='float32')
    # Average the word vectors to get a sentence/paragraph vector
    return np.mean(word_embeddings, axis=0)
| # Process the text file and create embeddings for each line/sentence | |
# Process the text file and create embeddings for each line/sentence
def create_embeddings_for_text_file(text_file, glove_file, embedding_dim=100):
    """Embed every non-empty line of *text_file* using GloVe vectors.

    Returns a pair ``(lines, embeddings)`` where ``lines`` is the list of
    preprocessed lines and ``embeddings`` is a matrix with one row per line.
    """
    embeddings_index = load_glove_embeddings(glove_file)
    lines, embeddings = [], []
    # Read the text file line by line, skipping lines that are empty
    # after preprocessing.
    with open(text_file, 'r', encoding='utf-8') as f:
        for raw_line in f:
            cleaned = preprocess_text(raw_line)
            if not cleaned:
                continue
            lines.append(cleaned)
            embeddings.append(
                text_to_embedding(cleaned, embeddings_index, embedding_dim)
            )
    return lines, np.array(embeddings)
| # Example usage: | |
# Example usage:
# Build the corpus index once at startup; `lines` and `text_embeddings`
# are reused by the query code below.
glove_file_path = 'glove.6B.100d.txt'  # Path to your GloVe embeddings
text_file_path = 'placeholder.txt'  # Path to your large text file
lines, text_embeddings = create_embeddings_for_text_file(text_file_path, glove_file_path)
| # Convert query into embedding | |
# Convert query into embedding
def query_to_embedding(query, embeddings_index, embedding_dim=100):
    """Embed a search query with the same preprocessing pipeline used
    for the corpus, so query and corpus vectors are comparable."""
    return text_to_embedding(preprocess_text(query), embeddings_index, embedding_dim)
| # Search for the most similar line in the text file | |
# Search for the most similar line in the text file
def search_similar_text(query, lines, text_embeddings, glove_file,
                        embedding_dim=100, embeddings_index=None):
    """Return the corpus line most similar to *query* under cosine similarity.

    Parameters
    ----------
    query : str
        Free-text query.
    lines : list[str]
        Preprocessed corpus lines, aligned row-for-row with *text_embeddings*.
    text_embeddings : numpy.ndarray
        Matrix of corpus embeddings (one row per line).
    glove_file : str
        Path to the GloVe file; only read when *embeddings_index* is None.
    embedding_dim : int
        Dimensionality passed through to the embedding fallback.
    embeddings_index : dict or None
        Pre-loaded word-vector dict. Pass this to avoid re-parsing the
        entire GloVe file on every query (the original code reloaded it
        each call, which is very expensive).

    Returns
    -------
    tuple
        ``(most_similar_line, similarity_score)``.
    """
    if embeddings_index is None:
        # Fallback for backward compatibility: load on demand.
        embeddings_index = load_glove_embeddings(glove_file)
    # Convert the query to an embedding
    query_embedding = query_to_embedding(query, embeddings_index, embedding_dim)
    # Compute cosine similarity between query and all corpus embeddings
    similarities = cosine_similarity([query_embedding], text_embeddings)[0]
    # Index of the best-matching corpus line
    most_similar_index = int(np.argmax(similarities))
    # Return the most similar text and its similarity score
    return lines[most_similar_index], similarities[most_similar_index]
| # Example usage: | |
| query = "How presidential election take place?" | |
| most_similar_text, similarity_score = search_similar_text(query, lines, text_embeddings, glove_file_path) | |
| print(f'Most similar text: {most_similar_text}') | |
| print(f'Similarity score: {similarity_score}') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment