Skip to content

Instantly share code, notes, and snippets.

@wwbrannon
Last active May 6, 2025 19:01
Show Gist options
  • Select an option

  • Save wwbrannon/843e073f11f8a932947357d93512c554 to your computer and use it in GitHub Desktop.

Select an option

Save wwbrannon/843e073f11f8a932947357d93512c554 to your computer and use it in GitHub Desktop.
A push-button script to identify duplicative or similar paragraphs across multiple LaTeX chapter files
#!/usr/bin/env python3
"""
find-duplicate-paragraphs.py
A push-button script to identify duplicative or similar paragraphs across multiple LaTeX chapter files.
Usage:
python find-duplicate-paragraphs.py -i path_to_tex_files -o output_report.csv -t 0.8
python find-duplicate-paragraphs.py -i path_to_tex_files -o output_report.csv -t 0.8 -m all-MiniLM-L6-v2
Arguments:
-i, --input_dir Path to the directory containing .tex files.
-o, --output Path to the output CSV report file. (Default: similar_paragraphs_report.csv)
-t, --threshold Similarity threshold between 0 and 1. (Default: 0.8)
-m, --model Model name for SentenceTransformer. If not provided, TF-IDF is used.
-e, --exclude_same_file If set, paragraphs from the same file will not be compared.
-s, --skip_input_include If set, detex will not follow \\input and \\include
Example:
python find-duplicate-paragraphs.py -e -s -m all-MiniLM-L6-v2 -t 0.85 -i ./chapters -o duplicates_report.csv
"""
# After running this script, you can browse the detected similar paragraphs using something like
#
# import textwrap
# import pandas as pd
#
# df = pd.read_csv('similar_paragraphs_report.csv')
#
# i = 0
# print(textwrap.fill(df.iloc[i]['Text A'], width=80))
# print('\n\n')
# print(textwrap.fill(df.iloc[i]['Text B'], width=80))
import argparse
import glob
import os
import re
import subprocess
import sys

import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def parse_arguments():
    """Build the CLI parser and return the parsed arguments."""
    ap = argparse.ArgumentParser(
        description="Identify duplicative or similar paragraphs across LaTeX chapters.")
    ap.add_argument('-i', '--input_dir', type=str, required=True,
                    help='Path to the directory containing .tex files.')
    ap.add_argument('-o', '--output', type=str, default='similar_paragraphs_report.csv',
                    help='Path to the output CSV report file.')
    ap.add_argument('-t', '--threshold', type=float, default=0.8,
                    help='Similarity threshold between 0 and 1.')
    ap.add_argument('-m', '--model', type=str, default=None,
                    help='Model name for SentenceTransformer. If not provided, TF-IDF is used.')
    ap.add_argument('-e', '--exclude_same_file', action='store_true',
                    help='Exclude comparing paragraphs within the same file.')
    # argparse maps '--skip-input-include' to the attribute 'skip_input_include'.
    ap.add_argument('-s', '--skip-input-include', action='store_true',
                    help=r"Don't follow \input and \include")
    return ap.parse_args()
def check_detex_installed():
    """Exit with status 1 unless the `detex` binary can be executed.

    Probes by running `detex --version`: a missing binary raises
    FileNotFoundError, while a nonzero exit status raises
    CalledProcessError; both are fatal here.
    """
    try:
        subprocess.run(['detex', '--version'],
                       stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                       check=True)
    except FileNotFoundError:
        print("detex is not installed or not found in PATH. Please install detex and ensure it's in your system's PATH.")
        sys.exit(1)
    except subprocess.CalledProcessError:
        print("detex is installed but returned an error when checking the version.")
        sys.exit(1)
def detex_file(tex_path, skip=False):
    """Run detex on *tex_path* and return its plain-text output.

    When skip is True, '-n' is passed so detex does not follow
    \\input and \\include directives. If detex fails, the error is
    reported and an empty string is returned so the caller can keep
    processing the remaining files.
    """
    command = ['detex', '-l']
    if skip:
        # '-n' avoids following \input and \include
        command.append('-n')
    command.append(tex_path)
    try:
        completed = subprocess.run(command, check=True, text=True,
                                   stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as err:
        print(f"Error processing {tex_path}: {err.stderr}")
        return ""
    return completed.stdout
def extract_paragraphs(text):
    """Split *text* into a list of non-empty, stripped paragraphs.

    A paragraph boundary is a blank line, including lines that contain
    only whitespace: detex output can leave stray spaces on otherwise
    empty lines, which a literal split on '\\n\\n' would silently miss,
    fusing two paragraphs into one. re.split(r'\\n\\s*\\n') also collapses
    runs of three or more newlines into a single boundary.
    """
    return [para.strip() for para in re.split(r'\n\s*\n', text) if para.strip()]
def process_tex_files(input_dir, skip=False):
    """Detex every .tex file in *input_dir* and collect its paragraphs.

    Returns (all_paragraphs, metadata), where metadata[i] is the
    (filename, 1-based paragraph number) pair describing
    all_paragraphs[i]. Exits with status 1 if the directory contains
    no .tex files. *skip* is forwarded to detex_file.
    """
    tex_files = glob.glob(os.path.join(input_dir, '*.tex'))
    if not tex_files:
        print(f"No .tex files found in the directory: {input_dir}")
        sys.exit(1)

    all_paragraphs = []
    metadata = []  # parallel list of (filename, paragraph_number) tuples
    print("Extracting paragraphs from .tex files...")
    for path in tqdm(tex_files, desc="Processing .tex files"):
        base_name = os.path.basename(path)
        plain = detex_file(path, skip=skip)
        for number, paragraph in enumerate(extract_paragraphs(plain), start=1):
            all_paragraphs.append(paragraph)
            metadata.append((base_name, number))
    return all_paragraphs, metadata
def compute_similarity_tfidf(paragraphs):
    """Return the pairwise cosine-similarity matrix for *paragraphs*
    using a TF-IDF bag-of-words representation.

    The TF-IDF matrix is kept sparse: cosine_similarity accepts sparse
    input directly, so the previous .toarray() densification only
    inflated memory use (n_paragraphs x vocabulary floats) for large
    corpora without changing the result.
    """
    print("Vectorizing paragraphs using TF-IDF...")
    tfidf_matrix = TfidfVectorizer().fit_transform(paragraphs)
    print("Computing cosine similarity matrix...")
    return cosine_similarity(tfidf_matrix)
def compute_similarity_sentence_transformers(paragraphs, model_name):
    """Return the pairwise cosine-similarity matrix for *paragraphs*
    using embeddings from the SentenceTransformer *model_name*.

    Exits with status 1 when the optional sentence-transformers
    dependency is not installed.
    """
    try:
        from sentence_transformers import SentenceTransformer
    except ImportError:
        print("The 'sentence-transformers' library is not installed. Install it using 'pip install sentence-transformers'")
        sys.exit(1)

    print(f"Loading SentenceTransformer model: {model_name}")
    encoder = SentenceTransformer(model_name)
    print("Encoding paragraphs using SentenceTransformer...")
    embeddings = encoder.encode(paragraphs, batch_size=32, show_progress_bar=True)
    print("Computing cosine similarity matrix...")
    return cosine_similarity(embeddings)
def find_similar_paragraphs(cosine_matrix, metadata, threshold=0.8, exclude_same_file=False):
    """Return pairs of paragraphs whose similarity is >= *threshold*.

    Lightweight variant: 'Text A'/'Text B' hold "<file> - Paragraph <n>"
    labels rather than the paragraph text (see
    find_similar_paragraphs_with_text for the full-text version;
    NOTE(review): main appears to call only that variant).

    With exclude_same_file=True, pairs from the same file are skipped.
    """
    pairs = []
    count = len(metadata)
    print("Identifying similar paragraph pairs...")
    for a in tqdm(range(count), desc="Comparing paragraphs"):
        file_a, num_a = metadata[a]
        label_a = file_a + " - Paragraph " + str(num_a)
        # only the upper triangle: each unordered pair is examined once
        for b in range(a + 1, count):
            score = cosine_matrix[a][b]
            if score < threshold:
                continue
            file_b, num_b = metadata[b]
            if exclude_same_file and file_a == file_b:
                continue
            pairs.append({
                'File A': file_a,
                'Paragraph A': num_a,
                'Text A': label_a,
                'File B': file_b,
                'Paragraph B': num_b,
                'Text B': file_b + " - Paragraph " + str(num_b),
                'Similarity': round(score, 4),
            })
    return pairs
def find_similar_paragraphs_with_text(cosine_matrix, metadata, paragraphs, threshold=0.8, exclude_same_file=False):
    """Return pairs of paragraphs whose similarity is >= *threshold*,
    including the full paragraph text of both members.

    Newlines/carriage returns inside the text are flattened to spaces
    so each record stays on one CSV row. With exclude_same_file=True,
    pairs drawn from the same file are skipped.
    """
    pairs = []
    count = len(metadata)
    print("Identifying similar paragraph pairs...")
    for a in tqdm(range(count), desc="Comparing paragraphs"):
        file_a, num_a = metadata[a]
        # only the upper triangle: each unordered pair is examined once
        for b in range(a + 1, count):
            score = cosine_matrix[a][b]
            if score < threshold:
                continue
            file_b, num_b = metadata[b]
            if exclude_same_file and file_a == file_b:
                continue
            pairs.append({
                'Similarity': round(score, 4),
                'File A': file_a,
                'Paragraph A': num_a,
                'File B': file_b,
                'Paragraph B': num_b,
                'Text A': paragraphs[a].replace('\n', ' ').replace('\r', ' '),
                'Text B': paragraphs[b].replace('\n', ' ').replace('\r', ' '),
            })
    return pairs
def save_report(similar_pairs, output_path):
    """Write *similar_pairs* to *output_path* as CSV, most similar first.

    Does nothing (beyond a notice) when the list is empty, so no empty
    report file is created.
    """
    if not similar_pairs:
        print("No similar paragraphs found based on the given threshold.")
        return
    report = pd.DataFrame(similar_pairs).sort_values(by='Similarity', ascending=False)
    report.to_csv(output_path, index=False)
    print(f"Similarity report saved to {output_path}")
def main():
    """Entry point: parse arguments, extract paragraphs from the .tex
    files, compute pairwise similarities, and save the CSV report."""
    args = parse_arguments()

    # detex must be available before we attempt to convert any files.
    check_detex_installed()

    if not os.path.isdir(args.input_dir):
        print(f"The input directory does not exist: {args.input_dir}")
        sys.exit(1)

    paragraphs, metadata = process_tex_files(args.input_dir, skip=args.skip_input_include)
    if not paragraphs:
        print("No paragraphs extracted from the .tex files.")
        sys.exit(1)

    # Choose the embedding backend: SentenceTransformer when a model
    # name was supplied, TF-IDF otherwise.
    if args.model:
        cosine_matrix = compute_similarity_sentence_transformers(paragraphs, args.model)
    else:
        cosine_matrix = compute_similarity_tfidf(paragraphs)

    # Both the model and TF-IDF paths report with full paragraph text;
    # the original if/else here called the same function with identical
    # arguments in both branches, so it is collapsed to a single call.
    similar_pairs = find_similar_paragraphs_with_text(
        cosine_matrix,
        metadata,
        paragraphs,
        threshold=args.threshold,
        exclude_same_file=args.exclude_same_file,
    )

    save_report(similar_pairs, args.output)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment