Last active
May 6, 2025 19:01
-
-
Save wwbrannon/843e073f11f8a932947357d93512c554 to your computer and use it in GitHub Desktop.
A push-button script to identify duplicative or similar paragraphs across multiple LaTeX chapter files.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| find-duplicate-paragraphs.py | |
| A push-button script to identify duplicative or similar paragraphs across multiple LaTeX chapter files. | |
| Usage: | |
| python find_duplicate_paragraphs.py -i path_to_tex_files -o output_report.csv -t 0.8 | |
| python find_duplicate_paragraphs.py -i path_to_tex_files -o output_report.csv -t 0.8 -m all-MiniLM-L6-v2 | |
| Arguments: | |
| -i, --input_dir Path to the directory containing .tex files. | |
| -o, --output Path to the output CSV report file. (Default: similar_paragraphs_report.csv) | |
| -t, --threshold Similarity threshold between 0 and 1. (Default: 0.8) | |
| -m, --model Model name for SentenceTransformer. If not provided, TF-IDF is used. | |
| -e, --exclude_same_file If set, paragraphs from the same file will not be compared. | |
| -s, --skip_input_include If set, detex will not follow \\input and \\include | |
| Example: | |
| python find-duplicate-paragraphs.py -e -s -m all-MiniLM-L6-v2 -t 0.85 -i ./chapters -o duplicates_report.csv | |
| """ | |
| # After running this script, you can browse the detected similar paragraphs using something like | |
| # | |
| # import textwrap | |
| # import pandas as pd | |
| # | |
| # df = pd.read_csv('similar_paragraphs_report.csv') | |
| # | |
| # i = 0 | |
| # print(textwrap.fill(df.iloc[i]['Text A'], width=80)) | |
| # print('\n\n') | |
| # print(textwrap.fill(df.iloc[i]['Text B'], width=80)) | |
import argparse
import glob
import os
import re
import shutil
import subprocess
import sys

import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def parse_arguments():
    """Define and evaluate the command-line interface for this script."""
    cli = argparse.ArgumentParser(description="Identify duplicative or similar paragraphs across LaTeX chapters.")
    # (flags, keyword-arguments) pairs, registered in the order shown in --help.
    option_specs = [
        (('-i', '--input_dir'),
         dict(type=str, required=True, help='Path to the directory containing .tex files.')),
        (('-o', '--output'),
         dict(type=str, default='similar_paragraphs_report.csv', help='Path to the output CSV report file.')),
        (('-t', '--threshold'),
         dict(type=float, default=0.8, help='Similarity threshold between 0 and 1.')),
        (('-m', '--model'),
         dict(type=str, default=None, help='Model name for SentenceTransformer. If not provided, TF-IDF is used.')),
        (('-e', '--exclude_same_file'),
         dict(action='store_true', help='Exclude comparing paragraphs within the same file.')),
        (('-s', '--skip-input-include'),
         dict(action='store_true', help=r"Don't follow \input and \include")),
    ]
    for flags, kwargs in option_specs:
        cli.add_argument(*flags, **kwargs)
    return cli.parse_args()
def check_detex_installed():
    """Exit with status 1 unless the `detex` executable is available on PATH.

    Uses shutil.which() instead of invoking `detex --version`: common detex
    builds do not support a version flag, so a perfectly working detex would
    exit non-zero and the previous CalledProcessError branch aborted the
    script even though the tool was usable.
    """
    if shutil.which('detex') is None:
        print("detex is not installed or not found in PATH. Please install detex and ensure it's in your system's PATH.")
        sys.exit(1)
def detex_file(tex_path, skip=False):
    """Run detex on one .tex file and return the resulting plain text.

    Returns an empty string (after printing the captured stderr) when the
    detex invocation fails.
    """
    command = ['detex', '-l']
    if skip:
        # '-n' avoids following \input and \include
        command.append('-n')
    command.append(tex_path)
    try:
        completed = subprocess.run(command, check=True, text=True,
                                   stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as err:
        print(f"Error processing {tex_path}: {err.stderr}")
        return ""
    return completed.stdout
def extract_paragraphs(text):
    r"""Split plain text into a list of stripped, non-empty paragraphs.

    A paragraph break is a blank line: two newlines optionally separated by
    whitespace. The previous exact '\n\n' split missed blank lines carrying
    stray spaces or tabs, which are common in hand-edited LaTeX sources.
    """
    return [para.strip() for para in re.split(r'\n\s*\n', text) if para.strip()]
def process_tex_files(input_dir, skip=False):
    """Detex every .tex file in input_dir and collect its paragraphs.

    Returns (all_paragraphs, metadata) where metadata[i] is the
    (filename, 1-based paragraph number) pair describing all_paragraphs[i].
    Exits with status 1 when the directory contains no .tex files.
    """
    tex_files = glob.glob(os.path.join(input_dir, '*.tex'))
    if not tex_files:
        print(f"No .tex files found in the directory: {input_dir}")
        sys.exit(1)
    collected = []
    origins = []  # (filename, paragraph_number) per collected paragraph
    print("Extracting paragraphs from .tex files...")
    for path in tqdm(tex_files, desc="Processing .tex files"):
        base = os.path.basename(path)
        plain = detex_file(path, skip=skip)
        for number, paragraph in enumerate(extract_paragraphs(plain), 1):
            collected.append(paragraph)
            origins.append((base, number))
    return collected, origins
def compute_similarity_tfidf(paragraphs):
    """Return the dense pairwise cosine-similarity matrix of TF-IDF vectors.

    The TF-IDF matrix is kept sparse: sklearn's cosine_similarity accepts
    scipy sparse input directly, so the previous .toarray() call densified
    an (n_paragraphs x vocabulary) matrix for no benefit. The return value
    is unchanged: a dense (n, n) numpy array.
    """
    print("Vectorizing paragraphs using TF-IDF...")
    # Note: fit_transform returns the document-term matrix, not the vectorizer.
    tfidf_matrix = TfidfVectorizer().fit_transform(paragraphs)
    print("Computing cosine similarity matrix...")
    return cosine_similarity(tfidf_matrix)
def compute_similarity_sentence_transformers(paragraphs, model_name):
    """Encode paragraphs with a SentenceTransformer model and return their
    dense pairwise cosine-similarity matrix.

    Exits with status 1 when the sentence-transformers package is missing.
    """
    try:
        from sentence_transformers import SentenceTransformer
    except ImportError:
        print("The 'sentence-transformers' library is not installed. Install it using 'pip install sentence-transformers'")
        sys.exit(1)
    print(f"Loading SentenceTransformer model: {model_name}")
    encoder = SentenceTransformer(model_name)
    print("Encoding paragraphs using SentenceTransformer...")
    vectors = encoder.encode(paragraphs, batch_size=32, show_progress_bar=True)
    print("Computing cosine similarity matrix...")
    return cosine_similarity(vectors)
def find_similar_paragraphs(cosine_matrix, metadata, threshold=0.8, exclude_same_file=False):
    """Collect paragraph pairs whose similarity meets the threshold.

    The 'Text A'/'Text B' fields here are placeholder labels of the form
    "<file> - Paragraph <n>", not the paragraph text; see
    find_similar_paragraphs_with_text for the variant carrying real text.
    NOTE(review): main() currently only calls the *_with_text variant.
    """
    matches = []
    total = len(metadata)
    print("Identifying similar paragraph pairs...")
    for a in tqdm(range(total), desc="Comparing paragraphs"):
        file_a, para_a = metadata[a]
        for b in range(a + 1, total):
            score = cosine_matrix[a][b]
            if score < threshold:
                continue
            file_b, para_b = metadata[b]
            if exclude_same_file and file_a == file_b:
                continue
            matches.append({
                'File A': file_a,
                'Paragraph A': para_a,
                'Text A': file_a + " - Paragraph " + str(para_a),
                'File B': file_b,
                'Paragraph B': para_b,
                'Text B': file_b + " - Paragraph " + str(para_b),
                'Similarity': round(score, 4)
            })
    return matches
def find_similar_paragraphs_with_text(cosine_matrix, metadata, paragraphs, threshold=0.8, exclude_same_file=False):
    """Collect paragraph pairs at or above the similarity threshold,
    including each paragraph's text (newlines flattened to spaces so the
    CSV rows stay single-line).
    """
    matches = []
    total = len(metadata)
    print("Identifying similar paragraph pairs...")
    for a in tqdm(range(total), desc="Comparing paragraphs"):
        file_a, para_a = metadata[a]
        for b in range(a + 1, total):
            score = cosine_matrix[a][b]
            if score < threshold:
                continue
            file_b, para_b = metadata[b]
            if exclude_same_file and file_a == file_b:
                continue
            matches.append({
                'Similarity': round(score, 4),
                'File A': file_a,
                'Paragraph A': para_a,
                'File B': file_b,
                'Paragraph B': para_b,
                'Text A': paragraphs[a].replace('\n', ' ').replace('\r', ' '),
                'Text B': paragraphs[b].replace('\n', ' ').replace('\r', ' '),
            })
    return matches
def save_report(similar_pairs, output_path):
    """Write the similar-paragraph pairs to a CSV, most similar first.

    Prints a notice and writes nothing when the pair list is empty.
    """
    if not similar_pairs:
        print("No similar paragraphs found based on the given threshold.")
        return
    report = pd.DataFrame(similar_pairs).sort_values(by='Similarity', ascending=False)
    report.to_csv(output_path, index=False)
    print(f"Similarity report saved to {output_path}")
def main():
    """Entry point: parse arguments, extract paragraphs, score, and report."""
    args = parse_arguments()
    # Fail fast if the detex tool is unavailable.
    check_detex_installed()
    if not os.path.isdir(args.input_dir):
        print(f"The input directory does not exist: {args.input_dir}")
        sys.exit(1)
    # Turn each .tex file into plain-text paragraphs plus provenance metadata.
    paragraphs, metadata = process_tex_files(args.input_dir, skip=args.skip_input_include)
    if not paragraphs:
        print("No paragraphs extracted from the .tex files.")
        sys.exit(1)
    # Embedding model when requested; TF-IDF otherwise.
    if args.model:
        cosine_matrix = compute_similarity_sentence_transformers(paragraphs, args.model)
    else:
        cosine_matrix = compute_similarity_tfidf(paragraphs)
    # The original code branched on args.model again here, but both branches
    # were byte-identical calls to find_similar_paragraphs_with_text, so the
    # redundant if/else has been collapsed into a single call.
    similar_pairs = find_similar_paragraphs_with_text(
        cosine_matrix,
        metadata,
        paragraphs,
        threshold=args.threshold,
        exclude_same_file=args.exclude_same_file
    )
    # Save the report
    save_report(similar_pairs, args.output)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment