Last active
May 6, 2025 19:01
-
-
Save wwbrannon/843e073f11f8a932947357d93512c554 to your computer and use it in GitHub Desktop.
A push-button script to identify duplicative or similar paragraphs across multiple LaTeX chapter files.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| find-duplicate-paragraphs.py | |
| A push-button script to identify duplicative or similar paragraphs across multiple LaTeX chapter files. | |
| Usage: | |
| python find_duplicate_paragraphs.py -i path_to_tex_files -o output_report.csv -t 0.8 | |
| python find_duplicate_paragraphs.py -i path_to_tex_files -o output_report.csv -t 0.8 -m all-MiniLM-L6-v2 | |
| Arguments: | |
| -i, --input_dir Path to the directory containing .tex files. | |
| -o, --output Path to the output CSV report file. (Default: similar_paragraphs_report.csv) | |
| -t, --threshold Similarity threshold between 0 and 1. (Default: 0.8) | |
| -m, --model Model name for SentenceTransformer. If not provided, TF-IDF is used. | |
| -e, --exclude_same_file If set, paragraphs from the same file will not be compared. | |
| -s, --skip_input_include If set, detex will not follow \\input and \\include | |
| Example: | |
| python find-duplicate-paragraphs.py -e -s -m all-MiniLM-L6-v2 -t 0.85 -i ./chapters -o duplicates_report.csv | |
| """ | |
| # After running this script, you can browse the detected similar paragraphs using something like | |
| # | |
| # import textwrap | |
| # import pandas as pd | |
| # | |
| # df = pd.read_csv('similar_paragraphs_report.csv') | |
| # | |
| # i = 0 | |
| # print(textwrap.fill(df.iloc[i]['Text A'], width=80)) | |
| # print('\n\n') | |
| # print(textwrap.fill(df.iloc[i]['Text B'], width=80)) | |
import argparse
import glob
import os
import re
import shutil
import subprocess
import sys

import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def parse_arguments():
    """Define and evaluate the command-line interface for this script."""
    cli = argparse.ArgumentParser(description="Identify duplicative or similar paragraphs across LaTeX chapters.")
    # (flags, keyword-arguments) pairs, registered in the order shown in --help.
    option_specs = [
        (('-i', '--input_dir'),
         dict(type=str, required=True, help='Path to the directory containing .tex files.')),
        (('-o', '--output'),
         dict(type=str, default='similar_paragraphs_report.csv', help='Path to the output CSV report file.')),
        (('-t', '--threshold'),
         dict(type=float, default=0.8, help='Similarity threshold between 0 and 1.')),
        (('-m', '--model'),
         dict(type=str, default=None, help='Model name for SentenceTransformer. If not provided, TF-IDF is used.')),
        (('-e', '--exclude_same_file'),
         dict(action='store_true', help='Exclude comparing paragraphs within the same file.')),
        (('-s', '--skip-input-include'),
         dict(action='store_true', help=r"Don't follow \input and \include")),
    ]
    for flags, kwargs in option_specs:
        cli.add_argument(*flags, **kwargs)
    return cli.parse_args()
def check_detex_installed():
    """Exit with status 1 unless the `detex` executable is available on PATH.

    Uses shutil.which() instead of invoking `detex --version`: common detex
    builds do not support a version flag, so a perfectly working detex would
    exit non-zero and the previous CalledProcessError branch aborted the
    script even though the tool was usable.
    """
    if shutil.which('detex') is None:
        print("detex is not installed or not found in PATH. Please install detex and ensure it's in your system's PATH.")
        sys.exit(1)
def detex_file(tex_path, skip=False):
    """Run detex on one .tex file and return the resulting plain text.

    Returns an empty string (after printing the captured stderr) when the
    detex invocation fails.
    """
    command = ['detex', '-l']
    if skip:
        # '-n' avoids following \input and \include
        command.append('-n')
    command.append(tex_path)
    try:
        completed = subprocess.run(command, check=True, text=True,
                                   stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as err:
        print(f"Error processing {tex_path}: {err.stderr}")
        return ""
    return completed.stdout
def extract_paragraphs(text):
    r"""Split plain text into a list of stripped, non-empty paragraphs.

    A paragraph break is a blank line: two newlines optionally separated by
    whitespace. The previous exact '\n\n' split missed blank lines carrying
    stray spaces or tabs, which are common in hand-edited LaTeX sources.
    """
    return [para.strip() for para in re.split(r'\n\s*\n', text) if para.strip()]
def process_tex_files(input_dir, skip=False):
    """Detex every .tex file in input_dir and collect its paragraphs.

    Returns (all_paragraphs, metadata) where metadata[i] is the
    (filename, 1-based paragraph number) pair describing all_paragraphs[i].
    Exits with status 1 when the directory contains no .tex files.
    """
    tex_files = glob.glob(os.path.join(input_dir, '*.tex'))
    if not tex_files:
        print(f"No .tex files found in the directory: {input_dir}")
        sys.exit(1)
    collected = []
    origins = []  # (filename, paragraph_number) per collected paragraph
    print("Extracting paragraphs from .tex files...")
    for path in tqdm(tex_files, desc="Processing .tex files"):
        base = os.path.basename(path)
        plain = detex_file(path, skip=skip)
        for number, paragraph in enumerate(extract_paragraphs(plain), 1):
            collected.append(paragraph)
            origins.append((base, number))
    return collected, origins
def compute_similarity_tfidf(paragraphs):
    """Return the dense pairwise cosine-similarity matrix of TF-IDF vectors.

    The TF-IDF matrix is kept sparse: sklearn's cosine_similarity accepts
    scipy sparse input directly, so the previous .toarray() call densified
    an (n_paragraphs x vocabulary) matrix for no benefit. The return value
    is unchanged: a dense (n, n) numpy array.
    """
    print("Vectorizing paragraphs using TF-IDF...")
    # Note: fit_transform returns the document-term matrix, not the vectorizer.
    tfidf_matrix = TfidfVectorizer().fit_transform(paragraphs)
    print("Computing cosine similarity matrix...")
    return cosine_similarity(tfidf_matrix)
def compute_similarity_sentence_transformers(paragraphs, model_name):
    """Encode paragraphs with a SentenceTransformer model and return their
    dense pairwise cosine-similarity matrix.

    Exits with status 1 when the sentence-transformers package is missing.
    """
    try:
        from sentence_transformers import SentenceTransformer
    except ImportError:
        print("The 'sentence-transformers' library is not installed. Install it using 'pip install sentence-transformers'")
        sys.exit(1)
    print(f"Loading SentenceTransformer model: {model_name}")
    encoder = SentenceTransformer(model_name)
    print("Encoding paragraphs using SentenceTransformer...")
    vectors = encoder.encode(paragraphs, batch_size=32, show_progress_bar=True)
    print("Computing cosine similarity matrix...")
    return cosine_similarity(vectors)
def find_similar_paragraphs(cosine_matrix, metadata, threshold=0.8, exclude_same_file=False):
    """Collect paragraph pairs whose similarity meets the threshold.

    The 'Text A'/'Text B' fields here are placeholder labels of the form
    "<file> - Paragraph <n>", not the paragraph text; see
    find_similar_paragraphs_with_text for the variant carrying real text.
    NOTE(review): main() currently only calls the *_with_text variant.
    """
    matches = []
    total = len(metadata)
    print("Identifying similar paragraph pairs...")
    for a in tqdm(range(total), desc="Comparing paragraphs"):
        file_a, para_a = metadata[a]
        for b in range(a + 1, total):
            score = cosine_matrix[a][b]
            if score < threshold:
                continue
            file_b, para_b = metadata[b]
            if exclude_same_file and file_a == file_b:
                continue
            matches.append({
                'File A': file_a,
                'Paragraph A': para_a,
                'Text A': file_a + " - Paragraph " + str(para_a),
                'File B': file_b,
                'Paragraph B': para_b,
                'Text B': file_b + " - Paragraph " + str(para_b),
                'Similarity': round(score, 4)
            })
    return matches
def find_similar_paragraphs_with_text(cosine_matrix, metadata, paragraphs, threshold=0.8, exclude_same_file=False):
    """Collect paragraph pairs at or above the similarity threshold,
    including each paragraph's text (newlines flattened to spaces so the
    CSV rows stay single-line).
    """
    matches = []
    total = len(metadata)
    print("Identifying similar paragraph pairs...")
    for a in tqdm(range(total), desc="Comparing paragraphs"):
        file_a, para_a = metadata[a]
        for b in range(a + 1, total):
            score = cosine_matrix[a][b]
            if score < threshold:
                continue
            file_b, para_b = metadata[b]
            if exclude_same_file and file_a == file_b:
                continue
            matches.append({
                'Similarity': round(score, 4),
                'File A': file_a,
                'Paragraph A': para_a,
                'File B': file_b,
                'Paragraph B': para_b,
                'Text A': paragraphs[a].replace('\n', ' ').replace('\r', ' '),
                'Text B': paragraphs[b].replace('\n', ' ').replace('\r', ' '),
            })
    return matches
def save_report(similar_pairs, output_path):
    """Write the similar-paragraph pairs to a CSV, most similar first.

    Prints a notice and writes nothing when the pair list is empty.
    """
    if not similar_pairs:
        print("No similar paragraphs found based on the given threshold.")
        return
    report = pd.DataFrame(similar_pairs).sort_values(by='Similarity', ascending=False)
    report.to_csv(output_path, index=False)
    print(f"Similarity report saved to {output_path}")
def main():
    """Entry point: parse arguments, extract paragraphs, score, and report."""
    args = parse_arguments()
    # Fail fast if the detex tool is unavailable.
    check_detex_installed()
    if not os.path.isdir(args.input_dir):
        print(f"The input directory does not exist: {args.input_dir}")
        sys.exit(1)
    # Turn each .tex file into plain-text paragraphs plus provenance metadata.
    paragraphs, metadata = process_tex_files(args.input_dir, skip=args.skip_input_include)
    if not paragraphs:
        print("No paragraphs extracted from the .tex files.")
        sys.exit(1)
    # Embedding model when requested; TF-IDF otherwise.
    if args.model:
        cosine_matrix = compute_similarity_sentence_transformers(paragraphs, args.model)
    else:
        cosine_matrix = compute_similarity_tfidf(paragraphs)
    # The original code branched on args.model again here, but both branches
    # were byte-identical calls to find_similar_paragraphs_with_text, so the
    # redundant if/else has been collapsed into a single call.
    similar_pairs = find_similar_paragraphs_with_text(
        cosine_matrix,
        metadata,
        paragraphs,
        threshold=args.threshold,
        exclude_same_file=args.exclude_same_file
    )
    # Save the report
    save_report(similar_pairs, args.output)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment