Skip to content

Instantly share code, notes, and snippets.

@greg-randall
Last active June 3, 2025 20:28
Show Gist options
  • Select an option

  • Save greg-randall/ba04ac4df18c82677e7c26a80c90133c to your computer and use it in GitHub Desktop.

Select an option

Save greg-randall/ba04ac4df18c82677e7c26a80c90133c to your computer and use it in GitHub Desktop.

Revisions

  1. greg-randall revised this gist Jun 3, 2025. 1 changed file with 200 additions and 102 deletions.
    302 changes: 200 additions & 102 deletions name_cleaner.py
    Original file line number Diff line number Diff line change
    @@ -52,13 +52,16 @@
    from nameparser import HumanName
    from thefuzz import fuzz
    import Levenshtein
    from nicknames import NickNamer
    from nicknames import NickNamer # Ensure NickNamer is properly imported
    from itertools import combinations
    import argparse
    import unicodedata
    import os
    import csv

    # NickNamer instance should be created once
    # We'll create it in main and pass it to the preprocessing function

    def remove_accents(text):
    """
    Removes all accent marks from a string.
    @@ -76,102 +79,161 @@ def remove_accents(text):
    # Return only the base characters (remove the accent marks)
    return ''.join([c for c in nfkd_form if not unicodedata.combining(c)])

    def normalize_name(name):
    # Handle potential None or empty strings
    if not name:
    return ""
    # Lowercase and trim surrounding whitespace before parsing
    parsed_name = HumanName(name.lower().strip())
    parsed_name.title = ''
    return str(parsed_name).strip()

    def get_name_parts(full_name):
    # Handle potential None or empty strings
    if not full_name:
    return "", ""

    parsed_name = HumanName(full_name)
    def preprocess_name_data(full_name_str, nn_instance):
    """
    Parses a full name string and extracts/precomputes various attributes
    needed for comparison.

    first_parts = []
    Args:
    full_name_str (str): The full name string to process.
    nn_instance (NickNamer): An initialized NickNamer object.
    Returns:
    dict or None: A dictionary with preprocessed name data, or None if input is invalid.
    """
    if not full_name_str:
    return None

    # Normalize by lowercasing, stripping, and removing titles early
    # HumanName parsing can be sensitive, so handle potential errors
    try:
    parsed_name = HumanName(full_name_str.lower().strip())
    parsed_name.title = '' # Remove titles
    except Exception as e:
    # print(f"Warning: Could not parse name '{full_name_str}': {e}") # Optional: log parsing errors
    return {
    'original': full_name_str,
    'valid_for_comparison': False,
    'error_message': f"Parsing error: {e}"
    }


    first_name_elements = []
    if parsed_name.first:
    first_parts.append(parsed_name.first)
    first_name_elements.append(parsed_name.first)
    if parsed_name.middle:
    first_parts.append(parsed_name.middle)

    first_name = ' '.join(first_parts).lower()
    last_name = parsed_name.last.lower() if parsed_name.last else ""
    # Split middle names into parts (e.g., "J. R." -> ["J.", "R."], "Mary Anne" -> ["Mary", "Anne"])
    first_name_elements.extend(parsed_name.middle.split())

    # Ensure all parts are actual strings and lowercased
    first_parts_list = [part.lower() for part in first_name_elements if part]
    last_name_str = parsed_name.last.lower() if parsed_name.last else ""

    # If essential parts are missing after parsing, mark as not valid for comparison
    if not first_parts_list or not last_name_str:
    return {
    'original': full_name_str,
    'valid_for_comparison': False,
    'error_message': "Missing first or last name after parsing."
    }

    first_parts_no_accents = [remove_accents(part) for part in first_parts_list]

    return first_name, last_name
    first_name_parts_nick_sets = []
    for part in first_parts_list:
    # Generate nickname variations for each part
    # Using try-except for nickname lookups as they might fail for unusual inputs
    try:
    variations = nn_instance.nicknames_of(part) | nn_instance.canonicals_of(part) | {part}
    first_name_parts_nick_sets.append(variations)
    except Exception as e:
    # print(f"Warning: Nickname lookup failed for part '{part}' in name '{full_name_str}': {e}") # Optional
    first_name_parts_nick_sets.append({part}) # Default to just the part itself

    def are_names_similar(name1, name2, first_name_threshold, last_name_threshold, last_name_distance, first_name_distance):
    # Handle potential None or empty inputs
    if not name1 or not name2:
    return False
    full_first_name_str = ' '.join(first_parts_list)

    return {
    'original': full_name_str, # Store the original unprocessed name for output
    'valid_for_comparison': True,
    'first_parts': first_parts_list,
    'last_name': last_name_str,
    'first_parts_no_accents': first_parts_no_accents,
    'first_name_parts_nick_sets': first_name_parts_nick_sets,
    'full_first_name_for_fuzz': full_first_name_str
    }

    def are_names_similar_optimized(p_name1_data, p_name2_data, first_name_threshold, last_name_threshold, last_name_distance, first_name_distance):
    """
    Compares two preprocessed name data structures.
    Args:
    p_name1_data (dict): Preprocessed data for the first name.
    p_name2_data (dict): Preprocessed data for the second name.
    first_name_threshold (int): Min similarity ratio for full first names.
    last_name_threshold (int): Min similarity ratio for last names.
    last_name_distance (int): Max Levenshtein distance for last names.
    first_name_distance (int): Max Levenshtein distance for first name parts.
    first1, last1 = get_name_parts(name1)
    first2, last2 = get_name_parts(name2)
    Returns:
    bool: True if names are considered similar, False otherwise.
    """
    # Ensure both names were processed successfully and have essential parts
    if not p_name1_data.get('valid_for_comparison', False) or not p_name2_data.get('valid_for_comparison', False):
    return False

    last1 = p_name1_data['last_name']
    last2 = p_name2_data['last_name']

    # Last name similarity check
    last_name_ratio = fuzz.ratio(last1, last2)

    # If either name is missing essential parts, return False
    if not (first1 and last1 and first2 and last2):
    lev_dist_last = 0
    if len(last1) > 0 and len(last2) > 0 and last1[0] != last2[0] and last1[1:] == last2[1:]:
    lev_dist_last = 2
    elif last1 and last2:
    lev_dist_last = Levenshtein.distance(last1, last2)
    elif last1 != last2:
    lev_dist_last = max(len(last1), len(last2))

    if last_name_ratio < last_name_threshold and lev_dist_last > last_name_distance:
    return False

    # First check if last names are similar enough
    last_name_ratio = fuzz.ratio(last1, last2)

    # Most of the time the first letter of the last name will not contain a typo
    if last1[0] != last2[0] and last1[1:] == last2[1:]:
    levenshtein_distance = 2
    else:
    levenshtein_distance = Levenshtein.distance(last1, last2)
    # --- First Name Checks ---
    full_first1 = p_name1_data['full_first_name_for_fuzz']
    full_first2 = p_name2_data['full_first_name_for_fuzz']

    if last_name_ratio < last_name_threshold and levenshtein_distance > last_name_distance:
    return False

    # If first names are exactly the same, we're done
    if first1 == first2:
    if full_first1 == full_first2:
    return True

    # Split first names into parts
    first1_parts = first1.split()
    first2_parts = first2.split()

    # Check for direct match by Levenshtein distance
    # If any part of the first name is within the distance threshold, consider it a match

    first1_parts = p_name1_data['first_parts']
    first2_parts = p_name2_data['first_parts']

    # Check for Levenshtein distance on individual first name parts
    for part1 in first1_parts:
    for part2 in first2_parts:
    if Levenshtein.distance(part1, part2) <= first_name_distance:
    return True

    # Check for accent-insensitive matches
    # Strip accents and compare directly
    for part1 in first1_parts:
    for part2 in first2_parts:
    if remove_accents(part1) == remove_accents(part2):

    # Check for accent-insensitive matches on individual first name parts
    first1_parts_no_accents = p_name1_data['first_parts_no_accents']
    first2_parts_no_accents = p_name2_data['first_parts_no_accents']
    for p1_no_accent in first1_parts_no_accents:
    for p2_no_accent in first2_parts_no_accents:
    if p1_no_accent == p2_no_accent:
    return True

    # Check nicknames using the nicknames library
    nn = NickNamer()

    for part1 in first1_parts:
    for part2 in first2_parts:
    part1_variations = nn.nicknames_of(part1) | nn.canonicals_of(part1) | {part1}
    part2_variations = nn.nicknames_of(part2) | nn.canonicals_of(part2) | {part2}
    if part1_variations & part2_variations:
    # Check nicknames using precomputed sets
    p_name1_nick_sets = p_name1_data['first_name_parts_nick_sets']
    p_name2_nick_sets = p_name2_data['first_name_parts_nick_sets']
    for set1 in p_name1_nick_sets:
    for set2 in p_name2_nick_sets:
    if set1 & set2: # Check for intersection
    return True

    # Compare full first names with fuzzy matching as a last resort
    first_name_ratio = fuzz.token_sort_ratio(first1, first2)
    first_name_ratio = fuzz.token_sort_ratio(full_first1, full_first2)
    return first_name_ratio >= first_name_threshold

    def main():
    parser = argparse.ArgumentParser(description='Find similar names in a text file.')
    parser = argparse.ArgumentParser(
    description='Find similar names in a text file.',
    formatter_class=argparse.RawTextHelpFormatter # To preserve formatting of help text
    )
    parser.add_argument('input_file', help='Text file containing names (one per line)')
    parser.add_argument('--first-threshold', type=int, default=75,
    help='Threshold for first name similarity (default: 75)')
    help='Threshold for first name similarity (0-100, default: 75)')
    parser.add_argument('--last-threshold', type=int, default=95,
    help='Threshold for last name similarity (default: 95)')
    help='Threshold for last name similarity (0-100, default: 95)')
    parser.add_argument('--last-distance', type=int, default=1,
    help='Maximum Levenshtein distance for last names (default: 1)')
    parser.add_argument('--first-distance', type=int, default=1,
    @@ -181,66 +243,102 @@ def main():

    args = parser.parse_args()

    # Create default output filename if not provided
    if args.output_file is None:
    # Split the input file path and change the extension to .csv
    input_base, input_ext = os.path.splitext(args.input_file)
    args.output_file = input_base + '.csv'
    input_base, _ = os.path.splitext(args.input_file)
    args.output_file = input_base + '_matches.csv' # Added _matches to avoid overwriting input if same name

    try:
    # Use universal newlines mode and properly handle UTF-8 encoding
    with open(args.input_file, 'r', encoding='utf-8', newline=None) as f:
    # Strip whitespace and filter out empty lines
    names = [line.strip() for line in f if line.strip()]
    names = [name for name in names if len(name) > 0]

    with open(args.input_file, 'r', encoding='utf-8') as f:
    raw_names = [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
    print(f"Error: Could not find file '{args.input_file}'")
    return
    except UnicodeDecodeError:
    print(f"Error: File encoding issue. Please ensure the file is saved in UTF-8 format.")
    print(f"Error: File encoding issue in '{args.input_file}'. Please ensure it is UTF-8.")
    return
    except Exception as e:
    print(f"Error reading file: {e}")
    print(f"Error reading file '{args.input_file}': {e}")
    return

    if len(names) < 2:
    print("Error: Need at least two names to compare")
    if len(raw_names) < 2:
    print("Error: Need at least two names to compare from the input file.")
    return

    # Instantiate NickNamer once
    try:
    nn = NickNamer()
    except Exception as e:
    print(f"Error initializing NickNamer: {e}. Nickname matching will be affected.")
    # Fallback: create a dummy nn that does nothing if NickNamer fails to init
    class DummyNickNamer:
    def nicknames_of(self, name): return set()
    def canonicals_of(self, name): return set()
    nn = DummyNickNamer()


    print("Preprocessing names...")
    processed_names_data = []
    for name_str in raw_names:
    data = preprocess_name_data(name_str, nn)
    if data:
    processed_names_data.append(data)

    valid_processed_names = [pname for pname in processed_names_data if pname.get('valid_for_comparison', False)]

    skipped_count = len(processed_names_data) - len(valid_processed_names)
    if skipped_count > 0:
    print(f"Warning: Skipped {skipped_count} names that could not be adequately parsed (e.g., missing parts or parsing errors).")
    # Optionally, list skipped names and reasons:
    # for pname_data in processed_names_data:
    # if not pname_data.get('valid_for_comparison', False):
    # print(f" - Skipped: '{pname_data.get('original', 'N/A')}' Reason: {pname_data.get('error_message', 'Unknown')}")


    if len(valid_processed_names) < 2:
    print("Error: Need at least two validly parsed names to compare.")
    return

    print(f"Comparing {len(valid_processed_names)} processed names...")
    similar_pairs = []
    for name1, name2 in combinations(names, 2):
    # Using combinations on the list of preprocessed data
    for p_name1_data, p_name2_data in combinations(valid_processed_names, 2):
    try:
    if are_names_similar(name1, name2,
    args.first_threshold,
    args.last_threshold,
    args.last_distance,
    args.first_distance):
    similar_pairs.append((name1, name2))
    if are_names_similar_optimized(p_name1_data, p_name2_data,
    args.first_threshold,
    args.last_threshold,
    args.last_distance,
    args.first_distance):
    # Store the original name strings for the output
    similar_pairs.append((p_name1_data['original'], p_name2_data['original']))
    except Exception as e:
    print(f"Warning: Error processing names '{name1}' and '{name2}': {e}")
    continue

    name1_orig = p_name1_data.get('original', 'Unknown Name 1')
    name2_orig = p_name2_data.get('original', 'Unknown Name 2')
    print(f"Warning: Error during comparison of '{name1_orig}' and '{name2_orig}': {e}")
    continue # Continue to the next pair

    # Output results to console
    if similar_pairs:
    print(f"Found {len(similar_pairs)} potentially matching pairs.")
    print(f"Writing results to {args.output_file}")
    print(f"\nFound {len(similar_pairs)} potentially matching pairs:")
    for pair in similar_pairs:
    print(f"{pair[0]} <-> {pair[1]}")
    print(f" {pair[0]} <-> {pair[1]}")
    print(f"\nWriting results to {args.output_file}") # Moved this line here for better flow
    else:
    print("\nNo matching names found.")

    # Write results to CSV file
    try:
    with open(args.output_file, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['Name1', 'Name2']) # Header row
    csv_writer.writerow(['Name1', 'Name2']) # Header row
    csv_writer.writerows(similar_pairs)

    if similar_pairs:
    print(f"\nResults successfully written to {args.output_file}")
    print(f"Results successfully written to {args.output_file}")
    elif not similar_pairs and os.path.exists(args.output_file): # If no pairs, but file was created
    print(f"An empty CSV file with headers has been created at {args.output_file}")

    except Exception as e:
    print(f"Error writing to CSV file: {e}")
    print(f"Error writing to CSV file '{args.output_file}': {e}")

    if __name__ == "__main__":
    main()
    main()
  2. greg-randall revised this gist Mar 28, 2025. No changes.
  3. greg-randall revised this gist Mar 28, 2025. 1 changed file with 84 additions and 13 deletions.
    97 changes: 84 additions & 13 deletions name_cleaner.py
    Original file line number Diff line number Diff line change
    @@ -4,7 +4,9 @@
    # Basic usage with default thresholds:
    python3 nametest.py sample_names.txt
    # With custom thresholds:
    python3 nametest.py sample_names.txt --first-threshold 80 --last-threshold 90 --last-distance 2
    python3 nametest.py sample_names.txt --first-threshold 80 --last-threshold 90 --last-distance 2 --first-distance 1
    # With custom output file:
    python3 nametest.py sample_names.txt --output-file my_matches.csv
    This script implements a flexible name matching system that identifies potentially equivalent names
    while accounting for common variations in how names are written. It's particularly useful for
    @@ -13,31 +15,38 @@
    Key Features:
    - Handles common nickname variations (e.g., "Anthony" <-> "Tony", "Theodore" <-> "Ted")
    - Tolerates minor typos in last names (up to 1 character difference)
    - Tolerates minor typos in both first and last names (up to specified character differences)
    - Ignores professional/honorary titles (e.g., "Dr.", "Senator")
    - Supports middle names/initials
    - Uses separate similarity thresholds for first and last names
    - Accent-insensitive comparison (e.g., "José" <-> "Jose")
    - Outputs matched names to CSV file for further processing
    Matching Rules:
    1. Last names must be nearly identical (default 95% similarity or max 1 character difference)
    2. First names can match in any of these ways:
    - Exact string match
    - Known nickname variation (using the nicknames library)
    - Levenshtein distance within threshold (default 1 character)
    - Accent-insensitive exact match
    - Fuzzy string similarity above threshold (default 75%)
    Example Matches:
    Anthony Smith <-> Tony Smith # Nickname variation
    Maxwell Jones <-> Max Jones # Common shortening
    Geoffrey Greg <-> Geoff Gregg # Typo in last name
    Senator Zachary Williams <-> Zack Williams # Title removed + nickname
    Patrick Moore <-> Dr. Pat Moore # Title removed + nickname
    Theodore J. Johnson <-> Ted Johnson # Nickname + middle initial
    Anthony Smith <-> Tony Smith # Nickname variation
    Maxwell Jones <-> Max Jones # Common shortening
    Geoffrey Greg <-> Geoff Gregg # Typo in last name
    Senator Zachary Williams <-> Zack Williams # Title removed + nickname
    Patrick Moore <-> Dr. Pat Moore # Title removed + nickname
    Theodore J. Johnson <-> Ted Johnson # Nickname + middle initial
    Hortense Félicité de Mailly <-> Hortense Felicite de Mailly # Multiple accent marks ignored
    Dependencies:
    - nameparser: For structured name parsing
    - thefuzz: For fuzzy string matching
    - Levenshtein: For edit distance calculation
    - nicknames: For nickname/canonical name lookups
    - unicodedata: For accent normalization
    - csv: For output formatting
    """

    from nameparser import HumanName
    @@ -46,6 +55,26 @@
    from nicknames import NickNamer
    from itertools import combinations
    import argparse
    import unicodedata
    import os
    import csv

    def remove_accents(text):
    """
    Removes all accent marks from a string.
    Args:
    text (str): Text with possible accent marks
    Returns:
    str: Text with accent marks removed
    """
    if not text:
    return ""
    # Normalize to decomposed form (separate base characters from accents)
    nfkd_form = unicodedata.normalize('NFKD', text)
    # Return only the base characters (remove the accent marks)
    return ''.join([c for c in nfkd_form if not unicodedata.combining(c)])

    def normalize_name(name):
    # Handle potential None or empty strings
    @@ -75,7 +104,7 @@ def get_name_parts(full_name):

    return first_name, last_name

    def are_names_similar(name1, name2, first_name_threshold, last_name_threshold, last_name_distance):
    def are_names_similar(name1, name2, first_name_threshold, last_name_threshold, last_name_distance, first_name_distance):
    # Handle potential None or empty inputs
    if not name1 or not name2:
    return False
    @@ -102,11 +131,27 @@ def are_names_similar(name1, name2, first_name_threshold, last_name_threshold, l
    # If first names are exactly the same, we're done
    if first1 == first2:
    return True

    # Split first names into parts
    first1_parts = first1.split()
    first2_parts = first2.split()

    # Check for direct match by Levenshtein distance
    # If any part of the first name is within the distance threshold, consider it a match
    for part1 in first1_parts:
    for part2 in first2_parts:
    if Levenshtein.distance(part1, part2) <= first_name_distance:
    return True

    # Check for accent-insensitive matches
    # Strip accents and compare directly
    for part1 in first1_parts:
    for part2 in first2_parts:
    if remove_accents(part1) == remove_accents(part2):
    return True

    # Check nicknames using the nicknames library
    nn = NickNamer()
    first1_parts = first1.split()
    first2_parts = first2.split()

    for part1 in first1_parts:
    for part2 in first2_parts:
    @@ -116,6 +161,7 @@ def are_names_similar(name1, name2, first_name_threshold, last_name_threshold, l
    if part1_variations & part2_variations:
    return True

    # Compare full first names with fuzzy matching as a last resort
    first_name_ratio = fuzz.token_sort_ratio(first1, first2)
    return first_name_ratio >= first_name_threshold

    @@ -128,9 +174,19 @@ def main():
    help='Threshold for last name similarity (default: 95)')
    parser.add_argument('--last-distance', type=int, default=1,
    help='Maximum Levenshtein distance for last names (default: 1)')
    parser.add_argument('--first-distance', type=int, default=1,
    help='Maximum Levenshtein distance for first name parts (default: 1)')
    parser.add_argument('--output-file', type=str, default=None,
    help='Path to output CSV file (default: input_filename.csv)')

    args = parser.parse_args()

    # Create default output filename if not provided
    if args.output_file is None:
    # Split the input file path and change the extension to .csv
    input_base, input_ext = os.path.splitext(args.input_file)
    args.output_file = input_base + '.csv'

    try:
    # Use universal newlines mode and properly handle UTF-8 encoding
    with open(args.input_file, 'r', encoding='utf-8', newline=None) as f:
    @@ -158,18 +214,33 @@ def main():
    if are_names_similar(name1, name2,
    args.first_threshold,
    args.last_threshold,
    args.last_distance):
    args.last_distance,
    args.first_distance):
    similar_pairs.append((name1, name2))
    except Exception as e:
    print(f"Warning: Error processing names '{name1}' and '{name2}': {e}")
    continue

    # Output results to console
    if similar_pairs:
    print("Potentially matching names:")
    print(f"Found {len(similar_pairs)} potentially matching pairs.")
    print(f"Writing results to {args.output_file}")
    for pair in similar_pairs:
    print(f"{pair[0]} <-> {pair[1]}")
    else:
    print("\nNo matching names found.")

    # Write results to CSV file
    try:
    with open(args.output_file, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['Name1', 'Name2']) # Header row
    csv_writer.writerows(similar_pairs)

    if similar_pairs:
    print(f"\nResults successfully written to {args.output_file}")
    except Exception as e:
    print(f"Error writing to CSV file: {e}")

    if __name__ == "__main__":
    main()
  4. greg-randall revised this gist Dec 5, 2024. 1 changed file with 6 additions and 1 deletion.
    7 changes: 6 additions & 1 deletion name_cleaner.py
    Original file line number Diff line number Diff line change
    @@ -89,7 +89,12 @@ def are_names_similar(name1, name2, first_name_threshold, last_name_threshold, l

    # First check if last names are similar enough
    last_name_ratio = fuzz.ratio(last1, last2)
    levenshtein_distance = Levenshtein.distance(last1, last2)

    # Most of the time the first letter of the last name will not contain a typo
    if last1[0] != last2[0] and last1[1:] == last2[1:]:
    levenshtein_distance = 2
    else:
    levenshtein_distance = Levenshtein.distance(last1, last2)

    if last_name_ratio < last_name_threshold and levenshtein_distance > last_name_distance:
    return False
  5. greg-randall revised this gist Dec 5, 2024. 1 changed file with 1 addition and 5 deletions.
    6 changes: 1 addition & 5 deletions name_cleaner.py
    Original file line number Diff line number Diff line change
    @@ -1,15 +1,11 @@
    """
    Name Matching Algorithm with Nickname and Typo Tolerance
    # Basic usage with default thresholds:
    python3 sample_names.txt names.txt
    python3 nametest.py sample_names.txt
    # With custom thresholds:
    python3 nametest.py sample_names.txt --first-threshold 80 --last-threshold 90 --last-distance 2
    This script implements a flexible name matching system that identifies potentially equivalent names
    while accounting for common variations in how names are written. It's particularly useful for
    deduplicating contact lists, matching author names, or identifying the same person across different
  6. greg-randall revised this gist Dec 5, 2024. 1 changed file with 129 additions and 142 deletions.
    271 changes: 129 additions & 142 deletions name_cleaner.py
    Original file line number Diff line number Diff line change
    @@ -1,187 +1,174 @@
    """
    Tries to match names that are pretty similar.
    The last names need to be nearly identical, but will still match if there's a single-character typo.
    The first names have titles dropped and are then compared, but require a lower similarity to be output.
    Example Output:
    Potentially matching names:
    Anthony Smith <-> Tony Smith
    Maxwell Jones <-> Max Jones
    Geoffrey Greg <-> Geoffrey Gregg
    Senator Zachary Williams <-> Zack Williams
    Patrick Moore <-> Dr. Pat Moore
    Theodore J. Johnson <-> Ted Johnson
    """
    Name Matching Algorithm with Nickname and Typo Tolerance
    # Basic usage with default thresholds:
    python3 sample_names.txt names.txt
    # With custom thresholds:
    python3 nametest.py sample_names.txt --first-threshold 80 --last-threshold 90 --last-distance 2
    This script implements a flexible name matching system that identifies potentially equivalent names
    while accounting for common variations in how names are written. It's particularly useful for
    deduplicating contact lists, matching author names, or identifying the same person across different
    databases.
    Key Features:
    - Handles common nickname variations (e.g., "Anthony" <-> "Tony", "Theodore" <-> "Ted")
    - Tolerates minor typos in last names (up to 1 character difference)
    - Ignores professional/honorary titles (e.g., "Dr.", "Senator")
    - Supports middle names/initials
    - Uses separate similarity thresholds for first and last names
    Matching Rules:
    1. Last names must be nearly identical (default 95% similarity or max 1 character difference)
    2. First names can match in any of these ways:
    - Exact string match
    - Known nickname variation (using the nicknames library)
    - Fuzzy string similarity above threshold (default 75%)
    Example Matches:
    Anthony Smith <-> Tony Smith # Nickname variation
    Maxwell Jones <-> Max Jones # Common shortening
    Geoffrey Greg <-> Geoff Gregg # Typo in last name
    Senator Zachary Williams <-> Zack Williams # Title removed + nickname
    Patrick Moore <-> Dr. Pat Moore # Title removed + nickname
    Theodore J. Johnson <-> Ted Johnson # Nickname + middle initial
    Dependencies:
    - nameparser: For structured name parsing
    - thefuzz: For fuzzy string matching
    - Levenshtein: For edit distance calculation
    - nicknames: For nickname/canonical name lookups
    """

    from nameparser import HumanName
    from thefuzz import fuzz
    from itertools import combinations
    import Levenshtein

    names = [
    "Anthony Smith",
    "Tony Smith",
    "Maxwell Jones",
    "Max Jones",
    "Geoffrey Greg",
    "Senator Zachary Williams",
    "Zack Williams",
    "Patrick Moore",
    "Dr. Pat Moore",
    "Theodore J. Johnson",
    "Ted Johnson",
    "Geoffrey Gregg",
    ]

    nicknames = {
    # B names
    'bill': ['william', 'will', 'billy', 'willie'],
    'bob': ['robert', 'rob', 'bobby', 'robbie', 'bert'],
    'ben': ['benjamin', 'benji', 'benny'],
    'bert': ['herbert', 'albert', 'bertram', 'roberto'],

    # C names
    'chuck': ['charles', 'charlie', 'chas'],
    'chris': ['christopher', 'christian'],

    # D names
    'dave': ['david', 'davey', 'davie'],
    'dan': ['daniel', 'danny', 'dannie'],
    'dick': ['richard', 'rich', 'ricky', 'ricardo'],

    # E names
    'ed': ['edward', 'eddie', 'eduardo', 'edwin'],
    'eli': ['elijah', 'elias'],

    # F names
    'fred': ['frederick', 'freddie', 'fredrick'],
    'frank': ['francis', 'francisco', 'franklin'],

    # G names
    'greg': ['gregory', 'gregg', 'greggory'],
    'gabe': ['gabriel', 'gaby'],
    'gus': ['augustus', 'gustav', 'augusto'],

    # H names
    'hal': ['harold', 'harry'],
    'hank': ['henry', 'heinrich'],

    # J names
    'jim': ['james', 'jimmy', 'jamie'],
    'joe': ['joseph', 'joey', 'jose'],
    'jack': ['john', 'jonathan', 'johnny'],
    'jerry': ['jerome', 'gerald', 'geraldo'],
    'jeff': ['jeffrey', 'geoffrey'],

    # K names
    'ken': ['kenneth', 'kenny'],
    'kit': ['christopher', 'christian'],

    # L names
    'larry': ['lawrence', 'laurence', 'lorenzo'],
    'len': ['leonard', 'leonardo'],

    # M names
    'mike': ['michael', 'mickey', 'mick'],
    'matt': ['matthew', 'mathew', 'mateo'],
    'max': ['maxwell', 'maximilian', 'maximillian'],

    # N names
    'nick': ['nicholas', 'nicolas', 'nico'],
    'nat': ['nathan', 'nathaniel', 'nataniel'],

    # P names
    'pat': ['patrick', 'patricia'],
    'pete': ['peter', 'pedro'],
    'phil': ['phillip', 'philip', 'felipe'],

    # R names
    'ron': ['ronald', 'ronnie', 'ronny'],
    'ray': ['raymond', 'raymund'],
    'rick': ['richard', 'ricardo'],

    # S names
    'sam': ['samuel', 'sammy', 'sammie'],
    'stan': ['stanley', 'stanford'],
    'steve': ['steven', 'stephen', 'esteban'],

    # T names
    'ted': ['theodore', 'edmund', 'eduardo'],
    'tom': ['thomas', 'tommy', 'tomas'],
    'tony': ['anthony', 'antonio'],

    # V names
    'vic': ['victor', 'vincent'],
    'val': ['valentine', 'valentin'],

    # W names
    'walt': ['walter', 'wallace'],
    'will': ['william', 'wilhelm', 'guillermo'],

    # Z names
    'zack': ['zachary', 'zachariah', 'zach']
    }
    from nicknames import NickNamer
    from itertools import combinations
    import argparse

    def normalize_name(name):
    # Parse the name using HumanName
    parsed_name = HumanName(name.lower())

    # Remove titles
    # Handle potential None or empty strings
    if not name:
    return ""
    # Lowercase and trim surrounding whitespace before parsing
    parsed_name = HumanName(name.lower().strip())
    parsed_name.title = ''

    return str(parsed_name).strip()

    def get_name_parts(full_name):
    # Handle potential None or empty strings
    if not full_name:
    return "", ""

    parsed_name = HumanName(full_name)

    # Combine all first name parts (first name and middle names)

    first_parts = []
    if parsed_name.first:
    first_parts.append(parsed_name.first)
    if parsed_name.middle:
    first_parts.append(parsed_name.middle)

    first_name = ' '.join(first_parts).lower()
    last_name = parsed_name.last.lower()
    last_name = parsed_name.last.lower() if parsed_name.last else ""

    return first_name, last_name

    def are_names_similar(name1, name2, first_name_threshold=75, last_name_threshold=95):
    def are_names_similar(name1, name2, first_name_threshold, last_name_threshold, last_name_distance):
    # Handle potential None or empty inputs
    if not name1 or not name2:
    return False

    first1, last1 = get_name_parts(name1)
    first2, last2 = get_name_parts(name2)

    # If either name is missing essential parts, return False
    if not (first1 and last1 and first2 and last2):
    return False

    # First check if last names are similar enough
    last_name_ratio = fuzz.ratio(last1, last2)
    levenshtein_distance = Levenshtein.distance(last1, last2)

    if last_name_ratio < last_name_threshold and levenshtein_distance > 1:
    if last_name_ratio < last_name_threshold and levenshtein_distance > last_name_distance:
    return False

    # If last names match, check first names
    # If first names are exactly the same, we're done
    if first1 == first2:
    return True

    # Check nicknames
    # Check nicknames using the nicknames library
    nn = NickNamer()
    first1_parts = first1.split()
    first2_parts = first2.split()

    for part1 in first1_parts:
    for part2 in first2_parts:
    for nickname, variants in nicknames.items():
    if (part1 == nickname and part2 in variants) or \
    (part2 == nickname and part1 in variants):
    return True
    part1_variations = nn.nicknames_of(part1) | nn.canonicals_of(part1) | {part1}
    part2_variations = nn.nicknames_of(part2) | nn.canonicals_of(part2) | {part2}

    if part1_variations & part2_variations:
    return True

    # If no nickname matches, try fuzzy matching on first names
    first_name_ratio = fuzz.token_sort_ratio(first1, first2)
    return first_name_ratio >= first_name_threshold

    # Find similar pairs
    similar_pairs = []
    for name1, name2 in combinations(names, 2):
    if are_names_similar(name1, name2):
    similar_pairs.append((name1, name2))
    def main():
    parser = argparse.ArgumentParser(description='Find similar names in a text file.')
    parser.add_argument('input_file', help='Text file containing names (one per line)')
    parser.add_argument('--first-threshold', type=int, default=75,
    help='Threshold for first name similarity (default: 75)')
    parser.add_argument('--last-threshold', type=int, default=95,
    help='Threshold for last name similarity (default: 95)')
    parser.add_argument('--last-distance', type=int, default=1,
    help='Maximum Levenshtein distance for last names (default: 1)')

    args = parser.parse_args()

    try:
    # Use universal newlines mode and properly handle UTF-8 encoding
    with open(args.input_file, 'r', encoding='utf-8', newline=None) as f:
    # Strip whitespace and filter out empty lines
    names = [line.strip() for line in f if line.strip()]
    names = [name for name in names if len(name) > 0]

    except FileNotFoundError:
    print(f"Error: Could not find file '{args.input_file}'")
    return
    except UnicodeDecodeError:
    print(f"Error: File encoding issue. Please ensure the file is saved in UTF-8 format.")
    return
    except Exception as e:
    print(f"Error reading file: {e}")
    return

    if len(names) < 2:
    print("Error: Need at least two names to compare")
    return

    similar_pairs = []
    for name1, name2 in combinations(names, 2):
    try:
    if are_names_similar(name1, name2,
    args.first_threshold,
    args.last_threshold,
    args.last_distance):
    similar_pairs.append((name1, name2))
    except Exception as e:
    print(f"Warning: Error processing names '{name1}' and '{name2}': {e}")
    continue

    if similar_pairs:
    print("Potentially matching names:")
    for pair in similar_pairs:
    print(f"{pair[0]} <-> {pair[1]}")
    else:
    print("\nNo matching names found.")

    print("Potentially matching names:")
    for pair in similar_pairs:
    print(f"{pair[0]} <-> {pair[1]}")
    if __name__ == "__main__":
    main()
  7. greg-randall revised this gist Dec 5, 2024. 1 changed file with 26 additions and 17 deletions.
    43 changes: 26 additions & 17 deletions name_cleaner.py
    Original file line number Diff line number Diff line change
    @@ -1,30 +1,38 @@
    """
    Tries to match names that are pretty similar.
    The last names need to be nearly identical, but will still match if there's a single-character typo.
    The first names have titles dropped, and are compared but require a lower similarity for output.
    Example Output:
    Potentially matching names:
    Anthony Smith <-> Tony Smith
    Maxwell Jones <-> Max Jones
    Geoffrey Greg <-> Geoffrey Gregg
    Senator Zachary Williams <-> Zack Williams
    Patrick Moore <-> Dr. Pat Moore
    Theodore J. Johnson <-> Ted Johnson
    """

    from nameparser import HumanName
    from thefuzz import fuzz
    from itertools import combinations
    import Levenshtein

    names = [
    "Anthony Smith",
    "Tony Smith",
    "Maxwell Jones",
    "Max Jones",
    "Zachary Williams",
    "Geoffrey Greg",
    "Senator Zachary Williams",
    "Zack Williams",
    "Patrick Moore",
    "Pat Moore",
    "Dr. Pat Moore",
    "Theodore J. Johnson",
    "Ted Johnson"
    "Ted Johnson",
    "Geoffrey Gregg",
    ]

    TITLES_TO_REMOVE = [
    'honorable',
    'the',
    'judge',
    'hon',
    'dr',
    'professor',
    ]

    # Keep your existing nicknames dictionary
    nicknames = {
    # B names
    'bill': ['william', 'will', 'billy', 'willie'],
    @@ -118,9 +126,8 @@ def normalize_name(name):
    # Parse the name using HumanName
    parsed_name = HumanName(name.lower())

    # Remove titles if they're in our removal list
    if parsed_name.title and parsed_name.title.lower() in TITLES_TO_REMOVE:
    parsed_name.title = ''
    # Remove titles
    parsed_name.title = ''

    return str(parsed_name).strip()

    @@ -145,7 +152,9 @@ def are_names_similar(name1, name2, first_name_threshold=75, last_name_threshold

    # First check if last names are similar enough
    last_name_ratio = fuzz.ratio(last1, last2)
    if last_name_ratio < last_name_threshold:
    levenshtein_distance = Levenshtein.distance(last1, last2)

    if last_name_ratio < last_name_threshold and levenshtein_distance > 1:
    return False

    # If last names match, check first names
  8. greg-randall revised this gist Dec 5, 2024. 1 changed file with 28 additions and 14 deletions.
    42 changes: 28 additions & 14 deletions name_cleaner.py
    Original file line number Diff line number Diff line change
    @@ -1,3 +1,7 @@
    from nameparser import HumanName
    from thefuzz import fuzz
    from itertools import combinations

    names = [
    "Anthony Smith",
    "Tony Smith",
    @@ -7,15 +11,10 @@
    "Zack Williams",
    "Patrick Moore",
    "Pat Moore",
    "Theodore Johnson",
    "Theodore J. Johnson",
    "Ted Johnson"
    ]


    from thefuzz import fuzz
    from itertools import combinations

    # List of titles to remove during comparison
    TITLES_TO_REMOVE = [
    'honorable',
    'the',
    @@ -25,6 +24,7 @@
    'professor',
    ]

    # Keep your existing nicknames dictionary
    nicknames = {
    # B names
    'bill': ['william', 'will', 'billy', 'willie'],
    @@ -115,16 +115,29 @@
    }

    def normalize_name(name):
    name_lower = name.lower()
    for title in TITLES_TO_REMOVE:
    name_lower = name_lower.replace(title, '')
    return name_lower.strip()
    # Parse the name using HumanName
    parsed_name = HumanName(name.lower())

    # Remove titles if they're in our removal list
    if parsed_name.title and parsed_name.title.lower() in TITLES_TO_REMOVE:
    parsed_name.title = ''

    return str(parsed_name).strip()

    def get_name_parts(full_name):
    parts = normalize_name(full_name).split()
    if len(parts) >= 2:
    return ' '.join(parts[:-1]), parts[-1]
    return full_name, ''
    parsed_name = HumanName(full_name)

    # Combine all first name parts (first name and middle names)
    first_parts = []
    if parsed_name.first:
    first_parts.append(parsed_name.first)
    if parsed_name.middle:
    first_parts.append(parsed_name.middle)

    first_name = ' '.join(first_parts).lower()
    last_name = parsed_name.last.lower()

    return first_name, last_name

    def are_names_similar(name1, name2, first_name_threshold=75, last_name_threshold=95):
    first1, last1 = get_name_parts(name1)
    @@ -154,6 +167,7 @@ def are_names_similar(name1, name2, first_name_threshold=75, last_name_threshold
    first_name_ratio = fuzz.token_sort_ratio(first1, first2)
    return first_name_ratio >= first_name_threshold

    # Find similar pairs
    similar_pairs = []
    for name1, name2 in combinations(names, 2):
    if are_names_similar(name1, name2):
  9. greg-randall renamed this gist Dec 4, 2024. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  10. greg-randall created this gist Dec 4, 2024.
    164 changes: 164 additions & 0 deletions gistfile1.txt
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,164 @@
    # Sample input data: the full names that the script compares pairwise
    # (every unordered pair is passed to are_names_similar further down).
    names = [
    "Anthony Smith",
    "Tony Smith",
    "Maxwell Jones",
    "Max Jones",
    "Zachary Williams",
    "Zack Williams",
    "Patrick Moore",
    "Pat Moore",
    "Theodore Johnson",
    "Ted Johnson"
    ]


    from thefuzz import fuzz
    from itertools import combinations

    # Titles/honorifics that normalize_name() strips from a name before it is
    # compared. normalize_name() lowercases its input first, so every entry
    # here must be lowercase as well.
    TITLES_TO_REMOVE = [
    'honorable',
    'the',
    'judge',
    'hon',
    'dr',
    'professor',
    ]

    # Nickname lookup table used by are_names_similar().
    # Each key is a short form; its list holds the names treated as equivalent
    # to that key. The lookup only pairs a key with one of its own list entries
    # (in either direction), so two entries of the same list (for example
    # 'william' and 'billy') are NOT linked to each other through this table.
    # All entries are lowercase.
    nicknames = {
    # B names
    'bill': ['william', 'will', 'billy', 'willie'],
    'bob': ['robert', 'rob', 'bobby', 'robbie', 'bert'],
    'ben': ['benjamin', 'benji', 'benny'],
    'bert': ['herbert', 'albert', 'bertram', 'roberto'],

    # C names
    'chuck': ['charles', 'charlie', 'chas'],
    'chris': ['christopher', 'christian'],

    # D names
    'dave': ['david', 'davey', 'davie'],
    'dan': ['daniel', 'danny', 'dannie'],
    'dick': ['richard', 'rich', 'ricky', 'ricardo'],

    # E names
    'ed': ['edward', 'eddie', 'eduardo', 'edwin'],
    'eli': ['elijah', 'elias'],

    # F names
    'fred': ['frederick', 'freddie', 'fredrick'],
    'frank': ['francis', 'francisco', 'franklin'],

    # G names
    'greg': ['gregory', 'gregg', 'greggory'],
    'gabe': ['gabriel', 'gaby'],
    'gus': ['augustus', 'gustav', 'augusto'],

    # H names
    'hal': ['harold', 'harry'],
    'hank': ['henry', 'heinrich'],

    # J names
    'jim': ['james', 'jimmy', 'jamie'],
    'joe': ['joseph', 'joey', 'jose'],
    'jack': ['john', 'jonathan', 'johnny'],
    'jerry': ['jerome', 'gerald', 'geraldo'],
    'jeff': ['jeffrey', 'geoffrey'],

    # K names
    'ken': ['kenneth', 'kenny'],
    'kit': ['christopher', 'christian'],

    # L names
    'larry': ['lawrence', 'laurence', 'lorenzo'],
    'len': ['leonard', 'leonardo'],

    # M names
    'mike': ['michael', 'mickey', 'mick'],
    'matt': ['matthew', 'mathew', 'mateo'],
    'max': ['maxwell', 'maximilian', 'maximillian'],

    # N names
    'nick': ['nicholas', 'nicolas', 'nico'],
    'nat': ['nathan', 'nathaniel', 'nataniel'],

    # P names
    'pat': ['patrick', 'patricia'],
    'pete': ['peter', 'pedro'],
    'phil': ['phillip', 'philip', 'felipe'],

    # R names
    'ron': ['ronald', 'ronnie', 'ronny'],
    'ray': ['raymond', 'raymund'],
    'rick': ['richard', 'ricardo'],

    # S names
    'sam': ['samuel', 'sammy', 'sammie'],
    'stan': ['stanley', 'stanford'],
    'steve': ['steven', 'stephen', 'esteban'],

    # T names
    'ted': ['theodore', 'edmund', 'eduardo'],
    'tom': ['thomas', 'tommy', 'tomas'],
    'tony': ['anthony', 'antonio'],

    # V names
    'vic': ['victor', 'vincent'],
    'val': ['valentine', 'valentin'],

    # W names
    'walt': ['walter', 'wallace'],
    'will': ['william', 'wilhelm', 'guillermo'],

    # Z names
    'zack': ['zachary', 'zachariah', 'zach']
    }

    def normalize_name(name):
        """Return *name* lowercased with leading titles removed.

        The name is split on whitespace and leading words are dropped for as
        long as they appear in ``TITLES_TO_REMOVE`` (a trailing "." or "," on
        the word is ignored, so "Dr." counts as "dr"). The remaining words are
        joined with single spaces.

        Titles are matched as whole words only. Deleting them as substrings
        would corrupt ordinary names that merely contain a title's letters,
        e.g. "Theodore" -> "odore", "Anthony" -> "anty", "Andrew" -> "anew".
        Only leading words are removed so that a real name part that happens
        to equal a title (such as the surname "Hon") is kept.

        Args:
            name: Full name as a string; may include titles such as "Dr.".

        Returns:
            The cleaned, lowercase name; an empty string if nothing is left.
        """
        words = name.lower().split()
        titles = set(TITLES_TO_REMOVE)

        # Skip past every leading title word ("the honorable judge ...").
        start = 0
        while start < len(words) and words[start].rstrip('.,') in titles:
            start += 1

        return ' '.join(words[start:])

    def get_name_parts(full_name):
        """Split a full name into ``(first_names, last_name)``.

        The name is first passed through ``normalize_name`` and then split on
        whitespace. The final word is taken as the last name and everything
        before it (first and middle names) is joined with single spaces.

        Args:
            full_name: Full name as a string.

        Returns:
            A ``(first_names, last_name)`` tuple of strings. When fewer than
            two words remain after normalization, ``last_name`` is ``''`` and
            ``first_names`` is whatever normalized text is left (possibly
            ``''``).
        """
        parts = normalize_name(full_name).split()
        if len(parts) >= 2:
            return ' '.join(parts[:-1]), parts[-1]
        # Fall back to the normalized text, not the raw input, so that both
        # return paths yield values in the same (normalized) form.
        return ' '.join(parts), ''

    def are_names_similar(name1, name2, first_name_threshold=75, last_name_threshold=95):
        """Decide whether two full names plausibly refer to the same person.

        Both names are split with ``get_name_parts``. The last names must
        reach ``last_name_threshold`` on ``fuzz.ratio``; otherwise the pair is
        rejected outright. Once the last names pass, the first names match if
        they are identical, if any pair of their words is linked through the
        ``nicknames`` table, or if their ``fuzz.token_sort_ratio`` reaches
        ``first_name_threshold``.

        Args:
            name1: First full name.
            name2: Second full name.
            first_name_threshold: Minimum fuzzy score for the first names.
            last_name_threshold: Minimum fuzzy score for the last names.

        Returns:
            True if the names are considered similar, False otherwise.
        """
        given_a, family_a = get_name_parts(name1)
        given_b, family_b = get_name_parts(name2)

        # Last names act as a gate: no first-name check can rescue a mismatch.
        if fuzz.ratio(family_a, family_b) < last_name_threshold:
            return False

        if given_a == given_b:
            return True

        # A nickname link between any word of one first name and any word of
        # the other is enough; the table is checked in both directions.
        words_a = given_a.split()
        words_b = given_b.split()
        linked_by_nickname = any(
            (word_a == short_form and word_b in variants)
            or (word_b == short_form and word_a in variants)
            for word_a in words_a
            for word_b in words_b
            for short_form, variants in nicknames.items()
        )
        if linked_by_nickname:
            return True

        # Last resort: order-insensitive fuzzy comparison of the first names.
        return fuzz.token_sort_ratio(given_a, given_b) >= first_name_threshold

    # Compare every unordered pair of names once and keep the likely matches.
    similar_pairs = [
        (name1, name2)
        for name1, name2 in combinations(names, 2)
        if are_names_similar(name1, name2)
    ]

    # Report the matches, one pair per line.
    print("Potentially matching names:")
    for pair in similar_pairs:
        print(f"{pair[0]} <-> {pair[1]}")