greg-randall · June 3, 2025 20:28
diff --git a/name_cleaner.py b/name_cleaner.py
 """
 Name Matching Algorithm with Nickname and Typo Tolerance

 # Basic usage with default thresholds:
    python3 name_cleaner.py sample_names.txt
 # With custom thresholds:
    python3 name_cleaner.py sample_names.txt --first-threshold 80 --last-threshold 90 --last-distance 2

 This script implements a flexible name matching system that identifies potentially equivalent names
 while accounting for common variations in how names are written. It's particularly useful for
 deduplicating contact lists, matching author names, or identifying the same person across different
 databases.

 Key Features:
 - Handles common nickname variations (e.g., "Anthony" <-> "Tony", "Theodore" <-> "Ted")
 - Tolerates minor typos in last names (up to 1 character difference)
 - Ignores professional/honorary titles (e.g., "Dr.", "Senator")
 - Supports middle names/initials
 - Uses separate similarity thresholds for first and last names

 Matching Rules:
 1. Last names must be nearly identical (default 95% similarity or max 1 character difference)
 2. First names can match in any of these ways:
   - Exact string match
   - Known nickname variation (using the nicknames library)
   - Fuzzy string similarity above threshold (default 75%)

 Example Matches:
 Anthony Smith <-> Tony Smith                    # Nickname variation
 Maxwell Jones <-> Max Jones                     # Common shortening
 Geoffrey Greg <-> Geoff Gregg                   # Typo in last name
 Senator Zachary Williams <-> Zack Williams      # Title removed + nickname
 Patrick Moore <-> Dr. Pat Moore                 # Title removed + nickname
 Theodore J. Johnson <-> Ted Johnson             # Nickname + middle initial

 Dependencies:
 - nameparser: For structured name parsing
 - thefuzz: For fuzzy string matching
 - Levenshtein: For edit distance calculation
 - nicknames: For nickname/canonical name lookups
 """

 from nameparser import HumanName
 from thefuzz import fuzz
 import Levenshtein
 from nicknames import NickNamer
 from itertools import combinations
 import argparse

 def normalize_name(name):
    """Return *name* lower-cased and re-rendered by nameparser without its title.

    Falsy input (``None`` or an empty string) yields ``""``.
    """
    if name:
        person = HumanName(name.lower().strip())
        # Blank the title so the rendered string omits it.
        person.title = ''
        return str(person).strip()
    return ""

 def get_name_parts(full_name):
    """Split *full_name* into a lower-cased ``(first, last)`` pair.

    The first element is the parsed first name followed by the parsed
    middle name, space-separated, using whichever of the two are present.
    The second element is the parsed last name.  Falsy input (``None`` or
    an empty string) yields ``("", "")``.
    """
    if not full_name:
        return "", ""

    parsed = HumanName(full_name)
    given = " ".join(
        piece for piece in (parsed.first, parsed.middle) if piece
    ).lower()
    surname = (parsed.last or "").lower()
    return given, surname

 def are_names_similar(name1, name2, first_name_threshold, last_name_threshold, last_name_distance):
    """Decide whether two full names plausibly refer to the same person.

    Args:
        name1: First full name.
        name2: Second full name.
        first_name_threshold: Minimum ``fuzz.token_sort_ratio`` score for
            the first/middle names to count as a fuzzy match.
        last_name_threshold: Minimum ``fuzz.ratio`` score for the last names.
        last_name_distance: Maximum Levenshtein distance for the last names.

    Returns:
        ``True`` when the last names pass either the ratio test or the
        edit-distance test AND the first names are identical, share a
        nickname/canonical form, or reach the fuzzy threshold.  ``False``
        otherwise, including when either input is empty or lacks a first
        or last part.
    """
    if not name1 or not name2:
        return False

    first1, last1 = get_name_parts(name1)
    first2, last2 = get_name_parts(name2)

    # Both names need a first and a last part to be comparable.
    if not (first1 and last1 and first2 and last2):
        return False

    last_name_ratio = fuzz.ratio(last1, last2)

    # A typo in the first letter of a last name is treated as unlikely, so a
    # pair that differs only in that letter is scored as distance 2.
    if last1[0] != last2[0] and last1[1:] == last2[1:]:
        levenshtein_distance = 2
    else:
        levenshtein_distance = Levenshtein.distance(last1, last2)

    # Reject only when BOTH last-name tests fail.
    if last_name_ratio < last_name_threshold and levenshtein_distance > last_name_distance:
        return False

    if first1 == first2:
        return True

    nicknamer = _shared_nicknamer()

    def variations(part):
        # The part itself plus every nickname and canonical form known for it.
        return nicknamer.nicknames_of(part) | nicknamer.canonicals_of(part) | {part}

    # NOTE(review): middle names/initials are compared here too, so two names
    # sharing only a middle initial (e.g. "j.") intersect and match -- confirm
    # that this is intended.
    # Build the second name's sets once instead of once per inner iteration.
    second_variations = [variations(part) for part in first2.split()]
    for part in first1.split():
        own_variations = variations(part)
        if any(own_variations & other for other in second_variations):
            return True

    return fuzz.token_sort_ratio(first1, first2) >= first_name_threshold


 def _shared_nicknamer():
    """Return one NickNamer instance, created on first use and then reused."""
    instance = getattr(_shared_nicknamer, "_instance", None)
    if instance is None:
        instance = _shared_nicknamer._instance = NickNamer()
    return instance

 def main():
    """Command-line entry point: print pairs of similar names from a file.

    Reads one name per line from the file named on the command line,
    compares every pair with ``are_names_similar`` using the thresholds
    given by the optional flags, and prints the matching pairs.  A problem
    reading the file, or fewer than two names, is reported on stdout and
    the function returns without comparing anything.
    """
    parser = argparse.ArgumentParser(description='Find similar names in a text file.')
    parser.add_argument('input_file', help='Text file containing names (one per line)')
    parser.add_argument('--first-threshold', type=int, default=75,
                        help='Threshold for first name similarity (default: 75)')
    parser.add_argument('--last-threshold', type=int, default=95,
                        help='Threshold for last name similarity (default: 95)')
    parser.add_argument('--last-distance', type=int, default=1,
                        help='Maximum Levenshtein distance for last names (default: 1)')

    args = parser.parse_args()

    try:
        # 'utf-8-sig' reads plain UTF-8 and also drops a leading byte-order
        # mark, which would otherwise become part of the first name.
        with open(args.input_file, 'r', encoding='utf-8-sig', newline=None) as f:
            stripped_lines = (line.strip() for line in f)
            # Blank lines are skipped.
            names = [name for name in stripped_lines if name]
    except FileNotFoundError:
        print(f"Error: Could not find file '{args.input_file}'")
        return
    except UnicodeDecodeError:
        print("Error: File encoding issue. Please ensure the file is saved in UTF-8 format.")
        return
    except Exception as e:
        print(f"Error reading file: {e}")
        return

    if len(names) < 2:
        print("Error: Need at least two names to compare")
        return

    similar_pairs = []
    for name1, name2 in combinations(names, 2):
        # Best effort: a failure on one pair is reported and the remaining
        # pairs are still compared.
        try:
            if are_names_similar(name1, name2,
                                 args.first_threshold,
                                 args.last_threshold,
                                 args.last_distance):
                similar_pairs.append((name1, name2))
        except Exception as e:
            print(f"Warning: Error processing names '{name1}' and '{name2}': {e}")

    if similar_pairs:
        print("Potentially matching names:")
        for left, right in similar_pairs:
            print(f"{left} <-> {right}")
    else:
        print("\nNo matching names found.")

 # Run the command-line interface only when this file is executed as a script.
 if __name__ == "__main__":
    main()
	"""
	Name Matching Algorithm with Nickname and Typo Tolerance

	# Basic usage with default thresholds:
	python3 name_cleaner.py sample_names.txt
	# With custom thresholds:
	python3 name_cleaner.py sample_names.txt --first-threshold 80 --last-threshold 90 --last-distance 2

	This script implements a flexible name matching system that identifies potentially equivalent names
	while accounting for common variations in how names are written. It's particularly useful for
	deduplicating contact lists, matching author names, or identifying the same person across different
	databases.

	Key Features:
	- Handles common nickname variations (e.g., "Anthony" <-> "Tony", "Theodore" <-> "Ted")
	- Tolerates minor typos in last names (up to 1 character difference)
	- Ignores professional/honorary titles (e.g., "Dr.", "Senator")
	- Supports middle names/initials
	- Uses separate similarity thresholds for first and last names

	Matching Rules:
	1. Last names must be nearly identical (default 95% similarity or max 1 character difference)
	2. First names can match in any of these ways:
	- Exact string match
	- Known nickname variation (using the nicknames library)
	- Fuzzy string similarity above threshold (default 75%)

	Example Matches:
	Anthony Smith <-> Tony Smith # Nickname variation
	Maxwell Jones <-> Max Jones # Common shortening
	Geoffrey Greg <-> Geoff Gregg # Typo in last name
	Senator Zachary Williams <-> Zack Williams # Title removed + nickname
	Patrick Moore <-> Dr. Pat Moore # Title removed + nickname
	Theodore J. Johnson <-> Ted Johnson # Nickname + middle initial

	Dependencies:
	- nameparser: For structured name parsing
	- thefuzz: For fuzzy string matching
	- Levenshtein: For edit distance calculation
	- nicknames: For nickname/canonical name lookups
	"""

	from nameparser import HumanName
	from thefuzz import fuzz
	import Levenshtein
	from nicknames import NickNamer
	from itertools import combinations
	import argparse

	def normalize_name(name):
	    """Return *name* lower-cased and re-rendered by nameparser without its title.

	    Falsy input (``None`` or an empty string) yields ``""``.
	    """
	    if not name:
	        return ""
	    parsed_name = HumanName(name.lower().strip())
	    # Blank the title so the rendered string omits it.
	    parsed_name.title = ''
	    return str(parsed_name).strip()

	def get_name_parts(full_name):
	    """Split *full_name* into a lower-cased ``(first, last)`` pair.

	    The first element is the parsed first name followed by the parsed
	    middle name, space-separated, using whichever of the two are present.
	    The second element is the parsed last name.  Falsy input (``None`` or
	    an empty string) yields ``("", "")``.
	    """
	    if not full_name:
	        return "", ""

	    parsed_name = HumanName(full_name)

	    first_parts = [
	        part for part in (parsed_name.first, parsed_name.middle) if part
	    ]
	    first_name = ' '.join(first_parts).lower()
	    last_name = parsed_name.last.lower() if parsed_name.last else ""

	    return first_name, last_name

	def are_names_similar(name1, name2, first_name_threshold, last_name_threshold, last_name_distance):
	    """Decide whether two full names plausibly refer to the same person.

	    Args:
	        name1: First full name.
	        name2: Second full name.
	        first_name_threshold: Minimum ``fuzz.token_sort_ratio`` score for
	            the first/middle names to count as a fuzzy match.
	        last_name_threshold: Minimum ``fuzz.ratio`` score for the last names.
	        last_name_distance: Maximum Levenshtein distance for the last names.

	    Returns:
	        ``True`` when the last names pass either the ratio test or the
	        edit-distance test AND the first names are identical, share a
	        nickname/canonical form, or reach the fuzzy threshold.  ``False``
	        otherwise, including when either input is empty or lacks a first
	        or last part.
	    """
	    if not name1 or not name2:
	        return False

	    first1, last1 = get_name_parts(name1)
	    first2, last2 = get_name_parts(name2)

	    # Both names need a first and a last part to be comparable.
	    if not (first1 and last1 and first2 and last2):
	        return False

	    last_name_ratio = fuzz.ratio(last1, last2)

	    # A typo in the first letter of a last name is treated as unlikely, so a
	    # pair that differs only in that letter is scored as distance 2.
	    if last1[0] != last2[0] and last1[1:] == last2[1:]:
	        levenshtein_distance = 2
	    else:
	        levenshtein_distance = Levenshtein.distance(last1, last2)

	    # Reject only when BOTH last-name tests fail.
	    if last_name_ratio < last_name_threshold and levenshtein_distance > last_name_distance:
	        return False

	    if first1 == first2:
	        return True

	    nicknamer = _shared_nicknamer()

	    def variations(part):
	        # The part itself plus every nickname and canonical form known for it.
	        return nicknamer.nicknames_of(part) | nicknamer.canonicals_of(part) | {part}

	    # NOTE(review): middle names/initials are compared here too, so two names
	    # sharing only a middle initial (e.g. "j.") intersect and match -- confirm
	    # that this is intended.
	    # Build the second name's sets once instead of once per inner iteration.
	    second_variations = [variations(part) for part in first2.split()]
	    for part in first1.split():
	        own_variations = variations(part)
	        if any(own_variations & other for other in second_variations):
	            return True

	    return fuzz.token_sort_ratio(first1, first2) >= first_name_threshold


	def _shared_nicknamer():
	    """Return one NickNamer instance, created on first use and then reused."""
	    instance = getattr(_shared_nicknamer, "_instance", None)
	    if instance is None:
	        instance = _shared_nicknamer._instance = NickNamer()
	    return instance

	def main():
	    """Command-line entry point: print pairs of similar names from a file.

	    Reads one name per line from the file named on the command line,
	    compares every pair with ``are_names_similar`` using the thresholds
	    given by the optional flags, and prints the matching pairs.  A problem
	    reading the file, or fewer than two names, is reported on stdout and
	    the function returns without comparing anything.
	    """
	    parser = argparse.ArgumentParser(description='Find similar names in a text file.')
	    parser.add_argument('input_file', help='Text file containing names (one per line)')
	    parser.add_argument('--first-threshold', type=int, default=75,
	                        help='Threshold for first name similarity (default: 75)')
	    parser.add_argument('--last-threshold', type=int, default=95,
	                        help='Threshold for last name similarity (default: 95)')
	    parser.add_argument('--last-distance', type=int, default=1,
	                        help='Maximum Levenshtein distance for last names (default: 1)')

	    args = parser.parse_args()

	    try:
	        # 'utf-8-sig' reads plain UTF-8 and also drops a leading byte-order
	        # mark, which would otherwise become part of the first name.
	        with open(args.input_file, 'r', encoding='utf-8-sig', newline=None) as f:
	            stripped_lines = (line.strip() for line in f)
	            # Blank lines are skipped.
	            names = [name for name in stripped_lines if name]
	    except FileNotFoundError:
	        print(f"Error: Could not find file '{args.input_file}'")
	        return
	    except UnicodeDecodeError:
	        print("Error: File encoding issue. Please ensure the file is saved in UTF-8 format.")
	        return
	    except Exception as e:
	        print(f"Error reading file: {e}")
	        return

	    if len(names) < 2:
	        print("Error: Need at least two names to compare")
	        return

	    similar_pairs = []
	    for name1, name2 in combinations(names, 2):
	        # Best effort: a failure on one pair is reported and the remaining
	        # pairs are still compared.
	        try:
	            if are_names_similar(name1, name2,
	                                 args.first_threshold,
	                                 args.last_threshold,
	                                 args.last_distance):
	                similar_pairs.append((name1, name2))
	        except Exception as e:
	            print(f"Warning: Error processing names '{name1}' and '{name2}': {e}")

	    if similar_pairs:
	        print("Potentially matching names:")
	        for left, right in similar_pairs:
	            print(f"{left} <-> {right}")
	    else:
	        print("\nNo matching names found.")

	# Run the command-line interface only when this file is executed as a script.
	if __name__ == "__main__":
	    main()
No results found