greg-randall · June 3, 2025 20:28
diff --git a/name_cleaner.py b/name_cleaner.py
 """ 
 Tries to match names that are pretty similar. 
 The last names need to be nearly identical, but will still match if there's a single-character typo.
 Titles are dropped from the names; the first names are then compared, but a lower similarity is required for a match.

 Example Output:
    Potentially matching names:
    Anthony Smith <-> Tony Smith
    Maxwell Jones <-> Max Jones
    Geoffrey Greg <-> Geoffrey Gregg
    Senator Zachary Williams <-> Zack Williams
    Patrick Moore <-> Dr. Pat Moore
    Theodore J. Johnson <-> Ted Johnson
 """

 from nameparser import HumanName
 from thefuzz import fuzz
 from itertools import combinations
 import Levenshtein

 # Sample input: every unordered pair of these names is compared by the
 # "Find similar pairs" loop at the bottom of this script.
 names = [
    "Anthony Smith",
    "Tony Smith", 
    "Maxwell Jones",
    "Max Jones",
    "Geoffrey Greg",
    "Senator Zachary Williams",
    "Zack Williams",
    "Patrick Moore",
    "Dr. Pat Moore",
    "Theodore J. Johnson",
    "Ted Johnson",
    "Geoffrey Gregg",
 ]

 # Nickname lookup table used by the first-name comparison.
 # Each key is a lower-case short form; its value lists the lower-case names
 # accepted as a match for that key.  are_names_similar only pairs a key with
 # one of its own listed values (in either argument order); two values listed
 # under the same key are NOT matched through this table.
 nicknames = {
    # B names
    'bill': ['william', 'will', 'billy', 'willie'],
    'bob': ['robert', 'rob', 'bobby', 'robbie', 'bert'],
    'ben': ['benjamin', 'benji', 'benny'],
    'bert': ['herbert', 'albert', 'bertram', 'roberto'],
    
    # C names
    'chuck': ['charles', 'charlie', 'chas'],
    'chris': ['christopher', 'christian'],
    
    # D names
    'dave': ['david', 'davey', 'davie'],
    'dan': ['daniel', 'danny', 'dannie'],
    'dick': ['richard', 'rich', 'ricky', 'ricardo'],
    
    # E names
    'ed': ['edward', 'eddie', 'eduardo', 'edwin'],
    'eli': ['elijah', 'elias'],
    
    # F names
    'fred': ['frederick', 'freddie', 'fredrick'],
    'frank': ['francis', 'francisco', 'franklin'],
    
    # G names
    'greg': ['gregory', 'gregg', 'greggory'],
    'gabe': ['gabriel', 'gaby'],
    'gus': ['augustus', 'gustav', 'augusto'],
    
    # H names
    'hal': ['harold', 'harry'],
    'hank': ['henry', 'heinrich'],
    
    # J names
    'jim': ['james', 'jimmy', 'jamie'],
    'joe': ['joseph', 'joey', 'jose'],
    'jack': ['john', 'jonathan', 'johnny'],
    'jerry': ['jerome', 'gerald', 'geraldo'],
    'jeff': ['jeffrey', 'geoffrey'],
    
    # K names
    'ken': ['kenneth', 'kenny'],
    'kit': ['christopher', 'christian'],
    
    # L names
    'larry': ['lawrence', 'laurence', 'lorenzo'],
    'len': ['leonard', 'leonardo'],
    
    # M names
    'mike': ['michael', 'mickey', 'mick'],
    'matt': ['matthew', 'mathew', 'mateo'],
    'max': ['maxwell', 'maximilian', 'maximillian'],
    
    # N names
    'nick': ['nicholas', 'nicolas', 'nico'],
    'nat': ['nathan', 'nathaniel', 'nataniel'],
    
    # P names
    'pat': ['patrick', 'patricia'],
    'pete': ['peter', 'pedro'],
    'phil': ['phillip', 'philip', 'felipe'],
    
    # R names
    'ron': ['ronald', 'ronnie', 'ronny'],
    'ray': ['raymond', 'raymund'],
    'rick': ['richard', 'ricardo'],
    
    # S names
    'sam': ['samuel', 'sammy', 'sammie'],
    'stan': ['stanley', 'stanford'],
    'steve': ['steven', 'stephen', 'esteban'],
    
    # T names
    'ted': ['theodore', 'edmund', 'eduardo'],
    'tom': ['thomas', 'tommy', 'tomas'],
    'tony': ['anthony', 'antonio'],
    
    # V names
    'vic': ['victor', 'vincent'],
    'val': ['valentine', 'valentin'],
    
    # W names
    'walt': ['walter', 'wallace'],
    'will': ['william', 'wilhelm', 'guillermo'],
    
    # Z names
    'zack': ['zachary', 'zachariah', 'zach']
 }

 def normalize_name(name):
    """Return ``name`` lower-cased with its parsed title blanked.

    The lower-cased input is parsed with ``HumanName``, the parsed title is
    set to an empty string, and the stripped string form of the result is
    returned.
    """
    human = HumanName(name.lower())
    human.title = ''
    without_title = str(human)
    return without_title.strip()

 def get_name_parts(full_name):
    """Split ``full_name`` into a lower-cased ``(first_name, last_name)`` pair.

    ``first_name`` is the parsed first and middle names joined by a single
    space (empty pieces are skipped); ``last_name`` is the parsed last name.
    """
    human = HumanName(full_name)

    # Keep only the non-empty given-name pieces, first name before middle.
    given = [piece for piece in (human.first, human.middle) if piece]

    return ' '.join(given).lower(), human.last.lower()

 def are_names_similar(name1, name2, first_name_threshold=75, last_name_threshold=95):
    """Decide whether two full names plausibly refer to the same person.

    Args:
        name1: First full name to compare.
        name2: Second full name to compare.
        first_name_threshold: Minimum ``fuzz.token_sort_ratio`` score for the
            first names when neither equality nor a nickname lookup matched.
        last_name_threshold: Minimum ``fuzz.ratio`` score for the last names.

    Returns:
        ``False`` when the last names fail both the ratio threshold and the
        one-edit allowance.  Otherwise ``True`` if the first names are equal,
        if any pair of first-name tokens is a nickname/variant pair in
        ``nicknames``, or if the fuzzy first-name score reaches the threshold.
    """
    first1, last1 = get_name_parts(name1)
    first2, last2 = get_name_parts(name2)

    # Last names pass if EITHER the fuzzy ratio is high enough OR they are at
    # most one edit apart; only failing both rejects the pair.
    ratio_ok = fuzz.ratio(last1, last2) >= last_name_threshold
    within_one_edit = Levenshtein.distance(last1, last2) <= 1
    if not (ratio_ok or within_one_edit):
        return False

    if first1 == first2:
        return True

    # A token pair matches when one token is a key in the nickname table and
    # the other is listed under that key.
    tokens1 = first1.split()
    tokens2 = first2.split()
    nickname_hit = any(
        right in nicknames.get(left, ()) or left in nicknames.get(right, ())
        for left in tokens1
        for right in tokens2
    )
    if nickname_hit:
        return True

    # Last resort: fuzzy comparison of the whole first-name strings.
    return fuzz.token_sort_ratio(first1, first2) >= first_name_threshold

 # Collect every unordered pair of names that the matcher accepts.
 similar_pairs = [
    (left, right)
    for left, right in combinations(names, 2)
    if are_names_similar(left, right)
 ]

 # Report the accepted pairs, one per line.
 print("Potentially matching names:")
 for first, second in similar_pairs:
    print(f"{first} <-> {second}")
	"""
	Tries to match names that are pretty similar.
	The last names need to be nearly identical, but will still match if there's a single-character typo.
	Titles are dropped from the names; the first names are then compared, but a lower similarity is required for a match.

	Example Output:
	Potentially matching names:
	Anthony Smith <-> Tony Smith
	Maxwell Jones <-> Max Jones
	Geoffrey Greg <-> Geoffrey Gregg
	Senator Zachary Williams <-> Zack Williams
	Patrick Moore <-> Dr. Pat Moore
	Theodore J. Johnson <-> Ted Johnson
	"""

	from nameparser import HumanName
	from thefuzz import fuzz
	from itertools import combinations
	import Levenshtein

	# Sample input: every unordered pair of these names is compared by the
	# "Find similar pairs" loop at the bottom of this script.
	names = [
	"Anthony Smith",
	"Tony Smith",
	"Maxwell Jones",
	"Max Jones",
	"Geoffrey Greg",
	"Senator Zachary Williams",
	"Zack Williams",
	"Patrick Moore",
	"Dr. Pat Moore",
	"Theodore J. Johnson",
	"Ted Johnson",
	"Geoffrey Gregg",
	]

	# Nickname lookup table used by the first-name comparison.
	# Each key is a lower-case short form; its value lists the lower-case names
	# accepted as a match for that key.  are_names_similar only pairs a key with
	# one of its own listed values (in either argument order); two values listed
	# under the same key are NOT matched through this table.
	nicknames = {
	# B names
	'bill': ['william', 'will', 'billy', 'willie'],
	'bob': ['robert', 'rob', 'bobby', 'robbie', 'bert'],
	'ben': ['benjamin', 'benji', 'benny'],
	'bert': ['herbert', 'albert', 'bertram', 'roberto'],

	# C names
	'chuck': ['charles', 'charlie', 'chas'],
	'chris': ['christopher', 'christian'],

	# D names
	'dave': ['david', 'davey', 'davie'],
	'dan': ['daniel', 'danny', 'dannie'],
	'dick': ['richard', 'rich', 'ricky', 'ricardo'],

	# E names
	'ed': ['edward', 'eddie', 'eduardo', 'edwin'],
	'eli': ['elijah', 'elias'],

	# F names
	'fred': ['frederick', 'freddie', 'fredrick'],
	'frank': ['francis', 'francisco', 'franklin'],

	# G names
	'greg': ['gregory', 'gregg', 'greggory'],
	'gabe': ['gabriel', 'gaby'],
	'gus': ['augustus', 'gustav', 'augusto'],

	# H names
	'hal': ['harold', 'harry'],
	'hank': ['henry', 'heinrich'],

	# J names
	'jim': ['james', 'jimmy', 'jamie'],
	'joe': ['joseph', 'joey', 'jose'],
	'jack': ['john', 'jonathan', 'johnny'],
	'jerry': ['jerome', 'gerald', 'geraldo'],
	'jeff': ['jeffrey', 'geoffrey'],

	# K names
	'ken': ['kenneth', 'kenny'],
	'kit': ['christopher', 'christian'],

	# L names
	'larry': ['lawrence', 'laurence', 'lorenzo'],
	'len': ['leonard', 'leonardo'],

	# M names
	'mike': ['michael', 'mickey', 'mick'],
	'matt': ['matthew', 'mathew', 'mateo'],
	'max': ['maxwell', 'maximilian', 'maximillian'],

	# N names
	'nick': ['nicholas', 'nicolas', 'nico'],
	'nat': ['nathan', 'nathaniel', 'nataniel'],

	# P names
	'pat': ['patrick', 'patricia'],
	'pete': ['peter', 'pedro'],
	'phil': ['phillip', 'philip', 'felipe'],

	# R names
	'ron': ['ronald', 'ronnie', 'ronny'],
	'ray': ['raymond', 'raymund'],
	'rick': ['richard', 'ricardo'],

	# S names
	'sam': ['samuel', 'sammy', 'sammie'],
	'stan': ['stanley', 'stanford'],
	'steve': ['steven', 'stephen', 'esteban'],

	# T names
	'ted': ['theodore', 'edmund', 'eduardo'],
	'tom': ['thomas', 'tommy', 'tomas'],
	'tony': ['anthony', 'antonio'],

	# V names
	'vic': ['victor', 'vincent'],
	'val': ['valentine', 'valentin'],

	# W names
	'walt': ['walter', 'wallace'],
	'will': ['william', 'wilhelm', 'guillermo'],

	# Z names
	'zack': ['zachary', 'zachariah', 'zach']
	}

	def normalize_name(name):
		"""Return ``name`` lower-cased with its parsed title blanked.

		The lower-cased input is parsed with ``HumanName``, the parsed title
		is set to an empty string, and the stripped string form of the result
		is returned.
		"""
		# Parse the name using HumanName
		parsed_name = HumanName(name.lower())

		# Remove titles
		parsed_name.title = ''

		return str(parsed_name).strip()

	def get_name_parts(full_name):
		"""Split ``full_name`` into a lower-cased ``(first_name, last_name)`` pair.

		``first_name`` is the parsed first and middle names joined by a single
		space (empty pieces are skipped); ``last_name`` is the parsed last name.
		"""
		parsed_name = HumanName(full_name)

		# Combine all first name parts (first name and middle names)
		first_parts = []
		if parsed_name.first:
			first_parts.append(parsed_name.first)
		if parsed_name.middle:
			first_parts.append(parsed_name.middle)

		first_name = ' '.join(first_parts).lower()
		last_name = parsed_name.last.lower()

		return first_name, last_name

	def are_names_similar(name1, name2, first_name_threshold=75, last_name_threshold=95):
		"""Decide whether two full names plausibly refer to the same person.

		Args:
			name1: First full name to compare.
			name2: Second full name to compare.
			first_name_threshold: Minimum ``fuzz.token_sort_ratio`` score for
				the first names when neither equality nor a nickname lookup
				matched.
			last_name_threshold: Minimum ``fuzz.ratio`` score for the last names.

		Returns:
			``False`` when the last names fail both the ratio threshold and
			the one-edit allowance.  Otherwise ``True`` if the first names are
			equal, if any pair of first-name tokens is a nickname/variant pair
			in ``nicknames``, or if the fuzzy first-name score reaches the
			threshold.
		"""
		first1, last1 = get_name_parts(name1)
		first2, last2 = get_name_parts(name2)

		# First check if last names are similar enough
		last_name_ratio = fuzz.ratio(last1, last2)
		levenshtein_distance = Levenshtein.distance(last1, last2)

		# Reject only when BOTH checks fail: a single-edit typo is tolerated
		# even if the ratio falls below the threshold.
		if last_name_ratio < last_name_threshold and levenshtein_distance > 1:
			return False

		# If last names match, check first names
		if first1 == first2:
			return True

		# Check nicknames
		first1_parts = first1.split()
		first2_parts = first2.split()

		for part1 in first1_parts:
			for part2 in first2_parts:
				for nickname, variants in nicknames.items():
					if (part1 == nickname and part2 in variants) or \
					   (part2 == nickname and part1 in variants):
						return True

		# If no nickname matches, try fuzzy matching on first names
		first_name_ratio = fuzz.token_sort_ratio(first1, first2)
		return first_name_ratio >= first_name_threshold

	# Find similar pairs: test every unordered pair of names with the matcher.
	similar_pairs = []
	for name1, name2 in combinations(names, 2):
		if are_names_similar(name1, name2):
			similar_pairs.append((name1, name2))

	# Report the accepted pairs, one per line.
	print("Potentially matching names:")
	for pair in similar_pairs:
		print(f"{pair[0]} <-> {pair[1]}")
No results found