Skip to content

Instantly share code, notes, and snippets.

@greg-randall
Last active June 3, 2025 20:28
Show Gist options
  • Select an option

  • Save greg-randall/ba04ac4df18c82677e7c26a80c90133c to your computer and use it in GitHub Desktop.

Select an option

Save greg-randall/ba04ac4df18c82677e7c26a80c90133c to your computer and use it in GitHub Desktop.
Name similarity comparator. I use it to try and align data in spreadsheets. Run: python3 nametest.py sample_names.txt Then use the sample_names.csv to match in your spreadsheets.
"""
Tries to match names that are pretty similar.
The last names need to be nearly identical, but will still match if there's a single-character typo.
Titles are dropped before first names are compared, and first names only need to meet a lower similarity threshold to be reported.
Example Output:
Potentially matching names:
Anthony Smith <-> Tony Smith
Maxwell Jones <-> Max Jones
Geoffrey Greg <-> Geoffrey Gregg
Senator Zachary Williams <-> Zack Williams
Patrick Moore <-> Dr. Pat Moore
Theodore J. Johnson <-> Ted Johnson
"""
from nameparser import HumanName
from thefuzz import fuzz
from itertools import combinations
import Levenshtein
# Sample input: every unordered pair drawn from this list is compared by the
# script at the bottom of the file.
names = [
    "Anthony Smith",
    "Tony Smith",
    "Maxwell Jones",
    "Max Jones",
    "Geoffrey Greg",
    "Senator Zachary Williams",
    "Zack Williams",
    "Patrick Moore",
    "Dr. Pat Moore",
    "Theodore J. Johnson",
    "Ted Johnson",
    "Geoffrey Gregg",
]
# Lookup table of short forms: each key is a lower-case nickname and each
# value lists the lower-case given names that nickname may stand for.
# A name may appear under more than one key (e.g. "richard" under both
# "dick" and "rick"), and a key may also appear as a variant of another key
# (e.g. "will" under "bill").
nicknames = {
    # B
    "bill": ["william", "will", "billy", "willie"],
    "bob": ["robert", "rob", "bobby", "robbie", "bert"],
    "ben": ["benjamin", "benji", "benny"],
    "bert": ["herbert", "albert", "bertram", "roberto"],
    # C
    "chuck": ["charles", "charlie", "chas"],
    "chris": ["christopher", "christian"],
    # D
    "dave": ["david", "davey", "davie"],
    "dan": ["daniel", "danny", "dannie"],
    "dick": ["richard", "rich", "ricky", "ricardo"],
    # E
    "ed": ["edward", "eddie", "eduardo", "edwin"],
    "eli": ["elijah", "elias"],
    # F
    "fred": ["frederick", "freddie", "fredrick"],
    "frank": ["francis", "francisco", "franklin"],
    # G
    "greg": ["gregory", "gregg", "greggory"],
    "gabe": ["gabriel", "gaby"],
    "gus": ["augustus", "gustav", "augusto"],
    # H
    "hal": ["harold", "harry"],
    "hank": ["henry", "heinrich"],
    # J
    "jim": ["james", "jimmy", "jamie"],
    "joe": ["joseph", "joey", "jose"],
    "jack": ["john", "jonathan", "johnny"],
    "jerry": ["jerome", "gerald", "geraldo"],
    "jeff": ["jeffrey", "geoffrey"],
    # K
    "ken": ["kenneth", "kenny"],
    "kit": ["christopher", "christian"],
    # L
    "larry": ["lawrence", "laurence", "lorenzo"],
    "len": ["leonard", "leonardo"],
    # M
    "mike": ["michael", "mickey", "mick"],
    "matt": ["matthew", "mathew", "mateo"],
    "max": ["maxwell", "maximilian", "maximillian"],
    # N
    "nick": ["nicholas", "nicolas", "nico"],
    "nat": ["nathan", "nathaniel", "nataniel"],
    # P
    "pat": ["patrick", "patricia"],
    "pete": ["peter", "pedro"],
    "phil": ["phillip", "philip", "felipe"],
    # R
    "ron": ["ronald", "ronnie", "ronny"],
    "ray": ["raymond", "raymund"],
    "rick": ["richard", "ricardo"],
    # S
    "sam": ["samuel", "sammy", "sammie"],
    "stan": ["stanley", "stanford"],
    "steve": ["steven", "stephen", "esteban"],
    # T
    "ted": ["theodore", "edmund", "eduardo"],
    "tom": ["thomas", "tommy", "tomas"],
    "tony": ["anthony", "antonio"],
    # V
    "vic": ["victor", "vincent"],
    "val": ["valentine", "valentin"],
    # W
    "walt": ["walter", "wallace"],
    "will": ["william", "wilhelm", "guillermo"],
    # Z
    "zack": ["zachary", "zachariah", "zach"],
}
def normalize_name(name):
    """Return *name* lower-cased with its title removed.

    The lower-cased input is parsed with ``HumanName``, the parsed title is
    blanked out, and the parser's string rendering of what remains is
    returned with surrounding whitespace stripped.
    """
    person = HumanName(name.lower())
    # Blank the title so it is omitted from the rendered name.
    person.title = ""
    rendered = str(person)
    return rendered.strip()
def get_name_parts(full_name):
    """Split *full_name* into ``(first_name, last_name)``, both lower-cased.

    ``first_name`` is the parsed first name followed by the parsed middle
    name(s), joined with a single space; whichever of the two is empty is
    left out. ``last_name`` is the parsed last name.
    """
    person = HumanName(full_name)
    # Keep only the non-empty given-name components, first name before middle.
    given_components = [
        component for component in (person.first, person.middle) if component
    ]
    given = " ".join(given_components).lower()
    family = person.last.lower()
    return given, family
def are_names_similar(name1, name2, first_name_threshold=75, last_name_threshold=95):
    """Decide whether two full names plausibly refer to the same person.

    The comparison runs in stages and stops at the first decisive one:

    1. Last names must be close: either their ``fuzz.ratio`` score reaches
       ``last_name_threshold`` or they are at most one Levenshtein edit
       apart. Otherwise the names do not match.
    2. Identical first-name strings (first + middle, lower-cased) match.
    3. Any token of one first name that is a key in ``nicknames`` whose
       variant list contains a token of the other first name matches.
    4. Otherwise the names match when ``fuzz.token_sort_ratio`` of the two
       first-name strings reaches ``first_name_threshold``.

    Args:
        name1: First full name, as free text.
        name2: Second full name, as free text.
        first_name_threshold: Minimum ``fuzz.token_sort_ratio`` score for the
            first names in stage 4.
        last_name_threshold: Minimum ``fuzz.ratio`` score for the last names
            in stage 1 (bypassed when the edit distance is 0 or 1).

    Returns:
        ``True`` if the names are considered a potential match, else ``False``.
    """
    first1, last1 = get_name_parts(name1)
    first2, last2 = get_name_parts(name2)

    # Stage 1: reject only when the last names fail BOTH tests, so a
    # single-character typo still passes even if the ratio is below threshold.
    last_name_ratio = fuzz.ratio(last1, last2)
    levenshtein_distance = Levenshtein.distance(last1, last2)
    if last_name_ratio < last_name_threshold and levenshtein_distance > 1:
        return False

    # Stage 2: exact first-name match.
    if first1 == first2:
        return True

    # Stage 3: nickname match in either direction. A direct dict lookup per
    # token replaces a scan over every nickname entry; the outcome is the
    # same because a token can equal at most one dict key.
    first1_parts = first1.split()
    first2_parts = first2.split()
    for part1 in first1_parts:
        for part2 in first2_parts:
            if part2 in nicknames.get(part1, ()) or part1 in nicknames.get(part2, ()):
                return True

    # Stage 4: fall back to fuzzy matching on the whole first-name strings.
    first_name_ratio = fuzz.token_sort_ratio(first1, first2)
    return first_name_ratio >= first_name_threshold
# Compare every unordered pair of sample names and keep the ones that match.
similar_pairs = [
    (candidate_a, candidate_b)
    for candidate_a, candidate_b in combinations(names, 2)
    if are_names_similar(candidate_a, candidate_b)
]

# Report the matches, one pair per line.
print("Potentially matching names:")
for left_name, right_name in similar_pairs:
    print(f"{left_name} <-> {right_name}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment