Last active
June 3, 2025 20:28
-
-
Save greg-randall/ba04ac4df18c82677e7c26a80c90133c to your computer and use it in GitHub Desktop.
Name similarity comparator. I use it to help align data across spreadsheets. Run `python3 nametest.py sample_names.txt`, then use sample_names.csv to match names in your spreadsheets.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sample input: each formal name is followed by a nickname variant of the
# same person, so every adjacent pair is expected to be reported as a match.
names = [
    "Anthony Smith",
    "Tony Smith",
    "Maxwell Jones",
    "Max Jones",
    "Zachary Williams",
    "Zack Williams",
    "Patrick Moore",
    "Pat Moore",
    "Theodore Johnson",
    "Ted Johnson",
]
| from thefuzz import fuzz | |
| from itertools import combinations | |
# Honorific titles (lower-case) that are removed from a name before it is
# compared, so that a title alone never makes two names look different.
TITLES_TO_REMOVE = [
    'honorable',
    'the',
    'judge',
    'hon',
    'dr',
    'professor',
]
# Maps a lower-case nickname to the lower-case first names it may stand for.
# A lookup is one-directional (nickname -> variants); callers that want a
# symmetric comparison must check both orderings of the two names.
nicknames = {
    # B names
    'bill': ['william', 'will', 'billy', 'willie'],
    'bob': ['robert', 'rob', 'bobby', 'robbie', 'bert'],
    'ben': ['benjamin', 'benji', 'benny'],
    'bert': ['herbert', 'albert', 'bertram', 'roberto'],
    # C names
    'chuck': ['charles', 'charlie', 'chas'],
    'chris': ['christopher', 'christian'],
    # D names
    'dave': ['david', 'davey', 'davie'],
    'dan': ['daniel', 'danny', 'dannie'],
    'dick': ['richard', 'rich', 'ricky', 'ricardo'],
    # E names
    'ed': ['edward', 'eddie', 'eduardo', 'edwin'],
    'eli': ['elijah', 'elias'],
    # F names
    'fred': ['frederick', 'freddie', 'fredrick'],
    'frank': ['francis', 'francisco', 'franklin'],
    # G names
    'greg': ['gregory', 'gregg', 'greggory'],
    'gabe': ['gabriel', 'gaby'],
    'gus': ['augustus', 'gustav', 'augusto'],
    # H names
    'hal': ['harold', 'harry'],
    'hank': ['henry', 'heinrich'],
    # J names
    'jim': ['james', 'jimmy', 'jamie'],
    'joe': ['joseph', 'joey', 'jose'],
    'jack': ['john', 'jonathan', 'johnny'],
    'jerry': ['jerome', 'gerald', 'geraldo'],
    'jeff': ['jeffrey', 'geoffrey'],
    # K names
    'ken': ['kenneth', 'kenny'],
    'kit': ['christopher', 'christian'],
    # L names
    'larry': ['lawrence', 'laurence', 'lorenzo'],
    'len': ['leonard', 'leonardo'],
    # M names
    'mike': ['michael', 'mickey', 'mick'],
    'matt': ['matthew', 'mathew', 'mateo'],
    'max': ['maxwell', 'maximilian', 'maximillian'],
    # N names
    'nick': ['nicholas', 'nicolas', 'nico'],
    'nat': ['nathan', 'nathaniel', 'nataniel'],
    # P names
    'pat': ['patrick', 'patricia'],
    'pete': ['peter', 'pedro'],
    'phil': ['phillip', 'philip', 'felipe'],
    # R names
    'ron': ['ronald', 'ronnie', 'ronny'],
    'ray': ['raymond', 'raymund'],
    'rick': ['richard', 'ricardo'],
    # S names
    'sam': ['samuel', 'sammy', 'sammie'],
    'stan': ['stanley', 'stanford'],
    'steve': ['steven', 'stephen', 'esteban'],
    # T names
    'ted': ['theodore', 'edmund', 'eduardo'],
    'tom': ['thomas', 'tommy', 'tomas'],
    'tony': ['anthony', 'antonio'],
    # V names
    'vic': ['victor', 'vincent'],
    'val': ['valentine', 'valentin'],
    # W names
    'walt': ['walter', 'wallace'],
    'will': ['william', 'wilhelm', 'guillermo'],
    # Z names
    'zack': ['zachary', 'zachariah', 'zach'],
}
def normalize_name(name):
    """Return *name* lower-cased with honorific titles removed.

    The name is split on whitespace and a word is dropped only when the
    whole word equals an entry of ``TITLES_TO_REMOVE``. Periods and commas
    around a word are ignored for that test, so ``"Dr."`` is treated like
    ``"dr"``. The remaining words are joined with single spaces.

    Titles are matched as whole words on purpose: removing them as
    substrings would corrupt ordinary names (``"theodore"`` contains
    ``"the"``, ``"anthony"`` contains ``"hon"``, ``"andrew"`` contains
    ``"dr"``).

    NOTE: a word that equals a title is dropped wherever it appears, so a
    surname such as "Judge" is still removed.

    Args:
        name: The raw name string.

    Returns:
        The normalized, lower-case name; ``''`` if nothing remains.
    """
    titles = set(TITLES_TO_REMOVE)
    kept_words = [
        word
        for word in name.lower().split()
        if word.strip('.,') not in titles
    ]
    return ' '.join(kept_words)
def get_name_parts(full_name):
    """Split a full name into ``(first_names, last_name)``.

    The name is first passed through ``normalize_name``. The final word of
    the result is taken as the last name and every word before it, joined
    with single spaces, as the first name(s).

    Args:
        full_name: The raw name string.

    Returns:
        A ``(first, last)`` tuple of strings. When the normalized name has
        fewer than two words, ``last`` is ``''`` and ``first`` is the
        normalized text (``''`` if nothing remains).
    """
    parts = normalize_name(full_name).split()
    if len(parts) >= 2:
        return ' '.join(parts[:-1]), parts[-1]
    # Return the normalized text here, not the raw input, so that both
    # branches hand callers a value that has been through normalize_name.
    return ' '.join(parts), ''
def are_names_similar(name1, name2, first_name_threshold=75, last_name_threshold=95):
    """Decide whether two full names plausibly refer to the same person.

    Both names are split with ``get_name_parts``. The checks run in order
    and the first decisive one wins:

    1. The ``fuzz.ratio`` score of the last names must reach
       ``last_name_threshold``; otherwise the names do not match.
    2. Identical first-name strings match.
    3. Any word of one first name that is a key of ``nicknames`` matches if
       any word of the other first name is listed among its variants
       (checked in both directions).
    4. Otherwise the ``fuzz.token_sort_ratio`` score of the first names
       must reach ``first_name_threshold``.

    Args:
        name1: First full name.
        name2: Second full name.
        first_name_threshold: Minimum first-name score for step 4.
        last_name_threshold: Minimum last-name score for step 1.

    Returns:
        ``True`` if the names are considered similar, else ``False``.
    """
    first1, last1 = get_name_parts(name1)
    first2, last2 = get_name_parts(name2)

    # Last names gate everything: a weak last-name match ends the comparison.
    if fuzz.ratio(last1, last2) < last_name_threshold:
        return False

    if first1 == first2:
        return True

    # Nickname check. Looking the word up directly gives the same result as
    # scanning every (nickname, variants) entry, without a pass over the
    # whole table for each pair of words.
    for part1 in first1.split():
        for part2 in first2.split():
            if part2 in nicknames.get(part1, ()) or part1 in nicknames.get(part2, ()):
                return True

    # No nickname relation found: fall back to fuzzy matching of first names.
    return fuzz.token_sort_ratio(first1, first2) >= first_name_threshold
# Compare every unordered pair of sample names and keep the likely matches.
similar_pairs = []
for name1, name2 in combinations(names, 2):
    if are_names_similar(name1, name2):
        similar_pairs.append((name1, name2))

# Report the matches, one pair per line.
print("Potentially matching names:")
for pair in similar_pairs:
    print(f"{pair[0]} <-> {pair[1]}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment