Last active
June 3, 2025 20:28
-
-
Save greg-randall/ba04ac4df18c82677e7c26a80c90133c to your computer and use it in GitHub Desktop.
Name similarity comparator. I use it to try and align data in spreadsheets. Run: python3 nametest.py sample_names.txt Then use the sample_names.csv to match in your spreadsheets.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| Tries to match names that are pretty similar. | |
| The last names need to be nearly identical, but will still match if there's a single-character typo. | |
| Titles are dropped from the first names, which are then compared using a lower similarity threshold. | |
| Example Output: | |
| Potentially matching names: | |
| Anthony Smith <-> Tony Smith | |
| Maxwell Jones <-> Max Jones | |
| Geoffrey Greg <-> Geoffrey Gregg | |
| Senator Zachary Williams <-> Zack Williams | |
| Patrick Moore <-> Dr. Pat Moore | |
| Theodore J. Johnson <-> Ted Johnson | |
| """ | |
| from nameparser import HumanName | |
| from thefuzz import fuzz | |
| from itertools import combinations | |
| import Levenshtein | |
# Sample input for the demo run at the bottom of the file. The order is
# significant: pairs are reported in the order the entries appear here.
names = [
    "Anthony Smith", "Tony Smith",
    "Maxwell Jones", "Max Jones",
    "Geoffrey Greg",
    "Senator Zachary Williams", "Zack Williams",
    "Patrick Moore", "Dr. Pat Moore",
    "Theodore J. Johnson", "Ted Johnson",
    "Geoffrey Gregg",
]
# Lookup table: short form (lower-case) -> list of lower-case names treated as
# equivalent to it. A pair of first names is considered a nickname match when
# one of them is a key here and the other appears in that key's list.
nicknames = {
    # --- B ---
    "bill": ["william", "will", "billy", "willie"],
    "bob": ["robert", "rob", "bobby", "robbie", "bert"],
    "ben": ["benjamin", "benji", "benny"],
    "bert": ["herbert", "albert", "bertram", "roberto"],
    # --- C ---
    "chuck": ["charles", "charlie", "chas"],
    "chris": ["christopher", "christian"],
    # --- D ---
    "dave": ["david", "davey", "davie"],
    "dan": ["daniel", "danny", "dannie"],
    "dick": ["richard", "rich", "ricky", "ricardo"],
    # --- E ---
    "ed": ["edward", "eddie", "eduardo", "edwin"],
    "eli": ["elijah", "elias"],
    # --- F ---
    "fred": ["frederick", "freddie", "fredrick"],
    "frank": ["francis", "francisco", "franklin"],
    # --- G ---
    "greg": ["gregory", "gregg", "greggory"],
    "gabe": ["gabriel", "gaby"],
    "gus": ["augustus", "gustav", "augusto"],
    # --- H ---
    "hal": ["harold", "harry"],
    "hank": ["henry", "heinrich"],
    # --- J ---
    "jim": ["james", "jimmy", "jamie"],
    "joe": ["joseph", "joey", "jose"],
    "jack": ["john", "jonathan", "johnny"],
    "jerry": ["jerome", "gerald", "geraldo"],
    "jeff": ["jeffrey", "geoffrey"],
    # --- K ---
    "ken": ["kenneth", "kenny"],
    "kit": ["christopher", "christian"],
    # --- L ---
    "larry": ["lawrence", "laurence", "lorenzo"],
    "len": ["leonard", "leonardo"],
    # --- M ---
    "mike": ["michael", "mickey", "mick"],
    "matt": ["matthew", "mathew", "mateo"],
    "max": ["maxwell", "maximilian", "maximillian"],
    # --- N ---
    "nick": ["nicholas", "nicolas", "nico"],
    "nat": ["nathan", "nathaniel", "nataniel"],
    # --- P ---
    "pat": ["patrick", "patricia"],
    "pete": ["peter", "pedro"],
    "phil": ["phillip", "philip", "felipe"],
    # --- R ---
    "ron": ["ronald", "ronnie", "ronny"],
    "ray": ["raymond", "raymund"],
    "rick": ["richard", "ricardo"],
    # --- S ---
    "sam": ["samuel", "sammy", "sammie"],
    "stan": ["stanley", "stanford"],
    "steve": ["steven", "stephen", "esteban"],
    # --- T ---
    "ted": ["theodore", "edmund", "eduardo"],
    "tom": ["thomas", "tommy", "tomas"],
    "tony": ["anthony", "antonio"],
    # --- V ---
    "vic": ["victor", "vincent"],
    "val": ["valentine", "valentin"],
    # --- W ---
    "walt": ["walter", "wallace"],
    "will": ["william", "wilhelm", "guillermo"],
    # --- Z ---
    "zack": ["zachary", "zachariah", "zach"],
}
def normalize_name(name):
    """Return *name* lower-cased with its title component removed.

    The name is lower-cased, parsed with ``HumanName``, its ``title`` field is
    blanked, and the parsed name is rendered back to a string with surrounding
    whitespace stripped.
    """
    human = HumanName(name.lower())
    # Blank the title so it is left out of the rendered string.
    human.title = ''
    rendered = str(human)
    return rendered.strip()
def get_name_parts(full_name):
    """Split *full_name* into a lower-cased ``(first_name, last_name)`` pair.

    ``first_name`` is the parsed first and middle components joined by a single
    space (empty components are skipped); ``last_name`` is the parsed last
    component. Other parsed fields, such as the title, are not included.
    """
    human = HumanName(full_name)
    # Treat first + middle as one "given names" string, skipping blanks.
    given = ' '.join(piece for piece in (human.first, human.middle) if piece)
    return given.lower(), human.last.lower()
def are_names_similar(name1, name2, first_name_threshold=75, last_name_threshold=95,
                      max_last_name_distance=1):
    """Decide whether two full names plausibly refer to the same person.

    The last names act as a gate: they must either score at least
    ``last_name_threshold`` with ``fuzz.ratio`` or be within
    ``max_last_name_distance`` Levenshtein edits of each other. Once the last
    names pass, the first names (first + middle, as returned by
    ``get_name_parts``) match if any of the following holds, checked in order:

    1. they are identical;
    2. some word of one is a key in ``nicknames`` and some word of the other
       is listed under that key;
    3. ``fuzz.token_sort_ratio`` of the two is at least
       ``first_name_threshold``.

    Args:
        name1: First full name.
        name2: Second full name.
        first_name_threshold: Minimum ``fuzz.token_sort_ratio`` score for the
            first names to count as similar.
        last_name_threshold: Minimum ``fuzz.ratio`` score for the last names
            to count as similar.
        max_last_name_distance: Largest Levenshtein distance between the last
            names that is still accepted even when the ratio is below
            ``last_name_threshold``. The default of 1 tolerates a
            single-character typo.

    Returns:
        ``True`` if the names are considered a potential match, else ``False``.
    """
    first1, last1 = get_name_parts(name1)
    first2, last2 = get_name_parts(name2)

    # Last names pass if EITHER measure accepts them; reject only when both fail.
    last_names_close = (
        fuzz.ratio(last1, last2) >= last_name_threshold
        or Levenshtein.distance(last1, last2) <= max_last_name_distance
    )
    if not last_names_close:
        return False

    if first1 == first2:
        return True

    # Nickname check: look each word up directly instead of scanning the whole
    # table. A pair matches when one word is a key and the other is listed
    # under that key, in either direction.
    words1 = first1.split()
    words2 = first2.split()
    if any(
        word2 in nicknames.get(word1, ()) or word1 in nicknames.get(word2, ())
        for word1 in words1
        for word2 in words2
    ):
        return True

    # Fall back to fuzzy comparison of the complete first-name strings.
    return fuzz.token_sort_ratio(first1, first2) >= first_name_threshold
# Compare every unordered pair of sample names and keep the ones that match.
similar_pairs = [
    (left, right)
    for left, right in combinations(names, 2)
    if are_names_similar(left, right)
]

# Report the matches, one pair per line.
print("Potentially matching names:")
for left, right in similar_pairs:
    print(f"{left} <-> {right}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment