Skip to content

Instantly share code, notes, and snippets.

@greg-randall
Last active June 3, 2025 20:28
Show Gist options
  • Select an option

  • Save greg-randall/ba04ac4df18c82677e7c26a80c90133c to your computer and use it in GitHub Desktop.

Select an option

Save greg-randall/ba04ac4df18c82677e7c26a80c90133c to your computer and use it in GitHub Desktop.
Name similarity comparator. I use it to try and align data in spreadsheets. Run: python3 nametest.py sample_names.txt Then use the sample_names.csv to match in your spreadsheets.
"""
Name Matching Algorithm with Nickname and Typo Tolerance

Usage:
    # Basic usage with default thresholds:
    python3 nametest.py sample_names.txt
    # With custom thresholds:
    python3 nametest.py sample_names.txt --first-threshold 80 --last-threshold 90 --last-distance 2

This script implements a flexible name matching system that identifies potentially equivalent names
while accounting for common variations in how names are written. It's particularly useful for
deduplicating contact lists, matching author names, or identifying the same person across different
databases.
Key Features:
- Handles common nickname variations (e.g., "Anthony" <-> "Tony", "Theodore" <-> "Ted")
- Tolerates minor typos in last names (by default up to 1 character difference; configurable with --last-distance)
- Ignores professional/honorary titles (e.g., "Dr.", "Senator")
- Supports middle names/initials
- Uses separate similarity thresholds for first and last names
Matching Rules:
1. Last names must be nearly identical (default 95% similarity or max 1 character difference)
2. First names can match in any of these ways:
- Exact string match
- Known nickname variation (using the nicknames library)
- Fuzzy string similarity above threshold (default 75%)
Example Matches:
Anthony Smith <-> Tony Smith # Nickname variation
Maxwell Jones <-> Max Jones # Common shortening
Geoffrey Greg <-> Geoff Gregg # Typo in last name
Senator Zachary Williams <-> Zack Williams # Title removed + nickname
Patrick Moore <-> Dr. Pat Moore # Title removed + nickname
Theodore J. Johnson <-> Ted Johnson # Nickname + middle initial
Dependencies:
- nameparser: For structured name parsing
- thefuzz: For fuzzy string matching
- Levenshtein: For edit distance calculation
- nicknames: For nickname/canonical name lookups
"""
from nameparser import HumanName
from thefuzz import fuzz
import Levenshtein
from nicknames import NickNamer
from itertools import combinations
import argparse
def normalize_name(name):
    """Return *name* lower-cased with its parsed title removed.

    Falsy input (``None`` or an empty string) yields ``""``.  Otherwise the
    name is lower-cased and stripped, parsed with ``HumanName``, the parsed
    ``title`` field is blanked, and the parser's string rendering is returned
    with surrounding whitespace removed.
    """
    if name:
        cleaned = name.lower().strip()
        parsed = HumanName(cleaned)
        # Blank the title so it is left out of the rendered name.
        parsed.title = ''
        rendered = str(parsed)
        return rendered.strip()
    return ""
def get_name_parts(full_name):
    """Split *full_name* into lower-cased ``(first_name, last_name)`` strings.

    The name is parsed with ``HumanName``.  ``first_name`` is the parsed first
    and middle fields (whichever are non-empty) joined by a single space;
    ``last_name`` is the parsed last field, or ``""`` when it is empty.
    Falsy input (``None`` or an empty string) yields ``("", "")``.
    """
    if not full_name:
        return "", ""

    parsed = HumanName(full_name)
    given = ' '.join(piece for piece in (parsed.first, parsed.middle) if piece)
    surname = parsed.last or ""
    return given.lower(), surname.lower()
# Lazily-created NickNamer reused by every comparison (see _shared_nicknamer).
_NICKNAMER = None


def _shared_nicknamer():
    """Return one reusable NickNamer, building it the first time it is needed."""
    global _NICKNAMER
    if _NICKNAMER is None:
        _NICKNAMER = NickNamer()
    return _NICKNAMER


def _name_variations(nicknamer, part):
    """Return *part* plus the nicknames and canonical names looked up for it."""
    return nicknamer.nicknames_of(part) | nicknamer.canonicals_of(part) | {part}


def are_names_similar(name1, name2, first_name_threshold, last_name_threshold, last_name_distance):
    """Decide whether two full-name strings plausibly refer to the same person.

    Args:
        name1: First full name.
        name2: Second full name.
        first_name_threshold: Minimum ``fuzz.token_sort_ratio`` score for the
            first names to count as a fuzzy match.
        last_name_threshold: Minimum ``fuzz.ratio`` score for the last names.
        last_name_distance: Maximum Levenshtein distance allowed between the
            last names.

    Returns:
        ``False`` when either input is falsy or either name lacks a first or
        last part after parsing with ``get_name_parts``.  ``False`` when the
        last names fail BOTH the ratio threshold and the distance limit.
        Otherwise ``True`` if the first names are identical, share a
        nickname/canonical variation, or reach ``first_name_threshold``.
    """
    if not name1 or not name2:
        return False

    first1, last1 = get_name_parts(name1)
    first2, last2 = get_name_parts(name2)

    # Both names need a first and a last part to be comparable.
    if not (first1 and last1 and first2 and last2):
        return False

    # Last names act as a gate: passing either test is enough, so the edit
    # distance is only computed when the ratio test has already failed.
    if (fuzz.ratio(last1, last2) < last_name_threshold
            and Levenshtein.distance(last1, last2) > last_name_distance):
        return False

    if first1 == first2:
        return True

    # The first-name strings may hold several space-separated parts (first
    # plus middle); a variation shared by any pair of parts is a match.
    # The NickNamer is built once and reused across calls, and the second
    # name's variation sets are computed once rather than once per part of
    # the first name.
    nicknamer = _shared_nicknamer()
    variations2 = [_name_variations(nicknamer, part) for part in first2.split()]
    for part in first1.split():
        variations1 = _name_variations(nicknamer, part)
        if any(variations1 & other for other in variations2):
            return True

    # Last resort: order-insensitive fuzzy comparison of the first names.
    return fuzz.token_sort_ratio(first1, first2) >= first_name_threshold
def main():
    """Command-line entry point: report pairs of similar names from a file.

    Reads the names (one per line, blank lines ignored) from the file given
    on the command line, compares every unordered pair with
    ``are_names_similar`` using the threshold options, and prints each
    matching pair.  Problems reading the file, or fewer than two names, are
    reported with a printed message and the function returns early.
    """
    cli = argparse.ArgumentParser(description='Find similar names in a text file.')
    cli.add_argument('input_file', help='Text file containing names (one per line)')
    cli.add_argument('--first-threshold', type=int, default=75,
                     help='Threshold for first name similarity (default: 75)')
    cli.add_argument('--last-threshold', type=int, default=95,
                     help='Threshold for last name similarity (default: 95)')
    cli.add_argument('--last-distance', type=int, default=1,
                     help='Maximum Levenshtein distance for last names (default: 1)')
    options = cli.parse_args()

    try:
        # Text mode with newline=None translates all newline styles to '\n'.
        with open(options.input_file, 'r', encoding='utf-8', newline=None) as handle:
            # Keep only lines that still have content once stripped.
            names = [text for text in (raw.strip() for raw in handle) if text]
    except FileNotFoundError:
        print(f"Error: Could not find file '{options.input_file}'")
        return
    except UnicodeDecodeError:
        print("Error: File encoding issue. Please ensure the file is saved in UTF-8 format.")
        return
    except Exception as exc:
        print(f"Error reading file: {exc}")
        return

    if len(names) < 2:
        print("Error: Need at least two names to compare")
        return

    matches = []
    for left, right in combinations(names, 2):
        try:
            is_match = are_names_similar(left, right,
                                         options.first_threshold,
                                         options.last_threshold,
                                         options.last_distance)
        except Exception as exc:
            # Best effort: report the pair that failed and keep comparing.
            print(f"Warning: Error processing names '{left}' and '{right}': {exc}")
            continue
        if is_match:
            matches.append((left, right))

    if not matches:
        print("\nNo matching names found.")
        return

    print("Potentially matching names:")
    for left, right in matches:
        print(f"{left} <-> {right}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment