Skip to content

Instantly share code, notes, and snippets.

@andycasey
Created April 24, 2026 13:10
Show Gist options
  • Select an option

  • Save andycasey/93b72942c356a71a30c3aab2dc2420b8 to your computer and use it in GitHub Desktop.

Select an option

Save andycasey/93b72942c356a71a30c3aab2dc2420b8 to your computer and use it in GitHub Desktop.
Sanitise and validate ORCIDs
def normalize_orcid(orcid: str) -> Optional[str]:
    """
    Normalize an ORCID identifier to the canonical XXXX-XXXX-XXXX-XXXX form.

    Handles common data quality issues:
    - Strip leading/trailing quotes and whitespace
    - Strip orcid.org URL prefix
    - Pad missing leading zeros: 0-0001-9884-3716 → 0000-0001-9884-3716
    - Validate format: 15 ASCII digits followed by an ASCII digit or X

    Only the format is checked here; the check digit is not verified.

    Args:
        orcid: Raw identifier, optionally quoted and/or given as an
            http(s)://orcid.org/ URL. None or an empty string is accepted.

    Returns:
        The canonical hyphenated identifier, or None when the input is empty
        or cannot be reduced to a well-formed 16-character identifier.
    """
    if not orcid:
        return None

    orcid = orcid.strip().strip('"').strip("'").strip()
    if not orcid:
        return None

    orcid = orcid.replace("https://orcid.org/", "").replace("http://orcid.org/", "")

    parts = orcid.split("-")
    if len(parts) == 4:
        # Clean each group separately so short groups can be left-padded.
        cleaned_parts = [
            "".join(c for c in p.upper() if c.isdigit() or c == "X")
            for p in parts
        ]
        # A group with no usable characters is missing data, not missing
        # leading zeros; padding it would fabricate an identifier.
        if not all(cleaned_parts):
            return None
        # zfill() leaves groups of four or more characters untouched.
        orcid = "".join(p.zfill(4) for p in cleaned_parts)

    cleaned = "".join(c for c in orcid.upper() if c.isdigit() or c == "X")

    # str.isdigit() is also true for non-ASCII digits (e.g. "²", "٠"), so
    # require ASCII explicitly. After the filter above, an ASCII-only string
    # can contain nothing but 0-9 and X, which also covers the check character.
    if len(cleaned) != 16 or not cleaned.isascii() or not cleaned[:15].isdigit():
        return None

    return f"{cleaned[0:4]}-{cleaned[4:8]}-{cleaned[8:12]}-{cleaned[12:16]}"
def is_valid_orcid(orcid: str) -> tuple[bool, Optional[str]]:
    """Return (is_valid, sanitised_orcid) for a raw ORCID string.

    The input is first passed through normalize_orcid; the final character of
    the result is then compared against a check character computed from the
    first 15 digits (running total doubled after each digit, reduced mod 11,
    with a value of 10 written as "X").

    Args:
        orcid: Raw identifier; None or an empty string is accepted.

    Returns:
        A tuple (is_valid, sanitised_orcid). sanitised_orcid is
        XXXX-XXXX-XXXX-XXXX or None if unparseable; is_valid is True only
        when the check character matches.
    """
    sanitised = normalize_orcid(orcid) if orcid else None

    # Guard against non-ASCII digit characters (e.g. "²"): int() raises
    # ValueError on some of them, and none belong in a returned identifier.
    if sanitised is None or not sanitised.isascii():
        return (False, None)

    digits = sanitised.replace("-", "")

    total = 0
    for ch in digits[:15]:
        total = (total + int(ch)) * 2

    check_value = (12 - total % 11) % 11
    expected = "X" if check_value == 10 else str(check_value)
    return (expected == digits[15], sanitised)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment