Created
April 24, 2026 13:10
-
-
Save andycasey/93b72942c356a71a30c3aab2dc2420b8 to your computer and use it in GitHub Desktop.
Sanitise and validate ORCIDs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def normalize_orcid(orcid: str) -> Optional[str]:
    """
    Normalize an ORCID identifier to the form ``XXXX-XXXX-XXXX-XXXX``.

    Handles common data quality issues:
    - Strip leading/trailing quotes and whitespace
    - Strip an ``orcid.org`` URL prefix (any letter case, ``http``/``https``
      or no scheme, optionally with subdomains such as ``sandbox.``)
    - Pad missing leading zeros in each of the four hyphen-separated groups:
      0-0001-9884-3716 → 0000-0001-9884-3716
    - Discard every character that is not an ASCII digit or ``X``/``x``
    - Validate format: 15 digits followed by a digit or ``X``

    Only the shape is checked here; the check character is not verified.

    Args:
        orcid: Raw identifier text.

    Returns:
        The hyphenated 16-character identifier, or ``None`` if the input is
        empty, is not a string, or does not reduce to a well-formed
        identifier.
    """
    import re

    # Non-string values cannot be parsed; treat them like missing data
    # instead of failing on ``.strip()``.
    if not isinstance(orcid, str):
        return None

    text = orcid.strip().strip('"').strip("'").strip()
    if not text:
        return None

    # Remove the resolver URL wherever it occurs. Doing this before the
    # character filter matters: letters in the host name (the "x" in
    # "sandbox") would otherwise be kept as check characters.
    text = re.sub(
        r"(?:https?://)?(?:[a-z0-9-]+\.)*orcid\.org/",
        "",
        text,
        flags=re.IGNORECASE,
    )

    # Explicit ASCII whitelist: str.isdigit() also accepts superscripts and
    # other non-ASCII digits, which can never be part of an ORCID.
    allowed = "0123456789X"

    groups = text.split("-")
    if len(groups) == 4:
        # Filter each group on its own so that a short group can be
        # left-padded; zfill leaves groups of four or more characters as is.
        kept = ["".join(c for c in g.upper() if c in allowed) for g in groups]
        text = "".join(g.zfill(4) for g in kept)

    cleaned = "".join(c for c in text.upper() if c in allowed)

    # ``cleaned`` holds only digits and "X", so the shape is right when it
    # has 16 characters and "X" appears nowhere except the last position.
    if len(cleaned) != 16 or "X" in cleaned[:15]:
        return None

    return f"{cleaned[0:4]}-{cleaned[4:8]}-{cleaned[8:12]}-{cleaned[12:16]}"
def is_valid_orcid(orcid: Optional[str]) -> tuple[bool, Optional[str]]:
    """Return ``(is_valid, sanitised_orcid)`` for a raw ORCID value.

    The input is first passed through ``normalize_orcid``. The check
    character is then recomputed from the first 15 digits (the ISO 7064
    MOD 11-2 scheme used by ORCID) and compared with the 16th character.

    Args:
        orcid: Raw identifier text; falsy values are treated as missing.

    Returns:
        ``(False, None)`` when the input cannot be normalised to
        ``XXXX-XXXX-XXXX-XXXX`` with ASCII digits; otherwise
        ``(checksum_matches, sanitised_orcid)``.
    """
    sanitised = normalize_orcid(orcid) if orcid else None
    if sanitised is None:
        return (False, None)

    cleaned = sanitised.replace("-", "")
    base_digits = cleaned[:15]

    # int() raises ValueError on some characters that str.isdigit() accepts
    # (e.g. "²") and silently converts other non-ASCII digits. Neither can
    # appear in an ORCID, so report the value as unparseable.
    if not (base_digits.isascii() and base_digits.isdigit()):
        return (False, None)

    total = 0
    for ch in base_digits:
        total = (total + int(ch)) * 2

    # A check value of 10 is written as the letter "X".
    check_value = (12 - total % 11) % 11
    expected = "X" if check_value == 10 else str(check_value)
    return (expected == cleaned[15], sanitised)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment