Last active
August 25, 2025 14:10
-
-
Save playday3008/c05899baeaf9c613dca89e33c44b850d to your computer and use it in GitHub Desktop.
Google Takeout postprocess
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from dataclasses import dataclass | |
| from typing import Optional, Any | |
| from datetime import datetime | |
| from pathlib import Path | |
| from glob import glob | |
| import os | |
| import json | |
| import re | |
| from pydantic import BaseModel | |
class NTime(BaseModel):
    """A Takeout timestamp pair: machine-parseable value plus display string."""

    timestamp: datetime  # parsed by pydantic from the sidecar's timestamp field
    formatted: str  # human-readable rendering as written by Takeout
class GeoData(BaseModel):
    """Geolocation block of a Takeout sidecar (coordinates and spans)."""

    latitude: float
    longitude: float
    altitude: float
    latitudeSpan: float
    longitudeSpan: float
class People(BaseModel):
    """A single person tag attached to a photo."""

    name: str
class DeviceFolder(BaseModel):
    """On-device folder a mobile upload originated from."""

    localFolderName: str
class MobileUpload(BaseModel):
    """Origin details for items uploaded from a mobile device."""

    deviceFolder: Optional[DeviceFolder] = None
    deviceType: Optional[str] = None
class WebUpload(BaseModel):
    """Origin details for items uploaded via the web UI.

    The ``computerUpload`` payload schema is not pinned here — kept as a
    free-form dict, matching whatever Takeout emits.
    """

    computerUpload: dict[str, Any]
class DriveDesktopUploader(BaseModel):
    """Origin details for items uploaded by the desktop Drive client."""

    version: str
class Composition(BaseModel):
    """Origin details for auto-generated compositions (e.g. collages)."""

    type: str
class GooglePhotosOrigin(BaseModel):
    """How the item entered Google Photos.

    All variants are optional; which one is populated depends on the upload
    path recorded by Takeout (presumably at most one per item — not enforced
    here).
    """

    driveSync: Optional[dict[str, Any]] = None
    mobileUpload: Optional[MobileUpload] = None
    webUpload: Optional[WebUpload] = None
    driveDesktopUploader: Optional[DriveDesktopUploader] = None
    composition: Optional[Composition] = None
class AppSource(BaseModel):
    """Android app that produced the media item."""

    androidPackageName: str
class GooglePhotosMetadata(BaseModel):
    """Top-level schema of a per-photo Takeout sidecar ``*.json`` file."""

    title: str  # original media file name; used to locate the media file on disk
    description: str
    imageViews: int
    creationTime: NTime  # when the item was added to Google Photos
    photoTakenTime: NTime  # capture time; applied to the media file's mtime/atime
    geoData: GeoData
    geoDataExif: Optional[GeoData] = None
    people: Optional[list[People]] = None
    archived: Optional[bool] = None
    favorited: Optional[bool] = None
    url: str
    googlePhotosOrigin: Optional[GooglePhotosOrigin] = None
    appSource: Optional[AppSource] = None
@dataclass
class Item:
    """A media file paired with its sidecar file and the parsed metadata."""

    # camelCase field names kept to mirror the Takeout JSON naming style.
    mediaFile: Path
    metadataFile: Path
    metadata: GooglePhotosMetadata
def _snapshot_media(base_path: Path) -> dict[Path, os.stat_result]:
    """Map every non-JSON file under *base_path* (recursive) to its stat result."""
    return {
        Path(p): Path(p).stat()
        for p in glob(os.path.join(base_path, "**/*.*"), recursive=True)
        if Path(p).suffix != ".json"
    }


def _load_metadata(metafile: Path) -> Optional[GooglePhotosMetadata]:
    """Parse a sidecar JSON file; return None (after logging) if it fails validation."""
    with open(metafile, "r", encoding="utf-8") as f:
        data: Any = json.load(f)
    try:
        return GooglePhotosMetadata.model_validate(data)
    except Exception as e:  # pydantic ValidationError; kept broad like the original
        print(f"Error parsing '{metafile}': {e}")
        return None


def google_photos(base_path: Path = Path(".")) -> None:
    """Post-process a Google Takeout "Google Photos" export in place.

    For each sidecar ``*.json`` metadata file under *base_path* this:
      * finds the matching media file (handling Takeout's file-name
        truncation, ``(N)`` duplicate markers and ``-edited`` variants),
      * sets the media file's atime/mtime to ``photoTakenTime``,
      * deletes the sidecar file,
      * renames the media file back to the original title when they differ.

    Media files whose stat did not change are reported as unprocessed.
    Exits the process (codes 1/2) when *base_path* is missing or not a
    directory.
    """
    # Sidecar files that describe albums/the whole library, not a single photo.
    blacklist_files: list[str] = [
        "metadata.json",  # metadata of each album
        "print-subscriptions.json",  # metadata of whole library
        "shared_album_comments.json",
        "user-generated-memory-titles.json",
    ]

    if not base_path.exists():
        print(f"Base path {base_path} does not exist")
        raise SystemExit(1)
    if not base_path.is_dir():
        print(f"Base path {base_path} is not a directory")
        raise SystemExit(2)
    print(f"Processing files in {base_path}")

    # Snapshot stats of all media now so untouched files can be detected later;
    # reuse the snapshot's keys instead of globbing the tree a second time.
    media_before: dict[Path, os.stat_result] = _snapshot_media(base_path)
    all_media: set[Path] = set(media_before)
    all_meta: set[Path] = {
        Path(p)
        for p in glob(os.path.join(base_path, "**/*.json"), recursive=True)
        if Path(p).name not in blacklist_files
    }
    print(f"Found {len(all_media)} media files and {len(all_meta)} metadata files")

    # Matches "<stem>[.supplemental-metadata][(N)].json" where the
    # ".supplemental-metadata" suffix may be truncated at any point
    # (Takeout cuts long file names); group(2) captures the "(N)" part.
    metadata_match: re.Pattern[str] = re.compile(
        r"^(.+?)(?:|\.(?:|s(?:|u(?:|p(?:|p(?:|l(?:|e(?:|m(?:|e(?:|n(?:|t(?:|a(?:|l(?:|-(?:|m(?:|e(?:|t(?:|a(?:|d(?:|a(?:|t(?:|a))))))))))))))))))))))(|\((?:|\d+?(?:|\))))\.json$"
    )

    max_len = 51  # Takeout truncates media file names to 51 chars (incl. extension)
    items: list[Item] = []
    for metafile in all_meta:
        meta_match: Optional[re.Match[str]] = metadata_match.match(metafile.name)
        if not meta_match:
            print(f"Could not match regex for '{metafile}'")
            continue
        meta: Optional[GooglePhotosMetadata] = _load_metadata(metafile)
        if meta is None:
            continue

        # Reconstruct the (possibly truncated) media file name from the title,
        # re-attaching any "(N)" duplicate marker captured from the sidecar.
        media_origin: Path = Path(meta.title)
        media: Path = metafile.parent / (
            media_origin.stem[0 : max_len - len(media_origin.suffix)]
            + meta_match.group(2)
            + media_origin.suffix
        )
        if not media.exists():
            # Fall back to a case-insensitive prefix search in the same folder.
            media_candidates: list[Path] = [
                p
                for p in metafile.parent.glob(
                    meta.title[0:max_len] + "*",
                    case_sensitive=False,  # requires Python 3.12+
                )
                if p.suffix != ".json"
            ]
            if len(media_candidates) == 0:
                print(f"No media file found for '{metafile}' with title '{meta.title}'")
                continue
            if len(media_candidates) > 1:
                print(
                    f"Multiple media files found for '{metafile}' with title '{meta.title}':"
                )
                for mc in media_candidates:
                    print(f" - {mc}")
                continue
            media = media_candidates[0]
        items.append(Item(mediaFile=media, metadataFile=metafile, metadata=meta))

        # An "-edited" variant shares this sidecar; track it as a second item.
        edited_file: Path = metafile.parent / (
            (media_origin.stem + "-edited")[0 : max_len - len(media_origin.suffix)]
            + meta_match.group(2)
            + media_origin.suffix
        )
        if edited_file.exists() and edited_file != media:
            items.append(
                Item(mediaFile=edited_file, metadataFile=metafile, metadata=meta)
            )
    print(f"Matched {len(items)} items")

    # Stamp each media file with photoTakenTime, then drop its sidecar.
    for item in items:
        new_time: datetime = item.metadata.photoTakenTime.timestamp
        mod_time: float = new_time.timestamp()
        os.utime(item.mediaFile, (mod_time, mod_time))
        try:
            os.remove(item.metadataFile)
        except OSError:
            # Best-effort: a sidecar shared by an "-edited" variant was
            # already removed when its first item was processed.
            pass

    # Report files whose stats are unchanged, i.e. never touched above.
    media_after: dict[Path, os.stat_result] = _snapshot_media(base_path)
    for file, stats in media_before.items():
        if media_after.get(file) == stats:
            print(f"Unprocessed file: '{file}'")

    # Rename media files back to the metadata title, preserving any
    # "-edited" (and "(N)") suffix that Takeout appended to the stem.
    edited_match = re.compile(r"^(.+?)((?:-edited)(?:\(\d+?\))?)$")
    for item in items:
        if item.mediaFile.name == item.metadata.title:
            continue
        new_media: Path = item.mediaFile.parent / item.metadata.title
        edited: Optional[re.Match[str]] = edited_match.match(item.mediaFile.stem)
        if edited:
            title = Path(item.metadata.title)
            new_media = item.mediaFile.parent / (
                title.stem + edited.group(2) + title.suffix
            )
        if new_media == item.mediaFile:
            continue
        if new_media.exists():
            print(
                f"Cannot rename '{item.mediaFile}' to '{new_media}' as it already exists"
            )
        elif len(Path(item.mediaFile.name).suffix) != 0:
            item.mediaFile.rename(new_media)
            item.mediaFile = new_media
        else:
            print(
                f"Cannot rename '{item.mediaFile}' to '{new_media}' as it has no suffix"
            )
if __name__ == "__main__":
    # Entry point: process the default Takeout layout relative to the CWD.
    google_photos(Path("./Takeout/Google Photos/"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment