Last active
April 29, 2026 19:23
-
-
Save Terrance/a299e20658af519cbf8f4fbacf348107 to your computer and use it in GitHub Desktop.
Script to parse Joplin Desktop's revision history and reverse-apply it to the current set of notes, outputting a git repository.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| from collections import defaultdict | |
| from collections.abc import Iterable | |
| from dataclasses import dataclass | |
| from datetime import datetime | |
| import json | |
| import logging | |
| import os | |
| import os.path | |
| from pathlib import Path | |
| from queue import Empty, Queue | |
| import re | |
| import sqlite3 | |
| from subprocess import CalledProcessError, run | |
| from typing import cast | |
| from diff_match_patch import diff_match_patch, patch_obj | |
| LOG = logging.getLogger(__name__) | |
| DMP = diff_match_patch() | |
| META_FRONT = ("longitude", "latitude", "altitude") | |
| META_IGNORE = {"body"} | |
| @dataclass | |
| class Entry: | |
| note: sqlite3.Row | |
| revision: sqlite3.Row | None | |
| title: str | |
| body: str | |
| metadata: dict[str, str] | None | |
| def find_profile(home: Path, profile_name: str | None): | |
| config = home / ".config" / "joplin-desktop" | |
| if profile_name: | |
| profile = config / profile_name | |
| else: | |
| profile_names = list(config.glob("profile-*")) | |
| assert len(profile_names) == 1 | |
| profile = config / profile_names[0] | |
| assert profile.is_dir() | |
| LOG.info("Profile: %s", profile) | |
| return profile | |
| def prune_folders(rows: Iterable[sqlite3.Row], exclude: list[str]): | |
| folders = {row["id"]: row for row in rows} | |
| LOG.debug("Found %s folders", len(folders)) | |
| tree = defaultdict[str, set[str]](set) | |
| for row_id, row in folders.items(): | |
| tree[row["parent_id"]].add(row_id) | |
| queue = Queue[str]() | |
| queue.put_nowait("") | |
| pruned = dict[str, sqlite3.Row]() | |
| while True: | |
| try: | |
| parent_id = queue.get_nowait() | |
| except Empty: | |
| break | |
| if parent_id in exclude: | |
| continue | |
| if parent_id: | |
| pruned[parent_id] = folders[parent_id] | |
| for child_id in tree[parent_id]: | |
| queue.put_nowait(child_id) | |
| LOG.debug("Pruned to %s folders", len(pruned)) | |
| return pruned | |
| def undo_revision(text: str, delta: dict) -> str: | |
| patches: list[patch_obj] = [] | |
| for change in reversed(delta): | |
| patch = patch_obj() | |
| patch.diffs = [[mod * -1, ctx] for mod, ctx in change["diffs"]] | |
| patch.start1 = change["start2"] | |
| patch.start2 = change["start1"] | |
| patch.length1 = change["length2"] | |
| patch.length2 = change["length1"] | |
| patches.append(patch) | |
| reverted, status = DMP.patch_apply(patches, text) | |
| if not all(status): | |
| LOG.warning("Failed to apply all patches") | |
| LOG.debug("Results: %s", status) | |
| return reverted | |
| def note_folder(note: sqlite3.Row, folders: dict[str, sqlite3.Row]): | |
| parent_id = note["parent_id"] | |
| path = list[str]() | |
| while parent_id in folders: | |
| parent = folders[parent_id] | |
| path.insert(0, parent["title"]) | |
| parent_id = parent["parent_id"] | |
| return Path(os.path.join(*path)) | |
| def fix_links(body: str, paths: dict[str, Path], resources: dict[str, str]): | |
| seen = set() | |
| def _repl(match: re.Match): | |
| link_text, item_id = match.groups() | |
| if item_id in paths: | |
| seen.add(item_id) | |
| path = paths[item_id] | |
| return f"[[{path.with_name(path.stem)}]]" | |
| elif item_id in resources: | |
| return f"[{link_text}](/resources/{resources[item_id]})" | |
| else: | |
| return match[0] | |
| return (re.sub(r"\[([^\]]*)\]\(:/([a-f0-9]+)\)", _repl, body), seen) | |
| def has_value(val): | |
| return val and not str(val).startswith("0.0000") | |
| def add_front_matter(body: str, metadata: dict[str, str] | None): | |
| if metadata: | |
| lines = list[str]() | |
| for key in META_FRONT: | |
| value = metadata.get(key) | |
| if has_value(value): | |
| lines.append(f"{key}: {value}") | |
| if lines: | |
| body = "\n".join(("---", *lines, "---", "", body)) | |
| return body | |
| def git(*args: str, **kwargs): | |
| run(("git", *args), check=True, **kwargs) | |
| def main(home: Path, profile_name: str | None, exclude: list[str], output: Path | None, author: tuple[str, str] | None): | |
| profile = find_profile(home, profile_name) | |
| with sqlite3.connect(f"file:{profile / 'database.sqlite'}?mode=ro", uri=True) as conn: | |
| conn.row_factory = sqlite3.Row | |
| cursor = conn.cursor() | |
| folders = prune_folders(cursor.execute("SELECT * FROM folders"), exclude) | |
| marks = ", ".join("?" * len(folders)) | |
| folder_ids = list(folders) | |
| cursor.execute(f"SELECT * FROM notes WHERE parent_id IN ({marks})", folder_ids) | |
| notes: dict[str, sqlite3.Row] = {row["id"]: row for row in cursor.fetchall()} | |
| LOG.debug("Read %s notes", len(notes)) | |
| cursor.execute( | |
| f""" | |
| SELECT * FROM revisions WHERE item_id IN ( | |
| SELECT id FROM notes WHERE parent_id IN ({marks}) | |
| ) ORDER BY updated_time DESC, created_time DESC | |
| """, | |
| folder_ids, | |
| ) | |
| revisions = cursor.fetchall() | |
| LOG.debug("Read %s revisions", len(revisions)) | |
| entries = defaultdict[str, list[Entry]](list) | |
| titles = dict[str, str]() | |
| bodies = dict[str, str]() | |
| for i, revision in enumerate(revisions): | |
| note_id = revision["item_id"] | |
| note = notes[note_id] | |
| title = titles.get(note_id, cast(str, note["title"])) | |
| body = bodies.get(note_id, cast(str, note["body"])) | |
| entries[note_id].append(Entry(note, revision, title, body, None)) | |
| titles[note_id] = undo_revision(title, json.loads(revision["title_diff"])) | |
| bodies[note_id] = undo_revision(body, json.loads(revision["body_diff"])) | |
| LOG.debug("Unapplied revision #%s on note %s", i, note_id) | |
| for note_id, note in notes.items(): | |
| if note["parent_id"] not in folder_ids: | |
| continue | |
| title = titles.get(note_id, cast(str, note["title"])) | |
| body = bodies.get(note_id, cast(str, note["body"])) | |
| if title or body: | |
| LOG.debug("Added residual diff for note %s", note_id) | |
| entries[note_id].append(Entry(note, None, title, body, None)) | |
| history = list[tuple[int, int, Entry | sqlite3.Row]]() | |
| for note_entries in entries.values(): | |
| first = note_entries[0] | |
| if first.revision: | |
| metadata = json.loads(first.revision["metadata_diff"])["new"] | |
| else: | |
| metadata = dict(first.note) | |
| for key in META_IGNORE: | |
| metadata.pop(key, None) | |
| for entry in note_entries: | |
| if entry is not first and entry.revision: | |
| metadata = dict[str, str](metadata) | |
| diff = json.loads(entry.revision["metadata_diff"]) | |
| for key in diff.get("deleted", ()): | |
| if metadata.pop(key, None) is None: | |
| LOG.warning("Missing removed metadata key %s in revision %s", key, entry.revision["id"]) | |
| metadata.update(diff.get("new", {})) | |
| entry.metadata = metadata | |
| if entry.revision: | |
| commited = authored = entry.revision["updated_time"] | |
| else: | |
| commited = authored = entry.note["created_time"] | |
| history.append((commited, authored, entry)) | |
| LOG.info("Processed all revisions") | |
| resources = dict[str, str]() | |
| for resource in cursor.execute("SELECT * FROM resources"): | |
| resources[resource["id"]] = resource["title"] | |
| history.append((resource["created_time"], resource["user_created_time"], resource)) | |
| LOG.info("Read %s resources", len(resources)) | |
| history.sort() | |
| if not output: | |
| return | |
| output.mkdir() | |
| os.chdir(output) | |
| resources_dir = Path("resources") | |
| resources_dir.mkdir() | |
| LOG.info("Starting output to %s", output) | |
| git("init") | |
| if author: | |
| git("config", "--local", "user.name", author[0]) | |
| git("config", "--local", "user.email", author[1]) | |
| start_date = datetime.fromtimestamp(history[0][0] / 1000).astimezone() | |
| git("commit", "--date", start_date.isoformat(), "--allow-empty", "-m", "Initial commit") | |
| paths = dict[str, Path]() | |
| rev_paths = dict[Path, str]() | |
| for committed, authored, ctx in history: | |
| if isinstance(ctx, Entry): | |
| note_id = ctx.note["id"] | |
| source = paths.get(note_id) | |
| folder = note_folder(ctx.note, folders) | |
| folder.mkdir(exist_ok=True) | |
| name = f"{ctx.title.replace("/", "+") or note_id}" | |
| target = folder / f"{name}.md" | |
| i = 1 | |
| while rev_paths.get(target) not in (None, note_id): | |
| i += 1 | |
| target = folder / f"{name} ({i}).md" | |
| body, seen = fix_links(ctx.body, paths, resources) | |
| body = add_front_matter(body, ctx.metadata) | |
| if not source: | |
| msg = f"Create {target}" | |
| elif source != target: | |
| msg = f"Move {source} to {target}" | |
| git("mv", str(source), str(target)) | |
| del rev_paths[source] | |
| for ref_id in seen: | |
| ref_path = paths[ref_id] | |
| ref_body, _ = fix_links(bodies[ref_id], paths, resources) | |
| with open(ref_path, "wb") as fp: | |
| fp.write(ref_body.encode(errors="replace")) | |
| git("add", str(ref_path)) | |
| else: | |
| msg = f"Update {target}" | |
| if ctx.metadata: | |
| msg += "\n" | |
| for key, value in ctx.metadata.items(): | |
| if has_value(value): | |
| label = re.sub(r"_(.)", lambda m: f"-{m[1].upper()}", f"{key[0].upper()}{key[1:]}") | |
| msg += f"\n{label}: {value}" | |
| paths[note_id] = target | |
| rev_paths[target] = note_id | |
| with open(target, "wb") as fp: | |
| fp.write(body.encode(errors="replace")) | |
| else: | |
| name = f"{ctx['id']}.{ctx['file_extension']}" | |
| source = profile / "resources" / name | |
| target = resources_dir / ctx["title"] | |
| source.copy(target) | |
| msg = f"Add resource {name}" | |
| git("add", str(target)) | |
| commit_date = datetime.fromtimestamp(committed / 1000).astimezone() | |
| author_date = datetime.fromtimestamp(authored / 1000).astimezone() | |
| try: | |
| git("commit", "--date", author_date.isoformat(), "-m", msg, env={"GIT_COMMITTER_DATE": commit_date.isoformat()}) | |
| except CalledProcessError: | |
| LOG.warning("Failed to commit history entry %s (%s)", committed, msg.splitlines()[0]) | |
| if __name__ == "__main__": | |
| from argparse import ArgumentParser | |
| parser = ArgumentParser() | |
| parser.add_argument("--home", type=Path, default=Path.home()) | |
| parser.add_argument("--profile") | |
| parser.add_argument("--exclude", nargs="+", default=[]) | |
| parser.add_argument("--output", type=Path) | |
| parser.add_argument("--author", nargs=2) | |
| args = parser.parse_args() | |
| logging.basicConfig(level=logging.DEBUG) | |
| main(args.home, args.profile, args.exclude, args.output, args.author) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment