Skip to content

Instantly share code, notes, and snippets.

@Terrance
Last active April 29, 2026 19:23
Show Gist options
  • Select an option

  • Save Terrance/a299e20658af519cbf8f4fbacf348107 to your computer and use it in GitHub Desktop.

Select an option

Save Terrance/a299e20658af519cbf8f4fbacf348107 to your computer and use it in GitHub Desktop.
Script to parse Joplin Desktop's revision history and reverse-apply it to the current set of notes, outputting a git repository.
#!/usr/bin/env python3
from collections import defaultdict
from collections.abc import Iterable
from dataclasses import dataclass
from datetime import datetime
import json
import logging
import os
import os.path
from pathlib import Path
from queue import Empty, Queue
import re
import sqlite3
from subprocess import CalledProcessError, run
from typing import cast
from diff_match_patch import diff_match_patch, patch_obj
LOG = logging.getLogger(__name__)
DMP = diff_match_patch()
META_FRONT = ("longitude", "latitude", "altitude")
META_IGNORE = {"body"}
@dataclass
class Entry:
note: sqlite3.Row
revision: sqlite3.Row | None
title: str
body: str
metadata: dict[str, str] | None
def find_profile(home: Path, profile_name: str | None):
config = home / ".config" / "joplin-desktop"
if profile_name:
profile = config / profile_name
else:
profile_names = list(config.glob("profile-*"))
assert len(profile_names) == 1
profile = config / profile_names[0]
assert profile.is_dir()
LOG.info("Profile: %s", profile)
return profile
def prune_folders(rows: Iterable[sqlite3.Row], exclude: list[str]):
folders = {row["id"]: row for row in rows}
LOG.debug("Found %s folders", len(folders))
tree = defaultdict[str, set[str]](set)
for row_id, row in folders.items():
tree[row["parent_id"]].add(row_id)
queue = Queue[str]()
queue.put_nowait("")
pruned = dict[str, sqlite3.Row]()
while True:
try:
parent_id = queue.get_nowait()
except Empty:
break
if parent_id in exclude:
continue
if parent_id:
pruned[parent_id] = folders[parent_id]
for child_id in tree[parent_id]:
queue.put_nowait(child_id)
LOG.debug("Pruned to %s folders", len(pruned))
return pruned
def undo_revision(text: str, delta: dict) -> str:
patches: list[patch_obj] = []
for change in reversed(delta):
patch = patch_obj()
patch.diffs = [[mod * -1, ctx] for mod, ctx in change["diffs"]]
patch.start1 = change["start2"]
patch.start2 = change["start1"]
patch.length1 = change["length2"]
patch.length2 = change["length1"]
patches.append(patch)
reverted, status = DMP.patch_apply(patches, text)
if not all(status):
LOG.warning("Failed to apply all patches")
LOG.debug("Results: %s", status)
return reverted
def note_folder(note: sqlite3.Row, folders: dict[str, sqlite3.Row]):
parent_id = note["parent_id"]
path = list[str]()
while parent_id in folders:
parent = folders[parent_id]
path.insert(0, parent["title"])
parent_id = parent["parent_id"]
return Path(os.path.join(*path))
def fix_links(body: str, paths: dict[str, Path], resources: dict[str, str]):
seen = set()
def _repl(match: re.Match):
link_text, item_id = match.groups()
if item_id in paths:
seen.add(item_id)
path = paths[item_id]
return f"[[{path.with_name(path.stem)}]]"
elif item_id in resources:
return f"[{link_text}](/resources/{resources[item_id]})"
else:
return match[0]
return (re.sub(r"\[([^\]]*)\]\(:/([a-f0-9]+)\)", _repl, body), seen)
def has_value(val):
return val and not str(val).startswith("0.0000")
def add_front_matter(body: str, metadata: dict[str, str] | None):
if metadata:
lines = list[str]()
for key in META_FRONT:
value = metadata.get(key)
if has_value(value):
lines.append(f"{key}: {value}")
if lines:
body = "\n".join(("---", *lines, "---", "", body))
return body
def git(*args: str, **kwargs):
run(("git", *args), check=True, **kwargs)
def main(home: Path, profile_name: str | None, exclude: list[str], output: Path | None, author: tuple[str, str] | None):
profile = find_profile(home, profile_name)
with sqlite3.connect(f"file:{profile / 'database.sqlite'}?mode=ro", uri=True) as conn:
conn.row_factory = sqlite3.Row
cursor = conn.cursor()
folders = prune_folders(cursor.execute("SELECT * FROM folders"), exclude)
marks = ", ".join("?" * len(folders))
folder_ids = list(folders)
cursor.execute(f"SELECT * FROM notes WHERE parent_id IN ({marks})", folder_ids)
notes: dict[str, sqlite3.Row] = {row["id"]: row for row in cursor.fetchall()}
LOG.debug("Read %s notes", len(notes))
cursor.execute(
f"""
SELECT * FROM revisions WHERE item_id IN (
SELECT id FROM notes WHERE parent_id IN ({marks})
) ORDER BY updated_time DESC, created_time DESC
""",
folder_ids,
)
revisions = cursor.fetchall()
LOG.debug("Read %s revisions", len(revisions))
entries = defaultdict[str, list[Entry]](list)
titles = dict[str, str]()
bodies = dict[str, str]()
for i, revision in enumerate(revisions):
note_id = revision["item_id"]
note = notes[note_id]
title = titles.get(note_id, cast(str, note["title"]))
body = bodies.get(note_id, cast(str, note["body"]))
entries[note_id].append(Entry(note, revision, title, body, None))
titles[note_id] = undo_revision(title, json.loads(revision["title_diff"]))
bodies[note_id] = undo_revision(body, json.loads(revision["body_diff"]))
LOG.debug("Unapplied revision #%s on note %s", i, note_id)
for note_id, note in notes.items():
if note["parent_id"] not in folder_ids:
continue
title = titles.get(note_id, cast(str, note["title"]))
body = bodies.get(note_id, cast(str, note["body"]))
if title or body:
LOG.debug("Added residual diff for note %s", note_id)
entries[note_id].append(Entry(note, None, title, body, None))
history = list[tuple[int, int, Entry | sqlite3.Row]]()
for note_entries in entries.values():
first = note_entries[0]
if first.revision:
metadata = json.loads(first.revision["metadata_diff"])["new"]
else:
metadata = dict(first.note)
for key in META_IGNORE:
metadata.pop(key, None)
for entry in note_entries:
if entry is not first and entry.revision:
metadata = dict[str, str](metadata)
diff = json.loads(entry.revision["metadata_diff"])
for key in diff.get("deleted", ()):
if metadata.pop(key, None) is None:
LOG.warning("Missing removed metadata key %s in revision %s", key, entry.revision["id"])
metadata.update(diff.get("new", {}))
entry.metadata = metadata
if entry.revision:
commited = authored = entry.revision["updated_time"]
else:
commited = authored = entry.note["created_time"]
history.append((commited, authored, entry))
LOG.info("Processed all revisions")
resources = dict[str, str]()
for resource in cursor.execute("SELECT * FROM resources"):
resources[resource["id"]] = resource["title"]
history.append((resource["created_time"], resource["user_created_time"], resource))
LOG.info("Read %s resources", len(resources))
history.sort()
if not output:
return
output.mkdir()
os.chdir(output)
resources_dir = Path("resources")
resources_dir.mkdir()
LOG.info("Starting output to %s", output)
git("init")
if author:
git("config", "--local", "user.name", author[0])
git("config", "--local", "user.email", author[1])
start_date = datetime.fromtimestamp(history[0][0] / 1000).astimezone()
git("commit", "--date", start_date.isoformat(), "--allow-empty", "-m", "Initial commit")
paths = dict[str, Path]()
rev_paths = dict[Path, str]()
for committed, authored, ctx in history:
if isinstance(ctx, Entry):
note_id = ctx.note["id"]
source = paths.get(note_id)
folder = note_folder(ctx.note, folders)
folder.mkdir(exist_ok=True)
name = f"{ctx.title.replace("/", "+") or note_id}"
target = folder / f"{name}.md"
i = 1
while rev_paths.get(target) not in (None, note_id):
i += 1
target = folder / f"{name} ({i}).md"
body, seen = fix_links(ctx.body, paths, resources)
body = add_front_matter(body, ctx.metadata)
if not source:
msg = f"Create {target}"
elif source != target:
msg = f"Move {source} to {target}"
git("mv", str(source), str(target))
del rev_paths[source]
for ref_id in seen:
ref_path = paths[ref_id]
ref_body, _ = fix_links(bodies[ref_id], paths, resources)
with open(ref_path, "wb") as fp:
fp.write(ref_body.encode(errors="replace"))
git("add", str(ref_path))
else:
msg = f"Update {target}"
if ctx.metadata:
msg += "\n"
for key, value in ctx.metadata.items():
if has_value(value):
label = re.sub(r"_(.)", lambda m: f"-{m[1].upper()}", f"{key[0].upper()}{key[1:]}")
msg += f"\n{label}: {value}"
paths[note_id] = target
rev_paths[target] = note_id
with open(target, "wb") as fp:
fp.write(body.encode(errors="replace"))
else:
name = f"{ctx['id']}.{ctx['file_extension']}"
source = profile / "resources" / name
target = resources_dir / ctx["title"]
source.copy(target)
msg = f"Add resource {name}"
git("add", str(target))
commit_date = datetime.fromtimestamp(committed / 1000).astimezone()
author_date = datetime.fromtimestamp(authored / 1000).astimezone()
try:
git("commit", "--date", author_date.isoformat(), "-m", msg, env={"GIT_COMMITTER_DATE": commit_date.isoformat()})
except CalledProcessError:
LOG.warning("Failed to commit history entry %s (%s)", committed, msg.splitlines()[0])
if __name__ == "__main__":
from argparse import ArgumentParser
parser = ArgumentParser()
parser.add_argument("--home", type=Path, default=Path.home())
parser.add_argument("--profile")
parser.add_argument("--exclude", nargs="+", default=[])
parser.add_argument("--output", type=Path)
parser.add_argument("--author", nargs=2)
args = parser.parse_args()
logging.basicConfig(level=logging.DEBUG)
main(args.home, args.profile, args.exclude, args.output, args.author)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment