Skip to content

Instantly share code, notes, and snippets.

@jgru
Last active May 5, 2026 10:43
Show Gist options
  • Select an option

  • Save jgru/b2fa5fe83f1b35b17fdb6be2d50c84b3 to your computer and use it in GitHub Desktop.

Select an option

Save jgru/b2fa5fe83f1b35b17fdb6be2d50c84b3 to your computer and use it in GitHub Desktop.
Parses a .tex file for citekeys and extracts those from a more comprehensive .bib file into a smaller one.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
`extract_bibliography.py` is a CLI-utility to extract all cite keys from a
.tex file and write the corresponding BibTeX entries from a given .bib file
into a smaller output .bib file.
Cite commands supported: \\cite, \\citep, \\citet, \\citealt, \\citealp,
\\citeauthor, \\citeyear, and their starred variants, as well as
\\footcite, \\parencite, \\textcite (biblatex).
Missing keys are reported but do not abort the process.
Note on cross-references: if entries use the BibTeX `crossref` field, the
referenced parent entries must also be present in the output .bib file. This
script does not follow `crossref` fields automatically; make sure the parent
entries are cited directly, or add them to the output file manually afterwards.
"""
__author__ = "jgru"
__version__ = "0.0.1"
import argparse
import os
import re
import sys
# A "%" starts a TeX comment unless it is escaped, and it is escaped only when
# an odd number of backslashes precedes it.  In "\%" the backslash escapes the
# percent sign, but in "\\%" the two backslashes form a line break and the
# percent sign starts a real comment.  Group 1 captures the even-length run of
# backslash pairs in front of a comment so the substitution can keep it.
_COMMENT_RE = re.compile(r"(?<!\\)((?:\\\\)*)%[^\n]*")


def _strip_comments(content):
    """Return *content* with TeX comments removed.

    Everything from an unescaped ``%`` up to (but not including) the end of
    the line is dropped; the newline itself is kept.  An escaped percent sign
    (``\\%``) is left untouched, while a percent sign that follows a line
    break (``\\\\%``) is treated as the comment it is.
    """
    return _COMMENT_RE.sub(r"\1", content)
# Matches the supported cite commands, an optional star and up to two optional
# [...] arguments; group 1 captures the comma-separated key list.
_CITE_RE = re.compile(
    r"\\(?:cite[tp*]?(?:alt|alp|author|year)?|footcite|parencite|textcite)"
    r"\*?(?:\[[^\]]*\]){0,2}\{([^}]+)\}"
)
# Group 1 captures the file argument of \input{...} / \include{...}.
_INPUT_RE = re.compile(r"\\(?:input|include)\{([^}]+)\}")


def extract_cite_keys(
    texfile, _visited=None, _seen=None, skip_commented=False, _root=None
):
    """Extract cite keys from texfile, recursively following \\input and \\include.

    Args:
        texfile: Path of the .tex file to scan.
        _visited: Set of real paths already scanned; used to break include
            cycles.  Callers may pass their own set to learn which files
            were read.
        _seen: Set of keys already collected, so each key is returned once.
        skip_commented: When true, comments are stripped before scanning, so
            cite and input commands in comments are ignored.
        _root: Directory of the top-level document.  Set automatically on the
            first call and handed down to recursive calls.

    Returns:
        A list of unique cite keys.  The keys of a file come first, in order
        of appearance, followed by the keys of the files it includes.

    Included files that cannot be found are reported on stderr and skipped.
    """
    if _visited is None:
        _visited = set()
    if _seen is None:
        _seen = set()

    texfile = os.path.realpath(texfile)
    if texfile in _visited:
        return []
    _visited.add(texfile)

    texdir = os.path.dirname(texfile)
    if _root is None:
        _root = texdir

    with open(texfile, "r", encoding="utf-8") as f:
        content = f.read()
    if skip_commented:
        content = _strip_comments(content)

    keys = []
    for match in _CITE_RE.finditer(content):
        for key in match.group(1).split(","):
            key = key.strip()
            if key and key not in _seen:
                keys.append(key)
                _seen.add(key)

    # TeX resolves \input/\include paths against the directory of the
    # top-level document, not against the file containing the command.  Look
    # there first and fall back to the including file's own directory.
    search_dirs = [_root] if texdir == _root else [_root, texdir]
    for match in _INPUT_RE.finditer(content):
        subfile = match.group(1).strip()
        if not subfile.endswith(".tex"):
            subfile += ".tex"
        candidates = [
            os.path.realpath(os.path.join(base, subfile)) for base in search_dirs
        ]
        subpath = next((p for p in candidates if os.path.isfile(p)), None)
        if subpath is None:
            print(
                f"Warning: \\input file not found: {candidates[0]}",
                file=sys.stderr,
            )
            continue
        keys.extend(
            extract_cite_keys(subpath, _visited, _seen, skip_commented, _root)
        )
    return keys
# Entry types that carry no citable entry and are therefore never collected.
_SKIP_TYPES = {"string", "preamble", "comment"}
# "@type{" or "@type(": group 1 is the entry type, group 2 the opening delimiter.
_ENTRY_RE = re.compile(r"@(\w+)\s*([{(])", re.IGNORECASE)
# The cite key: everything up to the first comma, surrounding whitespace ignored.
_KEY_RE = re.compile(r"\s*([^,\s]+)\s*,")


def _find_entry_end(content, start, open_ch):
    """Locate the end of an entry body that is scanned from index *start*.

    Returns ``(end, closed)``.  ``end`` is the index just past the closing
    delimiter, or ``len(content)`` if the entry is never closed, and
    ``closed`` tells which of the two happened.
    """
    n = len(content)
    i = start
    if open_ch == "{":
        # Brace-delimited entry: plain brace counting, starting inside the
        # entry's own opening brace.
        depth = 1
        while i < n and depth:
            c = content[i]
            if c == "{":
                depth += 1
            elif c == "}":
                depth -= 1
            i += 1
        return i, depth == 0

    # Paren-delimited entry: parentheses inside braced or quoted field values
    # are ordinary text, so only a ")" outside of both closes the entry.
    depth = 0
    in_quote = False
    while i < n:
        c = content[i]
        i += 1
        if c == "{":
            depth += 1
        elif c == "}":
            if depth:
                depth -= 1
        elif depth == 0:
            if c == '"':
                in_quote = not in_quote
            elif c == ")" and not in_quote:
                return i, True
    return i, False


def scan_bib_entries(bibfile):
    """Return {key: raw_entry_text} by brace-counting — no full parse.

    Args:
        bibfile: Path of the .bib file to scan (read as UTF-8).

    Returns:
        A dict mapping each cite key to the verbatim text of its entry, from
        the leading ``@`` through the closing delimiter.  ``@string``,
        ``@preamble`` and ``@comment`` blocks are not collected.  If a key
        occurs more than once, the last occurrence wins.
    """
    with open(bibfile, "r", encoding="utf-8") as f:
        content = f.read()

    entries = {}
    pos = 0
    while True:
        m = _ENTRY_RE.search(content, pos)
        if m is None:
            break
        pos = m.end()
        if m.group(1).lower() in _SKIP_TYPES:
            continue
        key_m = _KEY_RE.match(content, m.end())
        if not key_m:
            continue
        open_ch = m.group(2)
        # For paren-delimited entries start after the key, so that
        # parentheses inside the key itself cannot close the entry.
        body_start = m.end() if open_ch == "{" else key_m.end()
        end, closed = _find_entry_end(content, body_start, open_ch)
        entries[key_m.group(1)] = content[m.start():end]
        if closed:
            # Resume after the entry so "@word{" text inside a field value is
            # not mistaken for a new entry.  An unterminated entry is not
            # skipped, so the entries following it are still found.
            pos = end
    return entries
def write_bib(outfile, keys, entries):
    """Write the entries named by *keys* to *outfile*.

    The file is created or truncated, encoded as UTF-8, and framed by an
    "autogenerated" notice at the top and bottom.  Entries are written in the
    order of *keys*, each followed by a blank line.  A key missing from
    *entries* raises ``KeyError``.
    """
    notice = "% This bibliography is autogenerated\n% DO NOT EDIT BY HAND\n%\n"
    with open(outfile, "w", encoding="utf-8") as out:
        out.write(notice + "\n")
        for cite_key in keys:
            out.write(entries[cite_key])
            out.write("\n\n")
        out.write(notice)
def main(texfile, bibfile, outfile, verbose=False, skip_commented=False):
    """Extract the entries cited in *texfile* from *bibfile* into *outfile*.

    Args:
        texfile: .tex file to scan for cite keys (included files are followed).
        bibfile: Source .bib file to take the entries from.
        outfile: Destination .bib file.  An existing file is first moved to
            ``<outfile>.bak``.
        verbose: When true, list every matched key.
        skip_commented: When true, ignore cite/input commands inside comments.

    Exits with status 1 if *texfile* or *bibfile* does not exist.  Keys that
    are cited but absent from *bibfile* are reported and skipped.
    """
    if not os.path.isfile(texfile):
        print(f"Error: tex file not found: {texfile}", file=sys.stderr)
        sys.exit(1)
    if not os.path.isfile(bibfile):
        print(f"Error: bib file not found: {bibfile}", file=sys.stderr)
        sys.exit(1)

    # The visited set is filled by extract_cite_keys; everything beyond the
    # main file itself was pulled in through \input/\include.
    visited = set()
    keys = extract_cite_keys(texfile, _visited=visited, skip_commented=skip_commented)
    extra = len(visited) - 1
    suffix = f" (+ {extra} \\input file(s))" if extra else ""
    print(f"Found {len(keys)} unique cite key(s) in '{texfile}'{suffix}")

    bib_dict = scan_bib_entries(bibfile)
    print(f"Scanned {len(bib_dict)} entry/entries from '{bibfile}'")

    matched = [key for key in keys if key in bib_dict]
    missing = [key for key in keys if key not in bib_dict]

    if missing:
        print(f"Warning: {len(missing)} key(s) not found in bib file:")
        for k in missing:
            print(f" - {k}")
    if verbose:
        print("Matched keys:")
        for key in matched:
            print(f" {key}")

    if os.path.isfile(outfile):
        # os.replace overwrites a backup left by an earlier run on every
        # platform; os.rename raises FileExistsError on Windows in that case.
        os.replace(outfile, f"{outfile}.bak")
        print(f"Existing '{outfile}' backed up as '{outfile}.bak'")

    write_bib(outfile, matched, bib_dict)
    print(f"Wrote {len(matched)} entry/entries to '{outfile}'")
def handle_arguments():
    """Parse ``sys.argv`` and return the resulting argparse namespace.

    The namespace carries ``texfile``, ``bibfile``, ``outfile`` (positional)
    plus the boolean flags ``verbose`` and ``skip_commented``.
    """
    cli = argparse.ArgumentParser(
        description=(
            "Extract cited BibTeX entries from a .bib file based on "
            "cite keys found in a .tex file."
        )
    )

    # The three positionals differ only in name and help text.
    positionals = (
        ("texfile", ".tex file to scan for cite keys"),
        ("bibfile", "source .bib file to extract entries from"),
        ("outfile", "output .bib file with extracted entries"),
    )
    for dest, description in positionals:
        cli.add_argument(dest, type=str, help=description)

    cli.add_argument(
        "-v", "--verbose", action="store_true", help="list all matched cite keys"
    )
    cli.add_argument(
        "--skip-commented",
        action="store_true",
        help="ignore \\input/\\include and cite commands on commented-out lines",
    )
    return cli.parse_args()
if __name__ == "__main__":
    # The parsed option names match main()'s parameter names, so the
    # namespace can be unpacked straight into the call.
    main(**vars(handle_arguments()))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment