Last active
May 5, 2026 10:43
-
-
Save jgru/b2fa5fe83f1b35b17fdb6be2d50c84b3 to your computer and use it in GitHub Desktop.
Parses a .tex file for citekeys and extracts those from a more comprehensive .bib file into a smaller one.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| # -*- coding: utf-8 -*- | |
| """ | |
| `extract_bibliography.py` is a CLI-utility to extract all cite keys from a | |
| .tex file and write the corresponding BibTeX entries from a given .bib file | |
| into a smaller output .bib file. | |
| Cite commands supported: \\cite, \\citep, \\citet, \\citealt, \\citealp, | |
| \\citeauthor, \\citeyear, and their starred variants, as well as | |
| \\footcite, \\parencite, \\textcite (biblatex). | |
| Missing keys are reported but do not abort the process. | |
| Note on cross-references: if the BibTeX `crossref` field is used, the referenced | |
| parent entries must also be present in the output .bib file. This script does | |
| not follow `crossref` fields automatically; make sure the parent entries are | |
| cited directly, or add them to the output file manually afterwards. | |
| """ | |
| __author__ = "jgru" | |
| __version__ = "0.0.1" | |
| import argparse | |
| import os | |
| import re | |
| import sys | |
# Matches a LaTeX comment: an unescaped ``%`` up to (not including) the end of
# the line.  Group 1 captures an even-length run of backslashes directly in
# front of the ``%``; such a run consists of complete ``\\`` pairs, so the
# ``%`` is NOT escaped (e.g. ``a & b \\% note``).  A single ``\%`` is a
# literal percent sign and never matches.
_COMMENT_RE = re.compile(r"(?<!\\)((?:\\\\)*)%[^\n]*")


def _strip_comments(content):
    """Return *content* with LaTeX ``%`` comments removed.

    Everything from an unescaped ``%`` to the end of its line is dropped; the
    newline itself is kept.  ``\\%`` (escaped percent) is left untouched,
    while a ``%`` preceded by an even number of backslashes (``\\\\%``, i.e.
    a line break followed by a comment) is treated as a comment and the
    backslashes are preserved.

    The scan is purely textual: ``%`` inside verbatim-like environments or
    ``\\verb`` is not recognised as literal text.
    """
    # Re-insert the captured backslash pairs so only the comment disappears.
    return _COMMENT_RE.sub(r"\1", content)
# Matches the supported natbib/biblatex cite commands, an optional star, up to
# two optional ``[...]`` arguments, and captures the comma-separated key list.
_CITE_RE = re.compile(
    r"\\(?:cite[tp*]?(?:alt|alp|author|year)?|footcite|parencite|textcite)"
    r"\*?(?:\[[^\]]*\]){0,2}\{([^}]+)\}"
)
# Captures the file argument of \input{...} and \include{...}.
_INPUT_RE = re.compile(r"\\(?:input|include)\{([^}]+)\}")


def extract_cite_keys(
    texfile, _visited=None, _seen=None, skip_commented=False, _rootdir=None
):
    """Extract cite keys from texfile, recursively following \\input and \\include.

    Args:
        texfile: Path of the .tex file to scan.
        _visited: Internal. Set of real paths already scanned; used to break
            inclusion cycles. Callers may pass their own set to learn which
            files were visited.
        _seen: Internal. Set of keys already collected, shared across the
            recursion so each key is reported once.
        skip_commented: If true, ``%`` comments are stripped before scanning,
            so commented-out cite and input commands are ignored.
        _rootdir: Internal. Directory of the top-level document; defaults to
            the directory of the first file scanned.

    Returns:
        A list of unique cite keys in order of first appearance. Keys of the
        current file come first, followed by the keys of its included files.

    Included files are looked up relative to the including file first. If
    that fails, they are looked up relative to the top-level document's
    directory, which is how LaTeX itself resolves nested ``\\input`` paths.
    Files that cannot be found are reported on stderr and skipped.
    """
    if _visited is None:
        _visited = set()
    if _seen is None:
        _seen = set()

    texfile = os.path.realpath(texfile)
    if texfile in _visited:
        return []
    _visited.add(texfile)

    texdir = os.path.dirname(texfile)
    if _rootdir is None:
        # First (outermost) call: this file is the main document.
        _rootdir = texdir

    with open(texfile, "r", encoding="utf-8") as f:
        content = f.read()
    if skip_commented:
        content = _strip_comments(content)

    keys = []
    for match in _CITE_RE.finditer(content):
        for key in match.group(1).split(","):
            key = key.strip()
            if key and key not in _seen:
                keys.append(key)
                _seen.add(key)

    for match in _INPUT_RE.finditer(content):
        subfile = match.group(1).strip()
        if not subfile.endswith(".tex"):
            subfile += ".tex"
        subpath = os.path.realpath(os.path.join(texdir, subfile))
        if not os.path.isfile(subpath):
            # Nested inputs are usually written relative to the main
            # document, not relative to the file that contains them.
            rootpath = os.path.realpath(os.path.join(_rootdir, subfile))
            if os.path.isfile(rootpath):
                subpath = rootpath
        if os.path.isfile(subpath):
            keys.extend(
                extract_cite_keys(
                    subpath, _visited, _seen, skip_commented, _rootdir
                )
            )
        else:
            print(f"Warning: \\input file not found: {subpath}", file=sys.stderr)
    return keys
# Entry types that carry no citable key and are therefore not collected.
_SKIP_TYPES = {"string", "preamble", "comment"}
# ``@type{`` or ``@type(``: group 1 is the entry type, group 2 the delimiter.
_ENTRY_RE = re.compile(r"@(\w+)\s*([{(])", re.IGNORECASE)
# The cite key: first token after the opening delimiter, up to the comma.
_KEY_RE = re.compile(r"\s*([^,\s]+)\s*,")


def scan_bib_entries(bibfile):
    """Return {key: raw_entry_text} by brace-counting — no full parse.

    Args:
        bibfile: Path of the .bib file to scan (read as UTF-8).

    Returns:
        A dict mapping each cite key to the verbatim text of its entry, from
        the leading ``@`` through the matching closing delimiter. If a key
        occurs more than once, the last occurrence wins.

    ``@string``, ``@preamble`` and ``@comment`` blocks are not collected, and
    neither are entries without a ``key,`` prefix. Text that merely looks
    like an entry header but sits inside the body of an already collected
    entry (for example ``@misc{...`` quoted in a ``note`` field) is ignored
    instead of being recorded as a separate entry. An entry whose closing
    delimiter is missing extends to the end of the file.
    """
    with open(bibfile, "r", encoding="utf-8") as f:
        content = f.read()

    entries = {}
    n = len(content)
    consumed = 0  # end offset of the last properly closed, collected entry
    for m in _ENTRY_RE.finditer(content):
        if m.start() < consumed:
            # This "@type{" lies inside the previous entry's body.
            continue
        if m.group(1).lower() in _SKIP_TYPES:
            continue
        open_ch = m.group(2)
        close_ch = "}" if open_ch == "{" else ")"
        entry_start = m.start()
        i = m.end()  # just past the opening delimiter
        key_m = _KEY_RE.match(content, i)
        if not key_m:
            continue
        key = key_m.group(1)

        # Walk forward until the delimiter opened by the header is closed.
        depth = 1
        while i < n and depth:
            c = content[i]
            if c == open_ch:
                depth += 1
            elif c == close_ch:
                depth -= 1
            i += 1

        if depth == 0:
            # Only a well-terminated entry may hide later matches; an
            # unterminated one must not swallow the rest of the file.
            consumed = i
        entries[key] = content[entry_start:i]
    return entries
def write_bib(outfile, keys, entries):
    """Write the entries selected by *keys* to *outfile*.

    The file starts and ends with an "autogenerated" banner comment. Each
    entry text is taken verbatim from *entries* (a key -> text mapping), in
    the order given by *keys*, and is followed by a blank line. The file is
    written as UTF-8 and any existing file is overwritten.
    """
    banner = "% This bibliography is autogenerated\n% DO NOT EDIT BY HAND\n%\n"
    with open(outfile, "w", encoding="utf-8") as out:
        # The leading banner is separated from the first entry by a blank line.
        out.write(banner + "\n")
        out.writelines(entries[name] + "\n\n" for name in keys)
        out.write(banner)
def main(texfile, bibfile, outfile, verbose=False, skip_commented=False):
    """Extract the entries cited in *texfile* from *bibfile* into *outfile*.

    Args:
        texfile: Path of the .tex file to scan for cite keys.
        bibfile: Path of the source .bib file.
        outfile: Path of the .bib file to write. An existing file is first
            moved to ``<outfile>.bak``, replacing any previous backup.
        verbose: If true, list every matched key on stdout.
        skip_commented: If true, ignore cite and input commands that sit in
            ``%`` comments.

    Cited keys that are missing from *bibfile* are reported but do not abort
    the run. The process exits with status 1 if *texfile* or *bibfile* does
    not exist.
    """
    if not os.path.isfile(texfile):
        print(f"Error: tex file not found: {texfile}", file=sys.stderr)
        sys.exit(1)
    if not os.path.isfile(bibfile):
        print(f"Error: bib file not found: {bibfile}", file=sys.stderr)
        sys.exit(1)

    # Pass our own set so we can report how many included files were scanned.
    visited = set()
    keys = extract_cite_keys(texfile, _visited=visited, skip_commented=skip_commented)
    extra = len(visited) - 1
    suffix = f" (+ {extra} \\input file(s))" if extra else ""
    print(f"Found {len(keys)} unique cite key(s) in '{texfile}'{suffix}")

    bib_dict = scan_bib_entries(bibfile)
    print(f"Scanned {len(bib_dict)} entry/entries from '{bibfile}'")

    matched = [key for key in keys if key in bib_dict]
    missing = [key for key in keys if key not in bib_dict]

    if missing:
        print(f"Warning: {len(missing)} key(s) not found in bib file:")
        for k in missing:
            print(f" - {k}")
    if verbose:
        print("Matched keys:")
        for key in matched:
            print(f" {key}")

    if os.path.isfile(outfile):
        # os.replace overwrites an existing backup on every platform;
        # os.rename raises FileExistsError on Windows in that situation.
        os.replace(outfile, f"{outfile}.bak")
        print(f"Existing '{outfile}' backed up as '{outfile}.bak'")

    write_bib(outfile, matched, bib_dict)
    print(f"Wrote {len(matched)} entry/entries to '{outfile}'")
def handle_arguments():
    """Parse the command line and return the resulting argparse namespace.

    The namespace carries ``texfile``, ``bibfile`` and ``outfile`` (required
    positionals) plus the boolean switches ``verbose`` and ``skip_commented``.
    """
    cli = argparse.ArgumentParser(
        description="Extract cited BibTeX entries from a .bib file based on "
        "cite keys found in a .tex file."
    )

    positionals = (
        ("texfile", ".tex file to scan for cite keys"),
        ("bibfile", "source .bib file to extract entries from"),
        ("outfile", "output .bib file with extracted entries"),
    )
    for dest, text in positionals:
        cli.add_argument(dest, type=str, help=text)

    switches = (
        (("-v", "--verbose"), "list all matched cite keys"),
        (
            ("--skip-commented",),
            "ignore \\input/\\include and cite commands on commented-out lines",
        ),
    )
    for flags, text in switches:
        cli.add_argument(*flags, action="store_true", help=text)

    return cli.parse_args()
if __name__ == "__main__":
    # Script entry point: parse the command line and forward each option
    # to main() explicitly.
    cli_args = handle_arguments()
    main(
        cli_args.texfile,
        cli_args.bibfile,
        cli_args.outfile,
        verbose=cli_args.verbose,
        skip_commented=cli_args.skip_commented,
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment