jgru · May 5, 2026 10:43
diff --git a/extract_bibliography.py b/extract_bibliography.py
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-

 """
 `extract_bibliography.py` is a CLI-utility to extract all cite keys from a
 .tex file and write the corresponding BibTeX entries from a given .bib file
 into a smaller output .bib file.

 Cite commands supported: \\cite, \\citep, \\citet, \\citealt, \\citealp,
 \\citeauthor, \\citeyear, and their starred variants, as well as
 \\footcite, \\parencite, \\textcite (biblatex).

 Missing keys are reported but do not abort the process.

 Note on cross-references: if the BibTeX `crossref` field is used, the
 referenced parent entries must also be present in the output .bib file. This
 script does not follow `crossref` fields automatically; make sure the parent
 entries are cited directly, or add them to the output file manually afterwards.
 """

 __author__ = "jgru"
 __version__ = "0.0.1"

 import argparse
 import os
 import re
 import sys

 _COMMENT_RE = re.compile(r"(?<!\\)%[^\n]*")


 def _strip_comments(content):
    return _COMMENT_RE.sub("", content)


 # Natbib/biblatex cite commands with an optional star, up to two optional
 # [..] arguments and a braced, comma-separated key list (group 1).
 # \nocite and \autocite are included because entries referenced only through
 # them are still needed in the extracted bibliography.
 _CITE_RE = re.compile(
    r"\\(?:cite[tp*]?(?:alt|alp|author|year)?"
    r"|footcite|parencite|textcite|autocite|nocite)"
    r"\*?(?:\[[^\]]*\]){0,2}\{([^}]+)\}"
 )
 # \input{file} / \include{file}; group 1 is the file name as written.
 _INPUT_RE = re.compile(r"\\(?:input|include)\{([^}]+)\}")


 def extract_cite_keys(
    texfile, _visited=None, _seen=None, skip_commented=False, _root=None
 ):
    """Extract cite keys from texfile, recursively following \\input and \\include.

    Args:
        texfile: path of the .tex file to scan.
        _visited: set of real paths already scanned; shared across the
            recursion so that circular includes terminate.  Callers may pass
            their own set to learn which files were read.
        _seen: set of keys already collected; shared across the recursion so
            that every key is returned only once.
        skip_commented: when true, comments are stripped before scanning, so
            commented-out cite and include commands are ignored.
        _root: directory of the top-level document (set automatically).
            Included files are looked up next to the including file first
            and, failing that, relative to this directory, which is how
            LaTeX itself resolves nested include paths.

    Returns:
        List of unique keys in order of discovery: the keys of this file
        first, followed by those of the included files.  A wildcard
        ``\\nocite{*}`` is returned as the literal key ``*``.

    Raises:
        OSError: if texfile cannot be opened.
        UnicodeDecodeError: if a scanned file is not valid UTF-8.
    """
    if _visited is None:
        _visited = set()
    if _seen is None:
        _seen = set()

    texfile = os.path.realpath(texfile)
    if texfile in _visited:
        # Already scanned (or currently being scanned higher up the stack).
        return []
    _visited.add(texfile)

    with open(texfile, "r", encoding="utf-8") as f:
        content = f.read()

    if skip_commented:
        content = _strip_comments(content)

    keys = []
    for match in _CITE_RE.finditer(content):
        for key in match.group(1).split(","):
            key = key.strip()
            if key and key not in _seen:
                keys.append(key)
                _seen.add(key)

    texdir = os.path.dirname(texfile)
    if _root is None:
        _root = texdir
    # Including file's directory first (previous behaviour), then the
    # directory of the main document; dict.fromkeys drops the duplicate when
    # both are the same while keeping the order.
    search_dirs = list(dict.fromkeys((texdir, _root)))

    for match in _INPUT_RE.finditer(content):
        subfile = match.group(1).strip()
        if not subfile.endswith(".tex"):
            subfile += ".tex"
        candidates = [
            os.path.realpath(os.path.join(directory, subfile))
            for directory in search_dirs
        ]
        subpath = next((p for p in candidates if os.path.isfile(p)), None)
        if subpath is None:
            print(
                f"Warning: \\input file not found: {candidates[0]}",
                file=sys.stderr,
            )
        else:
            keys.extend(
                extract_cite_keys(subpath, _visited, _seen, skip_commented, _root)
            )

    return keys


 # "@" commands that are not bibliography entries and therefore carry no key.
 _SKIP_TYPES = {"string", "preamble", "comment"}
 # Entry header: "@type" followed by the opening delimiter, "{" or "(".
 _ENTRY_RE = re.compile(r"@(\w+)\s*([{(])", re.IGNORECASE)
 # Cite key: first token after the opening delimiter, terminated by a comma.
 _KEY_RE = re.compile(r"\s*([^,\s]+)\s*,")


 def _find_entry_end(content, start, open_ch):
    """Return the index just past the delimiter closing an entry, or -1.

    start is the index right after the opening delimiter open_ch.  Braces
    are always counted.  A brace-delimited entry ends where the brace depth
    returns to zero; a paren-delimited entry ends at the first ")" that sits
    outside all braces and outside a double-quoted value, so parentheses
    inside field values do not end the entry.
    """
    paren = open_ch == "("
    depth = 0 if paren else 1
    in_quote = False
    for i in range(start, len(content)):
        c = content[i]
        if c == "{":
            depth += 1
        elif c == "}":
            if depth:
                depth -= 1
            if not paren and depth == 0:
                return i + 1
        elif paren and depth == 0:
            # Quotes only delimit values at brace depth 0.
            if c == '"':
                in_quote = not in_quote
            elif c == ")" and not in_quote:
                return i + 1
    return -1


 def scan_bib_entries(bibfile):
    """Return {key: raw_entry_text} by brace-counting — no full parse.

    The raw text runs from the "@" of the entry up to and including its
    closing delimiter.  @string, @preamble and @comment blocks and entries
    without a key are ignored.  Scanning resumes after the end of each
    recognised entry, so "@type{" sequences inside field values are not
    mistaken for entries.  An entry whose closing delimiter is missing is
    skipped with a warning on stderr.  If a key occurs more than once, the
    last occurrence wins.

    Raises:
        OSError: if bibfile cannot be opened.
        UnicodeDecodeError: if bibfile is not valid UTF-8.
    """
    with open(bibfile, "r", encoding="utf-8") as f:
        content = f.read()

    entries = {}
    pos = 0
    while True:
        m = _ENTRY_RE.search(content, pos)
        if m is None:
            break
        pos = m.end()

        if m.group(1).lower() in _SKIP_TYPES:
            continue

        key_m = _KEY_RE.match(content, m.end())
        if not key_m:
            continue
        key = key_m.group(1)

        end = _find_entry_end(content, m.end(), m.group(2))
        if end < 0:
            # Keep scanning right after the header so that later, well-formed
            # entries are still found.
            print(
                f"Warning: unterminated entry '{key}' skipped in '{bibfile}'",
                file=sys.stderr,
            )
            continue

        entries[key] = content[m.start():end]
        pos = end

    return entries


 def write_bib(outfile, keys, entries):
    """Write the entries selected by keys to outfile, framed by a banner.

    The raw text entries[key] is emitted for every key, in the order given,
    each followed by a blank line.  The "autogenerated" banner is written
    both before and after the entries.  outfile is overwritten.
    """
    banner = "% This bibliography is autogenerated\n% DO NOT EDIT BY HAND\n%\n"
    with open(outfile, "w", encoding="utf-8") as out:
        out.write(banner + "\n")
        out.writelines(entries[name] + "\n\n" for name in keys)
        out.write(banner)


 def main(texfile, bibfile, outfile, verbose=False, skip_commented=False):
    """Extract the entries cited in texfile from bibfile into outfile.

    Args:
        texfile: .tex file to scan (included files are followed).
        bibfile: source .bib file.
        outfile: destination .bib file; an existing file is first moved to
            "<outfile>.bak", replacing any previous backup.
        verbose: when true, list every matched key.
        skip_commented: when true, ignore commented-out cite and include
            commands.

    Keys that are cited but absent from bibfile are reported and skipped.
    Exits with status 1 if texfile or bibfile does not exist.
    """
    for label, path in (("tex", texfile), ("bib", bibfile)):
        if not os.path.isfile(path):
            print(f"Error: {label} file not found: {path}", file=sys.stderr)
            sys.exit(1)

    visited = set()
    keys = extract_cite_keys(texfile, _visited=visited, skip_commented=skip_commented)
    # visited also contains texfile itself, hence the "- 1".
    extra = len(visited) - 1
    suffix = f" (+ {extra} \\input file(s))" if extra else ""
    print(f"Found {len(keys)} unique cite key(s) in '{texfile}'{suffix}")

    bib_dict = scan_bib_entries(bibfile)
    print(f"Scanned {len(bib_dict)} entry/entries from '{bibfile}'")

    matched = [key for key in keys if key in bib_dict]
    missing = [key for key in keys if key not in bib_dict]

    if missing:
        print(f"Warning: {len(missing)} key(s) not found in bib file:")
        for k in missing:
            print(f"  - {k}")

    if verbose:
        print("Matched keys:")
        for key in matched:
            print(f"  {key}")

    if os.path.isfile(outfile):
        # os.replace overwrites a backup left by an earlier run on every
        # platform; os.rename raises FileExistsError on Windows in that case.
        os.replace(outfile, f"{outfile}.bak")
        print(f"Existing '{outfile}' backed up as '{outfile}.bak'")

    write_bib(outfile, matched, bib_dict)
    print(f"Wrote {len(matched)} entry/entries to '{outfile}'")


 def handle_arguments():
    """Parse sys.argv and return the resulting argparse.Namespace.

    The namespace carries texfile, bibfile and outfile (positional strings)
    plus the boolean flags verbose and skip_commented.
    """
    cli = argparse.ArgumentParser(
        description=(
            "Extract cited BibTeX entries from a .bib file based on "
            "cite keys found in a .tex file."
        )
    )

    positionals = (
        ("texfile", ".tex file to scan for cite keys"),
        ("bibfile", "source .bib file to extract entries from"),
        ("outfile", "output .bib file with extracted entries"),
    )
    for dest, text in positionals:
        cli.add_argument(dest, type=str, help=text)

    cli.add_argument(
        "-v", "--verbose", action="store_true", help="list all matched cite keys"
    )
    cli.add_argument(
        "--skip-commented",
        action="store_true",
        help="ignore \\input/\\include and cite commands on commented-out lines",
    )
    return cli.parse_args()


 # Script entry point: the parsed command-line options (texfile, bibfile,
 # outfile, verbose, skip_commented) are forwarded to main() as keyword
 # arguments, so the argparse destinations must match main()'s parameters.
 if __name__ == "__main__":
    args = handle_arguments()
    main(**vars(args))
	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-

	"""
	`extract_bibliography.py` is a CLI-utility to extract all cite keys from a
	.tex file and write the corresponding BibTeX entries from a given .bib file
	into a smaller output .bib file.

	Cite commands supported: \\cite, \\citep, \\citet, \\citealt, \\citealp,
	\\citeauthor, \\citeyear, and their starred variants, as well as
	\\footcite, \\parencite, \\textcite (biblatex).

	Missing keys are reported but do not abort the process.

	Note on cross-references: if BibTeX @crossref fields are used, the referenced
	parent entries must also be present in the output .bib file. This script does
	not follow @crossref fields automatically; make sure the parent entries are
	cited directly, or add them to the output file manually afterwards.
	"""

	__author__ = "jgru"
	__version__ = "0.0.1"

	import argparse
	import os
	import re
	import sys

	_COMMENT_RE = re.compile(r"(?<!\\)%[^\n]*")


	def _strip_comments(content):
	return _COMMENT_RE.sub("", content)


	# Natbib/biblatex cite commands with an optional star, up to two optional
	# [..] arguments and a braced, comma-separated key list (group 1).
	# \nocite and \autocite are included because entries referenced only through
	# them are still needed in the extracted bibliography.
	_CITE_RE = re.compile(
	    r"\\(?:cite[tp*]?(?:alt|alp|author|year)?"
	    r"|footcite|parencite|textcite|autocite|nocite)"
	    r"\*?(?:\[[^\]]*\]){0,2}\{([^}]+)\}"
	)
	# \input{file} / \include{file}; group 1 is the file name as written.
	_INPUT_RE = re.compile(r"\\(?:input|include)\{([^}]+)\}")


	def extract_cite_keys(
	    texfile, _visited=None, _seen=None, skip_commented=False, _root=None
	):
	    """Extract cite keys from texfile, recursively following \\input and \\include.

	    Args:
	        texfile: path of the .tex file to scan.
	        _visited: set of real paths already scanned; shared across the
	            recursion so that circular includes terminate.  Callers may pass
	            their own set to learn which files were read.
	        _seen: set of keys already collected; shared across the recursion so
	            that every key is returned only once.
	        skip_commented: when true, comments are stripped before scanning, so
	            commented-out cite and include commands are ignored.
	        _root: directory of the top-level document (set automatically).
	            Included files are looked up next to the including file first
	            and, failing that, relative to this directory, which is how
	            LaTeX itself resolves nested include paths.

	    Returns:
	        List of unique keys in order of discovery: the keys of this file
	        first, followed by those of the included files.  A wildcard
	        ``\\nocite{*}`` is returned as the literal key ``*``.

	    Raises:
	        OSError: if texfile cannot be opened.
	        UnicodeDecodeError: if a scanned file is not valid UTF-8.
	    """
	    if _visited is None:
	        _visited = set()
	    if _seen is None:
	        _seen = set()

	    texfile = os.path.realpath(texfile)
	    if texfile in _visited:
	        # Already scanned (or currently being scanned higher up the stack).
	        return []
	    _visited.add(texfile)

	    with open(texfile, "r", encoding="utf-8") as f:
	        content = f.read()

	    if skip_commented:
	        content = _strip_comments(content)

	    keys = []
	    for match in _CITE_RE.finditer(content):
	        for key in match.group(1).split(","):
	            key = key.strip()
	            if key and key not in _seen:
	                keys.append(key)
	                _seen.add(key)

	    texdir = os.path.dirname(texfile)
	    if _root is None:
	        _root = texdir
	    # Including file's directory first, then the directory of the main
	    # document; dict.fromkeys drops the duplicate when both are the same
	    # while keeping the order.
	    search_dirs = list(dict.fromkeys((texdir, _root)))

	    for match in _INPUT_RE.finditer(content):
	        subfile = match.group(1).strip()
	        if not subfile.endswith(".tex"):
	            subfile += ".tex"
	        candidates = [
	            os.path.realpath(os.path.join(directory, subfile))
	            for directory in search_dirs
	        ]
	        subpath = next((p for p in candidates if os.path.isfile(p)), None)
	        if subpath is None:
	            print(
	                f"Warning: \\input file not found: {candidates[0]}",
	                file=sys.stderr,
	            )
	        else:
	            keys.extend(
	                extract_cite_keys(subpath, _visited, _seen, skip_commented, _root)
	            )

	    return keys


	# "@" commands that are not bibliography entries and therefore carry no key.
	_SKIP_TYPES = {"string", "preamble", "comment"}
	# Entry header: "@type" followed by the opening delimiter, "{" or "(".
	_ENTRY_RE = re.compile(r"@(\w+)\s*([{(])", re.IGNORECASE)
	# Cite key: first token after the opening delimiter, terminated by a comma.
	_KEY_RE = re.compile(r"\s*([^,\s]+)\s*,")


	def _find_entry_end(content, start, open_ch):
	    """Return the index just past the delimiter closing an entry, or -1.

	    start is the index right after the opening delimiter open_ch.  Braces
	    are always counted.  A brace-delimited entry ends where the brace depth
	    returns to zero; a paren-delimited entry ends at the first ")" that sits
	    outside all braces and outside a double-quoted value, so parentheses
	    inside field values do not end the entry.
	    """
	    paren = open_ch == "("
	    depth = 0 if paren else 1
	    in_quote = False
	    for i in range(start, len(content)):
	        c = content[i]
	        if c == "{":
	            depth += 1
	        elif c == "}":
	            if depth:
	                depth -= 1
	            if not paren and depth == 0:
	                return i + 1
	        elif paren and depth == 0:
	            # Quotes only delimit values at brace depth 0.
	            if c == '"':
	                in_quote = not in_quote
	            elif c == ")" and not in_quote:
	                return i + 1
	    return -1


	def scan_bib_entries(bibfile):
	    """Return {key: raw_entry_text} by brace-counting — no full parse.

	    The raw text runs from the "@" of the entry up to and including its
	    closing delimiter.  @string, @preamble and @comment blocks and entries
	    without a key are ignored.  Scanning resumes after the end of each
	    recognised entry, so "@type{" sequences inside field values are not
	    mistaken for entries.  An entry whose closing delimiter is missing is
	    skipped with a warning on stderr.  If a key occurs more than once, the
	    last occurrence wins.

	    Raises:
	        OSError: if bibfile cannot be opened.
	        UnicodeDecodeError: if bibfile is not valid UTF-8.
	    """
	    with open(bibfile, "r", encoding="utf-8") as f:
	        content = f.read()

	    entries = {}
	    pos = 0
	    while True:
	        m = _ENTRY_RE.search(content, pos)
	        if m is None:
	            break
	        pos = m.end()

	        if m.group(1).lower() in _SKIP_TYPES:
	            continue

	        key_m = _KEY_RE.match(content, m.end())
	        if not key_m:
	            continue
	        key = key_m.group(1)

	        end = _find_entry_end(content, m.end(), m.group(2))
	        if end < 0:
	            # Keep scanning right after the header so that later, well-formed
	            # entries are still found.
	            print(
	                f"Warning: unterminated entry '{key}' skipped in '{bibfile}'",
	                file=sys.stderr,
	            )
	            continue

	        entries[key] = content[m.start():end]
	        pos = end

	    return entries


	def write_bib(outfile, keys, entries):
	    """Write the entries selected by keys to outfile, framed by a banner.

	    The raw text entries[key] is emitted for every key, in the order given,
	    each followed by a blank line.  The "autogenerated" banner is written
	    both before and after the entries.  outfile is overwritten.
	    """
	    banner = "% This bibliography is autogenerated\n% DO NOT EDIT BY HAND\n%\n"
	    with open(outfile, "w", encoding="utf-8") as out:
	        out.write(banner + "\n")
	        out.writelines(entries[name] + "\n\n" for name in keys)
	        out.write(banner)


	def main(texfile, bibfile, outfile, verbose=False, skip_commented=False):
	    """Extract the entries cited in texfile from bibfile into outfile.

	    Args:
	        texfile: .tex file to scan (included files are followed).
	        bibfile: source .bib file.
	        outfile: destination .bib file; an existing file is first moved to
	            "<outfile>.bak", replacing any previous backup.
	        verbose: when true, list every matched key.
	        skip_commented: when true, ignore commented-out cite and include
	            commands.

	    Keys that are cited but absent from bibfile are reported and skipped.
	    Exits with status 1 if texfile or bibfile does not exist.
	    """
	    for label, path in (("tex", texfile), ("bib", bibfile)):
	        if not os.path.isfile(path):
	            print(f"Error: {label} file not found: {path}", file=sys.stderr)
	            sys.exit(1)

	    visited = set()
	    keys = extract_cite_keys(texfile, _visited=visited, skip_commented=skip_commented)
	    # visited also contains texfile itself, hence the "- 1".
	    extra = len(visited) - 1
	    suffix = f" (+ {extra} \\input file(s))" if extra else ""
	    print(f"Found {len(keys)} unique cite key(s) in '{texfile}'{suffix}")

	    bib_dict = scan_bib_entries(bibfile)
	    print(f"Scanned {len(bib_dict)} entry/entries from '{bibfile}'")

	    matched = [key for key in keys if key in bib_dict]
	    missing = [key for key in keys if key not in bib_dict]

	    if missing:
	        print(f"Warning: {len(missing)} key(s) not found in bib file:")
	        for k in missing:
	            print(f"  - {k}")

	    if verbose:
	        print("Matched keys:")
	        for key in matched:
	            print(f"  {key}")

	    if os.path.isfile(outfile):
	        # os.replace overwrites a backup left by an earlier run on every
	        # platform; os.rename raises FileExistsError on Windows in that case.
	        os.replace(outfile, f"{outfile}.bak")
	        print(f"Existing '{outfile}' backed up as '{outfile}.bak'")

	    write_bib(outfile, matched, bib_dict)
	    print(f"Wrote {len(matched)} entry/entries to '{outfile}'")


	def handle_arguments():
	    """Parse sys.argv and return the resulting argparse.Namespace.

	    The namespace carries texfile, bibfile and outfile (positional strings)
	    plus the boolean flags verbose and skip_commented.
	    """
	    parser = argparse.ArgumentParser(
	        description="Extract cited BibTeX entries from a .bib file based on "
	        "cite keys found in a .tex file."
	    )
	    parser.add_argument("texfile", type=str, help=".tex file to scan for cite keys")
	    parser.add_argument(
	        "bibfile", type=str, help="source .bib file to extract entries from"
	    )
	    parser.add_argument(
	        "outfile", type=str, help="output .bib file with extracted entries"
	    )
	    parser.add_argument(
	        "-v",
	        "--verbose",
	        action="store_true",
	        help="list all matched cite keys",
	    )
	    parser.add_argument(
	        "--skip-commented",
	        action="store_true",
	        help="ignore \\input/\\include and cite commands on commented-out lines",
	    )
	    return parser.parse_args()


	# Script entry point: the parsed command-line options (texfile, bibfile,
	# outfile, verbose, skip_commented) are forwarded to main() as keyword
	# arguments, so the argparse destinations must match main()'s parameters.
	if __name__ == "__main__":
	    args = handle_arguments()
	    main(**vars(args))
No results found