Skip to content

Instantly share code, notes, and snippets.

@vpnry
Created April 17, 2026 10:40
Show Gist options
  • Select an option

  • Save vpnry/bfaa1c898bf6bdf73ac53d423d962314 to your computer and use it in GitHub Desktop.

Select an option

Save vpnry/bfaa1c898bf6bdf73ac53d423d962314 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Digital Pali Dictionary TXT Parser Test
https://github.com/digitalpalidictionary/dpd-db/releases/download/v0.3.20260402/dpd-txt.zip
Parses a structured dictionary text file into JSON with HTML-formatted content.
"""
import re
import json
import sys
from collections import defaultdict
def parse_dict_file(filepath: str) -> list[dict]:
    """Parse a DPD-style dictionary text file into JSON-ready records.

    The input format is line-oriented:
      * an unindented line starts a new entry: "word[, pos. definition]";
      * indented lines are detail lines belonging to the open entry;
      * a blank line terminates the open entry.

    Entries sharing the same base word (e.g. "kata 1.1", "kata 1.2") are
    grouped under one record.

    Returns a list of {"dict_key": base_word, "dict_content": html} dicts,
    in first-seen order of the base words.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        lines = f.readlines()

    raw_entries: list[dict] = []
    current: dict | None = None

    for raw_line in lines:
        # Drop only the trailing newline; leading spaces mark detail lines.
        text = raw_line.rstrip("\n")

        if not text.strip():
            # Blank line closes the entry in progress.
            if current:
                raw_entries.append(current)
            current = None
            continue

        if text.startswith(" "):
            # Indented continuation: a detail line for the open entry.
            if current is not None:
                current["details"].append(text.strip())
            continue

        # Unindented line: a new headword, which closes any open entry.
        if current:
            raw_entries.append(current)

        # The headword is everything before the very first comma; the rest
        # of the line (if any) is the POS + definition text.
        head, sep, tail = text.partition(",")
        word_full = head.strip()
        rest = tail.strip() if sep else ""

        # Split a trailing homonym number off the headword. Handles both
        # dotted ("a 1.1") and bare ("akaci 2") numeric suffixes.
        suffix_match = re.match(r"^(.+?)\s+(\d+(?:\.\d+)?)$", word_full)
        if suffix_match:
            base_word = suffix_match.group(1).strip()
            num_suffix = suffix_match.group(2)
        else:
            base_word, num_suffix = word_full, None

        current = {
            "base_word": base_word,
            "num_suffix": num_suffix,
            "pos_and_def": rest,
            "details": [],
        }

    # Flush the final entry (file may not end with a blank line).
    if current:
        raw_entries.append(current)

    # Group by base word; dict insertion order preserves first-seen order.
    grouped: dict[str, list[dict]] = {}
    for entry in raw_entries:
        grouped.setdefault(entry["base_word"], []).append(entry)

    return [
        {"dict_key": base_word, "dict_content": build_html(base_word, entries)}
        for base_word, entries in grouped.items()
    ]
def parse_pos_and_def(pos_and_def: str) -> tuple[str, str]:
    """Split an entry's text (everything after the first comma) into
    (part-of-speech, definition).

    The POS is one or more short, dot-terminated abbreviations that prefix
    the definition, e.g.:

        "adj. smooth; not harsh"  -> ("adj.", "smooth; not harsh")
        "adj. gen. of words"      -> ("adj. gen.", "of words")
        "letter. (gram) letter a" -> ("letter.", "(gram) letter a")

    Returns ("", definition) when no leading POS abbreviation is found.
    """
    # Tokenize on whitespace that follows a '.', so each candidate POS
    # abbreviation ("adj.", "gen.", ...) becomes its own token. No maxsplit:
    # the loop below stops at the first non-POS token anyway, so limiting
    # the split to 10 (as before) only risked truncating long POS runs.
    tokens = re.split(r"(?<=\.)\s+", pos_and_def)

    # Greedily consume leading tokens that look like POS abbreviations:
    # short (<=12 chars before the dot), letters/diacritics/hyphen only,
    # and containing no semicolon (semicolons only occur in definitions).
    #
    # BUGFIX: the remainder must be sliced at the offset actually consumed
    # in the ORIGINAL string. The previous code sliced at len(" ".join(pos)),
    # which mis-slices whenever POS tokens were separated by more than one
    # space (e.g. "adj.  gen. of words" left a stray ". " in the definition).
    consumed = 0
    pos_tokens: list[str] = []
    for tok in tokens:
        if re.fullmatch(r"[A-Za-zāīūṭḍṇḷṃñ\s\-]{1,12}\.", tok) and ";" not in tok:
            pos_tokens.append(tok)
            # Advance past this token's position in the original string.
            consumed = pos_and_def.index(tok, consumed) + len(tok)
        else:
            break

    if pos_tokens:
        return " ".join(pos_tokens), pos_and_def[consumed:].strip()
    return "", pos_and_def.strip()
def detail_line_to_html(line: str) -> str:
    """Render a "Key: value" detail line as styled HTML spans.

    Lines that do not match the "Key: value" shape pass through unchanged.
    """
    match = re.match(r"^([A-Za-z /]+):\s*(.+)$", line)
    if match is None:
        return line
    key = match.group(1).strip()
    val = match.group(2).strip()
    return (
        f'<span class="detail-key">{key}:</span> '
        f'<span class="detail-val">{val}</span>'
    )
def build_html(base_word: str, entries: list[dict]) -> str:
    """Render every sense of *base_word* as one HTML fragment.

    Each entry becomes a "sense" div; a running sense number is shown only
    when at least one entry carries a numeric homonym suffix.
    """
    numbered = any(e["num_suffix"] for e in entries)

    out: list[str] = [
        '<div class="dict-entry">',
        f' <h2 class="headword">{base_word}</h2>',
    ]
    for idx, entry in enumerate(entries, start=1):
        pos, definition = parse_pos_and_def(entry["pos_and_def"])
        out.append(' <div class="sense">')
        if numbered and entry["num_suffix"]:
            out.append(f' <span class="sense-num">{idx}.</span>')
        if pos:
            out.append(f' <span class="pos">{pos}</span>')
        if definition:
            out.append(f' <span class="definition">{definition}</span>')
        if entry["details"]:
            out.append(' <dl class="details">')
            out.extend(
                f' <dd>{detail_line_to_html(detail)}</dd>'
                for detail in entry["details"]
            )
            out.append(' </dl>')
        out.append(' </div>')
    out.append('</div>')

    return "\n".join(out)
def main() -> None:
    """CLI entry point: parse <input_file>; write JSON to [output.json] or stdout."""
    args = sys.argv[1:]
    if not args:
        print("Usage: parse_dict.py <input_file> [output.json]", file=sys.stderr)
        sys.exit(1)

    input_file = args[0]
    output_file = args[1] if len(args) > 1 else None

    data = parse_dict_file(input_file)
    json_str = json.dumps(data, ensure_ascii=False, indent=2)

    if output_file is None:
        print(json_str)
    else:
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(json_str)
        print(f"Written {len(data)} entries to {output_file}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment