Created
April 17, 2026 10:40
-
-
Save vpnry/bfaa1c898bf6bdf73ac53d423d962314 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Digital Pali Dictionary TXT Parser Test | |
| https://github.com/digitalpalidictionary/dpd-db/releases/download/v0.3.20260402/dpd-txt.zip | |
| Parses a structured dictionary text file into JSON with HTML-formatted content. | |
| """ | |
| import re | |
| import json | |
| import sys | |
| from collections import defaultdict | |
def parse_dict_file(filepath: str) -> list[dict]:
    """Parse a structured dictionary text file into JSON-ready records.

    File format (as handled by the visible parsing logic):
      * A line starting in column 0 is a headword line:
        ``<word>[ <numeric suffix>], <pos and definition>``.
      * Indented lines beneath it are detail content for that entry.
      * A blank line terminates the current entry.

    Args:
        filepath: Path to the UTF-8 dictionary text file.

    Returns:
        A list of ``{"dict_key": base_word, "dict_content": html}`` dicts,
        one per distinct base word, in order of first appearance.
    """
    raw_entries: list[dict] = []
    current_entry = None

    with open(filepath, "r", encoding="utf-8") as f:
        # Stream the file line by line instead of materializing readlines().
        for line in f:
            # Strip the trailing newline only — leading spaces mark detail lines.
            line_stripped = line.rstrip("\n")
            if not line_stripped.strip():
                # Blank line — close the entry in progress, if any.
                if current_entry:
                    raw_entries.append(current_entry)
                    current_entry = None
                continue
            if not line_stripped.startswith(" "):
                # Column-0 line starts a new headword entry.
                # NOTE(review): a tab-indented line would also land here —
                # assumes the source file indents details with spaces; confirm.
                if current_entry:
                    raw_entries.append(current_entry)
                current_entry = _parse_word_line(line_stripped)
            elif current_entry is not None:
                # Indented line — detail content for the current entry.
                current_entry["details"].append(line_stripped.strip())

    # Flush the last entry (no trailing blank line required).
    if current_entry:
        raw_entries.append(current_entry)

    # Group entries by base word; a plain dict preserves first-seen order
    # (guaranteed since Python 3.7), so no defaultdict + order list needed.
    grouped: dict[str, list[dict]] = {}
    for entry in raw_entries:
        grouped.setdefault(entry["base_word"], []).append(entry)

    return [
        {"dict_key": base_word, "dict_content": build_html(base_word, entries)}
        for base_word, entries in grouped.items()
    ]


def _parse_word_line(line_stripped: str) -> dict:
    """Parse one headword line into its raw-entry dict.

    Splits on the first comma (word vs. definition text), then peels a
    trailing numeric suffix off the word — handles both "a 1.1" and
    "akaci 2" (bare integer).
    """
    comma_idx = line_stripped.find(",")
    if comma_idx == -1:
        # No comma: treat the whole line as a word with no definition.
        word_full = line_stripped.strip()
        rest = ""
    else:
        word_full = line_stripped[:comma_idx].strip()
        rest = line_stripped[comma_idx + 1:].strip()

    m = re.match(r"^(.+?)\s+(\d+(?:\.\d+)?)$", word_full)
    if m:
        base_word = m.group(1).strip()
        num_suffix = m.group(2)
    else:
        base_word = word_full
        num_suffix = None

    return {
        "base_word": base_word,
        "num_suffix": num_suffix,
        "pos_and_def": rest,
        "details": [],
    }
def parse_pos_and_def(pos_and_def: str) -> tuple[str, str]:
    """Split entry text into a (POS string, definition) pair.

    The text after the headword comma has the form ``{POS.} {definition}``,
    where POS is one or more short dot-terminated abbreviation tokens:

        "adj. smooth; not harsh"  -> ("adj.", "smooth; not harsh")
        "pron. much better"       -> ("pron.", "much better")
        "adj. gen. of words"      -> ("adj. gen.", "of words")
        "letter. (gram) letter a" -> ("letter.", "(gram) letter a")

    Tokens are consumed greedily while they look like POS abbreviations
    (<=12 chars, letters / Pali diacritics / hyphen, ending in '.', no ';').
    Returns ("", stripped text) when no leading POS token is found.
    """
    # Split on whitespace that follows a '.', CAPTURING the separators so we
    # can count exactly how many input characters the POS prefix consumed.
    # (The previous version recomputed the prefix length from the tokens
    # re-joined with single spaces, which miscounted — and returned a
    # corrupted definition — whenever tokens were separated by runs of
    # more than one space.)
    parts = re.split(r"((?<=\.)\s+)", pos_and_def)
    tokens = parts[0::2]      # candidate POS / definition tokens
    separators = parts[1::2]  # the whitespace runs between them

    pos_tokens: list[str] = []
    consumed = 0  # characters of pos_and_def covered by POS tokens + separators
    for i, tok in enumerate(tokens):
        # A POS token: ends with '.', short (<=12 chars before the dot),
        # no digits, no semicolons.
        if re.match(r"^[A-Za-zāīūṭḍṇḷṃñ\s\-]{1,12}\.$", tok) and ";" not in tok:
            pos_tokens.append(tok)
            consumed += len(tok)
            if i < len(separators):
                consumed += len(separators[i])
        else:
            # First non-POS-looking token starts the real definition.
            break

    if pos_tokens:
        return " ".join(pos_tokens), pos_and_def[consumed:].strip()
    return "", pos_and_def.strip()
def detail_line_to_html(line: str) -> str:
    """Render a 'Key: value' detail line as styled key/value spans.

    A line matches when the key consists only of letters, spaces, and '/'
    and a non-empty value follows the colon; anything else is returned
    unchanged.
    NOTE(review): key and value are interpolated without HTML-escaping —
    assumes the dictionary source text is HTML-safe; confirm upstream.
    """
    match = re.match(r"^([A-Za-z /]+):\s*(.+)$", line)
    if match is None:
        return line
    key = match.group(1).strip()
    val = match.group(2).strip()
    return (
        f'<span class="detail-key">{key}:</span>'
        f' <span class="detail-val">{val}</span>'
    )
def build_html(base_word: str, entries: list[dict]) -> str:
    """Render one headword and all of its senses as an HTML fragment.

    Args:
        base_word: The headword shared by all entries in the group.
        entries: Raw entry dicts with "num_suffix", "pos_and_def", "details".

    Returns:
        A newline-joined HTML string: a dict-entry div containing the
        headword and one sense div per entry.

    NOTE(review): content is interpolated without HTML-escaping — assumes
    the dictionary text is HTML-safe; confirm upstream.
    """
    # A group is "numbered" when at least one entry carried a numeric suffix.
    has_multiple = any(e["num_suffix"] for e in entries)
    parts = []
    # Header
    parts.append(f'<div class="dict-entry">')
    parts.append(f' <h2 class="headword">{base_word}</h2>')
    for i, entry in enumerate(entries):
        pos, definition = parse_pos_and_def(entry["pos_and_def"])
        if has_multiple and entry["num_suffix"]:
            parts.append(f' <div class="sense">')
            # NOTE(review): sense numbers are renumbered 1..n from enumeration
            # order, not taken from the parsed num_suffix (e.g. "1.1") —
            # confirm this renumbering is intended.
            parts.append(f' <span class="sense-num">{i + 1}.</span>')
        else:
            parts.append(f' <div class="sense">')
        if pos:
            parts.append(f' <span class="pos">{pos}</span>')
        if definition:
            parts.append(f' <span class="definition">{definition}</span>')
        if entry["details"]:
            parts.append(f' <dl class="details">')
            for dl in entry["details"]:
                html_line = detail_line_to_html(dl)
                parts.append(f' <dd>{html_line}</dd>')
            parts.append(f' </dl>')
        parts.append(f' </div>')
    parts.append(f'</div>')
    return "\n".join(parts)
def main():
    """CLI entry point.

    Usage: parse_dict.py <input_file> [output.json]

    Parses the input file and either writes the JSON to the given output
    path or prints it to stdout.
    """
    args = sys.argv[1:]
    if not args:
        print("Usage: parse_dict.py <input_file> [output.json]", file=sys.stderr)
        sys.exit(1)

    input_file = args[0]
    output_file = args[1] if len(args) > 1 else None

    entries = parse_dict_file(input_file)
    payload = json.dumps(entries, ensure_ascii=False, indent=2)

    if output_file is None:
        print(payload)
    else:
        with open(output_file, "w", encoding="utf-8") as out:
            out.write(payload)
        print(f"Written {len(entries)} entries to {output_file}")
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment