Skip to content

Instantly share code, notes, and snippets.

@vpnry
Created April 17, 2026 10:40
Show Gist options
  • Select an option

  • Save vpnry/bfaa1c898bf6bdf73ac53d423d962314 to your computer and use it in GitHub Desktop.

Select an option

Save vpnry/bfaa1c898bf6bdf73ac53d423d962314 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Digital Pali Dictionary TXT Parser Test
https://github.com/digitalpalidictionary/dpd-db/releases/download/v0.3.20260402/dpd-txt.zip
Parses a structured dictionary text file into JSON with HTML-formatted content.
"""
import re
import json
import sys
from collections import defaultdict
def parse_dict_file(filepath: str) -> list[dict]:
    """Parse a DPD-style dictionary text file into JSON-ready records.

    The input format is line-oriented:
      * an unindented line starts a new entry: "word[, pos. definition]";
      * indented lines are detail lines belonging to the open entry;
      * a blank line terminates the open entry.

    Entries sharing the same base word (e.g. "kata 1.1", "kata 1.2") are
    grouped under one record.

    Returns a list of {"dict_key": base_word, "dict_content": html} dicts,
    in first-seen order of the base words.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        lines = f.readlines()

    raw_entries: list[dict] = []
    current: dict | None = None

    for raw_line in lines:
        # Drop only the trailing newline; leading spaces mark detail lines.
        text = raw_line.rstrip("\n")

        if not text.strip():
            # Blank line closes the entry in progress.
            if current:
                raw_entries.append(current)
            current = None
            continue

        if text.startswith(" "):
            # Indented continuation: a detail line for the open entry.
            if current is not None:
                current["details"].append(text.strip())
            continue

        # Unindented line: a new headword, which closes any open entry.
        if current:
            raw_entries.append(current)

        # The headword is everything before the very first comma; the rest
        # of the line (if any) is the POS + definition text.
        head, sep, tail = text.partition(",")
        word_full = head.strip()
        rest = tail.strip() if sep else ""

        # Split a trailing homonym number off the headword. Handles both
        # dotted ("a 1.1") and bare ("akaci 2") numeric suffixes.
        suffix_match = re.match(r"^(.+?)\s+(\d+(?:\.\d+)?)$", word_full)
        if suffix_match:
            base_word = suffix_match.group(1).strip()
            num_suffix = suffix_match.group(2)
        else:
            base_word, num_suffix = word_full, None

        current = {
            "base_word": base_word,
            "num_suffix": num_suffix,
            "pos_and_def": rest,
            "details": [],
        }

    # Flush the final entry (file may not end with a blank line).
    if current:
        raw_entries.append(current)

    # Group by base word; dict insertion order preserves first-seen order.
    grouped: dict[str, list[dict]] = {}
    for entry in raw_entries:
        grouped.setdefault(entry["base_word"], []).append(entry)

    return [
        {"dict_key": base_word, "dict_content": build_html(base_word, entries)}
        for base_word, entries in grouped.items()
    ]
def parse_pos_and_def(pos_and_def: str) -> tuple[str, str]:
    """Split an entry's text (everything after the first comma) into
    (part-of-speech, definition).

    The POS is one or more short, dot-terminated abbreviations that prefix
    the definition, e.g.:

        "adj. smooth; not harsh"  -> ("adj.", "smooth; not harsh")
        "adj. gen. of words"      -> ("adj. gen.", "of words")
        "letter. (gram) letter a" -> ("letter.", "(gram) letter a")

    Returns ("", definition) when no leading POS abbreviation is found.
    """
    # Tokenize on whitespace that follows a '.', so each candidate POS
    # abbreviation ("adj.", "gen.", ...) becomes its own token. No maxsplit:
    # the loop below stops at the first non-POS token anyway, so limiting
    # the split to 10 (as before) only risked truncating long POS runs.
    tokens = re.split(r"(?<=\.)\s+", pos_and_def)

    # Greedily consume leading tokens that look like POS abbreviations:
    # short (<=12 chars before the dot), letters/diacritics/hyphen only,
    # and containing no semicolon (semicolons only occur in definitions).
    #
    # BUGFIX: the remainder must be sliced at the offset actually consumed
    # in the ORIGINAL string. The previous code sliced at len(" ".join(pos)),
    # which mis-slices whenever POS tokens were separated by more than one
    # space (e.g. "adj.  gen. of words" left a stray ". " in the definition).
    consumed = 0
    pos_tokens: list[str] = []
    for tok in tokens:
        if re.fullmatch(r"[A-Za-zāīūṭḍṇḷṃñ\s\-]{1,12}\.", tok) and ";" not in tok:
            pos_tokens.append(tok)
            # Advance past this token's position in the original string.
            consumed = pos_and_def.index(tok, consumed) + len(tok)
        else:
            break

    if pos_tokens:
        return " ".join(pos_tokens), pos_and_def[consumed:].strip()
    return "", pos_and_def.strip()
def detail_line_to_html(line: str) -> str:
    """Render a "Key: value" detail line as styled HTML spans.

    Lines that do not match the "Key: value" shape pass through unchanged.
    """
    match = re.match(r"^([A-Za-z /]+):\s*(.+)$", line)
    if match is None:
        return line
    key = match.group(1).strip()
    val = match.group(2).strip()
    return (
        f'<span class="detail-key">{key}:</span> '
        f'<span class="detail-val">{val}</span>'
    )
def build_html(base_word: str, entries: list[dict]) -> str:
    """Render every sense of *base_word* as one HTML fragment.

    Each entry becomes a "sense" div; a running sense number is shown only
    when at least one entry carries a numeric homonym suffix.
    """
    numbered = any(e["num_suffix"] for e in entries)

    out: list[str] = [
        '<div class="dict-entry">',
        f' <h2 class="headword">{base_word}</h2>',
    ]
    for idx, entry in enumerate(entries, start=1):
        pos, definition = parse_pos_and_def(entry["pos_and_def"])
        out.append(' <div class="sense">')
        if numbered and entry["num_suffix"]:
            out.append(f' <span class="sense-num">{idx}.</span>')
        if pos:
            out.append(f' <span class="pos">{pos}</span>')
        if definition:
            out.append(f' <span class="definition">{definition}</span>')
        if entry["details"]:
            out.append(' <dl class="details">')
            out.extend(
                f' <dd>{detail_line_to_html(detail)}</dd>'
                for detail in entry["details"]
            )
            out.append(' </dl>')
        out.append(' </div>')
    out.append('</div>')

    return "\n".join(out)
def main() -> None:
    """CLI entry point: parse <input_file>; write JSON to [output.json] or stdout."""
    args = sys.argv[1:]
    if not args:
        print("Usage: parse_dict.py <input_file> [output.json]", file=sys.stderr)
        sys.exit(1)

    input_file = args[0]
    output_file = args[1] if len(args) > 1 else None

    data = parse_dict_file(input_file)
    json_str = json.dumps(data, ensure_ascii=False, indent=2)

    if output_file is None:
        print(json_str)
    else:
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(json_str)
        print(f"Written {len(data)} entries to {output_file}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment