Created
April 28, 2026 14:06
-
-
Save antonmry/fd3d6103d1c99036a91ae923fb5a8d49 to your computer and use it in GitHub Desktop.
datafusion_similar_to.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| # /// script | |
| # requires-python = ">=3.10" | |
| # dependencies = ["datafusion>=43"] | |
| # /// | |
| """Demonstrate DataFusion's SIMILAR TO behavior with % vs .* | |
| DataFusion treats SIMILAR TO as a pure regex match (same as ~), without | |
| converting SQL-standard wildcards (% and _) to regex equivalents (.* and .). | |
| See: https://github.com/apache/datafusion/blob/eae7bf4fa1c037c0a065d1f36d0669f5bb97a9cf/datafusion/physical-expr/src/expressions/binary.rs#L970-L983 | |
| Usage: uv run worktrees/datafusion_similar_to.py | |
| """ | |
| import datafusion | |
| from datafusion import SessionContext | |
| BOLD = "\033[1m" | |
| GREEN = "\033[32m" | |
| RED = "\033[31m" | |
| YELLOW = "\033[33m" | |
| RESET = "\033[0m" | |
| def main() -> None: | |
| ctx = SessionContext() | |
| ctx.sql( | |
| """ | |
| CREATE TABLE projects AS VALUES | |
| (1, 'Slack'), | |
| (2, 'Slack Marketplace Listing'), | |
| (3, 'Weekly error review'), | |
| (4, 'Weekly debugging issues'), | |
| (5, 'Backlog Triage') | |
| """ | |
| ).collect() | |
| # Rename default columns | |
| ctx.sql( | |
| """ | |
| CREATE TABLE t AS | |
| SELECT column1 AS id, column2 AS name FROM projects | |
| """ | |
| ).collect() | |
| print(f"{BOLD}DataFusion v{datafusion.__version__} — SIMILAR TO behavior{RESET}\n") | |
| print("Data: Slack, Slack Marketplace Listing, Weekly error review,") | |
| print(" Weekly debugging issues, Backlog Triage\n") | |
| queries = [ | |
| ("LIKE with %", "SELECT name FROM t WHERE name LIKE 'Slack%'"), | |
| ("SIMILAR TO with % (SQL standard expects match)", "SELECT name FROM t WHERE name SIMILAR TO '(Slack|Weekly)%'"), | |
| ("SIMILAR TO with .* (regex syntax)", "SELECT name FROM t WHERE name SIMILAR TO '(Slack|Weekly).*'"), | |
| ("~ with % (regex, % is literal)", "SELECT name FROM t WHERE name ~ '(Slack|Weekly)%'"), | |
| ("~ with .* (regex)", "SELECT name FROM t WHERE name ~ '(Slack|Weekly).*'"), | |
| ("Literal: 'Slack' SIMILAR TO 'Slack%'", "SELECT 'Slack' SIMILAR TO 'Slack%' AS result"), | |
| ("Literal: 'Slack' SIMILAR TO 'Slack.*'", "SELECT 'Slack' SIMILAR TO 'Slack.*' AS result"), | |
| ("Literal: 'hello' SIMILAR TO 'hell_'", "SELECT 'hello' SIMILAR TO 'hell_' AS result"), | |
| ("Literal: 'hello' SIMILAR TO 'hell.'", "SELECT 'hello' SIMILAR TO 'hell.' AS result"), | |
| ] | |
| for label, sql in queries: | |
| print(f"{BOLD}--- {label} ---{RESET}") | |
| print(f"{YELLOW}query:{RESET} {sql}") | |
| try: | |
| result = ctx.sql(sql).collect() | |
| rows = [row.to_pydict() for row in result] | |
| if not rows or all(len(v) == 0 for v in rows[0].values()): | |
| print(f"{RED} (empty result){RESET}") | |
| else: | |
| # Flatten batches | |
| for batch in rows: | |
| cols = list(batch.keys()) | |
| n = len(batch[cols[0]]) | |
| for i in range(n): | |
| vals = {c: batch[c][i] for c in cols} | |
| print(f"{GREEN} {vals}{RESET}") | |
| except Exception as e: | |
| print(f"{RED} ERROR: {e}{RESET}") | |
| print() | |
| if __name__ == "__main__": | |
| main() |
Author
antonmry
commented
Apr 28, 2026
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment