Skip to content

Instantly share code, notes, and snippets.

@antonmry
Created April 28, 2026 14:06
Show Gist options
  • Select an option

  • Save antonmry/fd3d6103d1c99036a91ae923fb5a8d49 to your computer and use it in GitHub Desktop.

Select an option

Save antonmry/fd3d6103d1c99036a91ae923fb5a8d49 to your computer and use it in GitHub Desktop.
datafusion_similar_to.py
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = ["datafusion>=43"]
# ///
"""Demonstrate DataFusion's SIMILAR TO behavior with % vs .*
DataFusion treats SIMILAR TO as a pure regex match (same as ~), without
converting SQL-standard wildcards (% and _) to regex equivalents (.* and .).
See: https://github.com/apache/datafusion/blob/eae7bf4fa1c037c0a065d1f36d0669f5bb97a9cf/datafusion/physical-expr/src/expressions/binary.rs#L970-L983
Usage: uv run worktrees/datafusion_similar_to.py
"""
import datafusion
from datafusion import SessionContext
BOLD = "\033[1m"
GREEN = "\033[32m"
RED = "\033[31m"
YELLOW = "\033[33m"
RESET = "\033[0m"
def main() -> None:
ctx = SessionContext()
ctx.sql(
"""
CREATE TABLE projects AS VALUES
(1, 'Slack'),
(2, 'Slack Marketplace Listing'),
(3, 'Weekly error review'),
(4, 'Weekly debugging issues'),
(5, 'Backlog Triage')
"""
).collect()
# Rename default columns
ctx.sql(
"""
CREATE TABLE t AS
SELECT column1 AS id, column2 AS name FROM projects
"""
).collect()
print(f"{BOLD}DataFusion v{datafusion.__version__} — SIMILAR TO behavior{RESET}\n")
print("Data: Slack, Slack Marketplace Listing, Weekly error review,")
print(" Weekly debugging issues, Backlog Triage\n")
queries = [
("LIKE with %", "SELECT name FROM t WHERE name LIKE 'Slack%'"),
("SIMILAR TO with % (SQL standard expects match)", "SELECT name FROM t WHERE name SIMILAR TO '(Slack|Weekly)%'"),
("SIMILAR TO with .* (regex syntax)", "SELECT name FROM t WHERE name SIMILAR TO '(Slack|Weekly).*'"),
("~ with % (regex, % is literal)", "SELECT name FROM t WHERE name ~ '(Slack|Weekly)%'"),
("~ with .* (regex)", "SELECT name FROM t WHERE name ~ '(Slack|Weekly).*'"),
("Literal: 'Slack' SIMILAR TO 'Slack%'", "SELECT 'Slack' SIMILAR TO 'Slack%' AS result"),
("Literal: 'Slack' SIMILAR TO 'Slack.*'", "SELECT 'Slack' SIMILAR TO 'Slack.*' AS result"),
("Literal: 'hello' SIMILAR TO 'hell_'", "SELECT 'hello' SIMILAR TO 'hell_' AS result"),
("Literal: 'hello' SIMILAR TO 'hell.'", "SELECT 'hello' SIMILAR TO 'hell.' AS result"),
]
for label, sql in queries:
print(f"{BOLD}--- {label} ---{RESET}")
print(f"{YELLOW}query:{RESET} {sql}")
try:
result = ctx.sql(sql).collect()
rows = [row.to_pydict() for row in result]
if not rows or all(len(v) == 0 for v in rows[0].values()):
print(f"{RED} (empty result){RESET}")
else:
# Flatten batches
for batch in rows:
cols = list(batch.keys())
n = len(batch[cols[0]])
for i in range(n):
vals = {c: batch[c][i] for c in cols}
print(f"{GREEN} {vals}{RESET}")
except Exception as e:
print(f"{RED} ERROR: {e}{RESET}")
print()
if __name__ == "__main__":
main()
@antonmry
Copy link
Copy Markdown
Author

image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment