Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save senthilsweb/da500072c1625f501bb9ce3ce4138bcb to your computer and use it in GitHub Desktop.

Select an option

Save senthilsweb/da500072c1625f501bb9ce3ce4138bcb to your computer and use it in GitHub Desktop.
streamlit-duckdb-sql-editor-with-embedded-llm.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/senthilsweb/da500072c1625f501bb9ce3ce4138bcb/streamlit-duckdb-sql-editor-with-embedded-llm.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"id": "a5c573a8-0c1a-4b28-81fa-b8c59f45e086",
"metadata": {
"id": "a5c573a8-0c1a-4b28-81fa-b8c59f45e086"
},
"source": [
"# πŸ¦† DuckDB SQL Editor with PII Detection & Masking\n",
"\n",
"This notebook integrates **DuckDB**, **Streamlit**, and **OpenAI GPT** to enable interactive SQL querying with **PII detection and masking**.\n",
"\n",
"## πŸ”Ή Features\n",
"- **In-Memory DuckDB** for fast SQL execution.\n",
"- **Custom Functions** to mask sensitive data:\n",
" - `mask_email()`, `mask_phone()`, `mask_credit_card()`, `mask_address()`\n",
"- **Remote Data Loading** from CSV/Parquet files.\n",
"- **Streamlit UI with SQL Editor** for easy query execution.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "39cd92e1-3c00-4516-858a-b5fbde213253",
"metadata": {
"id": "39cd92e1-3c00-4516-858a-b5fbde213253"
},
"outputs": [],
"source": [
"!pip install streamlit duckdb openai pandas faker streamlit-ace numpy"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9fda3e42-d291-4e80-9acf-e66d0682c4a4",
"metadata": {
"id": "9fda3e42-d291-4e80-9acf-e66d0682c4a4"
},
"outputs": [],
"source": [
"%%writefile streamlit_app.py\n",
"\n",
"import streamlit as st\n",
"import duckdb\n",
"import pandas as pd\n",
"import openai\n",
"from duckdb.typing import VARCHAR\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\", category=DeprecationWarning)\n",
"\n",
"# ===========================================\n",
"# 🎨 Streamlit Page Configuration & Layout\n",
"# ===========================================\n",
"st.set_page_config(\n",
" page_title=\"DuckDB SQL Editor with PII Masking\",\n",
" layout=\"wide\",\n",
" initial_sidebar_state=\"expanded\"\n",
")\n",
"\n",
"\n",
"# OpenAI Key\n",
"openai.api_key = \"YOUR-OPENAI-KEY\"\n",
"\n",
"\n",
"# ==================================\n",
"# πŸ€– OpenAI-Powered PII Detection\n",
"# ==================================\n",
"def detect_pii(text: str) -> str:\n",
" \"\"\"\n",
" Uses OpenAI GPT model to detect Personally Identifiable Information (PII) in the provided text.\n",
"\n",
" Returns:\n",
" - \"Detected PII: [Type]\" (if PII is found)\n",
" - \"No PII detected\" (if no PII is found)\n",
" \"\"\"\n",
" try:\n",
" response = openai.ChatCompletion.create(\n",
" model=\"gpt-3.5-turbo\",\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": \"You are an assistant specializing in detecting PII in text. Identify the type of PII (e.g., 'Email', 'Phone Number', 'Credit Card', 'Address', 'Name') and return the result in the format: 'Detected PII: [Type].' If no PII is detected, return 'No PII detected.'\"},\n",
" {\"role\": \"user\", \"content\": f\"Does this text contain PII? Analyze and specify: {text}\"}\n",
" ],\n",
" max_tokens=100,\n",
" temperature=0.2,\n",
" )\n",
" return response['choices'][0]['message']['content'].strip()\n",
" except Exception as e:\n",
" return f\"Error: {str(e)}\"\n",
"\n",
"\n",
"# ==================================\n",
"# πŸ”’ PII Masking Functions\n",
"# ==================================\n",
"\n",
"def mask_email(email: str) -> str:\n",
" \"\"\"Masks part of the email address to protect privacy.\"\"\"\n",
" try:\n",
" local_part, domain = email.split('@')\n",
" return f\"{local_part[0]}*****@{domain}\"\n",
" except Exception as e:\n",
" return f\"Error: {str(e)}\"\n",
"\n",
"def mask_credit_card(credit_card: str) -> str:\n",
" \"\"\"Masks a credit card number.\"\"\"\n",
" return f\"{credit_card[:4]} **** **** ****\"\n",
"\n",
"def mask_phone(phone: str) -> str:\n",
" \"\"\"Masks a phone number, keeping only the first three digits.\"\"\"\n",
" return f\"{phone[:3]}-***-****\"\n",
"\n",
"def mask_address(address: str) -> str:\n",
" \"\"\"Masks an address, showing only the first 10 characters.\"\"\"\n",
" return f\"{address[:10]}...\"\n",
"\n",
"\n",
"# ====================================\n",
"# πŸ› οΈ Register Custom Functions in DuckDB\n",
"# ====================================\n",
"def register_custom_functions(conn):\n",
" \"\"\"Registers masking & PII detection functions as SQL functions in DuckDB.\"\"\"\n",
" functions = {\n",
" \"detect_pii\": detect_pii,\n",
" \"mask_email\": mask_email,\n",
" \"mask_credit_card\": mask_credit_card,\n",
" \"mask_phone\": mask_phone,\n",
" \"mask_address\": mask_address,\n",
" }\n",
"\n",
" for func_name, func_ref in functions.items():\n",
" try:\n",
" conn.create_function(func_name, func_ref, [VARCHAR], VARCHAR)\n",
" print(f\"βœ… Successfully registered function: {func_name}\")\n",
" except Exception as e:\n",
" print(f\"⚠️ Function {func_name} already exists, skipping registration: {e}\")\n",
"\n",
"\n",
"# ====================================================\n",
"# πŸ¦† Initialize a Persistent DuckDB Connection\n",
"# ====================================================\n",
"@st.cache_resource\n",
"def get_duckdb_conn():\n",
" \"\"\"Creates a persistent DuckDB connection & registers functions.\"\"\"\n",
" conn = duckdb.connect()\n",
"\n",
" # Load DuckDB Extensions\n",
" conn.execute(\"INSTALL httpfs; LOAD httpfs;\")\n",
" conn.execute(\"INSTALL sqlite_scanner; LOAD sqlite_scanner;\")\n",
" conn.execute(\"INSTALL parquet; LOAD parquet;\")\n",
" conn.execute(\"INSTALL json; LOAD json;\")\n",
"\n",
" # Register Custom SQL Functions\n",
" register_custom_functions(conn)\n",
"\n",
" return conn\n",
"\n",
"\n",
"# Create and persist the DuckDB connection\n",
"conn = get_duckdb_conn()\n",
"\n",
"\n",
"# ==========================================\n",
"# 🎨 Streamlit UI: DuckDB SQL Query Interface\n",
"# ==========================================\n",
"st.title(\"πŸ¦† DuckDB SQL Editor with LLM-driven PII Detection & Masking\")\n",
"\n",
"# ===========================================\n",
"# 🎨 Sidebar Navigation Flow (Hardcoded Steps)\n",
"# ===========================================\n",
"# Sidebar Header\n",
"st.sidebar.title(\"Workflow Steps\")\n",
"st.sidebar.info(\n",
" \"\"\"\n",
" - Connect to In-Memory DuckDB.\n",
" - Attach Custom Functions.\n",
" - Registers Custom Functions (Masking for Emails, Credit Cards, Phones, and Addresses)\n",
" \"\"\"\n",
")\n",
"\n",
"st.sidebar.title(\"Example SQL Queries\")\n",
"st.sidebar.write(\"**Create and load Table**\")\n",
"st.sidebar.code(\"\"\"\n",
"CREATE TABLE passengers AS\n",
"SELECT * FROM read_csv_auto('https://raw.githubusercontent.com/senthilsweb/datasets/refs/heads/main/data/passenger.csv');\n",
"\"\"\", language=\"sql\")\n",
"\n",
"st.sidebar.write(\"Write SQL Query with Custom Functions\")\n",
"st.sidebar.code(\"\"\"\n",
"SELECT forename, surname, email,\n",
" mask_email(email) AS masked_email,\n",
" telephone, address\n",
"FROM passengers\n",
"LIMIT 15;\n",
"\"\"\", language=\"sql\")\n",
"\n",
"st.sidebar.write(\"Write SQL Query with LLM functions\")\n",
"st.sidebar.code(\"\"\"\n",
"SELECT email, address,detect_pii(address) as pii_detected\n",
"FROM passengers\n",
"LIMIT 15;\n",
"\"\"\", language=\"sql\")\n",
"\n",
"# detect_pii\n",
"\n",
"\n",
"# Sidebar Database & Settings\n",
"st.sidebar.subheader(\"πŸ“Š Database Information\")\n",
"\n",
"# Show DuckDB Version\n",
"try:\n",
" duckdb_version = conn.execute(\"SELECT version();\").fetchone()[0]\n",
" st.sidebar.success(f\"**DuckDB Version:** `{duckdb_version}`\")\n",
"except Exception as e:\n",
" st.sidebar.error(f\"Error retrieving DuckDB version: {e}\")\n",
"\n",
"# ===================================\n",
"# πŸ“ Monaco Editor for SQL Queries\n",
"# ===================================\n",
"from streamlit_ace import st_ace\n",
"\n",
"st.subheader(\"πŸ“ SQL Query Editor\")\n",
"\n",
"query = st_ace(\n",
" value=\"SELECT * FROM passengers LIMIT 10;\", # Default SQL query\n",
" language=\"sql\",\n",
" theme=\"tomorrow_night\", # Dark mode theme\n",
" key=\"sql_editor\",\n",
" min_lines=5,\n",
" max_lines=20,\n",
" font_size=14,\n",
" show_gutter=True,\n",
" wrap=True\n",
")\n",
"\n",
"# Execute Query on Button Click\n",
"if st.button(\"πŸš€ Run Query\"):\n",
" if query.strip():\n",
" try:\n",
" # Execute SQL Query\n",
" df = conn.execute(query).fetchdf()\n",
"\n",
" if df.empty:\n",
" st.warning(\"βœ… Query executed successfully, but returned an empty result.\")\n",
" else:\n",
" # Display results in an interactive DataFrame\n",
" st.dataframe(df, use_container_width=True)\n",
"\n",
" # Allow user to download results as CSV\n",
" st.download_button(\"Download Results as CSV\", df.to_csv(index=False), \"query_results.csv\", \"text/csv\")\n",
" except Exception as e:\n",
" st.error(f\"❌ Query execution failed: {str(e)}\")\n",
" else:\n",
" st.warning(\"⚠️ Please enter a valid SQL query.\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "05c4831b-7ec9-40d9-b9e8-6c2fc5254c4f",
"metadata": {
"id": "05c4831b-7ec9-40d9-b9e8-6c2fc5254c4f"
},
"outputs": [],
"source": [
"!streamlit run streamlit_app.py"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
},
"colab": {
"provenance": [],
"include_colab_link": true
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment