Last active
September 6, 2025 18:35
-
-
Save senthilsweb/da500072c1625f501bb9ce3ce4138bcb to your computer and use it in GitHub Desktop.
streamlit-duckdb-sql-editor-with-embedded-llm.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "view-in-github", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "<a href=\"https://colab.research.google.com/gist/senthilsweb/da500072c1625f501bb9ce3ce4138bcb/streamlit-duckdb-sql-editor-with-embedded-llm.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "a5c573a8-0c1a-4b28-81fa-b8c59f45e086", | |
| "metadata": { | |
| "id": "a5c573a8-0c1a-4b28-81fa-b8c59f45e086" | |
| }, | |
| "source": [ | |
| "# π¦ DuckDB SQL Editor with PII Detection & Masking\n", | |
| "\n", | |
| "This notebook integrates **DuckDB**, **Streamlit**, and **OpenAI GPT** to enable interactive SQL querying with **PII detection and masking**.\n", | |
| "\n", | |
| "## πΉ Features\n", | |
| "- **In-Memory DuckDB** for fast SQL execution.\n", | |
| "- **Custom Functions** to mask sensitive data:\n", | |
| " - `mask_email()`, `mask_phone()`, `mask_credit_card()`, `mask_address()`\n", | |
| "- **Remote Data Loading** from CSV/Parquet files.\n", | |
| "- **Streamlit UI with SQL Editor** for easy query execution.\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "39cd92e1-3c00-4516-858a-b5fbde213253", | |
| "metadata": { | |
| "id": "39cd92e1-3c00-4516-858a-b5fbde213253" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "!pip install streamlit duckdb openai pandas faker streamlit-ace numpy" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "9fda3e42-d291-4e80-9acf-e66d0682c4a4", | |
| "metadata": { | |
| "id": "9fda3e42-d291-4e80-9acf-e66d0682c4a4" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "%%writefile streamlit_app.py\n", | |
| "\n", | |
| "import streamlit as st\n", | |
| "import duckdb\n", | |
| "import pandas as pd\n", | |
| "import openai\n", | |
| "from duckdb.typing import VARCHAR\n", | |
| "import warnings\n", | |
| "warnings.filterwarnings(\"ignore\", category=DeprecationWarning)\n", | |
| "\n", | |
| "# ===========================================\n", | |
| "# π¨ Streamlit Page Configuration & Layout\n", | |
| "# ===========================================\n", | |
| "st.set_page_config(\n", | |
| " page_title=\"DuckDB SQL Editor with PII Masking\",\n", | |
| " layout=\"wide\",\n", | |
| " initial_sidebar_state=\"expanded\"\n", | |
| ")\n", | |
| "\n", | |
| "\n", | |
| "# OpenAI Key\n", | |
| "openai.api_key = \"YOUR-OPENAI-KEY\"\n", | |
| "\n", | |
| "\n", | |
| "# ==================================\n", | |
| "# π€ OpenAI-Powered PII Detection\n", | |
| "# ==================================\n", | |
| "def detect_pii(text: str) -> str:\n", | |
| " \"\"\"\n", | |
| " Uses OpenAI GPT model to detect Personally Identifiable Information (PII) in the provided text.\n", | |
| "\n", | |
| " Returns:\n", | |
| " - \"Detected PII: [Type]\" (if PII is found)\n", | |
| " - \"No PII detected\" (if no PII is found)\n", | |
| " \"\"\"\n", | |
| " try:\n", | |
| " response = openai.ChatCompletion.create(\n", | |
| " model=\"gpt-3.5-turbo\",\n", | |
| " messages=[\n", | |
| " {\"role\": \"system\", \"content\": \"You are an assistant specializing in detecting PII in text. Identify the type of PII (e.g., 'Email', 'Phone Number', 'Credit Card', 'Address', 'Name') and return the result in the format: 'Detected PII: [Type].' If no PII is detected, return 'No PII detected.'\"},\n", | |
| " {\"role\": \"user\", \"content\": f\"Does this text contain PII? Analyze and specify: {text}\"}\n", | |
| " ],\n", | |
| " max_tokens=100,\n", | |
| " temperature=0.2,\n", | |
| " )\n", | |
| " return response['choices'][0]['message']['content'].strip()\n", | |
| " except Exception as e:\n", | |
| " return f\"Error: {str(e)}\"\n", | |
| "\n", | |
| "\n", | |
| "# ==================================\n", | |
| "# π PII Masking Functions\n", | |
| "# ==================================\n", | |
| "\n", | |
| "def mask_email(email: str) -> str:\n", | |
| " \"\"\"Masks part of the email address to protect privacy.\"\"\"\n", | |
| " try:\n", | |
| " local_part, domain = email.split('@')\n", | |
| " return f\"{local_part[0]}*****@{domain}\"\n", | |
| " except Exception as e:\n", | |
| " return f\"Error: {str(e)}\"\n", | |
| "\n", | |
| "def mask_credit_card(credit_card: str) -> str:\n", | |
| " \"\"\"Masks a credit card number.\"\"\"\n", | |
| " return f\"{credit_card[:4]} **** **** ****\"\n", | |
| "\n", | |
| "def mask_phone(phone: str) -> str:\n", | |
| " \"\"\"Masks a phone number, keeping only the first three digits.\"\"\"\n", | |
| " return f\"{phone[:3]}-***-****\"\n", | |
| "\n", | |
| "def mask_address(address: str) -> str:\n", | |
| " \"\"\"Masks an address, showing only the first 10 characters.\"\"\"\n", | |
| " return f\"{address[:10]}...\"\n", | |
| "\n", | |
| "\n", | |
| "# ====================================\n", | |
| "# π οΈ Register Custom Functions in DuckDB\n", | |
| "# ====================================\n", | |
| "def register_custom_functions(conn):\n", | |
| " \"\"\"Registers masking & PII detection functions as SQL functions in DuckDB.\"\"\"\n", | |
| " functions = {\n", | |
| " \"detect_pii\": detect_pii,\n", | |
| " \"mask_email\": mask_email,\n", | |
| " \"mask_credit_card\": mask_credit_card,\n", | |
| " \"mask_phone\": mask_phone,\n", | |
| " \"mask_address\": mask_address,\n", | |
| " }\n", | |
| "\n", | |
| " for func_name, func_ref in functions.items():\n", | |
| " try:\n", | |
| " conn.create_function(func_name, func_ref, [VARCHAR], VARCHAR)\n", | |
| " print(f\"β Successfully registered function: {func_name}\")\n", | |
| " except Exception as e:\n", | |
| " print(f\"β οΈ Function {func_name} already exists, skipping registration: {e}\")\n", | |
| "\n", | |
| "\n", | |
| "# ====================================================\n", | |
| "# π¦ Initialize a Persistent DuckDB Connection\n", | |
| "# ====================================================\n", | |
| "@st.cache_resource\n", | |
| "def get_duckdb_conn():\n", | |
| " \"\"\"Creates a persistent DuckDB connection & registers functions.\"\"\"\n", | |
| " conn = duckdb.connect()\n", | |
| "\n", | |
| " # Load DuckDB Extensions\n", | |
| " conn.execute(\"INSTALL httpfs; LOAD httpfs;\")\n", | |
| " conn.execute(\"INSTALL sqlite_scanner; LOAD sqlite_scanner;\")\n", | |
| " conn.execute(\"INSTALL parquet; LOAD parquet;\")\n", | |
| " conn.execute(\"INSTALL json; LOAD json;\")\n", | |
| "\n", | |
| " # Register Custom SQL Functions\n", | |
| " register_custom_functions(conn)\n", | |
| "\n", | |
| " return conn\n", | |
| "\n", | |
| "\n", | |
| "# Create and persist the DuckDB connection\n", | |
| "conn = get_duckdb_conn()\n", | |
| "\n", | |
| "\n", | |
| "# ==========================================\n", | |
| "# π¨ Streamlit UI: DuckDB SQL Query Interface\n", | |
| "# ==========================================\n", | |
| "st.title(\"π¦ DuckDB SQL Editor with LLM-driven PII Detection & Masking\")\n", | |
| "\n", | |
| "# ===========================================\n", | |
| "# π¨ Sidebar Navigation Flow (Hardcoded Steps)\n", | |
| "# ===========================================\n", | |
| "# Sidebar Header\n", | |
| "st.sidebar.title(\"Workflow Steps\")\n", | |
| "st.sidebar.info(\n", | |
| " \"\"\"\n", | |
| " - Connect to In-Memory DuckDB.\n", | |
| " - Attach Custom Functions.\n", | |
| " - Registers Custom Functions (Masking for Emails, Credit Cards, Phones, and Addresses)\n", | |
| " \"\"\"\n", | |
| ")\n", | |
| "\n", | |
| "st.sidebar.title(\"Example SQL Queries\")\n", | |
| "st.sidebar.write(\"**Create and load Table**\")\n", | |
| "st.sidebar.code(\"\"\"\n", | |
| "CREATE TABLE passengers AS\n", | |
| "SELECT * FROM read_csv_auto('https://raw.githubusercontent.com/senthilsweb/datasets/refs/heads/main/data/passenger.csv');\n", | |
| "\"\"\", language=\"sql\")\n", | |
| "\n", | |
| "st.sidebar.write(\"Write SQL Query with Custom Functions\")\n", | |
| "st.sidebar.code(\"\"\"\n", | |
| "SELECT forename, surname, email,\n", | |
| " mask_email(email) AS masked_email,\n", | |
| " telephone, address\n", | |
| "FROM passengers\n", | |
| "LIMIT 15;\n", | |
| "\"\"\", language=\"sql\")\n", | |
| "\n", | |
| "st.sidebar.write(\"Write SQL Query with LLM functions\")\n", | |
| "st.sidebar.code(\"\"\"\n", | |
| "SELECT email, address,detect_pii(address) as pii_detected\n", | |
| "FROM passengers\n", | |
| "LIMIT 15;\n", | |
| "\"\"\", language=\"sql\")\n", | |
| "\n", | |
| "# detect_pii\n", | |
| "\n", | |
| "\n", | |
| "# Sidebar Database & Settings\n", | |
| "st.sidebar.subheader(\"π Database Information\")\n", | |
| "\n", | |
| "# Show DuckDB Version\n", | |
| "try:\n", | |
| " duckdb_version = conn.execute(\"SELECT version();\").fetchone()[0]\n", | |
| " st.sidebar.success(f\"**DuckDB Version:** `{duckdb_version}`\")\n", | |
| "except Exception as e:\n", | |
| " st.sidebar.error(f\"Error retrieving DuckDB version: {e}\")\n", | |
| "\n", | |
| "# ===================================\n", | |
| "# π Monaco Editor for SQL Queries\n", | |
| "# ===================================\n", | |
| "from streamlit_ace import st_ace\n", | |
| "\n", | |
| "st.subheader(\"π SQL Query Editor\")\n", | |
| "\n", | |
| "query = st_ace(\n", | |
| " value=\"SELECT * FROM passengers LIMIT 10;\", # Default SQL query\n", | |
| " language=\"sql\",\n", | |
| " theme=\"tomorrow_night\", # Dark mode theme\n", | |
| " key=\"sql_editor\",\n", | |
| " min_lines=5,\n", | |
| " max_lines=20,\n", | |
| " font_size=14,\n", | |
| " show_gutter=True,\n", | |
| " wrap=True\n", | |
| ")\n", | |
| "\n", | |
| "# Execute Query on Button Click\n", | |
| "if st.button(\"π Run Query\"):\n", | |
| " if query.strip():\n", | |
| " try:\n", | |
| " # Execute SQL Query\n", | |
| " df = conn.execute(query).fetchdf()\n", | |
| "\n", | |
| " if df.empty:\n", | |
| " st.warning(\"β Query executed successfully, but returned an empty result.\")\n", | |
| " else:\n", | |
| " # Display results in an interactive DataFrame\n", | |
| " st.dataframe(df, use_container_width=True)\n", | |
| "\n", | |
| " # Allow user to download results as CSV\n", | |
| " st.download_button(\"Download Results as CSV\", df.to_csv(index=False), \"query_results.csv\", \"text/csv\")\n", | |
| " except Exception as e:\n", | |
| " st.error(f\"β Query execution failed: {str(e)}\")\n", | |
| " else:\n", | |
| " st.warning(\"β οΈ Please enter a valid SQL query.\")\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "05c4831b-7ec9-40d9-b9e8-6c2fc5254c4f", | |
| "metadata": { | |
| "id": "05c4831b-7ec9-40d9-b9e8-6c2fc5254c4f" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "!streamlit run streamlit_app.py" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.11.6" | |
| }, | |
| "colab": { | |
| "provenance": [], | |
| "include_colab_link": true | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment