{ "cells": [ { "cell_type": "code", "execution_count": 40, "id": "1593637f-ddcc-4c19-9aa9-fe17f0ff1ebd", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "import hashlib\n", "import uuid\n", "import math\n", "import random\n", "from diffprivlib.models import LogisticRegression, GaussianNB, DecisionTreeClassifier\n", "from sklearn.preprocessing import LabelEncoder" ] }, { "cell_type": "code", "execution_count": 41, "id": "ad3bc31c-19be-40f7-9015-c3ea6c94daa4", "metadata": {}, "outputs": [], "source": [ "def auto_preprocess_df(df, null_threshold=0.5):\n", " \"\"\"\n", " 1. If unique values < 65: LabelEncodes (treating NaNs as a label).\n", " 2. Else: Converts to numeric (coercing errors to NaN).\n", " 3. If column Null % > threshold: Drops the column.\n", " 4. Else: Fills NaNs with median.\n", " 5. Finally: Drops any rows that still contain Nulls.\n", " \"\"\"\n", " df_processed = df.copy()\n", " le = LabelEncoder()\n", " cols_to_drop = []\n", "\n", " for col in df_processed.columns:\n", " # Condition 1: Low Cardinality (Categorical)\n", " if df_processed[col].nunique() < 65:\n", " # Convert to string to handle mixed types and NaNs as a category\n", " df_processed[col] = le.fit_transform(df_processed[col].astype(str))\n", " \n", " # Condition 2: High Cardinality (Numeric/ID/Messy)\n", " else:\n", " # Coerce to number (Strings/IDs become NaN)\n", " df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce')\n", " \n", " # Check Null Percentage immediately after coercion\n", " null_pct = df_processed[col].isnull().mean()\n", " \n", " if null_pct > null_threshold:\n", " # If too many NaNs (e.g., it was an ID column), mark for dropping\n", " cols_to_drop.append(col)\n", " else:\n", " # Otherwise, fill with median\n", " median_val = df_processed[col].median()\n", " df_processed[col] = df_processed[col].fillna(median_val)\n", "\n", " # Drop the columns identified as \"mostly null\"\n", " if cols_to_drop:\n", " print(f\"Dropping columns > {null_threshold:.0%} Null: {cols_to_drop}\")\n", " df_processed.drop(columns=cols_to_drop, inplace=True)\n", "\n", " # Final cleanup: Drop rows containing any remaining NaNs\n", " original_len = len(df_processed)\n", " df_processed.dropna(axis=0, inplace=True)\n", " \n", " if len(df_processed) < original_len:\n", " print(f\"Dropped {original_len - len(df_processed)} rows containing Nulls.\")\n", "\n", " return df_processed" ] }, { "cell_type": "code", "execution_count": 3, "id": "26c40daf-66e2-46ea-bdcc-da60af6f301b", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "def calculate_k_anonymity(df, quasi_identifiers):\n", " # Gunakan dropna=False agar baris dengan NaN tetap dihitung sebagai grup tersendiri\n", " grouped = df.groupby(quasi_identifiers, dropna=False)\n", " group_sizes = grouped.size()\n", " \n", " if group_sizes.empty:\n", " return 0, group_sizes\n", " \n", " k_actual = group_sizes.min()\n", " return k_actual, group_sizes\n", "\n", "def calculate_l_diversity(df, quasi_identifiers, sensitive_col):\n", " # Gunakan dropna=False\n", " grouped = df.groupby(quasi_identifiers, dropna=False)\n", " diversity_counts = grouped[sensitive_col].nunique()\n", " \n", " if diversity_counts.empty:\n", " return 0, diversity_counts\n", "\n", " l_actual = diversity_counts.min()\n", " return l_actual, diversity_counts\n", "\n", "def calculate_entropy_l_diversity(df, quasi_identifiers, sensitive_col):\n", " grouped = df.groupby(quasi_identifiers, dropna=False)\n", " entropy_results = []\n", " \n", " for name, group in grouped:\n", " counts = group[sensitive_col].value_counts(normalize=True)\n", " entropy = -np.sum(counts * np.log(counts))\n", " entropy_results.append({\n", " 'group_id': name,\n", " 'size': len(group),\n", " 'entropy': entropy\n", " })\n", " \n", " results_df = pd.DataFrame(entropy_results)\n", " if results_df.empty:\n", " return 0.0, results_df\n", " \n", " min_entropy = results_df['entropy'].min()\n", " return min_entropy, results_df" ] }, { "cell_type": "code", "execution_count": 42, "id": "7a8c66ac-3047-4edc-9b82-83d566b139cc", "metadata": {}, "outputs": [], "source": [ "df=pd.read_csv(\"WA_Fn-UseC_-Telco-Customer-Churn.csv\")" ] }, { "cell_type": "code", "execution_count": 43, "id": "ff90986c-ec4e-4f82-9651-d40d3c42688b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | customerID | \n", "gender | \n", "SeniorCitizen | \n", "Partner | \n", "Dependents | \n", "tenure | \n", "PhoneService | \n", "MultipleLines | \n", "InternetService | \n", "OnlineSecurity | \n", "... | \n", "DeviceProtection | \n", "TechSupport | \n", "StreamingTV | \n", "StreamingMovies | \n", "Contract | \n", "PaperlessBilling | \n", "PaymentMethod | \n", "MonthlyCharges | \n", "TotalCharges | \n", "Churn | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "7590-VHVEG | \n", "Female | \n", "0 | \n", "Yes | \n", "No | \n", "1 | \n", "No | \n", "No phone service | \n", "DSL | \n", "No | \n", "... | \n", "No | \n", "No | \n", "No | \n", "No | \n", "Month-to-month | \n", "Yes | \n", "Electronic check | \n", "29.85 | \n", "29.85 | \n", "No | \n", "
| 1 | \n", "5575-GNVDE | \n", "Male | \n", "0 | \n", "No | \n", "No | \n", "34 | \n", "Yes | \n", "No | \n", "DSL | \n", "Yes | \n", "... | \n", "Yes | \n", "No | \n", "No | \n", "No | \n", "One year | \n", "No | \n", "Mailed check | \n", "56.95 | \n", "1889.5 | \n", "No | \n", "
| 2 | \n", "3668-QPYBK | \n", "Male | \n", "0 | \n", "No | \n", "No | \n", "2 | \n", "Yes | \n", "No | \n", "DSL | \n", "Yes | \n", "... | \n", "No | \n", "No | \n", "No | \n", "No | \n", "Month-to-month | \n", "Yes | \n", "Mailed check | \n", "53.85 | \n", "108.15 | \n", "Yes | \n", "
| 3 | \n", "7795-CFOCW | \n", "Male | \n", "0 | \n", "No | \n", "No | \n", "45 | \n", "No | \n", "No phone service | \n", "DSL | \n", "Yes | \n", "... | \n", "Yes | \n", "Yes | \n", "No | \n", "No | \n", "One year | \n", "No | \n", "Bank transfer (automatic) | \n", "42.30 | \n", "1840.75 | \n", "No | \n", "
| 4 | \n", "9237-HQITU | \n", "Female | \n", "0 | \n", "No | \n", "No | \n", "2 | \n", "Yes | \n", "No | \n", "Fiber optic | \n", "No | \n", "... | \n", "No | \n", "No | \n", "No | \n", "No | \n", "Month-to-month | \n", "Yes | \n", "Electronic check | \n", "70.70 | \n", "151.65 | \n", "Yes | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 7038 | \n", "6840-RESVB | \n", "Male | \n", "0 | \n", "Yes | \n", "Yes | \n", "24 | \n", "Yes | \n", "Yes | \n", "DSL | \n", "Yes | \n", "... | \n", "Yes | \n", "Yes | \n", "Yes | \n", "Yes | \n", "One year | \n", "Yes | \n", "Mailed check | \n", "84.80 | \n", "1990.5 | \n", "No | \n", "
| 7039 | \n", "2234-XADUH | \n", "Female | \n", "0 | \n", "Yes | \n", "Yes | \n", "72 | \n", "Yes | \n", "Yes | \n", "Fiber optic | \n", "No | \n", "... | \n", "Yes | \n", "No | \n", "Yes | \n", "Yes | \n", "One year | \n", "Yes | \n", "Credit card (automatic) | \n", "103.20 | \n", "7362.9 | \n", "No | \n", "
| 7040 | \n", "4801-JZAZL | \n", "Female | \n", "0 | \n", "Yes | \n", "Yes | \n", "11 | \n", "No | \n", "No phone service | \n", "DSL | \n", "Yes | \n", "... | \n", "No | \n", "No | \n", "No | \n", "No | \n", "Month-to-month | \n", "Yes | \n", "Electronic check | \n", "29.60 | \n", "346.45 | \n", "No | \n", "
| 7041 | \n", "8361-LTMKD | \n", "Male | \n", "1 | \n", "Yes | \n", "No | \n", "4 | \n", "Yes | \n", "Yes | \n", "Fiber optic | \n", "No | \n", "... | \n", "No | \n", "No | \n", "No | \n", "No | \n", "Month-to-month | \n", "Yes | \n", "Mailed check | \n", "74.40 | \n", "306.6 | \n", "Yes | \n", "
| 7042 | \n", "3186-AJIEK | \n", "Male | \n", "0 | \n", "No | \n", "No | \n", "66 | \n", "Yes | \n", "No | \n", "Fiber optic | \n", "Yes | \n", "... | \n", "Yes | \n", "Yes | \n", "Yes | \n", "Yes | \n", "Two year | \n", "Yes | \n", "Bank transfer (automatic) | \n", "105.65 | \n", "6844.5 | \n", "No | \n", "
7043 rows × 21 columns
\n", "| \n", " | customerID | \n", "StreamingTV | \n", "TechSupport | \n", "PaperlessBilling | \n", "InternetService | \n", "PhoneService | \n", "PaymentMethod | \n", "MultipleLines | \n", "OnlineSecurity | \n", "OnlineBackup | \n", "Contract | \n", "DeviceProtection | \n", "TotalCharges | \n", "StreamingMovies | \n", "tenure | \n", "Churn | \n", "MonthlyCharges | \n", "SeniorCitizen | \n", "gender | \n", "IsIndependent | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "f3f9002e121cbbcc038f195a656a32a17395c6ce1815e8... | \n", "A | \n", "A | \n", "A | \n", "A | \n", "A | \n", "A | \n", "A | \n", "A | \n", "A | \n", "A | \n", "A | \n", "B | \n", "A | \n", "B | \n", "A | \n", "D | \n", "A | \n", "A | \n", "A | \n", "
| 1 | \n", "5f362546eb7515f442a40d3d9bf632e4a481c92bcb0529... | \n", "A | \n", "A | \n", "B | \n", "A | \n", "B | \n", "B | \n", "B | \n", "B | \n", "B | \n", "B | \n", "B | \n", "C | \n", "A | \n", "A | \n", "A | \n", "A | \n", "A | \n", "B | \n", "B | \n", "
| 2 | \n", "701bf78bdf332d16ec5fb80a1affac3a080a8b9c258c2d... | \n", "A | \n", "A | \n", "A | \n", "A | \n", "B | \n", "B | \n", "B | \n", "B | \n", "A | \n", "A | \n", "A | \n", "B | \n", "A | \n", "B | \n", "B | \n", "A | \n", "A | \n", "B | \n", "B | \n", "
| 3 | \n", "59a0e1b332a6b6ba8f6a72f74085e6498e8c54a5d18191... | \n", "A | \n", "B | \n", "B | \n", "A | \n", "A | \n", "C | \n", "A | \n", "B | \n", "B | \n", "B | \n", "B | \n", "C | \n", "A | \n", "A | \n", "A | \n", "A | \n", "A | \n", "B | \n", "B | \n", "
| 4 | \n", "66370db9e1bb6a851fa692a2499f4f1356efab8e5abf90... | \n", "A | \n", "A | \n", "A | \n", "B | \n", "B | \n", "A | \n", "B | \n", "A | \n", "B | \n", "A | \n", "A | \n", "B | \n", "A | \n", "B | \n", "B | \n", "C | \n", "A | \n", "A | \n", "B | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 7038 | \n", "d947cded0fac0a00ebc313bbe4c4b64e23283d0cf7b536... | \n", "B | \n", "B | \n", "A | \n", "A | \n", "B | \n", "B | \n", "C | \n", "B | \n", "B | \n", "B | \n", "B | \n", "C | \n", "B | \n", "D | \n", "A | \n", "C | \n", "A | \n", "B | \n", "A | \n", "
| 7039 | \n", "cb985e07827c720145c2de03810a8ec565fe1121c0b6bd... | \n", "B | \n", "A | \n", "A | \n", "B | \n", "B | \n", "D | \n", "C | \n", "A | \n", "A | \n", "B | \n", "B | \n", "D | \n", "B | \n", "C | \n", "A | \n", "B | \n", "A | \n", "A | \n", "A | \n", "
| 7040 | \n", "32b725664afc96d8366d2d44ffb7a801132d3f84d68c19... | \n", "A | \n", "A | \n", "A | \n", "A | \n", "A | \n", "A | \n", "A | \n", "B | \n", "B | \n", "A | \n", "A | \n", "B | \n", "A | \n", "D | \n", "A | \n", "D | \n", "A | \n", "A | \n", "A | \n", "
| 7041 | \n", "b8463bcb56ddf1b47ee63bafdf65079f88d260b99ccd89... | \n", "A | \n", "A | \n", "A | \n", "B | \n", "B | \n", "B | \n", "C | \n", "A | \n", "B | \n", "A | \n", "A | \n", "B | \n", "A | \n", "B | \n", "B | \n", "C | \n", "B | \n", "B | \n", "A | \n", "
| 7042 | \n", "6bd283e5eb3d43ae083283e36d747965afa39b80f70ab5... | \n", "B | \n", "B | \n", "A | \n", "B | \n", "B | \n", "C | \n", "B | \n", "B | \n", "B | \n", "C | \n", "B | \n", "D | \n", "B | \n", "C | \n", "A | \n", "B | \n", "A | \n", "B | \n", "B | \n", "
7043 rows × 20 columns
\n", "