
@MaxGhenis
Last active March 31, 2026 12:01
ORG validation for policyengine-us-data PR
{
"cells": [
{
"cell_type": "markdown",
"id": "d422ab93",
"metadata": {},
"source": [
"# ORG Validation: 2024 Annual Donor Build\n",
"\n",
"This notebook summarizes the 2024 CPS ORG donor build used for `hourly_wage`, `is_paid_hourly`, and the BLS-based `is_union_member_or_covered` assignment.\n",
"\n",
"It is intended as PR support material rather than repo source."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "7e359ce0",
"metadata": {
"execution": {
"iopub.execute_input": "2026-03-31T11:50:17.241889Z",
"iopub.status.busy": "2026-03-31T11:50:17.241799Z",
"iopub.status.idle": "2026-03-31T11:50:20.605167Z",
"shell.execute_reply": "2026-03-31T11:50:20.604764Z"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/maxghenis/.codex-worktrees/us-data-org-impute/.venv/lib/python3.13/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
},
{
"data": {
"text/plain": [
"PosixPath('/Users/maxghenis/.codex-worktrees/us-data-org-impute/policyengine_us_data/storage/census_cps_org_2024_wages.csv.gz')"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import sys\n",
"from pathlib import Path\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"from IPython.display import display\n",
"\n",
"US_INPUTS_WORKTREE = Path(\"/Users/maxghenis/.codex-worktrees/us-org-inputs\")\n",
"if str(US_INPUTS_WORKTREE) not in sys.path:\n",
" sys.path.insert(0, str(US_INPUTS_WORKTREE))\n",
"\n",
"from policyengine_us_data.datasets.org.org import (\n",
" ORG_FILENAME,\n",
" ORG_MONTHS,\n",
" ORG_PREDICTORS,\n",
" CPS_BASIC_MONTHLY_ORG_COLUMNS,\n",
" _cps_basic_org_month_url,\n",
" _predict_union_coverage_from_bls_tables,\n",
" load_org_training_data,\n",
")\n",
"from policyengine_us_data.storage import STORAGE_FOLDER\n",
"\n",
"pd.set_option(\"display.max_columns\", 20)\n",
"pd.set_option(\"display.float_format\", lambda x: f\"{x:,.4f}\")\n",
"\n",
"cache_path = STORAGE_FOLDER / ORG_FILENAME\n",
"cache_path"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "d2d63c8a",
"metadata": {
"execution": {
"iopub.execute_input": "2026-03-31T11:50:20.606378Z",
"iopub.status.busy": "2026-03-31T11:50:20.606291Z",
"iopub.status.idle": "2026-03-31T11:50:20.710134Z",
"shell.execute_reply": "2026-03-31T11:50:20.709665Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>metric</th>\n",
" <th>value</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>donor rows</td>\n",
" <td>119,237.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>weighted weight sum (millions)</td>\n",
" <td>16,523,600.2140</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>weighted mean hourly wage</td>\n",
" <td>35.1700</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>weighted p10 hourly wage</td>\n",
" <td>14.4000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>weighted p50 hourly wage</td>\n",
" <td>25.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>weighted p90 hourly wage</td>\n",
" <td>62.5000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>weighted p99 hourly wage</td>\n",
" <td>222.3700</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>weighted paid-hourly share</td>\n",
" <td>0.5532</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>weighted union share</td>\n",
" <td>0.1101</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>weighted share wage &gt; $100</td>\n",
" <td>0.0276</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>weighted share wage &gt; $200</td>\n",
" <td>0.0155</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>weighted share wage &lt; $7.25</td>\n",
" <td>0.0107</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>weighted share wage &lt; $10</td>\n",
" <td>0.0216</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>weighted p10 weekly hours</td>\n",
" <td>25.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>weighted p50 weekly hours</td>\n",
" <td>40.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>weighted p90 weekly hours</td>\n",
" <td>50.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>max hourly wage</td>\n",
" <td>2,890.6700</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" metric value\n",
"0 donor rows 119,237.0000\n",
"1 weighted weight sum (millions) 16,523,600.2140\n",
"2 weighted mean hourly wage 35.1700\n",
"3 weighted p10 hourly wage 14.4000\n",
"4 weighted p50 hourly wage 25.0000\n",
"5 weighted p90 hourly wage 62.5000\n",
"6 weighted p99 hourly wage 222.3700\n",
"7 weighted paid-hourly share 0.5532\n",
"8 weighted union share 0.1101\n",
"9 weighted share wage > $100 0.0276\n",
"10 weighted share wage > $200 0.0155\n",
"11 weighted share wage < $7.25 0.0107\n",
"12 weighted share wage < $10 0.0216\n",
"13 weighted p10 weekly hours 25.0000\n",
"14 weighted p50 weekly hours 40.0000\n",
"15 weighted p90 weekly hours 50.0000\n",
"16 max hourly wage 2,890.6700"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"org = load_org_training_data().copy()\n",
"org[\"is_union_member_or_covered\"] = _predict_union_coverage_from_bls_tables(\n",
" org[ORG_PREDICTORS]\n",
").astype(np.float32)\n",
"\n",
"w = org[\"sample_weight\"].astype(float).to_numpy()\n",
"wage = org[\"hourly_wage\"].astype(float).to_numpy()\n",
"hours = org[\"weekly_hours_worked\"].astype(float).to_numpy()\n",
"paid = org[\"is_paid_hourly\"].astype(float).to_numpy()\n",
"union = org[\"is_union_member_or_covered\"].astype(float).to_numpy()\n",
"\n",
"def wmean(x):\n",
" return float(np.average(x, weights=w))\n",
"\n",
"def wquantile(x, quantiles):\n",
" order = np.argsort(x)\n",
" xs = x[order]\n",
" ws = w[order]\n",
" cdf = np.cumsum(ws) / ws.sum()\n",
" return [float(xs[np.searchsorted(cdf, q, side=\"left\")]) for q in quantiles]\n",
"\n",
"summary = pd.DataFrame(\n",
" {\n",
" \"metric\": [\n",
" \"donor rows\",\n",
" \"weighted weight sum (millions)\",\n",
" \"weighted mean hourly wage\",\n",
" \"weighted p10 hourly wage\",\n",
" \"weighted p50 hourly wage\",\n",
" \"weighted p90 hourly wage\",\n",
" \"weighted p99 hourly wage\",\n",
" \"weighted paid-hourly share\",\n",
" \"weighted union share\",\n",
" \"weighted share wage > $100\",\n",
" \"weighted share wage > $200\",\n",
" \"weighted share wage < $7.25\",\n",
" \"weighted share wage < $10\",\n",
" \"weighted p10 weekly hours\",\n",
" \"weighted p50 weekly hours\",\n",
" \"weighted p90 weekly hours\",\n",
" \"max hourly wage\",\n",
" ],\n",
" \"value\": [\n",
" len(org),\n",
" round(w.sum() / 1e6, 3),\n",
" round(wmean(wage), 2),\n",
" round(wquantile(wage, [0.1])[0], 2),\n",
" round(wquantile(wage, [0.5])[0], 2),\n",
" round(wquantile(wage, [0.9])[0], 2),\n",
" round(wquantile(wage, [0.99])[0], 2),\n",
" round(wmean(paid), 4),\n",
" round(wmean(union), 4),\n",
" round(wmean((wage > 100).astype(float)), 4),\n",
" round(wmean((wage > 200).astype(float)), 4),\n",
" round(wmean((wage < 7.25).astype(float)), 4),\n",
" round(wmean((wage < 10).astype(float)), 4),\n",
" round(wquantile(hours, [0.1])[0], 2),\n",
" round(wquantile(hours, [0.5])[0], 2),\n",
" round(wquantile(hours, [0.9])[0], 2),\n",
" round(float(wage.max()), 2),\n",
" ],\n",
" }\n",
")\n",
"summary"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "90f78e2a",
"metadata": {
"execution": {
"iopub.execute_input": "2026-03-31T11:50:20.711194Z",
"iopub.status.busy": "2026-03-31T11:50:20.711117Z",
"iopub.status.idle": "2026-03-31T11:50:20.726474Z",
"shell.execute_reply": "2026-03-31T11:50:20.726003Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Lowest weighted union states (rows >= 500)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/9l/_wztzgbx7mgc7l1r0416cy7m0000gn/T/ipykernel_2281/292089446.py:3: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
" .apply(\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>state_fips</th>\n",
" <th>rows</th>\n",
" <th>union_share_weighted</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>37.0000</td>\n",
" <td>2,664.0000</td>\n",
" <td>0.0297</td>\n",
" </tr>\n",
" <tr>\n",
" <th>41</th>\n",
" <td>46.0000</td>\n",
" <td>1,490.0000</td>\n",
" <td>0.0385</td>\n",
" </tr>\n",
" <tr>\n",
" <th>40</th>\n",
" <td>45.0000</td>\n",
" <td>1,913.0000</td>\n",
" <td>0.0406</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>13.0000</td>\n",
" <td>2,502.0000</td>\n",
" <td>0.0432</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>5.0000</td>\n",
" <td>2,115.0000</td>\n",
" <td>0.0440</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" state_fips rows union_share_weighted\n",
"33 37.0000 2,664.0000 0.0297\n",
"41 46.0000 1,490.0000 0.0385\n",
"40 45.0000 1,913.0000 0.0406\n",
"10 13.0000 2,502.0000 0.0432\n",
"3 5.0000 2,115.0000 0.0440"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Highest weighted union states (rows >= 500)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>state_fips</th>\n",
" <th>rows</th>\n",
" <th>union_share_weighted</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>15.0000</td>\n",
" <td>1,623.0000</td>\n",
" <td>0.2688</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>36.0000</td>\n",
" <td>4,475.0000</td>\n",
" <td>0.2163</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2.0000</td>\n",
" <td>1,287.0000</td>\n",
" <td>0.1945</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>34.0000</td>\n",
" <td>2,422.0000</td>\n",
" <td>0.1765</td>\n",
" </tr>\n",
" <tr>\n",
" <th>47</th>\n",
" <td>53.0000</td>\n",
" <td>2,340.0000</td>\n",
" <td>0.1760</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" state_fips rows union_share_weighted\n",
"11 15.0000 1,623.0000 0.2688\n",
"32 36.0000 4,475.0000 0.2163\n",
"1 2.0000 1,287.0000 0.1945\n",
"30 34.0000 2,422.0000 0.1765\n",
"47 53.0000 2,340.0000 0.1760"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"state_summary = (\n",
" org.groupby(\"state_fips\", group_keys=False)\n",
" .apply(\n",
" lambda g: pd.Series(\n",
" {\n",
" \"rows\": int(len(g)),\n",
" \"union_share_weighted\": float(\n",
" np.average(\n",
" g[\"is_union_member_or_covered\"],\n",
" weights=g[\"sample_weight\"],\n",
" )\n",
" ),\n",
" }\n",
" )\n",
" )\n",
" .reset_index()\n",
")\n",
"state_summary = state_summary[state_summary[\"rows\"] >= 500]\n",
"\n",
"print(\"Lowest weighted union states (rows >= 500)\")\n",
"display(state_summary.nsmallest(5, \"union_share_weighted\"))\n",
"\n",
"print(\"Highest weighted union states (rows >= 500)\")\n",
"display(state_summary.nlargest(5, \"union_share_weighted\"))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "87b339b3",
"metadata": {
"execution": {
"iopub.execute_input": "2026-03-31T11:50:20.727534Z",
"iopub.status.busy": "2026-03-31T11:50:20.727453Z",
"iopub.status.idle": "2026-03-31T11:50:59.653532Z",
"shell.execute_reply": "2026-03-31T11:50:59.653057Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>cutoff</th>\n",
" <th>rows</th>\n",
" <th>weighted_share</th>\n",
" <th>direct_share_within_tail_weighted</th>\n",
" <th>derived_share_within_tail_weighted</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>100</td>\n",
" <td>3208</td>\n",
" <td>0.0276</td>\n",
" <td>0.0000</td>\n",
" <td>1.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>200</td>\n",
" <td>1774</td>\n",
" <td>0.0155</td>\n",
" <td>0.0000</td>\n",
" <td>1.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>500</td>\n",
" <td>26</td>\n",
" <td>0.0002</td>\n",
" <td>0.0000</td>\n",
" <td>1.0000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" cutoff rows weighted_share direct_share_within_tail_weighted \\\n",
"0 100 3208 0.0276 0.0000 \n",
"1 200 1774 0.0155 0.0000 \n",
"2 500 26 0.0002 0.0000 \n",
"\n",
" derived_share_within_tail_weighted \n",
"0 1.0000 \n",
"1 1.0000 \n",
"2 1.0000 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tail_frames = []\n",
"for month in ORG_MONTHS:\n",
" raw = pd.read_csv(\n",
" _cps_basic_org_month_url(2024, month),\n",
" usecols=CPS_BASIC_MONTHLY_ORG_COLUMNS,\n",
" low_memory=False,\n",
" )\n",
" month_org = raw.loc[\n",
" raw[\"HRMIS\"].isin([4, 8])\n",
" & (raw[\"pworwgt\"] > 0)\n",
" & (raw[\"prerelg\"] == 1)\n",
" & raw[\"pemlr\"].isin([1, 2])\n",
" & raw[\"peio1cow\"].isin([1, 2, 3, 4, 5])\n",
" & raw[\"peernhry\"].isin([1, 2])\n",
" & (raw[\"gestfips\"] > 0)\n",
" & (raw[\"prtage\"] >= 16)\n",
" & (raw[\"pehruslt\"] > 0)\n",
" & (raw[\"pternwa\"] > 0)\n",
" ].copy()\n",
" weekly_earnings = month_org[\"pternwa\"].astype(float) / 100\n",
" direct = month_org[\"pternhly\"].astype(float) / 100\n",
" hourly = np.where(\n",
" (month_org[\"peernhry\"] == 1) & (direct > 0),\n",
" direct,\n",
" weekly_earnings / month_org[\"pehruslt\"].astype(float),\n",
" )\n",
" src = np.where(\n",
" (month_org[\"peernhry\"] == 1) & (direct > 0),\n",
" \"direct\",\n",
" \"derived\",\n",
" )\n",
" tail_frames.append(\n",
" pd.DataFrame(\n",
" {\n",
" \"hourly\": hourly,\n",
" \"source\": src,\n",
" \"sample_weight\": month_org[\"pworwgt\"].astype(float),\n",
" }\n",
" )\n",
" )\n",
"\n",
"tail = pd.concat(tail_frames, ignore_index=True)\n",
"tail_summary = []\n",
"for cutoff in [100, 200, 500]:\n",
" mask = tail[\"hourly\"] > cutoff\n",
" subset = tail.loc[mask]\n",
" tail_summary.append(\n",
" {\n",
" \"cutoff\": cutoff,\n",
" \"rows\": int(mask.sum()),\n",
" \"weighted_share\": round(\n",
" float(np.average(mask.astype(float), weights=tail[\"sample_weight\"])),\n",
" 4,\n",
" ),\n",
" \"direct_share_within_tail_weighted\": round(\n",
" float(\n",
" np.average(\n",
" (subset[\"source\"] == \"direct\").astype(float),\n",
" weights=subset[\"sample_weight\"],\n",
" )\n",
" ),\n",
" 4,\n",
" ) if len(subset) else None,\n",
" \"derived_share_within_tail_weighted\": round(\n",
" float(\n",
" np.average(\n",
" (subset[\"source\"] == \"derived\").astype(float),\n",
" weights=subset[\"sample_weight\"],\n",
" )\n",
" ),\n",
" 4,\n",
" ) if len(subset) else None,\n",
" }\n",
" )\n",
"\n",
"pd.DataFrame(tail_summary)"
]
},
{
"cell_type": "markdown",
"id": "1f1918e0",
"metadata": {},
"source": [
"## Reading Guide\n",
"\n",
"- The annual wage distribution is the primary donor distribution used for `hourly_wage` and `is_paid_hourly`.\n",
"- The state union table reflects the BLS-based assignment used instead of the sparse public-use CPS union microdata fields.\n",
"- The tail audit shows whether very high hourly wages are coming from direct hourly reports or from the weekly-earnings divided by hours fallback."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (us-data-venv)",
"language": "python",
"name": "us-data-venv"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

ORG Validation: 2024 Annual Donor Build

Validation artifact for the ORG donor build used in policyengine-us-data.

Files:

  • org_validation_2024.ipynb: executed notebook

Key annual results:

  • donor rows: 119,237
  • weighted mean hourly wage: $35.17
  • weighted hourly wage quantiles: p10 $14.40, p50 $25.00, p90 $62.50, p99 $222.37
  • weighted is_paid_hourly: 55.32%
  • weighted is_union_member_or_covered: 11.01%
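The weighted quantiles above come from walking the weighted empirical CDF. A self-contained sketch mirroring the notebook's `wquantile` helper, on toy data (not the real donor file):

```python
import numpy as np

def wquantile(x, w, quantiles):
    """Weighted quantiles via the weighted empirical CDF
    (same approach as the notebook's helper, with weights passed explicitly)."""
    order = np.argsort(x)
    xs, ws = x[order], w[order]
    cdf = np.cumsum(ws) / ws.sum()
    # For each q, return the first value whose cumulative weight reaches q.
    return [float(xs[np.searchsorted(cdf, q, side="left")]) for q in quantiles]

wages = np.array([10.0, 20.0, 30.0, 40.0])
weights = np.array([1.0, 1.0, 1.0, 7.0])  # heavy weight on the top value
print(wquantile(wages, weights, [0.1, 0.5]))  # [10.0, 40.0]
```

The heavy weight on the last observation pulls the weighted median up to 40.0 even though the unweighted median would be 25.0, which is exactly why the summary reports weighted rather than plain quantiles.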

The state union-rate pattern also looks directionally right:

  • lowest states: NC 2.97%, SD 3.85%, SC 4.06%, GA 4.32%, AR 4.40%
  • highest states: HI 26.88%, NY 21.63%, AK 19.45%, NJ 17.65%, WA 17.60%
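The per-state shares are a grouped weighted average. The pandas FutureWarning in the executed run comes from `groupby(...).apply` receiving the grouping column; a minimal sketch of the same computation on toy data, sidestepping the warning by selecting only the needed columns (column names match the notebook, the values here are made up):

```python
import numpy as np
import pandas as pd

# Toy stand-in for the ORG donor frame.
org = pd.DataFrame({
    "state_fips": [37, 37, 37, 15, 15],
    "is_union_member_or_covered": [0.0, 0.0, 1.0, 1.0, 1.0],
    "sample_weight": [2.0, 2.0, 1.0, 1.0, 3.0],
})

state_summary = (
    org.groupby("state_fips")[["is_union_member_or_covered", "sample_weight"]]
    .apply(
        lambda g: pd.Series({
            "rows": len(g),
            "union_share_weighted": float(
                np.average(g["is_union_member_or_covered"], weights=g["sample_weight"])
            ),
        })
    )
    .reset_index()
)
print(state_summary)
```

Selecting the two columns before `apply` keeps the grouping column out of the lambda, which is also what the warning's suggested `include_groups=False` achieves on newer pandas.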

Main caveat:

  • the high hourly-wage tail is still worth watching: in the tail audit, wages above $100/hr come entirely from the fallback that reconstructs the hourly wage as weekly earnings divided by usual weekly hours when no direct hourly rate is reported.
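The fallback the caveat describes can be sketched on toy rows. Field names match the CPS basic-monthly variables used in the notebook (earnings fields carry two implied decimals, hence the division by 100); the values are illustrative only:

```python
import numpy as np
import pandas as pd

# Toy rows: peernhry == 1 means paid hourly with a direct rate in pternhly;
# otherwise the hourly wage is derived from weekly earnings / usual hours.
rows = pd.DataFrame({
    "peernhry": [1, 2],
    "pternhly": [2500, -1],      # $25.00/hr direct; -1 = no direct rate
    "pternwa": [100000, 400000], # $1,000 and $4,000 weekly earnings
    "pehruslt": [40, 40],        # usual weekly hours
})
direct = rows["pternhly"].astype(float) / 100
weekly = rows["pternwa"].astype(float) / 100
hourly = np.where(
    (rows["peernhry"] == 1) & (direct > 0),
    direct,                                   # direct hourly report
    weekly / rows["pehruslt"].astype(float),  # fallback: weekly / hours
)
print(hourly)  # [ 25. 100.]
```

The second row lands at $100/hr purely via the derived path, which is the mechanism behind the tail audit's finding that the $100+ tail is 100% derived.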