Last active
March 17, 2026 03:15
-
-
Save lfcunha/c78a362e5ab1207bac4f7bcbbb6fdf94 to your computer and use it in GitHub Desktop.
provenance demo
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 30, | |
| "id": "f1bbe183-3cd3-4b5d-92c0-1083217783e3", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import pprint\n", | |
| "\n", | |
| "import pandas as pd" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "fb2891f5-2c15-4fc7-86a3-9813d8441207", | |
| "metadata": {}, | |
| "source": [ | |
| "# Data provenance demo\n", | |
| "\n", | |
| "### We have two tables that we join and we want to track back a particular cell to its original dataset (table and row)\n", | |
| "\n", | |
| "\n", | |
| "Use joined table metadata to track any cell to its original source. We need to keep the id of the row of each upstream table. therefore, we have multiple provenance columns in a table, as many as the source tables.\n", | |
| "\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "27496fd3-f302-4ba7-b8fe-0fafec936005", | |
| "metadata": {}, | |
| "source": [ | |
| "## We have two tables that we join and we would like to keep provenance at the cell level" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 26, | |
| "id": "edb4f95d-2c7b-4bb6-8d31-57434231d43c", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "provenance_demo_upstream_table1 = pd.read_csv(\"provenance_demo_upstream_table1.csv\")\n", | |
| "provenance_demo_upstream_table2 = pd.read_csv(\"provenance_demo_upstream_table2.csv\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "9622fc11-101e-4f0d-91d0-89d46887a8c4", | |
| "metadata": {}, | |
| "source": [ | |
| "## Each row of each table should have an id for traceability" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "id": "ce61e0c3-3872-4a7b-887e-bef7e288e557", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "provenance_demo_upstream_table1['id'] = provenance_demo_upstream_table1.index\n", | |
| "provenance_demo_upstream_table2['id'] = provenance_demo_upstream_table2.index" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 27, | |
| "id": "b51323ee-3d41-43dc-bb57-1b2e066cab78", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>:id</th>\n", | |
| " <th>:created_at</th>\n", | |
| " <th>:updated_at</th>\n", | |
| " <th>:version</th>\n", | |
| " <th>document_id</th>\n", | |
| " <th>id</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>row-pch7-6uqa_jgkj</td>\n", | |
| " <td>2023-12-08T20:28:00.359Z</td>\n", | |
| " <td>2023-12-08T20:28:00.359Z</td>\n", | |
| " <td>rv-ss29-qmc4~f2id</td>\n", | |
| " <td>2003010800829001</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>row-b4dr.imc6-kqvu</td>\n", | |
| " <td>2022-08-08T19:32:38.749Z</td>\n", | |
| " <td>2022-08-08T19:32:38.749Z</td>\n", | |
| " <td>rv-qi5z.6nsf.g9kb</td>\n", | |
| " <td>2003010900179001</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>row-t3yc_g6cm_pvsi</td>\n", | |
| " <td>2019-07-08T19:51:01.537Z</td>\n", | |
| " <td>2019-07-08T19:51:01.537Z</td>\n", | |
| " <td>rv-8rm7~4ni7-rbfb</td>\n", | |
| " <td>2003011100073001</td>\n", | |
| " <td>5</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>row-994e-vk6r_vji4</td>\n", | |
| " <td>2021-11-08T20:28:24.181Z</td>\n", | |
| " <td>2021-11-08T20:28:24.181Z</td>\n", | |
| " <td>rv-4m8q-z9rj~tngs</td>\n", | |
| " <td>2003011200031001</td>\n", | |
| " <td>7</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " :id :created_at :updated_at \\\n", | |
| "0 row-pch7-6uqa_jgkj 2023-12-08T20:28:00.359Z 2023-12-08T20:28:00.359Z \n", | |
| "1 row-b4dr.imc6-kqvu 2022-08-08T19:32:38.749Z 2022-08-08T19:32:38.749Z \n", | |
| "2 row-t3yc_g6cm_pvsi 2019-07-08T19:51:01.537Z 2019-07-08T19:51:01.537Z \n", | |
| "3 row-994e-vk6r_vji4 2021-11-08T20:28:24.181Z 2021-11-08T20:28:24.181Z \n", | |
| "\n", | |
| " :version document_id id \n", | |
| "0 rv-ss29-qmc4~f2id 2003010800829001 0 \n", | |
| "1 rv-qi5z.6nsf.g9kb 2003010900179001 3 \n", | |
| "2 rv-8rm7~4ni7-rbfb 2003011100073001 5 \n", | |
| "3 rv-4m8q-z9rj~tngs 2003011200031001 7 " | |
| ] | |
| }, | |
| "execution_count": 27, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "provenance_demo_upstream_table1" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 28, | |
| "id": "aa4e3d01-0a62-4413-9ba7-966d5aa69704", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>document_id</th>\n", | |
| " <th>record_type</th>\n", | |
| " <th>party_type</th>\n", | |
| " <th>name</th>\n", | |
| " <th>address_1</th>\n", | |
| " <th>address_2</th>\n", | |
| " <th>country</th>\n", | |
| " <th>city</th>\n", | |
| " <th>state</th>\n", | |
| " <th>zip</th>\n", | |
| " <th>id</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>2003010800829001</td>\n", | |
| " <td>P</td>\n", | |
| " <td>1</td>\n", | |
| " <td>CHRISTOFIDES, JOANNA</td>\n", | |
| " <td>149-42 15TH ROAD</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>US</td>\n", | |
| " <td>WHITESTONE</td>\n", | |
| " <td>NY</td>\n", | |
| " <td>11357</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>2003010900179001</td>\n", | |
| " <td>P</td>\n", | |
| " <td>1</td>\n", | |
| " <td>500 KINGSLAND ASSOC, LLC, C/O JOSE NODAR</td>\n", | |
| " <td>39 EAST 39TH STREET</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>US</td>\n", | |
| " <td>PATERSON</td>\n", | |
| " <td>NJ</td>\n", | |
| " <td>7514</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>2003011100073001</td>\n", | |
| " <td>P</td>\n", | |
| " <td>1</td>\n", | |
| " <td>RYTY CONSTRUCTION CO, LLC</td>\n", | |
| " <td>1758 PITMAN AVENUE</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>US</td>\n", | |
| " <td>BRONX</td>\n", | |
| " <td>NY</td>\n", | |
| " <td>10466</td>\n", | |
| " <td>5</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>2003011200031001</td>\n", | |
| " <td>P</td>\n", | |
| " <td>2</td>\n", | |
| " <td>WASHINGTON MUTUAL BANK, FA</td>\n", | |
| " <td>400 EAST MAIN STREET</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>US</td>\n", | |
| " <td>STOCKTON</td>\n", | |
| " <td>CA</td>\n", | |
| " <td>95290</td>\n", | |
| " <td>7</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " document_id record_type party_type \\\n", | |
| "0 2003010800829001 P 1 \n", | |
| "1 2003010900179001 P 1 \n", | |
| "2 2003011100073001 P 1 \n", | |
| "3 2003011200031001 P 2 \n", | |
| "\n", | |
| " name address_1 address_2 \\\n", | |
| "0 CHRISTOFIDES, JOANNA 149-42 15TH ROAD NaN \n", | |
| "1 500 KINGSLAND ASSOC, LLC, C/O JOSE NODAR 39 EAST 39TH STREET NaN \n", | |
| "2 RYTY CONSTRUCTION CO, LLC 1758 PITMAN AVENUE NaN \n", | |
| "3 WASHINGTON MUTUAL BANK, FA 400 EAST MAIN STREET NaN \n", | |
| "\n", | |
| " country city state zip id \n", | |
| "0 US WHITESTONE NY 11357 0 \n", | |
| "1 US PATERSON NJ 7514 3 \n", | |
| "2 US BRONX NY 10466 5 \n", | |
| "3 US STOCKTON CA 95290 7 " | |
| ] | |
| }, | |
| "execution_count": 28, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "provenance_demo_upstream_table2" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "1aa54889-f51c-4228-8618-23ba59a981a4", | |
| "metadata": {}, | |
| "source": [ | |
| "## We join the tables to produce provenance_demo_downstream" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "id": "2f00cf8f-3f15-4857-805d-3fce5c2c3a5b", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "provenance_demo_downstream = provenance_demo_upstream_table1.merge(provenance_demo_upstream_table2, on=\"document_id\", how=\"inner\", suffixes=('_provenance_demo_upstream_table1', '_provenance_demo_upstream_table2'))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "id": "196e742c-4c38-4fc7-85e0-863a0e02be1c", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>:id</th>\n", | |
| " <th>:created_at</th>\n", | |
| " <th>:updated_at</th>\n", | |
| " <th>:version</th>\n", | |
| " <th>document_id</th>\n", | |
| " <th>id_provenance_demo_upstream_table1</th>\n", | |
| " <th>record_type</th>\n", | |
| " <th>party_type</th>\n", | |
| " <th>name</th>\n", | |
| " <th>address_1</th>\n", | |
| " <th>address_2</th>\n", | |
| " <th>country</th>\n", | |
| " <th>city</th>\n", | |
| " <th>state</th>\n", | |
| " <th>zip</th>\n", | |
| " <th>id_provenance_demo_upstream_table2</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>row-pch7-6uqa_jgkj</td>\n", | |
| " <td>2023-12-08T20:28:00.359Z</td>\n", | |
| " <td>2023-12-08T20:28:00.359Z</td>\n", | |
| " <td>rv-ss29-qmc4~f2id</td>\n", | |
| " <td>2003010800829001</td>\n", | |
| " <td>0</td>\n", | |
| " <td>P</td>\n", | |
| " <td>1</td>\n", | |
| " <td>CHRISTOFIDES, JOANNA</td>\n", | |
| " <td>149-42 15TH ROAD</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>US</td>\n", | |
| " <td>WHITESTONE</td>\n", | |
| " <td>NY</td>\n", | |
| " <td>11357</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>row-b4dr.imc6-kqvu</td>\n", | |
| " <td>2022-08-08T19:32:38.749Z</td>\n", | |
| " <td>2022-08-08T19:32:38.749Z</td>\n", | |
| " <td>rv-qi5z.6nsf.g9kb</td>\n", | |
| " <td>2003010900179001</td>\n", | |
| " <td>1</td>\n", | |
| " <td>P</td>\n", | |
| " <td>1</td>\n", | |
| " <td>500 KINGSLAND ASSOC, LLC, C/O JOSE NODAR</td>\n", | |
| " <td>39 EAST 39TH STREET</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>US</td>\n", | |
| " <td>PATERSON</td>\n", | |
| " <td>NJ</td>\n", | |
| " <td>7514</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>row-t3yc_g6cm_pvsi</td>\n", | |
| " <td>2019-07-08T19:51:01.537Z</td>\n", | |
| " <td>2019-07-08T19:51:01.537Z</td>\n", | |
| " <td>rv-8rm7~4ni7-rbfb</td>\n", | |
| " <td>2003011100073001</td>\n", | |
| " <td>2</td>\n", | |
| " <td>P</td>\n", | |
| " <td>1</td>\n", | |
| " <td>RYTY CONSTRUCTION CO, LLC</td>\n", | |
| " <td>1758 PITMAN AVENUE</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>US</td>\n", | |
| " <td>BRONX</td>\n", | |
| " <td>NY</td>\n", | |
| " <td>10466</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>row-994e-vk6r_vji4</td>\n", | |
| " <td>2021-11-08T20:28:24.181Z</td>\n", | |
| " <td>2021-11-08T20:28:24.181Z</td>\n", | |
| " <td>rv-4m8q-z9rj~tngs</td>\n", | |
| " <td>2003011200031001</td>\n", | |
| " <td>3</td>\n", | |
| " <td>P</td>\n", | |
| " <td>2</td>\n", | |
| " <td>WASHINGTON MUTUAL BANK, FA</td>\n", | |
| " <td>400 EAST MAIN STREET</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>US</td>\n", | |
| " <td>STOCKTON</td>\n", | |
| " <td>CA</td>\n", | |
| " <td>95290</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " :id :created_at :updated_at \\\n", | |
| "0 row-pch7-6uqa_jgkj 2023-12-08T20:28:00.359Z 2023-12-08T20:28:00.359Z \n", | |
| "1 row-b4dr.imc6-kqvu 2022-08-08T19:32:38.749Z 2022-08-08T19:32:38.749Z \n", | |
| "2 row-t3yc_g6cm_pvsi 2019-07-08T19:51:01.537Z 2019-07-08T19:51:01.537Z \n", | |
| "3 row-994e-vk6r_vji4 2021-11-08T20:28:24.181Z 2021-11-08T20:28:24.181Z \n", | |
| "\n", | |
| " :version document_id id_provenance_demo_upstream_table1 \\\n", | |
| "0 rv-ss29-qmc4~f2id 2003010800829001 0 \n", | |
| "1 rv-qi5z.6nsf.g9kb 2003010900179001 1 \n", | |
| "2 rv-8rm7~4ni7-rbfb 2003011100073001 2 \n", | |
| "3 rv-4m8q-z9rj~tngs 2003011200031001 3 \n", | |
| "\n", | |
| " record_type party_type name \\\n", | |
| "0 P 1 CHRISTOFIDES, JOANNA \n", | |
| "1 P 1 500 KINGSLAND ASSOC, LLC, C/O JOSE NODAR \n", | |
| "2 P 1 RYTY CONSTRUCTION CO, LLC \n", | |
| "3 P 2 WASHINGTON MUTUAL BANK, FA \n", | |
| "\n", | |
| " address_1 address_2 country city state zip \\\n", | |
| "0 149-42 15TH ROAD NaN US WHITESTONE NY 11357 \n", | |
| "1 39 EAST 39TH STREET NaN US PATERSON NJ 7514 \n", | |
| "2 1758 PITMAN AVENUE NaN US BRONX NY 10466 \n", | |
| "3 400 EAST MAIN STREET NaN US STOCKTON CA 95290 \n", | |
| "\n", | |
| " id_provenance_demo_upstream_table2 \n", | |
| "0 0 \n", | |
| "1 1 \n", | |
| "2 2 \n", | |
| "3 3 " | |
| ] | |
| }, | |
| "execution_count": 13, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "provenance_demo_downstream" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "7eef1cde-fb0a-4aba-a277-06489ee8dcf8", | |
| "metadata": {}, | |
| "source": [ | |
| "### Now we want to track the source of a field, for instance the name 'RYTY CONSTRUCTION CO, LLC'" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "5944c7d9-0aed-4c23-bfe5-6f607301fbb0", | |
| "metadata": {}, | |
| "source": [ | |
| "#### Use the table metadata to keep track of the source table of of each column" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 29, | |
| "id": "ece5de4e-6dfb-4e6a-b701-698828181aa6", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "provenance_demo_downstream_metadata= \\\n", | |
| " {\"provenance_demo_upstream_table1\": [':id', ':created_at', ':updated_at', ':version'],\n", | |
| " \"provenance_demo_upstream_table2\": ['record_type', 'party_type', 'name', 'address_1', 'address_2', 'country', 'city', 'state', 'zip']}" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 31, | |
| "id": "ef27fd3d-4798-4318-9919-928327f4742c", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "{':created_at': 'provenance_demo_upstream_table1',\n", | |
| " ':id': 'provenance_demo_upstream_table1',\n", | |
| " ':updated_at': 'provenance_demo_upstream_table1',\n", | |
| " ':version': 'provenance_demo_upstream_table1',\n", | |
| " 'address_1': 'provenance_demo_upstream_table2',\n", | |
| " 'address_2': 'provenance_demo_upstream_table2',\n", | |
| " 'city': 'provenance_demo_upstream_table2',\n", | |
| " 'country': 'provenance_demo_upstream_table2',\n", | |
| " 'name': 'provenance_demo_upstream_table2',\n", | |
| " 'party_type': 'provenance_demo_upstream_table2',\n", | |
| " 'record_type': 'provenance_demo_upstream_table2',\n", | |
| " 'state': 'provenance_demo_upstream_table2',\n", | |
| " 'zip': 'provenance_demo_upstream_table2'}\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "provenance_demo_downstream_metadata_inv = {value: key for key, values in provenance_demo_downstream_metadata.items() for value in values}\n", | |
| "\n", | |
| "\n", | |
| "pprint.pprint(provenance_demo_downstream_metadata_inv)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "43d31aeb-d781-4ab9-9046-43e932a18f5c", | |
| "metadata": {}, | |
| "source": [ | |
| "#### define the column and value we want to track:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 18, | |
| "id": "cc181d2c-975b-42b1-b45f-6853a102af92", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "field = 'name'\n", | |
| "value = 'RYTY CONSTRUCTION CO, LLC'" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "1a27ff84-c97c-4576-85f8-f4f7779618eb", | |
| "metadata": {}, | |
| "source": [ | |
| "### Find the original data (we know the source table for this column from the metadata)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 32, | |
| "id": "fd026523-75c3-4638-b4dd-4acfd99f6a1c", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "source_table = provenance_demo_downstream_metadata_inv[\"name\"]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 33, | |
| "id": "fa3ef116-9b9b-450e-947e-3a1dc879a484", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>document_id</th>\n", | |
| " <th>record_type</th>\n", | |
| " <th>party_type</th>\n", | |
| " <th>name</th>\n", | |
| " <th>address_1</th>\n", | |
| " <th>address_2</th>\n", | |
| " <th>country</th>\n", | |
| " <th>city</th>\n", | |
| " <th>state</th>\n", | |
| " <th>zip</th>\n", | |
| " <th>id</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>2003011100073001</td>\n", | |
| " <td>P</td>\n", | |
| " <td>1</td>\n", | |
| " <td>RYTY CONSTRUCTION CO, LLC</td>\n", | |
| " <td>1758 PITMAN AVENUE</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>US</td>\n", | |
| " <td>BRONX</td>\n", | |
| " <td>NY</td>\n", | |
| " <td>10466</td>\n", | |
| " <td>5</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " document_id record_type party_type name \\\n", | |
| "2 2003011100073001 P 1 RYTY CONSTRUCTION CO, LLC \n", | |
| "\n", | |
| " address_1 address_2 country city state zip id \n", | |
| "2 1758 PITMAN AVENUE NaN US BRONX NY 10466 5 " | |
| ] | |
| }, | |
| "execution_count": 33, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "eval(f\"{source_table}[{source_table}.index == provenance_demo_downstream.iloc[provenance_demo_downstream[provenance_demo_downstream['{field}'] == '{value}'].index.to_list()[0]].id_{source_table}]\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "0ba67f44-2264-41f4-8a03-317bafe80f02", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.12.3" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment