Skip to content

Instantly share code, notes, and snippets.

@dinya
Last active May 20, 2021 05:20
Show Gist options
  • Select an option

  • Save dinya/06e374721993fd387db19ee7f83065a5 to your computer and use it in GitHub Desktop.

Select an option

Save dinya/06e374721993fd387db19ee7f83065a5 to your computer and use it in GitHub Desktop.
Replace flat column names with hierarchical ones in the Pandas DataFrame
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from collections import Counter\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"\n",
"def index_as_multiindex(index, suffix_on_overlap=True, strip_empty_ends=False):\n",
" \"\"\"Replace flat index (:obj:`pandas.Index`) with hierarchical (:obj:`pandas.MultiIndex`).\n",
"\n",
" Parameters\n",
" ----------\n",
" index : pandas.Index\n",
" :obj:`pandas.Index`-object to be converted to :obj:`pandas.MultiIndex`-object\n",
" suffix_on_overlap : bool (default True)\n",
" Add suffixes (_0, _1, ...) to overlapping column names. To raise an exception on overlapping columns use False.\n",
" strip_empty_ends : bool (default False)\n",
" Strip \"\" in the tuples (empty str).\n",
"\n",
" See Also\n",
" --------\n",
" columns_as_multiindex\n",
"\n",
" Examples\n",
" --------\n",
" See :func:`columns_as_multiindex`\n",
" \"\"\"\n",
"\n",
" # TODO: To be reviewed with the pd.Index(index.values.tolist(), tupleize_cols=True).\n",
" # It is required that the elements of the list index.values.tolist() are reduced to the same shape!\n",
" columns = []\n",
" for idx in index.tolist():\n",
" if isinstance(idx, str) or not np.iterable(idx):\n",
" idx = (idx, )\n",
" if strip_empty_ends:\n",
" idx = list_rstrip(idx)\n",
" columns.append(idx)\n",
"\n",
" max_len = max([len(c) for c in columns])\n",
" columns = [tuple(c + (\"\", ) * (max_len - len(c))) for c in columns]\n",
" # If same named columns exists then add suffixes _0, _1, _2, ... to the top level\n",
" duplicates = [item for item, count in Counter(columns).items() if count > 1]\n",
" if len(duplicates):\n",
" if suffix_on_overlap:\n",
" for dup in duplicates:\n",
" indexes = [i for i, x in enumerate(columns) if x == dup]\n",
" for i, idx in enumerate(indexes):\n",
" col = list(columns[idx])\n",
" col[0] = col[0] + \"_%i\" % i\n",
" columns[idx] = tuple(col)\n",
" else:\n",
" raise ValueError(\"columns overlap, but the auto-renaming is disabled\")\n",
"\n",
" return pd.MultiIndex.from_tuples(columns)\n",
"\n",
"\n",
"def columns_as_multiindex(df, inplace=False, suffix_on_overlap=True, strip_empty_ends=False):\n",
" \"\"\"Replace flat columns (:obj:`pandas.Index`) with hierarchical (:obj:`pandas.MultiIndex`).\n",
"\n",
" Parameters\n",
" ----------\n",
" df : pandas.DataFrame\n",
" Dataframe which contains the columns to be converted to MultiIndex.\n",
" suffix_on_overlap : bool (default True)\n",
" Add suffixes (_0, _1, ...) to overlapping column names. To raise an exception on overlapping columns use False.\n",
" inplace : bool (default False)\n",
" If ``True``, do operation inplace and return ``None``.\n",
" strip_empty_ends : bool (default False)\n",
" Strip \"\" in the tuples (empty str).\n",
"\n",
" Returns\n",
" -------\n",
" pandas.DataFrame or None\n",
" Returns dataframe with modifed columns or ``None`` (depends on `inplace` parameter value).\n",
"\n",
" Notes\n",
" -----\n",
" Reverse operation (hierarchical to flat)::\n",
"\n",
" df.columns = [tuple(col) for col in df.columns.values]\n",
"\n",
" See for details https://stackoverflow.com/questions/14507794/pandas-how-to-flatten-a-hierarchical-index-in-columns\n",
"\n",
" Examples\n",
" --------\n",
" >>> df0 = pd.DataFrame([[1,2], [10, 20], [100, 200]], columns=[\"A\", \"B\"])\n",
" >>> df0\n",
" ... A B\n",
" ... 0 1 2\n",
" ... 1 10 20\n",
" ... 2 100 200\n",
"\n",
" >>> columns = pd.MultiIndex.from_tuples([(\"C\", \"C0\"), (\"D\", \"D0\")])\n",
" >>> df1 = pd.DataFrame([[1,3], [10, 30], [100, 300]], columns=columns)\n",
" >>> df1\n",
" ... C D\n",
" ... C0 D0\n",
" ... 0 1 3\n",
" ... 1 10 30\n",
" ... 2 100 300\n",
"\n",
" Merging ``df0`` with ``df1`` returns the dataframe contains tuples in the column names\n",
"\n",
" >>> df = pd.merge(df0, df1, left_on=\"A\", right_on=[(\"C\", \"C0\")])\n",
" >>> df\n",
" ... A B (C, C0) (D, D0)\n",
" ... 0 1 2 1 3\n",
" ... 1 10 20 10 30\n",
" ... 2 100 200 100 300\n",
"\n",
" It's a bit ugly. Let's beautify the header\n",
"\n",
" >>> columns_as_multiindex(df)\n",
" >>> df\n",
" ... A B C D\n",
" ... C0 D0\n",
" ... 0 1 2 1 3\n",
" ... 1 10 20 10 30\n",
" ... 2 100 200 100 300\n",
"\n",
" If same named columns exists then add suffixes _0, _1, _2, ... to the top level\n",
"\n",
" >>> df1_columns = pd.MultiIndex.from_tuples([(\"A0\", \"B0\", \"C0\"), (\"A1\", \"B1\", \"C1\"), (\"A2\", \"B2\", \"\")])\n",
" >>> df1 = pd.DataFrame([[1, 2, 3], [10, 20, 30]], columns=df1_columns)\n",
" >>> df1\n",
" ... A0 A1 A2\n",
" ... B0 B1 B2\n",
" ... C0 C1\n",
" ... 0 1 2 3\n",
" ... 1 10 20 30\n",
" >>> df2_columns = pd.MultiIndex.from_tuples([(\"X0\", \"Y0\"), (\"X1\", \"Y1\"), (\"A2\", \"B2\")])\n",
" >>> df2 = pd.DataFrame([[1, 200, 300], [10, 200, 300]], columns=df2_columns)\n",
" >>> df2\n",
" ... X0 X1 A2\n",
" ... Y0 Y1 B2\n",
" ... 0 1 200 300\n",
" ... 1 10 200 300\n",
" >>> df_merge = pd.merge(df1, df2, left_on=[(\"A0\", \"B0\",\"C0\")], right_on=[(\"X0\", \"Y0\")])\n",
" >>> df_merge\n",
" ... (A0, B0, C0) (A1, B1, C1) (A2, B2, ) (X0, Y0) (X1, Y1) (A2, B2)\n",
" ... 0 1 2 3 1 200 300\n",
" ... 1 10 20 30 10 200 300\n",
" >>> columns_as_multiindex(df_merge)\n",
" ... A0 A1 A2_0 X0 X1 A2_1\n",
" ... B0 B1 B2 Y0 Y1 B2\n",
" ... C0 C1\n",
" ... 0 1 2 3 1 200 300\n",
" ... 1 10 20 30 10 200 300\n",
" >>> columns_as_multiindex(df_merge, suffix_on_overlap=False)\n",
" ... Traceback (most recent call last)\n",
" ... ...\n",
" ... ValueError: columns overlap, but the auto-renaming is disabled\n",
" \"\"\"\n",
"\n",
" columns_new = index_as_multiindex(df.columns, suffix_on_overlap=suffix_on_overlap,\n",
" strip_empty_ends=strip_empty_ends)\n",
"\n",
" if inplace:\n",
" df.columns = columns_new\n",
" else:\n",
" df_new = df.copy()\n",
" df_new.columns = columns_new\n",
" return df_new"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'1.1.3'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"pd.__version__"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead tr th {\n",
" text-align: left;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th>A0</th>\n",
" <th>A1</th>\n",
" <th>A2</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>B0</th>\n",
" <th>B1</th>\n",
" <th>B2</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>C0</th>\n",
" <th>C1</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>10</td>\n",
" <td>20</td>\n",
" <td>30</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A0 A1 A2\n",
" B0 B1 B2\n",
" C0 C1 \n",
"0 1 2 3\n",
"1 10 20 30"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_columns = pd.MultiIndex.from_tuples([(\"A0\", \"B0\", \"C0\"), (\"A1\", \"B1\", \"C1\"), (\"A2\", \"B2\", \"\")])\n",
"df1 = pd.DataFrame([[1, 2, 3], [10, 20, 30]], columns=df1_columns)\n",
"df1"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead tr th {\n",
" text-align: left;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th>X0</th>\n",
" <th>X1</th>\n",
" <th>A2</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>Y0</th>\n",
" <th>Y1</th>\n",
" <th>B2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>200</td>\n",
" <td>300</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>10</td>\n",
" <td>200</td>\n",
" <td>300</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" X0 X1 A2\n",
" Y0 Y1 B2\n",
"0 1 200 300\n",
"1 10 200 300"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2_columns = pd.MultiIndex.from_tuples([(\"X0\", \"Y0\"), (\"X1\", \"Y1\"), (\"A2\", \"B2\")])\n",
"df2 = pd.DataFrame([[1, 200, 300], [10, 200, 300]], columns=df2_columns)\n",
"df2"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\00data\\opt\\miniconda3\\lib\\site-packages\\pandas\\core\\reshape\\merge.py:643: UserWarning: merging between different levels can give an unintended result (3 levels on the left,2 on the right)\n",
" warnings.warn(msg, UserWarning)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>(A0, B0, C0)</th>\n",
" <th>(A1, B1, C1)</th>\n",
" <th>(A2, B2, )</th>\n",
" <th>(X0, Y0)</th>\n",
" <th>(X1, Y1)</th>\n",
" <th>(A2, B2)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>200</td>\n",
" <td>300</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>10</td>\n",
" <td>20</td>\n",
" <td>30</td>\n",
" <td>10</td>\n",
" <td>200</td>\n",
" <td>300</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" (A0, B0, C0) (A1, B1, C1) (A2, B2, ) (X0, Y0) (X1, Y1) (A2, B2)\n",
"0 1 2 3 1 200 300\n",
"1 10 20 30 10 200 300"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_merge = pd.merge(df1, df2, left_on=[(\"A0\", \"B0\",\"C0\")], right_on=[(\"X0\", \"Y0\")])\n",
"df_merge"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead tr th {\n",
" text-align: left;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th>A0</th>\n",
" <th>A1</th>\n",
" <th>A2_0</th>\n",
" <th>X0</th>\n",
" <th>X1</th>\n",
" <th>A2_1</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>B0</th>\n",
" <th>B1</th>\n",
" <th>B2</th>\n",
" <th>Y0</th>\n",
" <th>Y1</th>\n",
" <th>B2</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>C0</th>\n",
" <th>C1</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>200</td>\n",
" <td>300</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>10</td>\n",
" <td>20</td>\n",
" <td>30</td>\n",
" <td>10</td>\n",
" <td>200</td>\n",
" <td>300</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A0 A1 A2_0 X0 X1 A2_1\n",
" B0 B1 B2 Y0 Y1 B2\n",
" C0 C1 \n",
"0 1 2 3 1 200 300\n",
"1 10 20 30 10 200 300"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"columns_as_multiindex(df_merge)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "columns overlap, but the auto-renaming is disabled",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-7-0857603b176a>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mcolumns_as_multiindex\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf_merge\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msuffix_on_overlap\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32m<ipython-input-1-ae2f2a30fb5d>\u001b[0m in \u001b[0;36mcolumns_as_multiindex\u001b[1;34m(df, inplace, suffix_on_overlap, strip_empty_ends)\u001b[0m\n\u001b[0;32m 153\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 154\u001b[0m columns_new = index_as_multiindex(df.columns, suffix_on_overlap=suffix_on_overlap,\n\u001b[1;32m--> 155\u001b[1;33m strip_empty_ends=strip_empty_ends)\n\u001b[0m\u001b[0;32m 156\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 157\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0minplace\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m<ipython-input-1-ae2f2a30fb5d>\u001b[0m in \u001b[0;36mindex_as_multiindex\u001b[1;34m(index, suffix_on_overlap, strip_empty_ends)\u001b[0m\n\u001b[0;32m 49\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0midx\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtuple\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcol\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 50\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 51\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"columns overlap, but the auto-renaming is disabled\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 52\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 53\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mMultiIndex\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfrom_tuples\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mValueError\u001b[0m: columns overlap, but the auto-renaming is disabled"
]
}
],
"source": [
"columns_as_multiindex(df_merge, suffix_on_overlap=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"See also issue https://github.com/pandas-dev/pandas/issues/26699"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment