{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Answering the question at https://stackoverflow.com/questions/45113070/how-do-i-make-this-function-for-concatenating-excel-sheets-from-a-single-file-mo#" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import xlrd\n", "from pathlib import Path" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Constants" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "filename = Path.home() / 'Downloads' / 'example_excel_file.xlsx'" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "sheet_names = ['first_wanted', 'second_wanted']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Loading All Sheets at Once using `pd.read_excel`\n", "\n", "`read_excel` can take multiple sheets as a parameter and returns a dictionary\n", "of dataframes, one item for each sheet." ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "dfs = pd.read_excel(filename, sheetname=sheet_names)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "OrderedDict([('first_wanted', Code Description\n", " 0 1 Spam\n", " 1 2 Ham\n", " 2 3 Eggs), ('second_wanted', Code Description\n", " 0 A Hovercraft\n", " 1 B Full of\n", " 2 C Eels)])" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfs" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CodeDescriptionsource
01Spamfirst_wanted
12Hamfirst_wanted
23Eggsfirst_wanted
3AHovercraftsecond_wanted
4BFull ofsecond_wanted
5CEelssecond_wanted
\n", "
" ], "text/plain": [ " Code Description source\n", "0 1 Spam first_wanted\n", "1 2 Ham first_wanted\n", "2 3 Eggs first_wanted\n", "3 A Hovercraft second_wanted\n", "4 B Full of second_wanted\n", "5 C Eels second_wanted" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat((df.assign(source=sheet) for sheet, df in dfs.items()), ignore_index=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Functionize It" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def sheets_to_df(filename, sheet_names):\n", " df_dict = pd.read_excel(filename, sheetname=sheet_names)\n", " return pd.concat(\n", " (df.assign(source=sheet) for sheet, df in dfs.items()), ignore_index=True)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CodeDescriptionsource
01Spamfirst_wanted
12Hamfirst_wanted
23Eggsfirst_wanted
3AHovercraftsecond_wanted
4BFull ofsecond_wanted
5CEelssecond_wanted
\n", "
" ], "text/plain": [ " Code Description source\n", "0 1 Spam first_wanted\n", "1 2 Ham first_wanted\n", "2 3 Eggs first_wanted\n", "3 A Hovercraft second_wanted\n", "4 B Full of second_wanted\n", "5 C Eels second_wanted" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sheets_to_df(filename, sheet_names)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Minimize Memory Usage" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def df_gen(filename, sheet_names):\n", " with xlrd.open_workbook(filename, on_demand=True) as xl_file:\n", " for sheet in sheet_names:\n", " yield pd.read_excel(xl_file, sheetname=sheet, engine='xlrd').assign(source=sheet)\n", " # tell xlrd to let the sheet leave memory\n", " xl_file.unload_sheet(sheet)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CodeDescriptionsource
01Spamfirst_wanted
12Hamfirst_wanted
23Eggsfirst_wanted
3AHovercraftsecond_wanted
4BFull ofsecond_wanted
5CEelssecond_wanted
\n", "
" ], "text/plain": [ " Code Description source\n", "0 1 Spam first_wanted\n", "1 2 Ham first_wanted\n", "2 3 Eggs first_wanted\n", "3 A Hovercraft second_wanted\n", "4 B Full of second_wanted\n", "5 C Eels second_wanted" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat(df_gen(str(filename), sheet_names), ignore_index=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Really limit memory usage by manually building a dataframe iteratively." ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CodeDescriptionsource
01Spamfirst_wanted
12Hamfirst_wanted
23Eggsfirst_wanted
3AHovercraftsecond_wanted
4BFull ofsecond_wanted
5CEelssecond_wanted
\n", "
" ], "text/plain": [ " Code Description source\n", "0 1 Spam first_wanted\n", "1 2 Ham first_wanted\n", "2 3 Eggs first_wanted\n", "3 A Hovercraft second_wanted\n", "4 B Full of second_wanted\n", "5 C Eels second_wanted" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# get the generator (nothing happens yet)\n", "gen = df_gen(str(filename), sheet_names)\n", "\n", "# get starting point\n", "df = next(gen)\n", "\n", "# iterate over the rest of the sheets and build up a final dataframe\n", "for next_df in gen:\n", " df = df.append(next_df, ignore_index=True)\n", "\n", "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 2 }