{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Answering the question at https://stackoverflow.com/questions/45113070/how-do-i-make-this-function-for-concatenating-excel-sheets-from-a-single-file-mo#"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import xlrd\n",
"from pathlib import Path"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Constants"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"filename = Path.home() / 'Downloads' / 'example_excel_file.xlsx'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"sheet_names = ['first_wanted', 'second_wanted']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Loading All Sheets at Once using `pd.read_excel`\n",
"\n",
"`read_excel` can take multiple sheets as a parameter and returns a dictionary\n",
"of dataframes, one item for each sheet."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"dfs = pd.read_excel(filename, sheetname=sheet_names)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"OrderedDict([('first_wanted', Code Description\n",
" 0 1 Spam\n",
" 1 2 Ham\n",
" 2 3 Eggs), ('second_wanted', Code Description\n",
" 0 A Hovercraft\n",
" 1 B Full of\n",
" 2 C Eels)])"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfs"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Code | \n",
" Description | \n",
" source | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" Spam | \n",
" first_wanted | \n",
"
\n",
" \n",
" | 1 | \n",
" 2 | \n",
" Ham | \n",
" first_wanted | \n",
"
\n",
" \n",
" | 2 | \n",
" 3 | \n",
" Eggs | \n",
" first_wanted | \n",
"
\n",
" \n",
" | 3 | \n",
" A | \n",
" Hovercraft | \n",
" second_wanted | \n",
"
\n",
" \n",
" | 4 | \n",
" B | \n",
" Full of | \n",
" second_wanted | \n",
"
\n",
" \n",
" | 5 | \n",
" C | \n",
" Eels | \n",
" second_wanted | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Code Description source\n",
"0 1 Spam first_wanted\n",
"1 2 Ham first_wanted\n",
"2 3 Eggs first_wanted\n",
"3 A Hovercraft second_wanted\n",
"4 B Full of second_wanted\n",
"5 C Eels second_wanted"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.concat((df.assign(source=sheet) for sheet, df in dfs.items()), ignore_index=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Functionize It"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def sheets_to_df(filename, sheet_names):\n",
" df_dict = pd.read_excel(filename, sheetname=sheet_names)\n",
" return pd.concat(\n",
" (df.assign(source=sheet) for sheet, df in dfs.items()), ignore_index=True)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Code | \n",
" Description | \n",
" source | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" Spam | \n",
" first_wanted | \n",
"
\n",
" \n",
" | 1 | \n",
" 2 | \n",
" Ham | \n",
" first_wanted | \n",
"
\n",
" \n",
" | 2 | \n",
" 3 | \n",
" Eggs | \n",
" first_wanted | \n",
"
\n",
" \n",
" | 3 | \n",
" A | \n",
" Hovercraft | \n",
" second_wanted | \n",
"
\n",
" \n",
" | 4 | \n",
" B | \n",
" Full of | \n",
" second_wanted | \n",
"
\n",
" \n",
" | 5 | \n",
" C | \n",
" Eels | \n",
" second_wanted | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Code Description source\n",
"0 1 Spam first_wanted\n",
"1 2 Ham first_wanted\n",
"2 3 Eggs first_wanted\n",
"3 A Hovercraft second_wanted\n",
"4 B Full of second_wanted\n",
"5 C Eels second_wanted"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sheets_to_df(filename, sheet_names)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Minimize Memory Usage"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def df_gen(filename, sheet_names):\n",
" with xlrd.open_workbook(filename, on_demand=True) as xl_file:\n",
" for sheet in sheet_names:\n",
" yield pd.read_excel(xl_file, sheetname=sheet, engine='xlrd').assign(source=sheet)\n",
" # tell xlrd to let the sheet leave memory\n",
" xl_file.unload_sheet(sheet)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Code | \n",
" Description | \n",
" source | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" Spam | \n",
" first_wanted | \n",
"
\n",
" \n",
" | 1 | \n",
" 2 | \n",
" Ham | \n",
" first_wanted | \n",
"
\n",
" \n",
" | 2 | \n",
" 3 | \n",
" Eggs | \n",
" first_wanted | \n",
"
\n",
" \n",
" | 3 | \n",
" A | \n",
" Hovercraft | \n",
" second_wanted | \n",
"
\n",
" \n",
" | 4 | \n",
" B | \n",
" Full of | \n",
" second_wanted | \n",
"
\n",
" \n",
" | 5 | \n",
" C | \n",
" Eels | \n",
" second_wanted | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Code Description source\n",
"0 1 Spam first_wanted\n",
"1 2 Ham first_wanted\n",
"2 3 Eggs first_wanted\n",
"3 A Hovercraft second_wanted\n",
"4 B Full of second_wanted\n",
"5 C Eels second_wanted"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.concat(df_gen(str(filename), sheet_names), ignore_index=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Really limit memory usage by manually building a dataframe iteratively."
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Code | \n",
" Description | \n",
" source | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" Spam | \n",
" first_wanted | \n",
"
\n",
" \n",
" | 1 | \n",
" 2 | \n",
" Ham | \n",
" first_wanted | \n",
"
\n",
" \n",
" | 2 | \n",
" 3 | \n",
" Eggs | \n",
" first_wanted | \n",
"
\n",
" \n",
" | 3 | \n",
" A | \n",
" Hovercraft | \n",
" second_wanted | \n",
"
\n",
" \n",
" | 4 | \n",
" B | \n",
" Full of | \n",
" second_wanted | \n",
"
\n",
" \n",
" | 5 | \n",
" C | \n",
" Eels | \n",
" second_wanted | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Code Description source\n",
"0 1 Spam first_wanted\n",
"1 2 Ham first_wanted\n",
"2 3 Eggs first_wanted\n",
"3 A Hovercraft second_wanted\n",
"4 B Full of second_wanted\n",
"5 C Eels second_wanted"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# get the generator (nothing happens yet)\n",
"gen = df_gen(str(filename), sheet_names)\n",
"\n",
"# get starting point\n",
"df = next(gen)\n",
"\n",
"# iterate over the rest of the sheets and build up a final dataframe\n",
"for next_df in gen:\n",
" df = df.append(next_df, ignore_index=True)\n",
"\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}