Created
July 15, 2017 05:08
-
-
Save jiffyclub/9ab668f63c3d0f9adf3e730dc37cd419 to your computer and use it in GitHub Desktop.
Using pandas and xlrd to concatenate multiple excel sheets into a single dataframe. In answer to this Stack Overflow question: https://stackoverflow.com/questions/45113070/how-do-i-make-this-function-for-concatenating-excel-sheets-from-a-single-file-mo#
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Answering the question at https://stackoverflow.com/questions/45113070/how-do-i-make-this-function-for-concatenating-excel-sheets-from-a-single-file-mo#" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import xlrd\n", | |
"from pathlib import Path" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Constants" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"filename = Path.home() / 'Downloads' / 'example_excel_file.xlsx'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"sheet_names = ['first_wanted', 'second_wanted']" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Loading All Sheets at Once using `pd.read_excel`\n", | |
"\n", | |
"`read_excel` can take multiple sheets as a parameter and returns a dictionary\n", | |
"of dataframes, one item for each sheet." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"dfs = pd.read_excel(filename, sheetname=sheet_names)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"OrderedDict([('first_wanted', Code Description\n", | |
" 0 1 Spam\n", | |
" 1 2 Ham\n", | |
" 2 3 Eggs), ('second_wanted', Code Description\n", | |
" 0 A Hovercraft\n", | |
" 1 B Full of\n", | |
" 2 C Eels)])" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"dfs" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style>\n", | |
" .dataframe thead tr:only-child th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: left;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Code</th>\n", | |
" <th>Description</th>\n", | |
" <th>source</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>Spam</td>\n", | |
" <td>first_wanted</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>Ham</td>\n", | |
" <td>first_wanted</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>Eggs</td>\n", | |
" <td>first_wanted</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>A</td>\n", | |
" <td>Hovercraft</td>\n", | |
" <td>second_wanted</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>B</td>\n", | |
" <td>Full of</td>\n", | |
" <td>second_wanted</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>C</td>\n", | |
" <td>Eels</td>\n", | |
" <td>second_wanted</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Code Description source\n", | |
"0 1 Spam first_wanted\n", | |
"1 2 Ham first_wanted\n", | |
"2 3 Eggs first_wanted\n", | |
"3 A Hovercraft second_wanted\n", | |
"4 B Full of second_wanted\n", | |
"5 C Eels second_wanted" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pd.concat((df.assign(source=sheet) for sheet, df in dfs.items()), ignore_index=True)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"##### Functionize It" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def sheets_to_df(filename, sheet_names):\n", | |
" df_dict = pd.read_excel(filename, sheetname=sheet_names)\n", | |
" return pd.concat(\n", | |
" (df.assign(source=sheet) for sheet, df in dfs.items()), ignore_index=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style>\n", | |
" .dataframe thead tr:only-child th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: left;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Code</th>\n", | |
" <th>Description</th>\n", | |
" <th>source</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>Spam</td>\n", | |
" <td>first_wanted</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>Ham</td>\n", | |
" <td>first_wanted</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>Eggs</td>\n", | |
" <td>first_wanted</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>A</td>\n", | |
" <td>Hovercraft</td>\n", | |
" <td>second_wanted</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>B</td>\n", | |
" <td>Full of</td>\n", | |
" <td>second_wanted</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>C</td>\n", | |
" <td>Eels</td>\n", | |
" <td>second_wanted</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Code Description source\n", | |
"0 1 Spam first_wanted\n", | |
"1 2 Ham first_wanted\n", | |
"2 3 Eggs first_wanted\n", | |
"3 A Hovercraft second_wanted\n", | |
"4 B Full of second_wanted\n", | |
"5 C Eels second_wanted" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"sheets_to_df(filename, sheet_names)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Minimize Memory Usage" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def df_gen(filename, sheet_names):\n", | |
" with xlrd.open_workbook(filename, on_demand=True) as xl_file:\n", | |
" for sheet in sheet_names:\n", | |
" yield pd.read_excel(xl_file, sheetname=sheet, engine='xlrd').assign(source=sheet)\n", | |
" # tell xlrd to let the sheet leave memory\n", | |
" xl_file.unload_sheet(sheet)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style>\n", | |
" .dataframe thead tr:only-child th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: left;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Code</th>\n", | |
" <th>Description</th>\n", | |
" <th>source</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>Spam</td>\n", | |
" <td>first_wanted</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>Ham</td>\n", | |
" <td>first_wanted</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>Eggs</td>\n", | |
" <td>first_wanted</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>A</td>\n", | |
" <td>Hovercraft</td>\n", | |
" <td>second_wanted</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>B</td>\n", | |
" <td>Full of</td>\n", | |
" <td>second_wanted</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>C</td>\n", | |
" <td>Eels</td>\n", | |
" <td>second_wanted</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Code Description source\n", | |
"0 1 Spam first_wanted\n", | |
"1 2 Ham first_wanted\n", | |
"2 3 Eggs first_wanted\n", | |
"3 A Hovercraft second_wanted\n", | |
"4 B Full of second_wanted\n", | |
"5 C Eels second_wanted" | |
] | |
}, | |
"execution_count": 18, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pd.concat(df_gen(str(filename), sheet_names), ignore_index=True)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Really limit memory usage by manually building a dataframe iteratively." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style>\n", | |
" .dataframe thead tr:only-child th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: left;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Code</th>\n", | |
" <th>Description</th>\n", | |
" <th>source</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>Spam</td>\n", | |
" <td>first_wanted</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>Ham</td>\n", | |
" <td>first_wanted</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>Eggs</td>\n", | |
" <td>first_wanted</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>A</td>\n", | |
" <td>Hovercraft</td>\n", | |
" <td>second_wanted</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>B</td>\n", | |
" <td>Full of</td>\n", | |
" <td>second_wanted</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>C</td>\n", | |
" <td>Eels</td>\n", | |
" <td>second_wanted</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Code Description source\n", | |
"0 1 Spam first_wanted\n", | |
"1 2 Ham first_wanted\n", | |
"2 3 Eggs first_wanted\n", | |
"3 A Hovercraft second_wanted\n", | |
"4 B Full of second_wanted\n", | |
"5 C Eels second_wanted" | |
] | |
}, | |
"execution_count": 21, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# get the generator (nothing happens yet)\n", | |
"gen = df_gen(str(filename), sheet_names)\n", | |
"\n", | |
"# get starting point\n", | |
"df = next(gen)\n", | |
"\n", | |
"# iterate over the rest of the sheets and build up a final dataframe\n", | |
"for next_df in gen:\n", | |
" df = df.append(next_df, ignore_index=True)\n", | |
"\n", | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
A small correction, it should be df_dict.items() instead of dfs.items() on cell 7