Created
July 27, 2020 12:00
-
-
Save dominiquesydow/2ab34e56b2456f53d85d3d14c5441f07 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Read files from `tar.bz2` archive on-the-fly" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from pathlib import Path\n", | |
"import tarfile\n", | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Single file in `tar.bz2`\n", | |
"\n", | |
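"Use `pandas` - compressed files can be read by `read_csv` and friends.\n", | |
"\n", | |
"**Caveat:** with the pandas version used for this notebook, `compression='infer'` only undoes the `bz2` compression; the `tar` container itself is not unpacked. The tar header therefore leaks into the parsed data - see the `subpockets.csv` column name, the trailing `NaN` row and the float `count` column in the output below. pandas >= 1.5 can read tar archives directly; otherwise use `tarfile` as shown in the next section.\n", | |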
"\n", | |
"https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html\n", | |
"\n", | |
"> compression : {‘infer’, ‘gzip’, ‘bz2’, ‘zip’, ‘xz’, None}, default ‘infer’\n", | |
"> - For on-the-fly decompression of on-disk data. If ‘infer’ and filepath_or_buffer is path-like, then detect compression from the following extensions: ‘.gz’, ‘.bz2’, ‘.zip’, or ‘.xz’ (otherwise no decompression). If using ‘zip’, the ZIP file must contain only one data file to be read in. Set to None for no decompression.\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>subpockets.csv</th>\n", | |
" <th>count</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>AP-B1-GA-SE</td>\n", | |
" <td>255096.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>AP-FP-GA-SE</td>\n", | |
" <td>5016284.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>AP-B2-GA-SE</td>\n", | |
" <td>359534.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>AP-B2-FP-GA</td>\n", | |
" <td>178027.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>AP-B1-FP-GA</td>\n", | |
" <td>209843.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>AP-B1-B2-GA</td>\n", | |
" <td>3498.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>AP-GA-SE</td>\n", | |
" <td>102733.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>AP-FP-SE</td>\n", | |
" <td>512671.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>AP-FP-GA</td>\n", | |
" <td>71885.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>AP-B2-GA</td>\n", | |
" <td>1812.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>AP-B1-GA</td>\n", | |
" <td>1279.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>AP-GA</td>\n", | |
" <td>682.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>AP-FP</td>\n", | |
" <td>5924.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>AP-SE</td>\n", | |
" <td>1369.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" subpockets.csv count\n", | |
"0 AP-B1-GA-SE 255096.0\n", | |
"1 AP-FP-GA-SE 5016284.0\n", | |
"2 AP-B2-GA-SE 359534.0\n", | |
"3 AP-B2-FP-GA 178027.0\n", | |
"4 AP-B1-FP-GA 209843.0\n", | |
"5 AP-B1-B2-GA 3498.0\n", | |
"6 AP-GA-SE 102733.0\n", | |
"7 AP-FP-SE 512671.0\n", | |
"8 AP-FP-GA 71885.0\n", | |
"9 AP-B2-GA 1812.0\n", | |
"10 AP-B1-GA 1279.0\n", | |
"11 AP-GA 682.0\n", | |
"12 AP-FP 5924.0\n", | |
"13 AP-SE 1369.0\n", | |
"14 NaN NaN" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pd.read_csv('single_file.csv.tar.bz2')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Multiple files in `tar.bz2` \n", | |
"\n", | |
"Use `tarfile` to get a stream to each file in the archive." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"filename = \"multiple_files.tar.bz2\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def load_file(member, archive=None):\n", | |
"    \"\"\"\n", | |
"    Load content (csv/json) from a file within a tar.bz2 archive.\n", | |
"\n", | |
"    Parameters\n", | |
"    ----------\n", | |
"    member : tarfile.TarInfo\n", | |
"        Archive member (metadata entry) describing the file to load.\n", | |
"    archive : tarfile.TarFile, optional\n", | |
"        Open archive that contains `member`. If omitted, the global `f` is\n", | |
"        used, so the function must then be called while `f` is an open archive.\n", | |
"\n", | |
"    Returns\n", | |
"    -------\n", | |
"    pandas.DataFrame\n", | |
"        Parsed content of the file.\n", | |
"\n", | |
"    Raises\n", | |
"    ------\n", | |
"    ValueError\n", | |
"        If the file suffix is not supported or the member is not a regular file.\n", | |
"    \"\"\"\n", | |
"\n", | |
"    if archive is None:\n", | |
"        # Backward compatible: fall back to the archive opened as global `f`.\n", | |
"        archive = f\n", | |
"\n", | |
"    # Map each supported suffix to the pandas reader that parses it.\n", | |
"    readers = {'.csv': pd.read_csv, '.json': pd.read_json}\n", | |
"    suffix = Path(member.name).suffix\n", | |
"    if suffix not in readers:\n", | |
"        raise ValueError(f'Loading protocol not defined yet for: {suffix}')\n", | |
"\n", | |
"    # extractfile returns None for members that are neither regular files nor links.\n", | |
"    extracted_file = archive.extractfile(member)\n", | |
"    if extracted_file is None:\n", | |
"        raise ValueError(f'Not a regular file in archive: {member.name}')\n", | |
"\n", | |
"    return readers[suffix](extracted_file)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Archive members:\n", | |
"[<TarInfo 'subpockets.csv' at 0x7f0d144002a0>, <TarInfo 'original_substructure.json' at 0x7f0d144004f8>]\n" | |
] | |
} | |
], | |
"source": [ | |
"with tarfile.open(filename, mode='r:bz2') as f:\n", | |
"\n", | |
"    # Collect every entry of the archive (same objects as iterating over `f`).\n", | |
"    members = f.getmembers()\n", | |
"\n", | |
"    print('Archive members:')\n", | |
"    print(members)\n", | |
"\n", | |
"    # `load_file` reads from the open archive `f`, so it must be called inside this block.\n", | |
"    content = load_file(members[0])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Unnamed: 0</th>\n", | |
" <th>count</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>AP-B1-GA-SE</td>\n", | |
" <td>255096</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>AP-FP-GA-SE</td>\n", | |
" <td>5016284</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>AP-B2-GA-SE</td>\n", | |
" <td>359534</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>AP-B2-FP-GA</td>\n", | |
" <td>178027</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>AP-B1-FP-GA</td>\n", | |
" <td>209843</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>AP-B1-B2-GA</td>\n", | |
" <td>3498</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>AP-GA-SE</td>\n", | |
" <td>102733</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>AP-FP-SE</td>\n", | |
" <td>512671</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>AP-FP-GA</td>\n", | |
" <td>71885</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>AP-B2-GA</td>\n", | |
" <td>1812</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>AP-B1-GA</td>\n", | |
" <td>1279</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>AP-GA</td>\n", | |
" <td>682</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>AP-FP</td>\n", | |
" <td>5924</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>AP-SE</td>\n", | |
" <td>1369</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Unnamed: 0 count\n", | |
"0 AP-B1-GA-SE 255096\n", | |
"1 AP-FP-GA-SE 5016284\n", | |
"2 AP-B2-GA-SE 359534\n", | |
"3 AP-B2-FP-GA 178027\n", | |
"4 AP-B1-FP-GA 209843\n", | |
"5 AP-B1-B2-GA 3498\n", | |
"6 AP-GA-SE 102733\n", | |
"7 AP-FP-SE 512671\n", | |
"8 AP-FP-GA 71885\n", | |
"9 AP-B2-GA 1812\n", | |
"10 AP-B1-GA 1279\n", | |
"11 AP-GA 682\n", | |
"12 AP-FP 5924\n", | |
"13 AP-SE 1369" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"content" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "kinfraglib", | |
"language": "python", | |
"name": "kinfraglib" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.10" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment