Skip to content

Instantly share code, notes, and snippets.

@dominiquesydow
Created July 27, 2020 12:00
Show Gist options
  • Save dominiquesydow/2ab34e56b2456f53d85d3d14c5441f07 to your computer and use it in GitHub Desktop.
Save dominiquesydow/2ab34e56b2456f53d85d3d14c5441f07 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Read files from `tar.bz2` archive on-the-fly"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"import tarfile\n",
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Single file in `tar.bz2`\n",
"\n",
"Use ´pandas´ - compressed files can be read by `read_csv` and friends.\n",
"\n",
"https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html\n",
"\n",
"> compression : {‘infer’, ‘gzip’, ‘bz2’, ‘zip’, ‘xz’, None}, default ‘infer’\n",
"> - For on-the-fly decompression of on-disk data. If ‘infer’ and filepath_or_buffer is path-like, then detect compression from the following extensions: ‘.gz’, ‘.bz2’, ‘.zip’, or ‘.xz’ (otherwise no decompression). If using ‘zip’, the ZIP file must contain only one data file to be read in. Set to None for no decompression.\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>subpockets.csv</th>\n",
" <th>count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AP-B1-GA-SE</td>\n",
" <td>255096.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AP-FP-GA-SE</td>\n",
" <td>5016284.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AP-B2-GA-SE</td>\n",
" <td>359534.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AP-B2-FP-GA</td>\n",
" <td>178027.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AP-B1-FP-GA</td>\n",
" <td>209843.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>AP-B1-B2-GA</td>\n",
" <td>3498.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>AP-GA-SE</td>\n",
" <td>102733.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>AP-FP-SE</td>\n",
" <td>512671.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>AP-FP-GA</td>\n",
" <td>71885.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>AP-B2-GA</td>\n",
" <td>1812.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>AP-B1-GA</td>\n",
" <td>1279.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>AP-GA</td>\n",
" <td>682.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>AP-FP</td>\n",
" <td>5924.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>AP-SE</td>\n",
" <td>1369.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" subpockets.csv count\n",
"0 AP-B1-GA-SE 255096.0\n",
"1 AP-FP-GA-SE 5016284.0\n",
"2 AP-B2-GA-SE 359534.0\n",
"3 AP-B2-FP-GA 178027.0\n",
"4 AP-B1-FP-GA 209843.0\n",
"5 AP-B1-B2-GA 3498.0\n",
"6 AP-GA-SE 102733.0\n",
"7 AP-FP-SE 512671.0\n",
"8 AP-FP-GA 71885.0\n",
"9 AP-B2-GA 1812.0\n",
"10 AP-B1-GA 1279.0\n",
"11 AP-GA 682.0\n",
"12 AP-FP 5924.0\n",
"13 AP-SE 1369.0\n",
"14 NaN NaN"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.read_csv('single_file.csv.tar.bz2')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Multiple files in `tar.bz2` \n",
"\n",
"Use `tarfile` to get stream to files in archive."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"filename = \"multiple_files.tar.bz2\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def load_file(member):\n",
" \"\"\"\n",
" Load content (csv/json) from a file within a tar.bz2 archive.\n",
" \n",
" Parameters\n",
" ----------\n",
" member : tarfile.TarInfo\n",
" Stream to file in tar.bz2 archive.\n",
" \"\"\"\n",
"\n",
" extracted_file = f.extractfile(member.name)\n",
" \n",
" if Path(member.name).suffix == '.csv':\n",
" content = pd.read_csv(extracted_file)\n",
" elif Path(member.name).suffix == '.json':\n",
" content = pd.read_json(extracted_file)\n",
" else:\n",
" print(f'Loading protocol not defined yet for: {member.name.suffix}')\n",
" \n",
" return content"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Archive members:\n",
"[<TarInfo 'subpockets.csv' at 0x7f0d144002a0>, <TarInfo 'original_substructure.json' at 0x7f0d144004f8>]\n"
]
}
],
"source": [
"with tarfile.open(filename, mode='r:bz2') as f:\n",
" \n",
" print(f'Archive members:')\n",
" members = [member for member in f]\n",
" print(members)\n",
" \n",
" content = load_file(members[0])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AP-B1-GA-SE</td>\n",
" <td>255096</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AP-FP-GA-SE</td>\n",
" <td>5016284</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AP-B2-GA-SE</td>\n",
" <td>359534</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AP-B2-FP-GA</td>\n",
" <td>178027</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AP-B1-FP-GA</td>\n",
" <td>209843</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>AP-B1-B2-GA</td>\n",
" <td>3498</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>AP-GA-SE</td>\n",
" <td>102733</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>AP-FP-SE</td>\n",
" <td>512671</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>AP-FP-GA</td>\n",
" <td>71885</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>AP-B2-GA</td>\n",
" <td>1812</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>AP-B1-GA</td>\n",
" <td>1279</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>AP-GA</td>\n",
" <td>682</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>AP-FP</td>\n",
" <td>5924</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>AP-SE</td>\n",
" <td>1369</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 count\n",
"0 AP-B1-GA-SE 255096\n",
"1 AP-FP-GA-SE 5016284\n",
"2 AP-B2-GA-SE 359534\n",
"3 AP-B2-FP-GA 178027\n",
"4 AP-B1-FP-GA 209843\n",
"5 AP-B1-B2-GA 3498\n",
"6 AP-GA-SE 102733\n",
"7 AP-FP-SE 512671\n",
"8 AP-FP-GA 71885\n",
"9 AP-B2-GA 1812\n",
"10 AP-B1-GA 1279\n",
"11 AP-GA 682\n",
"12 AP-FP 5924\n",
"13 AP-SE 1369"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"content"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "kinfraglib",
"language": "python",
"name": "kinfraglib"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment