Last active
June 15, 2025 02:16
-
-
Save rwcitek/7b7d5717b73070f935d7c2f08d1124c2 to your computer and use it in GitHub Desktop.
many-columns.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/rwcitek/7b7d5717b73070f935d7c2f08d1124c2/many-columns.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"id": "lIYdn1woOS1n" | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"%%capture\n", | |
"%%bash\n", | |
"pip install parquet-cli\n" | |
], | |
"metadata": { | |
"id": "uiGWTrDGYq2h" | |
}, | |
"execution_count": 2, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## Create data frame" | |
], | |
"metadata": { | |
"id": "q2T9VCFqYViy" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"n = 200_000\n", | |
"n" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "Wtzi5Cy4pYiw", | |
"outputId": "1ee9a7d8-dd85-4ccf-c53f-11fa99e91bc1" | |
}, | |
"execution_count": 3, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"200000" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 3 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"df = pd.DataFrame( columns = [ f\"col_{x:05}\" for x in range(1,n+1)] ).astype(\"uint16\")\n", | |
"df.shape\n" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "YcAlnV43m9rs", | |
"outputId": "9e8b7a93-aa11-47e2-98a4-6a827cd6804c" | |
}, | |
"execution_count": 4, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"(0, 200000)" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 4 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## Parquet\n" | |
], | |
"metadata": { | |
"id": "1BSy-pGUYTns" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"df.to_parquet(\"many.columns.parquet\", index=False)\n" | |
], | |
"metadata": { | |
"id": "Nb_sj5FFXbmZ" | |
}, | |
"execution_count": 5, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!ls -la --si\n" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "UesleF7sXbjk", | |
"outputId": "e5e3224d-c41a-44a1-904c-9f0038f54158" | |
}, | |
"execution_count": 6, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"total 85060\n", | |
"drwxr-xr-x 1 root root 4096 Jun 14 17:11 .\n", | |
"drwxr-xr-x 1 root root 4096 Jun 14 16:57 ..\n", | |
"drwxr-xr-x 4 root root 4096 Jun 12 13:35 .config\n", | |
"-rw-r--r-- 1 root root 20000 Jun 14 17:12 many.columns.csv\n", | |
"-rw-r--r-- 1 root root 87063520 Jun 14 17:45 many.columns.parquet\n", | |
"drwxr-xr-x 1 root root 4096 Jun 12 13:36 sample_data\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"df_parquet = pd.read_parquet(\"many.columns.parquet\")\n", | |
"df_parquet.shape" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "JFqtm0RMXbgh", | |
"outputId": "85727603-acd2-4246-ba10-38f8cb5f35bd" | |
}, | |
"execution_count": 7, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"(0, 200000)" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 7 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!parq many.columns.parquet --schema | tail" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "_f-SnpoTacDT", | |
"outputId": "846f6275-64cc-49e3-a923-4180d4f6b196" | |
}, | |
"execution_count": 8, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
" optional int32 field_id=-1 col_199993 (Int(bitWidth=16, isSigned=false));\n", | |
" optional int32 field_id=-1 col_199994 (Int(bitWidth=16, isSigned=false));\n", | |
" optional int32 field_id=-1 col_199995 (Int(bitWidth=16, isSigned=false));\n", | |
" optional int32 field_id=-1 col_199996 (Int(bitWidth=16, isSigned=false));\n", | |
" optional int32 field_id=-1 col_199997 (Int(bitWidth=16, isSigned=false));\n", | |
" optional int32 field_id=-1 col_199998 (Int(bitWidth=16, isSigned=false));\n", | |
" optional int32 field_id=-1 col_199999 (Int(bitWidth=16, isSigned=false));\n", | |
" optional int32 field_id=-1 col_200000 (Int(bitWidth=16, isSigned=false));\n", | |
"}\n", | |
"\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!parq many.columns.parquet --head" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "89B-oviXa1xU", | |
"outputId": "4a536695-ca91-422a-dd38-0418fc478928" | |
}, | |
"execution_count": 9, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Empty DataFrame\n", | |
"Columns: [col_00001, col_00002, col_00003, col_00004, col_00005, col_00006, col_00007, col_00008, col_00009, col_00010, col_00011, col_00012, col_00013, col_00014, col_00015, col_00016, col_00017, col_00018, col_00019, col_00020, col_00021, col_00022, col_00023, col_00024, col_00025, col_00026, col_00027, col_00028, col_00029, col_00030, col_00031, col_00032, col_00033, col_00034, col_00035, col_00036, col_00037, col_00038, col_00039, col_00040, col_00041, col_00042, col_00043, col_00044, col_00045, col_00046, col_00047, col_00048, col_00049, col_00050, col_00051, col_00052, col_00053, col_00054, col_00055, col_00056, col_00057, col_00058, col_00059, col_00060, col_00061, col_00062, col_00063, col_00064, col_00065, col_00066, col_00067, col_00068, col_00069, col_00070, col_00071, col_00072, col_00073, col_00074, col_00075, col_00076, col_00077, col_00078, col_00079, col_00080, col_00081, col_00082, col_00083, col_00084, col_00085, col_00086, col_00087, col_00088, col_00089, col_00090, col_00091, col_00092, col_00093, col_00094, col_00095, col_00096, col_00097, col_00098, col_00099, col_00100, ...]\n", | |
"Index: []\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## CSV" | |
], | |
"metadata": { | |
"id": "axwcvrZWYRgS" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"df.to_csv(\"many.columns.csv\", index=False)\n" | |
], | |
"metadata": { | |
"id": "hXiXr3awXnxj" | |
}, | |
"execution_count": 10, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!ls -la --si\n" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "-HKmaeNDoKlo", | |
"outputId": "e221ab0a-f523-4b9c-802e-463552b7be17" | |
}, | |
"execution_count": 11, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"total 87092\n", | |
"drwxr-xr-x 1 root root 4096 Jun 14 17:11 .\n", | |
"drwxr-xr-x 1 root root 4096 Jun 14 16:57 ..\n", | |
"drwxr-xr-x 4 root root 4096 Jun 12 13:35 .config\n", | |
"-rw-r--r-- 1 root root 2100001 Jun 14 17:47 many.columns.csv\n", | |
"-rw-r--r-- 1 root root 87063520 Jun 14 17:45 many.columns.parquet\n", | |
"drwxr-xr-x 1 root root 4096 Jun 12 13:36 sample_data\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!head many.columns.csv | cut -d, -f-10 | column -s, -t\n" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "ytiULJ_MoXF0", | |
"outputId": "764e731c-c866-4a91-f67c-8bdee779e166" | |
}, | |
"execution_count": 12, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"col_00001 col_00002 col_00003 col_00004 col_00005 col_00006 col_00007 col_00008 col_00009 col_00010\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"df = pd.read_csv(\"many.columns.csv\")\n", | |
"df.shape" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "MWu2m1a4odrd", | |
"outputId": "bfd00e2a-e565-4b15-e875-475646188e55" | |
}, | |
"execution_count": 13, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"(0, 200000)" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 13 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"df" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 109 | |
}, | |
"id": "Kl9LfOX0qWxQ", | |
"outputId": "7b334d25-ed2a-45ca-ce4e-ceedb70d4c91" | |
}, | |
"execution_count": 14, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"Empty DataFrame\n", | |
"Columns: [col_00001, col_00002, col_00003, col_00004, col_00005, col_00006, col_00007, col_00008, col_00009, col_00010, col_00011, col_00012, col_00013, col_00014, col_00015, col_00016, col_00017, col_00018, col_00019, col_00020, col_00021, col_00022, col_00023, col_00024, col_00025, col_00026, col_00027, col_00028, col_00029, col_00030, col_00031, col_00032, col_00033, col_00034, col_00035, col_00036, col_00037, col_00038, col_00039, col_00040, col_00041, col_00042, col_00043, col_00044, col_00045, col_00046, col_00047, col_00048, col_00049, col_00050, col_00051, col_00052, col_00053, col_00054, col_00055, col_00056, col_00057, col_00058, col_00059, col_00060, col_00061, col_00062, col_00063, col_00064, col_00065, col_00066, col_00067, col_00068, col_00069, col_00070, col_00071, col_00072, col_00073, col_00074, col_00075, col_00076, col_00077, col_00078, col_00079, col_00080, col_00081, col_00082, col_00083, col_00084, col_00085, col_00086, col_00087, col_00088, col_00089, col_00090, col_00091, col_00092, col_00093, col_00094, col_00095, col_00096, col_00097, col_00098, col_00099, col_00100, ...]\n", | |
"Index: []\n", | |
"\n", | |
"[0 rows x 200000 columns]" | |
], | |
"text/html": [ | |
"\n", | |
" <div id=\"df-1430ba39-c118-4c12-9f74-ce6b355dd151\" class=\"colab-df-container\">\n", | |
" <div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>col_00001</th>\n", | |
" <th>col_00002</th>\n", | |
" <th>col_00003</th>\n", | |
" <th>col_00004</th>\n", | |
" <th>col_00005</th>\n", | |
" <th>col_00006</th>\n", | |
" <th>col_00007</th>\n", | |
" <th>col_00008</th>\n", | |
" <th>col_00009</th>\n", | |
" <th>col_00010</th>\n", | |
" <th>...</th>\n", | |
" <th>col_199991</th>\n", | |
" <th>col_199992</th>\n", | |
" <th>col_199993</th>\n", | |
" <th>col_199994</th>\n", | |
" <th>col_199995</th>\n", | |
" <th>col_199996</th>\n", | |
" <th>col_199997</th>\n", | |
" <th>col_199998</th>\n", | |
" <th>col_199999</th>\n", | |
" <th>col_200000</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>0 rows × 200000 columns</p>\n", | |
"</div>\n", | |
" <div class=\"colab-df-buttons\">\n", | |
"\n", | |
" <div class=\"colab-df-container\">\n", | |
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-1430ba39-c118-4c12-9f74-ce6b355dd151')\"\n", | |
" title=\"Convert this dataframe to an interactive table.\"\n", | |
" style=\"display:none;\">\n", | |
"\n", | |
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n", | |
" <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n", | |
" </svg>\n", | |
" </button>\n", | |
"\n", | |
" <style>\n", | |
" .colab-df-container {\n", | |
" display:flex;\n", | |
" gap: 12px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert {\n", | |
" background-color: #E8F0FE;\n", | |
" border: none;\n", | |
" border-radius: 50%;\n", | |
" cursor: pointer;\n", | |
" display: none;\n", | |
" fill: #1967D2;\n", | |
" height: 32px;\n", | |
" padding: 0 0 0 0;\n", | |
" width: 32px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert:hover {\n", | |
" background-color: #E2EBFA;\n", | |
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
" fill: #174EA6;\n", | |
" }\n", | |
"\n", | |
" .colab-df-buttons div {\n", | |
" margin-bottom: 4px;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert {\n", | |
" background-color: #3B4455;\n", | |
" fill: #D2E3FC;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert:hover {\n", | |
" background-color: #434B5C;\n", | |
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", | |
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", | |
" fill: #FFFFFF;\n", | |
" }\n", | |
" </style>\n", | |
"\n", | |
" <script>\n", | |
" const buttonEl =\n", | |
" document.querySelector('#df-1430ba39-c118-4c12-9f74-ce6b355dd151 button.colab-df-convert');\n", | |
" buttonEl.style.display =\n", | |
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
"\n", | |
" async function convertToInteractive(key) {\n", | |
" const element = document.querySelector('#df-1430ba39-c118-4c12-9f74-ce6b355dd151');\n", | |
" const dataTable =\n", | |
" await google.colab.kernel.invokeFunction('convertToInteractive',\n", | |
" [key], {});\n", | |
" if (!dataTable) return;\n", | |
"\n", | |
" const docLinkHtml = 'Like what you see? Visit the ' +\n", | |
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", | |
" + ' to learn more about interactive tables.';\n", | |
" element.innerHTML = '';\n", | |
" dataTable['output_type'] = 'display_data';\n", | |
" await google.colab.output.renderOutput(dataTable, element);\n", | |
" const docLink = document.createElement('div');\n", | |
" docLink.innerHTML = docLinkHtml;\n", | |
" element.appendChild(docLink);\n", | |
" }\n", | |
" </script>\n", | |
" </div>\n", | |
"\n", | |
"\n", | |
" <div id=\"id_59cbfc54-2ff1-462c-b9c8-3ab00ee358d7\">\n", | |
" <style>\n", | |
" .colab-df-generate {\n", | |
" background-color: #E8F0FE;\n", | |
" border: none;\n", | |
" border-radius: 50%;\n", | |
" cursor: pointer;\n", | |
" display: none;\n", | |
" fill: #1967D2;\n", | |
" height: 32px;\n", | |
" padding: 0 0 0 0;\n", | |
" width: 32px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-generate:hover {\n", | |
" background-color: #E2EBFA;\n", | |
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
" fill: #174EA6;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-generate {\n", | |
" background-color: #3B4455;\n", | |
" fill: #D2E3FC;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-generate:hover {\n", | |
" background-color: #434B5C;\n", | |
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", | |
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", | |
" fill: #FFFFFF;\n", | |
" }\n", | |
" </style>\n", | |
" <button class=\"colab-df-generate\" onclick=\"generateWithVariable('df')\"\n", | |
" title=\"Generate code using this dataframe.\"\n", | |
" style=\"display:none;\">\n", | |
"\n", | |
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", | |
" width=\"24px\">\n", | |
" <path d=\"M7,19H8.4L18.45,9,17,7.55,7,17.6ZM5,21V16.75L18.45,3.32a2,2,0,0,1,2.83,0l1.4,1.43a1.91,1.91,0,0,1,.58,1.4,1.91,1.91,0,0,1-.58,1.4L9.25,21ZM18.45,9,17,7.55Zm-12,3A5.31,5.31,0,0,0,4.9,8.1,5.31,5.31,0,0,0,1,6.5,5.31,5.31,0,0,0,4.9,4.9,5.31,5.31,0,0,0,6.5,1,5.31,5.31,0,0,0,8.1,4.9,5.31,5.31,0,0,0,12,6.5,5.46,5.46,0,0,0,6.5,12Z\"/>\n", | |
" </svg>\n", | |
" </button>\n", | |
" <script>\n", | |
" (() => {\n", | |
" const buttonEl =\n", | |
" document.querySelector('#id_59cbfc54-2ff1-462c-b9c8-3ab00ee358d7 button.colab-df-generate');\n", | |
" buttonEl.style.display =\n", | |
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
"\n", | |
" buttonEl.onclick = () => {\n", | |
" google.colab.notebook.generateWithVariable('df');\n", | |
" }\n", | |
" })();\n", | |
" </script>\n", | |
" </div>\n", | |
"\n", | |
" </div>\n", | |
" </div>\n" | |
], | |
"application/vnd.google.colaboratory.intrinsic+json": { | |
"type": "dataframe", | |
"variable_name": "df" | |
} | |
}, | |
"metadata": {}, | |
"execution_count": 14 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [], | |
"metadata": { | |
"id": "14mNIoJTqjs3" | |
}, | |
"execution_count": 14, | |
"outputs": [] | |
} | |
], | |
"metadata": { | |
"colab": { | |
"name": "many-columns.ipynb", | |
"provenance": [], | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"display_name": "Python 3", | |
"name": "python3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment