Created
August 23, 2021 11:34
-
-
Save larsyencken/a7aa5b8f00ec69895f1165bec0a26f4a to your computer and use it in GitHub Desktop.
OWID: Exploring metadata size in SQLite
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "id": "f028245c-4082-4de8-ab6d-8b829e8338d5", | |
| "metadata": {}, | |
| "source": [ | |
| "# Metadata in Sqlite" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "55408391-ce6c-4ba3-86f2-89f17d5fa1e0", | |
| "metadata": {}, | |
| "source": [ | |
| "_How big would a Sqlite index of all our variables be?_" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "407e547a-5a7f-4e43-87c0-f2acdaed95af", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import sqlite3" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "739a1764-cab0-4bc0-ba7a-c360fc12ff84", | |
| "metadata": {}, | |
| "source": [ | |
| "## First cut" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "8f6476ed-95be-4657-a829-7be651fe8e9b", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "FILENAME = 'metadata-example.db'" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "4dce08bf-90b9-496c-aebc-588798b4a2a0", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "%%mysql\n", | |
| "\n", | |
| "select\n", | |
| " v.id var_id,\n", | |
| " v.name var_name,\n", | |
| " v.description var_desc,\n", | |
| " d.name dataset_name,\n", | |
| " d.description dataset_desc,\n", | |
| " n.name namespace,\n", | |
| " n.description namespace_desc,\n", | |
| " v.createdAt created\n", | |
| "from variables v\n", | |
| "inner join datasets d on (v.datasetId = d.id)\n", | |
| "inner join namespaces n on (n.name = d.namespace)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "id": "e186ddd2-1cfd-4623-ba22-a93779aeef55", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "!rm -f {FILENAME}" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "id": "9043347d-fff4-469a-bd19-7debd120bca4", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "db = sqlite3.connect(FILENAME, isolation_level=None)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "id": "8e02b0a2-5c94-4e74-9657-b80aafffbd92", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "cursor = db.cursor()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "id": "5347056f-5e35-4977-9c46-b9e31915ccc2", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>var_id</th>\n", | |
| " <th>var_name</th>\n", | |
| " <th>var_desc</th>\n", | |
| " <th>dataset_name</th>\n", | |
| " <th>dataset_desc</th>\n", | |
| " <th>namespace</th>\n", | |
| " <th>namespace_desc</th>\n", | |
| " <th>created</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>19</td>\n", | |
| " <td>Suicide rate (WHO (2005))</td>\n", | |
| " <td></td>\n", | |
| " <td>Age-adjusted suicide rates, 1950-2005 - WHO (2...</td>\n", | |
| " <td></td>\n", | |
| " <td>owid</td>\n", | |
| " <td>Our World in Data [data uploaded by the OWID-t...</td>\n", | |
| " <td>2015-06-30 08:06:47</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>356</td>\n", | |
| " <td>Male-Female Ratio of Suicide Rate (WHO (2005))</td>\n", | |
| " <td></td>\n", | |
| " <td>Age-adjusted suicide rates, 1950-2005 - WHO (2...</td>\n", | |
| " <td></td>\n", | |
| " <td>owid</td>\n", | |
| " <td>Our World in Data [data uploaded by the OWID-t...</td>\n", | |
| " <td>2016-01-01 00:00:00</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>47</td>\n", | |
| " <td>Female suicide rate (WHO (2005))</td>\n", | |
| " <td></td>\n", | |
| " <td>Age-adjusted female suicide rates, 1950-2005 -...</td>\n", | |
| " <td></td>\n", | |
| " <td>owid</td>\n", | |
| " <td>Our World in Data [data uploaded by the OWID-t...</td>\n", | |
| " <td>2015-07-16 13:51:55</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>48</td>\n", | |
| " <td>Male suicide rate (WHO (2005))</td>\n", | |
| " <td></td>\n", | |
| " <td>Age-adjusted male suicide rates, 1950-2005 - W...</td>\n", | |
| " <td></td>\n", | |
| " <td>owid</td>\n", | |
| " <td>Our World in Data [data uploaded by the OWID-t...</td>\n", | |
| " <td>2015-07-16 13:54:10</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>27</td>\n", | |
| " <td>Human Height (University of Tuebingen (2015))</td>\n", | |
| " <td>Human height measured in centimetres.</td>\n", | |
| " <td>Human Height - University of Tuebingen (2015)</td>\n", | |
| " <td></td>\n", | |
| " <td>owid</td>\n", | |
| " <td>Our World in Data [data uploaded by the OWID-t...</td>\n", | |
| " <td>2015-07-02 06:55:54</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " var_id var_name \\\n", | |
| "0 19 Suicide rate (WHO (2005)) \n", | |
| "1 356 Male-Female Ratio of Suicide Rate (WHO (2005)) \n", | |
| "2 47 Female suicide rate (WHO (2005)) \n", | |
| "3 48 Male suicide rate (WHO (2005)) \n", | |
| "4 27 Human Height (University of Tuebingen (2015)) \n", | |
| "\n", | |
| " var_desc \\\n", | |
| "0 \n", | |
| "1 \n", | |
| "2 \n", | |
| "3 \n", | |
| "4 Human height measured in centimetres. \n", | |
| "\n", | |
| " dataset_name dataset_desc namespace \\\n", | |
| "0 Age-adjusted suicide rates, 1950-2005 - WHO (2... owid \n", | |
| "1 Age-adjusted suicide rates, 1950-2005 - WHO (2... owid \n", | |
| "2 Age-adjusted female suicide rates, 1950-2005 -... owid \n", | |
| "3 Age-adjusted male suicide rates, 1950-2005 - W... owid \n", | |
| "4 Human Height - University of Tuebingen (2015) owid \n", | |
| "\n", | |
| " namespace_desc created \n", | |
| "0 Our World in Data [data uploaded by the OWID-t... 2015-06-30 08:06:47 \n", | |
| "1 Our World in Data [data uploaded by the OWID-t... 2016-01-01 00:00:00 \n", | |
| "2 Our World in Data [data uploaded by the OWID-t... 2015-07-16 13:51:55 \n", | |
| "3 Our World in Data [data uploaded by the OWID-t... 2015-07-16 13:54:10 \n", | |
| "4 Our World in Data [data uploaded by the OWID-t... 2015-07-02 06:55:54 " | |
| ] | |
| }, | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "id": "43b5a946-6cd6-44b4-8bc4-7d4619cb0578", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df.to_sql('variables', db)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "id": "a1cc734c-f5ea-4e76-9ca3-a5537e774140", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| " 96M\tmetadata-example.db\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "!du -hs {FILENAME}" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "id": "0e1069e0-7fc7-48fc-881c-247c80f6ac74", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "db.close()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "11d70bce-a37b-4582-99df-0da47e816e04", | |
| "metadata": {}, | |
| "source": [ | |
| "## Second version" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 31, | |
| "id": "366a0fcb-d5e2-4584-96b4-b8ff508f068c", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "filename_v2 = f'{FILENAME}-v2'" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 32, | |
| "id": "796e5d5e-8dea-42b2-834b-730edf3f0d57", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "zsh:1: no matches found: metadata-example.db-v2*\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "!rm -f {filename_v2}*" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 33, | |
| "id": "9acce20f-69bb-4189-95b6-0177f6b9cb8d", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "with sqlite3.connect(filename_v2) as db:\n", | |
| " df[['var_id', 'var_name']].to_sql('variables', db)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 34, | |
| "id": "e6c84fe5-444b-419b-8191-eaf7399c0096", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| " 11M\tmetadata-example.db-v2\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "!du -hs {filename_v2}" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 35, | |
| "id": "ed922160-9d98-43d1-8acc-12f578c29586", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "102029" | |
| ] | |
| }, | |
| "execution_count": 35, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "len(df)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 36, | |
| "id": "a5b4d8cc-641b-4186-ad64-6a8bf2d98258", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "490.0" | |
| ] | |
| }, | |
| "execution_count": 36, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "49e6/100000" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 37, | |
| "id": "6e943065-a15b-4a59-8188-818d9cbf9b0e", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "!gzip -k {filename_v2}" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 38, | |
| "id": "e8c197ba-04e4-4658-9c0a-a646149ae32c", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "2.4M\tmetadata-example.db-v2.gz\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "!du -hs {filename_v2}.gz" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "2e59ae7e-4ac8-44f4-9c02-9d96d7dc7177", | |
| "metadata": {}, | |
| "source": [ | |
| "## Which variables are actually used" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 43, | |
| "id": "ce6e8edd-bc34-4e0c-a863-45d7b313cc7a", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>count(*)</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>96153</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " count(*)\n", | |
| "0 96153" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "%%mysql -d\n", | |
| "\n", | |
| "select\n", | |
| " v.id,\n", | |
| " count(*) points\n", | |
| "from variables v\n", | |
| "left join data_values dv ON (dv.variableId = v.id)\n", | |
| "WHERE\n", | |
| " dv.year is null" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "59787a5f-7001-44f5-bcb2-a037562ff41e", | |
| "metadata": {}, | |
| "source": [ | |
| "96k are not used at all, leaving c. 6k \"active\" in charts" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "1e0aad4a-7733-4936-84e8-7750f241211b", | |
| "metadata": {}, | |
| "source": [ | |
| "## Conclusion" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "0251bedb-8ef6-457e-b6fb-75b5623cbbd3", | |
| "metadata": {}, | |
| "source": [ | |
| "- We could do this for the admin, but not for every user\n", | |
| "- We could do just the active variables, unless you \"opt in\" to the full set" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.9.4" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment