Last active
October 26, 2021 12:20
-
-
Save larsyencken/5404ebcea6262701ea329ef15325b9d9 to your computer and use it in GitHub Desktop.
OWID: Notebook exploring potential sharding options for a food explorer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"id": "776d27f9-587c-45d4-b710-84656c084e36", | |
"metadata": {}, | |
"source": [ | |
"# 2021-10-26 Daniel & Lars food exploring" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "f3eec933-cb42-41b4-bbfb-1391e3a3770b", | |
"metadata": {}, | |
"source": [ | |
"## Summary\n", | |
"\n", | |
"- How big is the current data in CSV form?\n", | |
" - 272 MB, but it needs auditing against Hannah's design again\n", | |
"- Is there a shard key we could use to shard the data into reasonable CSV files?\n", | |
" - Measures might be a reasonable shard key, but we'd need to try and see the maximum file size\n", | |
" - Any shard key we use will prevent us from doing charts which cover multiple shards" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "6ff7e167-2c96-47ee-9d92-6576c0541aac", | |
"metadata": {}, | |
"source": [ | |
"## Loading the data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "1cff3f82-401a-4d95-b2ea-2f9cbacf4b8d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df = pd.read_feather('hannah-2021-09-08-raw.feather')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "7b8c3837-5506-4b51-8788-bebb1eae742c", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>dataset_id</th>\n", | |
" <th>variable</th>\n", | |
" <th>year</th>\n", | |
" <th>entity</th>\n", | |
" <th>value</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>5010</td>\n", | |
" <td>New Food Balances - Alcohol, Non-Food - 2659 -...</td>\n", | |
" <td>2014</td>\n", | |
" <td>United Kingdom</td>\n", | |
" <td>132</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>5010</td>\n", | |
" <td>New Food Balances - Alcohol, Non-Food - 2659 -...</td>\n", | |
" <td>2015</td>\n", | |
" <td>United Kingdom</td>\n", | |
" <td>319</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>5010</td>\n", | |
" <td>New Food Balances - Alcohol, Non-Food - 2659 -...</td>\n", | |
" <td>2016</td>\n", | |
" <td>United Kingdom</td>\n", | |
" <td>200</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>5010</td>\n", | |
" <td>New Food Balances - Alcohol, Non-Food - 2659 -...</td>\n", | |
" <td>2017</td>\n", | |
" <td>United Kingdom</td>\n", | |
" <td>51</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5010</td>\n", | |
" <td>New Food Balances - Alcohol, Non-Food - 2659 -...</td>\n", | |
" <td>2014</td>\n", | |
" <td>Ireland</td>\n", | |
" <td>41</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" dataset_id variable year entity value\n", | |
"0 5010 New Food Balances - Alcohol, Non-Food - 2659 -... 2014 United Kingdom 132\n", | |
"1 5010 New Food Balances - Alcohol, Non-Food - 2659 -... 2015 United Kingdom 319\n", | |
"2 5010 New Food Balances - Alcohol, Non-Food - 2659 -... 2016 United Kingdom 200\n", | |
"3 5010 New Food Balances - Alcohol, Non-Food - 2659 -... 2017 United Kingdom 51\n", | |
"4 5010 New Food Balances - Alcohol, Non-Food - 2659 -... 2014 Ireland 41" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"id": "de0aa525-5adc-442c-b7a1-6fdc42d1778f", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array(['New Food Balances - Alcohol, Non-Food - 2659 - Domestic supply quantity - 5301 - 1000 tonnes',\n", | |
" 'New Food Balances - Alcohol, Non-Food - 2659 - Domestic supply quantity - 5301 - 1000 tonnes',\n", | |
" 'New Food Balances - Alcohol, Non-Food - 2659 - Domestic supply quantity - 5301 - 1000 tonnes'],\n", | |
" dtype=object)" | |
] | |
}, | |
"execution_count": 18, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"examples = df.variable.values[:3]\n", | |
"examples" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "1c6faaaa-1a1d-4cd5-bbbc-1e7306fcff4a", | |
"metadata": {}, | |
"source": [ | |
"## Reconstruct the dimensions" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"id": "9403c498-97fe-4b16-92e5-2f0a0288917d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import parse" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"id": "5ad992c4-7198-4119-aee3-99b70095ba4f", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"pattern = '{dataset} - {food_type} - {food_type_id} - {measure} - {measure_id} - {unit}'\n", | |
"\n", | |
"parser = parse.compile(pattern)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"id": "38993e1b-b9c3-4bf4-91ea-2bbb8fbd7771", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"rows = df.variable.apply(lambda v: parser.parse(v).named)\n", | |
"unpacked_vars = pd.DataFrame.from_records(rows)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"id": "ba7ae2c8-a4c9-47fb-b856-a6c2aec8b3d3", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>dataset</th>\n", | |
" <th>food_type</th>\n", | |
" <th>food_type_id</th>\n", | |
" <th>measure</th>\n", | |
" <th>measure_id</th>\n", | |
" <th>unit</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>New Food Balances</td>\n", | |
" <td>Alcohol, Non-Food</td>\n", | |
" <td>2659</td>\n", | |
" <td>Domestic supply quantity</td>\n", | |
" <td>5301</td>\n", | |
" <td>1000 tonnes</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>New Food Balances</td>\n", | |
" <td>Alcohol, Non-Food</td>\n", | |
" <td>2659</td>\n", | |
" <td>Domestic supply quantity</td>\n", | |
" <td>5301</td>\n", | |
" <td>1000 tonnes</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>New Food Balances</td>\n", | |
" <td>Alcohol, Non-Food</td>\n", | |
" <td>2659</td>\n", | |
" <td>Domestic supply quantity</td>\n", | |
" <td>5301</td>\n", | |
" <td>1000 tonnes</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>New Food Balances</td>\n", | |
" <td>Alcohol, Non-Food</td>\n", | |
" <td>2659</td>\n", | |
" <td>Domestic supply quantity</td>\n", | |
" <td>5301</td>\n", | |
" <td>1000 tonnes</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>New Food Balances</td>\n", | |
" <td>Alcohol, Non-Food</td>\n", | |
" <td>2659</td>\n", | |
" <td>Domestic supply quantity</td>\n", | |
" <td>5301</td>\n", | |
" <td>1000 tonnes</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" dataset food_type food_type_id measure measure_id unit\n", | |
"0 New Food Balances Alcohol, Non-Food 2659 Domestic supply quantity 5301 1000 tonnes\n", | |
"1 New Food Balances Alcohol, Non-Food 2659 Domestic supply quantity 5301 1000 tonnes\n", | |
"2 New Food Balances Alcohol, Non-Food 2659 Domestic supply quantity 5301 1000 tonnes\n", | |
"3 New Food Balances Alcohol, Non-Food 2659 Domestic supply quantity 5301 1000 tonnes\n", | |
"4 New Food Balances Alcohol, Non-Food 2659 Domestic supply quantity 5301 1000 tonnes" | |
] | |
}, | |
"execution_count": 27, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"unpacked_vars.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"id": "d7153859-8082-4639-8c7b-534453c11f38", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"for col in ['dataset', 'food_type', 'measure', 'unit']:\n", | |
" unpacked_vars[col] = unpacked_vars[col].astype('category')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 58, | |
"id": "884e01ab-f6a9-47da-83b4-29fa20655609", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# the concat doesn't work right if the index isn't reset\n", | |
"df.reset_index(drop=True, inplace=True)\n", | |
"comb = pd.concat([df, unpacked_vars], axis=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 59, | |
"id": "3d2f7a37-294c-4353-b4c3-0ac38b879152", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>dataset_id</th>\n", | |
" <th>variable</th>\n", | |
" <th>year</th>\n", | |
" <th>entity</th>\n", | |
" <th>value</th>\n", | |
" <th>dataset</th>\n", | |
" <th>food_type</th>\n", | |
" <th>food_type_id</th>\n", | |
" <th>measure</th>\n", | |
" <th>measure_id</th>\n", | |
" <th>unit</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>5010</td>\n", | |
" <td>New Food Balances - Alcohol, Non-Food - 2659 -...</td>\n", | |
" <td>2014</td>\n", | |
" <td>United Kingdom</td>\n", | |
" <td>132</td>\n", | |
" <td>New Food Balances</td>\n", | |
" <td>Alcohol, Non-Food</td>\n", | |
" <td>2659</td>\n", | |
" <td>Domestic supply quantity</td>\n", | |
" <td>5301</td>\n", | |
" <td>1000 tonnes</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>5010</td>\n", | |
" <td>New Food Balances - Alcohol, Non-Food - 2659 -...</td>\n", | |
" <td>2015</td>\n", | |
" <td>United Kingdom</td>\n", | |
" <td>319</td>\n", | |
" <td>New Food Balances</td>\n", | |
" <td>Alcohol, Non-Food</td>\n", | |
" <td>2659</td>\n", | |
" <td>Domestic supply quantity</td>\n", | |
" <td>5301</td>\n", | |
" <td>1000 tonnes</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>5010</td>\n", | |
" <td>New Food Balances - Alcohol, Non-Food - 2659 -...</td>\n", | |
" <td>2016</td>\n", | |
" <td>United Kingdom</td>\n", | |
" <td>200</td>\n", | |
" <td>New Food Balances</td>\n", | |
" <td>Alcohol, Non-Food</td>\n", | |
" <td>2659</td>\n", | |
" <td>Domestic supply quantity</td>\n", | |
" <td>5301</td>\n", | |
" <td>1000 tonnes</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>5010</td>\n", | |
" <td>New Food Balances - Alcohol, Non-Food - 2659 -...</td>\n", | |
" <td>2017</td>\n", | |
" <td>United Kingdom</td>\n", | |
" <td>51</td>\n", | |
" <td>New Food Balances</td>\n", | |
" <td>Alcohol, Non-Food</td>\n", | |
" <td>2659</td>\n", | |
" <td>Domestic supply quantity</td>\n", | |
" <td>5301</td>\n", | |
" <td>1000 tonnes</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5010</td>\n", | |
" <td>New Food Balances - Alcohol, Non-Food - 2659 -...</td>\n", | |
" <td>2014</td>\n", | |
" <td>Ireland</td>\n", | |
" <td>41</td>\n", | |
" <td>New Food Balances</td>\n", | |
" <td>Alcohol, Non-Food</td>\n", | |
" <td>2659</td>\n", | |
" <td>Domestic supply quantity</td>\n", | |
" <td>5301</td>\n", | |
" <td>1000 tonnes</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" dataset_id variable year entity value dataset food_type food_type_id measure measure_id unit\n", | |
"0 5010 New Food Balances - Alcohol, Non-Food - 2659 -... 2014 United Kingdom 132 New Food Balances Alcohol, Non-Food 2659 Domestic supply quantity 5301 1000 tonnes\n", | |
"1 5010 New Food Balances - Alcohol, Non-Food - 2659 -... 2015 United Kingdom 319 New Food Balances Alcohol, Non-Food 2659 Domestic supply quantity 5301 1000 tonnes\n", | |
"2 5010 New Food Balances - Alcohol, Non-Food - 2659 -... 2016 United Kingdom 200 New Food Balances Alcohol, Non-Food 2659 Domestic supply quantity 5301 1000 tonnes\n", | |
"3 5010 New Food Balances - Alcohol, Non-Food - 2659 -... 2017 United Kingdom 51 New Food Balances Alcohol, Non-Food 2659 Domestic supply quantity 5301 1000 tonnes\n", | |
"4 5010 New Food Balances - Alcohol, Non-Food - 2659 -... 2014 Ireland 41 New Food Balances Alcohol, Non-Food 2659 Domestic supply quantity 5301 1000 tonnes" | |
] | |
}, | |
"execution_count": 59, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"comb.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 60, | |
"id": "6544e56a-91a9-49e5-a7ee-5d62460be6ff", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"comb.drop(columns=['dataset_id', 'variable', 'food_type_id', 'measure_id'], inplace=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 64, | |
"id": "f51eb269-66b4-4164-96da-237f142bad75", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>entity</th>\n", | |
" <th>year</th>\n", | |
" <th>food_type</th>\n", | |
" <th>measure</th>\n", | |
" <th>unit</th>\n", | |
" <th>value</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>United Kingdom</td>\n", | |
" <td>2014</td>\n", | |
" <td>Alcohol, Non-Food</td>\n", | |
" <td>Domestic supply quantity</td>\n", | |
" <td>1000 tonnes</td>\n", | |
" <td>132</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>United Kingdom</td>\n", | |
" <td>2015</td>\n", | |
" <td>Alcohol, Non-Food</td>\n", | |
" <td>Domestic supply quantity</td>\n", | |
" <td>1000 tonnes</td>\n", | |
" <td>319</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>United Kingdom</td>\n", | |
" <td>2016</td>\n", | |
" <td>Alcohol, Non-Food</td>\n", | |
" <td>Domestic supply quantity</td>\n", | |
" <td>1000 tonnes</td>\n", | |
" <td>200</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>United Kingdom</td>\n", | |
" <td>2017</td>\n", | |
" <td>Alcohol, Non-Food</td>\n", | |
" <td>Domestic supply quantity</td>\n", | |
" <td>1000 tonnes</td>\n", | |
" <td>51</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>Ireland</td>\n", | |
" <td>2014</td>\n", | |
" <td>Alcohol, Non-Food</td>\n", | |
" <td>Domestic supply quantity</td>\n", | |
" <td>1000 tonnes</td>\n", | |
" <td>41</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" entity year food_type measure unit value\n", | |
"0 United Kingdom 2014 Alcohol, Non-Food Domestic supply quantity 1000 tonnes 132\n", | |
"1 United Kingdom 2015 Alcohol, Non-Food Domestic supply quantity 1000 tonnes 319\n", | |
"2 United Kingdom 2016 Alcohol, Non-Food Domestic supply quantity 1000 tonnes 200\n", | |
"3 United Kingdom 2017 Alcohol, Non-Food Domestic supply quantity 1000 tonnes 51\n", | |
"4 Ireland 2014 Alcohol, Non-Food Domestic supply quantity 1000 tonnes 41" | |
] | |
}, | |
"execution_count": 64, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"comb = comb[['entity', 'year', 'food_type', 'measure', 'unit', 'value']]\n", | |
"comb.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 65, | |
"id": "e9fb417c-7a5f-44a8-b24e-c427bc714727", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"4544207" | |
] | |
}, | |
"execution_count": 65, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(comb)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 66, | |
"id": "39f0ef00-f9e0-4587-aa0d-7db3f6f67903", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"comb.to_csv('2021-10-26-food-explorer-full.csv', index=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 74, | |
"id": "54e55301-48d4-4f14-be18-1b68338f1370", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"272M\t2021-10-26-food-explorer-full.csv\n" | |
] | |
} | |
], | |
"source": [ | |
"!du -hs 2021-10-26-food-explorer-full.csv" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "68c40ebe-b98b-45d0-a7e7-1a042f31d58a", | |
"metadata": {}, | |
"source": [ | |
"## Cardinality" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 72, | |
"id": "c0e5fd1c-ea73-49dc-9f28-3743e2064d89", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"entity 260\n", | |
"year 58\n", | |
"food_type 363\n", | |
"measure 30\n", | |
"unit 33\n", | |
"value 713379\n", | |
"dtype: int64" | |
] | |
}, | |
"execution_count": 72, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pd.Series({col: len(comb[col].unique()) for col in comb.columns})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 75, | |
"id": "c267341d-ec34-47fe-b388-7fb15c97e37d", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"tonnes 1274140\n", | |
"ha 753579\n", | |
"hg/ha 749532\n", | |
"1000 tonnes 643157\n", | |
"Head 294089\n", | |
"hg/An 226901\n", | |
"g/capita/day 141216\n", | |
"1000 Head 118803\n", | |
"0.1g/An 76910\n", | |
"kcal/capita/day 70360\n", | |
"kg 69872\n", | |
"100mg/An 28902\n", | |
"1000 No 16822\n", | |
"No 16388\n", | |
"hg 12479\n", | |
"5301 - 1000 tonnes 3440\n", | |
"5142 - 1000 tonnes 3438\n", | |
"645 - kg 3438\n", | |
"664 - kcal/capita/day 3435\n", | |
"684 - g/capita/day 3435\n", | |
"674 - g/capita/day 3435\n", | |
"5611 - 1000 tonnes 3430\n", | |
"5170 - 1000 tonnes 3404\n", | |
"5072 - 1000 tonnes 3276\n", | |
"5911 - 1000 tonnes 3160\n", | |
"5511 - 1000 tonnes 3107\n", | |
"5123 - 1000 tonnes 2945\n", | |
"5521 - 1000 tonnes 2768\n", | |
"5131 - 1000 tonnes 2551\n", | |
"5154 - 1000 tonnes 2217\n", | |
"5171 - 1000 tonnes 1667\n", | |
"5527 - 1000 tonnes 1223\n", | |
"511 - 1000 persons 688\n", | |
"Name: unit, dtype: int64" | |
] | |
}, | |
"execution_count": 75, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"comb.unit.value_counts()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "ecaa11aa-4a78-4109-aab0-c21cfd6f3f27", | |
"metadata": {}, | |
"source": [ | |
"^ not all units got parsed correctly" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 77, | |
"id": "2588fde3-1e51-4dd2-934f-c5543e53d7ef", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Production 1445546\n", | |
"Yield 872292\n", | |
"Area harvested 753579\n", | |
"Producing Animals/Slaughtered 242946\n", | |
"Yield/Carcass Weight 222432\n", | |
"Domestic supply quantity 76013\n", | |
"Import Quantity 72754\n", | |
"Fat supply quantity (g/capita/day) 70619\n", | |
"Protein supply quantity (g/capita/day) 70597\n", | |
"Food supply (kcal/capita/day) 70360\n", | |
"Food 69980\n", | |
"Food supply quantity (kg/capita/yr) 69872\n", | |
"Residuals 64819\n", | |
"Export Quantity 61236\n", | |
"Stock Variation 60970\n", | |
"Milk Animals 41695\n", | |
"Other uses (non-food) 36875\n", | |
"Losses 36338\n", | |
"Feed 30676\n", | |
"Tourist consumption 30186\n", | |
"Laying 28904\n", | |
"Processing 26427\n", | |
"Seed 21646\n", | |
"Prod Popultn 16388\n", | |
"2905 11200\n", | |
"2848 10163\n", | |
"2948 10163\n", | |
"2919 9925\n", | |
"2560 8918\n", | |
"Total Population 688\n", | |
"Name: measure, dtype: int64" | |
] | |
}, | |
"execution_count": 77, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"comb.measure.value_counts()" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.7" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment