Created
May 1, 2020 03:55
-
-
Save loisaidasam/40d166e2e647c5197be71cb4a89d4062 to your computer and use it in GitHub Desktop.
Story about numpy ndarrays with nested sequences
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Here's a story about numpy ndarrays with structured dtypes" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Let's whip up some sample data and a sample data type ..." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import json\n", | |
| "\n", | |
| "import numpy as np\n", | |
| "\n", | |
| "\n", | |
| "sample_data = [\n", | |
| " (\n", | |
| " \"Sam\",\n", | |
| " 'black',\n", | |
| " 27,\n", | |
| " [\n", | |
| " \"pizza\",\n", | |
| " \"sourdough bread\",\n", | |
| " \"tacos\",\n", | |
| " ],\n", | |
| " [\n", | |
| " (\"Wicked Weed\", \"Napolean Complex IPA\", 1, 4.5),\n", | |
| " (\"Sweetwater\", \"IPA\", 6, 4.0),\n", | |
| " (\"Budweiser\", \"Diesel\", 11, 3.0),\n", | |
| " ],\n", | |
| " ),\n", | |
| " (\n", | |
| " \"Ryan\",\n", | |
| " 'blue',\n", | |
| " 13,\n", | |
| " [\n", | |
| " \"fish fry\",\n", | |
| " \"pulled pork\",\n", | |
| " \"fried rice\",\n", | |
| " ],\n", | |
| " [\n", | |
| " (\"Arches\", \"Bohemian Pilsner\", 1, 5),\n", | |
| " (\"Monday Night\", \"I'm On A Boat\", 2, 4.5),\n", | |
| " (\"Corona\", \"Extra\", 0, 4),\n", | |
| " ],\n", | |
| " )\n", | |
| "]\n", | |
| "\n", | |
| "dtype = np.dtype([\n", | |
| " ('name', '<U32'),\n", | |
| " ('favorite_color', '<U10'),\n", | |
| " ('favorite_number', 'int32'),\n", | |
| " ('top_three_foods', '<U32', (3,)),\n", | |
| " (\n", | |
| " 'beer_in_fridge',\n", | |
| " [\n", | |
| " ('brewer', '<U32'),\n", | |
| " ('name', '<U32'),\n", | |
| " ('count', 'int32'),\n", | |
| " ('star_rating', 'float32'),\n", | |
| " ],\n", | |
| " (3,),\n", | |
| " ),\n", | |
| "])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Here's what our sample data looks like as plain Python objects (tuples and lists):" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[('Sam',\n", | |
| " 'black',\n", | |
| " 27,\n", | |
| " ['pizza', 'sourdough bread', 'tacos'],\n", | |
| " [('Wicked Weed', 'Napolean Complex IPA', 1, 4.5),\n", | |
| " ('Sweetwater', 'IPA', 6, 4.0),\n", | |
| " ('Budweiser', 'Diesel', 11, 3.0)]),\n", | |
| " ('Ryan',\n", | |
| " 'blue',\n", | |
| " 13,\n", | |
| " ['fish fry', 'pulled pork', 'fried rice'],\n", | |
| " [('Arches', 'Bohemian Pilsner', 1, 5),\n", | |
| " ('Monday Night', \"I'm On A Boat\", 2, 4.5),\n", | |
| " ('Corona', 'Extra', 0, 4)])]" | |
| ] | |
| }, | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "sample_data" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "And our dtype:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "dtype([('name', '<U32'), ('favorite_color', '<U10'), ('favorite_number', '<i4'), ('top_three_foods', '<U32', (3,)), ('beer_in_fridge', [('brewer', '<U32'), ('name', '<U32'), ('count', '<i4'), ('star_rating', '<f4')], (3,))])" | |
| ] | |
| }, | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "dtype" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Here's what the whole shebang looks like as a numpy array:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "array([('Sam', 'black', 27, ['pizza', 'sourdough bread', 'tacos'], [('Wicked Weed', 'Napolean Complex IPA', 1, 4.5), ('Sweetwater', 'IPA', 6, 4. ), ('Budweiser', 'Diesel', 11, 3. )]),\n", | |
| " ('Ryan', 'blue', 13, ['fish fry', 'pulled pork', 'fried rice'], [('Arches', 'Bohemian Pilsner', 1, 5. ), ('Monday Night', \"I'm On A Boat\", 2, 4.5), ('Corona', 'Extra', 0, 4. )])],\n", | |
| " dtype=[('name', '<U32'), ('favorite_color', '<U10'), ('favorite_number', '<i4'), ('top_three_foods', '<U32', (3,)), ('beer_in_fridge', [('brewer', '<U32'), ('name', '<U32'), ('count', '<i4'), ('star_rating', '<f4')], (3,))])" | |
| ] | |
| }, | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "people = np.array(sample_data, dtype=dtype)\n", | |
| "people" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Here's what one person looks like:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "('Sam', 'black', 27, ['pizza', 'sourdough bread', 'tacos'], [('Wicked Weed', 'Napolean Complex IPA', 1, 4.5), ('Sweetwater', 'IPA', 6, 4. ), ('Budweiser', 'Diesel', 11, 3. )])" | |
| ] | |
| }, | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "sam = people[0]\n", | |
| "sam" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Their favorite food:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'pizza'" | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "sam['top_three_foods'][0]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "A beer that's in their fridge and how they rated it:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "IPA by Sweetwater: 4.0 stars (****)\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Second beer in Sam's fridge, plus a row of asterisks for its rating.\n", | |
| "beer = sam['beer_in_fridge'][1]\n", | |
| "# One asterisk per whole star; int() drops any fractional part of the rating.\n", | |
| "stars = '*' * int(beer['star_rating'])\n", | |
| "print(\"%s by %s: %s stars (%s)\" % (beer['name'], beer['brewer'], beer['star_rating'], stars))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "OK, now let's manipulate the data a bit to see how rigid this structure is...\n", | |
| "\n", | |
| "Can we create an ndarray with just one person?" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Yep\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "try:\n", | |
| " np.array(sample_data[:1], dtype=dtype)\n", | |
| " print(\"Yep\")\n", | |
| "except Exception as exception:\n", | |
| " print(\"Nope: %s\" % exception)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Can we create a user who only has ONE favorite food (the dtype explicitly states 3)?" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Yep\n", | |
| "[('Brooke', 'pink', 22, ['brussel sprouts', 'brussel sprouts', 'brussel sprouts'], [('Wicked Weed', 'Napolean Complex IPA', 1, 4.5), ('Sweetwater', 'IPA', 6, 4. ), ('Budweiser', 'Diesel', 11, 3. )])]\n", | |
| "Hmmm, well that's bizarre, it seems to have padded the array with \"brussel sprouts\" three times ...\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "sample_data_weird_food_counts = [\n", | |
| " (\n", | |
| " \"Brooke\",\n", | |
| " 'pink',\n", | |
| " 22,\n", | |
| " [\n", | |
| " \"brussel sprouts\",\n", | |
| " ],\n", | |
| " [\n", | |
| " (\"Wicked Weed\", \"Napolean Complex IPA\", 1, 4.5),\n", | |
| " (\"Sweetwater\", \"IPA\", 6, 4.0),\n", | |
| " (\"Budweiser\", \"Diesel\", 11, 3.0),\n", | |
| " ],\n", | |
| " ),\n", | |
| "]\n", | |
| "try:\n", | |
| " people_one_food = np.array(sample_data_weird_food_counts, dtype=dtype)\n", | |
| " print(\"Yep\")\n", | |
| " print(people_one_food)\n", | |
| " print(\"Hmmm, well that's bizarre, it seems to have padded the array with \\\"brussel sprouts\\\" three times ...\")\n", | |
| "except Exception as exception:\n", | |
| " print(\"Nope: %s\" % exception)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "What if there are TWO fav foods (one too few)?" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "foods: ['brussel sprouts', 'hummus']\n", | |
| "Nope: cannot copy sequence with size 2 to array axis with dimension 3\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Assign the complete two-item list (rather than appending to whatever is\n", | |
| "# already there) so that re-running this cell always leaves exactly two foods.\n", | |
| "sample_data_weird_food_counts[0][3][:] = ['brussel sprouts', 'hummus']\n", | |
| "print(\"foods: %s\" % sample_data_weird_food_counts[0][3])\n", | |
| "try:\n", | |
| "    # The dtype declares 'top_three_foods' with shape (3,), so two items should not fit.\n", | |
| "    np.array(sample_data_weird_food_counts, dtype=dtype)\n", | |
| "    print(\"Yep\")\n", | |
| "except Exception as exception:\n", | |
| "    # Catching broadly on purpose: the error message itself is the demonstration.\n", | |
| "    print(\"Nope: %s\" % exception)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "OK. What if there are FOUR fav foods (one too many)?" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "foods: ['brussel sprouts', 'hummus', 'saltines', 'ketchup']\n", | |
| "Nope: cannot copy sequence with size 4 to array axis with dimension 3\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Assign the complete four-item list (rather than extending whatever is\n", | |
| "# already there) so the result does not depend on how often earlier cells ran.\n", | |
| "sample_data_weird_food_counts[0][3][:] = ['brussel sprouts', 'hummus', 'saltines', 'ketchup']\n", | |
| "print(\"foods: %s\" % sample_data_weird_food_counts[0][3])\n", | |
| "try:\n", | |
| "    # The dtype declares 'top_three_foods' with shape (3,), so four items should not fit.\n", | |
| "    np.array(sample_data_weird_food_counts, dtype=dtype)\n", | |
| "    print(\"Yep\")\n", | |
| "except Exception as exception:\n", | |
| "    # Catching broadly on purpose: the error message itself is the demonstration.\n", | |
| "    print(\"Nope: %s\" % exception)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Interesting! OK, now let's fox with the dtype a bit ...\n", | |
| "\n", | |
| "The dtype descr looks like this:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[('name', '<U32'),\n", | |
| " ('favorite_color', '<U10'),\n", | |
| " ('favorite_number', '<i4'),\n", | |
| " ('top_three_foods', '<U32', (3,)),\n", | |
| " ('beer_in_fridge',\n", | |
| " [('brewer', '<U32'),\n", | |
| " ('name', '<U32'),\n", | |
| " ('count', '<i4'),\n", | |
| " ('star_rating', '<f4')],\n", | |
| " (3,))]" | |
| ] | |
| }, | |
| "execution_count": 12, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "dtype.descr" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "And it looks like this when serialized to JSON:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'[[\"name\", \"<U32\"], [\"favorite_color\", \"<U10\"], [\"favorite_number\", \"<i4\"], [\"top_three_foods\", \"<U32\", [3]], [\"beer_in_fridge\", [[\"brewer\", \"<U32\"], [\"name\", \"<U32\"], [\"count\", \"<i4\"], [\"star_rating\", \"<f4\"]], [3]]]'" | |
| ] | |
| }, | |
| "execution_count": 13, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "dtype_descr_to_json = json.dumps(dtype.descr)\n", | |
| "dtype_descr_to_json" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "And here's what that looks like when deserialized from JSON (note all lists, no tuples):" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[['name', '<U32'],\n", | |
| " ['favorite_color', '<U10'],\n", | |
| " ['favorite_number', '<i4'],\n", | |
| " ['top_three_foods', '<U32', [3]],\n", | |
| " ['beer_in_fridge',\n", | |
| " [['brewer', '<U32'],\n", | |
| " ['name', '<U32'],\n", | |
| " ['count', '<i4'],\n", | |
| " ['star_rating', '<f4']],\n", | |
| " [3]]]" | |
| ] | |
| }, | |
| "execution_count": 14, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "dtype_descr_from_json = json.loads(dtype_descr_to_json)\n", | |
| "dtype_descr_from_json" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Here's a handy little recursive function I wrote for deserializing from JSON:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 15, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def deserialize_dtype_descr_json(dtype_descr_json):\n", | |
| "    \"\"\"Recursively convert a JSON-decoded dtype.descr back to list-of-tuples form.\n", | |
| "\n", | |
| "    JSON does not distinguish between lists and tuples, so a round trip\n", | |
| "    through json.dumps / json.loads turns every tuple into a list. A dtype\n", | |
| "    description, however, REQUIRES a list on the outside and tuples on the\n", | |
| "    inside.\n", | |
| "\n", | |
| "    Args:\n", | |
| "        dtype_descr_json: list of entries, each [name, format] or\n", | |
| "            [name, format, shape]. format is either a dtype string or a\n", | |
| "            nested list of entries; shape is a list of ints.\n", | |
| "\n", | |
| "    Returns:\n", | |
| "        A new list of tuples. The input is not modified.\n", | |
| "    \"\"\"\n", | |
| "    fields = []\n", | |
| "    for entry in dtype_descr_json:\n", | |
| "        # Work on a shallow copy so the caller's decoded JSON stays untouched.\n", | |
| "        field = list(entry)\n", | |
| "        if isinstance(field[1], list):\n", | |
| "            # The second element is the format of the entry; a list (of\n", | |
| "            # lists) here is a nested description and is converted recursively.\n", | |
| "            field[1] = deserialize_dtype_descr_json(field[1])\n", | |
| "        if len(field) >= 3 and isinstance(field[2], list):\n", | |
| "            # The third element is the shape; turn a decoded list back into\n", | |
| "            # a tuple for conformity with dtype.descr. Anything that is not\n", | |
| "            # a list is passed through unchanged.\n", | |
| "            field[2] = tuple(field[2])\n", | |
| "        fields.append(tuple(field))\n", | |
| "    return fields" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "And here's what that looks like after being deserialized:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[('name', '<U32'),\n", | |
| " ('favorite_color', '<U10'),\n", | |
| " ('favorite_number', '<i4'),\n", | |
| " ('top_three_foods', '<U32', (3,)),\n", | |
| " ('beer_in_fridge',\n", | |
| " [('brewer', '<U32'),\n", | |
| " ('name', '<U32'),\n", | |
| " ('count', '<i4'),\n", | |
| " ('star_rating', '<f4')],\n", | |
| " (3,))]" | |
| ] | |
| }, | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "dtype_descr_from_json_deserialized = deserialize_dtype_descr_json(dtype_descr_from_json)\n", | |
| "dtype_descr_from_json_deserialized" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Confirming that we can deserialize from JSON properly:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "True" | |
| ] | |
| }, | |
| "execution_count": 17, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "dtype.descr == dtype_descr_from_json_deserialized" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Confirming that using this deserialized dtype works properly:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 18, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "True" | |
| ] | |
| }, | |
| "execution_count": 18, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "people_from_deserialized_dtype_descr = np.array(sample_data,\n", | |
| " dtype=dtype_descr_from_json_deserialized)\n", | |
| "(people == people_from_deserialized_dtype_descr).all()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.8.0" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 4 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment