Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save jrjames83/07d4174ca10e5f8bdbbac41ab90ccd6c to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"import sys\n",
"import os\n",
"import xml.etree.ElementTree as ET\n",
"from pathlib import Path\n",
"import functools\n",
"\n",
"pd.set_option('display.float_format', lambda x: '%.3f' % x)\n",
"\n",
"def get_child_to_parent():\n",
" categoriesFilename = 'categories_0001_abcat0010000_to_pcmcat99300050000.xml'\n",
" root_category_id = 'cat00000'\n",
" tree = ET.parse(categoriesFilename)\n",
" root = tree.getroot()\n",
"\n",
" categories = []\n",
" parents = []\n",
" for child in root:\n",
" id = child.find('id').text\n",
" cat_path = child.find('path')\n",
" cat_path_ids = [cat.find('id').text for cat in cat_path]\n",
" leaf_id = cat_path_ids[-1]\n",
" if leaf_id != root_category_id:\n",
" categories.append(leaf_id)\n",
" parents.append(cat_path_ids[-2])\n",
" parents_df = pd.DataFrame(list(zip(categories, parents)), columns =['category', 'parent'])\n",
" child_to_parent = parents_df.set_index('category')\n",
" return child_to_parent\n",
"\n",
"\n",
"def get_cat_lookup(max_depth=10):\n",
" categoriesFilename = 'categories_0001_abcat0010000_to_pcmcat99300050000.xml'\n",
" tree = ET.parse(categoriesFilename)\n",
" root = tree.getroot() \n",
" catDict = {}\n",
" for child in root:\n",
" catPath = child.find('path')\n",
" leafCat = catPath[-1].find('id').text\n",
" catPathStr = ''\n",
" depth = 0\n",
" for cat in catPath:\n",
" if catPathStr != '':\n",
" catPathStr = catPathStr + ' > '\n",
" catPathStr = catPathStr + cat.find('name').text\n",
" depth = depth + 1\n",
" if max_depth > 0 and depth == max_depth:\n",
" break\n",
" catDict[leafCat] = catPathStr\n",
" return catDict\n",
" \n",
"def get_node_parent(cat_id: str, mapping_df: pd.Series):\n",
" \"\"\"\n",
" Used to grab a node's parent category\n",
" \"\"\"\n",
" try:\n",
" parent_category = mapping_df.loc[cat_id].parent\n",
" return parent_category\n",
" except KeyError:\n",
" # must already be top of the food chain\n",
" return cat_id\n",
" \n",
"\n",
"@functools.lru_cache(maxsize=None)\n",
"def get_ultimate_parent(child_category):\n",
" \"\"\"\n",
" walk back up to the first ancestor that's not the taxonomy root\n",
" \"\"\"\n",
" parent_found = False\n",
" parent = child_category\n",
" while not parent_found:\n",
" try:\n",
" # Don't walk back up to the root\n",
" if child_to_parent.loc[parent].parent == 'cat00000':\n",
" return parent\n",
" else:\n",
" parent = child_to_parent.loc[parent].parent\n",
" except KeyError:\n",
" return parent \n",
" \n",
"child_to_parent = get_child_to_parent() \n",
"cat_lookup = get_cat_lookup()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1854998, 6)\n",
"Leaf Counts Description:\n",
"count 1854998.000\n",
"mean 34415.460\n",
"std 52692.034\n",
"min 1.000\n",
"25% 2329.000\n",
"50% 8192.000\n",
"75% 39682.000\n",
"max 177638.000\n",
"Name: leaf_counts, dtype: float64\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>query</th>\n",
" <th>click_time</th>\n",
" <th>path</th>\n",
" <th>path_length</th>\n",
" <th>leaf_counts</th>\n",
" <th>ultimate_parent</th>\n",
" </tr>\n",
" <tr>\n",
" <th>category</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>pcmcat209000050007</th>\n",
" <td>Ipad 2 64gb</td>\n",
" <td>2011-10-08 12:18:48.669</td>\n",
" <td>Best Buy &gt; Computers &amp; Tablets &gt; Tablets &amp; iPa...</td>\n",
" <td>4</td>\n",
" <td>25132</td>\n",
" <td>abcat0500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>pcmcat214700050000</th>\n",
" <td>Otter box commuter</td>\n",
" <td>2011-10-21 06:27:59.82</td>\n",
" <td>Best Buy &gt; Mobile Phones &gt; Mobile Phone Access...</td>\n",
" <td>5</td>\n",
" <td>13161</td>\n",
" <td>abcat0800000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>cat02595</th>\n",
" <td>pirates</td>\n",
" <td>2011-10-19 23:38:04.298</td>\n",
" <td>Best Buy &gt; Movies &amp; Music &gt; Music &gt; Country &gt; ...</td>\n",
" <td>5</td>\n",
" <td>3431</td>\n",
" <td>abcat0600000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" query click_time \\\n",
"category \n",
"pcmcat209000050007 Ipad 2 64gb 2011-10-08 12:18:48.669 \n",
"pcmcat214700050000 Otter box commuter 2011-10-21 06:27:59.82 \n",
"cat02595 pirates 2011-10-19 23:38:04.298 \n",
"\n",
" path \\\n",
"category \n",
"pcmcat209000050007 Best Buy > Computers & Tablets > Tablets & iPa... \n",
"pcmcat214700050000 Best Buy > Mobile Phones > Mobile Phone Access... \n",
"cat02595 Best Buy > Movies & Music > Music > Country > ... \n",
"\n",
" path_length leaf_counts ultimate_parent \n",
"category \n",
"pcmcat209000050007 4 25132 abcat0500000 \n",
"pcmcat214700050000 5 13161 abcat0800000 \n",
"cat02595 5 3431 abcat0600000 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read our raw data in and get some category counts and other information\n",
"queries = pd.read_csv('train.csv')\n",
"queries['path'] = queries['category'].map(cat_lookup)\n",
"queries = queries.set_index('category').drop(['sku', 'user', 'query_time'], axis=1)\n",
"\n",
"# Drop a few rows that don't have a path\n",
"queries = queries[~queries.path.isna()].copy()\n",
"\n",
"# get the length of the path, so we can prune from the most terminal nodes upward, later on\n",
"queries['path_length'] = queries['path'].map(lambda x: len(x.split('>')))\n",
"\n",
"# Compute the existing frequency per Category\n",
"sizes = queries.groupby('category').size()\n",
"queries['leaf_counts'] = queries.index.map(sizes)\n",
"queries['ultimate_parent'] = queries.index.map(get_ultimate_parent)\n",
"print(queries.shape)\n",
"print('Leaf Counts Description:')\n",
"print(queries.leaf_counts.describe())\n",
"queries.sample(3)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"solving for 200 minimum query threshold\n",
"1486 is the unique remaining categories\n",
"1484 is the unique remaining categories\n",
"solving for 200 minimum query threshold\n",
"1484 is the unique remaining categories\n",
"1467 is the unique remaining categories\n",
"solving for 200 minimum query threshold\n",
"1467 is the unique remaining categories\n",
"1288 is the unique remaining categories\n",
"solving for 200 minimum query threshold\n",
"1288 is the unique remaining categories\n",
"993 is the unique remaining categories\n",
"solving for 200 minimum query threshold\n",
"993 is the unique remaining categories\n",
"833 is the unique remaining categories\n",
"solving for 200 minimum query threshold\n",
"833 is the unique remaining categories\n",
"755 is the unique remaining categories\n",
"solving for 200 minimum query threshold\n",
"755 is the unique remaining categories\n",
"752 is the unique remaining categories\n",
"solving for 200 minimum query threshold\n",
"752 is the unique remaining categories\n",
"28.570548057556152\n",
"-------\n",
"solving for 500 minimum query threshold\n",
"1486 is the unique remaining categories\n",
"1484 is the unique remaining categories\n",
"solving for 500 minimum query threshold\n",
"1484 is the unique remaining categories\n",
"1459 is the unique remaining categories\n",
"solving for 500 minimum query threshold\n",
"1459 is the unique remaining categories\n",
"1262 is the unique remaining categories\n",
"solving for 500 minimum query threshold\n",
"1262 is the unique remaining categories\n",
"885 is the unique remaining categories\n",
"solving for 500 minimum query threshold\n",
"885 is the unique remaining categories\n",
"642 is the unique remaining categories\n",
"solving for 500 minimum query threshold\n",
"642 is the unique remaining categories\n",
"564 is the unique remaining categories\n",
"solving for 500 minimum query threshold\n",
"564 is the unique remaining categories\n",
"559 is the unique remaining categories\n",
"solving for 500 minimum query threshold\n",
"559 is the unique remaining categories\n",
"41.14575910568237\n",
"-------\n",
"solving for 1000 minimum query threshold\n",
"1486 is the unique remaining categories\n",
"1483 is the unique remaining categories\n",
"solving for 1000 minimum query threshold\n",
"1483 is the unique remaining categories\n",
"1455 is the unique remaining categories\n",
"solving for 1000 minimum query threshold\n",
"1455 is the unique remaining categories\n",
"1238 is the unique remaining categories\n",
"solving for 1000 minimum query threshold\n",
"1238 is the unique remaining categories\n",
"797 is the unique remaining categories\n",
"solving for 1000 minimum query threshold\n",
"797 is the unique remaining categories\n",
"503 is the unique remaining categories\n",
"solving for 1000 minimum query threshold\n",
"503 is the unique remaining categories\n",
"410 is the unique remaining categories\n",
"solving for 1000 minimum query threshold\n",
"410 is the unique remaining categories\n",
"406 is the unique remaining categories\n",
"solving for 1000 minimum query threshold\n",
"406 is the unique remaining categories\n",
"66.01959586143494\n",
"-------\n"
]
}
],
"source": [
"import time\n",
"\n",
"for MIN_QUERY in [200, 500, 1_000]:\n",
" revised_queries = queries.copy() # to avoid reloading the data!\n",
" start = time.time()\n",
" solved = False\n",
" while not solved:\n",
" print(f'solving for {MIN_QUERY} minimum query threshold')\n",
" print(revised_queries.index.nunique(), 'is the unique remaining categories')\n",
" # Get the rows having leaf_counts associated with their category lower than the threshold\n",
" leaves_to_rollup = revised_queries.query('leaf_counts < @MIN_QUERY').copy() \n",
" if leaves_to_rollup.shape[0] == 0:\n",
" solved = True\n",
" break\n",
" # get the other which consist of queries mapped to categories with enough labels already\n",
" leaves_with_enough = revised_queries.query('leaf_counts >= @MIN_QUERY').copy() \n",
"\n",
" # for our leaves in need of rolling upward, find the true terminal nodes using their path length\n",
" longest_path_length = leaves_to_rollup.path_length.max()\n",
"\n",
" # Get the terminal leaves\n",
" to_prune = leaves_to_rollup.query('path_length == @longest_path_length').copy()\n",
" # Don't forget the non-terminal leaves, too\n",
" to_not_prune = leaves_to_rollup.query('path_length != @longest_path_length').copy()\n",
"\n",
" # grab the parent category using the current and a mapping we made in the first cell \"child_to_parent\"\n",
" to_prune['parent_category'] = to_prune.index.map(lambda x: get_node_parent(x, child_to_parent))\n",
"\n",
" # update our index\n",
" to_prune.set_index('parent_category', inplace=True)\n",
" to_prune.index.names = ['category']\n",
"\n",
" # Update our path and path lengths\n",
" to_prune['updated_path'] = to_prune.index.map(lambda x: cat_lookup[x] )\n",
" to_prune['updated_path_length'] = to_prune['updated_path'].map(lambda x: len(x.split('>')))\n",
"\n",
" # drop the old path lengths and path columns and rename the updated ones in their place\n",
" to_prune.drop(['path_length', 'path'], axis=1, inplace=True)\n",
" to_prune.rename(columns={'updated_path':'path', 'updated_path_length':'path_length'}, inplace=True)\n",
"\n",
" # combine our updated rows with our unaffected rows\n",
" revised_queries = pd.concat([leaves_with_enough, to_prune, to_not_prune], axis=0, sort=False)\n",
"\n",
" # Recompute category frequencies and reset the leaf_counts column on our combined data\n",
" sizes = revised_queries.groupby(revised_queries.index).size()\n",
" revised_queries['leaf_counts'] = revised_queries.index.map(sizes)\n",
"\n",
" # Some assertions\n",
"# assert revised_queries.leaf_counts.min() >= MIN_QUERY, f'Something went awry here'\n",
" print(revised_queries.index.nunique(), 'is the unique remaining categories')\n",
" end = time.time()\n",
" print(end - start)\n",
" print('-------')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>query</th>\n",
" </tr>\n",
" <tr>\n",
" <th>path</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Best Buy &gt; Movies &amp; Music &gt; Music &gt; Gospel &amp; Christian</th>\n",
" <td>[olevia, The devil wears prada, leandria johnson, Mary mary, j moss]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Best Buy &gt; Appliances &gt; Refrigerators &gt; Beverage Coolers &amp; Dispensers</th>\n",
" <td>[wine cooler, wine, wine cooler, wine cooler, Mini fridge]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Best Buy &gt; Mobile Phones &gt; Mobile Broadband &gt; Mobile Broadband Devices with Plans</th>\n",
" <td>[internet connection, mobile Hotspot, wireless modem, wireless modem, motherboard]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Best Buy &gt; Mobile Phones &gt; Mobile Phone Accessories &gt; iPhone Accessories</th>\n",
" <td>[nano, nano, Subwoofer, prince, subwoofer]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Best Buy &gt; Computers &amp; Tablets &gt; Laptop &amp; Netbook Computers &gt; MacBooks</th>\n",
" <td>[MacBook pro, Apple notebook, MacBook, Mac book pro power adaptors, Macbook pro]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Best Buy &gt; Audio &amp; MP3 &gt; Headphones &gt; Over-Ear &amp; On-Ear Headphones</th>\n",
" <td>[dr dre beats, skullcandy studio headphones, Beats by dr.dre earphones, skullcandy, headphones]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Best Buy &gt; Video Games &gt; Wii &gt; Wii Accessories &gt; Wii Accessory Kits &amp; Bundles</th>\n",
" <td>[wii system, wii console, wii console, Skylanders, wii]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Best Buy &gt; Cameras &amp; Camcorders &gt; Digital Cameras &gt; Point &amp; Shoot Cameras &gt; Waterproof Cameras</th>\n",
" <td>[Waterproof, Pentax, waterproof camera, shock proof camera, Olympus stylus tough]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Best Buy &gt; Audio &amp; MP3 &gt; Car Audio &gt; Car Accessories &gt; iPod Accessories</th>\n",
" <td>[Cassette tape adaptors, Car radio, i pods, Ipod ford, Cassette adapter]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Best Buy &gt; Video Games</th>\n",
" <td>[12v 1.5a adapter, lord of the rings, adapter, Psp 3000, starcraft]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Best Buy &gt; Video Games &gt; Accessories &gt; Gaming Headsets</th>\n",
" <td>[xbox headset, Xbox Headset, starcraft, Mw3, Modern warfare]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Best Buy &gt; Musical Instruments &gt; Live Sound &amp; Microphones &gt; Microphones &amp; Accessories &gt; Microphones</th>\n",
" <td>[usb microphones, Mic, studio microphones, recording microphone, Microphone]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Best Buy &gt; Video Games &gt; PlayStation 3 &gt; PS3 Consoles</th>\n",
" <td>[ps3, Console PS3 game, ps3 console, ps3 console, Ps3 refurbished]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Best Buy &gt; Computers &amp; Tablets &gt; Computer Cards &amp; Components &gt; DVD/Blu-ray Drives</th>\n",
" <td>[usb flash drive, blueray, Roxio, guitar hero 3, internal blu ray]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Best Buy &gt; Audio &amp; MP3 &gt; Recording Equipment</th>\n",
" <td>[Zoom digital recorder, micro sd, digital recorder, Koss, Studio monitors]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Best Buy &gt; Computers &amp; Tablets &gt; Networking &amp; Wireless &gt; Wireless Networking &gt; Networking Accessories &gt; Network Cables</th>\n",
" <td>[Cat 5e, dvi-d, Ethernet cable, ethernet cable, Ethernet cables]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Best Buy &gt; Computers &amp; Tablets &gt; Mice &amp; Keyboards &gt; Keyboard &amp; Mouse Combos</th>\n",
" <td>[wireless keyboard, keyboard, tablet keyboards, LaborDay_Computers_20110902, LaborDay_VideoGames_20110902]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Best Buy &gt; Office &gt; Printers &gt; Inkjet Printers</th>\n",
" <td>[Printer, Printers, Hp 4500, matrix, wireless printer]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Best Buy &gt; Cameras &amp; Camcorders &gt; Digital Cameras &gt; Point &amp; Shoot Cameras &gt; Fun &amp; Basic Cameras</th>\n",
" <td>[canon g7, Sony Cybershot, Nikon coolpix l120, sd850, Nikon coolpix plum]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Best Buy &gt; Audio &amp; MP3 &gt; Home Audio &gt; Home Audio Accessories &gt; A/V Cables &amp; Connectors &gt; HDMI Cables</th>\n",
" <td>[monster cable, hdmi cable for ipod, Hdmi cable, Hdmi, Hdmi]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" query\n",
"path \n",
"Best Buy > Movies & Music > Music > Gospel & Christian [olevia, The devil wears prada, leandria johnson, Mary mary, j moss]\n",
"Best Buy > Appliances > Refrigerators > Beverage Coolers & Dispensers [wine cooler, wine, wine cooler, wine cooler, Mini fridge]\n",
"Best Buy > Mobile Phones > Mobile Broadband > Mobile Broadband Devices with Plans [internet connection, mobile Hotspot, wireless modem, wireless modem, motherboard]\n",
"Best Buy > Mobile Phones > Mobile Phone Accessories > iPhone Accessories [nano, nano, Subwoofer, prince, subwoofer]\n",
"Best Buy > Computers & Tablets > Laptop & Netbook Computers > MacBooks [MacBook pro, Apple notebook, MacBook, Mac book pro power adaptors, Macbook pro]\n",
"Best Buy > Audio & MP3 > Headphones > Over-Ear & On-Ear Headphones [dr dre beats, skullcandy studio headphones, Beats by dr.dre earphones, skullcandy, headphones]\n",
"Best Buy > Video Games > Wii > Wii Accessories > Wii Accessory Kits & Bundles [wii system, wii console, wii console, Skylanders, wii]\n",
"Best Buy > Cameras & Camcorders > Digital Cameras > Point & Shoot Cameras > Waterproof Cameras [Waterproof, Pentax, waterproof camera, shock proof camera, Olympus stylus tough]\n",
"Best Buy > Audio & MP3 > Car Audio > Car Accessories > iPod Accessories [Cassette tape adaptors, Car radio, i pods, Ipod ford, Cassette adapter]\n",
"Best Buy > Video Games [12v 1.5a adapter, lord of the rings, adapter, Psp 3000, starcraft]\n",
"Best Buy > Video Games > Accessories > Gaming Headsets [xbox headset, Xbox Headset, starcraft, Mw3, Modern warfare]\n",
"Best Buy > Musical Instruments > Live Sound & Microphones > Microphones & Accessories > Microphones [usb microphones, Mic, studio microphones, recording microphone, Microphone]\n",
"Best Buy > Video Games > PlayStation 3 > PS3 Consoles [ps3, Console PS3 game, ps3 console, ps3 console, Ps3 refurbished]\n",
"Best Buy > Computers & Tablets > Computer Cards & Components > DVD/Blu-ray Drives [usb flash drive, blueray, Roxio, guitar hero 3, internal blu ray]\n",
"Best Buy > Audio & MP3 > Recording Equipment [Zoom digital recorder, micro sd, digital recorder, Koss, Studio monitors]\n",
"Best Buy > Computers & Tablets > Networking & Wireless > Wireless Networking > Networking Accessories > Network Cables [Cat 5e, dvi-d, Ethernet cable, ethernet cable, Ethernet cables]\n",
"Best Buy > Computers & Tablets > Mice & Keyboards > Keyboard & Mouse Combos [wireless keyboard, keyboard, tablet keyboards, LaborDay_Computers_20110902, LaborDay_VideoGames_20110902]\n",
"Best Buy > Office > Printers > Inkjet Printers [Printer, Printers, Hp 4500, matrix, wireless printer]\n",
"Best Buy > Cameras & Camcorders > Digital Cameras > Point & Shoot Cameras > Fun & Basic Cameras [canon g7, Sony Cybershot, Nikon coolpix l120, sd850, Nikon coolpix plum]\n",
"Best Buy > Audio & MP3 > Home Audio > Home Audio Accessories > A/V Cables & Connectors > HDMI Cables [monster cable, hdmi cable for ipod, Hdmi cable, Hdmi, Hdmi]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import random\n",
"pd.set_option('display.max_colwidth', 500)\n",
"\n",
"# Does this look reasonable? Yepp\n",
"revised_queries.groupby('path').agg({\n",
" 'query': lambda x: random.sample(list(x), 5)\n",
"}).sample(20, random_state=39)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"queries"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment