Created
December 17, 2016 00:41
-
-
Save divergentdave/d4664a642b55593ee486c3c0545f4774 to your computer and use it in GitHub Desktop.
Deduplicated Cisco Umbrella 1 Million
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Setup and data sources (run these shell commands in the notebook's working directory first):\n", | |
"```\n", | |
"git clone https://github.com/publicsuffix/list.git\n", | |
"pip install publicsuffix\n", | |
"wget http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip\n", | |
"unzip top-1m.csv.zip\n", | |
"```" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import codecs\n", | |
"import pandas\n", | |
"import publicsuffix" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"psl_file = codecs.open(\"list/public_suffix_list.dat\", encoding=\"utf-8\")\n", | |
"psl = publicsuffix.PublicSuffixList(psl_file)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>rank</th>\n", | |
" <th>domain</th>\n", | |
" <th>public_suffix</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>com</td>\n", | |
" <td>com</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2</td>\n", | |
" <td>net</td>\n", | |
" <td>net</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>3</td>\n", | |
" <td>google.com</td>\n", | |
" <td>google.com</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>4</td>\n", | |
" <td>org</td>\n", | |
" <td>org</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>5</td>\n", | |
" <td>microsoft.com</td>\n", | |
" <td>microsoft.com</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>6</td>\n", | |
" <td>googleapis.com</td>\n", | |
" <td>googleapis.com</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>7</td>\n", | |
" <td>facebook.com</td>\n", | |
" <td>facebook.com</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>9</td>\n", | |
" <td>doubleclick.net</td>\n", | |
" <td>doubleclick.net</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>13</td>\n", | |
" <td>google-analytics.com</td>\n", | |
" <td>google-analytics.com</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>15</td>\n", | |
" <td>youtube.com</td>\n", | |
" <td>youtube.com</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>16</td>\n", | |
" <td>apple.com</td>\n", | |
" <td>apple.com</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>17</td>\n", | |
" <td>fbcdn.net</td>\n", | |
" <td>fbcdn.net</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>18</td>\n", | |
" <td>www.googleapis.com</td>\n", | |
" <td>www.googleapis.com</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>20</td>\n", | |
" <td>googlesyndication.com</td>\n", | |
" <td>googlesyndication.com</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15</th>\n", | |
" <td>22</td>\n", | |
" <td>amazonaws.com</td>\n", | |
" <td>amazonaws.com</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>16</th>\n", | |
" <td>23</td>\n", | |
" <td>googleadservices.com</td>\n", | |
" <td>googleadservices.com</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>17</th>\n", | |
" <td>31</td>\n", | |
" <td>live.com</td>\n", | |
" <td>live.com</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>18</th>\n", | |
" <td>33</td>\n", | |
" <td>googleusercontent.com</td>\n", | |
" <td>googleusercontent.com</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>19</th>\n", | |
" <td>34</td>\n", | |
" <td>fonts.googleapis.com</td>\n", | |
" <td>fonts.googleapis.com</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>20</th>\n", | |
" <td>40</td>\n", | |
" <td>yahoo.com</td>\n", | |
" <td>yahoo.com</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" rank domain public_suffix\n", | |
"1 1 com com\n", | |
"2 2 net net\n", | |
"3 3 google.com google.com\n", | |
"4 4 org org\n", | |
"5 5 microsoft.com microsoft.com\n", | |
"6 6 googleapis.com googleapis.com\n", | |
"7 7 facebook.com facebook.com\n", | |
"8 9 doubleclick.net doubleclick.net\n", | |
"9 13 google-analytics.com google-analytics.com\n", | |
"10 15 youtube.com youtube.com\n", | |
"11 16 apple.com apple.com\n", | |
"12 17 fbcdn.net fbcdn.net\n", | |
"13 18 www.googleapis.com www.googleapis.com\n", | |
"14 20 googlesyndication.com googlesyndication.com\n", | |
"15 22 amazonaws.com amazonaws.com\n", | |
"16 23 googleadservices.com googleadservices.com\n", | |
"17 31 live.com live.com\n", | |
"18 33 googleusercontent.com googleusercontent.com\n", | |
"19 34 fonts.googleapis.com fonts.googleapis.com\n", | |
"20 40 yahoo.com yahoo.com" | |
] | |
}, | |
"execution_count": 24, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df = pandas.read_csv(\"top-1m.csv\", names=[\"rank\", \"domain\"])\n", | |
"df[\"public_suffix\"] = df.domain.map(psl.get_public_suffix)\n", | |
"df = df.drop_duplicates(subset=\"public_suffix\", keep=\"first\")\n", | |
"df = df.reset_index(drop=True)\n", | |
"df.index += 1\n", | |
"df.head(20)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>rank</th>\n", | |
" <th>domain</th>\n", | |
" <th>public_suffix</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>299489</th>\n", | |
" <td>999967</td>\n", | |
" <td>bodekandrhodes.com</td>\n", | |
" <td>bodekandrhodes.com</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>299490</th>\n", | |
" <td>999969</td>\n", | |
" <td>bohme.com</td>\n", | |
" <td>bohme.com</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>299491</th>\n", | |
" <td>999971</td>\n", | |
" <td>boho-web-468027117.ap-southeast-1.elb.amazonaw...</td>\n", | |
" <td>boho-web-468027117.ap-southeast-1.elb.amazonaw...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>299492</th>\n", | |
" <td>999973</td>\n", | |
" <td>boldytours.nl</td>\n", | |
" <td>boldytours.nl</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>299493</th>\n", | |
" <td>999974</td>\n", | |
" <td>bombounowa.com</td>\n", | |
" <td>bombounowa.com</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>299494</th>\n", | |
" <td>999975</td>\n", | |
" <td>bonbone.ru</td>\n", | |
" <td>bonbone.ru</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>299495</th>\n", | |
" <td>999976</td>\n", | |
" <td>bondara.co.uk</td>\n", | |
" <td>bondara.co.uk</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>299496</th>\n", | |
" <td>999977</td>\n", | |
" <td>bongdas.net</td>\n", | |
" <td>bongdas.net</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>299497</th>\n", | |
" <td>999978</td>\n", | |
" <td>bonprixitalia.net</td>\n", | |
" <td>bonprixitalia.net</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>299498</th>\n", | |
" <td>999979</td>\n", | |
" <td>bonsaiempire.com</td>\n", | |
" <td>bonsaiempire.com</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>299499</th>\n", | |
" <td>999986</td>\n", | |
" <td>bootstrapworld.org</td>\n", | |
" <td>bootstrapworld.org</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>299500</th>\n", | |
" <td>999987</td>\n", | |
" <td>bossard.com</td>\n", | |
" <td>bossard.com</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>299501</th>\n", | |
" <td>999988</td>\n", | |
" <td>bostonprivate.com</td>\n", | |
" <td>bostonprivate.com</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>299502</th>\n", | |
" <td>999990</td>\n", | |
" <td>bourgsaintmaurice.fr</td>\n", | |
" <td>bourgsaintmaurice.fr</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>299503</th>\n", | |
" <td>999992</td>\n", | |
" <td>bowshrine.com</td>\n", | |
" <td>bowshrine.com</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>299504</th>\n", | |
" <td>999993</td>\n", | |
" <td>boxingforum24.com</td>\n", | |
" <td>boxingforum24.com</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>299505</th>\n", | |
" <td>999996</td>\n", | |
" <td>boxsets.com.ar</td>\n", | |
" <td>boxsets.com.ar</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>299506</th>\n", | |
" <td>999997</td>\n", | |
" <td>boxx.ca</td>\n", | |
" <td>boxx.ca</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>299507</th>\n", | |
" <td>999999</td>\n", | |
" <td>boydsgunstocks.com</td>\n", | |
" <td>boydsgunstocks.com</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>299508</th>\n", | |
" <td>1000000</td>\n", | |
" <td>boyself.com</td>\n", | |
" <td>boyself.com</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" rank domain \\\n", | |
"299489 999967 bodekandrhodes.com \n", | |
"299490 999969 bohme.com \n", | |
"299491 999971 boho-web-468027117.ap-southeast-1.elb.amazonaw... \n", | |
"299492 999973 boldytours.nl \n", | |
"299493 999974 bombounowa.com \n", | |
"299494 999975 bonbone.ru \n", | |
"299495 999976 bondara.co.uk \n", | |
"299496 999977 bongdas.net \n", | |
"299497 999978 bonprixitalia.net \n", | |
"299498 999979 bonsaiempire.com \n", | |
"299499 999986 bootstrapworld.org \n", | |
"299500 999987 bossard.com \n", | |
"299501 999988 bostonprivate.com \n", | |
"299502 999990 bourgsaintmaurice.fr \n", | |
"299503 999992 bowshrine.com \n", | |
"299504 999993 boxingforum24.com \n", | |
"299505 999996 boxsets.com.ar \n", | |
"299506 999997 boxx.ca \n", | |
"299507 999999 boydsgunstocks.com \n", | |
"299508 1000000 boyself.com \n", | |
"\n", | |
" public_suffix \n", | |
"299489 bodekandrhodes.com \n", | |
"299490 bohme.com \n", | |
"299491 boho-web-468027117.ap-southeast-1.elb.amazonaw... \n", | |
"299492 boldytours.nl \n", | |
"299493 bombounowa.com \n", | |
"299494 bonbone.ru \n", | |
"299495 bondara.co.uk \n", | |
"299496 bongdas.net \n", | |
"299497 bonprixitalia.net \n", | |
"299498 bonsaiempire.com \n", | |
"299499 bootstrapworld.org \n", | |
"299500 bossard.com \n", | |
"299501 bostonprivate.com \n", | |
"299502 bourgsaintmaurice.fr \n", | |
"299503 bowshrine.com \n", | |
"299504 boxingforum24.com \n", | |
"299505 boxsets.com.ar \n", | |
"299506 boxx.ca \n", | |
"299507 boydsgunstocks.com \n", | |
"299508 boyself.com " | |
] | |
}, | |
"execution_count": 25, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.tail(20)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.9" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment