Created
March 15, 2017 06:42
-
-
Save hemel-cse/87bba0eedaa1c7c5e39847966a4a72ee to your computer and use it in GitHub Desktop.
Extracting links from http://www.datacenter9.com/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:ec2be44cba00f0802d61dd91c0ff6f24b3a03e0f5c28912bb29ad4c495471a89" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import mechanize\n", | |
"import cookielib\n", | |
"from bs4 import BeautifulSoup as bs\n", | |
"import html2text\n", | |
"from time import sleep\n", | |
"import re\n", | |
"\n", | |
"br = mechanize.Browser()\n", | |
"\n", | |
"cj = cookielib.LWPCookieJar()\n", | |
"\n", | |
"br.set_cookiejar(cj)\n", | |
"\n", | |
"br.set_handle_equiv(True)\n", | |
"\n", | |
"br.set_handle_gzip(True)\n", | |
"\n", | |
"br.set_handle_redirect(True)\n", | |
"\n", | |
"br.set_handle_referer(True)\n", | |
"\n", | |
"br.set_handle_robots(False)\n", | |
"\n", | |
"br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)\n", | |
"\n", | |
"br.addheaders = [('User-agent', 'Chrome')]\n", | |
"\n", | |
"bsite = br.open('http://www.datacenter9.com/datacenters/united-states').read()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stderr", | |
"text": [ | |
"-c:16: UserWarning: gzip transfer encoding is experimental!\n" | |
] | |
} | |
], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"sp = bs(bsite)\n", | |
"\n", | |
"table = sp.find('table', {'class': 'datacenter_tab'});\n", | |
"\n", | |
"rows = table.findAll('tr')\n", | |
"\n", | |
"\n", | |
"li = []\n", | |
"final_list = []\n", | |
"in_list = []" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stderr", | |
"text": [ | |
"/home/hemel/.local/lib/python2.7/site-packages/bs4/__init__.py:181: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system (\"lxml\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n", | |
"\n", | |
"The code that caused this warning is on line 1 of the file <string>. To get rid of this warning, change code that looks like this:\n", | |
"\n", | |
" BeautifulSoup([your markup])\n", | |
"\n", | |
"to this:\n", | |
"\n", | |
" BeautifulSoup([your markup], \"lxml\")\n", | |
"\n", | |
" markup_type=markup_type))\n" | |
] | |
} | |
], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"for tr in rows:\n", | |
" for td in tr.findAll(\"a\", {\"href\": True}):\n", | |
" li.append(\"http://www.datacenter9.com\"+td[\"href\"])\n", | |
"for tr in rows:\n", | |
" for td in tr.findAll(\"a\", {\"href\": True}):\n", | |
" in_list.append(td.text)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def linfunc(lnk):\n", | |
" bbb = br.open(lnk).read()\n", | |
" nnn = bs(bbb)\n", | |
" tablech = nnn.find('table', {'class': 'datacenter_tab_light'});\n", | |
" rowsch = tablech.findAll('tr')\n", | |
" for tr in rowsch:\n", | |
" for td in tr.findAll(\"a\", {\"href\": True}):\n", | |
" final_list.append(td.text)\n", | |
" return\n", | |
"\n", | |
"for i in li:\n", | |
" linfunc(i)\n", | |
" sleep(1)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"for i in in_list:\n", | |
" print i\n", | |
"print \"\\n\\n\"\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"for i in final_list:\n", | |
" print i" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 6 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment