Skip to content

Instantly share code, notes, and snippets.

@hemel-cse
Created March 15, 2017 06:42
Show Gist options
  • Save hemel-cse/87bba0eedaa1c7c5e39847966a4a72ee to your computer and use it in GitHub Desktop.
Save hemel-cse/87bba0eedaa1c7c5e39847966a4a72ee to your computer and use it in GitHub Desktop.
Extracting links from http://www.datacenter9.com/
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "",
"signature": "sha256:ec2be44cba00f0802d61dd91c0ff6f24b3a03e0f5c28912bb29ad4c495471a89"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import mechanize\n",
"import cookielib\n",
"from bs4 import BeautifulSoup as bs\n",
"import html2text\n",
"from time import sleep\n",
"import re\n",
"\n",
"# One mechanize browser is shared by every request in this notebook, with\n",
"# an LWP cookie jar attached to it.\n",
"br = mechanize.Browser()\n",
"cj = cookielib.LWPCookieJar()\n",
"br.set_cookiejar(cj)\n",
"\n",
"# Optional response handling. Enabling gzip is what makes mechanize print\n",
"# its \"gzip transfer encoding is experimental\" UserWarning.\n",
"br.set_handle_equiv(True)\n",
"br.set_handle_gzip(True)\n",
"br.set_handle_redirect(True)\n",
"br.set_handle_referer(True)\n",
"\n",
"# robots.txt handling is switched off.\n",
"br.set_handle_robots(False)\n",
"\n",
"# NOTE(review): max_time=1 presumably limits followed refreshes to those\n",
"# with a delay of at most one second -- confirm against mechanize docs.\n",
"br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)\n",
"\n",
"br.addheaders = [('User-agent', 'Chrome')]\n",
"\n",
"# Raw HTML of the United States listing page; parsed in the next cell.\n",
"bsite = br.open('http://www.datacenter9.com/datacenters/united-states').read()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stderr",
"text": [
"-c:16: UserWarning: gzip transfer encoding is experimental!\n"
]
}
],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Name the parser explicitly. Without it bs4 picks whichever parser is\n",
"# installed (lxml on the machine that produced the recorded output) and\n",
"# emits a UserWarning; pinning it keeps results the same across systems.\n",
"sp = bs(bsite, \"lxml\")\n",
"\n",
"# Rows of the table with class 'datacenter_tab' on the listing page.\n",
"table = sp.find('table', {'class': 'datacenter_tab'})\n",
"rows = table.findAll('tr')\n",
"\n",
"# Accumulators filled by the following cells.\n",
"li = []          # absolute URLs built from the links in `rows`\n",
"final_list = []  # link texts collected from each page listed in `li`\n",
"in_list = []     # link texts taken from `rows` itself"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stderr",
"text": [
"/home/hemel/.local/lib/python2.7/site-packages/bs4/__init__.py:181: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system (\"lxml\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n",
"\n",
"The code that caused this warning is on line 1 of the file <string>. To get rid of this warning, change code that looks like this:\n",
"\n",
" BeautifulSoup([your markup])\n",
"\n",
"to this:\n",
"\n",
" BeautifulSoup([your markup], \"lxml\")\n",
"\n",
" markup_type=markup_type))\n"
]
}
],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from urlparse import urljoin\n",
"\n",
"# One pass over the rows instead of two identical ones: every link adds its\n",
"# absolute URL to `li` and its visible text to `in_list`, so the two lists\n",
"# stay index-aligned.\n",
"for tr in rows:\n",
"    for anchor in tr.findAll(\"a\", {\"href\": True}):\n",
"        # urljoin yields the same URL as concatenation for hrefs starting\n",
"        # with '/', and also handles hrefs that are already absolute or\n",
"        # lack the leading slash, which concatenation would mangle.\n",
"        li.append(urljoin(\"http://www.datacenter9.com\", anchor[\"href\"]))\n",
"        in_list.append(anchor.text)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def linfunc(lnk):\n",
"    \"\"\"Fetch the page at `lnk` and record the text of its table links.\n",
"\n",
"    The page is opened through the shared browser `br`; the text of every\n",
"    link (an <a> with an href) inside the table with class\n",
"    'datacenter_tab_light' is appended to the global `final_list`.\n",
"\n",
"    A page without that table is skipped with a warning rather than\n",
"    aborting the whole crawl with an AttributeError on None.\n",
"\n",
"    Returns None.\n",
"    \"\"\"\n",
"    import warnings\n",
"\n",
"    # Explicit parser: avoids the bs4 'no parser specified' warning on\n",
"    # every call and keeps parsing consistent across machines.\n",
"    page = bs(br.open(lnk).read(), \"lxml\")\n",
"    tablech = page.find('table', {'class': 'datacenter_tab_light'})\n",
"    if tablech is None:\n",
"        warnings.warn(\"no 'datacenter_tab_light' table found at %s\" % lnk)\n",
"        return\n",
"    for tr in tablech.findAll('tr'):\n",
"        for anchor in tr.findAll(\"a\", {\"href\": True}):\n",
"            final_list.append(anchor.text)\n",
"\n",
"# Visit every collected URL in order, sleeping one second after each call.\n",
"for page_url in li:\n",
"    linfunc(page_url)\n",
"    sleep(1)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Print each link text gathered from the listing page, then blank lines.\n",
"for link_text in in_list:\n",
"    print link_text\n",
"print \"\\n\\n\"\n"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Print each link text gathered from the individual pages.\n",
"for link_text in final_list:\n",
"    print link_text"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment