Simple crawler with pandas for tables in HTML
import cfscrape
from lxml import etree
import pandas as pd
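# Dependencies (assuming a standard pip setup; lxml also serves as pandas' HTML parser):
# pip install cfscrape lxml pandas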
url="<put_the_url_here>"
header = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9',
          'Accept-Encoding': 'gzip, deflate, sdch',
          'Accept-Language': 'nl-NL,nl;q=0.8,en-US;q=0.6,en;q=0.4',
          'Cache-Control': 'max-age=0',
          'Connection': 'keep-alive',
          'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.81 Safari/537.36'}
scraper = cfscrape.create_scraper()  # cfscrape wraps requests and solves Cloudflare's anti-bot challenge
scraped_html = scraper.get(url, headers=header).content  # the positional URL must come before the keyword argument
print(scraped_html.decode("utf-8"))
# SIMPLE CASE: THE CONTENT IS IN AN HTML <table>
tables = pd.read_html(scraped_html.decode("utf-8"))  # returns a list of every table on the page
tables[0]
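# Optional filtering sketch: pd.read_html can narrow the result with a `match` regex
# on the table text and an `attrs` dict on the <table> tag. The values below
# ('Price', 'data-table') are made-up examples, not taken from any real page:
# tables = pd.read_html(scraped_html.decode("utf-8"), match='Price', attrs={'class': 'data-table'})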
# MORE COMPLICATED CASE: THE CONTENT IS SOMEWHERE ELSE IN THE HTML
html = etree.HTML(scraped_html)
elements = html.xpath("//div[@class='col-xs-10']")
rows = []  # collect one dict per scraped element, then build the DataFrame once at the end
for element in elements:
    title = element.xpath("div[@class='row']/h1[@class='col-xs-12 col-sm-8']/a/text()")[0]
    url_moreinfo = 'https://aws.amazon.com/' + element.xpath("div[@class='row']/h1[@class='col-xs-12 col-sm-8']/a/@href")[0]
    # SECOND-LEVEL SCRAPER: follow each link and extract the description from the detail page
    detail_html = scraper.get(url_moreinfo, headers=header).content
    detail_tree = etree.HTML(detail_html)
    description = detail_tree.xpath("//div[@class='sidebar-box']/p/text()")[0]
    rows.append({'title': title,
                 'url_moreinfo': url_moreinfo,
                 'description': description})
df = pd.DataFrame(rows)
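# A natural last step is persisting the result; the filename below is only an example.
df.to_csv('scraped_results.csv', index=False)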