Simple crawler with pandas for tables in HTML
import cfscrape
from lxml import etree
import pandas as pd
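# Dependencies (assuming a standard pip setup; lxml also serves as pandas' HTML parser):
# pip install cfscrape lxml pandas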
url="<put_the_url_here>"
header = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9',
          'Accept-Encoding': 'gzip, deflate, sdch',
          'Accept-Language': 'nl-NL,nl;q=0.8,en-US;q=0.6,en;q=0.4',
          'Cache-Control': 'max-age=0',
          'Connection': 'keep-alive',
          'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.81 Safari/537.36'}
scraper = cfscrape.create_scraper()  # cfscrape wraps requests and solves Cloudflare's anti-bot challenge
scraped_html = scraper.get(url, headers=header).content  # the positional URL must come before the keyword argument
print(scraped_html.decode("utf-8"))
# SIMPLE CASE: THE CONTENT IS IN AN HTML <table>
tables = pd.read_html(scraped_html.decode("utf-8"))  # returns a list of every table on the page
tables[0]
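# Optional filtering sketch: pd.read_html can narrow the result with a `match` regex
# on the table text and an `attrs` dict on the <table> tag. The values below
# ('Price', 'data-table') are made-up examples, not taken from any real page:
# tables = pd.read_html(scraped_html.decode("utf-8"), match='Price', attrs={'class': 'data-table'})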
# MORE COMPLICATED CASE: THE CONTENT IS SOMEWHERE ELSE IN THE HTML
html = etree.HTML(scraped_html)
elements = html.xpath("//div[@class='col-xs-10']")
rows = []  # collect one dict per scraped element, then build the DataFrame once at the end
for element in elements:
    title = element.xpath("div[@class='row']/h1[@class='col-xs-12 col-sm-8']/a/text()")[0]
    url_moreinfo = 'https://aws.amazon.com/' + element.xpath("div[@class='row']/h1[@class='col-xs-12 col-sm-8']/a/@href")[0]
    # SECOND-LEVEL SCRAPER: follow each link and extract the description from the detail page
    detail_html = scraper.get(url_moreinfo, headers=header).content
    detail_tree = etree.HTML(detail_html)
    description = detail_tree.xpath("//div[@class='sidebar-box']/p/text()")[0]
    rows.append({'title': title,
                 'url_moreinfo': url_moreinfo,
                 'description': description})
df = pd.DataFrame(rows)
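# A natural last step is persisting the result; the filename below is only an example.
df.to_csv('scraped_results.csv', index=False)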