Skip to content

Instantly share code, notes, and snippets.

@jmrobles
Created December 12, 2021 23:05
Show Gist options
  • Save jmrobles/7b9f02fac3110a3f8484fefacede67cb to your computer and use it in GitHub Desktop.
Save jmrobles/7b9f02fac3110a3f8484fefacede67cb to your computer and use it in GitHub Desktop.
Simple Selenium Web Scraper
import sys
import json
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from langdetect import detect
# Consts
# Shared secret that `handle` compares against the `auth-code` request header.
# NOTE(review): this looks like a hard-coded placeholder; presumably the real
# value should come from configuration or a secret store - confirm.
AUTH_CODE = 'asecretcode'
def _element_texts(driver, tag):
    """Return the non-empty text of every ``<tag>`` element on the current page."""
    # Function-scope import: ``By`` is not among the file-level imports.
    from selenium.webdriver.common.by import By

    # Read ``.text`` once per element; every access is a WebDriver round-trip.
    texts = (element.text for element in driver.find_elements(By.XPATH, '//' + tag))
    return [text for text in texts if text != '']


def _large_images(driver, min_width=100):
    """Return ``{'src', 'rect'}`` for each ``<img>`` wider than *min_width*."""
    from selenium.webdriver.common.by import By

    images = []
    for element in driver.find_elements(By.XPATH, '//img'):
        rect = element.rect  # fetched once and reused for the filter and the result
        if rect['width'] > min_width:
            images.append({'src': element.get_attribute('src'), 'rect': rect})
    return images


def _detect_language(parts):
    """Return the language detected for *parts* joined by spaces, or ``None``."""
    from langdetect.lang_detect_exception import LangDetectException

    # Join with separators so the last word of one part does not fuse with
    # the first word of the next.
    text = ' '.join(part for part in parts if part)
    try:
        return detect(text)
    except LangDetectException:
        # Raised when the text is empty or has no usable features.
        return None


def _scrape_page(driver, url):
    """Load *url* in *driver* and return the result dict for that page."""
    page = {'url': url}
    try:
        driver.get(url)
        page['success'] = True
        page['title'] = driver.title
        # Headings and paragraphs
        for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'):
            page[tag] = _element_texts(driver, tag)
        # Images
        resources = driver.execute_script(
            "return window.performance.getEntriesByType('resource');")
        page['img_resources'] = [
            entry['name'] for entry in resources if entry['initiatorType'] == 'img'
        ]
        page['img'] = _large_images(driver)
    except Exception as e:
        # Per-URL boundary: report the failure and let the batch continue.
        return {'url': url, 'success': False, 'error': str(e)}

    page['lang'] = _detect_language(
        [page['title'], *page['h1'], *page['h2'], *page['h3'], *page['p']])
    return page


def scrap(urls):
    """Scrape each URL with headless Chrome and return one result dict per URL.

    Args:
        urls: iterable of URL strings to visit, in order.

    Returns:
        A list of dicts, one per URL, in input order. A successful entry has
        the keys ``url``, ``success`` (True), ``title``, ``h1``..``h6``, ``p``,
        ``img_resources``, ``img`` and ``lang`` (``None`` when the language
        cannot be detected). A failed entry has ``url``, ``success`` (False)
        and ``error`` (the exception message).
    """
    opt = webdriver.ChromeOptions()
    opt.add_argument("--headless")
    opt.add_argument("--disable-gpu")
    opt.add_argument("--no-sandbox")
    driver = webdriver.Chrome(options=opt)
    try:
        # The implicit wait is a session-wide setting, so set it once.
        driver.implicitly_wait(10)
        return [_scrape_page(driver, url) for url in urls]
    finally:
        # quit() ends the whole session (browser and driver process), even
        # when scraping raised; close() would only close the current window.
        driver.quit()
def _error_response(status_code, message):
    """Build the ``{'statusCode', 'body': {'error': ...}}`` failure response."""
    return {
        'statusCode': status_code,
        'body': {'error': message}
    }


def handle(event, context):
    """Authorize the request, validate its JSON body and scrape the given URLs.

    Args:
        event: request object; ``event.headers`` is read for the ``auth-code``
            header and ``event.body`` for a JSON document of the form
            ``{"urls": [...]}``.
        context: unused.

    Returns:
        A response dict with ``statusCode`` and ``body``:
        401 when the ``auth-code`` header is missing or wrong, 400 when the
        body is not valid JSON, has no ``urls`` entry, or ``urls`` is not a
        list of strings, and 200 with the ``scrap`` results otherwise.
    """
    import hmac  # function-scope import: not among the file-level imports

    # Check if it's authorized
    if 'auth-code' not in event.headers:
        return _error_response(401, 'Unauthorized')
    supplied = str(event.headers['auth-code']).encode('utf-8')
    # Constant-time comparison so response timing does not leak the secret.
    if not hmac.compare_digest(supplied, AUTH_CODE.encode('utf-8')):
        return _error_response(401, 'Unauthorized')

    # Get the URLs to scrape from the event
    try:
        data = json.loads(event.body)
    except (TypeError, ValueError):
        # ValueError covers JSONDecodeError; TypeError covers a missing body.
        return _error_response(400, 'Invalid JSON')
    # Valid JSON need not be an object (e.g. ``null`` or a number).
    if not isinstance(data, dict) or 'urls' not in data:
        return _error_response(400, 'No URLs provided')
    urls = data['urls']
    # A bare string would otherwise be iterated character by character.
    if not isinstance(urls, list) or not all(isinstance(url, str) for url in urls):
        return _error_response(400, "'urls' must be a list of strings")

    return {
        "statusCode": 200,
        "body": scrap(urls),
        "headers": {
            "Content-type": "application/json; charset=utf-8"
        }
    }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment