Created
December 12, 2021 23:05
-
-
Save jmrobles/7b9f02fac3110a3f8484fefacede67cb to your computer and use it in GitHub Desktop.
Simple Selenium Web Scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import hmac
import json
import os
import sys

from langdetect import detect
from selenium import webdriver
from selenium.webdriver import ChromeOptions
# Consts
# Shared secret expected in the 'auth-code' request header. It can be
# overridden through the AUTH_CODE environment variable so the real value
# does not have to live in source control; the literal is only a fallback
# (an empty environment value is ignored so it can never disable the check).
AUTH_CODE = os.environ.get('AUTH_CODE') or 'asecretcode'
def _element_texts(driver, xpath):
    """Return the non-empty texts of all elements matching *xpath*."""
    # Read .text once per element: every access is a WebDriver round trip.
    texts = (element.text for element in driver.find_elements('xpath', xpath))
    return [text for text in texts if text]


def _detect_language(page):
    """Return the detected language code of a scraped page, or None."""
    # Join with spaces so the last word of one part is not fused with the
    # first word of the next, which would distort detection.
    text = ' '.join(
        [page['title'], *page['h1'], *page['h2'], *page['h3'], *page['p']]
    )
    try:
        return detect(text)
    except Exception:
        # langdetect raises when the text has no usable features (e.g. an
        # empty page); that must not abort the whole batch.
        return None


def _scrap_page(driver, url):
    """Load *url* in *driver* and return its result dict."""
    ret = {'url': url}
    try:
        driver.get(url)
        page = {'title': driver.title}
        # Headings and paragraphs
        for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'):
            page[tag] = _element_texts(driver, '//' + tag)
        # Images
        resources = driver.execute_script(
            "return window.performance.getEntriesByType('resource');"
        )
        page['img_resources'] = [
            x['name'] for x in resources if x['initiatorType'] == 'img'
        ]
        images = []
        for img in driver.find_elements('xpath', '//img'):
            rect = img.rect  # fetch once; reused for the filter and the result
            if rect['width'] > 100:
                images.append({'src': img.get_attribute('src'), 'rect': rect})
        page['img'] = images
    except Exception as e:
        # Per-URL boundary: a failure while loading or reading one page is
        # reported for that URL only, so the remaining URLs are still scraped.
        ret['success'] = False
        ret['error'] = str(e)
        return ret
    ret['success'] = True
    ret.update(page)
    ret['lang'] = _detect_language(page)
    return ret


def scrap(urls):
    """Scrape basic page content from each URL with headless Chrome.

    Args:
        urls: Iterable of URL strings to visit, in order.

    Returns:
        A list with one dict per URL, in input order. Every dict has
        ``url`` and ``success``. On success it also carries ``title``,
        ``h1``..``h6`` and ``p`` (lists of non-empty element texts),
        ``img_resources`` (names of resource entries initiated by an img),
        ``img`` (``src`` and ``rect`` of images wider than 100) and ``lang``
        (language code from ``detect``, or ``None`` if detection failed).
        On failure it carries ``error`` with the exception text instead.
    """
    opt = webdriver.ChromeOptions()
    opt.add_argument("--headless")
    opt.add_argument("--disable-gpu")
    opt.add_argument("--no-sandbox")
    driver = webdriver.Chrome(options=opt)
    try:
        # The implicit wait is a session-wide setting, so set it once.
        # NOTE(review): it also applies to every find_elements call, so a
        # page lacking a given tag may wait the full timeout - confirm this
        # is intended.
        driver.implicitly_wait(10)
        return [_scrap_page(driver, url) for url in urls]
    finally:
        # quit() ends the whole browser session (close() only closes the
        # current window), and finally guarantees it runs on errors too.
        driver.quit()
def handle(event, context):
    """Authorize the request, read the URL list from its body and scrape it.

    Args:
        event: Request object exposing ``headers`` (supports ``in`` and
            item access) and ``body`` (JSON text of the form
            ``{"urls": [...]}``).
        context: Invocation context; unused.

    Returns:
        A response dict with ``statusCode`` and ``body``:
        401 when the ``auth-code`` header is missing or wrong, 400 when the
        body is not valid JSON, has no ``urls`` entry, or ``urls`` is not a
        list of strings, and 200 with the scrape results otherwise.
    """
    # Check if it's authorized
    headers = event.headers
    supplied = headers['auth-code'] if 'auth-code' in headers else None
    # compare_digest avoids leaking, through timing, how much of the secret
    # matched; both sides are encoded because it rejects non-ASCII str.
    if supplied is None or not hmac.compare_digest(
        str(supplied).encode('utf-8'), AUTH_CODE.encode('utf-8')
    ):
        return {
            'statusCode': 401,
            'body': {'error': 'Unauthorized'}
        }
    # Get the URLs to scrape from the event
    try:
        data = json.loads(event.body)
    except (TypeError, ValueError):
        # ValueError covers malformed JSON and undecodable bytes;
        # TypeError covers a missing (None) body.
        return {
            'statusCode': 400,
            'body': {
                'error': 'Invalid JSON'
            }
        }
    # Valid JSON need not be an object (it may be a number, string or
    # list), so check the type before looking for the key.
    if not isinstance(data, dict) or 'urls' not in data:
        return {
            'statusCode': 400,
            'body': {
                'error': 'No URLs provided'
            }
        }
    urls = data['urls']
    # A bare string would otherwise be iterated character by character.
    if not isinstance(urls, list) or not all(isinstance(u, str) for u in urls):
        return {
            'statusCode': 400,
            'body': {
                'error': "'urls' must be a list of strings"
            }
        }
    ret = scrap(urls)
    return {
        "statusCode": 200,
        "body": ret,
        "headers": {
            "Content-type": "application/json; charset=utf-8"
        }
    }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment