jmrobles · December 12, 2021 23:05
diff --git a/handler.py b/handler.py
 import sys
 import json

 from selenium import webdriver
 from selenium.webdriver import ChromeOptions
 from langdetect import detect

 # Consts

 # Shared secret that handle() compares against the request's 'auth-code' header.
 # NOTE(review): the credential is hard-coded in source; consider loading it
 # from deployment configuration or secret storage instead -- confirm how this
 # function is deployed before changing it.
 AUTH_CODE = 'asecretcode'

 def scrap(urls):
    """Scrape title, headings, paragraphs, images and language for each URL.

    One headless Chrome session is reused for every URL and is always shut
    down before returning, even if scraping raises unexpectedly.

    Args:
        urls: Iterable of URL strings to visit.

    Returns:
        A list with one dict per URL, in input order. Every dict has 'url'
        and 'success'. On success it also has 'title', 'h1'..'h6', 'p',
        'img_resources', 'img' (images wider than 100 px, with their 'src'
        and 'rect') and 'lang'. If loading the page or extracting any of
        these fails (including language detection rejecting the text), the
        dict has 'success' False and 'error' with the exception message.
    """
    # Local import: By is not among the names imported at file level.
    from selenium.webdriver.common.by import By

    opt = webdriver.ChromeOptions()
    opt.add_argument("--headless")
    opt.add_argument("--disable-gpu")
    opt.add_argument("--no-sandbox")
    driver = webdriver.Chrome(options=opt)

    def texts(xpath):
        # Read .text once per element: every access is a WebDriver round-trip.
        found = (el.text for el in driver.find_elements(By.XPATH, xpath))
        return [t for t in found if t != '']

    def extract(url):
        driver.get(url)
        page = {'url': url, 'success': True, 'title': driver.title}
        # Headings
        for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            page[tag] = texts('//' + tag)
        # Paragraphs
        page['p'] = texts('//p')
        # Images
        resources = driver.execute_script("return window.performance.getEntriesByType('resource');")
        page['img_resources'] = [r['name'] for r in resources if r['initiatorType'] == 'img']
        page['img'] = []
        for el in driver.find_elements(By.XPATH, '//img'):
            rect = el.rect
            if rect['width'] > 100:
                page['img'].append({'src': el.get_attribute('src'), 'rect': rect})
        # Join with spaces so the last word of one section is not fused with
        # the first word of the next before language detection.
        sample = ' '.join([page['title']] + page['h1'] + page['h2'] + page['h3'] + page['p'])
        page['lang'] = detect(sample)
        return page

    rets = []
    try:
        # The implicit wait is a session-wide setting; set it once.
        driver.implicitly_wait(10)
        for url in urls:
            try:
                rets.append(extract(url))
            except Exception as e:
                # Per-URL boundary: one bad page must not abort the batch.
                rets.append({'url': url, 'success': False, 'error': str(e)})
    finally:
        # quit() ends the whole driver session; close() only closes the window.
        driver.quit()
    return rets

 def handle(event, context):
    """Authorize a request, validate its JSON body and scrape the listed URLs.

    Args:
        event: Request object. ``event.headers`` must support ``in`` and item
            lookup by header name; ``event.body`` holds the JSON payload,
            expected to be an object with a ``urls`` list.
        context: Unused.

    Returns:
        A response dict with ``statusCode`` and ``body``: 401 when the
        ``auth-code`` header is missing or wrong, 400 when the body is not
        valid JSON or carries no usable ``urls`` list, otherwise 200 with the
        list returned by ``scrap`` and a JSON ``Content-type`` header.
    """
    # Local import: hmac is not among the names imported at file level.
    import hmac

    def error(status, message):
        return {'statusCode': status, 'body': {'error': message}}

    # Check if it's authorized. Compare in constant time so response timing
    # does not reveal how much of the code matched.
    supplied = event.headers['auth-code'] if 'auth-code' in event.headers else None
    if not isinstance(supplied, str) or not hmac.compare_digest(
            supplied.encode('utf-8'), AUTH_CODE.encode('utf-8')):
        return error(401, 'Unauthorized')

    # Get the URLs to scrape from the event
    try:
        data = json.loads(event.body)
    except (ValueError, TypeError, RecursionError):
        # Malformed JSON, a body that is not str/bytes, or nesting too deep.
        return error(400, 'Invalid JSON')
    # Valid JSON need not be an object: a list, string or number has no
    # 'urls' key to read.
    if not isinstance(data, dict) or 'urls' not in data:
        return error(400, 'No URLs provided')
    urls = data['urls']
    if not isinstance(urls, list):
        # A bare string would otherwise be scraped one character at a time.
        return error(400, "'urls' must be a list")

    return {
        "statusCode": 200,
        "body": scrap(urls),
        "headers": {
            "Content-type": "application/json; charset=utf-8"
        }
    }
	import sys
	import json

	from selenium import webdriver
	from selenium.webdriver import ChromeOptions
	from langdetect import detect

	# Consts

	# Shared secret that handle() compares against the request's 'auth-code' header.
	# NOTE(review): the credential is hard-coded in source; consider loading it
	# from deployment configuration or secret storage instead -- confirm how this
	# function is deployed before changing it.
	AUTH_CODE = 'asecretcode'

	def scrap(urls):
	    """Scrape title, headings, paragraphs, images and language for each URL.

	    One headless Chrome session is reused for every URL and is always shut
	    down before returning, even if scraping raises unexpectedly.

	    Args:
	        urls: Iterable of URL strings to visit.

	    Returns:
	        A list with one dict per URL, in input order. Every dict has 'url'
	        and 'success'. On success it also has 'title', 'h1'..'h6', 'p',
	        'img_resources', 'img' (images wider than 100 px, with their 'src'
	        and 'rect') and 'lang'. If loading the page or extracting any of
	        these fails (including language detection rejecting the text), the
	        dict has 'success' False and 'error' with the exception message.
	    """
	    # Local import: By is not among the names imported at file level.
	    from selenium.webdriver.common.by import By

	    opt = webdriver.ChromeOptions()
	    opt.add_argument("--headless")
	    opt.add_argument("--disable-gpu")
	    opt.add_argument("--no-sandbox")
	    driver = webdriver.Chrome(options=opt)

	    def texts(xpath):
	        # Read .text once per element: every access is a WebDriver round-trip.
	        found = (el.text for el in driver.find_elements(By.XPATH, xpath))
	        return [t for t in found if t != '']

	    def extract(url):
	        driver.get(url)
	        page = {'url': url, 'success': True, 'title': driver.title}
	        # Headings
	        for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
	            page[tag] = texts('//' + tag)
	        # Paragraphs
	        page['p'] = texts('//p')
	        # Images
	        resources = driver.execute_script("return window.performance.getEntriesByType('resource');")
	        page['img_resources'] = [r['name'] for r in resources if r['initiatorType'] == 'img']
	        page['img'] = []
	        for el in driver.find_elements(By.XPATH, '//img'):
	            rect = el.rect
	            if rect['width'] > 100:
	                page['img'].append({'src': el.get_attribute('src'), 'rect': rect})
	        # Join with spaces so the last word of one section is not fused with
	        # the first word of the next before language detection.
	        sample = ' '.join([page['title']] + page['h1'] + page['h2'] + page['h3'] + page['p'])
	        page['lang'] = detect(sample)
	        return page

	    rets = []
	    try:
	        # The implicit wait is a session-wide setting; set it once.
	        driver.implicitly_wait(10)
	        for url in urls:
	            try:
	                rets.append(extract(url))
	            except Exception as e:
	                # Per-URL boundary: one bad page must not abort the batch.
	                rets.append({'url': url, 'success': False, 'error': str(e)})
	    finally:
	        # quit() ends the whole driver session; close() only closes the window.
	        driver.quit()
	    return rets

	def handle(event, context):
	    """Authorize a request, validate its JSON body and scrape the listed URLs.

	    Args:
	        event: Request object. ``event.headers`` must support ``in`` and item
	            lookup by header name; ``event.body`` holds the JSON payload,
	            expected to be an object with a ``urls`` list.
	        context: Unused.

	    Returns:
	        A response dict with ``statusCode`` and ``body``: 401 when the
	        ``auth-code`` header is missing or wrong, 400 when the body is not
	        valid JSON or carries no usable ``urls`` list, otherwise 200 with the
	        list returned by ``scrap`` and a JSON ``Content-type`` header.
	    """
	    # Local import: hmac is not among the names imported at file level.
	    import hmac

	    def error(status, message):
	        return {'statusCode': status, 'body': {'error': message}}

	    # Check if it's authorized. Compare in constant time so response timing
	    # does not reveal how much of the code matched.
	    supplied = event.headers['auth-code'] if 'auth-code' in event.headers else None
	    if not isinstance(supplied, str) or not hmac.compare_digest(
	            supplied.encode('utf-8'), AUTH_CODE.encode('utf-8')):
	        return error(401, 'Unauthorized')

	    # Get the URLs to scrape from the event
	    try:
	        data = json.loads(event.body)
	    except (ValueError, TypeError, RecursionError):
	        # Malformed JSON, a body that is not str/bytes, or nesting too deep.
	        return error(400, 'Invalid JSON')
	    # Valid JSON need not be an object: a list, string or number has no
	    # 'urls' key to read.
	    if not isinstance(data, dict) or 'urls' not in data:
	        return error(400, 'No URLs provided')
	    urls = data['urls']
	    if not isinstance(urls, list):
	        # A bare string would otherwise be scraped one character at a time.
	        return error(400, "'urls' must be a list")

	    return {
	        "statusCode": 200,
	        "body": scrap(urls),
	        "headers": {
	            "Content-type": "application/json; charset=utf-8"
	        }
	    }