# -*- coding: utf-8 -*-
from peewee import *
import os, sys, io
import re
import pickle, json
import csv
import time
import urllib
from datetime import datetime
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.common.exceptions import TimeoutException
import utils
CURRENT_DIR = os.path.dirname(__file__)
database = SqliteDatabase('raw_extracts.db')
# DATABASE

class BaseModel(Model):
    class Meta:
        database = database


class Extract(BaseModel):
    url = CharField(null=True, unique=True)
    site = CharField(null=True)
    screenshot = CharField(null=True)
    extract = TextField(null=True)


def create_tables():
    # "with database" opens the connection and closes it again on exit,
    # so no explicit connect()/close() calls are needed here
    with database:
        database.create_tables([Extract])
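
# Illustrative sketch (not part of the original gist): shows how a stored row can
# be read back, assuming the `extract` column holds the JSON blob written by
# Crawler.extract_and_store_data further down.
def load_extract(url):
    """Return the decoded extract dict for `url`, or None if it was never crawled."""
    row = Extract.get_or_none(Extract.url == url)
    if row is None or row.extract is None:
        return None
    return json.loads(row.extract)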
# SELENIUM

class Driver_Config():
    # shared headless Firefox instance, reused by every Selenium_Driver
    options = Options()
    options.headless = True
    caps = DesiredCapabilities().FIREFOX
    caps["pageLoadStrategy"] = "normal"
    driver = webdriver.Firefox(
        options=options,
        desired_capabilities=caps,
    )
    # driver.set_page_load_timeout(60)
    driver.set_window_size(1920, 1080)
class Selenium_Driver(Driver_Config):

    def __init__(self):
        self.driver = Driver_Config.driver
        self.script = open('js_extract_script.js').read()

    def extract_data_from_page(self, url, screenshot_name):
        """Load the page, run the JS extraction script and save a screenshot.

        Returns the script's result, or None if the page load times out.
        """
        try:
            self.driver.get(url)
            extract = self.driver.execute_script(self.script)
            self.driver.save_screenshot(os.path.join(CURRENT_DIR, '../screenshots', '{}.png'.format(screenshot_name)))
            return extract
        except TimeoutException:
            self.quit()

    def quit(self, m=None):
        if m:
            print('\nq sel...{}\n'.format(m))
        self.driver.quit()
# CRAWLER

class Crawler():

    def __init__(self):
        self.selenium_driver = Selenium_Driver()

    def extract_and_store_data(self, url):
        parsed_url = urlparse(url)
        print('extracting from {}'.format(parsed_url))
        url_boundary = '{}://{}/'.format(parsed_url.scheme, parsed_url.netloc)
        site, screenshot_name = utils.process_url(parsed_url.netloc, parsed_url.path)

        extract = self.selenium_driver.extract_data_from_page(url, screenshot_name)
        if extract is None:
            # page load timed out and the driver was quit - nothing to store
            return

        print('{} URL: {}\nSITE: {}\nSCREENSHOT: {}\nEXTRACT: {}\n'.format(
            datetime.now(), url, site, screenshot_name,
            [{i[0]: len(i[1])} for i in extract.items()]
        ))

        # keep only links that stay on the same site, stripping any trailing '/#'
        new_links = []
        for link in extract['links']:
            if link[:len(url_boundary)] == url_boundary:
                if link[-2:] == '/#':
                    link = link[:-2]
                new_links.append(link)

        extract = json.dumps(extract)

        # TODO - normalize url scheme to lower case and drop empty components
        # (see the normalize_url sketch below this class)
        # url = parsed_url.geturl()

        with database.atomic():
            # store/refresh the extract for this url
            if not Extract.select().where(Extract.url == url).exists():
                Extract.create(
                    url=url,
                    site=site,
                    screenshot=screenshot_name,
                    extract=extract,
                )
            else:
                Extract.update(
                    site=site,
                    screenshot=screenshot_name,
                    extract=extract,
                ).where(Extract.url == url).execute()

            # queue newly discovered links as empty rows to be crawled later
            for new_link in new_links:
                if not Extract.select().where(Extract.url == new_link).exists():
                    Extract.create(url=new_link)
                # try:
                #     Extract.create(url=new_link)
                # # link already exists
                # except IntegrityError:
                #     pass
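
# Hypothetical sketch for the normalization TODO inside extract_and_store_data.
# normalize_url() is not part of the original gist and is never called here; it
# lower-cases the scheme and host and drops the params/fragment components
# before rebuilding the URL with geturl().
def normalize_url(parsed_url):
    return parsed_url._replace(
        scheme=parsed_url.scheme.lower(),
        netloc=parsed_url.netloc.lower(),
        params='',
        fragment='',
    ).geturl()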
if __name__ == '__main__':

    create_tables()
    crawler = Crawler()

    def csv2dict():
        with open('major_urls - uk.csv', newline='') as csv_file:
            for row in csv.DictReader(csv_file):
                yield row

    csv_records = [record['url'] for record in csv2dict()]

    # seed the database with any CSV urls that have not been stored yet
    unprocessed_urls = []
    with database.atomic():
        for url in csv_records:
            if not Extract.select().where(Extract.url == url).exists():
                unprocessed_urls.append(url)

    if unprocessed_urls:
        for url in unprocessed_urls:
            crawler.extract_and_store_data(url)

    # for unprocessed_url in (
    #         Extract.select(Extract.url).where(Extract.extract.is_null(True)).order_by(fn.Random()).execute()):
    #     crawler.extract_and_store_data(unprocessed_url.url)

    def query():
        # crawl stored urls that have no extract yet, in random order
        unprocessed_urls = Extract.select(Extract.url).where(Extract.extract.is_null(True)).order_by(fn.Random()).execute()
        for unprocessed_url in unprocessed_urls:
            crawler.extract_and_store_data(unprocessed_url.url)

    try:
        query()
    except Exception:
        # retry once if the first pass dies part-way through
        query()

    # TODO - select from database first 10 records from lowest site.count()
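
    # Rough sketch for the TODO above (not part of the original gist and never
    # called here): take up to `limit` stored records from the site with the
    # fewest rows, so under-crawled sites would be processed first.
    def records_from_least_crawled_site(limit=10):
        least = (Extract
                 .select(Extract.site, fn.COUNT(Extract.id).alias('n'))
                 .where(Extract.site.is_null(False))
                 .group_by(Extract.site)
                 .order_by(fn.COUNT(Extract.id))
                 .first())
        if least is None:
            return []
        return list(Extract.select().where(Extract.site == least.site).limit(limit))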