# -*- coding: utf-8 -*-
from peewee import *
import os, sys, io
import re
import pickle, json
import csv
import time
import urllib
from datetime import datetime
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.common.exceptions import TimeoutException

import utils

CURRENT_DIR = os.path.dirname(__file__)
database = SqliteDatabase('raw_extracts.db')
# DATABASE

class BaseModel(Model):
    class Meta:
        database = database


class Extract(BaseModel):
    url = CharField(null=True, unique=True)
    site = CharField(null=True)
    screenshot = CharField(null=True)
    extract = TextField(null=True)


def create_tables():
    database.connect()
    with database:
        database.create_tables([Extract,])
    database.close()
# SELENIUM

class Driver_Config():
    options = Options()
    options.headless = True
    caps = DesiredCapabilities().FIREFOX
    caps["pageLoadStrategy"] = "normal"
    driver = webdriver.Firefox(
        options=options,
        desired_capabilities=caps,
    )
    # driver.set_page_load_timeout(60)
    driver.set_window_size(1920, 1080)


class Selenium_Driver(Driver_Config):

    def __init__(self):
        self.driver = Driver_Config.driver
        self.script = open('js_extract_script.js').read()

    def extract_data_from_page(self, url, screenshot_name):
        # Load the page, run the extraction script and save a screenshot.
        # On any failure the driver is shut down and None is returned.
        try:
            self.driver.get(url)
            extract = self.driver.execute_script(self.script)
            self.driver.save_screenshot(os.path.join(CURRENT_DIR, '../screenshots', '{}.png'.format(screenshot_name)))
            return extract
        except Exception:
            self.quit()

    def quit(self, m=None):
        if m:
            print('\nq sel...{}\n'.format(m))
        self.driver.quit()
# CRAWLER

class Crawler():

    def __init__(self):
        self.selenium_driver = Selenium_Driver()

    def extract_and_store_data(self, url):
        parsed_url = urlparse(url)
        print('extracting from {}'.format(parsed_url))

        url_boundary = '{}://{}/'.format(parsed_url.scheme, parsed_url.netloc)
        site, screenshot_name = utils.process_url(parsed_url.netloc, parsed_url.path)

        extract = self.selenium_driver.extract_data_from_page(url, screenshot_name)

        print('{} URL: {}\nSITE: {}\nSCREENSHOT: {}\nEXTRACT: {}\n'.format(
            datetime.now(), url, site, screenshot_name, [{k: len(v)} for k, v in extract.items()]
        ))

        # Keep only same-site links, stripping a trailing '/#' fragment.
        new_links = []
        for link in extract['links']:
            if link.startswith(url_boundary):
                if link.endswith('/#'):
                    link = link[:-2]
                new_links.append(link)

        extract = json.dumps(extract)

        # TODO - normalize url scheme to lower case and drop empty components
        #        (see the hedged sketch after this class)
        # url = parsed_url.get_url()

        with database.atomic():
            # Insert or update the record for the page just crawled.
            if not Extract.select().where(Extract.url == url).exists():
                Extract.create(
                    url=url,
                    site=site,
                    screenshot=screenshot_name,
                    extract=extract,
                )
            else:
                Extract.update(
                    site=site,
                    screenshot=screenshot_name,
                    extract=extract,
                ).where(Extract.url == url).execute()

            # Queue newly discovered same-site links as empty rows to crawl later.
            for new_link in new_links:
                if not Extract.select().where(Extract.url == new_link).exists():
                    Extract.create(url=new_link)
                # try:
                #     Extract.create(url=new_link)
                # # link already exists
                # except IntegrityError:
                #     pass
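

# A minimal sketch of the URL normalisation mentioned in the TODO above,
# assuming it only needs to lower-case the scheme and host and drop the
# params/fragment components. The helper name `normalise_url` is illustrative
# and not part of the original script; it is never called here.
def normalise_url(url):
    from urllib.parse import urlunparse
    parsed = urlparse(url)
    # urlunparse omits the separators for components passed as '',
    # so empty params and fragment simply disappear from the result.
    return urlunparse((
        parsed.scheme.lower(),
        parsed.netloc.lower(),
        parsed.path,
        '',             # params
        parsed.query,
        '',             # fragment
    ))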
if __name__ == '__main__':

    create_tables()
    crawler = Crawler()

    def csv2dict():
        with open('major_urls - uk.csv', newline='') as csv_file:
            for row in csv.DictReader(csv_file):
                yield row

    csv_records = [record['url'] for record in csv2dict()]

    # Seed the queue with any CSV URLs not already stored.
    unprocessed_urls = []
    with database.atomic():
        for url in csv_records:
            if not Extract.select().where(Extract.url == url).exists():
                unprocessed_urls.append(url)

    if unprocessed_urls:
        for url in unprocessed_urls:
            crawler.extract_and_store_data(url)

    # for unprocessed_url in (
    #         Extract.select(Extract.url).where(Extract.extract.is_null(True)).order_by(fn.Random()).execute()):
    #     crawler.extract_and_store_data(unprocessed_url.url)

    def query():
        # Crawl stored URLs that have not been extracted yet, in random order.
        unprocessed_urls = Extract.select(Extract.url).where(Extract.extract.is_null(True)).order_by(fn.Random()).execute()
        for unprocessed_url in unprocessed_urls:
            crawler.extract_and_store_data(unprocessed_url.url)

    # Retry once if the first pass dies (e.g. the driver fails mid-crawl).
    try:
        query()
    except Exception:
        query()

    # TODO - select from database first 10 records from lowest site.count()
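
    # A hedged sketch of the TODO above, assuming it means: prioritise the ten
    # sites with the fewest stored rows. The helper is illustrative only and
    # is not called anywhere in this script.
    def least_covered_sites(limit=10):
        return (Extract
                .select(Extract.site, fn.COUNT(Extract.id).alias('n'))
                .group_by(Extract.site)
                .order_by(fn.COUNT(Extract.id).asc())
                .limit(limit))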