Last active
August 19, 2016 15:04
-
-
Save brbsix/cfbe70812cc7d4139e72b37d5bd12bb7 to your computer and use it in GitHub Desktop.
Scrape new entries
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
"""Poll websites for new entries""" | |
# standard imports | |
import itertools | |
from collections import deque | |
# external imports | |
from pickleshare import PickleShareDB | |
class Config:
    """Script-wide settings shared by the scraping and bookkeeping code."""

    # path handed to PickleShareDB for persistent storage
    DATABASE = '/path/to/file.db'
    # upper bound on the remembered-entry queue and on entries read per run
    MAXLEN = 200
    # number of already-seen entries after which a run stops scraping
    THRESHOLD = 5
def scraper(is_match=None):
    """Iterate over entries scraped from consecutive pages of a website.

    Pages are requested one at a time, starting at page 1, by substituting
    the page number into the URL pattern. Every element matched by the CSS
    selector on a page is a candidate entry.

    Args:
        is_match: Optional callable that takes a post and returns a truthy
            value when the post should be yielded. ``None`` (the default)
            yields every selected post.

    Yields:
        Each selected post, as returned by ``soup.select``, that passes
        *is_match*, in page order.
    """
    # standard imports
    import itertools
    from urllib.parse import urlparse

    # external imports
    import bs4
    import requests

    pattern = 'http://www.sample.com/page/%d'

    # get the URL prefix/hostname from pattern
    # NOTE(review): prefix is not used below; presumably it is meant for
    # resolving relative links in site-specific logic -- confirm or remove
    parsedurl = urlparse(pattern)
    prefix = parsedurl.scheme + '://' + parsedurl.hostname  # noqa: F841

    for page in itertools.count(1):
        # '%' substitutes the page number; '&' raises TypeError for str & int
        url = pattern % page

        soup = bs4.BeautifulSoup(requests.get(url).text, 'html.parser')
        posts = soup.select('sample selector')

        # a page without any entries is treated as the end of the listing;
        # without this the generator would request pages forever
        if not posts:
            return

        for post in posts:
            if is_match is None or is_match(post):
                yield post
# persistent store that remembers which entries were seen on earlier runs
database = PickleShareDB(Config.DATABASE)

# fetch the queue of known entries; if none is stored yet, an empty queue
# bounded to Config.MAXLEN is stored and returned instead
# NOTE(review): `key` and `action` are not defined in the code shown here --
# they must be provided before this script can run
existing = database.setdefault(key, deque(maxlen=Config.MAXLEN))

new = []
count = 0

# look at no more than Config.MAXLEN scraped entries per run
for entry in itertools.islice(scraper(), Config.MAXLEN):
    if entry not in existing:
        # first time this entry shows up
        new.append(entry)
    else:
        # already known from a previous run
        count += 1

    # stop once enough known entries have been encountered
    if count >= Config.THRESHOLD:
        break

# hand the unseen entries to the user-defined handler
action(new)

# remember the unseen entries by pushing them onto the left of the queue
existing.extendleft(new)

# persist the updated queue
database[key] = existing
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Line 37: the operator should be `%` (string formatting), not `&`, i.e. `url = pattern % page`.