Skip to content

Instantly share code, notes, and snippets.

@brbsix
Last active August 19, 2016 15:04
Show Gist options
  • Save brbsix/cfbe70812cc7d4139e72b37d5bd12bb7 to your computer and use it in GitHub Desktop.
Save brbsix/cfbe70812cc7d4139e72b37d5bd12bb7 to your computer and use it in GitHub Desktop.
Scrape new entries
# -*- coding: utf-8 -*-
"""Poll websites for new entries"""
# standard imports
import itertools
from collections import deque
# external imports
from pickleshare import PickleShareDB
class Config:
    """Store global script configuration values."""

    # Path handed to PickleShareDB to open the persistent store.
    DATABASE = '/path/to/file.db'
    # Capacity of the stored deque of entries, and also the maximum number
    # of scraped entries examined in a single run.
    MAXLEN = 200
    # Number of already-stored entries that must be encountered before the
    # polling loop stops early.
    THRESHOLD = 5
def scraper():
    """Iterate over entries.

    Requests the numbered pages built from ``pattern`` (page 1, 2, 3, ...),
    parses each response with BeautifulSoup and yields the selected posts
    that pass the match test.

    The page counter is unbounded, so this generator never finishes on its
    own; the consumer is expected to stop pulling from it.

    Yields:
        Elements returned by ``soup.select`` for which ``is_a_match`` is
        truthy.

    NOTE(review): this is a template. ``pattern``, the CSS selector and the
    "do stuff here" step are placeholders, and ``is_a_match`` is not defined
    anywhere in this function -- it must be computed by the placeholder step
    before the generator can run.
    """
    # standard imports
    import itertools
    from urllib.parse import urlparse

    # external imports
    import bs4
    import requests

    pattern = 'http://www.sample.com/page/%d'

    # get the URL prefix/hostname from pattern
    # (not used below; presumably intended for the placeholder step, e.g.
    # to turn relative links into absolute ones -- confirm before removing)
    parsedurl = urlparse(pattern)
    prefix = parsedurl.scheme + '://' + parsedurl.hostname

    for page in itertools.count(1):
        # Substitute the page number into the URL template. This must be
        # printf-style formatting (%); `&` on str and int raises TypeError.
        url = pattern % page
        soup = bs4.BeautifulSoup(requests.get(url).text, 'html.parser')
        posts = soup.select('sample selector')

        for post in posts:
            # do stuff here
            if is_a_match:
                yield post
database = PickleShareDB(Config.DATABASE)

# get pre-existing entries stored in DB
# NOTE(review): `key` is not defined anywhere in this file; it is a
# placeholder for the database key under which the entries are stored.
existing = database.setdefault(
    key, deque(maxlen=Config.MAXLEN))

count = 0
new = []

for entry in itertools.islice(scraper(), Config.MAXLEN):
    # check whether entry has been seen before
    if entry in existing:
        count += 1
    else:
        new.append(entry)

    # break upon reaching threshold
    # i.e. seen N items in pre-existing queue
    if count >= Config.THRESHOLD:
        break

# act on new entries
# NOTE(review): `action` is not defined in this file; it is a placeholder
# for whatever should be done with the list of unseen entries.
action(new)

# add new entries to pre-existing queue
# NOTE(review): extendleft() appends items one at a time, so `new` ends up
# in reversed order at the left end of the deque, and the bounded deque
# discards from the right end when full. If left-to-right order is meant
# to mirror scrape order, this may need `reversed(new)` -- confirm intent.
existing.extendleft(new)

# write to database
database[key] = existing
@smilemakc
Copy link

smilemakc commented Aug 19, 2016

Line 37: the operator should be `%` (string formatting), not `&`.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment