Created
March 31, 2015 12:22
-
-
Save arne-cl/9a674a1faf260bd19e56 to your computer and use it in GitHub Desktop.
simple scraper for blog.fefe.de
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import datetime | |
from datetime import timedelta | |
import requests | |
def create_dir(path):
    """
    Create the directory ``path``, including missing parent directories.

    An already existing directory is silently accepted. Any other failure
    is re-raised; a permission error additionally writes a hint to stderr
    before being re-raised.

    modified from http://stackoverflow.com/a/600612

    Parameters
    ----------
    path : str
        path to the directory to be created

    Raises
    ------
    OSError
        if something that is not a directory already exists at ``path``,
        if access is denied, or for any other error from ``os.makedirs``.
    """
    import errno
    import sys

    try:
        os.makedirs(path)
    except OSError as error:
        # An existing directory is the only failure that is tolerated.
        if error.errno == errno.EEXIST and os.path.isdir(path):
            return
        if error.errno == errno.EACCES:
            sys.stderr.write("Cannot create [%s]! Check Permissions" % path)
        # Covers: non-directory at the path, permission denied, anything else.
        raise
def get_previous_month(date):
    """
    Return the year and month of the month preceding the one ``date`` is in.

    cf. http://stackoverflow.com/a/9725093/564514

    Parameters
    ----------
    date : object with ``year`` and ``month`` attributes (e.g. datetime.date)
        the reference date; its day is not used.

    Returns
    -------
    tuple of (int, int)
        year and month of the previous month.
    """
    one_day = timedelta(days=1)
    # Stepping one day back from the 1st always lands in the previous month,
    # which also takes care of the January -> December year rollover.
    month_start = datetime.date(date.year, date.month, 1)
    previous = month_start - one_day
    return (previous.year, previous.month)
def get_fefe_daterange(start_date=START_DATE, end_date=None):
    """
    Yield (year, month) tuples, walking backwards in time month by month.

    The month of ``end_date`` is yielded first. After that, the generator
    steps to the first day of each preceding month and keeps yielding for
    as long as that date is not earlier than ``start_date``.

    Parameters
    ----------
    start_date : datetime.date
        lower bound of the range (defaults to the module-level START_DATE).
    end_date : datetime.date or None
        date whose month is yielded first. If None, today's date is used,
        determined when iteration starts rather than when the module is
        loaded.

    Yields
    ------
    tuple of (int, int)
        year and month, newest first.
    """
    if end_date is None:
        end_date = datetime.date.today()

    current_date = end_date
    # Compare against the caller-supplied bound, not the global constant,
    # so that a custom ``start_date`` is actually honoured.
    # NOTE: from the second step on, current_date is the 1st of a month, so
    # a start_date later than the 1st excludes its own month.
    while current_date >= start_date:
        yield (current_date.year, current_date.month)
        year_of_prev_month, prev_month = get_previous_month(current_date)
        current_date = datetime.date(
            year=year_of_prev_month, month=prev_month, day=1)
def scrape_fefes_blog(start_date=START_DATE, end_date=None, output_dir=os.curdir):
    """
    Download the monthly pages of blog.fefe.de into ``output_dir``.

    For every (year, month) produced by ``get_fefe_daterange``, the page
    ``http://blog.fefe.de/?mon=YYYYMM`` is fetched and its response body is
    stored as ``YYYYMM.html`` inside ``output_dir``.

    Parameters
    ----------
    start_date : datetime.date
        lower bound of the range (defaults to the module-level START_DATE).
    end_date : datetime.date or None
        date whose month is downloaded first. If None, today's date at
        call time is used.
    output_dir : str
        directory the HTML files are written to; created if missing.
    """
    # Resolve the default here (not in the signature) so it reflects the
    # day of the call instead of the day the module was imported.
    if end_date is None:
        end_date = datetime.date.today()

    create_dir(output_dir)
    for (year, month) in get_fefe_daterange(start_date, end_date):
        month_id = '{}{month:02d}'.format(year, month=month)
        url = 'http://blog.fefe.de/?mon=' + month_id
        month_page = requests.get(url)
        output_path = os.path.join(output_dir, month_id + '.html')
        # Response.content is a byte string, so the file must be opened in
        # binary mode; text mode raises TypeError on Python 3.
        with open(output_path, 'wb') as output_file:
            output_file.write(month_page.content)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment