WX5280 Live Blog Notifier

A small Python script that scrapes the Weather5280 live blog and sends a Pushover notification when a new entry appears.
config.txt:

[Pushover]
app_token: <your pushover app token>
user_key: <your pushover user key>
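Both values come from your Pushover account: the user key appears on the Pushover dashboard, and the app token is issued when you register an application at pushover.net.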
requirements.txt:

BeautifulSoup==3.2.1
argparse==1.3.0
dateutils==0.6.6
python-dateutil==2.4.0
pytz==2014.10
requests==2.5.1
six==1.9.0
wsgiref==0.1.2
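Install the dependencies with pip install -r requirements.txt. Note that BeautifulSoup 3.2.1, like the script's print statements and except syntax, requires Python 2.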
scrape.py:

import BeautifulSoup
import ConfigParser
import datetime
import dateutil.parser
import json
import os
import pytz
import re
import requests
import sys
def normalize_weekdays(value):
    # Expand abbreviated weekday names ("Tues." -> "Tuesday") so that
    # dateutil can parse the string. Full names pass through unchanged.
    search_replace_patterns = [
        (r'\bMon(?:day)?\.?', 'Monday'),
        (r'\bTues?(?:day)?\.?', 'Tuesday'),
        (r'\bWed(?:nesday)?\.?', 'Wednesday'),
        (r'\bThur?s?(?:day)?\.?', 'Thursday'),
        (r'\bFri(?:day)?\.?', 'Friday'),
        (r'\bSat(?:urday)?\.?', 'Saturday'),
        (r'\bSun(?:day)?\.?', 'Sunday'),
    ]

    # Substitute the first weekday pattern that matches
    for pattern, substitution in search_replace_patterns:
        if re.search(pattern, value):
            return re.sub(pattern, substitution, value)

    # If we got this far, no match
    return value
def normalize_time(value):
    # Strip the parentheses from the "(4:15pm)" time portion so that
    # dateutil can parse it
    pattern = r'\((?P<time>[0-9:]+\s*(?:am|pm)?)\)'
    if re.search(pattern, value):
        return re.sub(pattern, r'\g<time>', value)

    # If we got here, no match
    return value
def load_entries_from_disk(filepath):
    # Return the previously saved entries, or an empty list if the
    # file is missing or not valid JSON
    if os.path.exists(filepath):
        with open(filepath, 'r') as f:
            try:
                return json.loads(f.read())
            except ValueError:
                return []
    else:
        return []

def save_entries_to_disk(entries, filepath):
    with open(filepath, 'w') as f:
        f.write(json.dumps(entries, indent=4))
def send_notification(app_token, user_key, message):
    # POST the message to the Pushover API
    requests.post("https://api.pushover.net:443/1/messages.json", data={
        "token": app_token,
        "user": user_key,
        "message": message,
        "sound": "bugle",
        "title": "WX5280 Live Blog Update",
        "url": "http://www.weather5280.com/live-blog/",
        "url_title": "View Online",
    })
def log(message):
    # Append the timestamped message to log.txt and echo it to stdout
    message = "%s\t%s" % (datetime.datetime.now().isoformat(), message)
    with open('log.txt', 'a') as f:
        f.write(message + "\n")
    print message
if __name__ == "__main__":

    log("Checking Live Blog Entries")

    # Load the config info from the config.txt file
    config = ConfigParser.ConfigParser()
    config.read("config.txt")
    pushover_app_token = config.get('Pushover', 'app_token')
    pushover_user_key = config.get('Pushover', 'user_key')

    # Make sure the config loaded properly
    if not pushover_app_token or not pushover_user_key:
        log("Error! config.txt missing app token and/or user key.")
        sys.exit(1)

    # Make the request for the live blog and turn it into a soup object
    response = requests.get("http://www.weather5280.com/live-blog/")
    soup = BeautifulSoup.BeautifulSoup(response.text)

    # Load the previously seen entries from disk
    existing_entries = load_entries_from_disk('entries.json')

    # Keep track of all the entries scraped from the page
    scraped_entries = []

    # Each blog entry is an <h3> title followed by a <p> with the time
    for h3_element in soup.findAll('h3'):

        # The title is inside the <h3>
        title = h3_element.text

        # Find the next <p> tag; it contains the time info
        p_time_element = h3_element.findNext('p')
        raw_time_text = p_time_element.text

        # Clean up the time string
        time_text = normalize_weekdays(raw_time_text)
        time_text = normalize_time(time_text)

        # Parse the date string into a datetime object and make it
        # timezone-aware. localize() is used rather than
        # replace(tzinfo=...), since pytz timezones only carry the
        # correct UTC offset when applied via localize().
        try:
            posted_dt = dateutil.parser.parse(time_text)
            posted_dt = pytz.timezone('US/Mountain').localize(posted_dt)
            posted_isoformat = posted_dt.isoformat()
        except ValueError:
            posted_isoformat = None
            log("Error! Could not parse date: %s" % time_text)

        # Add the value to the scraped entries
        scraped_entries.append({
            'title': title,
            'posted-string': time_text,
            'posted-time': posted_isoformat,
        })

    # Sort the scraped entries newest-first
    sorted_scraped_entries = sorted(scraped_entries, key=lambda k: k['posted-time'], reverse=True)

    # Save the entries to disk
    save_entries_to_disk(sorted_scraped_entries, 'entries.json')

    # If there are more scraped entries than existing ones, the newest
    # entry must be new, so notify
    if len(sorted_scraped_entries) > len(existing_entries):
        log("Found new entry. Notifying.")
        message = sorted_scraped_entries[0]['title']
        send_notification(pushover_app_token, pushover_user_key, message)
    else:
        log("No new entries found.")
Usage:

$ python scrape.py
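The script checks the blog once and exits, so it is meant to be run on a schedule. A crontab entry like the one below would check every ten minutes; ~/wx5280 is a placeholder for wherever the script lives, and running from that directory matters because config.txt, entries.json, and log.txt are all opened with relative paths:

*/10 * * * * cd ~/wx5280 && python scrape.py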