WX5280 Live Blog Notifier

A small Python script that scrapes the Weather5280 live blog and sends a Pushover notification when a new entry appears.
config.txt:

[Pushover]
app_token: <your pushover app token>
user_key: <your pushover user key>
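Both values come from your Pushover account: the user key appears on the Pushover dashboard, and the app token is issued when you register an application at pushover.net.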
requirements.txt:

BeautifulSoup==3.2.1
argparse==1.3.0
dateutils==0.6.6
python-dateutil==2.4.0
pytz==2014.10
requests==2.5.1
six==1.9.0
wsgiref==0.1.2
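Install the dependencies with pip install -r requirements.txt. Note that BeautifulSoup 3.2.1, like the script's print statements and except syntax, requires Python 2.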
scrape.py:

import BeautifulSoup
import ConfigParser
import datetime
import dateutil.parser
import json
import os
import pytz
import re
import requests
import sys
def normalize_weekdays(value):
    # Expand abbreviated weekday names ("Tues." -> "Tuesday") so that
    # dateutil can parse the string. Full names pass through unchanged.
    search_replace_patterns = [
        (r'\bMon(?:day)?\.?', 'Monday'),
        (r'\bTues?(?:day)?\.?', 'Tuesday'),
        (r'\bWed(?:nesday)?\.?', 'Wednesday'),
        (r'\bThur?s?(?:day)?\.?', 'Thursday'),
        (r'\bFri(?:day)?\.?', 'Friday'),
        (r'\bSat(?:urday)?\.?', 'Saturday'),
        (r'\bSun(?:day)?\.?', 'Sunday'),
    ]

    # Substitute the first weekday pattern that matches
    for pattern, substitution in search_replace_patterns:
        if re.search(pattern, value):
            return re.sub(pattern, substitution, value)

    # If we got this far, no match
    return value
def normalize_time(value):
    # Strip the parentheses from the "(4:15pm)" time portion so that
    # dateutil can parse it
    pattern = r'\((?P<time>[0-9:]+\s*(?:am|pm)?)\)'
    if re.search(pattern, value):
        return re.sub(pattern, r'\g<time>', value)

    # If we got here, no match
    return value
def load_entries_from_disk(filepath):
    # Return the previously saved entries, or an empty list if the
    # file is missing or not valid JSON
    if os.path.exists(filepath):
        with open(filepath, 'r') as f:
            try:
                return json.loads(f.read())
            except ValueError:
                return []
    else:
        return []

def save_entries_to_disk(entries, filepath):
    with open(filepath, 'w') as f:
        f.write(json.dumps(entries, indent=4))
def send_notification(app_token, user_key, message):
    # POST the message to the Pushover API
    requests.post("https://api.pushover.net:443/1/messages.json", data={
        "token": app_token,
        "user": user_key,
        "message": message,
        "sound": "bugle",
        "title": "WX5280 Live Blog Update",
        "url": "http://www.weather5280.com/live-blog/",
        "url_title": "View Online",
    })
def log(message):
    # Append the timestamped message to log.txt and echo it to stdout
    message = "%s\t%s" % (datetime.datetime.now().isoformat(), message)
    with open('log.txt', 'a') as f:
        f.write(message + "\n")
    print message
if __name__ == "__main__":

    log("Checking Live Blog Entries")

    # Load the config info from the config.txt file
    config = ConfigParser.ConfigParser()
    config.read("config.txt")
    pushover_app_token = config.get('Pushover', 'app_token')
    pushover_user_key = config.get('Pushover', 'user_key')

    # Make sure the config loaded properly
    if not pushover_app_token or not pushover_user_key:
        log("Error! config.txt missing app token and/or user key.")
        sys.exit(1)

    # Make the request for the live blog and turn it into a soup object
    response = requests.get("http://www.weather5280.com/live-blog/")
    soup = BeautifulSoup.BeautifulSoup(response.text)

    # Load the previously seen entries from disk
    existing_entries = load_entries_from_disk('entries.json')

    # Keep track of all the entries scraped from the page
    scraped_entries = []

    # Each blog entry is an <h3> title followed by a <p> with the time
    for h3_element in soup.findAll('h3'):

        # The title is inside the <h3>
        title = h3_element.text

        # Find the next <p> tag; it contains the time info
        p_time_element = h3_element.findNext('p')
        raw_time_text = p_time_element.text

        # Clean up the time string
        time_text = normalize_weekdays(raw_time_text)
        time_text = normalize_time(time_text)

        # Parse the date string into a datetime object and make it
        # timezone-aware. localize() is used rather than
        # replace(tzinfo=...), since pytz timezones only carry the
        # correct UTC offset when applied via localize().
        try:
            posted_dt = dateutil.parser.parse(time_text)
            posted_dt = pytz.timezone('US/Mountain').localize(posted_dt)
            posted_isoformat = posted_dt.isoformat()
        except ValueError:
            posted_isoformat = None
            log("Error! Could not parse date: %s" % time_text)

        # Add the value to the scraped entries
        scraped_entries.append({
            'title': title,
            'posted-string': time_text,
            'posted-time': posted_isoformat,
        })

    # Sort the scraped entries newest-first
    sorted_scraped_entries = sorted(scraped_entries, key=lambda k: k['posted-time'], reverse=True)

    # Save the entries to disk
    save_entries_to_disk(sorted_scraped_entries, 'entries.json')

    # If there are more scraped entries than existing ones, the newest
    # entry must be new, so notify
    if len(sorted_scraped_entries) > len(existing_entries):
        log("Found new entry. Notifying.")
        message = sorted_scraped_entries[0]['title']
        send_notification(pushover_app_token, pushover_user_key, message)
    else:
        log("No new entries found.")
Usage:

$ python scrape.py
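The script checks the blog once and exits, so it is meant to be run on a schedule. A crontab entry like the one below would check every ten minutes; ~/wx5280 is a placeholder for wherever the script lives, and running from that directory matters because config.txt, entries.json, and log.txt are all opened with relative paths:

*/10 * * * * cd ~/wx5280 && python scrape.py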