Created
March 14, 2014 16:53
-
-
Save iserko/9551817 to your computer and use it in GitHub Desktop.
A cute little Royal Opera House website parser using Selenium
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from email.mime.text import MIMEText
import difflib
import io
import os.path
import random
import smtplib
import time

from pyvirtualdisplay.smartdisplay import SmartDisplay
from selenium import webdriver
def _read_section_statuses(browser, url):
    """Return one summary line per seating section of the loaded page.

    ``browser`` must already have ``url`` loaded; ``url`` is only embedded
    in each summary line. The lines are joined with newlines into a single
    unicode string (empty when no section matches the XPath).
    """
    sections = browser.find_elements_by_xpath(
        "//li[@class='panel-section on']/div[@class='sections']/ul/li")
    lines = []
    for section in sections:
        section_name = section.find_element_by_tag_name('h1').text
        prices = section.find_element_by_tag_name('h2').text
        status = section.find_element_by_tag_name('h3').text
        lines.append(
            u"Section: %s, Price: %s, Status: %s ... URL=%s"
            % (section_name, prices, status, url))
    return u'\n'.join(lines)


def _load_previous_status(fname):
    """Return the status text stored in ``fname``, or u'' if it is missing."""
    if not os.path.exists(fname):
        return u''
    # io.open decodes on read, so this works on both Python 2 and Python 3.
    with io.open(fname, 'r', encoding='utf-8') as f:
        return f.read()


def _send_report(from_addr, to_addr, lines, num_diffs):
    """Email ``lines`` as a plain-text UTF-8 report through the local SMTP server."""
    msg = MIMEText((u'\n'.join(lines)).encode('utf-8'), 'plain', 'UTF-8')
    msg['Subject'] = 'Royal Opera House - Found %s differences' % num_diffs
    msg['From'] = from_addr
    msg['To'] = to_addr
    s = smtplib.SMTP('localhost')
    try:
        s.sendmail(from_addr, [to_addr], msg.as_string())
    finally:
        # Close the connection even when delivery fails.
        s.quit()


def parse_roh():
    """Check ticket availability for the configured events and mail changes.

    For every entry in ``event_ids`` the seat-selection page is loaded in
    Chrome running inside a virtual X display, the visible sections are
    summarised, and the summary is compared with the one saved by the
    previous run in ``roh_status_<event_id>.txt`` (relative to the current
    working directory). When the summary changed, the file is rewritten and
    an ``ndiff`` of old versus new is added to the report. If at least one
    event changed, the report is emailed via the SMTP server on localhost.

    The browser and the virtual display are always shut down, even if
    scraping raises; in that case the exception propagates and no mail is
    sent.
    """
    from_addr = '[email protected]'
    to_addr = '[email protected]'

    # Optional start-up jitter, currently disabled:
    # time.sleep(random.randint(1, 60))

    event_ids = [
        ('19766', u"La Boheme on Sat 9th March"),
        ('20351', u'Tosca on Sat 16th March'),
        ('20353', u'Tosca on Sat 23rd March'),
    ]
    output = []
    num_diffs = 0

    display = SmartDisplay(visible=0, size=(1024, 768), bgcolor='white', backend='xvfb')
    display.start()
    try:
        browser = webdriver.Chrome('/usr/bin/chromedriver')
        try:
            for event_id, event_name in event_ids:
                # Random pause between page loads.
                time.sleep(random.randint(2, 10))
                url = u'http://www.roh.org.uk/events/%s/tickets/syos' % event_id
                browser.get(url)
                # NOTE: the interactive ipdb breakpoint that used to sit here
                # was removed; it would block an unattended run forever.

                full_stat = _read_section_statuses(browser, url)
                fname = 'roh_status_%s.txt' % event_id
                old_stat = _load_previous_status(fname)
                if full_stat == old_stat:
                    continue

                with io.open(fname, 'w', encoding='utf-8') as f:
                    f.write(full_stat)
                num_diffs += 1
                output.append(u"Found differences for event: %s" % event_name)
                output.extend(
                    difflib.ndiff(old_stat.splitlines(), full_stat.splitlines()))
                # Two blank lines separate the reports of consecutive events.
                output.append(u'')
                output.append(u'')
        finally:
            browser.quit()
    finally:
        display.stop()

    if output:
        _send_report(from_addr, to_addr, output, num_diffs)
if __name__ == "__main__":
    # Run the availability check only when executed as a script.
    parse_roh()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment