Created
July 29, 2014 23:46
-
-
Save quandyfactory/c1b2432721bdf1ab6ced to your computer and use it in GitHub Desktop.
get_fringe_plays.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| """Code I used to download all the details for the 2014 Hamilton Fringe Festival plays and convert them into a JSON file.""" | |
| from bs4 import BeautifulSoup as bs | |
| import json | |
| import requests | |
| import xlrd | |
# Index page listing every show; the individual show pages live underneath it.
base_url = 'http://hamiltonfringe.ca/shows/'

# NOTE: this request runs at import time.  A timeout is supplied because
# requests otherwise waits indefinitely on an unresponsive server.
r = requests.get(base_url, timeout=30)
data = r.text
# Name the parser explicitly so the parse does not depend on which optional
# parser libraries happen to be installed.
soup = bs(data, 'html.parser')

# Keep only anchors that point *below* the index page: the href must start
# with base_url and have at least one more character after it.
links = [
    href
    for href in (anchor.get('href') for anchor in soup.find_all('a'))
    if href is not None
    and href.startswith(base_url)
    and len(href) > len(base_url)
]

# Module-level placeholder; nothing in the visible code reads it.
output = []
def get_soup(link, timeout=30):
    """Download *link* and return its HTML parsed as a BeautifulSoup tree.

    Args:
        link: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up.  Without a
            timeout requests can block forever on an unresponsive host.

    Returns:
        The parsed document.

    Raises:
        requests.exceptions.HTTPError: the server answered with a 4xx/5xx
            status, so there is no show page to parse.
        requests.exceptions.RequestException: any other failure reported by
            requests (connection error, timeout, ...).
    """
    response = requests.get(link, timeout=timeout)
    # Fail here with a clear HTTP error instead of handing an error page to
    # the caller.
    response.raise_for_status()
    # Name the parser explicitly so results do not depend on which optional
    # parser libraries are installed.
    return bs(response.text, 'html.parser')
def _first_div_text(soup, css_class):
    """Return the text of the first <div> whose class list contains *css_class*.

    Raises IndexError when no such div exists.
    """
    matches = [
        div
        for div in soup.find_all('div')
        if div.get('class') is not None and css_class in div.get('class')
    ]
    return matches[0].text


def process_soup(soup):
    """Extract the title and a combined description from a parsed show page.

    Args:
        soup: Parsed page; must provide ``title.text`` and ``find_all('div')``.

    Returns:
        A dict with two keys: ``"title"`` (the page title text) and
        ``"description"`` (company line, billing block and show description,
        in that order).

    Raises:
        IndexError: the page lacks a ``show-billing``, ``company-info`` or
            ``show-description`` div.
    """
    page_title = soup.title.text

    # Billing: drop tabs, then put a space in front of every line break.
    billing_text = _first_div_text(soup, 'show-billing')
    billing_text = billing_text.replace('\t', '').replace('\n', ' \n')

    # Company info is flattened onto a single line.
    company_text = _first_div_text(soup, 'company-info').replace('\n', '')

    body_text = _first_div_text(soup, 'show-description')

    combined = '%s\n%s\n\n%s' % (company_text, billing_text, body_text)
    return {"title": page_title, "description": combined}
def _clean_text(text):
    """Return *text* without tabs, with blank-line runs collapsed to one blank
    line, and with right single quotation marks (U+2019) turned into "'"."""
    text = text.replace('\t', '')
    while '\n\n\n' in text:
        text = text.replace('\n\n\n', '\n\n')
    return text.replace(u'\u2019', "'")


def save_plays():
    """Scrape every show page in ``links`` and write them to fringe-shows.json.

    Each show becomes an object with ``title``, ``description`` and ``link``
    keys.  Progress is printed to stdout, one line per show, followed by a
    final confirmation line.

    The text clean-up is applied to the values *before* serialising.  Doing
    it with string replacement on the serialised JSON can corrupt the file
    when the text contains a literal backslash followed by ``t``, ``n`` or
    ``u2019``.  For ordinary text the resulting file is the same.
    """
    shows = []
    for link in links:
        soup = get_soup(link)
        obj = process_soup(soup)
        obj['link'] = link
        shows.append({key: _clean_text(value) for key, value in obj.items()})
        print('Link %s added to shows.' % (link))
    output = json.dumps(shows)
    with open('fringe-shows.json', 'w') as fh:
        fh.write(output)
    print('File saved as fringe-shows.json')
def get_showtimes():
    """Load the showtime spreadsheet.

    Opens ``shows.xls`` in the current directory and reads the sheet named
    ``shows``.

    Returns:
        A ``(headers, data)`` pair: ``headers`` is the list of values in the
        first row, ``data`` is a list with one list of cell values for every
        following row.
    """
    book = xlrd.open_workbook('shows.xls')
    sheet = book.sheet_by_name(u'shows')
    header_row = sheet.row_values(0)
    # Row 0 holds the column headers, so the data starts at row 1.
    body_rows = [sheet.row_values(index) for index in range(1, sheet.nrows)]
    return header_row, body_rows
def get_plays():
    """Return the decoded contents of ``fringe-shows.json``.

    The file is read from the current working directory and parsed as JSON.
    """
    with open('fringe-shows.json', 'r') as handle:
        return json.load(handle)
# Showtime rows carry one start-time cell per festival day, beginning at this
# column: column 4 is July 17, 2014 and column 14 is July 27, 2014.
_FIRST_DATE_COLUMN = 4
_FESTIVAL_DATES = ['2014-07-%d' % day for day in range(17, 28)]


def _sql_escape(value):
    """Return *value* as text with every single quote doubled, so it can be
    placed between single quotes in a SQL string literal."""
    return ('%s' % (value,)).replace("'", "''")


def generate_sql(plays, headers, showtimes):
    """Print a multi-row SQL ``insert into wots`` statement to stdout.

    Each showtime row is matched by stripped title to the first play with
    the same stripped title.  One value tuple is printed for every festival
    day on which that row has a non-empty start time.  Rows without a
    matching play are skipped.

    Every value is escaped by doubling single quotes.  The previous
    ``replace("'", "\\'")`` was a no-op, so any apostrophe in the text
    produced broken SQL.

    Every printed tuple, including the last one, ends with ", ", so the
    final comma has to be removed before the statement is executed.

    Args:
        plays: Iterable of dicts with ``title``, ``description`` and ``link``.
        headers: Column headers of the showtime sheet; accepted but not used.
        showtimes: Rows indexed as 0 = location, 1 = street address,
            2 = title, 4..14 = start time for July 17..27 ('' when the show
            does not run that day).
    """
    print('insert into wots (title, date, time, location, address, details, website, username) values ')
    username = 'Ryan'
    row_template = "('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s'), "

    for showtime in showtimes:
        title = showtime[2].strip()
        # Only the first play whose title matches is used.
        play = next((p for p in plays if p['title'].strip() == title), None)
        if play is None:
            continue

        location = showtime[0]
        address = '%s, Hamilton, ON' % (showtime[1],)
        details = play['description']
        website = play['link']

        for offset, date in enumerate(_FESTIVAL_DATES):
            start_time = showtime[_FIRST_DATE_COLUMN + offset]
            if start_time == '':
                continue  # no performance on this day
            values = (title, date, start_time, location, address,
                      details, website, username)
            print(row_template % tuple(_sql_escape(v) for v in values))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment