scrape longwood covid data
longwood-covid-19.csv:

Active Cases,Total Cumulative Cases,Total Students,Total Faculty,Date
import requests
from bs4 import BeautifulSoup
import json
import re
import csv

def get_date_updated(soup, date_months):
    # Pull the dashboard's "Last updated: ..." paragraph.
    date_updated = soup.findAll('p', {'class': 'lastUpdated'})
    date_up = date_updated[0].text
    date_arr = date_up.split(':')
    date_value = date_arr[1].strip()
    dav = date_value.split(' ')
    # Reorder the pieces (e.g. "Nov. 16, 2020" -> "2020 Nov. 16,") so the month
    # abbreviation sits where format_date() expects it.
    dav = dav[2] + ' ' + dav[0] + ' ' + dav[1]
    data_date = format_date(dav, date_months)
    # Strip the comma left over from the day token.
    date_correct_format = data_date[0].replace(',', '')
    return date_correct_format

def format_date(var, date_months):
    # Expects "<prefix> <month abbreviation> <day>", e.g. "2020 Nov. 16,";
    # returns ['2020-MM-DD', <prefix>].
    var_arr = var.split(' ')
    # Map the month abbreviation to its zero-padded month number.
    date_month = str(date_months.index(var_arr[1]) + 1).zfill(2)
    date_form = '2020-' + date_month + '-' + var_arr[2]
    date_form_arr = [date_form, var_arr[0]]
    return date_form_arr

def csv_append_row(csv_name, csv_row, csv_headers):
    # Append one dict (keyed by csv_headers) as a new row; newline='' keeps the
    # csv module from inserting blank lines on some platforms.
    with open(csv_name, 'a', newline='') as fd:
        writer = csv.DictWriter(fd, fieldnames=csv_headers)
        writer.writerow(csv_row)
# csv_append_row(csv_name, my_list, csv_headers)

def scrape_longwood():
    csv_headers = ["Active Cases", "Total Cumulative Cases", "Total Students", "Total Faculty", "Date"]
    date_months = ['Jan.', 'Feb.', 'Mar.', 'Apr.', 'May', 'Jun.', 'Jul.', 'Aug.', 'Sep.', 'Oct.', 'Nov.', 'Dec.']
    url = 'http://www.longwood.edu/covid19/dashboard/'
    csv_name = 'longwood-covid-19.csv'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    # Second table: per-report-date rows with caseValue / reportDate cells.
    tables = soup.findAll('table')[1]
    # Column offsets, only used by the commented-out per-day export below.
    td_a = 0
    td_b = 1
    td_c = 2
    td_d = 3
    # First table: overall student and faculty totals.
    table_totals = soup.findAll('table')[0]
    table_totals_tds = table_totals.find_all('strong')
    total_students = table_totals_tds[0].text
    total_faculty = table_totals_tds[1].text
    # The "value" divs hold the active and cumulative confirmed case counts.
    total_active = soup.findAll('div', {'class': 'value'})
    total_active_cases = total_active[0].text
    total_cumulative_confirmed_cases = total_active[1].text
    td_cases = tables.find_all('td', {'class': 'caseValue'})
    td_dates = tables.find_all('td', {'class': 'reportDate'})
    date_intl = get_date_updated(soup, date_months)
    # Build one row matching csv_headers and append it to the CSV.
    new_dict = {}
    new_dict['Active Cases'] = total_active_cases
    new_dict['Total Cumulative Cases'] = total_cumulative_confirmed_cases
    new_dict['Total Students'] = total_students
    new_dict['Total Faculty'] = total_faculty
    new_dict['Date'] = date_intl
    # print(new_dict)
    csv_append_row(csv_name, new_dict, csv_headers)
    # csv_row_new_str = total_active_cases + ', ' + total_cumulative_confirmed_cases + ', ' + total_students + ', ' + total_faculty + ', ' + datxxx
    """these_rows = []
    for td in td_dates:
        this_date = format_date(td.text, date_months)
        this_row = [this_date[1], this_date[0], td_cases[td_a].text, td_cases[td_b].text, td_cases[td_c].text, td_cases[td_d].text]
        these_rows.append(this_row)
        td_a = td_a + 4
        td_b = td_b + 4
        td_c = td_c + 4
        td_d = td_d + 4"""


scrape_longwood()
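
Because csv_append_row() only ever appends, longwood-covid-19.csv needs its header row written once before the first scrape. A minimal sketch of that seeding step, assuming the file sits next to the script; the write_header_if_missing helper and the os.path.exists check are assumptions here, not part of the gist:

import csv
import os

def write_header_if_missing(csv_name, csv_headers):
    # Hypothetical helper: create the CSV with only its header row the first
    # time the scraper runs; later runs leave the existing file untouched.
    if not os.path.exists(csv_name):
        with open(csv_name, 'w', newline='') as fd:
            writer = csv.DictWriter(fd, fieldnames=csv_headers)
            writer.writeheader()

write_header_if_missing('longwood-covid-19.csv',
                        ["Active Cases", "Total Cumulative Cases",
                         "Total Students", "Total Faculty", "Date"])

With the header in place, each run of scrape_longwood() (for example from a daily cron entry) appends one dated row to the file.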