Skip to content

Instantly share code, notes, and snippets.

@phwelo
Last active March 18, 2020 01:30
Show Gist options
  • Save phwelo/e049bf47bcbb3e58d16522a85dfd5c44 to your computer and use it in GitHub Desktop.
Save phwelo/e049bf47bcbb3e58d16522a85dfd5c44 to your computer and use it in GitHub Desktop.
parse coronavirus data
#!/usr/bin/env python3
from bs4 import BeautifulSoup
import urllib.request, urllib.error, urllib.parse
# Published Google Sheet (HTML export) that this script scrapes; presumably a
# COVID-19 case tracker — TODO confirm the sheet is still live and has the same layout.
source_url = 'https://docs.google.com/spreadsheets/u/0/d/e/2PACX-1vR30F8lYP3jG7YOq8es0PBpJIE5yvRVZffOyaqC0GgMBN6yt0Q-NI8pxS7hd1F9dYXnowSC6zpZmW9D/pubhtml/sheet?gid=0'
def get_table_rows(url):
    """Fetch *url* and return every ``<tr>`` element of its 'waffle' table.

    Args:
        url: Address of an HTML page containing a ``<table class="waffle">``
            (the class Google Sheets uses for its published-HTML tables).

    Returns:
        A list of BeautifulSoup ``<tr>`` tag objects.
    """
    # `with` ensures the HTTP response is closed even on error
    # (the original leaked the connection).
    with urllib.request.urlopen(url) as response:
        html = response.read()
    web_content = BeautifulSoup(html, features="lxml")
    parsed_table = web_content.body.find('table', attrs={'class': 'waffle'})
    return parsed_table.find_all('tr')
def parse_rows(row_array):
    """Convert spreadsheet ``<tr>`` rows into per-country stat dicts.

    Args:
        row_array: Sequence of row objects exposing ``find_all('td')``,
            where each cell has a ``.text`` attribute (BeautifulSoup tags).

    Returns:
        List of dicts with keys ``'country'``, ``'cases'`` and ``'deaths'``
        (all values are the raw cell text, i.e. strings).
    """
    parsed_rows = []
    # The sheet's data starts at row 7; earlier rows are header/padding.
    # BUG FIX: iterate the row_array argument, not the module-level
    # `table_rows` global the original accidentally closed over.
    for row in row_array[7:]:
        all_td = row.find_all('td')
        current_row = {'country': all_td[0].text}
        # A row containing 'Queue' marks the end of the data section.
        if 'Queue' in current_row['country']:
            break
        current_row['cases'] = all_td[1].text
        current_row['deaths'] = all_td[2].text
        parsed_rows.append(current_row)
    return parsed_rows
# Script entry point: fetch the published sheet, parse it, print the result.
# Guarded so importing this module performs no network I/O; the assignments
# stay module-global when run as a script, preserving prior behavior.
if __name__ == "__main__":
    table_rows = get_table_rows(source_url)
    final_obj = parse_rows(table_rows)
    print(final_obj)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment