- `if __name__ == '__main__'` is just a convention to create a `main()` entry point for the code.
- Command-line arguments can easily be added by importing `sys` and using the `sys.argv` list.
- Argparse can also be used to create richer arguments, like this
- This basic version uses wget to download the file it found. You can add more arguments to wget using the existing `subprocess.run()` function.
- wget has some helpful options that may be considered depending on preference:
  - `--no-clobber` or `-nc` won't download a file that has the same name
  - `-N` will overwrite a file that has the same name
  - `-O` can be used to specify a file output name. This can be useful if a standardized file name is required (such as using the current date)
- curl can be used instead of the requests library like this
- For the pandas version, while testing you should probably download the file and read from the download, rather than fetch from the URL every time
Last active
December 22, 2021 19:27
-
-
Save Davey-Hughes/e158fb90a0a462f15ce4094fd140fb3e to your computer and use it in GitHub Desktop.
Download CO Covid Outbreak File
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import subprocess | |
import requests | |
from bs4 import BeautifulSoup | |
CO_COVID_URL = 'https://covid19.colorado.gov/covid19-outbreak-data' | |
def main():
    """Find the outbreak data file linked on the Colorado page and download it.

    Fetches CO_COVID_URL, locates the first element with class 'file-link',
    reads the href of the first anchor inside it, and passes that URL to the
    ``wget`` command-line utility, which saves the file in the current
    working directory.

    Raises:
        requests.RequestException: if the page cannot be fetched (including
            a timeout or an HTTP error status).
        RuntimeError: if the page has no 'file-link' element, or that element
            contains no anchor with an href.
        subprocess.CalledProcessError: if wget exits with a non-zero status.
    """
    # imported here so the block does not depend on a new top-of-file import
    from urllib.parse import urljoin

    # download page source; the timeout keeps a stalled server from hanging us
    page = requests.get(CO_COVID_URL, timeout=30)
    # fail loudly on 4xx/5xx instead of parsing an error page
    page.raise_for_status()

    # parse page with beautiful soup's HTML parser
    soup = BeautifulSoup(page.text, 'html.parser')

    # find the class with name 'file-link'; find() returns None when the
    # page layout changes, so report that explicitly
    file_class = soup.find(class_='file-link')
    if file_class is None:
        raise RuntimeError(
            f"no element with class 'file-link' found at {CO_COVID_URL}"
        )

    # get the <a href> tag from the found class
    link = file_class.find('a', href=True)
    if link is None:
        raise RuntimeError(
            f"'file-link' element at {CO_COVID_URL} contains no <a href>"
        )

    # resolve a relative href against the page URL; absolute URLs pass
    # through urljoin unchanged
    file_href = urljoin(CO_COVID_URL, link['href'])

    # download the file using wget command line utility
    subprocess.run(['wget', file_href], check=True)


if __name__ == '__main__':
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import subprocess | |
import requests | |
from bs4 import BeautifulSoup | |
import pandas as pd | |
CO_COVID_URL = 'https://covid19.colorado.gov/covid19-outbreak-data' | |
def main():
    """Find the outbreak data file linked on the Colorado page and load it.

    Fetches CO_COVID_URL, locates the first element with class 'file-link',
    reads the href of the first anchor inside it, and reads the workbook at
    that URL with pandas. The loaded sheets are printed to stdout.

    Raises:
        requests.RequestException: if the page cannot be fetched (including
            a timeout or an HTTP error status).
        RuntimeError: if the page has no 'file-link' element, or that element
            contains no anchor with an href.
    """
    # imported here so the block does not depend on a new top-of-file import
    from urllib.parse import urljoin

    # download page source; the timeout keeps a stalled server from hanging us
    page = requests.get(CO_COVID_URL, timeout=30)
    # fail loudly on 4xx/5xx instead of parsing an error page
    page.raise_for_status()

    # parse page with beautiful soup's HTML parser
    soup = BeautifulSoup(page.text, 'html.parser')

    # find the class with name 'file-link'; find() returns None when the
    # page layout changes, so report that explicitly
    file_class = soup.find(class_='file-link')
    if file_class is None:
        raise RuntimeError(
            f"no element with class 'file-link' found at {CO_COVID_URL}"
        )

    # get the <a href> tag from the found class
    link = file_class.find('a', href=True)
    if link is None:
        raise RuntimeError(
            f"'file-link' element at {CO_COVID_URL} contains no <a href>"
        )

    # resolve a relative href against the page URL; absolute URLs pass
    # through urljoin unchanged
    file_href = urljoin(CO_COVID_URL, link['href'])

    # load all sheets from the URL; with sheet_name=None pandas returns a
    # dict mapping each sheet name to its own DataFrame
    sheets = pd.read_excel(file_href, sheet_name=None)

    # print a preview of every loaded sheet
    print(sheets)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment