Skip to content

Instantly share code, notes, and snippets.

@mfitzp
Last active April 17, 2020 18:01
Show Gist options
  • Save mfitzp/29522e2ac4057bf01745 to your computer and use it in GitHub Desktop.
Save mfitzp/29522e2ac4057bf01745 to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
# Python 3.x
from urllib.request import urlopen, urlretrieve
# Script: fetch an RBI page, find every absolute http:// link that points at a
# spreadsheet-like file (.csv/.xls/.xlsx) and download each one into the
# current working directory under the link's final path component.

# Removed the trailing / from the URL
url = 'https://www.rbi.org.in/Scripts/bs_viewcontent.aspx?Id=2009'

# The response object is a context manager, so the connection is closed even
# if reading or decoding raises.
with urlopen(url) as u:
    html = u.read().decode('utf-8')

soup = BeautifulSoup(html, "html.parser")

# File extensions worth downloading; str.endswith accepts a tuple directly.
extensions = ('.csv', '.xls', '.xlsx')

# Select all A elements that have an href attribute, starting with http://
for link in soup.select('a[href^="http://"]'):
    href = link.get('href')
    if not href.endswith(extensions):
        continue

    # Local file name is whatever follows the last '/' in the URL.
    filename = href.rsplit('/', 1)[-1]

    # You don't need to join + quote as URLs in the HTML are absolute.
    # However, the server has to be requested over https:// in spite of what
    # the link says (check the request in your web browser's developer tools).
    # Only the leading scheme is rewritten (count=1); the selector above
    # guarantees the href starts with http://, and any later "http://" inside
    # the URL must be left untouched.
    href = href.replace('http://', 'https://', 1)

    print("Downloading %s to %s..." % (href, filename) )
    urlretrieve(href, filename)
    # NOTE(review): the original indentation was lost; reporting completion
    # once per downloaded file is assumed to be the intent.
    print("Done.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment