Last active
April 17, 2020 18:01
-
-
Save mfitzp/29522e2ac4057bf01745 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup
# Python 3.x
from urllib.request import urlopen, urlretrieve

# Script: fetch one page, find every absolute http:// link that points at a
# spreadsheet/CSV file, and download each file into the current working
# directory under the last path component of its URL.

# Page whose links are scanned (note: no trailing "/" after the query string).
url = 'https://www.rbi.org.in/Scripts/bs_viewcontent.aspx?Id=2009'

# Lower-case file extensions of the links worth downloading.
extensions = ('.csv', '.xls', '.xlsx')

# The context manager closes the HTTP response even if read()/decode() raises.
with urlopen(url) as response:
    html = response.read().decode('utf-8')

soup = BeautifulSoup(html, "html.parser")

# Select all A elements that have an href attribute, starting with http://
for link in soup.select('a[href^="http://"]'):
    href = link.get('href')

    # str.endswith accepts a tuple of suffixes; lower-casing first means
    # links such as "REPORT.XLS" are not silently skipped.
    if not href.lower().endswith(extensions):
        continue

    # Local file name = everything after the last "/" of the URL.
    filename = href.rsplit('/', 1)[-1]

    # No join + quote is needed because the URLs in the HTML are absolute.
    # However, we need a https:// URL (in spite of what the link says: check
    # the request in your web browser's developer tools).
    # count=1 rewrites only the leading scheme (the selector guarantees the
    # href starts with "http://"), leaving any later "http://" inside the
    # URL, e.g. in a query string, untouched.
    href = href.replace('http://', 'https://', 1)

    print("Downloading %s to %s..." % (href, filename))
    urlretrieve(href, filename)
    print("Done.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment