Last active
January 22, 2023 15:55
-
-
Save julia-git/5fb77bbfbbf08a5b9f1f8e7e739d9817 to your computer and use it in GitHub Desktop.
webscraping_nyc_mta
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import libraries
import time
import urllib.request
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Set the URL you want to webscrape from
url = 'http://web.mta.info/developers/turnstile.html'

# Substring that identifies a link to a turnstile data file. Selecting links
# by content instead of by position (the old "start at the 36th <a> tag" rule)
# keeps the script working when the surrounding page layout changes.
DATA_LINK_MARKER = '/turnstile_'


def is_turnstile_link(href):
    """Return True if *href* points at a turnstile data file.

    An href qualifies when it is non-empty and contains ``/turnstile_``.
    ``None`` and the empty string are rejected.
    """
    return bool(href) and DATA_LINK_MARKER in href


def local_filename(href):
    """Return the local file name for a turnstile data link.

    The name is everything after the slash that precedes ``turnstile_``,
    e.g. ``data/nyct/turnstile/turnstile_180922.txt`` gives
    ``turnstile_180922.txt``. Call only with hrefs accepted by
    ``is_turnstile_link``.
    """
    return href[href.find(DATA_LINK_MARKER) + 1:]


def download_turnstile_files(page_url=url, pause=1):
    """Download every turnstile data file linked from *page_url*.

    Files are saved in the current working directory under the name given by
    ``local_filename``.

    Args:
        page_url: Address of the HTML page listing the data files.
        pause: Seconds to sleep after each download.

    Returns:
        The number of files downloaded.

    Raises:
        requests.HTTPError: If the listing page responds with an error status.
    """
    # Connect to the URL; fail loudly on HTTP errors instead of parsing an
    # error page, and never hang forever on an unresponsive server.
    response = requests.get(page_url, timeout=30)
    response.raise_for_status()

    # Parse HTML and save to BeautifulSoup object
    soup = BeautifulSoup(response.text, "html.parser")

    downloaded = 0
    # 'a' tags are for links; href=True skips anchors that have no href at all
    for one_a_tag in soup.find_all('a', href=True):
        link = one_a_tag['href']
        if not is_turnstile_link(link):
            continue
        # urljoin resolves relative links against the page address and leaves
        # absolute links untouched.
        download_url = urljoin(page_url, link)
        urllib.request.urlretrieve(download_url, './' + local_filename(link))
        downloaded += 1
        time.sleep(pause)  # pause between requests
    return downloaded


if __name__ == "__main__":
    download_turnstile_files()
Hello Julian,
I'm new to Python. May I ask what this line means: "link = one_a_tag['href']"?
Hello. `link` is the name of the variable in which I am storing the value of one_a_tag['href'].
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks