Last active
August 26, 2019 18:03
-
-
Save Echocage/97d0fb82c3894f56809a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
from time import sleep
from urllib.parse import urljoin, urlparse
from urllib.request import urlretrieve

import requests
from bs4 import BeautifulSoup as bs
def _linked_file_name(href, suffix):
    """Return the file name *href* points at if it ends with *suffix*, else ''.

    Only the path component of the link is inspected, so a query string or
    fragment (``report.pdf?dl=1``) never ends up in the file name. A missing
    ``href`` (``None``) or a link whose path has no file name yields ''.
    """
    if not href:
        return ''
    name = urlparse(href).path.rpartition('/')[-1]
    return name if name.endswith(suffix) else ''


def get_files():
    """Interactively download every linked file of a given type from a page.

    Prompts for a page URL, a file suffix (e.g. ``.pdf``) and a local
    directory, fetches the page, and downloads each ``<a href=...>`` target
    whose file name ends with the suffix into that directory. Afterwards a
    summary is printed via ``print_message`` and the user is asked (via
    ``repeat``) whether to scrape another URL; the loop ends when they
    decline.

    Returns:
        None.

    Raises:
        Network and file-system errors from ``requests.get`` and
        ``urlretrieve`` are not caught and propagate to the caller.
    """
    while True:
        url = input("Enter the URL you want to scrape from: ")
        suffix = input("What type of file do you want to scrape? \nExamples: .png, .pdf, .doc - ")
        filepath = input("Specify a file path to save to: ")

        if not url.startswith(('http://', 'https://')):
            url = 'http://' + url

        response = requests.get(url)
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = bs(response.text, 'html.parser')

        # Match on the href's file name rather than on the whole tag text:
        # anchors without an href, or that merely mention the suffix in
        # their label, are skipped instead of crashing the download loop.
        link_list = [
            href
            for href in (anchor.get('href') for anchor in soup.find_all('a'))
            if _linked_file_name(href, suffix)
        ]

        for href in link_list:
            # urljoin resolves relative, root-relative and absolute hrefs
            # against the URL the page was actually served from.
            file_url = urljoin(response.url, href)
            # os.path.join keeps the destination valid on every platform.
            destination = os.path.join(filepath, _linked_file_name(href, suffix))
            urlretrieve(file_url, destination)

        print_message(link_list, suffix)
        if not repeat(input("Scrape from another URL? ")):
            break
def print_message(lst, suffix):
    """Report the outcome of a scrape to the user.

    Input: lst    - list of the file links that were processed
           suffix - string with the file extension that was requested

    Prints a "no files found" notice when the list is empty, otherwise a
    completion notice naming the file type.
    """
    if not lst:
        print("No files of type", suffix, "were found.")
        return
    print("Finished. Downloaded all files of type", suffix)
def repeat(decision):
    """Decide whether the scraper should run again.

    Input: decision - the user's answer string; any answer starting with
                      'y' or 'Y' counts as yes

    Returns True to continue. Otherwise prints a closing message, waits
    three seconds, prints a goodbye, and returns False.
    """
    run_again = decision.lower().startswith("y")
    if not run_again:
        print("Closing program...")
        sleep(3)
        print("Goodbye")
    return run_again
# Start the interactive scraper only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    get_files()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment