Skip to content

Instantly share code, notes, and snippets.

@arifsuhan
Last active August 30, 2021 17:14
Show Gist options
  • Save arifsuhan/b9cdb50480b5faa8bc1cfd12fd959c76 to your computer and use it in GitHub Desktop.
Parse Web via BeautifulSoup -> download via Wget -> Rename File
# @title Get URLs
import os
import sys

import requests  # used by get_soup() below; was missing from the original gist
import wget
import pandas as pd
from bs4 import BeautifulSoup
def get_soup(url, timeout=30):
    """Fetch *url* and return its HTML parsed into a BeautifulSoup tree.

    Parameters:
        url: page address to fetch.
        timeout: seconds before requests.get gives up — without it a dead
            host makes the whole scrape hang forever.

    Raises:
        requests.HTTPError: on a non-2xx response, instead of silently
            parsing an error page as if it were the listing.
    """
    page = requests.get(url, timeout=timeout)
    page.raise_for_status()
    return BeautifulSoup(page.content, 'html.parser')
def parse_homepage(data, tag_name="a", id_name=None):
    """Collect every href inside the listing container of a homepage.

    The gist shipped with unfilled ``[tag-name]`` / ``[id-name]``
    placeholders, which are NameErrors at runtime; they are now
    parameters so the function is usable for any site layout.

    Parameters:
        data: a BeautifulSoup tree (anything exposing a compatible .find).
        tag_name: tag of the container element to locate.
        id_name: value matched against the container's ``id`` attribute.

    Returns:
        list of href strings, one per child element of the container.
    """
    return [x['href'] for x in data.find([tag_name], {"id": id_name})]
def parse_page(data, tag_name="h1", id_name=None):
    """Extract the [name, url] pair from a single media page.

    The URL comes from the first ``<source>`` tag's ``src`` attribute;
    the display name comes from the element matching tag_name/id_name.
    (The gist's ``[tag-name]`` placeholder was a NameError at runtime —
    now a parameter with a conservative default.)

    Parameters:
        data: a BeautifulSoup tree for one media page.
        tag_name: tag holding the title text.
        id_name: ``id`` attribute of the title element.

    Returns:
        [file_name, file_url] — the order download/rename code expects.
    """
    temp_url = data.find("source")['src']
    temp_name = data.find([tag_name], {"id": id_name}).text
    return [temp_name, temp_url]
# Progress callback handed to wget.download; invoked once per chunk.
def bar_progress(current, total, width=80):
    """Render an in-place "Downloading: N% [cur / total] bytes" line.

    Writes with '\r' + flush instead of print() so the line overwrites
    itself rather than spawning a new line per chunk.  Guards against
    total <= 0, which wget reports when the server omits Content-Length
    (the original divided by zero there).
    """
    percent = int(current / total * 100) if total > 0 else 0
    progress_message = "Downloading: %d%% [%d / %d] bytes" % (percent, current, total)
    sys.stdout.write("\r" + progress_message)
    sys.stdout.flush()
def get_url(base_url, min_page_index, max_page_index):
    """Gather [name, url] pairs from every listing page in the range.

    Visits base_url + str(i) for each i in
    [min_page_index, max_page_index], flattens the per-page link lists,
    then parses each linked media page into a [name, url] pair.
    """
    page_links = []
    for index in range(min_page_index, max_page_index + 1):
        page_links.extend(parse_homepage(get_soup(base_url + str(index))))
    return [parse_page(get_soup(link)) for link in page_links]
def download_file(file_URL):
    """Download *file_URL* via wget, showing progress with bar_progress.

    Returns:
        The local filename wget chose (needed later for renaming).
    """
    file_name = wget.download(file_URL, bar=bar_progress)
    # Fixed typo in the status message ("Succesfully" -> "Successfully").
    print(file_name + " Successfully downloaded")
    return file_name
def rename_file(old_name, new_name):
    """Move the downloaded file from wget's chosen name to *new_name*."""
    os.rename(old_name, new_name)
def run():
    """Entry point: scrape listing pages 1-7, download and rename each file.

    NOTE(review): base_url is left blank in the gist — fill it in with
    the target site before running.
    """
    base_url = ''
    for target_name, target_url in get_url(base_url, 1, 7):
        downloaded_as = download_file(target_url)
        rename_file(downloaded_as, target_name)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment