arifsuhan · August 30, 2021 17:14
diff --git a/soupMeetsWget.py b/soupMeetsWget.py
 # @title Get URLs

 import os
 import sys
 import wget
 import pandas as pd
 from bs4 import BeautifulSoup

 def get_soup(url):
     """Fetch ``url`` over HTTP and parse the response body as HTML.

     Args:
         url: Address of the page to download.

     Returns:
         A ``BeautifulSoup`` tree built from the response body with the
         ``html.parser`` backend.

     Raises:
         requests.RequestException: If the request fails, times out, or the
             server answers with an HTTP error status.
     """
     # Imported locally because the module's import block does not provide
     # ``requests``; without this the call below fails with NameError.
     import requests

     # Without a timeout a stalled server would block the scrape indefinitely.
     response = requests.get(url, timeout=30)
     # Surface 4xx/5xx responses here instead of parsing an error page later.
     response.raise_for_status()
     return BeautifulSoup(response.content, 'html.parser')

 def parse_homepage(data):
  """Collect the ``href`` value of each item inside one element of ``data``.

  NOTE: ``[tag-name]`` and ``[id-name]`` are unfilled template placeholders.
  As written, Python reads them as list literals wrapping the subtractions
  ``tag - name`` and ``id - name``, so the call raises NameError unless
  those names happen to be defined. Replace them with the real tag name
  and element id (as strings) before running.

  Args:
    data: Parsed page exposing ``find`` (presumably the BeautifulSoup
      object returned by ``get_soup`` -- confirm against the caller).

  Returns:
    A list holding ``x['href']`` for every ``x`` obtained by iterating
    over the result of ``data.find(...)``.
  """
  # NOTE(review): if ``data`` is a BeautifulSoup object, ``find`` yields only
  # the first match (or None), so this loop walks that element's children.
  # Confirm ``find_all`` was not intended and that every child has ``href``.
  return [ x['href'] for x in data.find([tag-name], {"id": [id-name]})]

 def parse_page(data):
  """Extract a display name and a media URL from one parsed page.

  NOTE: ``[tag-name]`` is an unfilled template placeholder; as written it
  evaluates as a list wrapping ``tag - name`` and raises NameError unless
  those names are defined. The id filter is the literal string
  ``"[id-name]"``, which also looks like a placeholder to be replaced.

  Args:
    data: Parsed page exposing ``find`` (presumably the BeautifulSoup
      object returned by ``get_soup`` -- confirm against the caller).

  Returns:
    ``[temp_name, temp_url]``: the ``.text`` of the element selected by
    tag and id, followed by the ``src`` attribute of the element found
    for ``"source"``.
  """
  # NOTE(review): presumably ``find`` returns None when nothing matches, in
  # which case the subscript / ``.text`` access below raises -- verify.
  temp_url = data.find("source")['src']
  temp_name = data.find([tag-name], {"id": "[id-name]"}).text
  return [temp_name, temp_url]

 # Progress callback handed to wget.download() through its ``bar`` argument.
 def bar_progress(current, total, width=80):
     """Write a single-line download progress message to stdout.

     Args:
         current: Number of bytes received so far.
         total: Expected size in bytes. Zero or a negative value is treated
             as "size unknown".
         width: Unused; kept so existing callers that pass a width keep
             working.

     Returns:
         None. The message is written directly to ``sys.stdout``.
     """
     if total > 0:
         progress_message = "Downloading: %d%% [%d / %d] bytes" % (
             current / total * 100, current, total)
     else:
         # A percentage would divide by zero (total == 0) or be meaningless
         # (negative total), so report only the byte count.
         progress_message = "Downloading: %d bytes" % max(current, 0)
     # "\r" moves back to the start of the line so each update overwrites the
     # previous one; print() would emit a new line for every update.
     sys.stdout.write("\r" + progress_message)
     sys.stdout.flush()

 def get_url(base_url, min_page_index, max_page_index):
     """Scrape a range of listing pages and parse every page they link to.

     Args:
         base_url: Prefix to which each page index is appended to form the
             listing URL.
         min_page_index: First page index to fetch (inclusive).
         max_page_index: Last page index to fetch (inclusive).

     Returns:
         A list with the ``parse_page`` result for each link returned by
         ``parse_homepage``, in the order the links were collected.
     """
     # Pass 1: gather the links from every listing page before visiting any.
     page_links = []
     for index in range(min_page_index, max_page_index + 1):
         listing = get_soup(base_url + str(index))
         page_links.extend(parse_homepage(listing))

     # Pass 2: fetch and parse each linked page.
     results = []
     for link in page_links:
         results.append(parse_page(get_soup(link)))
     return results

 def download_file(file_URL):
     """Download ``file_URL`` with wget, reporting progress via ``bar_progress``.

     Args:
         file_URL: Address of the file to download.

     Returns:
         The file name returned by ``wget.download``.
     """
     file_name = wget.download(file_URL, bar=bar_progress)
     # The progress callback writes "\r"-prefixed updates without a newline,
     # so finish that line before printing the completion message.
     print()
     print(file_name + " Successfully downloaded")
     return file_name

 def rename_file(old_name, new_name):
     """Rename the file at ``old_name`` to ``new_name`` using ``os.rename``.

     Args:
         old_name: Current path of the file.
         new_name: Path the file should have afterwards.
     """
     os.rename(src=old_name, dst=new_name)

 def run(base_url='', min_page_index=1, max_page_index=7):
     """Download every file linked from the listing pages and rename it.

     Args:
         base_url: Listing URL prefix; the page index is appended to it.
             The empty default is a placeholder and must be overridden.
         min_page_index: First listing page to scrape (inclusive).
         max_page_index: Last listing page to scrape (inclusive). The
             defaults keep the previously hard-coded range of 1 to 7.

     Raises:
         ValueError: If ``base_url`` is empty.
     """
     # Fail early with a clear message instead of an obscure error from the
     # first HTTP request made with an empty URL.
     if not base_url:
         raise ValueError("base_url must be set to the listing URL prefix")

     for file_name, file_URL in get_url(base_url, min_page_index, max_page_index):
         # wget chooses the on-disk name; rename it to the scraped title.
         old_name = download_file(file_URL)
         rename_file(old_name, file_name)
	# @title Get URLs

	import os
	import sys
	import wget
	import pandas as pd
	from bs4 import BeautifulSoup

	def get_soup(url):
	    """Fetch ``url`` over HTTP and parse the response body as HTML.

	    Args:
	        url: Address of the page to download.

	    Returns:
	        A ``BeautifulSoup`` tree built from the response body with the
	        ``html.parser`` backend.

	    Raises:
	        requests.RequestException: If the request fails, times out, or the
	            server answers with an HTTP error status.
	    """
	    # Imported locally because the module's import block does not provide
	    # ``requests``; without this the call below fails with NameError.
	    import requests

	    # Without a timeout a stalled server would block the scrape indefinitely.
	    response = requests.get(url, timeout=30)
	    # Surface 4xx/5xx responses here instead of parsing an error page later.
	    response.raise_for_status()
	    return BeautifulSoup(response.content, 'html.parser')

	def parse_homepage(data):
	    """Collect the ``href`` value of each item inside one element of ``data``.

	    NOTE: ``[tag-name]`` and ``[id-name]`` are unfilled template
	    placeholders. As written, Python reads them as list literals wrapping
	    the subtractions ``tag - name`` and ``id - name``, so the call raises
	    NameError unless those names happen to be defined. Replace them with
	    the real tag name and element id (as strings) before running.

	    Args:
	        data: Parsed page exposing ``find`` (presumably the BeautifulSoup
	            object returned by ``get_soup`` -- confirm against the caller).

	    Returns:
	        A list holding ``x['href']`` for every ``x`` obtained by iterating
	        over the result of ``data.find(...)``.
	    """
	    # NOTE(review): if ``data`` is a BeautifulSoup object, ``find`` yields
	    # only the first match (or None), so this loop walks that element's
	    # children. Confirm ``find_all`` was not intended.
	    return [x['href'] for x in data.find([tag-name], {"id": [id-name]})]

	def parse_page(data):
	    """Extract a display name and a media URL from one parsed page.

	    NOTE: ``[tag-name]`` is an unfilled template placeholder; as written it
	    evaluates as a list wrapping ``tag - name`` and raises NameError unless
	    those names are defined. The id filter is the literal string
	    ``"[id-name]"``, which also looks like a placeholder to be replaced.

	    Args:
	        data: Parsed page exposing ``find`` (presumably the BeautifulSoup
	            object returned by ``get_soup`` -- confirm against the caller).

	    Returns:
	        ``[temp_name, temp_url]``: the ``.text`` of the element selected by
	        tag and id, followed by the ``src`` attribute of the element found
	        for ``"source"``.
	    """
	    # NOTE(review): presumably ``find`` returns None when nothing matches,
	    # in which case the subscript / ``.text`` access below raises -- verify.
	    temp_url = data.find("source")['src']
	    temp_name = data.find([tag-name], {"id": "[id-name]"}).text
	    return [temp_name, temp_url]

	# Progress callback handed to wget.download() through its ``bar`` argument.
	def bar_progress(current, total, width=80):
	    """Write a single-line download progress message to stdout.

	    Args:
	        current: Number of bytes received so far.
	        total: Expected size in bytes. Zero or a negative value is treated
	            as "size unknown".
	        width: Unused; kept so existing callers that pass a width keep
	            working.

	    Returns:
	        None. The message is written directly to ``sys.stdout``.
	    """
	    if total > 0:
	        progress_message = "Downloading: %d%% [%d / %d] bytes" % (
	            current / total * 100, current, total)
	    else:
	        # A percentage would divide by zero (total == 0) or be meaningless
	        # (negative total), so report only the byte count.
	        progress_message = "Downloading: %d bytes" % max(current, 0)
	    # "\r" moves back to the start of the line so each update overwrites
	    # the previous one; print() would emit a new line for every update.
	    sys.stdout.write("\r" + progress_message)
	    sys.stdout.flush()

	def get_url(base_url, min_page_index, max_page_index):
	    """Scrape a range of listing pages and parse every page they link to.

	    Args:
	        base_url: Prefix to which each page index is appended to form the
	            listing URL.
	        min_page_index: First page index to fetch (inclusive).
	        max_page_index: Last page index to fetch (inclusive).

	    Returns:
	        A list with the ``parse_page`` result for each link returned by
	        ``parse_homepage``, in the order the links were collected.
	    """
	    # Pass 1: gather the links from every listing page before visiting any.
	    page_links = []
	    for index in range(min_page_index, max_page_index + 1):
	        listing = get_soup(base_url + str(index))
	        page_links.extend(parse_homepage(listing))

	    # Pass 2: fetch and parse each linked page.
	    return [parse_page(get_soup(link)) for link in page_links]

	def download_file(file_URL):
	    """Download ``file_URL`` with wget, reporting progress via ``bar_progress``.

	    Args:
	        file_URL: Address of the file to download.

	    Returns:
	        The file name returned by ``wget.download``.
	    """
	    file_name = wget.download(file_URL, bar=bar_progress)
	    # The progress callback writes "\r"-prefixed updates without a newline,
	    # so finish that line before printing the completion message.
	    print()
	    print(file_name + " Successfully downloaded")
	    return file_name

	def rename_file(old_name, new_name):
	    """Rename the file at ``old_name`` to ``new_name`` using ``os.rename``.

	    Args:
	        old_name: Current path of the file.
	        new_name: Path the file should have afterwards.
	    """
	    os.rename(old_name, new_name)

	def run(base_url='', min_page_index=1, max_page_index=7):
	    """Download every file linked from the listing pages and rename it.

	    Args:
	        base_url: Listing URL prefix; the page index is appended to it.
	            The empty default is a placeholder and must be overridden.
	        min_page_index: First listing page to scrape (inclusive).
	        max_page_index: Last listing page to scrape (inclusive). The
	            defaults keep the previously hard-coded range of 1 to 7.

	    Raises:
	        ValueError: If ``base_url`` is empty.
	    """
	    # Fail early with a clear message instead of an obscure error from the
	    # first HTTP request made with an empty URL.
	    if not base_url:
	        raise ValueError("base_url must be set to the listing URL prefix")

	    for file_name, file_URL in get_url(base_url, min_page_index, max_page_index):
	        # wget chooses the on-disk name; rename it to the scraped title.
	        old_name = download_file(file_URL)
	        rename_file(old_name, file_name)