Skip to content

Instantly share code, notes, and snippets.

@nagasudhirpulla
Last active July 31, 2024 05:19
Show Gist options
  • Save nagasudhirpulla/568972f9720263cde4a8698683fe9d4b to your computer and use it in GitHub Desktop.
Save nagasudhirpulla/568972f9720263cde4a8698683fe9d4b to your computer and use it in GitHub Desktop.
This script gets links from a directory listing web page and downloads the files to a local directory.
# this script gets links from a directory listing web page and downloads the files from it
import urllib.request
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import requests
import os
import argparse
# Command-line interface for the downloader script
parser = argparse.ArgumentParser()
parser.add_argument('--src', type=str, default="",
                    help='Source page URL')
parser.add_argument('--dest', type=str, default="",
                    help='Destination folder')
parser.add_argument('--ext', type=str, default="",
                    help='file extensions to be downloaded')
parser.add_argument('--allow_no_extensions', action='store_true',
                    help='Download filenames with no extension')
parser.add_argument('--force_write', action='store_true',
                    help='replace all files in destination even if present')
args = parser.parse_args()
def getAllFileLinksFromPage(pageUrl: str, fileExtensions: list[str], isAllowNoExtensions: bool) -> list[tuple[str, str]]:
    """Scrape a directory-listing page and return (filename, absolute URL) pairs.

    pageUrl: URL of the static file hosting / directory listing page.
    fileExtensions: extensions to keep (each including the leading dot);
        an empty list keeps every linked filename.
    isAllowNoExtensions: when fileExtensions is empty, also keep names
        that have no file extension at all.
    Returns a list of (filename, fileUrl) tuples.
    """
    # download the page HTML
    pageHtml = requests.get(pageUrl).text
    fileLinks: list[tuple[str, str]] = []
    # get all the anchor tags from the page
    linkObjects = BeautifulSoup(pageHtml, 'html.parser').find_all('a')
    for lObj in linkObjects:
        # skip anchors with no href attribute or no visible text
        # (lObj.get("href") is None for bare <a> tags; contents is empty for <a></a>)
        fLink = lObj.get("href")
        if fLink is None or len(lObj.contents) == 0:
            continue
        # the filename is the anchor's displayed text
        fName: str = str(lObj.contents[0])
        # keep only filenames with one of the desired extensions
        if len(fileExtensions) > 0 and not fName.endswith(tuple(fileExtensions)):
            continue
        # when no extensions were requested, optionally exclude names without any extension
        if (not isAllowNoExtensions) and len(fileExtensions) == 0 and len(os.path.splitext(fName)[1]) == 0:
            continue
        # resolve relative hrefs ("/a/b.csv" as well as plain "b.csv") against the
        # page URL; urljoin leaves already-absolute URLs untouched, so it is safe
        # to apply unconditionally (the old startswith("/") test missed bare
        # relative links and produced broken URLs)
        fLink = urljoin(pageUrl, fLink)
        fileLinks.append((fName, fLink))
    print(f"{len(fileLinks)} links found in page")
    return fileLinks
def downloadFilestoFolder(fileLinks: list[tuple[str, str]], destFolderPath: str, isForceWrite: bool):
    """Download each (filename, url) pair in fileLinks into destFolderPath.

    Files that already exist in the destination folder are skipped unless
    isForceWrite is True, in which case every file is fetched again.
    """
    if not fileLinks:
        return
    # names of files already sitting in the destination folder (informational only)
    existingNames = [entry.name for entry in os.scandir(destFolderPath) if entry.is_file()]
    print(f"{len(existingNames)} files present initially in destination folder")
    downloadCount = 0
    for fName, fUrl in fileLinks:
        targetPath = os.path.join(destFolderPath, fName)
        # skip files already on disk unless a forced overwrite was requested
        if not isForceWrite and os.path.isfile(targetPath):
            continue
        # fetch the file from its URL and save it in the local folder
        urllib.request.urlretrieve(fUrl, targetPath)
        downloadCount += 1
    print(f"{downloadCount} files downloaded to destination folder")
# ---- script entry: read CLI args, validate them, then download ----
srcPageUrl: str = args.src
destFolder: str = args.dest
# parse the comma separated extension list; an empty --ext means "all files"
desiredFileExtensions: list[str] = (
    [x.strip() for x in args.ext.split(",")] if args.ext != "" else [])
isAllowNoExtensions: bool = args.allow_no_extensions
isForceWrite: bool = args.force_write
# normalize extensions so each one starts with a dot (e.g. "csv" -> ".csv")
desiredFileExtensions = [
    e if e.startswith(".") else "." + e for e in desiredFileExtensions]
print(f"Source page URL: {srcPageUrl}")
print(f"Destination folder path: {destFolder}")
print(f"File extensions for download: {desiredFileExtensions}")
# validate the inputs before doing any network or disk work
if srcPageUrl == "":
    print("use --src to specify source page URL")
if destFolder == "":
    print("use --dest to specify destination folder path")
isDestFolderPresent = os.path.isdir(destFolder)
if not isDestFolderPresent:
    print("destination folder not present")
if srcPageUrl == "" or destFolder == "" or not isDestFolderPresent:
    # exit with a non-zero status so shells/schedulers can detect the failure
    # (the previous exit(0) reported success even on misconfiguration)
    print("exiting without copying")
    exit(1)
downloadFilestoFolder(getAllFileLinksFromPage(
    srcPageUrl, desiredFileExtensions, isAllowNoExtensions), destFolder, isForceWrite)
print("execution complete!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment