Last active
July 31, 2024 05:19
-
-
Save nagasudhirpulla/568972f9720263cde4a8698683fe9d4b to your computer and use it in GitHub Desktop.
This script gets links from a directory-listing web page and downloads the files to a local directory.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# this script gets links from a directory listing web page and downloads the files from it
import urllib.request | |
from urllib.parse import urljoin | |
from bs4 import BeautifulSoup | |
import requests | |
import os | |
import argparse | |
# Command-line interface for the downloader script.
parser = argparse.ArgumentParser()
# string-valued options: (flag, help text)
for optFlag, optHelp in [
        ('--src', 'Source page URL'),
        ('--dest', 'Destination folder'),
        ('--ext', 'file extensions to be downloaded')]:
    parser.add_argument(optFlag, help=optHelp, type=str, default="")
# boolean switches: (flag, help text)
for optFlag, optHelp in [
        ('--allow_no_extensions', 'Download filenames with no extension'),
        ('--force_write', 'replace all files in destination even if present')]:
    parser.add_argument(optFlag, help=optHelp, action='store_true')
args = parser.parse_args()
def getAllFileLinksFromPage(pageUrl: str, fileExtensions: list[str], isAllowNoExtensions: bool) -> list[tuple[str, str]]:
    """Scrape a directory-listing page and return (filename, absolute URL) pairs.

    Parameters:
        pageUrl: URL of the static file hosting / directory listing page.
        fileExtensions: extensions (with leading dot) to keep; empty list keeps all names.
        isAllowNoExtensions: when fileExtensions is empty, whether to also keep
            names that have no extension at all.

    Returns:
        list of (filename, downloadUrl) tuples for the matching links.
    """
    # download the page HTML
    pageHtml = requests.get(pageUrl).text
    fileLinks: list[tuple[str, str]] = []
    # examine every anchor tag on the page
    for lObj in BeautifulSoup(pageHtml, 'html.parser').find_all('a'):
        # BUG FIX: skip anchors with no text content (original raised IndexError)
        if not lObj.contents:
            continue
        fName: str = str(lObj.contents[0])
        # BUG FIX: skip anchors without an href (original crashed on .startswith of None)
        fLink = lObj.get("href")
        if fLink is None:
            continue
        # check if the filename has one of the desired extensions
        if len(fileExtensions) > 0 and not fName.endswith(tuple(fileExtensions)):
            continue
        # exclude filenames without any extension (only when no filter was given)
        if (not isAllowNoExtensions) and len(fileExtensions) == 0 and len(os.path.splitext(fName)[1]) == 0:
            continue
        # BUG FIX: resolve ANY relative href against the page URL; the original
        # only handled hrefs starting with "/", so plain relative links like
        # "file.txt" (the usual form in directory listings) stayed unusable.
        # urljoin leaves already-absolute URLs untouched.
        fLink = urljoin(pageUrl, fLink)
        fileLinks.append((fName, fLink))
    print(f"{len(fileLinks)} links found in page")
    return fileLinks
def downloadFilestoFolder(fileLinks: list[tuple[str, str]], destFolderPath: str, isForceWrite: bool):
    """Download each (filename, url) pair in fileLinks into destFolderPath.

    A file already present in the destination folder is skipped unless
    isForceWrite is True. Does nothing when fileLinks is empty.
    """
    if not fileLinks:
        return
    # count files already present in the folder, purely for reporting
    existingNames = [entry.name for entry in os.scandir(destFolderPath) if entry.is_file()]
    print(f"{len(existingNames)} files present initially in destination folder")
    downloadedCount: int = 0
    for fName, fUrl in fileLinks:
        targetPath = os.path.join(destFolderPath, fName)
        # skip files already on disk unless overwriting was requested
        if not isForceWrite and os.path.isfile(targetPath):
            continue
        # fetch the file from its URL and save it in the local folder
        urllib.request.urlretrieve(fUrl, targetPath)
        downloadedCount += 1
    print(f"{downloadedCount} files downloaded to destination folder")
# transfer weekly atc files | |
# Read the parsed command-line options into locals.
srcPageUrl: str = args.src
destFolder: str = args.dest
isAllowNoExtensions: bool = args.allow_no_extensions
isForceWrite: bool = args.force_write

# Normalize the comma-separated extension list: strip whitespace and
# ensure every entry carries a leading dot.
desiredFileExtensions: list[str] = []
if args.ext != "":
    for rawExt in args.ext.split(","):
        ext = rawExt.strip()
        desiredFileExtensions.append(ext if ext.startswith(".") else "." + ext)

print(f"Source page URL: {srcPageUrl}")
print(f"Destination folder path: {destFolder}")
print(f"File extensions for download: {desiredFileExtensions}")

# Validate the inputs; report every problem before bailing out.
hasInputError = False
if srcPageUrl == "":
    print("use --src to specify source page URL")
    hasInputError = True
if destFolder == "":
    print("use --dest to specify destination folder path")
    hasInputError = True
if not os.path.isdir(destFolder):
    print("destination folder not present")
    hasInputError = True
if hasInputError:
    print("exiting without copying")
    exit(0)

# Scrape the listing page, then fetch the matching files locally.
pageFileLinks = getAllFileLinksFromPage(srcPageUrl, desiredFileExtensions, isAllowNoExtensions)
downloadFilestoFolder(pageFileLinks, destFolder, isForceWrite)
print("execution complete!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.