Last active
July 31, 2024 05:19
-
-
Save nagasudhirpulla/568972f9720263cde4a8698683fe9d4b to your computer and use it in GitHub Desktop.
This script gets links from a directory-listing web page and downloads the files to a local directory.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# this script gets links from a directory listing web page and downloads the files from it
import urllib.request | |
from urllib.parse import urljoin | |
from bs4 import BeautifulSoup | |
import requests | |
import os | |
import argparse | |
# Command-line interface for the downloader script.
parser = argparse.ArgumentParser()
# string-valued options: (flag, help text)
for optFlag, optHelp in [
        ('--src', 'Source page URL'),
        ('--dest', 'Destination folder'),
        ('--ext', 'file extensions to be downloaded')]:
    parser.add_argument(optFlag, help=optHelp, type=str, default="")
# boolean switches: (flag, help text)
for optFlag, optHelp in [
        ('--allow_no_extensions', 'Download filenames with no extension'),
        ('--force_write', 'replace all files in destination even if present')]:
    parser.add_argument(optFlag, help=optHelp, action='store_true')
args = parser.parse_args()
def getAllFileLinksFromPage(pageUrl: str, fileExtensions: list[str], isAllowNoExtensions: bool) -> list[tuple[str, str]]:
    """Scrape a directory-listing page and return (filename, absolute URL) pairs.

    Parameters:
        pageUrl: URL of the static file hosting / directory listing page.
        fileExtensions: extensions (with leading dot) to keep; empty list keeps all names.
        isAllowNoExtensions: when fileExtensions is empty, whether to also keep
            names that have no extension at all.

    Returns:
        list of (filename, downloadUrl) tuples for the matching links.
    """
    # download the page HTML
    pageHtml = requests.get(pageUrl).text
    fileLinks: list[tuple[str, str]] = []
    # examine every anchor tag on the page
    for lObj in BeautifulSoup(pageHtml, 'html.parser').find_all('a'):
        # BUG FIX: skip anchors with no text content (original raised IndexError)
        if not lObj.contents:
            continue
        fName: str = str(lObj.contents[0])
        # BUG FIX: skip anchors without an href (original crashed on .startswith of None)
        fLink = lObj.get("href")
        if fLink is None:
            continue
        # check if the filename has one of the desired extensions
        if len(fileExtensions) > 0 and not fName.endswith(tuple(fileExtensions)):
            continue
        # exclude filenames without any extension (only when no filter was given)
        if (not isAllowNoExtensions) and len(fileExtensions) == 0 and len(os.path.splitext(fName)[1]) == 0:
            continue
        # BUG FIX: resolve ANY relative href against the page URL; the original
        # only handled hrefs starting with "/", so plain relative links like
        # "file.txt" (the usual form in directory listings) stayed unusable.
        # urljoin leaves already-absolute URLs untouched.
        fLink = urljoin(pageUrl, fLink)
        fileLinks.append((fName, fLink))
    print(f"{len(fileLinks)} links found in page")
    return fileLinks
def downloadFilestoFolder(fileLinks: list[tuple[str, str]], destFolderPath: str, isForceWrite: bool):
    """Download each (filename, url) pair in fileLinks into destFolderPath.

    A file already present in the destination folder is skipped unless
    isForceWrite is True. Does nothing when fileLinks is empty.
    """
    if not fileLinks:
        return
    # count files already present in the folder, purely for reporting
    existingNames = [entry.name for entry in os.scandir(destFolderPath) if entry.is_file()]
    print(f"{len(existingNames)} files present initially in destination folder")
    downloadedCount: int = 0
    for fName, fUrl in fileLinks:
        targetPath = os.path.join(destFolderPath, fName)
        # skip files already on disk unless overwriting was requested
        if not isForceWrite and os.path.isfile(targetPath):
            continue
        # fetch the file from its URL and save it in the local folder
        urllib.request.urlretrieve(fUrl, targetPath)
        downloadedCount += 1
    print(f"{downloadedCount} files downloaded to destination folder")
# transfer weekly atc files | |
# Read the parsed command-line options into locals.
srcPageUrl: str = args.src
destFolder: str = args.dest
isAllowNoExtensions: bool = args.allow_no_extensions
isForceWrite: bool = args.force_write

# Normalize the comma-separated extension list: strip whitespace and
# ensure every entry carries a leading dot.
desiredFileExtensions: list[str] = []
if args.ext != "":
    for rawExt in args.ext.split(","):
        ext = rawExt.strip()
        desiredFileExtensions.append(ext if ext.startswith(".") else "." + ext)

print(f"Source page URL: {srcPageUrl}")
print(f"Destination folder path: {destFolder}")
print(f"File extensions for download: {desiredFileExtensions}")

# Validate the inputs; report every problem before bailing out.
hasInputError = False
if srcPageUrl == "":
    print("use --src to specify source page URL")
    hasInputError = True
if destFolder == "":
    print("use --dest to specify destination folder path")
    hasInputError = True
if not os.path.isdir(destFolder):
    print("destination folder not present")
    hasInputError = True
if hasInputError:
    print("exiting without copying")
    exit(0)

# Scrape the listing page, then fetch the matching files locally.
pageFileLinks = getAllFileLinksFromPage(srcPageUrl, desiredFileExtensions, isAllowNoExtensions)
downloadFilestoFolder(pageFileLinks, destFolder, isForceWrite)
print("execution complete!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.