-
-
Save Jwely/ad8eb800bacef9e34dd775f9b3aad987 to your computer and use it in GitHub Desktop.
import ftplib | |
import os | |
import re | |
""" | |
MIT license: 2017 - Jwely | |
Example usage: | |
``` python | |
import ftplib | |
ftp = ftplib.FTP(mysite, username, password) | |
download_ftp_tree(ftp, remote_dir, local_dir) | |
``` | |
The code above will look for a directory called "remote_dir" on the ftp host, and then duplicate the | |
directory and its entire contents into the "local_dir". | |
*** Note that if wget is an option, I recommend using that instead *** | |
""" | |
def _is_ftp_dir(ftp_handle, name, guess_by_extension=True):
    """
    Determine whether an item listed on the ftp server is a directory.

    :param ftp_handle: an authenticated ftplib.FTP instance
    :param name: path of the item on the server, as accepted by ``ftp_handle.cwd``
    :param guess_by_extension: when truthy, a name with a "." in the fourth to last
        position (a three character extension such as ".jpg") is assumed to be a file
        and the server is not queried at all.
    :return: True if the server lets us change into ``name``, otherwise False.
    """
    # if the name has a "." in the fourth to last position, its probably a file extension
    # this is MUCH faster than trying to set every file to a working directory,
    # and will work 99% of time.
    if guess_by_extension and len(name) >= 4 and name[-4] == '.':
        return False

    original_cwd = ftp_handle.pwd()  # remember the current working directory
    try:
        ftp_handle.cwd(name)  # try to set directory to new name
        ftp_handle.cwd(original_cwd)  # set it back to what it was
    except Exception as e:
        # ftplib.error_perm is the expected reply when ``name`` is a plain file.
        # Every other failure is deliberately treated the same way (best effort):
        # report it and classify the item as "not a directory".
        print(e)
        return False
    return True
def _make_parent_dir(fpath):
    """
    Ensure the parent directory of a filepath exists.

    :param fpath: path of a file whose containing directory should be present
    :raises OSError: if the directory is missing and cannot be created
    """
    dirname = os.path.dirname(fpath)
    # An empty dirname means the file lives directly in the current working
    # directory: there is nothing to create, and os.makedirs("") would fail.
    if not dirname or os.path.exists(dirname):
        return
    # exist_ok guards against the directory appearing between the check and the call.
    os.makedirs(dirname, exist_ok=True)
    print("created {0}".format(dirname))
def _download_ftp_file(ftp_handle, name, dest, overwrite):
    """
    Download a single file from an ftp server.

    :param ftp_handle: an authenticated ftplib.FTP instance
    :param name: path of the file on the server (sent in a ``RETR`` command)
    :param dest: local path to write to; leading "/" characters are dropped so the
        file always lands relative to the current working directory
    :param overwrite: when truthy, download even if the local file already exists
    """
    # Use the same (relative) path for creating the parent directory, for the
    # existence check and for writing, so they can never disagree.
    local_path = dest.lstrip("/")
    _make_parent_dir(local_path)

    if os.path.exists(local_path) and not overwrite:
        print("already exists: {0}".format(local_path))
        return

    try:
        with open(local_path, 'wb') as f:
            ftp_handle.retrbinary("RETR {0}".format(name), f.write)
    except FileNotFoundError:
        print("FAILED: {0}".format(local_path))
    except BaseException:
        # The transfer was interrupted (server error, lost connection, Ctrl-C).
        # Remove the partial file, otherwise the next run would report it as
        # "already exists" and never fetch the complete content.
        try:
            os.remove(local_path)
        except OSError:
            pass
        raise
    else:
        print("downloaded: {0}".format(local_path))
def _file_name_match_patern(pattern, name):
    """ True when no pattern is given, or when the pattern matches at the start of name """
    if pattern is not None:
        # re.match anchors at the beginning of the string only
        return re.match(pattern, name) is not None
    return True
def _mirror_ftp_dir(ftp_handle, name, overwrite, guess_by_extension, pattern):
    """
    Replicate a directory on an ftp server recursively.

    :param ftp_handle: an authenticated ftplib.FTP instance
    :param name: remote directory to list; each listed item is also used as the
        local (relative) destination path
    :param overwrite: forwarded to the file download step
    :param guess_by_extension: forwarded to the directory test
    :param pattern: regex that a file's listed path must match to be downloaded,
        or None to download every file
    """
    for item in ftp_handle.nlst(name):
        if _is_ftp_dir(ftp_handle, item, guess_by_extension):
            # A listing has been reported to contain only the directory that was
            # asked for (e.g. names with square brackets); recursing into it
            # again would never terminate.
            if item != name:
                _mirror_ftp_dir(ftp_handle, item, overwrite, guess_by_extension, pattern)
        elif _file_name_match_patern(pattern, item):
            # the pattern is applied to the file itself, not to its directory
            _download_ftp_file(ftp_handle, item, item, overwrite)
        # files that do not match the pattern are quietly skipped
def download_ftp_tree(ftp_handle, path, destination, pattern=None, overwrite=False, guess_by_extension=True):
    """
    Downloads an entire directory tree from an ftp server to the local destination

    :param ftp_handle: an authenticated ftplib.FTP instance
    :param path: the folder on the ftp server to download
    :param destination: the local directory to store the copied folder
    :param pattern: Python regex pattern, only files that match this pattern will be downloaded.
    :param overwrite: set to True to force re-download of all files, even if they appear to exist already
    :param guess_by_extension: It takes a while to explicitly check if every item is a directory or a file.
        if this flag is set to True, it will assume any file ending with a three character extension ".???" is
        a file and not a directory. Set to False if some folders may have a "." in their names -4th position.
    """
    path = path.lstrip("/")
    original_directory = os.getcwd()  # remember working directory before function is executed
    os.chdir(destination)  # change working directory to ftp mirror directory
    try:
        _mirror_ftp_dir(
            ftp_handle,
            path,
            pattern=pattern,
            overwrite=overwrite,
            guess_by_extension=guess_by_extension)
    finally:
        # reset working directory to what it was before function exec,
        # even when the mirroring fails part-way through
        os.chdir(original_directory)
if __name__ == "__main__":
    # Example usage mirroring all jpg files in an FTP directory tree.
    mysite = "some_ftp_site"
    username = "anonymous"
    password = None
    remote_dir = ""
    # download_ftp_tree changes into this directory; an empty string is not a
    # valid target for os.chdir, so default to the current directory.
    local_dir = "."
    # raw string: "\." is a regex escape, not a Python string escape
    pattern = r".*\.jpg$"

    # the context manager closes the connection even if the download fails
    with ftplib.FTP(mysite, username, password) as ftp:
        download_ftp_tree(ftp, remote_dir, local_dir, pattern=pattern, overwrite=False, guess_by_extension=True)
@PhaniChandan, Have you tried adding ftp.set_debuglevel(1) in the main method ? Maybe there will be more information. And then if everything is still ok, try changing pattern to None.
@Jwely, thanks for this code! I am stuck in a situation where I cannot use wget. I am using your solution, but I seem to have hit an unhandled edge case that causes a silent failure: any file or folder with square brackets ( [ ] ) in its name causes a recursion loop, because ftp_handle.nlst(name)
returns only the targeted directory itself on the FTP server, and not the full listing of its contents.
it works fine for other cases.
I believe that bracket handling needs to be done, as I have tested this with multiple files and directories. However, changing the names of the files and directories on the FTP server is not an option.
As i could not edit my comment for some reason:
I found a workaround for my issue!
While this has not been proven to work with all edge cases (Japanese characters still do not work so far!), I just build the nlst()
output from mlsd() and call that helper instead of
`ftp_handle.nlst()` in line 84. This now has no problem with directories with square brackets in them.
def nlstSafe(self, directory):
    """
    Build an nlst()-style list of paths for ``directory`` from an MLSD listing.

    NOTE(review): ``self`` is not used and ``ftp_handle`` is taken from the
    enclosing scope, exactly as in the snippet as posted - confirm how this is
    meant to be wired in before relying on it.

    :param directory: remote directory to list
    :return: list of ``directory``-prefixed paths, one per listed entry
    """
    # Remote FTP paths are "/"-separated regardless of the local operating
    # system, so join with posixpath rather than os.path.
    import posixpath

    return [
        posixpath.join(directory, entry_name)
        for entry_name, _facts in ftp_handle.mlsd(directory)
        # MLSD listings may include entries for the directory itself and its
        # parent; following those would loop forever.
        if entry_name not in (".", "..")
    ]
@Jwely, Thanks for this code!
I have a question about the _mirror_ftp_dir function. Shouldn't the second argument of the _file_name_match_patern function be "item" instead of "name"? Since "name" refers to the directory path, _file_name_match_patern(pattern, name) would be applying the regular expression to the directory path. I think "item" would be appropriate if you want to apply the regular expression to the file path.
Hi @Jwely . Thanks for the code. Need some help. My local_dir = "D:/Projects/Temporary files" , mysite = 'ftp.dlptest.com' , username = 'dlpuser' , password = 'rNrKYTX9g7z3RgJRmxWuGHbeu' , remote_dir = '/' , and pattern = "..upt" or "..txt".
The program is getting executed without any errors but I am unable to find any of the files in the local directory.
Could you please check and let me know what is the issue or where am I going wrong.