-
-
Save Jwely/ad8eb800bacef9e34dd775f9b3aad987 to your computer and use it in GitHub Desktop.
import ftplib | |
import os | |
import re | |
""" | |
MIT license: 2017 - Jwely | |
Example usage: | |
``` python | |
import ftplib | |
ftp = ftplib.FTP(mysite, username, password) | |
download_ftp_tree(ftp, remote_dir, local_dir) | |
``` | |
The code above will look for a directory called "remote_dir" on the ftp host, and then duplicate the | |
directory and its entire contents into the "local_dir". | |
*** Note that if wget is an option, I recommend using that instead *** | |
""" | |
def _is_ftp_dir(ftp_handle, name, guess_by_extension=True): | |
""" simply determines if an item listed on the ftp server is a valid directory or not """ | |
# if the name has a "." in the fourth to last position, its probably a file extension | |
# this is MUCH faster than trying to set every file to a working directory, and will work 99% of time. | |
if guess_by_extension is True: | |
if len(name) >= 4: | |
if name[-4] == '.': | |
return False | |
original_cwd = ftp_handle.pwd() # remember the current working directory | |
try: | |
ftp_handle.cwd(name) # try to set directory to new name | |
ftp_handle.cwd(original_cwd) # set it back to what it was | |
return True | |
except ftplib.error_perm as e: | |
print(e) | |
return False | |
except Exception as e: | |
print(e) | |
return False | |
def _make_parent_dir(fpath): | |
""" ensures the parent directory of a filepath exists """ | |
dirname = os.path.dirname(fpath) | |
while not os.path.exists(dirname): | |
try: | |
os.makedirs(dirname) | |
print("created {0}".format(dirname)) | |
except OSError as e: | |
print(e) | |
_make_parent_dir(dirname) | |
def _download_ftp_file(ftp_handle, name, dest, overwrite): | |
""" downloads a single file from an ftp server """ | |
_make_parent_dir(dest.lstrip("/")) | |
if not os.path.exists(dest) or overwrite is True: | |
try: | |
with open(dest, 'wb') as f: | |
ftp_handle.retrbinary("RETR {0}".format(name), f.write) | |
print("downloaded: {0}".format(dest)) | |
except FileNotFoundError: | |
print("FAILED: {0}".format(dest)) | |
else: | |
print("already exists: {0}".format(dest)) | |
def _file_name_match_patern(pattern, name): | |
""" returns True if filename matches the pattern""" | |
if pattern is None: | |
return True | |
else: | |
return bool(re.match(pattern, name)) | |
def _mirror_ftp_dir(ftp_handle, name, overwrite, guess_by_extension, pattern):
    """
    Replicates a directory on an ftp server recursively into the current working directory.

    :param ftp_handle: an authenticated ftplib.FTP instance
    :param name: remote directory to replicate
    :param overwrite: passed on to the file download; forces re-download of existing files
    :param guess_by_extension: passed on to the directory test, see ``_is_ftp_dir``
    :param pattern: Python regex pattern; only files whose listed path matches are downloaded,
        None downloads everything
    """
    for item in ftp_handle.nlst(name):
        # Some servers list the "." and ".." pseudo entries. They are real directories, so
        # following them would bounce between a directory and its parent forever.
        if item.rstrip("/").rsplit("/", 1)[-1] in (".", ".."):
            continue

        if _is_ftp_dir(ftp_handle, item, guess_by_extension):
            # A listing that merely echoes the directory it was asked about would recurse
            # into itself without ever making progress.
            if item.rstrip("/") != name.rstrip("/"):
                _mirror_ftp_dir(ftp_handle, item, overwrite, guess_by_extension, pattern)
        elif _file_name_match_patern(pattern, item):
            # the filter applies to the file being considered, not to its parent directory
            _download_ftp_file(ftp_handle, item, item, overwrite)
        # anything else is a file that does not match the pattern: quietly skip it
def download_ftp_tree(ftp_handle, path, destination, pattern=None, overwrite=False, guess_by_extension=True):
    """
    Downloads an entire directory tree from an ftp server to the local destination

    :param ftp_handle: an authenticated ftplib.FTP instance
    :param path: the folder on the ftp server to download
    :param destination: the local directory to store the copied folder. It is created if it does
        not exist yet; an empty string means the current working directory.
    :param pattern: Python regex pattern, only files that match this pattern will be downloaded.
    :param overwrite: set to True to force re-download of all files, even if they appear to exist already
    :param guess_by_extension: It takes a while to explicitly check if every item is a directory or a file.
        if this flag is set to True, it will assume any file ending with a three character extension ".???" is
        a file and not a directory. Set to False if some folders may have a "." in their names -4th position.
    """
    path = path.lstrip("/")
    original_directory = os.getcwd()  # remember working directory before function is executed

    if destination:
        os.makedirs(destination, exist_ok=True)
        os.chdir(destination)  # change working directory to ftp mirror directory

    try:
        _mirror_ftp_dir(
            ftp_handle,
            path,
            pattern=pattern,
            overwrite=overwrite,
            guess_by_extension=guess_by_extension)
    finally:
        # reset the working directory even when the mirror fails part way through, so the
        # caller's process is not left stranded inside the destination folder
        os.chdir(original_directory)
if __name__ == "__main__":
    # Example usage mirroring all jpg files in an FTP directory tree.
    mysite = "some_ftp_site"
    username = "anonymous"
    password = None
    remote_dir = ""
    local_dir = "."  # mirror into the current working directory
    pattern = r".*\.jpg$"  # raw string so that "\." reaches the regex engine untouched

    # the context manager closes the connection once the mirror finishes or fails
    with ftplib.FTP(mysite, username, password) as ftp:
        download_ftp_tree(ftp, remote_dir, local_dir, pattern=pattern, overwrite=False, guess_by_extension=True)
Hello All,
First, thanks a lot for the code. i tried many others but yours is really explicit.
Like @antronica, I find it works for one of my FTP servers (managed by FileZilla Server on Windows). However, my application is to retrieve pictures from the FTP server built into an Axis camera, and with the same code it fails with a recursion error. After putting many print statements in the code, I found that it does not remember which folder it came from. Here is an example:
On the FTP where it works, I put a print in the "_mirror_ftp_dir" subroutine to know if it detected a folder or a file. Once it can only find files, the "item" value contains the whole path (i.e.: item=2018/Week40/Picture_181001_150958.jpg).
On the FTP where it does NOT work, I can only see the name of the file without the parent folders (i.e.: item=Picture_181001_150958.jpg).
Then I got the same error as @antronica: "[WinError 3] The system cannot find the path specified", and I can understand it, because the file does not exist in the original root.
I hope I'm clear enough. Would you have any idea to solve my problem?
Sincerely
Antoine
Hello,
I'm a complete beginner with Python and at a loss. I want to download the folder tree but not all the files within them. Just one particular kind of files; with given extension. What changes would I have to make to the code in order to do this?
Thanks in advance.
@necromeo Hopefully this isn't way too late of a response, but this was a relatively small feature addition, so I added it in. You can use a regex string with the pattern keyword argument now. The example now looks for files that end with .jpg
.
I should note that this script is useful as-is only in environments where wget
isn't an option, or you wish to alter the code to include more complex custom handling.
@aamosse, I do not understand your problem, hopefully you've found a way to resolve it by now, otherwise sharing the entire output might help.
Hi, great script! I learned a lot and tinkered with it quite a bit. What I cannot resolve is that it loops when the source directory is empty. The source machine is Linux and the destination is a Windows server. It runs smoothly until it encounters the first empty directory. Then it hops back and forth between the directory check for the path of the empty directory and that path plus "..".
Hello All,
First, thanks a lot for the code. i tried many others but yours is really explicit.
Like @antronica, I find it works for one of my FTP servers (managed by FileZilla Server on Windows). However, my application is to retrieve pictures from the FTP server built into an Axis camera, and with the same code it fails with a recursion error. After putting many print statements in the code, I found that it does not remember which folder it came from. Here is an example:
On the FTP where it works, I put a print in the "_mirror_ftp_dir" subroutine to know if it detected a folder or a file. Once it can only find files, the "item" value contains the whole path (i.e.: item=2018/Week40/Picture_181001_150958.jpg).
On the FTP where it does NOT work, I can only see the name of the file without the parent folders (i.e.: item=Picture_181001_150958.jpg).
Then I got the same error as @antronica: "[WinError 3] The system cannot find the path specified", and I can understand it, because the file does not exist in the original root.
I hope I'm clear enough. Would you have any idea how to solve my problem? Sincerely
Antoine
I am facing the same issue
AMAZINGLY BEAUTIFUL CODE.
is it possible to download the whole directory as a zip file?
what modifications would be needed in this code?
please post it, it'll be of TREMENDOUS help!
Thank you!
If a directory name contains more than one dot, the extension check needs to be replaced with:
if guess_by_extension is True:
if len(name) >= 4:
if name[-4] == '.':
try:
ftp_handle.cwd(name[-4])
except:
return False
OSError: [Errno 22]
Unknown Character error probably
My file has a few Japanese characters in the name, that might be the reason
How about replacing the unknown characters with something else or "Unknown Characters"
OSError: [Errno 22] Invalid argument: 'Ny/Test/[200807][ã\x81°ã\x81«ã\x81\x83ã\x81\x86ã\x81\x89ï½\x9eã\x81\x8bï½\x9e]OVAã\x82\x88ã\x81\x86ã\x81\x93ã\x81\x9dï¼\x81ã\x82¹ã\x82±ã\x83\x99ã\x82¨ã\x83«ã\x83\x95ã\x81®æ£®ã\x81¸ ï¼\x834 ã\x82¨ã\x83«ã\x83\x95ã\x82\x82ã\x83\x80ã\x83¼ã\x82¯ã\x82¨ã\x83«ã\x83\x95ã\x82\x82仲è\x89¯ã\x81\x8få\xad\x90ä½\x9cã\x82\x8aï¼\x81 æ\x95\x91ä¸\x96主æ§\x98ã\x81¨ã\x80\x8eã\x83\x8fã\x83¼ã\x83¬ã\x83\xa0ç\x94\x9fæ´»ã\x80\x8f(No Watermark).mp4'
Your code looks great! However, I don't know why it seems not to work with the Windows server in my case. The console shows nothing when I run the program though all credentials are correct.
@heroform The error catch on line 55 is a lazy assumption that the OS error is a specific expected case. One possible explanation of that behavior is that you're getting a different OS error and it's stuck in a loop, never resolving it. I've added a print statement there to help you diagnose.
Hi Jwely, the log informs that "425 Failed to establish connection". This might be the connection issues, right?
And I also meet the other issue also "Note1-error_perm: 550 Failed to change directory."
I saw the program on local create files like on servers, but those all cannot be opened.
Thanks for the code ... 👍
I just got an error when got the path with /..
from ftp_handle.nlst(name), it was returning me to the parent folder and the script was trying to download what is already downloaded.
I fixed it quickly by this
def _mirror_ftp_dir(ftp_handle, name, overwrite, guess_by_extension, pattern):
""" replicates a directory on an ftp server recursively """
nlst = ftp_handle.nlst(name)
for item in nlst:
if (name + "/.") == item or (name + "/..") == item:
continue
if _is_ftp_dir(ftp_handle, item, guess_by_extension):
_mirror_ftp_dir(ftp_handle, item, overwrite, guess_by_extension, pattern)
else:
if _file_name_match_patern(pattern, name):
_download_ftp_file(ftp_handle, item, item, overwrite)
else:
# quietly skip the file
pass
Also when I tried to add as a local_dir "." or "goal_folder" or "./goal_folder", the script kept the server parent path as well,
so I got when remote_dir was set to "/ssl/wp-content/uploads" content in "./goal_folder/ssl/wp-content/uploads", but I wanted to have it all from uploads in the "./goal_folder" instead.
@domino2 I got the same thing. It kept recursing those dots back and forth :-D nice script anyways!
:) ... I tried to look up to a bit different solution. I think I used rsync or rclone cli app to sync the ftp target. But also, I have got couple issues. But I think that world is beautiful in any case :)
aah, i see. rsync looks quite interesting! I spent some time and done similar script for backing up my ftp server. It is based on this script, but uses ftputil library, so i could avoid file/folder guessing/checking and rely more on library to do things. I put it on github, so future people can save some time, if they want to do similar thing.
Hi @Jwely . Thanks for the code. Need some help. My local_dir = "D:/Projects/Temporary files" , mysite = 'ftp.dlptest.com' , username = 'dlpuser' , password = 'rNrKYTX9g7z3RgJRmxWuGHbeu' , remote_dir = '/' , and pattern = "..upt" or "..txt".
The program is getting executed without any errors but I am unable to find any of the files in the local directory.
Could you please check and let me know what is the issue or where am I going wrong.
@PhaniChandan, Have you tried adding ftp.set_debuglevel(1) in the main method ? Maybe there will be more information. And then if everything is still ok, try changing pattern to None.
@Jwely, thanks for this code! I am stuck in a situation where I cannot use wget. I am using your solution, but seem to have hit an unhandled edge case that causes a silent failure: any file or folder with square brackets ( [ ] ) in its name causes a recursion loop, because ftp_handle.nlst(name)
then contains only the targeted directory on the FTP server, and not the full listing that nlst() normally returns.
it works fine for other cases.
I believe that bracket handling needs to be done, as I have tested this with multiple files and directories. However, changing the names of the files and directories on the FTP server is not an option.
As i could not edit my comment for some reason:
I found a workaround for my issue!
While this has not been proven to work with all edge cases (Japanese characters still do not work so far!), I just build the `nlst()`
output from `mlsd()` and call that helper instead of
`ftp_handle.nlst()` in line 84. This now has no problem with directories that have square brackets in their names.
def nlstSafe(self, directory):
out = []
for item in ftp_handle.mlsd(directory):
out.append(os.path.join(directory, item[0]))
return out
@Jwely, Thanks for this code!
I have a question about the _mirror_ftp_dir function. Shouldn't the second argument of the _file_name_match_patern function be "item" instead of "name"? Since "name" refers to the directory path, _file_name_match_patern(pattern, name) would be applying the regular expression to the directory path. I think "item" would be appropriate if you want to apply the regular expression to the file path.
great gist and I'm using it successfully, thanks!
Can I ask why the try and except clause is used on the _make_parent_dir function?
My understanding is that it is there to work around recursive creation errors from an earlier mkdir() version, since any other error will just keep occurring as you keep calling the function? But makedirs() creates the directories recursively anyway, so I'm not sure the try is required; the version below works for me.
Also may be worth adding
as the exception in _is_ftp_dir function?