Extending Python's os.walk function on FTP server - python

How can I make os.walk traverse the directory tree of an FTP database (located on a remote server)? The way the code is structured now is (comments provided):
import fnmatch, os, ftplib
def find(pattern, startdir=os.curdir): #find function taking variables for both desired file and the starting directory
for (thisDir, subsHere, filesHere) in os.walk(startdir): #each of the variables change as the directory tree is walked
for name in subsHere + filesHere: #going through all of the files and subdirectories
if fnmatch.fnmatch(name, pattern): #if the name of one of the files or subs is the same as the inputted name
fullpath = os.path.join(thisDir, name) #fullpath equals the concatenation of the directory and the name
yield fullpath #return fullpath but anew each time
def findlist(pattern, startdir = os.curdir, dosort=False):
matches = list(find(pattern, startdir)) #find with arguments pattern and startdir put into a list data structure
if dosort: matches.sort() #isn't dosort automatically False? Is this statement any different from the same thing but with a line in between
return matches
#def ftp(
#specifying where to search.
if __name__ == '__main__':
import sys
namepattern, startdir = sys.argv[1], sys.argv[2]
for name in find(namepattern, startdir): print (name)
I am thinking that I need to define a new function (i.e., def ftp()) to add this functionality to the code above. However, I am afraid that the os.walk function will, by default, only walk the directory trees of the computer that the code is run from.
Is there a way that I can extend the functionality of os.walk to be able to traverse a remote directory tree (via FTP)?

All you need is utilizing the python's ftplib module. Since os.walk() is based on a Breadth-first search algorithm you need to find the directories and file names at each iteration, then continue the traversing recursively from the first directory. I implemented this algorithm about 2 years ago for using as the heart of FTPwalker, which is an optimum package for traversing extremely large directory trees Through FTP.
from os import path as ospath
class FTPWalk:
"""
This class is contain corresponding functions for traversing the FTP
servers using BFS algorithm.
"""
def __init__(self, connection):
self.connection = connection
def listdir(self, _path):
"""
return files and directory names within a path (directory)
"""
file_list, dirs, nondirs = [], [], []
try:
self.connection.cwd(_path)
except Exception as exp:
print ("the current path is : ", self.connection.pwd(), exp.__str__(),_path)
return [], []
else:
self.connection.retrlines('LIST', lambda x: file_list.append(x.split()))
for info in file_list:
ls_type, name = info[0], info[-1]
if ls_type.startswith('d'):
dirs.append(name)
else:
nondirs.append(name)
return dirs, nondirs
def walk(self, path='/'):
"""
Walk through FTP server's directory tree, based on a BFS algorithm.
"""
dirs, nondirs = self.listdir(path)
yield path, dirs, nondirs
for name in dirs:
path = ospath.join(path, name)
yield from self.walk(path)
# In python2 use:
# for path, dirs, nondirs in self.walk(path):
# yield path, dirs, nondirs
self.connection.cwd('..')
path = ospath.dirname(path)
Now for using this class, you can simply create a connection object using ftplib module and pass the the object to FTPWalk object and just loop over the walk() function:
In [2]: from test import FTPWalk
In [3]: import ftplib
In [4]: connection = ftplib.FTP("ftp.uniprot.org")
In [5]: connection.login()
Out[5]: '230 Login successful.'
In [6]: ftpwalk = FTPWalk(connection)
In [7]: for i in ftpwalk.walk():
print(i)
...:
('/', ['pub'], [])
('/pub', ['databases'], ['robots.txt'])
('/pub/databases', ['uniprot'], [])
('/pub/databases/uniprot', ['current_release', 'previous_releases'], ['LICENSE', 'current_release/README', 'current_release/knowledgebase/complete', 'previous_releases/', 'current_release/relnotes.txt', 'current_release/uniref'])
('/pub/databases/uniprot/current_release', ['decoy', 'knowledgebase', 'rdf', 'uniparc', 'uniref'], ['README', 'RELEASE.metalink', 'changes.html', 'news.html', 'relnotes.txt'])
...
...
...

I needed a function like os.walk on FTP and there where not any so i thought it would be useful to write it , for future references you can find last version here
by the way here is the code that would do that :
def FTP_Walker(FTPpath,localpath):
os.chdir(localpath)
current_loc = os.getcwd()
for item in ftp.nlst(FTPpath):
if not is_file(item):
yield from FTP_Walker(item,current_loc)
elif is_file(item):
yield(item)
current_loc = localpath
else:
print('this is a item that i could not process')
os.chdir(localpath)
return
def is_file(filename):
current = ftp.pwd()
try:
ftp.cwd(filename)
except Exception as e :
ftp.cwd(current)
return True
ftp.cwd(current)
return False
how to use:
first connect to your host :
host_address = "my host address"
user_name = "my username"
password = "my password"
ftp = FTP(host_address)
ftp.login(user=user_name,passwd=password)
now you can call the function like this:
ftpwalk = FTP_Walker("FTP root path","path to local") # I'm not using path to local yet but in future versions I will improve it. so you can just path an '/' to it
and then to print and download files you can do somthing like this :
for item in ftpwalk:
ftp.retrbinary("RETR "+item, open(os.path.join(current_loc,item.split('/')[-1]),"wb").write) #it is downloading the file
print(item) # it will print the file address
( i will write more features for it soon so if you need some specific things or have any idea that can be useful for users i'll be happy to hear that )

I wrote a library pip install walk-sftp. Event though it is named walk-sftp I included a WalkFTP class that lets you filter by start_date of files & end_date of files. You can even pass in a processing_function that returns True or False to see whether your process to clean & store data works. It also has a log parameter (pass filename) that uses pickle & keeps track of any progress so you don't overwrite or have to keep track of dates making backfilling easier.
https://pypi.org/project/walk-sftp/

Im going to assume this is what you want ... although really I have no idea
ssh = paramiko.SSHClient()
ssh.connect(server, username=username, password=password)
ssh_stdin, ssh_stdout, ssh_stderr = ssh.exec_command("locate my_file.txt")
print ssh_stdout
this will require the remote server to have the mlocate package `sudo apt-get install mlocate;sudo updatedb();

Related

Python - Watch a folder for new .zip file and upload via FTP

I am working on creating a script to watch a folder, grab any new .zip files, and then upload them via FTP to a predetermined area. Right now FTP testing is being performed Locally, since the environment isnt yet created.
The strategy I am taking is to first, unzip into a local folder. Then, perform ftplib.storbinary , on the file from the local folder, to the ftpdestination. However, the unzipping process appears to be working but I am getting a "file does not exist" error, all though I can see it in the folder itself.
Also, is there anyway to unzip directly into an FTP location? I havent been able to find a way hence the approach I am taking.
Thanks, local ftp info removed from code. All paths that are relevant in this code will be changed, most likely to dynamic fashion, but for now this is a local environment
extractZip2.py
import zipfile
import ftplib
import os
import logging
import time
from socket import error as socket_error
#Logging Setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('__name__')
FTPaddress = ''
FTPusername = ''
FTPpassword = ''
ftp_destination_location = ''
path_to_watch = "C:/Users/206420055/Desktop/test2/"
before = dict ([(f,None) for f in os.listdir(path_to_watch)])
temp_destination_location = "C:/Users/206420055/Desktop/temp/"
def unzip(fullPath,temporaryPath):
with zipfile.ZipFile(fullPath, "r") as z :
logger.info("Unzipping {0}".format(fullPath))
z.extractall(temporaryPath)
logger.info("Unzipped into local directory {0}".format(temp_destination_location))
def check_or_create_ftp(session, folder):
"""
Checks to see if necessary folder for currenttab is available.
Creates the folder if not found, and enters it.
"""
if folder not in session.nlst():
logger.info('Directory for {0} does not exist, creating directory\n'.format(folder))
session.mkd(folder)
session.cwd(folder)
def check_or_create(temp_destination):
"""
Checks to see if local savepath exists. Will create savepath if not exists.
"""
if not os.path.exists(temp_destination):
logger.info('Directory for %s does not exist, creating directory\n' % temp_destination)
os.makedirs(str(temp_destination))
def transfer(address,username,password,filename,destination):
logger.info("Creating Session")
try:
session = session_init(address,username,password,destination)
except (socket_error,ftplib.error_perm) as e:
logger.error(str(e))
logger.error("Error in Session Init")
else:
try:
logger.info("Sending File {0}".format(filename))
send_file(session,filename)
except (IOError, OSError, ftplib.error_perm) as e:
logger.error(e)
def session_init(address,username,password,path):
session = ftplib.FTP(address,username,password)
check_or_create_ftp(session,path)
logger.info("Session Established")
return session
def send_file(session,filename):
file = open(filename,'rb')
logger.info('Sending File : STOR '+filename)
session.storbinary('STOR '+ filename, file)
file.close()
def delete_local_files(savepath, file):
logger.info("Cleaning Up Folder {0}".format(savepath))
os.remove(file)
while 1:
time.sleep(5)
after = dict ([(f,None) for f in os.listdir(path_to_watch)])
added = [f for f in after if not f in before]
removed = [f for f in before if not f in after]
if added: print "Added: ",", ".join(added)
before = after
check_or_create(temp_destination_location)
if added :
for file in added:
print file
if file.endswith('.zip'):
unzip(path_to_watch+file, temp_destination_location)
temp_files = os.listdir(temp_destination_location)
print("Temp Files {0}".format(temp_files))
for tf in temp_files:
print("TF {0}".format(tf))
transfer(FTPaddress,FTPusername,FTPpassword,tf,ftp_destination_location)
#delete_local_files(temp_destination_location,tf)
else:
pass
edit: adding error image
Seen above, we see the file in the temp folder. But the console obviously shows the error.
just change it to
from glob import glob
zips_in_path = dict ([(f,None) for f in glob("{base_path}/*.zip".format(base_path = path_to_watch)])
os.listdir does not include the path_to_watch part of the path it is just the filenames, however glob does.
so you could also do
after = dict ([(os.path.join(path_to_watch,f),None) for f in os.listdir(path_to_watch)])
using either of these methods you should be able to get the full path to the files in the path

Matching MD5 Hashes from another script

Ok so i'm trying to create a script that does the following: Searches a directory for known hashes. Here is my first script:
Hash.py
import hashlib
from functools import partial
#call another python script
execfile("knownHashes.py")
def md5sum(filename):
with open(filename, mode='rb') as f:
d = hashlib.md5()
for buf in iter(partial(f.read, 128), b''):
d.update(buf)
return d.hexdigest()
print "Hash of is: "
print(md5sum('photo.jpg'))
if md5List == md5sum:
print "Match"
knownHashes.py
print ("Call worked\n")
md5List = "01071709f67193b295beb7eab6e66646" + "5d41402abc4b2a76b9719d911017c592"
The problem at the moment is that I manually have to type in the file I want to find out the hash of where it says photo.jpg. Also, The I haven't got the md5List to work yet.
I want the script to eventually work like this:
python hash.py <directory>
1 match
cookies.jpg matches hash
So how can I get the script to search a directory rather than manually type in what file to hash? Also, how can I fix the md5List because that is wrong?
You can get a list of files in the current working directory using the following. This is the directory that you run the script from.
import os
#Get list of files in working directory
files_list = os.listdir(os.getcwd())
You can iterate through the list using a for loop:
for file in files_list:
#do something
As equinoxel also mentioned below, you can use os.walk() as well.
Simple little gist should solve most of your problems. Understandable if you don't like using OOP for this problem, but I believe all of the important conceptual pieces are here in a pretty clean, concise representation. Let me know if you have any questions.
class PyGrep:
def __init__(self, directory):
self.directory = directory
def grab_all_files_with_ending(self, file_ending):
"""Will return absolute paths to all files with given file ending in self.directory"""
walk_results = os.walk(self.directory)
file_check = lambda walk: len(walk[2]) > 0
ending_prelim = lambda walk: file_ending in " ".join(walk[2])
relevant_results = (entry for entry in walk_results if file_check(entry) and ending_prelim(entry))
return (self.grab_files_from_os_walk(result, file_ending) for result in relevant_results)
def grab_files_from_os_walk(self, os_walk_tuple, file_ending):
format_check = lambda file_name: file_ending in file_name
directory, subfolders, file_paths = os_walk_tuple
return [os.path.join(directory, file_path) for file_path in file_paths if format_check(file_path)]

How to list all the folders and files in the directory after connecting through SFTP in Python

I am using Python and trying to connect to SFTP and want to retrieve an XML file from there and need to place it in my local system. Below is the code:
import paramiko
sftpURL = 'sftp.somewebsite.com'
sftpUser = 'user_name'
sftpPass = 'password'
ssh = paramiko.SSHClient()
# automatically add keys without requiring human intervention
ssh.set_missing_host_key_policy( paramiko.AutoAddPolicy() )
ssh.connect(sftpURL, username=sftpUser, password=sftpPass)
ftp = ssh.open_sftp()
files = ftp.listdir()
print files
Here connection is success full. And now I want to see all the folders and all the files and need to enter in to required folder for retrieving the XML file from there.
Finally my intention is to view all the folders and files after connecting to SFTP server.
In the above code I had used ftp.listdir() through which I got output as some thing like below
['.bash_logout', '.bash_profile', '.bashrc', '.mozilla', 'testfile_248.xml']
I want to know whether these are the only files present?
And the command I used above is right to view the folders too?
What is the command to view all the folders and files?
The SFTPClient.listdir returns everything, files and folders.
Were there folders, to tell them from the files, use SFTPClient.listdir_attr instead. It returns a collection of SFTPAttributes.
from stat import S_ISDIR, S_ISREG
sftp = ssh.open_sftp()
for entry in sftp.listdir_attr(remotedir):
mode = entry.st_mode
if S_ISDIR(mode):
print(entry.filename + " is folder")
elif S_ISREG(mode):
print(entry.filename + " is file")
The accepted answer by #Oz123 is inefficient. SFTPClient.listdir internally calls SFTPClient.listdir_attr and throws most information away returning file and folder names only. The answer then uselessly and laboriously re-retrieves all that data by calling SFTPClient.lstat for each file.
See also How to fetch sizes of all SFTP files in a directory through Paramiko.
Obligatory warning: Do not use AutoAddPolicy – You are losing a protection against MITM attacks by doing so. For a correct solution, see Paramiko "Unknown Server"
One quick solution is to examine the output of lstat of each object in ftp.listdir().
Here is how you can list all the directories.
>>> for i in ftp.listdir():
... lstatout=str(ftp.lstat(i)).split()[0]
... if 'd' in lstatout: print i, 'is a directory'
...
Files are the opposite search:
>>> for i in ftp.listdir():
... lstatout=str(ftp.lstat(i)).split()[0]
... if 'd' not in lstatout: print i, 'is a file'
...
Here is a solution I have come up with. Based on https://stackoverflow.com/a/59109706 . My solution gives a pretty output.
Update I have modified it slightly to incorporate Martin's suggestions. Now my code is considerably fast compared to my initial version using isdir and listdir
# prefix components:
space = ' '
branch = '│ '
# pointers:
tee = '├── '
last = '└── '
def stringpath(path):
# just a helper to get string of PosixPath
return str(path)
from pathlib import Path
from stat import S_ISDIR
def tree_sftp(sftp, path='.', parent='/', prefix=''):
"""
Loop through files to print it out
for file in tree_sftp(sftp):
print(file)
"""
fullpath = Path(parent, path)
strpath = stringpath(fullpath)
dirs = sftp.listdir_attr(strpath)
pointers = [tee] * (len(dirs) - 1) + [last]
pdirs = [Path(fullpath, d.filename) for d in dirs]
sdirs = [stringpath(path) for path in pdirs]
for pointer, sd, d in zip(pointers, sdirs, dirs):
yield prefix + pointer + d.filename
if S_ISDIR(d.st_mode):
extension = branch if pointer == tee else space
yield from tree_sftp(sftp, sd, prefix=prefix + extension)
You can try it out like this using pysftp
import pysftp
with pysftp.Connection(HOSTNAME, USERNAME, PASSWORD) as sftp:
for file in tree_sftp(sftp):
print(file)
Let me know if if works for you.

Create missing directories in ftplib storbinary

I was using pycurl to transfer files over ftp in python. I could create the missing directories automatically on my remote server using:
c.setopt(pycurl.FTP_CREATE_MISSING_DIRS, 1)
for some reasons, I have to switch to ftplib. But I don't know how to to the same here. Is there any option to add to storbinary function to do that? or I have to create the directories manually?
FTP_CREATE_MISSING_DIRS is a curl operation (added here). I'd hazard a guess that you have to do it manually with ftplib, but I'd love to be proven wrong, anyone?
I'd do something like the following: (untested, and need to catch ftplib.all_errors)
ftp = ... # Create connection
# Change directories - create if it doesn't exist
def chdir(dir):
if directory_exists(dir) is False: # (or negate, whatever you prefer for readability)
ftp.mkd(dir)
ftp.cwd(dir)
# Check if directory exists (in current location)
def directory_exists(dir):
filelist = []
ftp.retrlines('LIST',filelist.append)
for f in filelist:
if f.split()[-1] == dir and f.upper().startswith('D'):
return True
return False
Or you could do directory_exists like this: (a bit harder to read?)
# Check if directory exists (in current location)
def directory_exists(dir):
filelist = []
ftp.retrlines('LIST',filelist.append)
return any(f.split()[-1] == dir and f.upper().startswith('D') for f in filelist)
I know it's kind of an old post but I just needed this and came up with a very simple function. I'm new to Python so I'd appreciate any feedback.
from ftplib import FTP
ftp = FTP('domain.com', 'username', 'password')
def cdTree(currentDir):
if currentDir != "":
try:
ftp.cwd(currentDir)
except IOError:
cdTree("/".join(currentDir.split("/")[:-1]))
ftp.mkd(currentDir)
ftp.cwd(currentDir)
Usage example:
cdTree("/this/is/an/example")
I tried adding this as a comment to the #Alex L 's answer, but it was too long. You need to descend recursively when changing directory if you want to create directories on the way. E.g.
def chdir(ftp, directory):
ch_dir_rec(ftp,directory.split('/'))
# Check if directory exists (in current location)
def directory_exists(ftp, directory):
filelist = []
ftp.retrlines('LIST',filelist.append)
for f in filelist:
if f.split()[-1] == directory and f.upper().startswith('D'):
return True
return False
def ch_dir_rec(ftp, descending_path_split):
if len(descending_path_split) == 0:
return
next_level_directory = descending_path_split.pop(0)
if not directory_exists(ftp,next_level_directory):
ftp.mkd(next_level_directory)
ftp.cwd(next_level_directory)
ch_dir_rec(ftp,descending_path_split)
I am using the following lines to resolve missing directory paths for FTP file copy
import os
ftps = FTP_TLS('ftps_server')
ftps.connect()
ftps.login()
destination_dir_path = 'some/dir/path' # directory path on FTP
dir_path = ''
for i in destination_dir_path.split('/'):
dir_path = os.path.join(dir_path,i)
if i not in ftps.nlst(os.path.dirname(dir_path)):
ftps.mkd(dir_path) # create directory on the FTP
ftps.storbinary(...) # store file using the binary mode
An alternative is to simply loop through each of the path elements, create the next and change into the newly-created directory. My use case was fairly straightforward though as I was copying items from one FTP server to another.
def create_ftp_path(session: ftplib.FTP, required_dir: str):
required_dir = required_dir.split('/')[:-1]
for path_item in required_dir:
if path_item.strip() == '':
continue
path_item = path_item.replace('/', '')
try:
session.cwd(path_item)
except:
session.mkd(path_item)
session.cwd(path_item)
Considerations:
This function assumes you have already changed directory for your FTP session to some base path and the required_dir is a path from that base path.
required_dir includes file name as the last element.
I'm removing any / characters because in my case they were causing 553 permission denied exception.
The exception handling is lacking, but in my case upload validation is happening further in the code so even if it fails it will be caught further down.
A more robust and reliable solution:
using ftplib
Hostname = yourhostname.com
Username = yourusername
Password = yourpassword
def mkdirs(path):
ftp = FTP(Hostname,Username,Password)
items = path.split('/')
cwd = "/"
for i in range(len(items)):
list = ftp.nlst()
if(not '.' in items[i] and not items[i] in list):
ftp.mkd(cwd + items[i] + "/")
cwd += items[i] + '/'
ftp.cwd(cwd)
ftp.quit()
mkdirs('path/to/directory/file.name')
This will create directories on your server if they do not exist.
Limitations: This will not work on folders with names that contain ..
This code will create all missing folders in path:
...
def chdir(ftp_path, ftp_conn):
dirs = [d for d in ftp_path.split('/') if d != '']
for p in dirs:
print(p)
check_dir(p, ftp_conn)
def check_dir(dir, ftp_conn):
filelist = []
ftp_conn.retrlines('LIST', filelist.append)
found = False
for f in filelist:
if f.split()[-1] == dir and f.lower().startswith('d'):
found = True
if not found:
ftp_conn.mkd(dir)
ftp_conn.cwd(dir)
if __name__ == '__main__':
ftp_conn = ... # ftp connection
t = 'FTP/for_Vadim/1/2/3/'
chdir(t, ftp_conn)
This code will check all dirs in path and create missing dirs
before "FTP/for_Vadim/"
after "FTP/for_Vadim/1/2/3/"
I'm using something like this (without cwd):
# -*- coding:utf-8 -*-
from ftplib import FTP, error_perm
def createDirs(ftp, dirpath):
"""
Create dir with subdirs.
:param ftp: connected FTP
:param dirpath: path (like 'test/test1/test2')
:type ftp: FTP
:type dirpath: str
:rtype: None
"""
dirpath = dirpath.replace('\\', '/')
tmp = dirpath.split('/')
dirs = []
for _ in tmp:
if len(dirs) == 0:
dirs.append(_)
continue
dirs.append(dirs[-1] + '/' + _)
for _ in dirs:
try:
ftp.mkd(_)
except error_perm as e:
e_str = str(e)
if '550' in e_str and 'File exists' in e_str:
continue
if __name__ == '__main__':
# init ftp
createDirs(ftp=ftp, dirpath='test/1/2/3')

Downloading a directory tree with ftplib

This will not download the contents of sub-directories; how can I do so?
import ftplib
import configparser
import os
directories = []
def add_directory(line):
if line.startswith('d'):
bits = line.split()
dirname = bits[8]
directories.append(dirname)
def makeDir(archiveTo):
for dir in directories:
newDir = os.path.join(archiveTo, dir)
if os.path.isdir(newDir) == True:
print("Directory \"" + dir + "\" already exists!")
else:
os.mkdir(newDir)
def getFiles(archiveTo, ftp):
files = ftp.nlst()
for filename in files:
try:
directories.index(filename)
except:
ftp.retrbinary('RETR %s' % filename, open(os.path.join(archiveTo, filename), 'wb').write)
def runBackups():
#Load INI
filename = 'connections.ini'
config = configparser.SafeConfigParser()
config.read(filename)
connections = config.sections()
i = 0
while i < len(connections):
#Load Settings
uri = config.get(connections[i], "uri")
username = config.get(connections[i], "username")
password = config.get(connections[i], "password")
backupPath = config.get(connections[i], "backuppath")
archiveTo = config.get(connections[i], "archiveto")
#Start Back-ups
ftp = ftplib.FTP(uri)
ftp.login(username, password)
ftp.cwd(backupPath)
#Map Directory Tree
ftp.retrlines('LIST', add_directory)
#Make Directories Locally
makeDir(archiveTo)
#Gather Files
getFiles(archiveTo, ftp)
#End connection and increase counter.
ftp.quit()
i += 1
print()
print("Back-ups complete.")
print()
this should do the trick :)
import sys
import ftplib
import os
from ftplib import FTP
ftp=FTP("ftp address")
ftp.login("user","password")
def downloadFiles(path,destination):
#path & destination are str of the form "/dir/folder/something/"
#path should be the abs path to the root FOLDER of the file tree to download
try:
ftp.cwd(path)
#clone path to destination
os.chdir(destination)
os.mkdir(destination[0:len(destination)-1]+path)
print destination[0:len(destination)-1]+path+" built"
except OSError:
#folder already exists at destination
pass
except ftplib.error_perm:
#invalid entry (ensure input form: "/dir/folder/something/")
print "error: could not change to "+path
sys.exit("ending session")
#list children:
filelist=ftp.nlst()
for file in filelist:
try:
#this will check if file is folder:
ftp.cwd(path+file+"/")
#if so, explore it:
downloadFiles(path+file+"/",destination)
except ftplib.error_perm:
#not a folder with accessible content
#download & return
os.chdir(destination[0:len(destination)-1]+path)
#possibly need a permission exception catch:
with open(os.path.join(destination,file),"wb") as f:
ftp.retrbinary("RETR "+file, f.write)
print file + " downloaded"
return
source="/ftproot/folder_i_want/"
dest="/systemroot/where_i_want_it/"
downloadFiles(source,dest)
This is a very old question, but I had a similar need that i wanted to satisfy in a very general manner. I ended up writing my own solution that works very well for me. I've placed it on Gist here https://gist.github.com/Jwely/ad8eb800bacef9e34dd775f9b3aad987
and pasted it below in case i ever take the gist offline.
Example usage:
import ftplib
ftp = ftplib.FTP(mysite, username, password)
download_ftp_tree(ftp, remote_dir, local_dir)
The code above will look for a directory called "remote_dir" on the ftp host, and then duplicate the directory and its entire contents into the "local_dir".
It invokes the script below.
import ftplib
import os
def _is_ftp_dir(ftp_handle, name, guess_by_extension=True):
""" simply determines if an item listed on the ftp server is a valid directory or not """
# if the name has a "." in the fourth to last position, its probably a file extension
# this is MUCH faster than trying to set every file to a working directory, and will work 99% of time.
if guess_by_extension is True:
if name[-4] == '.':
return False
original_cwd = ftp_handle.pwd() # remember the current working directory
try:
ftp_handle.cwd(name) # try to set directory to new name
ftp_handle.cwd(original_cwd) # set it back to what it was
return True
except:
return False
def _make_parent_dir(fpath):
""" ensures the parent directory of a filepath exists """
dirname = os.path.dirname(fpath)
while not os.path.exists(dirname):
try:
os.mkdir(dirname)
print("created {0}".format(dirname))
except:
_make_parent_dir(dirname)
def _download_ftp_file(ftp_handle, name, dest, overwrite):
""" downloads a single file from an ftp server """
_make_parent_dir(dest)
if not os.path.exists(dest) or overwrite is True:
with open(dest, 'wb') as f:
ftp_handle.retrbinary("RETR {0}".format(name), f.write)
print("downloaded: {0}".format(dest))
else:
print("already exists: {0}".format(dest))
def _mirror_ftp_dir(ftp_handle, name, overwrite, guess_by_extension):
""" replicates a directory on an ftp server recursively """
for item in ftp_handle.nlst(name):
if _is_ftp_dir(ftp_handle, item):
_mirror_ftp_dir(ftp_handle, item, overwrite, guess_by_extension)
else:
_download_ftp_file(ftp_handle, item, item, overwrite)
def download_ftp_tree(ftp_handle, path, destination, overwrite=False, guess_by_extension=True):
"""
Downloads an entire directory tree from an ftp server to the local destination
:param ftp_handle: an authenticated ftplib.FTP instance
:param path: the folder on the ftp server to download
:param destination: the local directory to store the copied folder
:param overwrite: set to True to force re-download of all files, even if they appear to exist already
:param guess_by_extension: It takes a while to explicitly check if every item is a directory or a file.
if this flag is set to True, it will assume any file ending with a three character extension ".???" is
a file and not a directory. Set to False if some folders may have a "." in their names -4th position.
"""
os.chdir(destination)
_mirror_ftp_dir(ftp_handle, path, overwrite, guess_by_extension)
this is an alternative. you can try using ftputil package. You can then use it to walk the remote directories and get your files
Using ftp.mlsd() instead of ftp.nlst():
import sys
import ftplib
import os
from ftplib import FTP
def fetchFiles(ftp, path, destination, overwrite=True):
'''Fetch a whole folder from ftp. \n
Parameters
----------
ftp : ftplib.FTP object
path : string ('/dir/folder/')
destination : string ('D:/dir/folder/') folder where the files will be saved
overwrite : bool - Overwrite file if already exists.
'''
try:
ftp.cwd(path)
os.mkdir(destination[:-1] + path)
print('New folder made: ' + destination[:-1] + path)
except OSError:
# folder already exists at the destination
pass
except ftplib.error_perm:
# invalid entry (ensure input form: "/dir/folder/")
print("error: could not change to " + path)
sys.exit("ending session")
# list children:
filelist = [i for i in ftp.mlsd()]
print('Current folder: ' + filelist.pop(0)[0])
for file in filelist:
if file[1]['type'] == 'file':
fullpath = os.path.join(destination[:-1] + path, file[0])
if (not overwrite and os.path.isfile(fullpath)):
continue
else:
with open(fullpath, 'wb') as f:
ftp.retrbinary('RETR ' + file[0], f.write)
print(file[0] + ' downloaded')
elif file[1]['type'] == 'dir':
fetchFiles(ftp, path + file[0] + '/', destination, overwrite)
else:
print('Unknown type: ' + file[1]['type'])
if __name__ == "__main__":
ftp = FTP('ftp address')
ftp.login('user', 'password')
source = r'/Folder/'
dest = r'D:/Data/'
fetchFiles(ftp, source, dest, overwrite=True)
ftp.quit()
Using ftputil, a fast solution could be:
def download(folder):
for item in ftp.walk(folder):
print("Creating dir " + item[0])
os.mkdir(item[0])
for subdir in item[1]:
print("Subdirs " + subdir)
for file in item[2]:
print(r"Copying File {0} \ {1}".format(item[0], file))
ftp.download(ftp.path.join(item[0],file), os.path.join(item[0],file))
It is non-trivial at least. In the simplest case, you only assume you have files and directories. This isn't always the case, there are softlinks and hardlinks and Windows-style shortcut. Softlink and directory shortcut are particularly problematic since they make recursive directory possible, which would confuse naive-ly implemented ftp grabber.
How would you handle such recursive directory depends on your need; you might simply not follow softlinks or you might try to detect recursive links. Detecting recursive link is inherently tricky, you cannot do it reliably.

Categories

Resources