How do I throttle an FTP download with Python ftplib? For example, how do I cap the speed at 20 Mb/s?
I'm using the following code to download files with Python ftplib:
from ftplib import FTP
import os

download_list = 'testlist.txt'  # initial list of directories to be downloaded
path_list = []  # initialize a list of all the paths from download_list
local_folder = 'testStorage'  # where files are going to be downloaded to
downloaded_list = 'completedownload.txt'  # list of completed downloads
error_list = 'incomplete_downloads.txt'  # list of paths that are incomplete

ftp = FTP("ftp.address.com")
ftp.login("user_name", "password")  # log in to the FTP account
print "Successfully logged in"

# make a list of files to download from a file
with open(download_list, 'r') as f:
    content = f.readlines()
    path_list = [x.strip() for x in content]

for path in path_list:
    path = path.replace("*", "")  # strip the * found in the source file
    print '\nChanging directory to ' + path + ':\n'
    try:  # try the path in the file
        ftp.cwd(path)
        filenames = ftp.nlst()
        for filename in filenames:
            local_directory = local_folder + path  # create the local path, e.g. testStorage/AAA/BBB/CCC/logic-1/
            local_filename = os.path.join(local_directory, filename)
            if not os.path.exists(local_filename):  # check if the file already exists
                if not os.path.exists(local_directory):  # mimic the remote path locally
                    os.makedirs(local_directory)
                file = open(local_filename, 'wb')
                ftp.retrbinary('RETR ' + filename, file.write)
                print filename
                file.close()
            else:  # skip the file if it exists
                print 'File ' + filename + ' already exists, skipping this file'
    except:  # if a path in the text file does not exist, write it to error_list.txt
        print 'Path ' + path + ' does not exist, writing path to error_list.txt'
        with open(error_list, 'a') as f2:
            f2.write(path + '\n')
        continue

print "all done, closing connection"
ftp.close()  # close the FTP connection
To throttle the download, implement a function that does file.write and time.sleep as needed, and pass that function to ftp.retrbinary as the callback (instead of file.write directly).
This pseudo-code (I do not do Python) should give you some idea:
import time

total_length = 0
start_time = time.time()

def write_and_sleep(buf):
    global file
    global total_length
    global start_time
    file.write(buf)
    total_length += len(buf)  # len(buf) counts the bytes received; sys.getsizeof would add object overhead
    # 100000000 bytes/s cap shown here; use 2500000 for 20 Mb/s (megabits)
    while (total_length / (time.time() - start_time)) > 100000000:
        time.sleep(0.1)

ftp.retrbinary('RETR ' + filename, write_and_sleep)
Reducing maxblocksize (the 3rd argument of ftp.retrbinary) may help achieve a smoother download curve.
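Putting both ideas together, here is a minimal sketch (the ThrottledWriter name is made up; 20 Mb/s is read as megabits, i.e. 2,500,000 bytes per second) that avoids the globals and passes a smaller blocksize:

import time

# Callback object: writes each block to a file and sleeps to hold a byte-rate cap
class ThrottledWriter:
    def __init__(self, fileobj, max_bytes_per_sec):
        self.fileobj = fileobj
        self.max_rate = max_bytes_per_sec
        self.total = 0
        self.start = time.time()

    def __call__(self, buf):
        self.fileobj.write(buf)
        self.total += len(buf)
        # sleep until the average rate drops back under the cap
        while self.total / max(time.time() - self.start, 1e-6) > self.max_rate:
            time.sleep(0.05)

with open(local_filename, 'wb') as f:
    ftp.retrbinary('RETR ' + filename, ThrottledWriter(f, 2500000), blocksize=4096)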
AIM: I wanted to record all of the files on a variety of hard disks and collect the file name, folders, and size of each file in megabytes. The code runs and, to my knowledge, doesn't produce any errors, but it doesn't produce the CSV file at the end.
What I've tried:
I've tried running the file with sudo, changing the permissions with chmod +x, checking that Python is in the same place for the standard user and the sudo user, and lastly removing or commenting out troublesome lines, which seems to yield different results or errors depending on the OS.
import os
from os import path
import sys
import datetime
from datetime import date, time, timedelta
import time
import csv
#from socket import gethostbyname

#variables
#hostname = str(socket.gethostname())
scandir = "/"
savefiledir = "/Users/joshua/Documents/python/"
textfilename = str(datetime.datetime.now().strftime("%Y-%m-%d")) + "_" + "directory_printer.csv"

#change directory to the root directory or the one which you want to scan for files (scandir)
os.getcwd()
os.chdir(scandir)
directory = os.getcwd()

#find all files in a directory and its subdirectories regardless of extension
results = [val for sublist in [[os.path.join(i[0], j) for j in i[2]] for i in os.walk(directory)] for val in sublist]

d = {}
file_count = 0
metadata = []

for file in results:
    #full path
    try:
        fullpath = file
    except:
        fullpath = None
    #file name
    try:
        file_directory = "/".join(str(file).split('/')[1:-1])
    except:
        file_directory = None
    #file extension
    try:
        file_ext = str(file).split('/')[-1]
    except:
        file_ext = None
    #subfolders
    try:
        parts = file_directory.split('/')
        sub_folders = ":".join(parts[1:-1])
    except:
        sub_folders = None
    #num subfolders
    try:
        count_subfolders = len(sub_folders.split(':'))
    except:
        count_subfolders = None
    #filesize megabytes
    try:
        filesize_mb = os.path.getsize(file)/1024  # note: this is actually KB; divide by 1024**2 for MB
    except:
        filesize_mb = None
    #date modified
    try:
        date_modified = datetime.datetime.now() - datetime.datetime.fromtimestamp(path.getmtime(file))
    except:
        date_modified = None
    #time modified
    #try:
    #    time_modified = os.stat(fullpath).st_mtime #time of most recent content modification
    #except:
    #    time_modified = None
    #time created (windows)
    #try:
    #    time_created = os.stat(fullpath).st_ctime #platform dependent; time of most recent metadata change on Unix, or the time of creation on Windows
    #except:
    #    time_created = None
    #record all file metadata
    d[file_count] = {'Full_Path': fullpath, 'File_Directory': file_directory,
                     'File_Extension': file_ext, 'List_Sub_Folders': sub_folders,
                     'Count_Sub_Folders': count_subfolders, 'Filesize_mb': filesize_mb,
                     'Date_Modified': date_modified}
    file_count = file_count + 1

#write the dictionary with the disk's file metadata to a csv file
with open(textfilename, 'w') as f:
    w = csv.writer(f)
    w.writerows(d.items())

print("Scanning directory: "
      + str(scandir) + " complete!" + "\n"
      + "The results have been saved to: " + "\n"
      + str(savefiledir) + str(textfilename))
As it is, your code will write the CSV file to scandir (/), not to savefiledir, because at the beginning of the program you call os.chdir(scandir). If you want the file in the right place (where the final printed message says it was saved), you should do:
# ...
#write the dictionary with the disk's file metadata to a csv file
with open(savefiledir + textfilename, 'w') as f:
    w = csv.writer(f)
    w.writerows(d.items())
# ...
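A slightly more robust variant of the same fix (just a sketch; os.path.join guards against a missing or doubled slash if savefiledir doesn't end with one):

import os

#write the dictionary with the disk's file metadata to a csv file
with open(os.path.join(savefiledir, textfilename), 'w') as f:
    w = csv.writer(f)
    w.writerows(d.items())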
I am trying to create a script that downloads files from an FTP site based on whether I have a folder to put them in (based on a date code) and whether or not I have already downloaded the file into that folder. I believe I have the pieces to do this; however, I am not sure how to combine them to get them to work.
So far to download the files from the FTP server I have the following code.
from ftplib import FTP
import os, sys, os.path
import re

def handleDownload(block):
    file.write(block)

ddir = 'U:/Test Folder'
os.chdir(ddir)
ftp = FTP('sidads.colorado.edu')
ftp.login()
print ('Logging in.')

directory = '/pub/DATASETS/NOAA/G02158/unmasked/2012/04_Apr/'
print ('Changing to ' + directory)
ftp.cwd(directory)
ftp.retrlines('LIST')

print ('Accessing files')
filenames = ftp.nlst()  # get filenames within the directory
print (filenames)

for filename in filenames:
    if filename not in ['.', '..']:
        local_filename = os.path.join(ddir, filename)
        print(filename)
        with open(local_filename, 'wb') as f_output:
            ftp.retrbinary('RETR ' + filename, f_output.write)

ftp.quit()
To pull the string values I need from the filenames, and to use those values to determine whether the destination folder and the file already exist, I used the following code. (An important note is that the string values pulled from the filenames match the codes used in my file paths, which allows me to match them.)
for fname in filenames:
    t1 = fname[16:20]
    t2 = fname[20:22]
    t3 = fname[22:24]
    if not t1: continue
    print (t1)  # You can append the result to a list
    print (t2)  # You can append the result to a list
    print (t3)  # You can append the result to a list
    if os.path.exists(os.path.join("U:/SWEModelConstruction/UnmaskedData/", t1, t2, t3)) == True and os.path.isfile(os.path.join("U:/SWEModelConstruction/UnmaskedData/", t1, t2, t3, fname)) != True:
To follow your method,
def check_path(fname):
    t1 = fname[16:20]
    t2 = fname[20:22]
    t3 = fname[22:24]
    if not t1:
        return False
    # dirr is the base data directory, e.g. "U:/SWEModelConstruction/UnmaskedData/"
    return (os.path.exists(os.path.join(dirr, t1, t2, t3))
            and not os.path.isfile(os.path.join(dirr, t1, t2, t3, fname)))

for filename in filenames:
    if filename not in ['.', '..'] and check_path(filename):
        local_filename = os.path.join(ddir, filename)
        print(filename)
        with open(local_filename, 'wb') as f_output:
            ftp.retrbinary('RETR ' + filename, f_output.write)
You basically need to check whether the path is available and, if not, make the directories.
Then check whether the file is already present using os.path.isfile(). That can be done in a simpler way, as (to make things more generalised):
def check_path(path):
    if not os.path.exists(path):
        return False
    return True

for filename in filenames:
    if filename not in ['.', '..']:
        local_filename = os.path.join(ddir, filename)
        path = '/'.join(local_filename.split('/')[:-1])
        # check if the dir exists, otherwise create it
        if not check_path(path):
            os.makedirs(path)
        # write only files that don't exist yet
        if not os.path.isfile(local_filename):
            with open(local_filename, 'wb') as f_output:
                ftp.retrbinary('RETR ' + filename, f_output.write)
Hope it helps!
I've written an FTP crawler to download specific files. It works up until it finds the specific file it wants to download, and then it throws this error:
ftplib.error_perm: 550
The file exists in my download folder, but its size is 0 kB.
Do I need to convert something in order to get it to download?
I can access the FTP manually and download the file without any problems, so I don't think it's the login part (unless there are different ways of logging in??)
Here's my code:
import ftplib
import re
import os

class Reader:
    def __init__(self):
        self.data = ""
    def __call__(self, s):
        self.data += s + "\n"

ftp = ftplib.FTP("my_ftp_server")
ftp.login()

r = Reader()
ftp.dir(r)

def get_file_list(folder):
    r = Reader()
    ftp.dir(folder, r)
    print ("Reading folder", folder)
    global tpe
    global name
    for l in r.data.split("\n"):
        if len(l) > 0:
            vars = re.split("[ ]*", l)
            tpe = vars[2]
            name = vars[3]
            if tpe == "<DIR>":
                get_file_list(folder + "/" + name)
            else:
                print (folder + name)
                for name in folder:
                    if vars[3].endswith(('501.zip', '551.zip')):
                        if os.path.exists('C:\\download\\' + vars[3]) == False:
                            fhandle = open(os.path.join('C:\\download\\', vars[3]), 'wb')
                            print ('Getting ' + vars[3])
                            ftp.retrbinary('RETR ' + vars[3], fhandle.write)
                            fhandle.close()
                        elif os.path.exists(('C:\\download\\' + vars[3])) == True:
                            print ('File ', vars[3], ' Already Exists, Skipping Download')

print("-" * 30)
print ("Fetching folders...")
get_file_list("")
Your code is probably OK.
FTP error 550 is caused by a permission issue on the server side.
This error means 'Requested action not taken. File unavailable (e.g., file not found, no access).', as listed on Wikipedia's page of FTP server return codes.
If you expect to have access to it, you should contact the sysadmin to rectify the file permission.
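If you want the crawler to keep going instead of dying on such a file, a minimal sketch (reusing the vars and download-path names from the question) is to catch ftplib.error_perm around the RETR and remove the empty local file it leaves behind:

local_path = os.path.join('C:\\download\\', vars[3])
fhandle = open(local_path, 'wb')
try:
    ftp.retrbinary('RETR ' + vars[3], fhandle.write)
except ftplib.error_perm as e:
    print ('Skipping ' + vars[3] + ': ' + str(e))
    fhandle.close()
    os.remove(local_path)  # delete the 0-byte file left behind
else:
    fhandle.close()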
I'm trying to implement a file transfer automation with Python 2.7 on Windows.
I have an FTPS server; I need to move some files from it to a local directory, and to upload some files from local to the FTPS server.
The FTPS structure is like so:
- ROOT FOLDER
  - AAA
    - abc_id1
      - in
      - out
    - abc_id2
      - in
      - out
    - abc_id3
      - in
      - out
  - BBB
    - abc_id1
      - in
      - out
    - abc_id2
      - in
      - out
    - abc_id3
      - in
      - out
I must first MOVE all files that match the wildcard ABC_*.csv from all the /in folders (for example AAA\abc_id1\in) to a local directory.
Then I must upload (COPY) some files that match a wildcard from the local directory to the corresponding abc_*/in folder (for example, a file named ABC_id3.csv must go to the abc_id3 folder).
I have begun the code:
from ftplib import FTP_TLS
ftps = FTP_TLS('ip_address')
ftps.login("user", "pass") # login before securing control channel
ftps.prot_p() # switch to secure data connection
#ftps.retrlines('LIST') # list directory content securely
ftps.cwd("AAA")
ftps.retrlines('LIST')
ftps.quit()
But I don't know how I can loop through the multiple folders to accomplish the task.
Please suggest some code.
Regards
Two things will help: walking through directories with os.walk, and generators.
You'll want to walk through the directories and check each file as you go; once you determine it's a file you want, you can apply the appropriate FTP functionality.
Here's a sample from one of the apps I'm working on. I've added the ability to exclude files as well.
# Generator which runs through directories and yields files
def scanDir(self, root, excludeDirs, excludeFiles, excludeExt, maxFileSize):
    print "Scanning directory " + root
    x = 0
    for root, dirnames, filenames in os.walk(root):
        for name in filenames:
            # We want the absolute path to these
            absroot = os.path.abspath(root)
            filename = os.path.join(absroot, name)
            fileSize = os.path.getsize(filename) / 1024
            x = x + 1
            ##TODO compressed files call here (Extension)
            if os.path.isfile(filename) and os.path.getsize(filename) > 0:
                if fileSize > maxFileSize:
                    continue
                else:
                    try:
                        data = open(os.path.join(root, name), 'rb').read()
                    except:
                        data = False
                        print "Could not read file :: %s/%s" % (root, name)
                    # TODO Create Exception here and filter file paths:
                    # regex for /home/*/mail
                    self.fileList.append({"filename": filename})
                    yield data, filename
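For the specific layout in the question, a minimal sketch (reusing the ftps connection from the question; the local_dir path is made up, and fnmatch handles the ABC_*.csv wildcard; note that whether nlst returns bare names or full paths varies by server, hence the basename calls) could loop over the two levels explicitly:

import os
import fnmatch
from ftplib import FTP_TLS

ftps = FTP_TLS('ip_address')
ftps.login("user", "pass")
ftps.prot_p()

local_dir = r'C:\local_dir'  # hypothetical local destination

for top in ('AAA', 'BBB'):
    for sub in ftps.nlst(top):  # e.g. AAA/abc_id1
        in_folder = top + '/' + os.path.basename(sub) + '/in'
        for fname in ftps.nlst(in_folder):
            base = os.path.basename(fname)
            if fnmatch.fnmatch(base, 'ABC_*.csv'):
                local_path = os.path.join(local_dir, base)
                with open(local_path, 'wb') as f:
                    ftps.retrbinary('RETR ' + in_folder + '/' + base, f.write)
                ftps.delete(in_folder + '/' + base)  # MOVE = download + delete

ftps.quit()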
Here's an example of recursively walking an FTP server and fetching zip files, with an anonymous login.
#!/usr/bin/env python
from ftplib import FTP
from time import sleep
import os

ftp = FTP('ftp2.census.gov')
ftp.login()

my_dirs = []  # global
my_files = []  # global
curdir = ''  # global

def get_dirs(ln):
    global my_dirs
    global my_files
    cols = ln.split(' ')
    objname = cols[len(cols)-1]  # file or directory name
    if ln.startswith('d'):
        my_dirs.append(objname)
    else:
        if objname.endswith('.zip'):
            my_files.append(os.path.join(curdir, objname))  # full path

def check_dir(adir):
    global my_dirs
    global my_files  # let it accrue, then fetch them all later
    global curdir
    my_dirs = []
    gotdirs = []  # local
    curdir = ftp.pwd()
    print("going to change to directory " + adir + " from " + curdir)
    ftp.cwd(adir)
    curdir = ftp.pwd()
    print("now in directory: " + curdir)
    ftp.retrlines('LIST', get_dirs)
    gotdirs = my_dirs
    print("found in " + adir + " directories:")
    print(gotdirs)
    print("Total files found so far: " + str(len(my_files)) + ".")
    sleep(1)
    for subdir in gotdirs:
        my_dirs = []
        check_dir(subdir)  # recurse
    ftp.cwd('..')  # back up a directory when done here

try:
    check_dir('/geo/tiger/GENZ2012')  # root directory to start in
except:
    print('oh dear.')
    ftp.quit()

ftp.cwd('/.')  # change to root directory for downloading
for f in my_files:
    print('getting ' + f)
    file_name = f.replace('/', '_')  # use path as filename prefix, with underscores
    ftp.retrbinary('RETR ' + f, open(file_name, 'wb').write)
    sleep(1)

ftp.quit()
print('all done!')
My program runs smoothly, but I want the files from FTP to be zipped on my local drive.
The problem is that only one file is being zipped after calling my main() function.
Here's my code:
import os
import upload
import download
import zipfile
import ConfigParser
import ftputil

def main():
    #create a folder Temp on the D drive for later use
    path = r'D:\Temp'
    os.mkdir(path)

    #parse all the values in the config.ini file
    config = ConfigParser.ConfigParser()
    config.readfp(open('config.ini'))
    server = config.get('main', 'Server')
    username = config.get('main', 'Username')
    password = config.get('main', 'Password')
    uploads = config.get('main', 'Upload folder')
    downloads = config.get('main', 'Download folder')

    #connect to ftp
    ftp = ftputil.FTPHost(server, username, password)
    dirlist = ftp.listdir(downloads)
    for list in dirlist:
        ftp.chdir(downloads)
        target = os.path.join(path, list)
        ftp.download(list, target)

    #########################################################
    # This section is where the algo fails but the program runs
    #########################################################
    #zipping files
    absolute_path = r'D:\Temp'
    dirlist = os.listdir(absolute_path)
    filepath = r'D:\Temp\project2.zip'
    for list in dirlist:
        get_file = os.path.join(absolute_path, list)
        zip_name = zipfile.ZipFile(filepath, 'w')
        zip_name.write(get_file, 'Project2b\\' + list)

if __name__ == '__main__':
    print "cannot be"
When you do this:
for list in dirlist:
    get_file = os.path.join(absolute_path, list)
    zip_name = zipfile.ZipFile(filepath, 'w')
    zip_name.write(get_file, 'Project2b\\' + list)
you recreate the ZipFile for each file you want to zip; the "w" mode means it is recreated from scratch every time, so only the last file survives.
Try this (create the zip file once, before the loop):
zip_name = zipfile.ZipFile(filepath, 'w')
for list in dirlist:
    get_file = os.path.join(absolute_path, list)
    zip_name.write(get_file, 'Project2b\\' + list)
Or this, which will open the zipfile in append mode:
for list in dirlist:
    get_file = os.path.join(absolute_path, list)
    zip_name = zipfile.ZipFile(filepath, 'a')
    zip_name.write(get_file, 'Project2b\\' + list)
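Either way, the archive is only guaranteed to be flushed once it is closed. Here is a sketch of the same loop using the context-manager form (available since Python 2.7), which closes it for you:

with zipfile.ZipFile(filepath, 'w') as zip_name:
    for list in dirlist:
        get_file = os.path.join(absolute_path, list)
        zip_name.write(get_file, 'Project2b\\' + list)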
Have a look at the shutil module. There is an example using shutil.make_archive():
http://docs.python.org/library/shutil.html
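For the simple case of zipping one folder's contents in one go, a sketch using shutil.make_archive (the D:\ paths follow the question; note the archive should live outside the folder being zipped):

import shutil

# creates D:\project2.zip containing everything under D:\Temp
shutil.make_archive(r'D:\project2', 'zip', r'D:\Temp')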
If you have a lot of files, you can zip them in parallel:
import zipfile
from pathlib import Path, WindowsPath
from typing import List, Text
import logging
from time import time
from concurrent.futures import ThreadPoolExecutor

logging.basicConfig(
    format="%(asctime)s - %(message)s", datefmt="%H:%M:%S", level=logging.DEBUG
)

PATH = r"\\some_directory\subdirectory\zipped"

def file_names() -> List[WindowsPath]:
    p = Path(PATH)
    file_names = list(p.glob("./*.csv"))
    logging.info("There are %d files", len(file_names))
    return file_names

def zip_file(file: WindowsPath) -> None:
    zip_file_name = Path(PATH, f"{file.stem}.zip")
    with zipfile.ZipFile(zip_file_name, "w") as zip:
        zip.write(file, arcname=file.name, compress_type=zipfile.ZIP_DEFLATED)

def main(files: List[Text]) -> None:
    t0 = time()
    number_of_files = len(files)
    with ThreadPoolExecutor() as executor:
        for counter, _ in enumerate(executor.map(zip_file, files), start=1):
            # update progress every 100 files
            if counter % 100 == 0:
                logging.info(
                    "Processed %d/%d. TT: %d:%d",
                    counter,
                    number_of_files,
                    *divmod(int(time() - t0), 60),
                )
    logging.info(
        "Finished zipping %d files. Total time: %d:%d",
        len(files),
        *divmod(int(time() - t0), 60),
    )

if __name__ == "__main__":
    files = file_names()
    main(files)
The best way to debug this is by putting debug statements in your for loops. There are two possibilities:
one is that the first for loop only downloads one file from the FTP folder;
two is that the first loop downloads all files but the second loop zips only one of them.
Use print statements to see which files are downloaded/zipped in the loops. Good luck!
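For instance, a minimal sketch of such prints (names taken from the question's main(); where the counts diverge tells you which loop is at fault):

for list in dirlist:
    ftp.chdir(downloads)
    target = os.path.join(path, list)
    print "Downloading %s -> %s" % (list, target)  # debug: one line per downloaded file
    ftp.download(list, target)

for list in os.listdir(absolute_path):
    print "Zipping %s" % list  # debug: one line per file to be zipped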