Python FTP download 550 error

I've written an ftp crawler to download specific files. It works up until it finds the specific file it wants to download, and then it throws this error:
ftplib.error_perm: 550
The file exists in my download folder, but the size of the file is 0 KB.
Do I need to convert something in order to get it to download?
I can access the FTP server manually and download the file without any problems, so I don't think it's the login part (unless there are different ways of logging in?).
Here's my code:
import ftplib
import re
import os

class Reader:
    def __init__(self):
        self.data = ""
    def __call__(self, s):
        self.data += s + "\n"

ftp = ftplib.FTP("my_ftp_server")
ftp.login()

r = Reader()
ftp.dir(r)

def get_file_list(folder):
    r = Reader()
    ftp.dir(folder, r)
    print ("Reading folder", folder)
    global tpe
    global name
    for l in r.data.split("\n"):
        if len(l) > 0:
            vars = re.split("[ ]*", l)
            tpe = vars[2]
            name = vars[3]
            if tpe == "<DIR>":
                get_file_list( folder + "/" + name )
            else:
                print (folder + name)
                for name in folder:
                    if vars[3].endswith(('501.zip','551.zip')):
                        if os.path.exists('C:\\download\\' + vars[3]) == False:
                            fhandle = open(os.path.join('C:\\download\\', vars[3]), 'wb')
                            print ('Getting ' + vars[3])
                            ftp.retrbinary('RETR ' + vars[3], fhandle.write)
                            fhandle.close()
                        elif os.path.exists(('C:\\download\\' + vars[3])) == True:
                            print ('File ', vars[3], ' Already Exists, Skipping Download')
    print("-"*30)

print ("Fetching folders...")
get_file_list("")

Your code is probably OK.
FTP error 550 is caused by a permission issue on the server side.
This error means 'Requested action not taken. File unavailable (e.g., file not found, no access)', as listed in Wikipedia's list of FTP server return codes.
If you expect to have access to the file, you should contact the sysadmin to fix the file permissions.
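If you want the script itself to surface the problem instead of just crashing, a minimal sketch (the host and file name below are placeholders, not the asker's real values) is to catch ftplib.error_perm around the transfer, print the full server reply, and remove the zero-byte file that open(..., 'wb') leaves behind:

import ftplib
import os

ftp = ftplib.FTP("my_ftp_server")  # placeholder host, as in the question
ftp.login()

remote_name = 'example_501.zip'  # hypothetical file name
local_path = os.path.join('C:\\download\\', remote_name)

try:
    with open(local_path, 'wb') as fhandle:
        ftp.retrbinary('RETR ' + remote_name, fhandle.write)
except ftplib.error_perm as exc:
    # The exception text carries the server's full reply (often more
    # specific than the bare "550" code).
    print('Server refused the transfer:', exc)
    # open(..., 'wb') already created an empty file; clean it up.
    if os.path.exists(local_path) and os.path.getsize(local_path) == 0:
        os.remove(local_path)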

Related

Incomplete extraction of Zip files using Python

I have a script where I download a bunch of Zip files (150+) from a website and unzip them. I just noticed that the Zip files weren't completely extracting - i.e., there should be 68 files in each directory and there are only 62. The script ran fine with no errors.
Any thoughts? I tried running one Zip file through by itself and it extracted fine. Could the operation be timing out or something? Please forgive my code, I'm new.
I'm running Python 2.7.
import csv, urllib, urllib2, zipfile
from datetime import date

dlList = []
dloadUrlBase = r"https://websoilsurvey.sc.egov.usda.gov/DSD/Download/Cache/SSA/"
dloadLocBase = r"Z:/Shared/Corporate/Library/GIS_DATA/Soils/"
stateDirList = []
countyDirList = []
fileNameList = []
unzipList = []
extractLocList = []
logfile = 'log_{}.txt'.format(date.today())

with open(r'N:\Shared\Service Areas\Geographic Information Systems\Tools and Scripts\Soil_Downloads\FinalListforDownloads.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        stateDirList.append(row['StateDir'])
        countyDirList.append(row['CountyDir'])
        fileNameList.append(row['File_Name'])

for state, county, fileName in zip(stateDirList, countyDirList, fileNameList):
    dloadDir = dloadLocBase + state + r"/" + county + "/" + fileName
    requestURL = dloadUrlBase + fileName
    extractLocList.append(dloadLocBase + state + r"/" + county + "/")
    try:
        urllib.urlretrieve(requestURL, dloadDir)
        print requestURL + " found"
        urllib.urlcleanup()
        unzipList.append(dloadDir)
        f = open(logfile, 'a+')
        f.write(dloadDir + " has been downloaded")
        f.close()
    except:
        pass

for zFile, uzDir in zip(unzipList, extractLocList):
    zip_ref = zipfile.ZipFile(zFile, "r")
    zip_ref.extractall(uzDir)
    zip_ref.close()
Instead of just passing when an error is raised, log or print what the error is. That should indicate what the issue, or set of issues, is.
except Exception as e:
    print e  # or print e.message
It turns out that this was a syncing issue with my network. We use a cloud-based network that syncs with our offices, so somehow not all of the files were being synced, and some were getting left in a queue.
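Whatever the root cause, it can also help to verify each archive before extracting it. A minimal sketch, reusing the unzipList/extractLocList names from the script above and only the standard zipfile module, that skips and reports suspect archives:

import zipfile

for zFile, uzDir in zip(unzipList, extractLocList):
    # Reject files that are not valid zip archives at all
    # (e.g. zero-byte or partially synced downloads).
    if not zipfile.is_zipfile(zFile):
        print zFile + " is not a valid zip archive, skipping"
        continue
    zip_ref = zipfile.ZipFile(zFile, "r")
    # testzip() reads every member and returns the name of the first one
    # that fails its CRC check, or None if the archive is intact.
    bad_member = zip_ref.testzip()
    if bad_member is not None:
        print zFile + " has a corrupt member (" + bad_member + "), skipping"
    else:
        zip_ref.extractall(uzDir)
    zip_ref.close()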

How to skip unhashable (corrupt) files while md5 fingerprinting?

The code below makes an md5/metadata fingerprint, but crashes on files with unknown corruption (e.g., files that can be copied and mostly even opened, but that cannot be hashed or zipped up [to disguise their corruption]).
Question: How can one make this code skip or ignore any and all problem files and just process the rest? Imagine 1 million files on 8 TB. Otherwise, since I leave it running with no real-time monitoring of progress, I may find out two days later that nothing got hashed because a couple of problem files made the code hang.
Part of the code (see full code below):
def createBasicInfoListFromDisk():
    global diskCompareListDetails, onlyFileNameOnDisk, driveLetter, walk_dir
    walk_dir = os.path.abspath(walk_dir)
    for root, subdirs, files in os.walk(walk_dir, topdown=True, onerror=None, followlinks=True):
        for filename in files:
            file_path = os.path.join(root, filename)
            temp = file_path.split(":")
            driveLetter = temp[0]
            filePathWithoutDriveLetter = temp[1]
            fileSize = os.path.getsize(file_path)
            mod_on = get_last_write_time(file_path)
            print('\t- file %s (full path: %s)' % (filename, file_path))
            print('FileName : {filename} is of size {size} and was modified on{mdt}'.format(filename=file_path, size=fileSize, mdt=mod_on))
            diskCompareListDetails.append("\"" + filePathWithoutDriveLetter + "\",\"" + str(fileSize) + "\",\"" + mod_on + '"')
            onlyFileNameOnDisk.append("\"" + filePathWithoutDriveLetter + "\"")
    return
Error:
FileName : T:\problemtest\problemfile.doc is of size 27136 and was modified on2010-10-10 13:58:32
Traceback (most recent call last):
  File "t:\scripts\test.py", line 196, in <module>
    createBasicInfoListFromDisk()
  File "t:\scripts\test.py", line 76, in createBasicInfoListFromDisk
    mod_on = get_last_write_time(file_path)
  File "t:\scripts\test.py", line 61, in get_last_write_time
    convert_time_to_human_readable = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(st.st_mtime))
OSError: [Errno 22] Invalid argument
Full code:
import os
import sys
import time
import datetime
import difflib
import decimal
import hashlib
from pip._vendor.distlib.compat import raw_input

csvListDetails = list()
csvCompareListDetails = list()
diskCompareListDetails = list()
onlyFileNameOnDisk = list()
addedFiles = list()
removedFiles = list()
driveLetter = ""
finalFilesToChange = list()
finalFilesToDelete = list()
changedFiles = list()
csvfilewithPath = "md5.csv"
import shutil
walk_dir = ""

def findAndReadCSVFile(fileName):
    global csvListDetails
    global csvCompareListDetails
    haveIgnoredLine = 0
    foundFile = 0
    try:
        inputFileHandler = open(fileName, "rt", encoding='utf-8')
        update_time = get_last_write_time(fileName)
        print("\n Found md5.csv, last updated on: %s" % update_time)
        foundFile = 1
    except (OSError, IOError, FileNotFoundError):
        print("\n md5.csv not found. Will create a new one.")
        return foundFile
    for line in inputFileHandler:
        if (haveIgnoredLine == 0):
            haveIgnoredLine = 1
            continue
        rowItem = line.replace("\n", "").split('","')
        csvCompareListDetails.append('"' + rowItem[3] + ',"' + rowItem[2] + '","' + rowItem[1] + '"')
        lineDetails = list()
        for detailNum in range(0, len(rowItem)):
            lineDetails.append('"' + (rowItem[detailNum].replace('"', '')) + '"')
        csvListDetails.append(lineDetails)
    inputFileHandler.close()
    return foundFile

def get_last_write_time(filename):
    st = os.stat(filename)
    convert_time_to_human_readable = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(st.st_mtime))
    return convert_time_to_human_readable

def createBasicInfoListFromDisk():
    global diskCompareListDetails, onlyFileNameOnDisk, driveLetter, walk_dir
    walk_dir = os.path.abspath(walk_dir)
    for root, subdirs, files in os.walk(walk_dir, topdown=True, onerror=None, followlinks=True):
        for filename in files:
            file_path = os.path.join(root, filename)
            temp = file_path.split(":")
            driveLetter = temp[0]
            filePathWithoutDriveLetter = temp[1]
            fileSize = os.path.getsize(file_path)
            mod_on = get_last_write_time(file_path)
            print('\t- file %s (full path: %s)' % (filename, file_path))
            print('FileName : {filename} is of size {size} and was modified on{mdt}'.format(filename=file_path, size=fileSize, mdt=mod_on))
            diskCompareListDetails.append("\"" + filePathWithoutDriveLetter + "\",\"" + str(fileSize) + "\",\"" + mod_on + '"')
            onlyFileNameOnDisk.append("\"" + filePathWithoutDriveLetter + "\"")
    return

def compareLogAndDiskLists():
    global addedFiles, removedFiles
    diff = difflib.unified_diff(csvCompareListDetails, diskCompareListDetails, fromfile='file1', tofile='file2', lineterm='', n=0)
    lines = list(diff)[2:]
    addedFiles = [line[1:] for line in lines if line[0] == '+']
    removedFiles = [line[1:] for line in lines if line[0] == '-']
    return

def displayInfoForUserInput():
    global finalFilesToChange, finalFilesToDelete
    changedOrNewFileCount = 0
    noLongerExistingFilesCount = 0
    totalSizeOfChange = 0
    for line in addedFiles:
        if line not in removedFiles:
            changedOrNewFileCount = changedOrNewFileCount + 1
            elements = line.replace("\n", "").split('","')
            sizeOfFile = int(elements[1].replace('"', ''))
            totalSizeOfChange = totalSizeOfChange + sizeOfFile
            finalFilesToChange.append(elements[0] + '"')
    for line in removedFiles:
        elements = line.split('","')
        if elements[0] + '"' not in onlyFileNameOnDisk:
            noLongerExistingFilesCount = noLongerExistingFilesCount + 1
            finalFilesToDelete.append(elements[0] + '"')
    GBModSz = decimal.Decimal(totalSizeOfChange) / decimal.Decimal('1073741824')
    print("\n New or modified files on drive: {} (need to hash)".format(changedOrNewFileCount))
    print(" Obsolete lines in md5.csv (files modified or not on drive): {} (lines to delete)".format(noLongerExistingFilesCount))
    print(" {} files ({:.2f} GB) need to be hashed.".format(changedOrNewFileCount, GBModSz))
    userInput = raw_input("\n Proceed with hash? (Y/N, Yes/No) ")
    if (userInput.strip().upper() == "Y" or userInput.strip().upper() == "YES"):
        print("Continuing Processing...")
    else:
        print("You opted not to continue, Exiting")
        sys.exit()
    return

def processFiles(foundFile):
    if (foundFile == 1):
        oldFileName = walk_dir + "/md5.csv"
        shutil.copy(oldFileName, getTargetFileName(oldFileName))
    BLOCKSIZE = 1048576 * 4
    global changedFiles
    for fileToHash in finalFilesToChange:
        hasher = hashlib.new('md5')
        fileToUse = driveLetter + ":" + fileToHash.replace('"', '')
        with open(fileToUse, 'rb') as afile:
            buf = afile.read(BLOCKSIZE)
            while len(buf) > 0:
                hasher.update(buf)
                buf = afile.read(BLOCKSIZE)
        fileDetails = list()
        fileDetails.append(hasher.hexdigest())
        fileDetails.append(get_last_write_time(fileToUse))
        fileDetails.append(os.path.getsize(fileToUse))
        fileDetails.append(fileToHash)
        changedFiles.append(fileDetails)
    return

def getTargetFileName(oldFileName):
    targetFileName = walk_dir + "/generated_on_" + get_last_write_time(oldFileName).replace(" ", "_").replace("-", "").replace(":", "")
    targetFileName = targetFileName + "__archived_on_" + datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    targetFileName = targetFileName + "__md5.csv"
    return targetFileName

def writeCSVFile(fileName):
    try:
        outputFileHandler = open(fileName, "wt", encoding='utf-8')
        outputFileHandler.write("\"md5Hash\",\"LastWriteTime\",\"Length\",\"FullName\"\n")
        for details in csvListDetails:
            if details[3] in finalFilesToDelete:
                continue
            if details[3] in finalFilesToChange:
                continue
            outputFileHandler.write("{},{},{},{}\n".format(details[0], details[1], details[2], details[3]))
        for details in changedFiles:
            outputFileHandler.write("\"{}\",\"{}\",\"{}\",{}\n".format(details[0], details[1], details[2], details[3]))
        outputFileHandler.close()
    except (OSError, IOError, FileNotFoundError) as e:
        print("ERROR :")
        print("File {} is either not writable or some other error: {}".format(fileName, e))
    return

if __name__ == '__main__':
    walk_dir = raw_input("\n Enter drive or directory to scan: ")
    csvfilewithPath = walk_dir + "/md5.csv"
    print("\n Drive to scan: " + walk_dir)
    foundFile = 0
    foundFile = findAndReadCSVFile(csvfilewithPath)
    createBasicInfoListFromDisk()
    compareLogAndDiskLists()
    displayInfoForUserInput()
    processFiles(foundFile)
    writeCSVFile(csvfilewithPath)
Trying this fix, no luck:
def get_last_write_time(filename):
    try:
        st = os.stat(filename)
        convert_time_to_human_readable = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(st.st_mtime))
        return convert_time_to_human_readable
    except OSError:
        pass
    return "ERROR"

def createBasicInfoListFromDisk():
I agree with IMCoins and I'm very curious why the except isn't catching the error.
So the first thing I would do is go to the source where the OSError is being raised and try to catch it explicitly.
def get_last_write_time(filename):
    try:
        st = os.stat(filename)
        convert_time_to_human_readable = time.strftime("%Y-%m-%d %H:%M:%S",
                                                       time.localtime(st.st_mtime))
        return convert_time_to_human_readable
    except OSError:
        pass
    return "ERROR"  # or whatever string you want to add
Updated answer, for updated post.
As stated earlier, a bare except statement (one with no exception type specified) catches everything. So, in order to do what you want, I'm afraid the possible answers are either:
Make a method that identifies corrupted files and handles them properly.
Make try/except statements that encapsulate every part of your code where there could be an error.
Let me warn you about the second solution though, as sometimes there are system errors that you do not want to silently swallow. I believe you should print the exception that you catch, in order to identify further problems you may encounter.
Just so you know, in case you did not notice: your error is not inside a try/except statement. It is raised (if I copied and pasted properly into my editor) at line 196, createBasicInfoListFromDisk(), and then at line 76, mod_on = get_last_write_time(file_path).
Since you also mentioned that you are using Python 3.x, I suggest looking into contextlib.suppress (https://docs.python.org/3/library/contextlib.html#contextlib.suppress).
I hope this helps.
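For the first option, here is a minimal sketch of createBasicInfoListFromDisk() that wraps everything touching each individual file in a try/except, so a single unreadable file is logged and skipped instead of stopping the whole scan (skippedFiles is a new name introduced here purely for illustration):

def createBasicInfoListFromDisk():
    global diskCompareListDetails, onlyFileNameOnDisk, driveLetter, walk_dir
    skippedFiles = []  # hypothetical: collect problem files for later review
    walk_dir = os.path.abspath(walk_dir)
    for root, subdirs, files in os.walk(walk_dir, topdown=True, onerror=None, followlinks=True):
        for filename in files:
            file_path = os.path.join(root, filename)
            try:
                # Everything that touches the (possibly corrupt) file goes
                # inside the try block, not only os.stat().
                fileSize = os.path.getsize(file_path)
                mod_on = get_last_write_time(file_path)
            except (OSError, ValueError) as e:
                # The traceback above shows OSError; ValueError is included
                # defensively for odd timestamp conversions.
                print('Skipping problem file %s (%s)' % (file_path, e))
                skippedFiles.append(file_path)
                continue
            driveLetter, filePathWithoutDriveLetter = file_path.split(":", 1)
            diskCompareListDetails.append('"' + filePathWithoutDriveLetter + '","' + str(fileSize) + '","' + mod_on + '"')
            onlyFileNameOnDisk.append('"' + filePathWithoutDriveLetter + '"')
    return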

python code doesn't generate the csv file [closed]

AIM: I want to record all of the files on a variety of hard disks and collect the file name, folders, and size of each file in megabytes. The code runs and, to my knowledge, doesn't produce any errors, but it doesn't produce the csv file at the end.
What I've tried:
I've tried running the file with sudo, changing the permissions with chmod +x, checking that python is in the same place for the standard user and the sudo user, and lastly removing or commenting out troublesome lines which seem to yield different results or errors depending on the OS.
import os
from os import path
import sys
import datetime
from datetime import date, time, timedelta
import time
import csv
#from socket import gethostbyname

#variables
#hostname = str(socket.gethostname())
scandir = "/"
savefiledir = "/Users/joshua/Documents/python/"
textfilename = str(datetime.datetime.now().strftime("%Y-%m-%d")) + "_" + "directory_printer.csv"

#change directory to the root directory or the one which you want to scan for files (scandir)
os.getcwd()
os.chdir(scandir)
directory = os.getcwd()

#find all files in a directory and its sub directories regardless of extension
results = [val for sublist in [[os.path.join(i[0], j) for j in i[2]] for i in os.walk(directory)] for val in sublist]

d = {}
file_count = 0
metadata = []

for file in results:
    #full path
    try:
        fullpath = file
    except:
        fullpath = None
    #file name
    try:
        file_directory = "/".join(str(file).split('/')[1:-1])
    except:
        file_directory = None
    #file extension
    try:
        file_ext = str(file).split('/')[-1]
    except:
        file_ext = None
    #subfolders
    try:
        parts = file_directory.split('/')
        sub_folders = ":".join(parts[1:-1])
    except:
        sub_folders = None
    #num subfolders
    try:
        count_subfolders = len(sub_folders.split(':'))
    except:
        count_subfolders = None
    #filesize megabytes
    try:
        filesize_mb = os.path.getsize(file)/1024
    except:
        filesize_mb = None
    #date modified
    try:
        date_modified = datetime.datetime.now() - datetime.datetime.fromtimestamp(path.getmtime(file))
    except:
        date_modified = None
    #time modified
    #try:
    #    time_modified = os.stat(fullpath).st_mtime #time of most recent content modification
    #except:
    #    time_modified = None
    #time created (windows)
    #try:
    #    time_created = os.stat(fullpath).st_ctime #platform dependent; time of most recent metadata change on Unix, or the time of creation on Windows
    #except:
    #    time_created = None
    #record all file metadata
    d[file_count] = {'Full_Path': fullpath, 'File_Directory': file_directory,
                     'File_Extension': file_ext, 'List_Sub_Folders': sub_folders,
                     'Count_Sub_Folders': count_subfolders, 'Filesize_mb': filesize_mb,
                     'Date_Modified': date_modified}
    file_count = file_count + 1

#write the dictionary with the disk's file metadata to a csv file
with open(textfilename, 'w') as f:
    w = csv.writer(f)
    w.writerows(d.items())

print("Scanning directory: "
      + str(scandir) + " complete!" + "\n"
      + "The results have been saved to: " + "\n"
      + str(savefiledir) + str(textfilename))
As it is, it looks like your code will write the CSV file to scandir (/), not to savefiledir, because at the beginning of the program you call os.chdir(scandir). If you want the file to end up in the right place (where the final printed message says it was saved), you should do:
# ...
#write the dictionary with the disk's file metadata to a csv file
with open(savefiledir + textfilename, 'w') as f:
    w = csv.writer(f)
    w.writerows(d.items())
# ...
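As a side note, since d maps a counter to a dict of metadata fields, writerows(d.items()) produces two-column rows of (index, dict). A possible variant, a sketch that assumes the same field names used when building d, writes one named column per field and builds the path with os.path.join:

import csv
import os

fieldnames = ['Full_Path', 'File_Directory', 'File_Extension',
              'List_Sub_Folders', 'Count_Sub_Folders',
              'Filesize_mb', 'Date_Modified']

with open(os.path.join(savefiledir, textfilename), 'w') as f:
    w = csv.DictWriter(f, fieldnames=fieldnames)
    w.writeheader()  # header row with the field names
    for row in d.values():
        w.writerow(row)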

Throttling FTP download with Python ftplib

How do I throttle an FTP download with Python's ftplib? For example, cap the speed at 20 Mb/s.
I'm using the following code to download files with Python ftplib:
from ftplib import FTP
import os

download_list = 'testlist.txt'  # initial list of directories to be downloaded
path_list = []  # initialize a list of all the paths from download_list
local_folder = 'testStorage'  # where files are going to be downloaded to
downloaded_list = 'completedownload.txt'  # list of completed downloads
error_list = 'incomplete_downloads.txt'  # list of paths that are incomplete

ftp = FTP("ftp.address.com")
ftp.login("user_name", "password")  # login to FTP account
print "Successfully logged in"

# make a list of files to download from a file
with open(download_list, 'r') as f:
    content = f.readlines()
    path_list = [x.strip() for x in content]

for path in path_list:
    path = path.replace("*", "")  # strips the * found in the source file
    print '\nChanging directory to ' + path + ':\n'
    #ftp.cwd('/AAA/BBB/CCC/logic-1/') # the format to change into path, note the * is omitted
    #if ftp.cwd(path) == True:
    try:  # tries the path in the file
        ftp.cwd(path)
        #ftp.retrlines('LIST')
        filenames = ftp.nlst()
        for filename in filenames:
            local_directory = local_folder + path  # create the local path, e.g. testStorage/AAA/BBB/CCC/logic-1/
            local_filename = os.path.join(local_directory, filename)
            if os.path.exists(local_filename) == False:  # checks if file already exists
                if not os.path.exists(local_directory):  # mimic the remote path locally
                    os.makedirs(local_directory)
                file = open(local_filename, 'wb')
                ftp.retrbinary('RETR ' + filename, file.write)
                print filename
                file.close()
            elif os.path.exists(local_filename) == True:  # skip the file if it exists
                print 'File ' + filename + ' already exists, skipping this file'
    except:  # if the path in the text file does not exist, write it to error_list.txt
        print 'Path ' + path + ' does not exist, writing path to error_list.txt'
        with open(error_list, 'a') as f2:
            f2.write(path + '\n')
        continue

print "all done closing connection"
ftp.close()  # CLOSE THE FTP CONNECTION
To throttle the download, just implement a function that does file.write and time.sleep as needed. Pass that function to ftp.retrbinary as callback (instead of file.write directly).
This pseudo code (I do not do Python) should give you some idea:
import time

total_length = 0
start_time = time.time()

def write_and_sleep(buf):
    global file
    global total_length
    global start_time
    file.write(buf)
    total_length += len(buf)
    # If the average rate so far exceeds the cap (in bytes per second), pause briefly.
    while (total_length / (time.time() - start_time)) > 100000000:
        time.sleep(0.1)

ftp.retrbinary('RETR ' + filename, write_and_sleep)
Reducing maxblocksize (the third argument of ftp.retrbinary) may help achieve a smoother download curve.
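For reference, here is a self-contained sketch of the same idea as a small callback class, so no globals are needed; treating the question's 20 Mb/s as 20 megabits per second (about 2.5 MB/s) is an assumption:

import time

class ThrottledWriter:
    # retrbinary callback: write each block, then sleep whenever the
    # average download rate exceeds the configured cap.
    def __init__(self, fileobj, max_bytes_per_sec):
        self.fileobj = fileobj
        self.max_bytes_per_sec = max_bytes_per_sec
        self.total_bytes = 0
        self.start_time = time.time()

    def __call__(self, buf):
        self.fileobj.write(buf)
        self.total_bytes += len(buf)
        elapsed = time.time() - self.start_time
        while elapsed > 0 and self.total_bytes / elapsed > self.max_bytes_per_sec:
            time.sleep(0.1)
            elapsed = time.time() - self.start_time

# Usage inside the download loop above (20 Mb/s is roughly 2.5 MB/s):
# with open(local_filename, 'wb') as local_file:
#     writer = ThrottledWriter(local_file, 2.5 * 1024 * 1024)
#     ftp.retrbinary('RETR ' + filename, writer, blocksize=8192)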

Python - move / upload specific files to FTPS

I'm trying to implement a file transfer automation with Python 2.7 on Windows.
I have an FTPS server; I need to move some files from it to a local directory and to upload some files from the local directory to the FTPS server.
The FTPS structure is like so:
- ROOT FOLDER
  - AAA
    - abc_id1
      - in
      - out
    - abc_id2
      - in
      - out
    - abc_id3
      - in
      - out
  - BBB
    - abc_id1
      - in
      - out
    - abc_id2
      - in
      - out
    - abc_id3
      - in
      - out
I must first MOVE all files that match the wildcard ABC_*.csv (they are located in all the /in folders, e.g. AAA\abc_id1\in) to a local directory.
Then I must upload (COPY) some files matching a wildcard from the local directory to the corresponding abc_*/in folder (e.g. a file named ABC_id3.csv must go to the abc_id3 folder).
I have begun the code:
from ftplib import FTP_TLS
ftps = FTP_TLS('ip_address')
ftps.login("user", "pass") # login before securing control channel
ftps.prot_p() # switch to secure data connection
#ftps.retrlines('LIST') # list directory content securely
ftps.cwd("AAA")
ftps.retrlines('LIST')
ftps.quit()
But I don't know how I can loop through the multiple folders to accomplish the task.
Please suggest some code.
Regards
Two things will help: walking through directories with os.walk, and generators.
You'll want to walk through the directories and check each file going through. Once you determine it's a file you want you can apply the appropriate FTP functionality.
Here's a sample I have from one of my apps I'm working on. I've added the ability to exclude as well.
# Generator which runs through directories and returns files
def scanDir(self, root, excludeDirs, excludeFiles, excludeExt, maxFileSize):
    global fileList
    print "Scanning directory " + root
    x = 0
    for root, dirnames, filenames in os.walk(root):
        for name in filenames:
            # We want absolute paths to these
            absroot = os.path.abspath(root)
            filename = os.path.join(absroot, name)
            fileSize = os.path.getsize(filename) / 1024
            x = x + 1
            #print x
            ##TODO compressed files call here (Extension)
            if (os.path.isfile(filename) and os.path.getsize(filename) > 0):
                if fileSize > maxFileSize:
                    continue
                else:
                    try:
                        #print root + name
                        os.path.getsize(filename)
                        data = open(root + "/" + name, 'rb').read()
                    except:
                        data = False
                        print "Could not read file :: %s/%s" % (root, name)
                    # TODO Create Exception here and filter file paths:
                    # regex for /home/*/mail
                    self.fileList.append({"filename": filename})
                    yield data, filename
Here's an example of recursively walking an FTP server and fetching zip files, with an anonymous login.
#!/usr/bin/env python

from ftplib import FTP
from time import sleep
import os

ftp = FTP('ftp2.census.gov')
ftp.login()

my_dirs = []  # global
my_files = []  # global
curdir = ''  # global

def get_dirs(ln):
    global my_dirs
    global my_files
    cols = ln.split(' ')
    objname = cols[len(cols)-1]  # file or directory name
    if ln.startswith('d'):
        my_dirs.append(objname)
    else:
        if objname.endswith('.zip'):
            my_files.append(os.path.join(curdir, objname))  # full path

def check_dir(adir):
    global my_dirs
    global my_files  # let it accrue, then fetch them all later
    global curdir
    my_dirs = []
    gotdirs = []  # local
    curdir = ftp.pwd()
    print("going to change to directory " + adir + " from " + curdir)
    ftp.cwd(adir)
    curdir = ftp.pwd()
    print("now in directory: " + curdir)
    ftp.retrlines('LIST', get_dirs)
    gotdirs = my_dirs
    print("found in " + adir + " directories:")
    print(gotdirs)
    print("Total files found so far: " + str(len(my_files)) + ".")
    sleep(1)
    for subdir in gotdirs:
        my_dirs = []
        check_dir(subdir)  # recurse
    ftp.cwd('..')  # back up a directory when done here

try:
    check_dir('/geo/tiger/GENZ2012')  # root directory to start in
except:
    print('oh dear.')
    ftp.quit()

ftp.cwd('/.')  # change to root directory for downloading
for f in my_files:
    print('getting ' + f)
    file_name = f.replace('/', '_')  # use path as filename prefix, with underscores
    ftp.retrbinary('RETR ' + f, open(file_name, 'wb').write)
    sleep(1)

ftp.quit()
print('all done!')
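Tying this back to the FTPS question above, here is a hedged sketch that combines the same ideas (recursive listing plus fnmatch for the wildcard) to move ABC_*.csv files out of every /in folder and upload the matching local file into each abc_* folder. The host, credentials, local path, and the naming rule linking ABC_id3.csv to abc_id3 are all assumptions:

import fnmatch
import os
from ftplib import FTP_TLS

local_dir = r'C:\transfer'      # placeholder local directory
ftps = FTP_TLS('ip_address')    # placeholder host, as in the question
ftps.login('user', 'pass')
ftps.prot_p()                   # secure the data connection

for top in ('AAA', 'BBB'):
    ftps.cwd('/' + top)
    for sub in ftps.nlst():     # abc_id1, abc_id2, ... (some servers return full paths)
        in_dir = '/' + top + '/' + sub + '/in'
        try:
            ftps.cwd(in_dir)
        except Exception:
            continue            # skip listings that are not the expected folders
        # 1) MOVE every ABC_*.csv from this /in folder to the local directory.
        for name in fnmatch.filter(ftps.nlst(), 'ABC_*.csv'):
            local_path = os.path.join(local_dir, name)
            with open(local_path, 'wb') as fh:
                ftps.retrbinary('RETR ' + name, fh.write)
            ftps.delete(name)   # deleting the remote copy makes it a move
        # 2) COPY the local file that belongs to this folder into its /in folder,
        #    e.g. a local ABC_id3.csv goes into .../abc_id3/in (assumed naming rule).
        wanted = 'ABC_' + sub.split('_', 1)[-1] + '.csv'
        local_path = os.path.join(local_dir, wanted)
        if os.path.exists(local_path):
            with open(local_path, 'rb') as fh:
                ftps.storbinary('STOR ' + wanted, fh)

ftps.quit()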
