Progress bar while uploading a file to dropbox - python

import dropbox
client = dropbox.client.DropboxClient('<token>')
f = open('/ssd-scratch/abhishekb/try/1.mat', 'rb')
response = client.put_file('/data/1.mat', f)
I want to upload a big file to dropbox. How can I check the progress? [Docs]
EDIT:
The uploader offeset is same below somehow. What am I doing wrong
import os,pdb,dropbox
size=1194304
client = dropbox.client.DropboxClient(token)
path='D:/bci_code/datasets/1.mat'
tot_size = os.path.getsize(path)
bigFile = open(path, 'rb')
uploader = client.get_chunked_uploader(bigFile, size)
print "uploading: ", tot_size
while uploader.offset < tot_size:
try:
upload = uploader.upload_chunked()
print uploader.offset
except rest.ErrorResponse, e:
print("something went wrong")
EDIT 2:
size=1194304
tot_size = os.path.getsize(path)
bigFile = open(path, 'rb')
uploader = client.get_chunked_uploader(bigFile, tot_size)
print "uploading: ", tot_size
while uploader.offset < tot_size:
try:
upload = uploader.upload_chunked(chunk_size=size)
print uploader.offset
except rest.ErrorResponse, e:
print("something went wrong")

upload_chunked, as the documentation notes:
Uploads data from this ChunkedUploader's file_obj in chunks, until an
error occurs. Throws an exception when an error occurs, and can be
called again to resume the upload.
So yes, it uploads the entire file (unless an error occurs) before returning.
If you want to upload a chunk at a time on your own, you should use upload_chunk and commit_chunked_upload.
Here's some working code that shows you how to upload a single chunk at a time and print progress in between chunks:
from io import BytesIO
import os
from dropbox.client import DropboxClient
client = DropboxClient(ACCESS_TOKEN)
path = 'test.data'
chunk_size = 1024*1024 # 1MB
total_size = os.path.getsize(path)
upload_id = None
offset = 0
with open(path, 'rb') as f:
while offset < total_size:
offset, upload_id = client.upload_chunk(
BytesIO(f.read(chunk_size)),
offset=offset, upload_id=upload_id)
print('Uploaded so far: {} bytes'.format(offset))
# Note the "auto/" on the next line, which is needed because
# this method doesn't attach the root by itself.
client.commit_chunked_upload('auto/test.data', upload_id)
print('Upload complete.')

Related

Creating a tar stream in memory from multiple file byte streams

I'm trying to create a tar stream in memory add files to it and then save it to S3. But there is some issue and the files inside the ta have zero size. Can any one please advise? Code snippet below-
def tar_and_upload(bucket, keys, dest_bucket):
s3 = boto3.client('s3')
file_obj = io.BytesIO()
tar_file_obj = tarfile.open(mode = "w:gz", fileobj=file_obj)
response = {}
for key in keys:
obj = s3.get_object(Bucket=bucket, Key=key)
_bytes = obj["Body"].read()
_file_name = key.split("/")[-1]
tar_file_obj.addfile(tarfile.TarInfo(_file_name), _bytes)
tar_file_obj.close()
try:
obj_name = "{}.tar.gz".format(str(uuid.uuid4()))
s3.put_object(Body=file_obj.getvalue(), Bucket=dest_bucket, Key=obj_name)
except Exception as e:
logging.error("Can't save tar to S3", exc_info=True)
return
def tar_and_upload(bucket, keys, dest_bucket):
s3 = boto3.client('s3')
file_obj = io.BytesIO()
tar_file_obj = tarfile.open(mode = "w:gz", fileobj=file_obj)
response = {}
for key in keys:
obj = s3.get_object(Bucket=bucket, Key=key)
_bytes = obj["Body"].read()
_file_name = key.split("/")[-1]
info = tarfile.TarInfo(_file_name)
info.size = obj["ContentLength"]
info.mtime = s3.head_object(Bucket=bucket, Key=key)['LastModified'].timestamp()
tar_file_obj.addfile(info, io.BytesIO(_bytes))
tar_file_obj.close()
try:
obj_name = "{}.tar.gz".format(str(uuid.uuid4()))
s3.put_object(Body=file_obj.getvalue(), Bucket=dest_bucket, Key=obj_name)
except Exception as e:
logging.error("Can't save tar to S3", exc_info=True)
return
For others, looking to do the same for s3 object
Okay apparently when adding byte streams to a tar, we need to explicitly specify the size.
Sample code-
import tarfile
import uuid
import io
import os
def tar_and_upload():
file_obj = io.BytesIO()
tar_file_obj = tarfile.open(mode = "w:gz", fileobj=file_obj)
for filename in os.listdir("images"):
print(filename)
file_path = os.path.join("images", filename)
#tar_file_obj.add(file_path)
with open(file_path, "rb") as f:
_bytes = f.read()
tar_info = tarfile.TarInfo(filename)
tar_info.size = len(_bytes)
tar_file_obj.addfile(tar_info, io.BytesIO(_bytes))
tar_file_obj.close()
try:
obj_name = "{}.tar.gz".format(str(uuid.uuid4()))
object_path = os.path.join("temp", obj_name)
with open(object_path, "wb") as f:
f.write(file_obj.getvalue())
print(obj_name)
except Exception as e:
print(str(e))
if __name__ == "__main__":
tar_and_upload()

How to skip unhashable (corrupt) files while md5 fingerprinting?

The code below makes an md5/metadata fingerprint, but crashes on files with unknown corruption (e.g., files, that can be copied, mostly even opened, but that can not be hashed or zipped up [to disguise their corruption]).
Question: How one makes this code to skip or ignore any and all problem files and just do the rest? Imagine 1 million files on 8 TB. Otherwise I leave it running and having no real-time monitoring of progress, 2 days later I find out that nothing got hashed because a couple problem files made the code hung.
Part of the code (see full code below):
def createBasicInfoListFromDisk():
global diskCompareListDetails, onlyFileNameOnDisk, driveLetter,walk_dir
walk_dir = os.path.abspath(walk_dir)
for root, subdirs, files in os.walk(walk_dir, topdown=True, onerror=None, followlinks=True ):
for filename in files:
file_path = os.path.join(root, filename)
temp = file_path.split(":")
driveLetter = temp[0]
filePathWithoutDriveLetter = temp[1]
fileSize = os.path.getsize(file_path)
mod_on = get_last_write_time(file_path)
print('\t- file %s (full path: %s)' % (filename, file_path))
print('FileName : {filename} is of size {size} and was modified on{mdt}'.format(filename=file_path,size=fileSize,mdt=mod_on ))
diskCompareListDetails.append("\"" + filePathWithoutDriveLetter+"\",\""+str(fileSize) + "\",\"" + mod_on +'"')
onlyFileNameOnDisk.append("\""+filePathWithoutDriveLetter+"\"")
return
Error:
FileName : T:\problemtest\problemfile.doc is of size 27136 and was modified on2010-10-10 13:58:32
Traceback (most recent call last):
File "t:\scripts\test.py", line 196, in <module>
createBasicInfoListFromDisk()
File "t:\scripts\test.py", line 76, in createBasicInfoListFromDisk
mod_on = get_last_write_time(file_path)
File "t:\scripts\test.py", line 61, in get_last_write_time
convert_time_to_human_readable = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(st.st_mtime))
OSError: [Errno 22] Invalid argument
Full code:
import os
import sys
import time
import datetime
import difflib
import decimal
import hashlib
from pip._vendor.distlib.compat import raw_input
csvListDetails = list()
csvCompareListDetails = list()
diskCompareListDetails = list()
onlyFileNameOnDisk = list()
addedFiles = list()
removedFiles = list()
driveLetter =""
finalFilesToChange=list()
finalFilesToDelete=list()
changedFiles=list()
csvfilewithPath="md5.csv"
import shutil
walk_dir=""
def findAndReadCSVFile(fileName):
global csvListDetails
global csvCompareListDetails
haveIgnoredLine = 0
foundFile=0
try :
inputFileHandler = open(fileName,"rt",encoding='utf-8')
update_time = get_last_write_time(fileName)
print("\n Found md5.csv, last updated on: %s" % update_time)
foundFile=1
except (OSError, IOError, FileNotFoundError):
print("\n md5.csv not found. Will create a new one.")
return foundFile
for line in inputFileHandler:
if (haveIgnoredLine==0):
haveIgnoredLine=1
continue
rowItem = line.replace("\n","").split('","')
csvCompareListDetails.append('"' + rowItem[3]+',"'+rowItem[2]+'","' +rowItem[1]+'"')
lineDetails = list()
for detailNum in range (0,len(rowItem)):
lineDetails.append('"' + (rowItem[detailNum].replace('"','')) + '"')
csvListDetails.append(lineDetails)
inputFileHandler.close()
return foundFile
def get_last_write_time(filename):
st = os.stat(filename)
convert_time_to_human_readable = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(st.st_mtime))
return convert_time_to_human_readable
def createBasicInfoListFromDisk():
global diskCompareListDetails, onlyFileNameOnDisk, driveLetter,walk_dir
walk_dir = os.path.abspath(walk_dir)
for root, subdirs, files in os.walk(walk_dir, topdown=True, onerror=None, followlinks=True ):
for filename in files:
file_path = os.path.join(root, filename)
temp = file_path.split(":")
driveLetter = temp[0]
filePathWithoutDriveLetter = temp[1]
fileSize = os.path.getsize(file_path)
mod_on = get_last_write_time(file_path)
print('\t- file %s (full path: %s)' % (filename, file_path))
print('FileName : {filename} is of size {size} and was modified on{mdt}'.format(filename=file_path,size=fileSize,mdt=mod_on ))
diskCompareListDetails.append("\"" + filePathWithoutDriveLetter+"\",\""+str(fileSize) + "\",\"" + mod_on +'"')
onlyFileNameOnDisk.append("\""+filePathWithoutDriveLetter+"\"")
return
def compareLogAndDiskLists():
global addedFiles, removedFiles
diff = difflib.unified_diff(csvCompareListDetails, diskCompareListDetails, fromfile='file1', tofile='file2', lineterm='', n=0)
lines = list(diff)[2:]
addedFiles = [line[1:] for line in lines if line[0] == '+']
removedFiles = [line[1:] for line in lines if line[0] == '-']
return
def displayInfoForUserInput():
global finalFilesToChange, finalFilesToDelete
changedOrNewFileCount = 0
noLongerExistingFilesCount = 0
totalSizeOfChange = 0
for line in addedFiles:
if line not in removedFiles:
changedOrNewFileCount = changedOrNewFileCount +1
elements = line.replace("\n","").split('","')
sizeOfFile= int(elements[1].replace('"',''))
totalSizeOfChange = totalSizeOfChange + sizeOfFile
finalFilesToChange.append(elements[0] +'"')
for line in removedFiles:
elements = line.split('","')
if elements[0]+'"' not in onlyFileNameOnDisk:
noLongerExistingFilesCount = noLongerExistingFilesCount + 1
finalFilesToDelete.append(elements[0]+'"')
GBModSz= decimal.Decimal(totalSizeOfChange) / decimal.Decimal('1073741824')
print("\n New or modified files on drive: {} (need to hash)".format(changedOrNewFileCount))
print (" Obsolete lines in md5.csv (files modified or not on drive): {} (lines to delete)".format(noLongerExistingFilesCount))
print (" {} files ({:.2f} GB) needs to be hashed.".format(changedOrNewFileCount,GBModSz))
userInput = raw_input("\n Proceed with hash? (Y/N, Yes/No) ")
if (userInput.strip().upper() == "Y" or userInput.strip().upper() == "YES"):
print("Continuing Processing...")
else:
print("You opted not to continue, Exiting")
sys.exit()
return
def processFiles(foundFile):
if (foundFile==1):
oldFileName = walk_dir+"/md5.csv"
shutil.copy( oldFileName, getTargetFileName(oldFileName))
BLOCKSIZE = 1048576*4
global changedFiles
for fileToHash in finalFilesToChange:
hasher = hashlib.new('md5')
fileToUse=driveLetter+":"+fileToHash.replace('"','')
with open(fileToUse, 'rb') as afile:
buf = afile.read(BLOCKSIZE)
while len(buf) > 0:
hasher.update(buf)
buf = afile.read(BLOCKSIZE)
fileDetails = list()
fileDetails.append(hasher.hexdigest())
fileDetails.append(get_last_write_time(fileToUse))
fileDetails.append(os.path.getsize(fileToUse))
fileDetails.append(fileToHash)
changedFiles.append(fileDetails)
return
def getTargetFileName(oldFileName):
targetFileName= walk_dir+"/generated_on_" + get_last_write_time(oldFileName).replace(" ","_").replace("-","").replace(":","")
targetFileName = targetFileName + "__archived_on_" + datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
targetFileName = targetFileName + "__md5.csv"
return targetFileName
def writeCSVFile(fileName):
try :
outputFileHandler=open(fileName,"wt",encoding='utf-8')
outputFileHandler.write("\"md5Hash\",\"LastWriteTime\",\"Length\",\"FullName\"\n")
for details in csvListDetails:
if details[3] in finalFilesToDelete:
continue
if details[3] in finalFilesToChange:
continue
outputFileHandler.write("{},{},{},{}\n".format(details[0],details[1],details[2],details[3]))
for details in changedFiles:
outputFileHandler.write("\"{}\",\"{}\",\"{}\",{}\n".format(details[0],details[1],details[2],details[3]))
outputFileHandler.close()
except (OSError, IOError, FileNotFoundError) as e:
print("ERROR :")
print("File {} is either not writable or some other error: {}".format(fileName,e))
return
if __name__ == '__main__':
walk_dir = raw_input("\n Enter drive or directory to scan: ")
csvfilewithPath=walk_dir+"/md5.csv"
print("\n Drive to scan: " + walk_dir)
foundFile = 0
foundFile=findAndReadCSVFile(csvfilewithPath)
createBasicInfoListFromDisk()
compareLogAndDiskLists()
displayInfoForUserInput()
processFiles(foundFile)
writeCSVFile(csvfilewithPath)
Trying this fix, no luck:
def get_last_write_time(filename):
try:
st = os.stat(filename)
convert_time_to_human_readable = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(st.st_mtime))
return convert_time_to_human_readable
except OSError:
pass
return "ERROR"
def createBasicInfoListFromDisk():
I agree with IMCoins and I'm very qurius on why except isn't catching the error.
So first thing I would do is to go to the source where the OSError is being raised and try to catch it explicity.
def get_last_write_time(filename):
try:
st = os.stat(filename)
convert_time_to_human_readable = time.strftime("%Y-%m-%d %H:%M:%S",
time.localtime(st.st_mtime)
return convert_time_to_human_readable
except OSError:
pass
return "ERROR" #or whatever string you want add
Updated answer, for updated post.
As stated earlier, except statement with exception type specified catches everything. So, in order to do what want... I'm afraid possible answer are either :
To make a method that identifies corrupted files, and handles it properly.
Make try, except statement that encapsulate every part of your code where there could be an error.
Let me warn you about the second solution though, as sometimes, there are system errors that you do not want to avoid. I believe you should print the exception that you catch, in order to identify further problems you may encounter.
Just so you know, as you may not : your error is not in a try, except statement. Your error is in (if I copied and pasted properly in my editor) line 196, createBasicinfoListFromDisk(), then line 76, mod_on = get_last_write_time(file_path)
As you also mentioned you are using python 3.x, I suggest you are looking into the suppress function (https://docs.python.org/3/library/contextlib.html#contextlib.suppress).
I hope it helped you.

Empty chunks when spliting a large file

I am trying to split a large files into 50Mb chunks and save them in another files. After running some read/write operations, some of my chunks were smaller than 50Mb (43Mb,17Mb and so on). Although, I wrote the same code in Java and It has the same problem. What is wrong? my codes are following bellow:
By the way, What we can do to speed up this code to split into chunks faster?
try:
f = open(self.__filename, 'rb')
except (OSError, IOError), e:
raise FileSplitterException, str(e)
bname = (os.path.split(self.__filename))[1]
fsize = os.path.getsize(self.__filename)
self.__chunksize = int(float(fsize)/float(self.__numchunks))
chunksz = self.__chunksize
total_bytes = 0
for x in range(self.__numchunks):
chunkfilename = bname + '-' + str(x+1) + self.__postfix
if x == self.__numchunks - 1:
chunksz = fsize - total_bytes
try:
print 'Writing file',chunkfilename
data = f.read(chunksz)
total_bytes += len(data)
chunkf = file(chunkfilename, 'wb')
chunkf.write(data)
chunkf.close()
except (OSError, IOError), e:
print e
continue
except EOFError, e:
print e
break
The code in the question seems to be focussed on producing a set number of chunks rather than files of 50MB in size.
This code produces 50MB files.
import os
try:
f = open('big.txt', 'rb')
except (OSError, IOError), e:
raise FileSplitterException, str(e)
bname = (os.path.split('big.txt'))[1]
chunksz = 50 * 1000 * 1000 # metric MB - use 1024 * 1024 for binary MB (MiB)
counter = 0
while True:
chunkfilename = bname + '-' + str(counter+1) + '.foo'
try:
print 'Writing file',chunkfilename
data = f.read(chunksz)
if not data:
# We have reached the end of the file, end the script.
break
chunkf = file(chunkfilename, 'wb')
chunkf.write(data)
chunkf.close()
except (OSError, IOError), e:
print e
continue
except EOFError, e:
print e
break
counter += 1
Some aspects of the code are considered poor style in modern python - for example not using a context manager to open files - but I haven't changed these in case the OP is on an old python like 2.5.
Your question is unclear because you haven't included a Minimal, Complete, and Verifiable example—so I don't know exactly what's wrong with your code. However after creating / simulating my guess as to the missing parts, I was able to come up with something that does exactly what you want, I think.
import os
class FileSplitterException(Exception): pass
class FileSplitter(object):
def __init__(self, filename, chunksize):
if not os.path.isfile(filename):
raise FileSplitterException(
"File: {!r} does not exist".format(filename))
self._filename = filename
self._postfix = 'chunk'
self._chunksize = chunksize
def split(self):
bname = os.path.splitext(self._filename)[0]
fsize = os.path.getsize(self._filename)
chunks, partial = divmod(fsize, self._chunksize)
if partial:
chunks += 1
with open(self._filename, 'rb') as infile:
for i in range(chunks):
chunk_filename = os.path.join('{}-{}.{}'.format(
bname, i, self._postfix))
with open(chunk_filename, 'wb') as outfile:
data = infile.read(self._chunksize)
if data:
outfile.write(data)
else:
FileSplitterException('unexpected EOF encountered')
if __name__ == '__main__':
import glob
filename = 'big_file.txt'
chunksize = 1 * 1024 * 1024 # 1 Mb
print('splitting {} into {:,} sized chunks'.format(filename, chunksize))
fs = FileSplitter(filename, chunksize)
fs.split()
print('chunk files written:')
bname = os.path.splitext(filename)[0]
for chunkname in sorted(glob.glob(bname + '-*.' + fs._postfix)):
fsize = os.path.getsize(chunkname)
print(' {}: size: {:,}'.format(chunkname, fsize))

Python Filling Up Disk

I need to setup some test conditions to simulate a filled up disk. I created the following to simply write garbage to the disk:
#!/usr/bin/python
import os
import sys
import mmap
def freespace(p):
"""
Returns the number of free bytes on the drive that ``p`` is on
"""
s = os.statvfs(p)
return s.f_bsize * s.f_bavail
if __name__ == '__main__':
drive_path = sys.argv[1]
output_path = sys.argv[2]
output_file = open(output_path, 'w')
while freespace(drive_path) > 0:
output_file.write("!")
print freespace(drive_path)
output_file.flush()
output_file.close()
As far as I can tell by looking at the return value from freespace, the write method does not write the file to until it is closed, thereby making the while condition invalid.
Is there a way I can write the data directly to the file? Or another solution perhaps?
This is untested but I imagine something along these lines will be the quickest way to fill the disk easily
import sys
import errno
write_str = "!"*1024*1024*5 # 5MB
output_path = sys.argv[1]
with open(output_path, "w") as f:
while True:
try:
f.write(write_str)
f.flush()
except IOError as err:
if err.errno == errno.ENOSPC:
write_str_len = len(write_str)
if write_str_len > 1:
write_str = write_str[:write_str_len/2]
else:
break
else:
raise
You could try/catch a disk full exception on write.

How to download large file with binary mode in python?

I am code a download function in python. The file size >1GB. The server is linux, HTTP server is Karrigell. Client is browse, Firefox or IE. I meet a big trouble.
At first, I use sys.stdout() to send file content.
file = open(path, 'rb')
size = os.path.getsize(path)
RESPONSE['Pragma'] = 'public'
RESPONSE['Expires'] = '0'
RESPONSE['Cache-Control'] = 'must-revalidate, pre-check=0'
RESPONSE['Content-Disposition'] = 'attachment; filename="' + os.path.basename(path) + '"'
RESPONSE['Content-type'] = "application/octet-stream"
RESPONSE['Content-Transfer-Encoding'] = 'binary'
RESPONSE['Content-length'] = str(os.path.getsize(path))
sys.stdout.flush()
chunk_size = 10000
handle = open(path, "rb")
while True:
buffer = handle.read(chunk_size)
if buffer:
STDOUT(buffer)
else:
break
sys.stdout.flush()
The problem is the server out of memory! I know, stdout write content to memory first, then memory send to socket.
So, I modify the function. Send content to socket directly. I use the py-sendfile module. http://code.google.com/p/py-sendfile/
file = open(path, 'rb')
size = os.path.getsize(path)
sock = REQUEST_HANDLER.sock
sock.sendall("""HTTP/1.1 200 OK\r\nPragma: no-cache\r\nExpires: 0\r\nCache-Control: no-cache, no-store\r\nContent-Disposition: attachment; filename="%s"\r\nContent-Type: application/octet-stream\r\nContent-Length: %u\r\nContent-Range: bytes 0-4096/%u\r\nLocation: "%s"\r\n\r\n""" % (os.path.basename(path), size, size, os.path.basename(path)))
offset = 0
nbytes = 4096
while 1:
try:
sent = sendfile.sendfile(sock.fileno(), file.fileno(), offset, nbytes)
except OSError, err:
if err.errno in (errno.EAGAIN, errno.EBUSY): # retry
continue
raise
else:
if sent == 0:
break # done
offset += sent
This time, the server memory is OK, but browse die! The browse memory rise quickly! Not free
until the socket accept whole file content.
I don't know how to deal with these problems. I think the second idea is right, send content to socket directly. But why browse can't free memory while accept data?
You should try to download the file in chunks. This is an example that works for me using urllib2
import os
import urllib2
import math
def downloadChunks(url):
"""Helper to download large files
the only arg is a url
this file will go to a temp directory
the file will also be downloaded
in chunks and print out how much remains
"""
baseFile = os.path.basename(url)
#move the file to a more uniq path
os.umask(0002)
temp_path = "/tmp/"
try:
file = os.path.join(temp_path,baseFile)
req = urllib2.urlopen(url)
total_size = int(req.info().getheader('Content-Length').strip())
downloaded = 0
CHUNK = 256 * 10240
with open(file, 'wb') as fp:
while True:
chunk = req.read(CHUNK)
downloaded += len(chunk)
print math.floor( (downloaded / total_size) * 100 )
if not chunk: break
fp.write(chunk)
except urllib2.HTTPError, e:
print "HTTP Error:",e.code , url
return False
except urllib2.URLError, e:
print "URL Error:",e.reason , url
return False
return file

Categories

Resources