How to reconnect in requests to continue a download

How to reconnect in requests to continue a download - python

I have been building a download manager app with tkinter and requests, and I noticed that when the user downloads multiple files at the same time, the downloads sometimes cannot keep up and all of them end early without raising any error. I also tried urllib3 and the standard urllib; the only difference was that urllib raised an error, but the download still failed. I want my program to do the following when a download ends:
Firstly check if the file size is less than it is supposed to be
If it is then get the size of that file and make a range header like so: {"Range": f"bytes={current_size}-{file_size}"}
Store the rest of the file in a temp file. After it is downloaded, get the data from both of the files and write it to one (merge the files together)
I used a while loop and a temp-file counter, but the problem is that when requests can't keep up and execution reaches the while loop, it creates millions of temp files, each 197 bytes in size, and the download is never completed. I also tried using a plain if statement instead of the loop, hoping that would fix it; it no longer created millions of files, but it still didn't work. Finally, I wrote a separate mock program that simply fetched the rest of the file and merged it into the half-downloaded file, and that worked, but for some reason the same approach fails inside my program. Keep in mind that I don't want to create a thread for each temp file, because the temp file can easily be written in the same thread that is downloading the file. How can I do this? My code (be aware that this function runs in a separate thread):
currently_downloading = np.array([], dtype='S')
current_temp = 0
def download_files():
global files_downloading, times_clicked, currently_downloading, packed, last_temp, current_temp
try:
abort = False
win = None
available_num = 0
downloaded = 0
url = str(url_entry.get())
try:
headers = requests.head(url, headers={'accept-encoding': ''}).headers
except ValueError:
raise InvalidURL()
try:
file_size = float(headers['Content-Length'])
except TypeError:
raise NotDownloadable()
name = ""
formatname = ""
if num.get() == 1:
name = url.split("/")[-1].split(".")[0]
else:
if name_entry.get().strip() != "":
for char in str(name_entry.get()):
if char in banned_chars:
print("Usage of banned characters")
raise BannedCharsUsage()
else:
name = str(name_entry.get())
else:
raise EmptyName()
if var.get() == 1:
formatname = '.' + headers['Content-Type'].split('/')[1]
else:
if str(format_entry.get())[0] == '.' and len(format_entry.get()) >= 3:
formatname = str(format_entry.get())
else:
raise InvalidFormat()
fullname = str(name) + formatname
path = (str(output_entry.get()) + "/").replace(r" \ ".strip(), "/")
if chum.get() == 1:
conn = sqlite3.connect("DEF_PATH.db")
c = conn.cursor()
c.execute("SELECT * FROM DIRECTORY_LIST WHERE SELECTED_DEF = 1")
crnt_default_path = np.array(c.fetchone())
path = str(crnt_default_path[0] + "/").replace(r" \ ".strip(), "/")
conn.commit()
conn.close()
else:
pass
all_files_dir = np.array([], dtype='S')
for file in os.listdir(path):
all_files_dir = np.append(all_files_dir, path + file)
all_files_dir = np.concatenate((all_files_dir, currently_downloading))
while path + fullname in all_files_dir:
for element in currently_downloading:
if element not in all_files_dir:
all_files_dir = np.append(all_files_dir, element)
available_num += 1
if num.get() == 1:
name = url.split("/")[-1].split(".")[0] + f" ({available_num})"
else:
name = str(name_entry.get()) + f" ({available_num})"
fullname = name + formatname
if path + fullname not in all_files_dir:
currently_downloading = np.append(currently_downloading, path + fullname)
available_num = 0
break
else:
currently_downloading = np.append(currently_downloading, path + fullname)
def cancel_dl():
nonlocal abort
abort = True
start_time = time.time()
try:
r = requests.get(url, allow_redirects=False, stream=True)
start = last_print = time.time()
with open(path + fullname, 'wb') as fp:
for chunk in r.iter_content(chunk_size=4096):
if abort:
raise AbortException()
downloaded += fp.write(chunk)
if downloaded > 1000000:
lbl_crnt_size.config(text=f"Downloaded: {round(downloaded / 1000000, 2)} MB")
else:
lbl_crnt_size.config(text=f"Downloaded: {round(downloaded / 1000, 2)} KB")
pct_done = int(downloaded / file_size * 100)
lbl_percent.config(text=f"{round(pct_done, 2)} %")
download_prg["value"] = pct_done
now = time.time()
if now - last_print >= 1:
speed_sec = round(downloaded / (now - start))
if speed_sec > 1000000:
lbl_speed.config(text=f"{round(speed_sec / 1000000, 3)} MB/s")
else:
lbl_speed.config(text=f"{round(speed_sec / 1000, 3)} KB/s")
last_print = time.time()
while os.stat(path + fullname).st_size < file_size:
current_temp += 1
rng = {"Range": f"bytes={os.stat(path + fullname).st_size}-{file_size}"}
r = requests.get(url, allow_redirects=False, stream=True, headers=rng)
start = last_print = time.time()
with open(f"temp/Temp-{current_temp}{formatname}", 'wb') as fp:
for chunk in r.iter_content(chunk_size=4096):
if abort:
raise AbortException()
downloaded += fp.write(chunk)
if downloaded > 1000000:
lbl_crnt_size.config(text=f"Downloaded: {round(downloaded / 1000000, 2)} MB")
else:
lbl_crnt_size.config(text=f"Downloaded: {round(downloaded / 1000, 2)} KB")
pct_done = int(downloaded / file_size * 100)
lbl_percent.config(text=f"{round(pct_done, 2)} %")
download_prg["value"] = pct_done
now = time.time()
if now - last_print >= 1:
speed_sec = round(downloaded / (now - start))
if speed_sec > 1000000:
lbl_speed.config(text=f"{round(speed_sec / 1000000, 3)} MB/s")
else:
lbl_speed.config(text=f"{round(speed_sec / 1000, 3)} KB/s")
last_print = time.time()
with open(f"temp/Temp-{current_temp}{formatname}", 'rb') as fp:
temp_binary = fp.read()
with open(path + fullname, 'rb') as fp:
main_binary = fp.read()
with open(path + fullname, 'wb') as fp:
fp.write(main_binary + temp_binary)
except AbortException:
if os.path.exists(path + fullname):
os.remove(path + fullname)

There is no built-in function for this, so you will have to do it manually.
First thing you need to do is keep record of how many chunks/buffers you have written to file.
Before download function declare some variable, say x=0. (To count how much data is written to file)
then inside the download function check if x == 0.
If true then download normally,
Else : resume download using range header
Read Following examples for range header :- source
If the web server supports the range request then you can add the Range header to your request:
Range: bytes=StartPos-StopPos
You will receive the part between StartPos and StopPos. If dont know the StopPos just use:
Range: bytes=StartPos-
So your code would be:
def resume_download(fileurl, resume_byte_position, *, verify=True, timeout=None):
    """Request the rest of *fileurl*, starting at a byte offset.

    Args:
        fileurl: URL of the file to continue downloading.
        resume_byte_position: Number of bytes already on disk; the server is
            asked for everything from this offset to the end of the file.
        verify: Passed to ``requests.get``. Defaults to ``True`` so TLS
            certificates are checked; the earlier hard-coded ``verify=False``
            silently accepted any certificate.
        timeout: Passed to ``requests.get``. ``None`` keeps the previous
            "wait forever" behaviour; pass a number (or a
            ``(connect, read)`` tuple) so a stalled connection raises
            instead of hanging.

    Returns:
        The streaming ``requests.Response``. Check that ``status_code`` is
        206 before appending: a 200 means the server ignored the Range
        header and is sending the whole file again.

    Raises:
        ValueError: If *resume_byte_position* is negative (that would build
            a malformed Range header).
    """
    if resume_byte_position < 0:
        raise ValueError("resume_byte_position must be >= 0")
    resume_header = {'Range': 'bytes=%d-' % resume_byte_position}
    return requests.get(
        fileurl,
        headers=resume_header,
        stream=True,
        verify=verify,
        timeout=timeout,
        allow_redirects=True,
    )
Another example :-
https://www.oreilly.com/library/view/python-cookbook/0596001673/ch11s06.html
Also update the variable x after writing each chunk (x = x + chunk_size)
At the end of your download code, add an "if" statement that checks whether the size of the downloaded file matches the size of the file on the server (you can get the latter from the response headers, e.g. response.headers.get('Content-Length')). If the sizes differ, call your download function again.

Related

My code is meant to find image duplicates using numpy, it worked for some but doesn't seem to work for all images even if they look exactly alike

I thought this would be a basic task and I am still kind of at a beginner level so this is what my code is at the moment. It is supposed to break the images into byte arrays and then into numpy arrays. There are so many variables because I keep getting a storage error unless I break the process down into multiple variables such as
first = FI.read()--> firsti = first -->firstim = bytearray(firsti) --> firstimgg = np.array(firstim)--->
# Compare every file under `foldera` with every file under `folderb` and move
# the `folderb` files whose raw bytes are identical into `Dest_folder`.
# NOTE(review): indentation was lost in this listing, so the nesting described
# in the comments below is inferred from the statement order - confirm against
# the real source.
def find_copies_folders(foldera,folderb):
start = time.time()
# `ar` collects the names of matching files, `arr` the decoded images.
ar = []
arr =[]
# NOTE(review): imagefolder, foldercounter and secondfilecounter are never
# used again in the code shown.
imagefolder = []
foldercounter = 0
filecounter = 0
secondfilecounter = 0
try:
# First pass: only prints each path found under folderb and the elapsed time.
for subdir, dirs, files in os.walk(folderb):
for file in files:
path = os.path.join(subdir, file)
print(path)
print(time.time()-start)
# Second pass: load each file of foldera as an array of its raw bytes.
for subdir,dirs,files in os.walk(foldera):
for fils in files:
# NOTE(review): FI is never closed in the code shown.
FI = open(os.path.join(subdir,fils),'rb')
print("------")
first = FI.read()
firsti = first
firstim = bytearray(firsti)
firstimgg = np.array(firstim)
filecounter+=1
# Inner walk: the names `dirs` and `files` are rebound here while the outer
# loops are still running.
for subdirb,dirs,files in os.walk(folderb):
for sils in files:
SI = open(os.path.join(subdirb,sils),'rb')
SA = (os.path.join(subdirb,sils))
second = SI.read()
secondi = second
secondim = bytearray(secondi)
secondimgg = np.asarray(secondim)
print(fils+": "+str(firstimgg)+" "+sils+": "+str(secondimgg))
# The arrays hold the files' raw bytes (not decoded pixels), so this only
# matches byte-for-byte identical files; two images that merely look alike
# but are encoded differently will not compare equal.
if np.array_equal(secondimgg, firstimgg) == True:
db = secondimgg
img = cv2.imdecode(db, cv2.IMREAD_COLOR)
arr.append(img)
ar.append(sils)
print(fils+" = "+ sils)
newfolder = SA
print(str(SA)+" lets see")
# SI is closed only on this branch, before the file is moved.
SI.close()
# NOTE(review): Dest_folder is not defined in this function - presumably a
# module-level name; verify.
shutil.move(SA, Dest_folder)
continue
elif np.array_equal(secondimgg, firstimgg) == False:
#print(fils+" and "+sils+ " have no similarities")
continue
# NOTE(review): array_equal returns a bool, so one of the two branches above
# is always taken and the two `elif` branches below look unreachable.
elif np.isclose(secondimgg, firstimgg,rtol=0.1) == True:
print(fils+" and "+sils+ " have slight similarities")
elif np.array_equiv(secondimgg, firstimgg) == True:
print(fils + " and " + sils + " have slight similarities2")
# NOTE(review): the handlers appear to wrap the whole scan, so the first
# exception would end the comparison instead of skipping one file - confirm.
except FileNotFoundError:
print(str(files) +" has been moved")
except PermissionError:
# NOTE(review): `folder` is not defined anywhere in this function.
print(folder+" Found it")
pass
except AttributeError:
print("Not an image")
pass
except shutil.Error:
# NOTE(review): `newfolder` is only assigned after a match has been found.
print(newfolder +" Has been moved....Moving on.")
# Summary: names of the copies found, how many there were, and how many
# files of foldera were read.
ap = len(ar)
print("Copies: "+str(ar))
#print(ar)
print(ap)
print(filecounter)

Unexpected end of data when zipping zip files in Python

Good day.
I wrote a little Python program to help me easily create .cbc files for Calibre, which is just a renamed .zip file with a text file called comics.txt for TOC purposes. Each chapter is another zip file.
The issue is that the last zip file zipped always has the error "Unexpected end of data". The file itself is not corrupt, if I unzip it and rezip it it works perfectly. Playing around it seems that the problem is that Python doesn't close the last zip file after zipping it, since I can't delete the last zip while the program is still running since it's still open in Python. Needless to say, Calibre doesn't like the file and fails to convert it unless I manually rezip the affected chapters.
The code is as follows, checking the folders for not-image files, zipping the folders, zipping the zips while creating the text file, and "changing" extension.
import re, glob, os, zipfile, shutil, pathlib, gzip, itertools

# Build a Calibre .cbc file from the chapter folders in the current directory:
# every chapter folder becomes "Chapter <n>.zip", and all chapter zips plus a
# comics.txt table of contents are packed into "<cwd name>.zip.cbc".
#
# Every ZipFile is opened in a ``with`` block. A zip archive is only complete
# once close() has written its central directory; previously the chapter
# archives were never closed explicitly, so the last one was still unfinished
# when it was copied into the final archive ("Unexpected end of data").

Folders = glob.glob("*/")
items = len(Folders)
cn_list = []
cn_list_filtered = []
dirs_filtered = []
ch_id = ["c", "Ch. "]
subdir_im = []
total = 0

# Tried in this order for each prefix in ch_id: "12.5"-style numbers first,
# then plain integers. Raw strings avoid invalid-escape warnings.
_NUMBER_PATTERNS = (r'(\d+[\S]\d+)', r'(\d+)')


def _is_image(entry):
    """Return True when *entry* contains an accepted image extension."""
    return '.png' in entry or '.jpg' in entry


def _chapter_number(folder_name):
    """Return the chapter number found in *folder_name*, or None."""
    for prefix in ch_id:
        for pattern in _NUMBER_PATTERNS:
            match = re.search(prefix + pattern, folder_name)
            if match:
                return match.group(1)
    return None


# Count the images in every immediate sub-directory.
Dirs = next(os.walk('.'))[1]
for directory in Dirs:
    for entry in os.listdir("./" + directory):
        if _is_image(entry):
            total += 1
        else:
            print(entry + " not an accepted format.")
    subdir_im.append(total)
    total = 0

# Collect the chapter numbers present in the folder names.
for fname in Folders:
    cn = _chapter_number(fname)
    if cn is None:
        print('Warning: File found without proper filename format.')
    else:
        cn_list.append(cn)
cn_list_filtered = sorted(set(cn_list))

cwd = os.getcwd()
Dirs = Folders
subdir_zi = []
total = 0

# Zip each chapter folder; the archive is closed before moving on.
for chapter in cn_list_filtered:
    for folders in Dirs:
        if (ch_id[0] + chapter + " ") in folders or (ch_id[1] + chapter + " ") in folders:
            print('Zipping folder ', folders)
            namezip = "Chapter " + chapter + ".zip"
            with zipfile.ZipFile(namezip, "a") as current_zip:
                for entry in os.listdir(folders):
                    if _is_image(entry):
                        current_zip.write(folders + "/" + entry, entry)
                        total += 1
            subdir_zi.append(total)
            total = 0

print('Folder contents in order:', subdir_im, ' Total:', sum(subdir_im))
print("Number of items per zip: ", subdir_zi, ' Total:', sum(subdir_zi))
if subdir_im == subdir_zi:
    print("All items in folders have been successfully zipped")
else:
    print("Warning: File count in folders and zips do not match. Please check the affected chapters")

namezip2 = os.path.basename(os.getcwd()) + ".zip"
# Never pack the output archive into itself if it is left over from an
# earlier, interrupted run.
zips = [name for name in glob.glob("*.zip") if name != namezip2]

Data = []
for chapter in cn_list_filtered:
    Datai = ("Chapter " + chapter + ".zip" + ":Chapter " + chapter + "\r\n")
    Data.append(Datai)
Dataok = ''.join(Data)

# One handle writes both the chapter zips and comics.txt. Two ZipFile objects
# appending to the same file at once can overwrite each other's directory.
with zipfile.ZipFile(namezip2, "a") as zipfinal:
    for name in zips:
        zipfinal.write(name, name)
    zipfinal.writestr("comics.txt", Dataok)

os.rename(namezip2, namezip2 + ".cbc")
os.system("pause")
I am by no means a programmer, that is just a Frankenstein monster code I eventually managed to put together by checking threads, but this last issue has me stumped.
Some solutions I tried are:
for i in range(0, len(zips), 1):
zipfinal.write(zips[i],zips[i])
zips[i].close()
Fails with:
zips[i].close()
AttributeError: 'str' object has no attribute 'close'
and:
for i in range(0, len(zips), 1):
zipfinal.write(zips[i],zips[i])
zips[len(zips)].close()
Fails with:
zips[len(zips)].close()
IndexError: list index out of range
Thanks for the help.

This solved my issue:
def generate_zip(file_list, file_name=None):
    """Build a ZIP archive in memory and optionally save it to disk.

    Args:
        file_list: Iterable of entries where ``entry[0]`` is the name inside
            the archive and ``entry[1]`` is its content (``str`` or
            ``bytes``).
        file_name: Path to write the finished archive to. When ``None`` the
            archive is only returned (previously this crashed in ``open``).

    Returns:
        The complete archive as ``bytes``.
    """
    zip_buffer = io.BytesIO()
    # Leaving the ``with`` block closes the archive, which writes the central
    # directory. Reading the buffer before that gives a truncated zip.
    with zipfile.ZipFile(zip_buffer, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
        for entry in file_list:
            print(f"Filename: {entry[0]}\nData: {entry[1]}")
            zf.writestr(entry[0], entry[1])
    archive = zip_buffer.getvalue()
    if file_name is not None:
        with open(file_name, 'wb') as f:
            f.write(archive)
    return archive

How to skip unhashable (corrupt) files while md5 fingerprinting?

The code below makes an md5/metadata fingerprint, but crashes on files with unknown corruption (e.g., files, that can be copied, mostly even opened, but that can not be hashed or zipped up [to disguise their corruption]).
Question: How one makes this code to skip or ignore any and all problem files and just do the rest? Imagine 1 million files on 8 TB. Otherwise I leave it running and having no real-time monitoring of progress, 2 days later I find out that nothing got hashed because a couple problem files made the code hung.
Part of the code (see full code below):
def createBasicInfoListFromDisk():
    """Walk ``walk_dir`` and record path, size and mtime of each readable file.

    Appends one ``"path","size","mtime"`` line per file to
    ``diskCompareListDetails`` and the quoted path to ``onlyFileNameOnDisk``.
    A file whose size or modification time cannot be read is reported and
    skipped, so one damaged file no longer aborts the whole scan.
    """
    global diskCompareListDetails, onlyFileNameOnDisk, driveLetter, walk_dir
    walk_dir = os.path.abspath(walk_dir)
    for root, subdirs, files in os.walk(walk_dir, topdown=True, onerror=None, followlinks=True):
        for filename in files:
            file_path = os.path.join(root, filename)
            # NOTE(review): assumes a "X:\..." style path; a path without a
            # colon would raise IndexError on temp[1].
            temp = file_path.split(":")
            driveLetter = temp[0]
            filePathWithoutDriveLetter = temp[1]
            try:
                fileSize = os.path.getsize(file_path)
                mod_on = get_last_write_time(file_path)
            except (OSError, OverflowError, ValueError) as err:
                # e.g. "OSError: [Errno 22] Invalid argument" from a bogus mtime
                print('\t- skipping unreadable file %s: %s' % (file_path, err))
                continue
            print('\t- file %s (full path: %s)' % (filename, file_path))
            print('FileName : {filename} is of size {size} and was modified on{mdt}'.format(filename=file_path,size=fileSize,mdt=mod_on ))
            diskCompareListDetails.append("\"" + filePathWithoutDriveLetter+"\",\""+str(fileSize) + "\",\"" + mod_on +'"')
            onlyFileNameOnDisk.append("\""+filePathWithoutDriveLetter+"\"")
    return
Error:
FileName : T:\problemtest\problemfile.doc is of size 27136 and was modified on2010-10-10 13:58:32
Traceback (most recent call last):
File "t:\scripts\test.py", line 196, in <module>
createBasicInfoListFromDisk()
File "t:\scripts\test.py", line 76, in createBasicInfoListFromDisk
mod_on = get_last_write_time(file_path)
File "t:\scripts\test.py", line 61, in get_last_write_time
convert_time_to_human_readable = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(st.st_mtime))
OSError: [Errno 22] Invalid argument
Full code:
import os
import sys
import time
import datetime
import difflib
import decimal
import hashlib
from pip._vendor.distlib.compat import raw_input
# Rows parsed from the existing md5.csv, and their comparison form.
csvListDetails = []
csvCompareListDetails = []
# What is currently on disk.
diskCompareListDetails = []
onlyFileNameOnDisk = []
# Differences between the CSV and the disk.
addedFiles = []
removedFiles = []
driveLetter = ""
# Work lists for hashing and for CSV clean-up.
finalFilesToChange = []
finalFilesToDelete = []
changedFiles = []
csvfilewithPath = "md5.csv"
import shutil
walk_dir = ""
def findAndReadCSVFile(fileName):
    """Load an existing md5.csv into the module-level lists.

    Fills ``csvListDetails`` with the re-quoted fields of every data row and
    ``csvCompareListDetails`` with a ``"path","size","mtime"`` line per row.

    Args:
        fileName: Path of the CSV file to read.

    Returns:
        1 when the file was found and read, 0 when it could not be opened
        or its modification time could not be read.
    """
    global csvListDetails
    global csvCompareListDetails
    try:
        inputFileHandler = open(fileName, "rt", encoding='utf-8')
    except OSError:
        print("\n md5.csv not found. Will create a new one.")
        return 0
    # The with-block closes the handle on every exit path, errors included.
    with inputFileHandler:
        try:
            update_time = get_last_write_time(fileName)
        except OSError:
            print("\n md5.csv not found. Will create a new one.")
            return 0
        print("\n Found md5.csv, last updated on: %s" % update_time)
        next(inputFileHandler, None)  # skip the header row
        for line in inputFileHandler:
            rowItem = line.replace("\n", "").split('","')
            if len(rowItem) < 4:
                # blank or truncated row: nothing usable to compare against
                continue
            csvCompareListDetails.append('"' + rowItem[3]+',"'+rowItem[2]+'","' +rowItem[1]+'"')
            lineDetails = ['"' + field.replace('"', '') + '"' for field in rowItem]
            csvListDetails.append(lineDetails)
    return 1
def get_last_write_time(filename):
    """Return the file's modification time as a local ``YYYY-MM-DD HH:MM:SS`` string."""
    modified_at = time.localtime(os.stat(filename).st_mtime)
    return time.strftime("%Y-%m-%d %H:%M:%S", modified_at)
def createBasicInfoListFromDisk():
    """Walk ``walk_dir`` and record path, size and mtime of each readable file.

    Appends one ``"path","size","mtime"`` line per file to
    ``diskCompareListDetails`` and the quoted path to ``onlyFileNameOnDisk``.
    A file whose size or modification time cannot be read is reported and
    skipped, so one damaged file no longer aborts the whole scan.
    """
    global diskCompareListDetails, onlyFileNameOnDisk, driveLetter, walk_dir
    walk_dir = os.path.abspath(walk_dir)
    for root, subdirs, files in os.walk(walk_dir, topdown=True, onerror=None, followlinks=True):
        for filename in files:
            file_path = os.path.join(root, filename)
            # NOTE(review): assumes a "X:\..." style path; a path without a
            # colon would raise IndexError on temp[1].
            temp = file_path.split(":")
            driveLetter = temp[0]
            filePathWithoutDriveLetter = temp[1]
            try:
                fileSize = os.path.getsize(file_path)
                mod_on = get_last_write_time(file_path)
            except (OSError, OverflowError, ValueError) as err:
                # e.g. "OSError: [Errno 22] Invalid argument" from a bogus mtime
                print('\t- skipping unreadable file %s: %s' % (file_path, err))
                continue
            print('\t- file %s (full path: %s)' % (filename, file_path))
            print('FileName : {filename} is of size {size} and was modified on{mdt}'.format(filename=file_path,size=fileSize,mdt=mod_on ))
            diskCompareListDetails.append("\"" + filePathWithoutDriveLetter+"\",\""+str(fileSize) + "\",\"" + mod_on +'"')
            onlyFileNameOnDisk.append("\""+filePathWithoutDriveLetter+"\"")
    return
def compareLogAndDiskLists():
    """Diff the CSV lines against the on-disk lines.

    Rebinds ``addedFiles`` to the lines that only appear on disk and
    ``removedFiles`` to the lines that only appear in the CSV.
    """
    global addedFiles, removedFiles
    delta = list(difflib.unified_diff(
        csvCompareListDetails,
        diskCompareListDetails,
        fromfile='file1',
        tofile='file2',
        lineterm='',
        n=0,
    ))
    only_on_disk = []
    only_in_csv = []
    # The first two entries are the '---' / '+++' file headers.
    for entry in delta[2:]:
        marker = entry[0]
        if marker == '+':
            only_on_disk.append(entry[1:])
        elif marker == '-':
            only_in_csv.append(entry[1:])
    addedFiles = only_on_disk
    removedFiles = only_in_csv
def displayInfoForUserInput():
    """Summarise the pending work and ask the user whether to continue.

    Fills ``finalFilesToChange`` (files to hash) and ``finalFilesToDelete``
    (CSV rows whose file is no longer on disk), prints the totals, and exits
    the program unless the user answers Y / Yes.
    """
    global finalFilesToChange, finalFilesToDelete
    to_hash_count = 0
    obsolete_count = 0
    bytes_to_hash = 0

    for entry in addedFiles:
        if entry in removedFiles:
            continue
        to_hash_count += 1
        fields = entry.replace("\n", "").split('","')
        bytes_to_hash += int(fields[1].replace('"', ''))
        finalFilesToChange.append(fields[0] + '"')

    for entry in removedFiles:
        quoted_path = entry.split('","')[0] + '"'
        if quoted_path in onlyFileNameOnDisk:
            continue
        obsolete_count += 1
        finalFilesToDelete.append(quoted_path)

    GBModSz = decimal.Decimal(bytes_to_hash) / decimal.Decimal('1073741824')
    print("\n New or modified files on drive: {} (need to hash)".format(to_hash_count))
    print (" Obsolete lines in md5.csv (files modified or not on drive): {} (lines to delete)".format(obsolete_count))
    print (" {} files ({:.2f} GB) needs to be hashed.".format(to_hash_count,GBModSz))

    answer = raw_input("\n Proceed with hash? (Y/N, Yes/No) ").strip().upper()
    if answer not in ("Y", "YES"):
        print("You opted not to continue, Exiting")
        sys.exit()
    print("Continuing Processing...")
def processFiles(foundFile):
    """Hash every file in ``finalFilesToChange`` and record the results.

    Args:
        foundFile: 1 when an md5.csv already existed; it is then archived
            under a timestamped name before anything is rewritten.

    For each file an ``[md5, last write time, size, quoted path]`` entry is
    appended to ``changedFiles``. A file that cannot be opened, read or
    stat'ed is reported and skipped, so the remaining files are still
    hashed. Its old CSV row is still dropped by ``writeCSVFile`` because the
    path stays in ``finalFilesToChange``.
    """
    global changedFiles
    if (foundFile==1):
        oldFileName = walk_dir+"/md5.csv"
        shutil.copy( oldFileName, getTargetFileName(oldFileName))
    BLOCKSIZE = 1048576*4
    for fileToHash in finalFilesToChange:
        hasher = hashlib.new('md5')
        fileToUse = driveLetter+":"+fileToHash.replace('"','')
        try:
            with open(fileToUse, 'rb') as afile:
                # read in 4 MiB blocks until read() returns b''
                for buf in iter(lambda: afile.read(BLOCKSIZE), b''):
                    hasher.update(buf)
            lastWrite = get_last_write_time(fileToUse)
            size = os.path.getsize(fileToUse)
        except (OSError, OverflowError, ValueError) as err:
            print("Skipping unreadable file {}: {}".format(fileToUse, err))
            continue
        changedFiles.append([hasher.hexdigest(), lastWrite, size, fileToHash])
    return
def getTargetFileName(oldFileName):
    """Build the archive name for an existing md5.csv inside ``walk_dir``.

    The name carries the file's last write time and the current time:
    ``generated_on_<mtime>__archived_on_<now>__md5.csv``.
    """
    generated_stamp = get_last_write_time(oldFileName)
    for old, new in ((" ", "_"), ("-", ""), (":", "")):
        generated_stamp = generated_stamp.replace(old, new)
    archived_stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    return (walk_dir + "/generated_on_" + generated_stamp
            + "__archived_on_" + archived_stamp
            + "__md5.csv")
def writeCSVFile(fileName):
    """Write the merged hash list to *fileName*.

    Rows from the old CSV are kept unless their path is listed in
    ``finalFilesToDelete`` or ``finalFilesToChange``; the freshly hashed
    entries from ``changedFiles`` are appended after them. I/O errors are
    reported on stdout instead of being raised.
    """
    try:
        # with-block: the handle is closed even when a write fails midway
        with open(fileName, "wt", encoding='utf-8') as outputFileHandler:
            outputFileHandler.write("\"md5Hash\",\"LastWriteTime\",\"Length\",\"FullName\"\n")
            for details in csvListDetails:
                if details[3] in finalFilesToDelete or details[3] in finalFilesToChange:
                    continue
                outputFileHandler.write("{},{},{},{}\n".format(details[0],details[1],details[2],details[3]))
            for details in changedFiles:
                outputFileHandler.write("\"{}\",\"{}\",\"{}\",{}\n".format(details[0],details[1],details[2],details[3]))
    except OSError as e:
        print("ERROR :")
        print("File {} is either not writable or some other error: {}".format(fileName,e))
    return
# Script entry point. The assignments below deliberately happen at module
# level: walk_dir is the global that the functions above read.
if __name__ == '__main__':
walk_dir = raw_input("\n Enter drive or directory to scan: ")
csvfilewithPath=walk_dir+"/md5.csv"
print("\n Drive to scan: " + walk_dir)
foundFile = 0
# 1. load the previous md5.csv, if there is one
foundFile=findAndReadCSVFile(csvfilewithPath)
# 2. collect size / mtime of every file on disk
createBasicInfoListFromDisk()
# 3. diff the CSV against the disk and ask the user to confirm
compareLogAndDiskLists()
displayInfoForUserInput()
# 4. hash new / changed files and rewrite the CSV
processFiles(foundFile)
writeCSVFile(csvfilewithPath)
Trying this fix, no luck:
def get_last_write_time(filename):
    """Return the file's modification time as ``YYYY-MM-DD HH:MM:SS``.

    Returns the string ``"ERROR"`` when the file cannot be stat'ed or its
    timestamp cannot be converted (for example a damaged file whose mtime
    makes ``time.localtime`` fail with "Invalid argument"). The reason is
    printed so problem files can be identified afterwards.
    """
    try:
        st = os.stat(filename)
        return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(st.st_mtime))
    except (OSError, OverflowError, ValueError) as err:
        # localtime() may raise OverflowError / OSError for out-of-range stamps
        print("Could not read modification time of {}: {}".format(filename, err))
        return "ERROR"
def createBasicInfoListFromDisk():

I agree with IMCoins, and I'm very curious why the except clause isn't catching the error.
So the first thing I would do is go to the place where the OSError is raised and try to catch it explicitly there.
def get_last_write_time(filename):
    """Return the file's modification time as ``YYYY-MM-DD HH:MM:SS``.

    Returns the string ``"ERROR"`` when the file cannot be stat'ed or its
    timestamp cannot be converted; the reason is printed so the problem file
    can be identified.
    """
    try:
        st = os.stat(filename)
        convert_time_to_human_readable = time.strftime(
            "%Y-%m-%d %H:%M:%S",
            time.localtime(st.st_mtime),
        )
        return convert_time_to_human_readable
    except (OSError, OverflowError, ValueError) as err:
        # localtime() may raise OverflowError / OSError for out-of-range stamps
        print("Could not read modification time of {}: {}".format(filename, err))
        return "ERROR"  # or whatever string you want to add

Updated answer, for updated post.
As stated earlier, an except statement with the exception type specified catches every exception of that type. So, in order to do what you want, I'm afraid the possible answers are either:
To make a method that identifies corrupted files, and handles it properly.
Make try, except statement that encapsulate every part of your code where there could be an error.
Let me warn you about the second solution though, as sometimes, there are system errors that you do not want to avoid. I believe you should print the exception that you catch, in order to identify further problems you may encounter.
Just so you know, as you may not : your error is not in a try, except statement. Your error is in (if I copied and pasted properly in my editor) line 196, createBasicinfoListFromDisk(), then line 76, mod_on = get_last_write_time(file_path)
As you also mentioned you are using python 3.x, I suggest you are looking into the suppress function (https://docs.python.org/3/library/contextlib.html#contextlib.suppress).
I hope it helped you.

Empty chunks when spliting a large file

I am trying to split a large file into 50 MB chunks and save them as separate files. After running some read/write operations, some of my chunks were smaller than 50 MB (43 MB, 17 MB and so on). I also wrote the same code in Java and it has the same problem. What is wrong? My code is below:
By the way, what can we do to make this code split the file into chunks faster?
try:
f = open(self.__filename, 'rb')
except (OSError, IOError), e:
raise FileSplitterException, str(e)
bname = (os.path.split(self.__filename))[1]
fsize = os.path.getsize(self.__filename)
self.__chunksize = int(float(fsize)/float(self.__numchunks))
chunksz = self.__chunksize
total_bytes = 0
for x in range(self.__numchunks):
chunkfilename = bname + '-' + str(x+1) + self.__postfix
if x == self.__numchunks - 1:
chunksz = fsize - total_bytes
try:
print 'Writing file',chunkfilename
data = f.read(chunksz)
total_bytes += len(data)
chunkf = file(chunkfilename, 'wb')
chunkf.write(data)
chunkf.close()
except (OSError, IOError), e:
print e
continue
except EOFError, e:
print e
break

The code in the question seems to be focussed on producing a set number of chunks rather than files of 50MB in size.
This code produces 50MB files.
# Split 'big.txt' into numbered files of at most `chunksz` bytes each
# ('big.txt-1.foo', 'big.txt-2.foo', ...).
# This snippet uses Python 2 syntax (`except X, e`, print statement, file()).
# NOTE(review): indentation was lost in this listing; the nesting described in
# the comments below is inferred.
import os
try:
f = open('big.txt', 'rb')
except (OSError, IOError), e:
# NOTE(review): FileSplitterException is not defined in this snippet.
raise FileSplitterException, str(e)
bname = (os.path.split('big.txt'))[1]
chunksz = 50 * 1000 * 1000 # metric MB - use 1024 * 1024 for binary MB (MiB)
counter = 0
while True:
chunkfilename = bname + '-' + str(counter+1) + '.foo'
try:
# The message is printed before the read, so it also appears once for the
# final, empty read that ends the loop.
print 'Writing file',chunkfilename
data = f.read(chunksz)
if not data:
# We have reached the end of the file, end the script.
break
chunkf = file(chunkfilename, 'wb')
chunkf.write(data)
chunkf.close()
except (OSError, IOError), e:
print e
# NOTE(review): `continue` presumably skips the `counter += 1` below, so the
# next chunk would reuse the same file name - confirm.
continue
except EOFError, e:
print e
break
counter += 1
# NOTE(review): the input handle `f` is never closed in the code shown.
Some aspects of the code are considered poor style in modern python - for example not using a context manager to open files - but I haven't changed these in case the OP is on an old python like 2.5.

Your question is unclear because you haven't included a Minimal, Complete, and Verifiable example—so I don't know exactly what's wrong with your code. However after creating / simulating my guess as to the missing parts, I was able to come up with something that does exactly what you want, I think.
import os
class FileSplitterException(Exception):
    """Raised when the source file is missing or cannot be split as requested."""


class FileSplitter(object):
    """Split a file into consecutive chunk files of a fixed maximum size."""

    def __init__(self, filename, chunksize):
        """Remember what to split.

        Args:
            filename: Path of the existing file to split.
            chunksize: Maximum size of each chunk file in bytes (> 0).

        Raises:
            FileSplitterException: If *filename* is not a file or
                *chunksize* is not positive.
        """
        if not os.path.isfile(filename):
            raise FileSplitterException(
                "File: {!r} does not exist".format(filename))
        if chunksize <= 0:
            raise FileSplitterException(
                "chunksize must be positive, got {!r}".format(chunksize))
        self._filename = filename
        self._postfix = 'chunk'
        self._chunksize = chunksize

    def split(self):
        """Write the chunks as ``<name>-<index>.chunk`` next to the source.

        Returns:
            The list of chunk file names, in order.

        Raises:
            FileSplitterException: If the source ends before the expected
                number of chunks has been read (e.g. it was truncated while
                being split).
        """
        bname = os.path.splitext(self._filename)[0]
        fsize = os.path.getsize(self._filename)
        chunks, partial = divmod(fsize, self._chunksize)
        if partial:
            chunks += 1
        written = []
        with open(self._filename, 'rb') as infile:
            for i in range(chunks):
                # Read first, so no empty chunk file is left behind on EOF.
                data = infile.read(self._chunksize)
                if not data:
                    raise FileSplitterException('unexpected EOF encountered')
                chunk_filename = '{}-{}.{}'.format(bname, i, self._postfix)
                with open(chunk_filename, 'wb') as outfile:
                    outfile.write(data)
                written.append(chunk_filename)
        return written
if __name__ == '__main__':
    # Demo: split big_file.txt and list the chunk files that were produced.
    import glob

    filename = 'big_file.txt'
    chunksize = 1 * 1024 * 1024  # 1 Mb
    print('splitting {} into {:,} sized chunks'.format(filename, chunksize))

    fs = FileSplitter(filename, chunksize)
    fs.split()

    print('chunk files written:')
    bname = os.path.splitext(filename)[0]
    chunk_pattern = bname + '-*.' + fs._postfix
    chunk_names = glob.glob(chunk_pattern)
    chunk_names.sort()
    for chunkname in chunk_names:
        fsize = os.path.getsize(chunkname)
        print(' {}: size: {:,}'.format(chunkname, fsize))

Can’t download YouTube video

I’m having trouble retrieving the YouTube video automatically. Here’s the code. The problem is the last part. download = urllib.request.urlopen(download_url).read()
# YouTube video download script
# 10n1z3d[at]w[dot]cn
#
# Flow: read the video URL (argv or prompt), extract the video id, fetch the
# download token, read the title from the watch page, then save the video as
# "<title>.mp4" in the current directory.
# NOTE(review): relies on the get_video_info / get_video endpoints exactly as
# in the original script - verify they are still served.
import html
import urllib.request
import sys

print("\n--------------------------")
print (" YouTube Video Downloader")
print ("--------------------------\n")

try:
    video_url = sys.argv[1]
except IndexError:
    video_url = input('[+] Enter video URL: ')

print("[+] Connecting...")
try:
    video_id = video_url.split('www.youtube.com/watch?v=')[1]
except IndexError:
    print("[-] Invalid URL.")
    sys.exit(1)
# Drop a trailing "&feature=..." parameter from the id.
for suffix in ('&feature=related', '&feature=dir', '&feature=fvst', '&feature=channel_page'):
    if video_url.endswith(suffix):
        video_id = video_id.split(suffix)[0]
        break

print("[+] Parsing token...")
try:
    url = str(urllib.request.urlopen('http://www.youtube.com/get_video_info?&video_id=' + video_id).read())
    token_value = url.split('video_id=' + video_id + '&token=')[1].split('&thumbnail_url')[0]
    download_url = "http://www.youtube.com/get_video?video_id=" + video_id + "&t=" + token_value + "&fmt=18"
except (OSError, IndexError) as err:
    # The old handler issued a second request with a scheme-less URL, which
    # itself raised; report the failure and stop instead.
    print("[-] Could not obtain a download token: " + str(err))
    sys.exit(1)

v_url = str(urllib.request.urlopen('http://' + video_url).read())
video_title = v_url.split('"rv.2.title": "')[1].split('", "rv.4.rating"')[0]
# Decode HTML entities such as &quot; and &amp; in the title.
video_title = html.unescape(video_title)

print("[+] Downloading " + '"' + video_title + '"...')
try:
    print(download_url)
    download = urllib.request.urlopen(download_url).read()
    # read() returns bytes: write them in one call. Iterating over bytes
    # yields ints, which file.write() rejects.
    with open(video_title + '.mp4', 'wb') as file:
        file.write(download)
except (OSError, ValueError) as err:
    # urllib's HTTPError / URLError are OSError subclasses (e.g. 403 Forbidden)
    print("[-] Error downloading. Quitting. (" + str(err) + ")")
    sys.exit(1)

print("\n[+] Done. The video is saved to the current working directory(cwd).\n")
There’s an error message (thanks Wooble):
Traceback (most recent call last):
File "C:/Python31/MyLib/DrawingBoard/youtube_download-.py", line 52, in <module>
download = urllib.request.urlopen(download_url).read()
File "C:\Python31\lib\urllib\request.py", line 119, in urlopen
return _opener.open(url, data, timeout)
File "C:\Python31\lib\urllib\request.py", line 353, in open
response = meth(req, response)
File "C:\Python31\lib\urllib\request.py", line 465, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python31\lib\urllib\request.py", line 385, in error
result = self._call_chain(*args)
File "C:\Python31\lib\urllib\request.py", line 325, in _call_chain
result = func(*args)
File "C:\Python31\lib\urllib\request.py", line 560, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "C:\Python31\lib\urllib\request.py", line 353, in open
response = meth(req, response)
File "C:\Python31\lib\urllib\request.py", line 465, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python31\lib\urllib\request.py", line 391, in error
return self._call_chain(*args)
File "C:\Python31\lib\urllib\request.py", line 325, in _call_chain
result = func(*args)
File "C:\Python31\lib\urllib\request.py", line 473, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden

The code on the original question relies on several assumptions about the content of YouTube pages and URLs (expressed in constructs such as "url.split('something=')[1]") which may not always be true. I tested it and it might depend even on which related videos show on the page. You might have tripped on any of those specificities.
Here's a cleaner version, which uses urllib to parse URLs and query strings, and which successfully downloads a video. I've removed some of the try/except which didn't do much but exit, for clarity. Incidentally, it deals with Unicode video titles by removing non-ASCII characters from the filename to which the video is saved. It also takes any numbers of YouTube URLs and downloads them all. Finally, it masks its user-agent as Chrome for Mac (which is what I currently use).
#!/usr/bin/env python3
import sys
import urllib.request
from urllib.request import urlopen, FancyURLopener
from urllib.parse import urlparse, parse_qs, unquote
class UndercoverURLopener(FancyURLopener):
    """Legacy URL opener that identifies itself as Chrome on Mac OS X."""

    version = "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.9 Safari/533.2"


# Kept for any legacy URLopener-based code path that still consults it.
urllib.request._urlopener = UndercoverURLopener()

# urlopen() goes through the OpenerDirector registered with install_opener(),
# not through _urlopener, so the browser user-agent has to be set there for
# the requests made by this script to actually carry it.
_undercover_opener = urllib.request.build_opener()
_undercover_opener.addheaders = [("User-agent", UndercoverURLopener.version)]
urllib.request.install_opener(_undercover_opener)
def youtube_download(video_url):
    """Download one YouTube video into the current directory.

    The video id is taken from the ``v`` query parameter of *video_url*; the
    access token and title come from the ``get_video_info`` response. The
    stream requested with ``fmt=18`` is saved as ``<title>.mp4``, falling back
    to ``<video id>.mp4`` when no usable title is available.

    A failure while fetching or writing the video itself is reported on
    stdout and skipped. A missing ``v`` parameter or ``token`` field raises
    ``KeyError``.
    """
    video_id = parse_qs(urlparse(video_url).query)['v'][0]
    url_data = urlopen('http://www.youtube.com/get_video_info?&video_id=' + video_id).read()
    url_info = parse_qs(unquote(url_data.decode('utf-8')))
    token_value = url_info['token'][0]
    download_url = "http://www.youtube.com/get_video?video_id={0}&t={1}&fmt=18".format(
        video_id, token_value)
    video_title = url_info['title'][0] if 'title' in url_info else ''
    # Unicode filenames are more trouble than they're worth
    safe_title = video_title.encode('ascii', 'ignore').decode('ascii').replace("/", "-")
    # A missing or entirely non-ASCII title would otherwise give the bare
    # (hidden, ambiguous) file name ".mp4".
    filename = (safe_title or video_id) + '.mp4'
    print("\t Downloading '{}' to '{}'...".format(video_title, filename))
    try:
        download = urlopen(download_url).read()
        # The context manager closes the file even if the write fails.
        with open(filename, 'wb') as f:
            f.write(download)
    except Exception as e:
        print("\t Download failed! {}".format(str(e)))
        print("\t Skipping...")
    else:
        print("\t Done.")
def main():
    """Download every video whose URL is given on the command line.

    When no URL is passed as an argument, prompt for a space-separated list
    of URLs instead.
    """
    print("\n--------------------------")
    print(" YouTube Video Downloader")
    print("--------------------------\n")
    # Slicing sys.argv never raises; it is simply empty when no arguments
    # were given, so test for emptiness rather than catching an exception.
    video_urls = sys.argv[1:]
    if not video_urls:
        # Split the typed line; iterating the raw string would hand single
        # characters to youtube_download().
        video_urls = input('Enter (space-separated) video URLs: ').split()
    for u in video_urls:
        youtube_download(u)
    print("\n Done.")


if __name__ == '__main__':
    main()

I'm going to shamelessly plug my script which automates checking for valid formats, automatically choosing the best quality format for a video, and works on both the Flash and HTML5 variants of YouTube pages (as well as Vimeo).
If you wrote that script then please look at my source code for inspiration and feel free to steal some code. I challenge you to please write something better. Open source thrives on competition!
However, if you copied that script and are just trying to get it working, may I suggest you give my script a try and see if it fares better for you. You can access it both from the command line as a script or even as a module in another Python file.

You may also look at youtube-dl, which is written in Python, to see how it is implemented.

It looks like the YouTube developers have changed the algorithm for accessing video files. Instead of "token" they now use a "signature" variable, and "signature" seems to depend on either cookie-stored data or the IP address of the client (in the case of a cookie-less client such as urllib in Python 2). Here's a hack I've come up with (the URLs are locked to the IP address):
#!/usr/bin/python
import re
from urlparse import *
from urllib import *
def yt_url(video_url):
    """Return a ``(title, download_url)`` tuple for a YouTube watch URL.

    The video id is read from the ``v`` query parameter, the
    ``get_video_info`` response is parsed as a query string, and the
    download URL is assembled from the base URL embedded in its ``id`` field
    plus a fixed set of parameters copied from the response.
    """
    query = urlparse(video_url).query
    video_id = parse_qs(query)['v'][0]
    info_page = urlopen("http://www.youtube.com/get_video_info?video_id="+video_id).read()
    get_vars = parse_qs(unquote(info_page))

    # Base URL: second "|"-separated piece of the second ","-separated piece.
    url = get_vars["id"][0].split(",")[1].split("|")[1]

    elements = {}
    for name in ("itag", "sver", "expire", "signature", "factor", "id",
                 "key", "burst", "sparams", "algorithm"):
        value = get_vars[name][0]
        if name == "id":
            # Only the part before the first comma is passed on.
            value = value.split(",")[0]
        elements[name] = value
    elements["ipbits"] = "8"

    url += "".join("&" + name + "=" + elements[name] for name in elements)
    return (get_vars["title"][0], url)
if __name__ == '__main__':
    # Demo run: resolve one hard-coded video and show what was found.
    title, url = yt_url("http://www.youtube.com/watch?v=4tAr7tuakt0")
    for label, value in (("Title", title), ("Video", url)):
        print("%s: %s" % (label, value))

#!/usr/bin/env python
import urllib2, urllib
import re
import os
import sys
import time
# Ask for the watch-page URL and keep a local copy of the page; the title
# lookup further down re-reads index.html from disk.
linkurl = raw_input('Enter URL:')
linkurl1 = urllib.urlopen(linkurl).read()
with open("index.html", "w") as file1:
    file1.write(linkurl1)

fname = 'index.html'
## Markers of the line that carries the player configuration
find = ("yt.playerConfig =", '"title":')
lowered_markers = [word.lower() for word in find]
## Remember the last line of the page containing any marker (case-insensitive)
with open(fname) as infile:
    for line in infile:
        lline = line.lower()
        if any(marker in lline for marker in lowered_markers):
            y = line.rstrip()
fileurl = y

## Undo the URL escaping in that line. The substitutions run in this exact
## order, each one on the result of the previous one.
y7 = y
for escaped, plain in (
        ("%3A%2F%2F", "://"),
        ("%2F", "/"),
        ("%3F", "?"),
        ("%3D", "="),
        ("%26", "&"),
        ("%252", "%2"),
        ("sig", "&signature")):
    y7 = y7.replace(escaped, plain)
# Show the table of known itag codes and the format each one stands for
for banner_line in (
        "",
        "Video resolution: ",
        "[46=1080(.webm)]--[37=1080(.mp4)]--[35=480(.flv)]--[36=180(.3gpp)]",
        "[45=720(.webm) ]--[22=720(.mp4) ]--[34=360(.flv)]--[17=144(.3gpp)]",
        "[44=480(.webm) ]--[18=360(.mp4) ]--[5=240(.flv) ]",
        "[43=360(.webm) ]",
        ""):
    print(banner_line)
# Every itag number mentioned in the player-configuration line
itag = re.findall(r'itag=(\d+)', y)
# repr() of both pieces: the label is shown with its quotes, then the list
print(repr("itag list= ") + repr(itag))
resol = raw_input("Type itag number: ")
# Extract the video title from the saved page; it becomes the stem of the
# output file name.
# NOTE(review): the indentation of this snippet was lost, so it is unclear
# whether the two split() lines run once after the loop or on every matching
# line — confirm before re-indenting.
fname = 'index.html'
# Substrings that mark a candidate line (compared case-insensitively below)
find = (' <title>', '</title>')
with open(fname) as infile:
for line_no, line in enumerate(infile, 1):
lline = line.lower()
if any(word.lower() in lline for word in find):
# Keep the matching line without trailing whitespace
y = line.rstrip()
# Second-to-last piece of the line when split on '>'
fileurl1 = y.split(">")[-2]
# Second-to-last piece of that when split on '"'
filename2 = fileurl1.split('"')[-2]
# Formats 46, 37, 45, 22 and 44 delimit the stream URL by the "type" field
# that follows it; the remaining formats stop at the next backslash.
_URL_UNTIL_TYPE = r'\\u0026url=(.*?)\\u0026type'
_URL_UNTIL_BACKSLASH = r'\\u0026url=(.*?)\\'

# itag code -> (quality label in the player config, URL pattern, extension)
_STREAM_FORMATS = {
    '46': ('hd1080', _URL_UNTIL_TYPE, ".webm"),       # WebM, 1080 HD
    '37': ('hd1080', _URL_UNTIL_TYPE, ".mp4"),        # MP4, 1080 HD
    '45': ('hd720', _URL_UNTIL_TYPE, ".webm"),        # WebM, 720 HD
    '22': ('hd720', _URL_UNTIL_TYPE, ".mp4"),         # MP4, 720 HD
    '44': ('large', _URL_UNTIL_TYPE, ".webm"),        # WebM, 480
    '35': ('large', _URL_UNTIL_BACKSLASH, ".flv"),    # FLV, 480
    '43': ('medium', _URL_UNTIL_BACKSLASH, ".webm"),  # WebM, 360
    '34': ('medium', _URL_UNTIL_BACKSLASH, ".flv"),   # FLV, 360
    '18': ('medium', _URL_UNTIL_BACKSLASH, ".mp4"),   # MP4, 360
    '5': ('small', _URL_UNTIL_BACKSLASH, ".flv"),     # FLV, 240
    '36': ('small', _URL_UNTIL_BACKSLASH, ".3gpp"),   # 3GPP, 180
    '17': ('small', _URL_UNTIL_BACKSLASH, ".3gpp"),   # 3GPP, 144
}


def _stream_url(config, itag_code, quality, url_pattern):
    """Build the download URL of one format from the unescaped config line.

    The text between ``itag=<code>`` and the matching quality label is
    searched (via its ``repr``) for the stream URL and the signature, which
    are then joined into the final URL. Raises ``IndexError`` when the
    expected pieces are not present.
    """
    section = repr(re.findall(
        r'itag=%s(.*?)\u0026quality=%s' % (itag_code, quality), config))
    url_head = repr(re.findall(url_pattern, section)).split("\\")[0]
    signature = repr(re.findall(r'signature=(.*?)\\', section))
    return (url_head.split("'")[1] + "&signature=" + signature.split("'")[1]
            + "&ptk=machinima")


if resol not in _STREAM_FORMATS:
    # Without this check an unknown itag would only surface later as a
    # NameError on `url`.
    raise SystemExit("Unsupported itag: %r" % (resol,))
_quality, _url_pattern, ext = _STREAM_FORMATS[resol]
url = _stream_url(y7, resol, _quality, _url_pattern)
#newindex = open("index1.txt", 'w')
#newindex.write(y7)
print url
filename = filename2 + ext
print filename
req = urllib2.Request(url, headers={'Range': "bytes=0-838860800"})
data = urllib2.urlopen(req)
print "connected to ""http://"+url.split("/")[2] + "/"
f = open(filename,'wb')
meta_data = data.info()
file_size = int(meta_data.getheaders("Content-Length")[0])
print "filesize= " + `file_size/1048576` + " MB"
bytes_received = 0
chunk_size = 10240
while True:
start_time = time.time()
buffer = data.read(chunk_size)
if not buffer:
break
bytes_received += len(buffer)
f.write(buffer)
Td = time.time() - start_time
speed1 = round(len(buffer)/1024.0, 1)
speed = round(speed1/Td, 1)
speed_MB = round(speed/1024.0, 1)
speed_GB = round(speed_MB/1024.0, 1)
bytes_received_MB = round(bytes_received/1048576.0, 3)
percent = bytes_received * 100. / file_size
if speed < 1:
speed_byte = round(len(buffer)/Td, 1)
Tr = (file_size-bytes_received)/(60*speed_byte)
status = r"[Downloaded=%.3f MB] [%3.2f%%] [speed= %.1f B/s] [eta %1d min] " % (bytes_received_MB, percent, speed_byte, Tr)
elif speed < 1024:
Tr = (file_size-bytes_received)/(60*1024*speed)
status = r"[Downloaded=%.3f MB] [%3.2f%%] [speed= %.1f KB/s] [eta %1d min] " % (bytes_received_MB, percent, speed, Tr)
elif speed < 1048576:
Tr = (file_size-bytes_received)/(60*1024*1024*speed_MB)
status = r"[Downloaded=%.3f MB] [%3.2f%%] [speed= %.1f MB/s] [eta %1d min] " % (bytes_received_MB, percent, speed_MB, Tr)
else:
Tr = (file_size-bytes_received)/(60*1024*1024*1024*speed_GB)
status = r"[Downloaded=%.3f MB] [%3.2f%%] [speed= %.1f GB/s] [eta %1d min] " % (bytes_received_MB, percent, speed_GB, Tr)
status = status + chr(8) * (len(status) + 1)
print status,

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

How to reconnect in requests to continue a download - python

Related

My code is meant to find image duplicates using numpy, it worked for some but doesn't seem to work for all images even if they look exactly alike

Unexpected end of data when zipping zip files in Python

How to skip unhashable (corrupt) files while md5 fingerprinting?

Empty chunks when spliting a large file

Can’t download YouTube video

Categories

Resources