I’m having trouble retrieving the YouTube video automatically. Here’s the code. The problem is the last part. download = urllib.request.urlopen(download_url).read()
# YouTube video download script
# 10n1z3d[at]w[dot]cn
import urllib.request
import sys
print("\n--------------------------")
print (" YouTube Video Downloader")
print ("--------------------------\n")
try:
video_url = sys.argv[1]
except:
video_url = input('[+] Enter video URL: ')
print("[+] Connecting...")
try:
if(video_url.endswith('&feature=related')):
video_id = video_url.split('www.youtube.com/watch?v=')[1].split('&feature=related')[0]
elif(video_url.endswith('&feature=dir')):
video_id = video_url.split('www.youtube.com/watch?v=')[1].split('&feature=dir')[0]
elif(video_url.endswith('&feature=fvst')):
video_id = video_url.split('www.youtube.com/watch?v=')[1].split('&feature=fvst')[0]
elif(video_url.endswith('&feature=channel_page')):
video_id = video_url.split('www.youtube.com/watch?v=')[1].split('&feature=channel_page')[0]
else:
video_id = video_url.split('www.youtube.com/watch?v=')[1]
except:
print("[-] Invalid URL.")
exit(1)
print("[+] Parsing token...")
try:
url = str(urllib.request.urlopen('http://www.youtube.com/get_video_info?&video_id=' + video_id).read())
token_value = url.split('video_id=' + video_id + '&token=')[1].split('&thumbnail_url')[0]
download_url = "http://www.youtube.com/get_video?video_id=" + video_id + "&t=" + token_value + "&fmt=18"
except:
url = str(urllib.request.urlopen('www.youtube.com/watch?v=' + video_id))
exit(1)
v_url = str(urllib.request.urlopen('http://' + video_url).read())
video_title = v_url.split('"rv.2.title": "')[1].split('", "rv.4.rating"')[0]
if '"' in video_title:
video_title = video_title.replace('"', '"')
elif '&' in video_title:
video_title = video_title.replace('&', '&')
print("[+] Downloading " + '"' + video_title + '"...')
try:
print(download_url)
file = open(video_title + '.mp4', 'wb')
download = urllib.request.urlopen(download_url).read()
print(download)
for line in download:
file.write(line)
file.close()
except:
print("[-] Error downloading. Quitting.")
exit(1)
print("\n[+] Done. The video is saved to the current working directory(cwd).\n")
There’s an error message (thanks Wooble):
Traceback (most recent call last):
File "C:/Python31/MyLib/DrawingBoard/youtube_download-.py", line 52, in <module>
download = urllib.request.urlopen(download_url).read()
File "C:\Python31\lib\urllib\request.py", line 119, in urlopen
return _opener.open(url, data, timeout)
File "C:\Python31\lib\urllib\request.py", line 353, in open
response = meth(req, response)
File "C:\Python31\lib\urllib\request.py", line 465, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python31\lib\urllib\request.py", line 385, in error
result = self._call_chain(*args)
File "C:\Python31\lib\urllib\request.py", line 325, in _call_chain
result = func(*args)
File "C:\Python31\lib\urllib\request.py", line 560, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "C:\Python31\lib\urllib\request.py", line 353, in open
response = meth(req, response)
File "C:\Python31\lib\urllib\request.py", line 465, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python31\lib\urllib\request.py", line 391, in error
return self._call_chain(*args)
File "C:\Python31\lib\urllib\request.py", line 325, in _call_chain
result = func(*args)
File "C:\Python31\lib\urllib\request.py", line 473, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden
The code on the original question relies on several assumptions about the content of YouTube pages and URLs (expressed in constructs such as "url.split('something=')[1]") which may not always be true. I tested it and it might depend even on which related videos show on the page. You might have tripped on any of those specificities.
Here's a cleaner version, which uses urllib to parse URLs and query strings, and which successfully downloads a video. I've removed some of the try/except which didn't do much but exit, for clarity. Incidentally, it deals with Unicode video titles by removing non-ASCII characters from the filename to which the video is saved. It also takes any numbers of YouTube URLs and downloads them all. Finally, it masks its user-agent as Chrome for Mac (which is what I currently use).
#!/usr/bin/env python3
import sys
import urllib.request
from urllib.request import urlopen, FancyURLopener
from urllib.parse import urlparse, parse_qs, unquote
# Overriding FancyURLopener.version changes the User-Agent string urllib
# sends, here disguised as Chrome on Mac OS X.
# NOTE(review): FancyURLopener is deprecated, and _urlopener is a private
# hook used by the legacy urllib API -- confirm this actually affects the
# urlopen() calls below on the target Python version.
class UndercoverURLopener(FancyURLopener):
version = "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.9 Safari/533.2"
# Install the disguised opener as the module-level default.
urllib.request._urlopener = UndercoverURLopener()
def youtube_download(video_url):
    """Fetch one YouTube video (fmt=18 / MP4) into the current directory.

    Resolves the download token via the get_video_info endpoint, derives an
    ASCII-safe filename from the video title, and saves the stream to disk.
    Failures are reported and skipped rather than raised.
    """
    video_id = parse_qs(urlparse(video_url).query)['v'][0]

    info_raw = urlopen('http://www.youtube.com/get_video_info?&video_id=' + video_id).read()
    info = parse_qs(unquote(info_raw.decode('utf-8')))
    token_value = info['token'][0]
    download_url = "http://www.youtube.com/get_video?video_id={0}&t={1}&fmt=18".format(
        video_id, token_value)

    video_title = info.get('title', [''])[0]
    # Unicode filenames are more trouble than they're worth: keep ASCII only,
    # and keep '/' out of the name so it stays a single path component.
    ascii_title = video_title.encode('ascii', 'ignore').decode('ascii')
    filename = ascii_title.replace("/", "-") + '.mp4'

    print("\t Downloading '{}' to '{}'...".format(video_title, filename))
    try:
        payload = urlopen(download_url).read()
        with open(filename, 'wb') as out:
            out.write(payload)
    except Exception as e:
        print("\t Download failed! {}".format(str(e)))
        print("\t Skipping...")
    else:
        print("\t Done.")
def main():
    """Entry point: download every YouTube URL passed on the command line,
    prompting for space-separated URLs when none were given."""
    print("\n--------------------------")
    print(" YouTube Video Downloader")
    print("--------------------------\n")
    # sys.argv[1:] never raises, so the original try/except fallback was dead
    # code -- and had it run, iterating the single string returned by input()
    # would have walked it character by character. Split the prompted line
    # into individual URLs instead.
    video_urls = sys.argv[1:]
    if not video_urls:
        video_urls = input('Enter (space-separated) video URLs: ').split()
    for u in video_urls:
        youtube_download(u)
    print("\n Done.")

if __name__ == '__main__':
    main()
I'm going to shamelessly plug my script which automates checking for valid formats, automatically choosing the best quality format for a video, and works on both the Flash and HTML5 variants of YouTube pages (as well as Vimeo).
If you wrote that script then please look at my source code for inspiration and feel free to steal some code. I challenge you to please write something better. Open source thrives on competition!
However, if you copied that script and are just trying to get it working, may I suggest you give my script a try and see if it fares better for you. You can access it both from the command line as a script or even as a module in another Python file.
You may also check youtube-dl which is written in Python and check how it's written.
It looks like YouTube guys have changed algorithms for accessing video files. Instead of "token" they now use "signature" variable, and "signature" seems to be dependent on either cookie-stored data or IP address of the client (in case of cookies-disabled browser like urllib in Python 2). Here's a hack I've come up with (URLs are IP address-locked):
#!/usr/bin/python
import re
from urlparse import *
from urllib import *
def yt_url(video_url):
"""Resolve a YouTube watch URL to (title, direct media URL).

Python 2 code. Rebuilds the signed stream URL from the get_video_info
response; per the surrounding text the result is locked to the caller's
IP address.
"""
# The "v" query parameter is the video id.
video_id = parse_qs(urlparse(video_url).query)['v'][0]
# Fetch and url-decode the get_video_info blob, then parse it like a
# query string into a dict of lists.
get_vars = parse_qs(unquote(urlopen("http://www.youtube.com/get_video_info?video_id="+video_id).read()))
# NOTE(review): relies on the 2012-era layout of the "id" field, which
# appears to smuggle the media host after a comma and a pipe -- verify
# against a live response before trusting this.
url = get_vars["id"][0].split(",")[1].split("|")[1]
elements = dict()
elements["itag"] = get_vars["itag"][0]
elements["sver"] = get_vars["sver"][0]
elements["expire"] = get_vars["expire"][0]
elements["signature"] = get_vars["signature"][0]
elements["factor"] = get_vars["factor"][0]
elements["id"] = get_vars["id"][0].split(",")[0]
elements["key"] = get_vars["key"][0]
elements["burst"] = get_vars["burst"][0]
elements["sparams"] = get_vars["sparams"][0]
elements["algorithm"] = get_vars["algorithm"][0]
elements["ipbits"] = "8"
# Append every collected parameter to the media URL. Dict iteration order
# is arbitrary in Python 2, so the parameter order varies run to run;
# presumably the server accepts any order.
for get_var in elements:
url += "&" + get_var + "=" + elements[get_var]
return (get_vars["title"][0], url)
if __name__ == '__main__':
# Demo: resolve one sample video and print its title and direct URL
# (Python 2 print statements).
(title, url) = yt_url("http://www.youtube.com/watch?v=4tAr7tuakt0")
print "Title: %s" % (title,)
print "Video: %s" % (url,)
#!/usr/bin/env python
import urllib2, urllib
import re
import os
import sys
import time
# NOTE(review): Python 2 script (raw_input, print statements, backtick `x`
# repr syntax, urllib/urllib2). The paste lost its indentation; the code is
# left byte-identical and only comments are added.
# Fetch the watch page the user names and cache it locally as index.html.
linkurl =raw_input('Enter URL:')
linkurl1 = urllib.urlopen(linkurl).read()
file1 = open("index.html", "w")
file1.write(linkurl1)
file1.close()
fname = 'index.html'
## Giving new matrix value to find
find = ("yt.playerConfig =", '"title":')
## File reading programme
# Scan the cached page for the player-config line, then undo the percent
# encoding so the stream URLs and signatures inside it become readable.
with open(fname) as infile:
for line_no, line in enumerate(infile, 1):
lline = line.lower()
if any(word.lower() in lline for word in find):
y = line.rstrip()
fileurl = y
y1 = y.replace("%3A%2F%2F", "://")
y2 = y1.replace("%2F", "/")
y3 = y2.replace("%3F", "?")
y4 = y3.replace("%3D", "=")
y5 = y4.replace("%26", "&")
y6 = y5.replace("%252", "%2")
# NOTE(review): this rewrites EVERY occurrence of the substring "sig", not
# just the signature parameter -- it happens to work on these pages.
y7 = y6.replace("sig", "&signature")
# Display video resolution information
print ""
print "Video resolution: "
print "[46=1080(.webm)]--[37=1080(.mp4)]--[35=480(.flv)]--[36=180(.3gpp)]"
print "[45=720(.webm) ]--[22=720(.mp4) ]--[34=360(.flv)]--[17=144(.3gpp)]"
print "[44=480(.webm) ]--[18=360(.mp4) ]--[5=240(.flv) ]"
print "[43=360(.webm) ]"
print ""
# Programme to get all itag list file
itag = re.findall('itag=(\d+)', y)
print `"itag list= "` + `itag`
resol = raw_input("Type itag number: ")
# Programme to get filename file
# Pull the page <title> to use later as the output filename.
fname = 'index.html'
find = (' <title>', '</title>')
with open(fname) as infile:
for line_no, line in enumerate(infile, 1):
lline = line.lower()
if any(word.lower() in lline for word in find):
y = line.rstrip()
fileurl1 = y.split(">")[-2]
filename2 = fileurl1.split('"')[-2]
# Each branch below extracts the stream URL and signature for the chosen
# itag from the decoded player config; only the itag/quality markers, the
# regex terminator, and the file extension differ between branches.
if resol == '46':
# Programme to get WebM file in 1080 HD
y1080_webm = re.findall(r'itag=46(.*?)\u0026quality=hd1080', y7)
url_1080_webm1 = re.findall(r'\\u0026url=(.*?)\\u0026type', `y1080_webm`)
signature = re.findall(r'signature=(.*?)\\', `y1080_webm`)
url_1080_webm2 = `url_1080_webm1`.split("\\")[0]
url_1080_webm = url_1080_webm2.split("'")[1] + "&signature=" + `signature`.split("'")[1] + "&ptk=machinima"
url = url_1080_webm
#print url_1080_webm
ext = ".webm"
elif resol == '37':
# Programme to get MP4 file in 1080 HD
y1080_mp4 = re.findall(r'itag=37(.*?)\u0026quality=hd1080', y7)
url_1080_mp41 = re.findall(r'\\u0026url=(.*?)\\u0026type', `y1080_mp4`)
signature = re.findall(r'signature=(.*?)\\', `y1080_mp4`)
url_1080_mp42 = `url_1080_mp41`.split("\\")[0]
url_1080_mp4 = url_1080_mp42.split("'")[1] + "&signature=" + `signature`.split("'")[1] + "&ptk=machinima"
url = url_1080_mp4
#print url_1080_mp4
ext = ".mp4"
elif resol == '45':
# Programme to get WebM file in 720 HD
y720_webm = re.findall(r'itag=45(.*?)\u0026quality=hd720', y7)
url_720_webm1 = re.findall(r'\\u0026url=(.*?)\\u0026type', `y720_webm`)
signature = re.findall(r'signature=(.*?)\\', `y720_webm`)
url_720_webm2 = `url_720_webm1`.split("\\")[0]
url_720_webm = url_720_webm2.split("'")[1] + "&signature=" + `signature`.split("'")[1] + "&ptk=machinima"
url = url_720_webm
#print url_720_webm
ext = ".webm"
elif resol == '22':
# Programme to get MP4 file in 720 HD
y720_mp4 = re.findall(r'itag=22(.*?)\u0026quality=hd720', y7)
url_720_mp41 = re.findall(r'\\u0026url=(.*?)\\u0026type', `y720_mp4`)
signature = re.findall(r'signature=(.*?)\\', `y720_mp4`)
url_720_mp42 = `url_720_mp41`.split("\\")[0]
url_720_mp4 = url_720_mp42.split("'")[1] + "&signature=" + `signature`.split("'")[1] + "&ptk=machinima"
url = url_720_mp4
#print url_720_mp4
ext = ".mp4"
elif resol == '44':
# Programme to get WebM file in 480 large
y480_webm = re.findall(r'itag=44(.*?)\u0026quality=large', y7)
url_480_webm1 = re.findall(r'\\u0026url=(.*?)\\u0026type', `y480_webm`)
signature = re.findall(r'signature=(.*?)\\', `y480_webm`)
url_480_webm2 = `url_480_webm1`.split("\\")[0]
url_480_webm = url_480_webm2.split("'")[1] + "&signature=" + `signature`.split("'")[1] + "&ptk=machinima"
url = url_480_webm
#print url_480_webm
ext = ".webm"
elif resol == '35':
# Programme to get a FLV file in 480 large
y480_flv = re.findall(r'itag=35(.*?)\u0026quality=large', y7)
url_480_flv1 = re.findall(r'\\u0026url=(.*?)\\', `y480_flv`)
signature = re.findall(r'signature=(.*?)\\', `y480_flv`)
url_480_flv2 = `url_480_flv1`.split("\\")[0]
url_480_flv = url_480_flv2.split("'")[1] + "&signature=" + `signature`.split("'")[1] + "&ptk=machinima"
url = url_480_flv
#print url_480_flv
ext = ".flv"
elif resol == '43':
# Programme to get WebM file in 360 medium
y360_webm = re.findall(r'itag=43(.*?)\u0026quality=medium', y7)
url_360_webm1 = re.findall(r'\\u0026url=(.*?)\\', `y360_webm`)
signature = re.findall(r'signature=(.*?)\\', `y360_webm`)
url_360_webm2 = `url_360_webm1`.split("\\")[0]
url_360_webm = url_360_webm2.split("'")[1] + "&signature=" + `signature`.split("'")[1] + "&ptk=machinima"
url = url_360_webm
#print url_360_webm
ext = ".webm"
elif resol == '34':
# Programme to get FLV file in 360 medium
y360_flv = re.findall(r'itag=34(.*?)\u0026quality=medium', y7)
url_360_flv1 = re.findall(r'\\u0026url=(.*?)\\', `y360_flv`)
signature = re.findall(r'signature=(.*?)\\', `y360_flv`)
url_360_flv2 = `url_360_flv1`.split("\\")[0]
url_360_flv = url_360_flv2.split("'")[1] + "&signature=" + `signature`.split("'")[1] + "&ptk=machinima"
url = url_360_flv
#print url_360_flv
ext = ".flv"
elif resol == '18':
# Programme to get MP4 file in 360 medium
y360_mp4 = re.findall(r'itag=18(.*?)\u0026quality=medium', y7)
url_360_mp41 = re.findall(r'\\u0026url=(.*?)\\', `y360_mp4`)
signature = re.findall(r'signature=(.*?)\\', `y360_mp4`)
url_360_mp42 = `url_360_mp41`.split("\\")[0]
url_360_mp4 = url_360_mp42.split("'")[1] + "&signature=" + `signature`.split("'")[1] + "&ptk=machinima"
url = url_360_mp4
#print url_360_mp4
ext = ".mp4"
elif resol == '5':
# Programme to get FLV file in 240 small
y240_flv = re.findall(r'itag=5(.*?)\u0026quality=small', y7)
url_240_flv1 = re.findall(r'\\u0026url=(.*?)\\', `y240_flv`)
signature = re.findall(r'signature=(.*?)\\', `y240_flv`)
url_240_flv2 = `url_240_flv1`.split("\\")[0]
url_240_flv = url_240_flv2.split("'")[1] + "&signature=" + `signature`.split("'")[1] + "&ptk=machinima"
url = url_240_flv
#print url_240_flv
ext = ".flv"
elif resol == '36':
# Programme to get 3gpp file in 180 small
y180_3gpp = re.findall(r'itag=36(.*?)\u0026quality=small', y7)
url_180_3gpp1 = re.findall(r'\\u0026url=(.*?)\\', `y180_3gpp`)
signature = re.findall(r'signature=(.*?)\\', `y180_3gpp`)
url_180_3gpp2 = `url_180_3gpp1`.split("\\")[0]
url_180_3gpp = url_180_3gpp2.split("'")[1] + "&signature=" + `signature`.split("'")[1] + "&ptk=machinima"
url = url_180_3gpp
#print url_180_3gpp
ext = ".3gpp"
elif resol == '17':
# Programme to get 3gpp file in 144 small
y144_3gpp = re.findall(r'itag=17(.*?)\u0026quality=small', y7)
url_144_3gpp1 = re.findall(r'\\u0026url=(.*?)\\', `y144_3gpp`)
signature = re.findall(r'signature=(.*?)\\', `y144_3gpp`)
url_144_3gpp2 = `url_144_3gpp1`.split("\\")[0]
url_144_3gpp = url_144_3gpp2.split("'")[1] + "&signature=" + `signature`.split("'")[1] + "&ptk=machinima"
url = url_144_3gpp
#print url_144_3gpp
ext = ".3gpp"
#newindex = open("index1.txt", 'w')
#newindex.write(y7)
print url
filename = filename2 + ext
print filename
# Download with an 800 MB Range cap; the status line below is redrawn in
# place by appending backspaces (chr(8)) after each print.
req = urllib2.Request(url, headers={'Range': "bytes=0-838860800"})
data = urllib2.urlopen(req)
print "connected to ""http://"+url.split("/")[2] + "/"
# NOTE(review): f is never explicitly closed/flushed; the script relies on
# interpreter exit to finish the write.
f = open(filename,'wb')
meta_data = data.info()
file_size = int(meta_data.getheaders("Content-Length")[0])
print "filesize= " + `file_size/1048576` + " MB"
bytes_received = 0
chunk_size = 10240
while True:
start_time = time.time()
buffer = data.read(chunk_size)
if not buffer:
break
bytes_received += len(buffer)
f.write(buffer)
# Per-chunk transfer-rate bookkeeping for the progress line.
Td = time.time() - start_time
speed1 = round(len(buffer)/1024.0, 1)
speed = round(speed1/Td, 1)
speed_MB = round(speed/1024.0, 1)
speed_GB = round(speed_MB/1024.0, 1)
bytes_received_MB = round(bytes_received/1048576.0, 3)
percent = bytes_received * 100. / file_size
# Choose B/s, KB/s, MB/s or GB/s for the display and estimate the ETA.
if speed < 1:
speed_byte = round(len(buffer)/Td, 1)
Tr = (file_size-bytes_received)/(60*speed_byte)
status = r"[Downloaded=%.3f MB] [%3.2f%%] [speed= %.1f B/s] [eta %1d min] " % (bytes_received_MB, percent, speed_byte, Tr)
elif speed < 1024:
Tr = (file_size-bytes_received)/(60*1024*speed)
status = r"[Downloaded=%.3f MB] [%3.2f%%] [speed= %.1f KB/s] [eta %1d min] " % (bytes_received_MB, percent, speed, Tr)
elif speed < 1048576:
Tr = (file_size-bytes_received)/(60*1024*1024*speed_MB)
status = r"[Downloaded=%.3f MB] [%3.2f%%] [speed= %.1f MB/s] [eta %1d min] " % (bytes_received_MB, percent, speed_MB, Tr)
else:
Tr = (file_size-bytes_received)/(60*1024*1024*1024*speed_GB)
status = r"[Downloaded=%.3f MB] [%3.2f%%] [speed= %.1f GB/s] [eta %1d min] " % (bytes_received_MB, percent, speed_GB, Tr)
status = status + chr(8) * (len(status) + 1)
print status,
Related
I have been making this download manager app in tkinter and requests, and I realized that sometimes, if the user is downloading multiple files at the same time, it fails to keep up and all the downloads end without any error. I also tried urllib3 and the standard urllib; the only difference with urllib was that it raised an error, but the download still failed. I want to make my program work so that, if the download ends early:
Firstly check if the file size is less than it is supposed to be
If it is then get the size of that file and make a range header like so: {"Range": f"bytes={current_size}-{file_size}"}
Store the rest of the file in a temp file. After it is downloaded, get the data from both of the files and write it to one (merge the files together)
I used a while loop and a temp counter, but the problem is that when requests can't keep up and reaches the while loop, it creates millions of temp files, each 197 bytes in size, and it doesn't work. I also tried just using an if statement hoping that would fix it; the difference was that it didn't create millions of files, but it still didn't work. Finally, I wrote a separate mock program that simply fetched the rest of the file and merged it with the half-downloaded file, and that worked — but for some reason, when I try the same thing in my program, it doesn't. Keep in mind that I don't want to create a thread for each temp file, because this can easily run on the same thread that is downloading the file. How can I do this? My code (be aware that this function runs in a separate thread):
# Absolute paths of files currently being written (used to avoid handing two
# downloads the same target name) and a counter for numbered temp files.
currently_downloading = np.array([], dtype='S')
current_temp = 0
def download_files():
"""Worker run on its own thread: validate the URL/name/format fields from
the tkinter form, pick a collision-free target path, stream the file with
requests, and -- if the connection drops early -- fetch the missing byte
range into a temp file and append it to the main file.

Reads many module-level widgets/globals (url_entry, name_entry, num, var,
chum, download_prg, ...) and the custom exception types defined elsewhere
in the program.
"""
global files_downloading, times_clicked, currently_downloading, packed, last_temp, current_temp
try:
abort = False
win = None
available_num = 0
downloaded = 0
url = str(url_entry.get())
# HEAD request first: Content-Length is needed up front for progress math.
try:
headers = requests.head(url, headers={'accept-encoding': ''}).headers
except ValueError:
raise InvalidURL()
try:
file_size = float(headers['Content-Length'])
except TypeError:
raise NotDownloadable()
name = ""
formatname = ""
# num==1 -> derive the name from the URL; otherwise use the user's entry
# (rejecting banned characters and empty names).
if num.get() == 1:
name = url.split("/")[-1].split(".")[0]
else:
if name_entry.get().strip() != "":
for char in str(name_entry.get()):
if char in banned_chars:
print("Usage of banned characters")
raise BannedCharsUsage()
else:
name = str(name_entry.get())
else:
raise EmptyName()
# var==1 -> derive the extension from Content-Type, else from the form.
if var.get() == 1:
formatname = '.' + headers['Content-Type'].split('/')[1]
else:
if str(format_entry.get())[0] == '.' and len(format_entry.get()) >= 3:
formatname = str(format_entry.get())
else:
raise InvalidFormat()
fullname = str(name) + formatname
path = (str(output_entry.get()) + "/").replace(r" \ ".strip(), "/")
# chum==1 -> override the output directory with the default stored in the
# SQLite settings database.
if chum.get() == 1:
conn = sqlite3.connect("DEF_PATH.db")
c = conn.cursor()
c.execute("SELECT * FROM DIRECTORY_LIST WHERE SELECTED_DEF = 1")
crnt_default_path = np.array(c.fetchone())
path = str(crnt_default_path[0] + "/").replace(r" \ ".strip(), "/")
conn.commit()
conn.close()
else:
pass
# Build the set of taken paths (on disk plus in-flight downloads) and
# probe "name (1)", "name (2)", ... until a free target is found.
all_files_dir = np.array([], dtype='S')
for file in os.listdir(path):
all_files_dir = np.append(all_files_dir, path + file)
all_files_dir = np.concatenate((all_files_dir, currently_downloading))
while path + fullname in all_files_dir:
for element in currently_downloading:
if element not in all_files_dir:
all_files_dir = np.append(all_files_dir, element)
available_num += 1
if num.get() == 1:
name = url.split("/")[-1].split(".")[0] + f" ({available_num})"
else:
name = str(name_entry.get()) + f" ({available_num})"
fullname = name + formatname
if path + fullname not in all_files_dir:
currently_downloading = np.append(currently_downloading, path + fullname)
available_num = 0
break
else:
currently_downloading = np.append(currently_downloading, path + fullname)
# Flipped by the cancel button; checked once per chunk below.
def cancel_dl():
nonlocal abort
abort = True
start_time = time.time()
try:
# First pass: stream the whole file while updating the progress widgets.
r = requests.get(url, allow_redirects=False, stream=True)
start = last_print = time.time()
with open(path + fullname, 'wb') as fp:
for chunk in r.iter_content(chunk_size=4096):
if abort:
raise AbortException()
downloaded += fp.write(chunk)
if downloaded > 1000000:
lbl_crnt_size.config(text=f"Downloaded: {round(downloaded / 1000000, 2)} MB")
else:
lbl_crnt_size.config(text=f"Downloaded: {round(downloaded / 1000, 2)} KB")
pct_done = int(downloaded / file_size * 100)
lbl_percent.config(text=f"{round(pct_done, 2)} %")
download_prg["value"] = pct_done
now = time.time()
if now - last_print >= 1:
speed_sec = round(downloaded / (now - start))
if speed_sec > 1000000:
lbl_speed.config(text=f"{round(speed_sec / 1000000, 3)} MB/s")
else:
lbl_speed.config(text=f"{round(speed_sec / 1000, 3)} KB/s")
last_print = time.time()
# Resume loop: while the on-disk file is still short, request the missing
# byte range into a numbered temp file, then append it to the main file.
# NOTE(review): this is the loop the question is about -- when the server
# keeps dropping the connection it can spin and create many tiny temp
# files; confirm the range/termination logic before relying on it.
while os.stat(path + fullname).st_size < file_size:
current_temp += 1
rng = {"Range": f"bytes={os.stat(path + fullname).st_size}-{file_size}"}
r = requests.get(url, allow_redirects=False, stream=True, headers=rng)
start = last_print = time.time()
with open(f"temp/Temp-{current_temp}{formatname}", 'wb') as fp:
for chunk in r.iter_content(chunk_size=4096):
if abort:
raise AbortException()
downloaded += fp.write(chunk)
if downloaded > 1000000:
lbl_crnt_size.config(text=f"Downloaded: {round(downloaded / 1000000, 2)} MB")
else:
lbl_crnt_size.config(text=f"Downloaded: {round(downloaded / 1000, 2)} KB")
pct_done = int(downloaded / file_size * 100)
lbl_percent.config(text=f"{round(pct_done, 2)} %")
download_prg["value"] = pct_done
now = time.time()
if now - last_print >= 1:
speed_sec = round(downloaded / (now - start))
if speed_sec > 1000000:
lbl_speed.config(text=f"{round(speed_sec / 1000000, 3)} MB/s")
else:
lbl_speed.config(text=f"{round(speed_sec / 1000, 3)} KB/s")
last_print = time.time()
# Merge: read both files fully, then rewrite the main file with the
# temp file's bytes appended.
with open(f"temp/Temp-{current_temp}{formatname}", 'rb') as fp:
temp_binary = fp.read()
with open(path + fullname, 'rb') as fp:
main_binary = fp.read()
with open(path + fullname, 'wb') as fp:
fp.write(main_binary + temp_binary)
# User clicked cancel: delete the partial file.
except AbortException:
if os.path.exists(path + fullname):
os.remove(path + fullname)
There is no built-in function to do that, so you will have to do it manually.
First thing you need to do is keep record of how many chunks/buffers you have written to file.
Before download function declare some variable, say x=0. (To count how much data is written to file)
then inside the download function check if x == 0.
If true then download normally,
Else : resume download using range header
Read Following examples for range header :- source
If the web server supports the range request then you can add the Range header to your request:
Range: bytes=StartPos-StopPos
You will receive the part between StartPos and StopPos. If you don't know the StopPos, just use:
Range: bytes=StartPos-
So your code would be:
def resume_download(fileurl, resume_byte_position):
"""Request *fileurl* starting at byte offset *resume_byte_position*.

Returns a streaming requests.Response whose body begins at that offset
(requires the server to honour Range requests). An open-ended header
("bytes=N-") asks for everything from N to the end.
"""
resume_header = {'Range': 'bytes=%d-' % resume_byte_position}
# NOTE(review): verify=False disables TLS certificate verification -- fine
# for a quick example, but confirm before using this on untrusted networks.
return requests.get(fileurl, headers=resume_header, stream=True, verify=False, allow_redirects=True)
Another example :-
https://www.oreilly.com/library/view/python-cookbook/0596001673/ch11s06.html
Also update the variable x after writing each chunk (x = x + chunk_size)
And at the end of your download part, add an "if" statement to check whether the size of the downloaded file is the same as the size of the file on the server (you can get that from response.headers.get('Content-Length')). If the sizes differ, call your download function again.
I want to automate a report I created in Namely using python, how can I do this with the Namely API?
Here's a python script I made that should cover it:
#Imports
import http.client
import json
import os
import time
import smtplib
#Constants
namelyDomain = "company.namely.com" #change this to your company's namely
csvName = "C:\\Path\\To_Write\\Your_CSV\\Report.csv" #absolute path to write csv
reportID = "0a12bac7-eac4-4bae-b18f-63ea3173gbb4" #report ID (find in URL)
APIkey = "yuIo4fH7f4z4dgabsSqXzxm9IMbW1ixLhjP0eh8jPuIo9vUI1nij9qZmG822al54" #get this from Namely>API>Personal Access Tokens

# Run the report with an authenticated GET request and bail out early on any
# non-200 response. (The unused smtplib.SMTP() object from the original was
# dropped.)
conn = http.client.HTTPSConnection(namelyDomain)
headers = {'authorization': "Bearer " + APIkey}
conn.request("GET", "/api/v1/reports/" + reportID + ".json", "{}", headers)
res = conn.getresponse()
if res.status != 200:
    print("failed to connect")
    exit()
data = res.read()  # JSON document describing the report

#Delete if it exists (overwrite)
if os.path.exists(csvName):
    os.remove(csvName)

# Parse once; the original aliased dataHeader and dataRow to the same object.
report = json.loads(data)['reports'][0]

# Write the CSV; the with-block guarantees the file is closed/flushed (the
# original leaked the handle).
with open(csvName, "w") as f:
    # Header row: one comma-separated line of column labels.
    labels = [str(col['label']) for col in report['columns']]
    columnCount = len(labels)
    f.write(",".join(labels) + chr(10))
    # Data rows: quote every cell, then strip "None" cells and the Unicode
    # directional marks (U+202D/U+202C) Namely embeds in some values.
    # NOTE(review): replace("None","") also removes the literal substring
    # "None" inside real data -- kept for output compatibility.
    for row in report['content']:
        line = '"'
        for ndx in range(0, columnCount):
            line = line + str(row[ndx]) + '","'
        line = line.replace("None","").replace('\u202d','').replace('\u202c','').rstrip('"').rstrip(",")
        f.write(line + chr(10))
Just replace:
namelyDomain with your company's namely domain
csvName with the absolute path of where you want to write the csv report
reportID with the id of the report you want to generate
APIkey with the personal access token from namely
Useful Link: https://developers.namely.com/1.0/reports/show-report
how to download the image from google and rename the image with keywords at the same time when using google-images-download? While using this package, the name is generated from the Image URL
What I expect is that the image can be named with the item name.
Could anyone help me ?
here is code:
# Fetch one "large" image per keyword into the "home1" image directory (the
# library creates the directory tree itself).
from google_images_download import google_images_download
response = google_images_download.googleimagesdownload()
# "keywords" is a comma-separated list; "limit" applies per keyword.
arguments = {"keywords":"cat, dog, pig", "limit":1, "print_urls":True, "image_directory":'home1', "size":"large"}
# download() presumably returns the saved paths per keyword -- confirm
# against the google_images_download documentation.
absolute_image_paths = response.download(arguments)
you need to override the methods in google_images_download including: download_image, _get_all_items, download. _get_all_items and download need to post download_image
# Download Images
def download_image(self, image_url, image_format, main_directory, dir_name, count,
                   print_urls, socket_timeout, prefix, print_size, no_numbering,
                   search_term):
    """Download one image and save it as "<search_term>.jpeg".

    Overrides google_images_download's stock method so the file is named
    after the search term instead of the tail of the image URL.

    Returns a 4-tuple (download_status, download_message, return_image_name,
    absolute_path) where download_status is 'success' or 'fail'.
    """
    if print_urls:
        print("Image URL: " + image_url)
    try:
        req = Request(image_url, headers={
            "User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"})
        # Time limit (seconds) for fetching a single image.
        timeout = float(socket_timeout) if socket_timeout else 10
        response = urlopen(req, None, timeout)
        data = response.read()
        response.close()

        # Name the file after the search term rather than the URL tail.
        image_name = search_term + "." + "jpeg"
        prefix = prefix + " " if prefix else ''
        path = main_directory + "/" + dir_name + "/" + prefix + image_name
        try:
            with open(path, 'wb') as output_file:
                output_file.write(data)
            absolute_path = os.path.abspath(path)
        except OSError as e:
            # Report the failure; the original fell through and overwrote
            # these values with 'success' even after an OSError.
            download_status = 'fail'
            download_message = "OSError on an image...trying next one..." + " Error: " + str(e)
            return_image_name = ''
            absolute_path = ''
        else:
            download_status = 'success'
            download_message = "Completed Image ====> " + prefix + str(count) + ". " + image_name
            # Returned so the caller can reuse the name for thumbnail downloads.
            return_image_name = prefix + str(count) + ". " + image_name
            if print_size:
                print("Image Size: " + str(self.file_size(path)))
    except UnicodeEncodeError as e:
        download_status = 'fail'
        download_message = "UnicodeEncodeError on an image...trying next one..." + " Error: " + str(e)
        return_image_name = ''
        absolute_path = ''
    except IncompleteRead as e:
        download_status = 'fail'
        download_message = "IncompleteReadError on an image...trying next one..." + " Error: " + str(e)
        return_image_name = ''
        absolute_path = ''
    # HTTPError subclasses URLError, so it must be caught first; in the
    # original this branch was unreachable and URLError appeared twice.
    except HTTPError as e:
        download_status = 'fail'
        download_message = "HTTPError on an image...trying next one..." + " Error: " + str(e)
        return_image_name = ''
        absolute_path = ''
    except ssl.CertificateError as e:
        download_status = 'fail'
        download_message = "CertificateError on an image...trying next one..." + " Error: " + str(e)
        return_image_name = ''
        absolute_path = ''
    except URLError as e:
        download_status = 'fail'
        download_message = "URLError on an image...trying next one..." + " Error: " + str(e)
        return_image_name = ''
        absolute_path = ''
    except IOError as e:
        download_status = 'fail'
        download_message = "IOError on an image...trying next one..." + " Error: " + str(e)
        return_image_name = ''
        absolute_path = ''
    return download_status, download_message, return_image_name, absolute_path
I am trying to make a script in Python that downloads an image from a Discord server, but it doesn't work.
My code
import http.client, json, calendar, os, time, base64
from datetime import datetime
app_token = "token"
channel_id = "id"
web_hk = "api/webhooks/my webhook"
latest_timestamp = ""
def query_server():
    """Poll the Discord channel for a "Go!" command and run style transfer.

    Fetches the channel's recent messages; when a message whose content is
    exactly "Go!" is found, downloads the style image attached to the next
    message and the content image attached to the one after it, then shells
    out to neural_style.py with both files.
    """
    global app_token, channel_id, latest_timestamp
    response = 'Processing...'
    conn = http.client.HTTPSConnection("discordapp.com")
    headers = {"authorization": "Bot " + app_token}
    conn.request("GET", "/api/channels/" + channel_id + "/messages", "", headers)
    r1 = conn.getresponse()
    print(r1.reason)
    raw = r1.read()
    print(raw)
    conversation = json.loads(raw.decode('utf-8'))

    def fetch_attachment(cdn_conn, url, current_time_int):
        """Download one CDN attachment, save it, and return the local filename."""
        img_path = url.split("https://cdn.discordapp.com")[1]
        filename = current_time_int + "-" + url.split("/")[-1]
        cdn_conn.request("GET", img_path, "", headers)
        body = cdn_conn.getresponse().read()
        # Write the raw bytes exactly as received. The original passed them
        # through base64.encodebytes(), which produced a text-armored file
        # that image libraries cannot open ("cannot identify image file").
        with open(filename, "wb") as fh:
            fh.write(body)
        os.chmod(filename, 0o777)
        return filename

    i = 0
    while i < len(conversation):
        comment = conversation[i]
        i += 1
        # Messages are newest-first; stop once we reach already-seen ones.
        if comment["timestamp"] <= latest_timestamp:
            break
        print(comment)
        if comment['content'] == 'Go!':
            print('parsing command')
            style_comment = conversation[i]
            if style_comment['attachments'] == []:
                response = 'Missing style image'
                break
            content_comment = conversation[i + 1]
            if content_comment['attachments'] == []:
                response = 'Missing content image'
                break
            cdn_conn = http.client.HTTPSConnection("cdn.discordapp.com")
            current_time_int = str(int(time.mktime(datetime.utcnow().timetuple())))
            style_img_filename = fetch_attachment(
                cdn_conn, style_comment['attachments'][0]['url'], current_time_int)
            content_img_filename = fetch_attachment(
                cdn_conn, content_comment['attachments'][0]['url'], current_time_int)
            output_img_filename = current_time_int + "-output.jpg"
            cmd = "python neural_style.py --content {} --styles {} --output {} --width 500".format(
                content_img_filename, style_img_filename, output_img_filename)
            print(cmd)
            os.system(cmd)
            break
    print(response)

query_server()
What I get
Traceback (most recent call last):
File "neural_style.py", line 216, in <module>
main()
File "neural_style.py", line 119, in main
content_image = imread(options.content)
File "neural_style.py", line 201, in imread
img = scipy.misc.imread(path).astype(np.float)
File "C:\Users\Baxter\AppData\Local\Programs\Python\Python35\lib\site-packages\numpy\lib\utils.py", line 101, in newfunc
return func(*args, **kwds)
File "C:\Users\Baxter\AppData\Local\Programs\Python\Python35\lib\site-packages\scipy\misc\pilutil.py", line 164, in imread
im = Image.open(name)
File "C:\Users\Baxter\AppData\Local\Programs\Python\Python35\lib\site-packages\PIL\Image.py", line 2585, in open
% (filename if filename else fp))
OSError: cannot identify image file '1527709726-madelbrot.jpg'
It technically downloads something, because I see the file name in the folder, but PIL says it cannot identify it. I can't even open it.
The code below makes an md5/metadata fingerprint, but crashes on files with unknown corruption (e.g., files, that can be copied, mostly even opened, but that can not be hashed or zipped up [to disguise their corruption]).
Question: How one makes this code to skip or ignore any and all problem files and just do the rest? Imagine 1 million files on 8 TB. Otherwise I leave it running and having no real-time monitoring of progress, 2 days later I find out that nothing got hashed because a couple problem files made the code hung.
Part of the code (see full code below):
def createBasicInfoListFromDisk():
    """Walk walk_dir and record size/mtime details for every readable file.

    Appends one quoted CSV-style entry per file to diskCompareListDetails
    and the bare path to onlyFileNameOnDisk. Files whose size or timestamp
    cannot be read (corrupt entries, invalid mtimes, permission errors)
    are reported and skipped so a single bad file cannot abort the scan.
    """
    global diskCompareListDetails, onlyFileNameOnDisk, driveLetter, walk_dir
    walk_dir = os.path.abspath(walk_dir)
    for root, subdirs, files in os.walk(walk_dir, topdown=True, onerror=None, followlinks=True):
        for filename in files:
            file_path = os.path.join(root, filename)
            temp = file_path.split(":")
            if len(temp) == 2:
                driveLetter = temp[0]
                filePathWithoutDriveLetter = temp[1]
            else:
                # No drive letter (non-Windows path): keep the full path
                # instead of crashing on temp[1].
                filePathWithoutDriveLetter = file_path
            try:
                # Both calls can raise OSError on corrupt/inaccessible files
                # (e.g. invalid st_mtime -> Errno 22); skip and keep going.
                fileSize = os.path.getsize(file_path)
                mod_on = get_last_write_time(file_path)
            except (OSError, ValueError, OverflowError) as e:
                print('\t- SKIPPED unreadable file %s (%s)' % (file_path, e))
                continue
            print('\t- file %s (full path: %s)' % (filename, file_path))
            print('FileName : {filename} is of size {size} and was modified on {mdt}'.format(filename=file_path, size=fileSize, mdt=mod_on))
            diskCompareListDetails.append("\"" + filePathWithoutDriveLetter + "\",\"" + str(fileSize) + "\",\"" + mod_on + '"')
            onlyFileNameOnDisk.append("\"" + filePathWithoutDriveLetter + "\"")
    return
Error:
FileName : T:\problemtest\problemfile.doc is of size 27136 and was modified on2010-10-10 13:58:32
Traceback (most recent call last):
File "t:\scripts\test.py", line 196, in <module>
createBasicInfoListFromDisk()
File "t:\scripts\test.py", line 76, in createBasicInfoListFromDisk
mod_on = get_last_write_time(file_path)
File "t:\scripts\test.py", line 61, in get_last_write_time
convert_time_to_human_readable = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(st.st_mtime))
OSError: [Errno 22] Invalid argument
Full code:
import os
import sys
import time
import datetime
import difflib
import decimal
import hashlib
from pip._vendor.distlib.compat import raw_input
# --- Shared module state used by the functions below ---
csvListDetails = list()          # parsed rows from the existing md5.csv
csvCompareListDetails = list()   # md5.csv rows reshaped for difflib comparison
diskCompareListDetails = list()  # on-disk file entries in the same shape
onlyFileNameOnDisk = list()      # quoted paths of every file found on disk
addedFiles = list()              # diff output: new/changed entries
removedFiles = list()            # diff output: entries gone from disk
driveLetter =""                  # drive prefix stripped from scanned paths
finalFilesToChange=list()        # paths queued for (re)hashing
finalFilesToDelete=list()        # csv rows to drop on rewrite
changedFiles=list()              # freshly computed [hash, mtime, size, path] rows
csvfilewithPath="md5.csv"        # overwritten in __main__ with walk_dir/md5.csv
import shutil
walk_dir=""                      # root directory to scan; set in __main__
def findAndReadCSVFile(fileName):
    """Load an existing md5.csv into the global comparison lists.

    Populates csvCompareListDetails (rows reshaped for difflib) and
    csvListDetails (cleaned per-field rows). Returns 1 when the file was
    found and read, 0 when it does not exist or cannot be opened.
    """
    global csvListDetails
    global csvCompareListDetails
    foundFile = 0
    try:
        inputFileHandler = open(fileName, "rt", encoding='utf-8')
        update_time = get_last_write_time(fileName)
        print("\n Found md5.csv, last updated on: %s" % update_time)
        foundFile = 1
    except (OSError, IOError, FileNotFoundError):
        print("\n md5.csv not found. Will create a new one.")
        return foundFile
    for rowIndex, line in enumerate(inputFileHandler):
        if rowIndex == 0:
            continue  # first line is the CSV header
        rowItem = line.replace("\n", "").split('","')
        # Reshape as path,size,mtime to line up with the disk listing.
        csvCompareListDetails.append('"' + rowItem[3] + ',"' + rowItem[2] + '","' + rowItem[1] + '"')
        csvListDetails.append(['"' + field.replace('"', '') + '"' for field in rowItem])
    inputFileHandler.close()
    return foundFile
def get_last_write_time(filename):
    """Return the file's last-modified time as 'YYYY-MM-DD HH:MM:SS'.

    Returns the string "ERROR" instead of raising when the file cannot be
    stat'ed or its timestamp is invalid — corrupt files on NTFS can carry
    mtimes that make time.localtime raise OSError [Errno 22] (the crash
    reported in the traceback above), and out-of-range values can raise
    ValueError/OverflowError. Callers can then skip such files.
    """
    try:
        st = os.stat(filename)
        return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(st.st_mtime))
    except (OSError, ValueError, OverflowError):
        return "ERROR"
def createBasicInfoListFromDisk():
    """Walk walk_dir and record size/mtime details for every readable file.

    Appends one quoted CSV-style entry per file to diskCompareListDetails
    and the bare path to onlyFileNameOnDisk. Files whose size or timestamp
    cannot be read (corrupt entries, invalid mtimes, permission errors)
    are reported and skipped so a single bad file cannot abort the scan.
    """
    global diskCompareListDetails, onlyFileNameOnDisk, driveLetter, walk_dir
    walk_dir = os.path.abspath(walk_dir)
    for root, subdirs, files in os.walk(walk_dir, topdown=True, onerror=None, followlinks=True):
        for filename in files:
            file_path = os.path.join(root, filename)
            temp = file_path.split(":")
            if len(temp) == 2:
                driveLetter = temp[0]
                filePathWithoutDriveLetter = temp[1]
            else:
                # No drive letter (non-Windows path): keep the full path
                # instead of crashing on temp[1].
                filePathWithoutDriveLetter = file_path
            try:
                # Both calls can raise OSError on corrupt/inaccessible files
                # (e.g. invalid st_mtime -> Errno 22); skip and keep going.
                fileSize = os.path.getsize(file_path)
                mod_on = get_last_write_time(file_path)
            except (OSError, ValueError, OverflowError) as e:
                print('\t- SKIPPED unreadable file %s (%s)' % (file_path, e))
                continue
            print('\t- file %s (full path: %s)' % (filename, file_path))
            print('FileName : {filename} is of size {size} and was modified on {mdt}'.format(filename=file_path, size=fileSize, mdt=mod_on))
            diskCompareListDetails.append("\"" + filePathWithoutDriveLetter + "\",\"" + str(fileSize) + "\",\"" + mod_on + '"')
            onlyFileNameOnDisk.append("\"" + filePathWithoutDriveLetter + "\"")
    return
def compareLogAndDiskLists():
    """Diff the md5.csv snapshot against the on-disk listing.

    Fills addedFiles with entries only present on disk and removedFiles
    with entries only present in the CSV, both with the diff marker stripped.
    """
    global addedFiles, removedFiles
    delta = difflib.unified_diff(csvCompareListDetails, diskCompareListDetails,
                                 fromfile='file1', tofile='file2', lineterm='', n=0)
    body = list(delta)[2:]  # drop the two "---"/"+++" header lines
    addedFiles = [entry[1:] for entry in body if entry.startswith('+')]
    removedFiles = [entry[1:] for entry in body if entry.startswith('-')]
    return
def displayInfoForUserInput():
    """Summarize pending changes and ask the user to confirm hashing.

    Builds finalFilesToChange / finalFilesToDelete from the diff results,
    prints the counts and total size, then exits unless the user answers
    Y/Yes.
    """
    global finalFilesToChange, finalFilesToDelete
    changedOrNewFileCount = 0
    noLongerExistingFilesCount = 0
    totalSizeOfChange = 0
    for line in addedFiles:
        if line in removedFiles:
            continue  # present in both diffs: unchanged, nothing to hash
        changedOrNewFileCount += 1
        elements = line.replace("\n", "").split('","')
        sizeOfFile = int(elements[1].replace('"', ''))
        totalSizeOfChange += sizeOfFile
        finalFilesToChange.append(elements[0] + '"')
    for line in removedFiles:
        elements = line.split('","')
        if elements[0] + '"' not in onlyFileNameOnDisk:
            noLongerExistingFilesCount += 1
            finalFilesToDelete.append(elements[0] + '"')
    GBModSz = decimal.Decimal(totalSizeOfChange) / decimal.Decimal('1073741824')
    print("\n New or modified files on drive: {} (need to hash)".format(changedOrNewFileCount))
    print (" Obsolete lines in md5.csv (files modified or not on drive): {} (lines to delete)".format(noLongerExistingFilesCount))
    print (" {} files ({:.2f} GB) needs to be hashed.".format(changedOrNewFileCount, GBModSz))
    userInput = raw_input("\n Proceed with hash? (Y/N, Yes/No) ")
    if userInput.strip().upper() in ("Y", "YES"):
        print("Continuing Processing...")
    else:
        print("You opted not to continue, Exiting")
        sys.exit()
    return
def processFiles(foundFile):
    """MD5-hash every file queued in finalFilesToChange into changedFiles.

    When a previous md5.csv was found (foundFile == 1), archive it first.
    Files that cannot be opened, read, or stat'ed (corrupt, locked,
    permission denied) are reported and skipped, so one bad file cannot
    hang or abort a multi-terabyte run.
    """
    if foundFile == 1:
        oldFileName = walk_dir + "/md5.csv"
        shutil.copy(oldFileName, getTargetFileName(oldFileName))
    BLOCKSIZE = 1048576 * 4  # read in 4 MiB chunks to bound memory use
    global changedFiles
    for fileToHash in finalFilesToChange:
        fileToUse = driveLetter + ":" + fileToHash.replace('"', '')
        try:
            hasher = hashlib.new('md5')
            # 'with' guarantees the handle is closed even if a read fails.
            with open(fileToUse, 'rb') as afile:
                buf = afile.read(BLOCKSIZE)
                while len(buf) > 0:
                    hasher.update(buf)
                    buf = afile.read(BLOCKSIZE)
            fileDetails = list()
            fileDetails.append(hasher.hexdigest())
            fileDetails.append(get_last_write_time(fileToUse))
            fileDetails.append(os.path.getsize(fileToUse))
            fileDetails.append(fileToHash)
        except (OSError, IOError) as e:
            # Skip unreadable/corrupt files instead of crashing the run.
            print("SKIPPED (unreadable): {} ({})".format(fileToUse, e))
            continue
        changedFiles.append(fileDetails)
    return
def getTargetFileName(oldFileName):
    """Build the archive name for the previous md5.csv.

    The name embeds both the old file's generation timestamp and the
    current archival timestamp, e.g.
    <walk_dir>/generated_on_..._archived_on_..._md5.csv.
    """
    generatedStamp = get_last_write_time(oldFileName).replace(" ", "_").replace("-", "").replace(":", "")
    archivedStamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    return walk_dir + "/generated_on_" + generatedStamp + "__archived_on_" + archivedStamp + "__md5.csv"
def writeCSVFile(fileName):
    """Rewrite md5.csv from the in-memory state.

    Keeps rows for unchanged files, drops rows queued for deletion or
    re-hashing, then appends the freshly computed rows from changedFiles.
    Write failures are reported rather than raised.
    """
    try:
        outputFileHandler = open(fileName, "wt", encoding='utf-8')
        outputFileHandler.write("\"md5Hash\",\"LastWriteTime\",\"Length\",\"FullName\"\n")
        for details in csvListDetails:
            # Skip rows for files that vanished or are about to be re-hashed.
            if details[3] in finalFilesToDelete or details[3] in finalFilesToChange:
                continue
            outputFileHandler.write("{},{},{},{}\n".format(details[0], details[1], details[2], details[3]))
        for details in changedFiles:
            outputFileHandler.write("\"{}\",\"{}\",\"{}\",{}\n".format(details[0], details[1], details[2], details[3]))
        outputFileHandler.close()
    except (OSError, IOError, FileNotFoundError) as e:
        print("ERROR :")
        print("File {} is either not writable or some other error: {}".format(fileName, e))
    return
if __name__ == '__main__':
    # Ask which drive/directory to scan; the md5.csv catalog lives at its root.
    walk_dir = raw_input("\n Enter drive or directory to scan: ")
    csvfilewithPath=walk_dir+"/md5.csv"
    print("\n Drive to scan: " + walk_dir)
    foundFile = 0
    # Pipeline: 1) load any existing catalog, 2) scan the disk, 3) diff the
    # two listings, 4) confirm with the user, 5) hash changed files,
    # 6) rewrite the catalog.
    foundFile=findAndReadCSVFile(csvfilewithPath)
    createBasicInfoListFromDisk()
    compareLogAndDiskLists()
    displayInfoForUserInput()
    processFiles(foundFile)
    writeCSVFile(csvfilewithPath)
Trying this fix, no luck:
def get_last_write_time(filename):
    """Return the file's mtime as 'YYYY-MM-DD HH:MM:SS', or "ERROR" on OSError."""
    try:
        mtime = os.stat(filename).st_mtime
        return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(mtime))
    except OSError:
        return "ERROR"
def createBasicInfoListFromDisk():
I agree with IMCoins and I'm very curious as to why the except clause isn't catching the error.
So the first thing I would do is go to the source where the OSError is being raised and try to catch it explicitly.
def get_last_write_time(filename):
    """Return the file's mtime as 'YYYY-MM-DD HH:MM:SS', or "ERROR" on OSError.

    Note: the snippet as originally pasted was missing the closing
    parenthesis of the time.strftime(...) call, so it did not even parse;
    that is fixed here.
    """
    try:
        st = os.stat(filename)
        convert_time_to_human_readable = time.strftime("%Y-%m-%d %H:%M:%S",
                                                       time.localtime(st.st_mtime))
        return convert_time_to_human_readable
    except OSError:
        pass
    return "ERROR"  # or whatever string you want to add
Updated answer, for updated post.
As stated earlier, an except statement with no exception type specified catches everything. So, in order to do what you want... I'm afraid the possible answers are either:
To make a method that identifies corrupted files, and handles it properly.
Make try, except statement that encapsulate every part of your code where there could be an error.
Let me warn you about the second solution though, as sometimes, there are system errors that you do not want to avoid. I believe you should print the exception that you catch, in order to identify further problems you may encounter.
Just so you know, as you may not : your error is not in a try, except statement. Your error is in (if I copied and pasted properly in my editor) line 196, createBasicinfoListFromDisk(), then line 76, mod_on = get_last_write_time(file_path)
As you also mentioned you are using Python 3.x, I suggest looking into the suppress function (https://docs.python.org/3/library/contextlib.html#contextlib.suppress).
I hope it helped you.