Python: why does urllib.request.urlopen().read() never finish the download?

I have the code:
from urllib.request import urlopen
url = 'http://gmsh.info/bin/MacOSX/gmsh-4.5.2-MacOSX-sdk.tgz'
sdk = urlopen(url).read()
My question: why does this download never end? The link is fine and it works in browsers. I tried setting some headers like this:
from urllib import request
req = request.Request(url)
req.add_header('user-agent', "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11")
sdk = request.urlopen(req).read()
but this didn't help. Any ideas?

This is because the file is large: read() with no argument buffers the entire response in memory before returning, so it can look like the download never finishes. Read it in chunks and write each chunk to disk instead, as shown in this example:
import urllib.request
filedata = urllib.request.urlopen('http://gmsh.info/bin/MacOSX/gmsh-4.5.2-MacOSX-sdk.tgz')
CHUNK = 1 * 1024
with open('test.tgz', 'wb') as f:
    while True:
        chunk = filedata.read(CHUNK)
        if not chunk:
            break
        f.write(chunk)
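Equivalently, you can let shutil.copyfileobj do the chunked copying for you; a minimal sketch (the local filename test.tgz is just illustrative):
import shutil
import urllib.request

url = 'http://gmsh.info/bin/MacOSX/gmsh-4.5.2-MacOSX-sdk.tgz'
# copyfileobj streams the response in fixed-size chunks instead of
# buffering the whole archive in memory the way .read() does.
with urllib.request.urlopen(url) as response, open('test.tgz', 'wb') as f:
    shutil.copyfileobj(response, f)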

Related

Python: Downloading an image from a URL but getting HTTP Error 403

I'm trying to download this image:
https://bu3.mkklcdnbuv1.com/mangakakalot/m2/mother_im_sorry/chapter_5_chapter_5/2.jpg
I have tried setting Headers = {'User-agent': 'Mozilla/5.0'}, but the status code I receive is still 403.
Can anyone suggest a way to overcome this?
My code:
import requests
import shutil
r = requests.get('https://bu3.mkklcdnbuv1.com/mangakakalot/m2/mother_im_sorry/chapter_5_chapter_5/2.jpg', stream=True, headers={'User-agent': 'Mozilla/5.0'})
print(r.status_code)
if r.status_code == 200:
    with open("img.jpg", 'wb') as f:
        r.raw.decode_content = True
        shutil.copyfileobj(r.raw, f)
You need to set the Referer header in your code; the CDN apparently checks that the image request comes from a mangakakalot.com chapter page and returns 403 otherwise, regardless of the User-Agent. For example:
import requests

with requests.Session() as session:
    resp_2 = session.get("https://bu3.mkklcdnbuv1.com/mangakakalot/m2/mother_im_sorry/chapter_5_chapter_5/2.jpg", headers={"referer": "https://mangakakalot.com/chapter/ro920198/chapter_5"})
    with open("xx.jpg", "wb") as f:
        f.write(resp_2.content)
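If you need several pages from the same chapter, a possible refinement (a sketch; the loop and its page numbers are illustrative, not from the original answer) is to set the headers once on the session so every request carries them:
import requests

base = "https://bu3.mkklcdnbuv1.com/mangakakalot/m2/mother_im_sorry/chapter_5_chapter_5"
with requests.Session() as session:
    # Headers set on the session apply to every request it makes.
    session.headers.update({
        "referer": "https://mangakakalot.com/chapter/ro920198/chapter_5",
        "user-agent": "Mozilla/5.0",
    })
    for page in (1, 2):  # illustrative page numbers
        resp = session.get(f"{base}/{page}.jpg")
        resp.raise_for_status()
        with open(f"{page}.jpg", "wb") as f:
            f.write(resp.content)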

download a pdf using python

I am trying to download a PDF from the internet. I have a batch of links from which I need to pull PDFs.
I have this block of code:
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
url = 'http://webapps.rrc.texas.gov/CMPL/viewPdfReportFormAction.do?method=cmplG1FormPdf&packetSummaryId=2928'
opts = Options()
opts.headless = True
assert opts.headless # Operating in headless mode
browser_detail = Firefox(options=opts)
browser_detail.get(url)
print(browser_detail.page_source)
with open('temp/metadata.pdf', 'wb') as fd:
    fd.write(browser_detail.page_source)
browser_detail.close()
I also have tried requests. Same response:
import requests
url = 'http://webapps.rrc.texas.gov/CMPL/viewPdfReportFormAction.do?method=cmplG1FormPdf&packetSummaryId=2928'
r = requests.get(url, stream=True)
with open('temp/metadata.pdf', 'wb') as fd:
    for chunk in r.iter_content(2000):
        fd.write(chunk)
The problem is that if I put the URL into a browser, the PDF comes up, but when I run this code, the page_source is HTML. This makes me think there's forwarding or server-side processing involved.
How do I get the PDF down?
Thanks!
I was able to pull down the PDF file using requests. The page checks for a proper User-Agent, so I set it to a Chrome-on-macOS string:
h = {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36"}
r = requests.get(url, stream=True, headers=h)
with open('temp/metadata.pdf', 'wb') as fd:
    fd.write(r.content)
And it worked:
tmp/project/1> file metadata.pdf
metadata.pdf: PDF document, version 1.4
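Since the failure mode here is the server handing back HTML instead of a PDF, it may be worth guarding against that before writing the file. A minimal sketch (the Content-Type check is an addition, not part of the original answer):
import requests

url = 'http://webapps.rrc.texas.gov/CMPL/viewPdfReportFormAction.do?method=cmplG1FormPdf&packetSummaryId=2928'
h = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36"}
r = requests.get(url, headers=h)
# If the server fell back to an HTML page, fail loudly instead of
# writing bogus bytes to a .pdf file.
content_type = r.headers.get('Content-Type', '')
if 'pdf' not in content_type.lower():
    raise RuntimeError('Expected a PDF, got ' + repr(content_type))
with open('temp/metadata.pdf', 'wb') as fd:
    fd.write(r.content)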

requests fails to download any image from a certain site although all images there are downloadable?

Here is the code I use for downloading images. It always works fine except for this one site, www.pexels.com: it does download the image, but the file comes out corrupted. I wonder why?
import requests

url = "https://images.pexels.com/photos/844297/pexels-photo-844297.jpeg?auto=compress&cs=tinysrgb&dpr=2&h=650&w=940"
response = requests.get(url, stream=True)
file = open("Hello.jpg", 'wb')
for chunk in response.iter_content(10000):
    file.write(chunk)
file.close()
You need to add a user-agent to your request headers.
The following code works:
import requests
url = "https://images.pexels.com/photos/844297/pexels-photo-844297.jpeg?auto=compress&cs=tinysrgb&dpr=2&h=650&w=940"
headers = {"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
}
response = requests.get(url , stream = True, headers=headers)
file= open("Hello.jpg" , 'wb')
for chunk in response.iter_content(10000):
file.write(chunk)
file.close()
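Without a browser-like User-Agent, the server most likely returns an error page instead of the image, and those HTML bytes saved under a .jpg name are what shows up as a "corrupted" image. A small sketch to detect that case (the status and Content-Type checks are additions, not part of the original answer):
import requests

url = "https://images.pexels.com/photos/844297/pexels-photo-844297.jpeg?auto=compress&cs=tinysrgb&dpr=2&h=650&w=940"
headers = {"user-agent": "Mozilla/5.0"}
response = requests.get(url, stream=True, headers=headers)
response.raise_for_status()  # surface a 403 instead of saving the error body
if "image" not in response.headers.get("Content-Type", ""):
    raise RuntimeError("Server did not return an image")
with open("Hello.jpg", "wb") as f:
    for chunk in response.iter_content(10000):
        f.write(chunk)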

HTTP Error 403: Forbidden with urlretrieve

I am trying to download a PDF, but I get the following error: HTTP Error 403: Forbidden.
I am aware that the server is blocking the request for whatever reason, but I can't seem to find a solution.
import urllib.request
import urllib.parse
import requests

def download_pdf(url):
    full_name = "Test.pdf"
    urllib.request.urlretrieve(url, full_name)

try:
    url = 'http://papers.xtremepapers.com/CIE/Cambridge%20IGCSE/Mathematics%20(0580)/0580_s03_qp_1.pdf'
    print('initialized')
    hdr = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36',
        'Content-Length': '136963',
    }
    print('HDR received')
    req = urllib.request.Request(url, headers=hdr)
    print('Header sent')
    resp = urllib.request.urlopen(req)
    print('Request sent')
    respData = resp.read()
    download_pdf(url)
    print('Complete')
except Exception as e:
    print(str(e))
You seem to have already realised this: the remote server is checking the User-Agent header and rejecting requests from Python's urllib. urllib.request.urlretrieve() doesn't let you change the HTTP headers, but you can use urllib.request.URLopener.retrieve():
import urllib.request
opener = urllib.request.URLopener()
opener.addheader('User-Agent', 'whatever')
filename, headers = opener.retrieve(url, 'Test.pdf')
N.B. You are using Python 3, where these functions are considered part of the "legacy interface" and URLopener has been deprecated. For that reason you should not use them in new code.
The above aside, you are going to a lot of trouble simply to access a URL. Your code imports requests but never uses it; you should use it, because it is much easier than urllib. This works for me:
import requests
url = 'http://papers.xtremepapers.com/CIE/Cambridge%20IGCSE/Mathematics%20(0580)/0580_s03_qp_1.pdf'
r = requests.get(url)
with open('0580_s03_qp_1.pdf', 'wb') as outfile:
    outfile.write(r.content)
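If you would rather stay with urllib and avoid the deprecated URLopener, the non-legacy route is to build a Request with the header yourself; a minimal sketch:
import shutil
import urllib.request

url = 'http://papers.xtremepapers.com/CIE/Cambridge%20IGCSE/Mathematics%20(0580)/0580_s03_qp_1.pdf'
req = urllib.request.Request(url, headers={'User-Agent': 'whatever'})
# Stream the response to disk instead of reading it all into memory.
with urllib.request.urlopen(req) as resp, open('Test.pdf', 'wb') as out:
    shutil.copyfileobj(resp, out)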

Python 3 error 403 when downloading file

I'm using a script to grab download links from an HTML page (sent to me via mail) and then download the files. The script worked great for about six months, but last week I started getting "403 Error".
From what I've read, the issue is that the site is blocking me, thinking I'm a bot (can't deny that), but I'm not scraping the site's HTML, just trying to download a file with requests.get. I only get this error from one specific site; others download fine.
I've tried setting headers={'User-Agent': 'Mozilla/5.0'} but that didn't help.
Here's the function that downloads the file:
def download_file(dl_url, local_save_path):
    """Download URL to given path"""
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
    auth_check = requests.get(dl_url, auth=(username.get(), password.get()), verify=False, headers={'User-Agent': user_agent})
    dnl_sum = 1024
    local_filename = dl_url.split('/')[-1]
    complete_name = os.path.join(local_save_path, local_filename)
    # Get file size
    r = requests.head(dl_url, auth=(username.get(), password.get()), verify=False, headers={'User-Agent': user_agent})
    try:
        dl_file_size = int(r.headers['content-length'])
        file_size.set(str(int(int(r.headers['content-length']) * (10 ** -6))) + "MB")
        c = 1
    except KeyError:
        c = 0
    # NOTE the stream=True parameter
    print('1')
    r = requests.get(dl_url, stream=True, auth=(username.get(), password.get()), verify=False, headers={'User-Agent': user_agent})
    print('2')
    while True:
        try:
            with open(complete_name, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
                        f.flush()
                        if c == 1:
                            download_perc.set(percentage(dl_file_size, dnl_sum))
                        elif c == 0:
                            print(dnl_sum)
                        dnl_sum = os.path.getsize(complete_name)
        except FileNotFoundError:
            continue
        break
    return
Have you tried using a proxy?
You can use Tor; it gives you a dynamic IP address, so the website can't recognize you.
Try this: https://techoverflow.net/blog/2015/02/06/using-python-requests-over-tor/
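For example, a minimal sketch of routing requests through a local Tor client (this assumes Tor is listening on its default SOCKS port 9050 and that you have installed the SOCKS extra via pip install requests[socks]; the download URL is a placeholder):
import requests

# 'socks5h' (note the h) makes DNS resolution also go through Tor.
proxies = {
    "http": "socks5h://127.0.0.1:9050",
    "https": "socks5h://127.0.0.1:9050",
}
r = requests.get("https://example.com/some/file.pdf",  # placeholder URL
                 proxies=proxies,
                 headers={"User-Agent": "Mozilla/5.0"})
r.raise_for_status()
with open("file.pdf", "wb") as f:
    f.write(r.content)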
