Download a picture from certain URL with Python - python

I learned how to download a picture from a certain URL with python as:
# Python 2 example: download an image from a URL and save it locally.
import urllib
imgurl="http://www.digimouth.com/news/media/2011/09/google-logo.jpg"
# urllib.urlopen (Python 2 API) returns a file-like handle over the HTTP response.
resource = urllib.urlopen(imgurl)
# "wb" because image data is binary.
output = open("test.jpg","wb")
output.write(resource.read())
output.close()
and it worked well, but when I changed the URL to
imgurl="http://farm1.static.flickr.com/96/242125326_607a826afe_o.jpg"
it did not work, and gave the information
File "face_down.py", line 3, in <module>
resource = urllib2.urlopen(imgurl)
File "D:\Python27\another\Lib\urllib2.py", line 154, in urlopen
return opener.open(url, data, timeout)
File "D:\Python27\another\Lib\urllib2.py", line 431, in open
response = self._open(req, data)
File "D:\Python27\another\Lib\urllib2.py", line 449, in _open
'_open', req)
File "D:\Python27\another\Lib\urllib2.py", line 409, in _call_chain
result = func(*args)
File "D:\Python27\another\Lib\urllib2.py", line 1227, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "D:\Python27\another\Lib\urllib2.py", line 1197, in do_open
raise URLError(err)
urllib2.URLError: <urlopen error [Errno 10060] >
and when I tried to open the latter image URL in a browser, it displayed just like the former one. I have no idea how to solve this — any help would be appreciated.

You can try using requests module. The response will be some bytes. So, you can iterate over those byte chunks and write to the file.
import requests

# Download the image, streaming it to disk so a large file is never held
# fully in memory.
url = "http://farm1.static.flickr.com/96/242125326_607a826afe_o.jpg"
r = requests.get(url, stream=True)
# Fail fast on 4xx/5xx instead of silently saving an HTML error page as a .jpg.
r.raise_for_status()
path = "filename.jpg"
with open(path, 'wb') as f:
    # iter_content with an explicit chunk size is clearer and faster than
    # iterating the Response directly, which yields tiny 128-byte chunks.
    for chunk in r.iter_content(chunk_size=8192):
        f.write(chunk)

I looked up both of the addresses and the second one does not lead anywhere. That is probably the problem.
import urllib
imgurl="webpage url"
openimg = urllib.urlopen(imgurl) #open the URL (Python 2 urllib); returns a file-like response
img = open("test.jpg","wb") #open the local file for binary *writing* (not reading)
img.write(openimg.read()) #write the downloaded bytes to test.jpg (nothing is printed)
img.close() #close the output file
Try the link again in your webpage and if it turns up with "webpage not available" that is probably the problem.

Related

Python quits unexpectedly. PyQt5 QProgressBar is not showing and ValueError: unknown url type [duplicate]

I have searched a lot of similar question on SO, but did not find an exact match to my case.
I am trying to download a video using python 2.7
Here is my code for downloading the video
# Python 2 scraper: find a video id on a watch page, extract the media URL
# from an inline <script> block, and download the video.
# NOTE(review): the original indentation was lost when this snippet was pasted;
# the bodies of the `with`/`for`/`if` statements below appear flush-left.
import urllib2
from bs4 import BeautifulSoup as bs
# video.txt holds the id of the last downloaded video.
with open('video.txt','r') as f:
last_downloaded_video = f.read()
webpage = urllib2.urlopen('http://*.net/watch/**-'+last_downloaded_video)
soup = bs(webpage)
# Collect every <a> tag carrying a data-video-id attribute.
a = []
for link in soup.find_all('a'):
if link.has_attr('data-video-id'):
a.append(link)
#try just with first data-video-id
id = a[0]['data-video-id']  # NOTE(review): shadows builtin `id`; IndexError if no tag matched
webpage2 = urllib2.urlopen('http://*/video/play/'+id)
soup = bs(webpage2)
# The media URL is assumed to be embedded in the third <script> element.
string = str(soup.find_all('script')[2])
print string
# Fragile parse: take the text between the first ': ' and the next ','.
# NOTE(review): this slice keeps the surrounding quote characters, which is
# why urlopen later fails with "unknown url type: 'http".
url = string.split(': ')[1].split(',')[0]
url = url.replace('"','')  # strips double quotes only; single quotes survive
print url
print type(url)
video = urllib2.urlopen(url).read()
filename = "video.mp4"
with open(filename,'wb') as f:
f.write(video)
This code gives an unknown url type error. The traceback is
Traceback (most recent call last):
File "naruto.py", line 26, in <module>
video = urllib2.urlopen(url).read()
File "/usr/lib/python2.7/urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 404, in open
response = self._open(req, data)
File "/usr/lib/python2.7/urllib2.py", line 427, in _open
'unknown_open', req)
File "/usr/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 1247, in unknown_open
raise URLError('unknown url type: %s' % type)
urllib2.URLError: <urlopen error unknown url type: 'http>
However, when i store the same url in a variable and attempt to download it from terminal, no error is shown.
I am confused as to what the problem is.
I got a similar question in python mailing list
It's hard to tell without seeing the HTML from the page that you are scraping, however, a stray ' (single quote) character at the beginning of the URL might be the cause - this causes the same exception:
>>> import urllib2
>>> urllib2.urlopen("'http://blah.com")
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "urllib2.py", line 404, in open
response = self._open(req, data)
File "urllib2.py", line 427, in _open
'unknown_open', req)
File "urllib2.py", line 382, in _call_chain
result = func(*args)
File "urllib2.py", line 1249, in unknown_open
raise URLError('unknown url type: %s' % type)
urllib2.URLError: <urlopen error unknown url type: 'http>
So, try cleaning up your URL and remove any stray quotes.
Update after OP feedback:
The results of the print statement indicate that the URL has a single quote character at the beginning and end of the URL string. There should not any quotes of any type surrounding the URL when it is passed to urlopen(). You can remove leading and trailing quotes (both single and double) from the URL string with this:
url = url.strip('\'"')

Python download images with alternating variables

I was trying to download images with url's that change but got an error.
# Python 3: fetch a movie-poster image, sending a browser-like User-Agent.
url_image="http://www.joblo.com/timthumb.php?src=/posters/images/full/"+str(title_2)+"-poster1.jpg&h=333&w=225"
user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
headers = {'User-Agent': user_agent}
# Build a Request object so custom headers can be attached.
req = urllib.request.Request(url_image, None, headers)
print(url_image)
#image, h = urllib.request.urlretrieve(url_image)
with urllib.request.urlopen(req) as response:
the_page = response.read()  # NOTE(review): belongs indented under the `with`; indentation lost in paste
#print (the_page)
with open('poster.jpg', 'wb') as f:
f.write(the_page)  # NOTE(review): likewise belongs inside the second `with` block
Traceback (most recent call last):
File "C:\Users\luke\Desktop\scraper\imager finder.py", line 97, in
with urllib.request.urlopen(req) as response:
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 162, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 465, in open
response = self._open(req, data)
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 483, in _open
'_open', req)
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 443, in _call_chain
result = func(*args)
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1268, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1243, in do_open
r = h.getresponse()
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 1174, in getresponse
response.begin()
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 282, in begin
version, status, reason = self._read_status()
File "C:\Users\luke\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 264, in _read_status
raise BadStatusLine(line)
http.client.BadStatusLine:
My advice is to use urllib2. In addition, I've written a nice function (I think) that will also allow gzip encoding (reduce bandwidth) if the server supports it. I use this for downloading social media files, but should work for anything.
I would try to debug your code, but since it's just a snippet (and the error messages are formatted badly), it's hard to know exactly where your error is occurring (it's certainly not line 97 in your code snippet).
This isn't as short as it could be, but it's clear and reusable. This is python 2.7, it looks like you're using 3 - in which case you google some other questions that address how to use urllib2 in python 3.
import urllib2
import gzip
from StringIO import StringIO
def download(url):
    """
    Download and return the body of *url* as a byte string.

    Asks the server for gzip encoding to reduce bandwidth and transparently
    decompresses the payload when the server honours the request.

    Raises IOError if the request fails for any reason.
    """
    request = urllib2.Request(url)
    request.add_header('Accept-Encoding', 'gzip')
    try:
        response = urllib2.urlopen(request)
    except Exception as e:
        # Bug fix: the original referenced an undefined _ERRORS table here,
        # which turned every download failure into a NameError instead of
        # the intended IOError. (`except ... as` is valid on Python 2.6+.)
        raise IOError("download failed (%s) %s" % (url, e))
    payload = response.read()
    # The server may ignore Accept-Encoding; only gunzip when it didn't.
    if response.info().get('Content-Encoding') == 'gzip':
        buf = StringIO(payload)
        f = gzip.GzipFile(fileobj=buf)
        payload = f.read()
    return payload
def save_media(filename, media):
    """Write the downloaded *media* bytes to *filename*."""
    # Context manager guarantees the handle is closed, even on error.
    with open(filename, "wb") as out:
        out.write(media)
# Example usage: fetch one poster and save it next to the script.
title_2 = "10-cloverfield-lane"
media = download("http://www.joblo.com/timthumb.php?src=/posters/images/full/{}-poster1.jpg&h=333&w=225".format(title_2))
save_media("poster.jpg", media)

Script to download stock price data from yahoo finance, randomly 404s

the following script reads a .txt of ticker symbols of companies to download the corresponding financial informations in a .csv . The data is pulled from Yahoo Finance and saved in a local directory.
import time
import urllib.error
import urllib.request

import requests
#Define the URL to download the .csv from.
url_begin = "http://real-chart.finance.yahoo.com/table.csv?s="
url_end = "&a=00&b=1&c=1950&d=11&e=31&f=2050&g=d&ignore=.csv"
#Function that reads all available ticker symbols from ticker_daten.txt. This file should be in the same directory as the program.
def readTickers(file):
    """Return a list of ticker symbols, one per line of *file*.

    A single trailing newline is stripped from each line; otherwise lines
    are kept verbatim (including empty lines), matching the original logic.
    """
    # Parameter name kept for backward compatibility even though it shadows
    # a builtin. `with` guarantees the handle is closed even if reading
    # raises, unlike the original open()/close() pair.
    with open(file, "r") as ins:
        return [line[:-1] if line.endswith('\n') else line for line in ins]
#File location for tickersymbols to download
tickers = readTickers("C:/Users/Win7ADM/Desktop/Jonas/stock-price-leecher/ticker_daten.txt")
#Loop through list of ticker symbols and download .csv's .
for i in tickers:
    #Forge downloadable link.
    link_created = url_begin + i + url_end
    #Make sure that the link actually leads to a file.
    try:
        r = requests.head(link_created)
        if r.status_code==404:
            print(str(r.status_code)+": No page found!")
            time.sleep(0.5)
        else:
            print(link_created)
            #Finally download the file, if it does exist.
            # Bug fix for the "random" crashes: the HEAD probe can succeed
            # while the subsequent GET still 404s (servers may answer HEAD
            # and GET differently), and urlretrieve then raises
            # urllib.error.HTTPError -- which none of the requests.*
            # handlers below catch. Handle it here so one bad ticker no
            # longer aborts the whole run.
            try:
                urllib.request.urlretrieve(link_created, "C:/Users/Win7ADM/Desktop/Jonas/stock-price-leecher/data/"+i+".csv")
            except urllib.error.HTTPError as e:
                print("HTTP Error "+str(e.code)+" while downloading "+i+" - skipped.")
            time.sleep(0.5)
    except requests.ConnectionError:
        #A Connection error occurred.
        print ("ConnectionError: 404 No page found!")
    except requests.HTTPError:
        #An HTTP error occurred.
        print ("HTTPError!")
    except requests.Timeout:
        #Connection timed out.
        print ("Timeout!")
The Problem: The script crashes randomly after loading between 20-1750 .csv's. The crash produces the following output.
Process started >>>
http://real-chart.finance.yahoo.com/table.csv?s=0055.HK&a=00&b=1&c=1950&d=11&e=31&f=2050&g=d&ignore=.csv
http://real-chart.finance.yahoo.com/table.csv?s=0056.HK&a=00&b=1&c=1950&d=11&e=31&f=2050&g=d&ignore=.csv
http://real-chart.finance.yahoo.com/table.csv?s=0057.HK&a=00&b=1&c=1950&d=11&e=31&f=2050&g=d&ignore=.csv
http://real-chart.finance.yahoo.com/table.csv?s=0058.HK&a=00&b=1&c=1950&d=11&e=31&f=2050&g=d&ignore=.csv
Traceback (most recent call last):
File "Stock-Price Leecher.py", line 40, in <module>
urllib.request.urlretrieve(link_created, "C:/Users/Win7ADM/Desktop/Jonas/stock-price-leecher/data/"+i+".csv")
File "c:\Python34\lib\urllib\request.py", line 178, in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
File "c:\Python34\lib\urllib\request.py", line 153, in urlopen
return opener.open(url, data, timeout)
File "c:\Python34\lib\urllib\request.py", line 461, in open
response = meth(req, response)
File "c:\Python34\lib\urllib\request.py", line 571, in http_response
'http', request, response, code, msg, hdrs)
File "c:\Python34\lib\urllib\request.py", line 499, in error
return self._call_chain(*args)
File "c:\Python34\lib\urllib\request.py", line 433, in _call_chain
result = func(*args)
File "c:\Python34\lib\urllib\request.py", line 579, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 404: Not Found
<<< Process finished. (Exit code 1)
================ READY ================
Does anyone of you have any Idea why this might happen?

Transcribing sound files to text in python and google speech api

I have a bunch of files in wav. I made a simple script to convert them to flac so I can use it with the google speech api. Here is the python code:
# Python 2: POST a FLAC recording to Google's (unofficial) speech API.
import urllib2
url = "https://www.google.com/speech-api/v1/recognize?client=chromium&lang=en-US"
# Read the whole audio file as raw bytes.
audio = open('somefile.flac','rb').read()
# rate must match the FLAC sample rate; the User-Agent mimics a browser.
headers={'Content-Type': 'audio/x-flac; rate=16000', 'User-Agent':'Mozilla/5.0'}
# Passing data= makes urllib2 issue a POST rather than a GET.
request = urllib2.Request(url, data=audio, headers=headers)
response = urllib2.urlopen(request)
print response.read()
However I am getting this error:
Traceback (most recent call last):
File "transcribe.py", line 7, in <module>
response = urllib2.urlopen(request)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 392, in open
response = self._open(req, data)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 410, in _open
'_open', req)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 370, in _call_chain
result = func(*args)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 1194, in https_open
return self.do_open(httplib.HTTPSConnection, req)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 1161, in do_open
raise URLError(err)
urllib2.URLError: <urlopen error [Errno 32] Broken pipe>
I thought at first that it was because the file is too big. But I recorded myself for 5 seconds and it still does the same.
I don't think Google has released the API yet, so it's hard to understand why it's failing.
Is there any other good speech-to-text api out there that can be used in either Python or Node?
----- Editing for my attempt with requests:
import json
import requests
url = 'https://www.google.com/speech-api/v1/recognize?client=chromium&lang=en-US'
# NOTE(review): passing the file object via data= form-encodes the request
# body -- it does not send the raw audio bytes the API expects.
data = {'file': open('file.flac', 'rb')}
headers = {'Content-Type': 'audio/x-flac; rate=16000', 'User-Agent':'Mozilla/5.0'}
r = requests.post(url, data=data, headers=headers)
# r = requests.post(url, files=data, headers=headers) ## does not work either
# r = requests.post(url, data=open('file.flac', 'rb').read(), headers=headers) ## does not work either
print r.text
Produced the same problem as above.
The API Accepts HTTP POST requests. You're using a HTTP GET Request here. This can be confirmed by loading the URI in your code directly into a browser:
HTTP method GET is not supported by this URL
Error 405
Also, I'd recommend using the requests python library. See http://www.python-requests.org/en/latest/user/quickstart/#post-a-multipart-encoded-file
Lastly, it seems that the API only accepts segments up to 15 seconds long. Perhaps your error is the file is too large? If you can upload an example flac file, perhaps we could diagnose further.

I'm getting an error when trying to open a website url with Python 3.1, urllib & json: operation was attempted on something that is not a socket

I'm getting an error when trying to open a website url with Python 3.1, urllib & json
urllib.error.URLError:
Here's the code. The first website loads fine; the second one raises the error shown below.
import json
import urllib.request
import urllib.parse
import util
# This one works fine (plain HTTP)
response = urllib.request.urlopen('http://python.org/')
html = response.read()
print(html)
# parms - CSV filename, company, ....
p_filename = "c:\\temp\\test.csv"
jg_token = "zzzzzzzzzzzzzzzzzzzzzzzzz"  # Jigsaw REST API token (redacted)
jg_proto = "https://"
jg_webst = "www.jigsaw.com/rest/"
jg_cmd_searchContact = "searchContact.json"
# Query-string key names accepted by the searchContact endpoint.
jg_key_companyName = "companyName"
jg_key_levels = "levels"
jg_key_departments = "departments"
jg_args = {
"token":jg_token,
jg_key_companyName: "Technical Innovations",
jg_key_departments: "HR"
}
# urlencode builds the ?token=...&companyName=...&departments=... query string.
jg_url = jg_proto + jg_webst + jg_cmd_searchContact + "?" + urllib.parse.urlencode(jg_args)
# This one generates the error -- presumably the HTTPS layer on this
# Python 3.1/Windows build, not the JSON parsing (see answer below traceback).
result = json.load(urllib.request.urlopen(jg_url))
urllib.error.URLError:
File "c:\dev\xdev\PyJigsaw\searchContact.py", line 46, in
result = json.load(urllib.request.urlopen(jg_url))
File "c:\dev\tdev\Python31\Lib\urllib\request.py", line 121, in urlopen
return _opener.open(url, data, timeout)
File "c:\dev\tdev\Python31\Lib\urllib\request.py", line 349, in open
response = self._open(req, data)
File "c:\dev\tdev\Python31\Lib\urllib\request.py", line 367, in _open
'_open', req)
File "c:\dev\tdev\Python31\Lib\urllib\request.py", line 327, in _call_chain
result = func(*args)
File "c:\dev\tdev\Python31\Lib\urllib\request.py", line 1098, in https_open
return self.do_open(http.client.HTTPSConnection, req)
File "c:\dev\tdev\Python31\Lib\urllib\request.py", line 1075, in do_open
raise URLError(err)
Please edit the title and tags and maybe even the question body:
This has nothing to do with JSON and everything to do with Windows. It's also at a lower level than urllib. (Probably in the SSL code.) Distilled:
Both of the following approaches fail on Python 3.1.2 for Vista, but work fine on Linux (Python 3.1.3)
print( HTTPSConnection(hostname).request('GET',url).getresponse().read() )
print( urllib.request.urlopen('https://'+hostname+url).read() )
Change them to not use SSL, and then they work fine on Windows:
print( HTTPConnection(hostname).request('GET',url).getresponse().read() )
print( urllib.request.urlopen('http://'+hostname+url).read() )
On Vista, I just upgraded from Python 3.1.2 to Python 3.2 and this is no longer a problem.
The following now works just fine:
print( urllib.request.urlopen('https://'+hostname+url).read() )

Categories

Resources