I am trying to download all the images of a particular Wikipedia page. Here is the code snippet:
from bs4 import BeautifulSoup as bs
import urllib2
import urlparse
from urllib import urlretrieve
site="http://en.wikipedia.org/wiki/Pune"
hdr= {'User-Agent': 'Mozilla/5.0'}
outpath=""
req = urllib2.Request(site, headers=hdr)
page = urllib2.urlopen(req)
soup = bs(page)
tag_image = soup.findAll("img")
for image in tag_image:
    print "Image: %(src)s" % image
    urlretrieve(image["src"], "/home/mayank/Desktop/test")
After running the program, I see an error with the following stack trace:
Image: //upload.wikimedia.org/wikipedia/commons/thumb/0/04/Pune_Montage.JPG/250px-Pune_Montage.JPG
Traceback (most recent call last):
  File "download_images.py", line 15, in <module>
    urlretrieve(image["src"], "/home/mayank/Desktop/test")
  File "/usr/lib/python2.7/urllib.py", line 93, in urlretrieve
    return _urlopener.retrieve(url, filename, reporthook, data)
  File "/usr/lib/python2.7/urllib.py", line 239, in retrieve
    fp = self.open(url, data)
  File "/usr/lib/python2.7/urllib.py", line 207, in open
    return getattr(self, name)(url)
  File "/usr/lib/python2.7/urllib.py", line 460, in open_file
    return self.open_ftp(url)
  File "/usr/lib/python2.7/urllib.py", line 543, in open_ftp
    ftpwrapper(user, passwd, host, port, dirs)
  File "/usr/lib/python2.7/urllib.py", line 864, in __init__
    self.init()
  File "/usr/lib/python2.7/urllib.py", line 870, in init
    self.ftp.connect(self.host, self.port, self.timeout)
  File "/usr/lib/python2.7/ftplib.py", line 132, in connect
    self.sock = socket.create_connection((self.host, self.port), self.timeout)
  File "/usr/lib/python2.7/socket.py", line 571, in create_connection
    raise err
IOError: [Errno ftp error] [Errno 111] Connection refused
Please help me figure out what is causing this error.
// is shorthand for the current protocol (a protocol-relative URL). Wikipedia uses this shorthand, so you have to prepend the scheme explicitly; without it, urllib does not recognize the value as an HTTP URL and falls through to an FTP-style open, which is the error you see:
for image in tag_image:
    src = 'http:' + image['src']
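Putting it together, a minimal sketch of the fixed loop (the save directory is kept from the question; deriving a per-image filename is my assumption, since urlretrieve needs a full file path rather than a directory):

import os
import urlparse

for image in tag_image:
    # Prepend the scheme to the protocol-relative src attribute
    src = 'http:' + image['src']
    # Derive a filename from the URL path (assumption: one file per image)
    filename = os.path.basename(urlparse.urlsplit(src).path)
    urlretrieve(src, os.path.join('/home/mayank/Desktop', filename))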
Related
I am trying to run this code:
import urllib
htmlfile = urllib.urlopen("https://www.google.co.in/?gfe_rd=cr&ei=7VzrV6WWG8KC0ATxor_IDw")
htmltext = htmlfile.read()
print htmltext
But the following error is shown when I run the code:
Traceback (most recent call last):
  File "C:\Python27\newscrap.py", line 2, in <module>
    htmlfile = urllib.urlopen("https://www.google.co.in/?gfe_rd=cr&ei=7VzrV6WWG8KC0ATxor_IDw")
  File "C:\Python27\lib\urllib.py", line 87, in urlopen
    return opener.open(url)
  File "C:\Python27\lib\urllib.py", line 213, in open
    return getattr(self, name)(url)
  File "C:\Python27\lib\urllib.py", line 443, in open_https
    h.endheaders(data)
  File "C:\Python27\lib\httplib.py", line 997, in endheaders
    self._send_output(message_body)
  File "C:\Python27\lib\httplib.py", line 850, in _send_output
    self.send(msg)
  File "C:\Python27\lib\httplib.py", line 812, in send
    self.connect()
  File "C:\Python27\lib\httplib.py", line 1208, in connect
    HTTPConnection.connect(self)
  File "C:\Python27\lib\httplib.py", line 793, in connect
    self.timeout, self.source_address)
  File "C:\Python27\lib\socket.py", line 553, in create_connection
    for res in getaddrinfo(host, port, 0, SOCK_STREAM):
IOError: [Errno socket error] [Errno 11004] getaddrinfo failed
Can anyone tell me why this error occurs?
It's a connection problem on your side (your code works on my machine). Check your firewall, proxy settings, DNS server, and other connection settings.
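To narrow it down, here is a minimal diagnostic sketch (the proxy address is a placeholder, not something taken from your setup):

import socket
import urllib

# 1. Check whether DNS resolution works at all
# (errno 11004 / getaddrinfo failed means the name lookup itself failed)
try:
    print socket.gethostbyname('www.google.co.in')
except socket.error as e:
    print 'DNS lookup failed:', e

# 2. If you sit behind a proxy, tell urllib about it explicitly
# (proxy.example.com:8080 is a placeholder -- use your real proxy)
proxies = {'http': 'http://proxy.example.com:8080'}
htmlfile = urllib.urlopen('http://www.google.co.in/', proxies=proxies)
print htmlfile.read()[:200]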
I am writing a Python program to fetch all hyperlinks from a given URL.
It worked fine when I ran it at home and in my hostel room, on an internet connection without a proxy, but when I tried running the program on my university network, which has a proxy, I got this error:
Traceback (most recent call last):
  File "myurl.py", line 26, in <module>
    main()
  File "myurl.py", line 24, in main
    process(url)
  File "myurl.py", line 7, in process
    page = urllib.urlopen(url)
  File "/usr/lib/python2.7/urllib.py", line 84, in urlopen
    return opener.open(url)
  File "/usr/lib/python2.7/urllib.py", line 205, in open
    return getattr(self, name)(url)
  File "/usr/lib/python2.7/urllib.py", line 342, in open_http
    h.endheaders(data)
  File "/usr/lib/python2.7/httplib.py", line 940, in endheaders
    self._send_output(message_body)
  File "/usr/lib/python2.7/httplib.py", line 803, in _send_output
    self.send(msg)
  File "/usr/lib/python2.7/httplib.py", line 755, in send
    self.connect()
  File "/usr/lib/python2.7/httplib.py", line 736, in connect
    self.timeout, self.source_address)
  File "/usr/lib/python2.7/socket.py", line 551, in create_connection
    for res in getaddrinfo(host, port, 0, SOCK_STREAM):
IOError: [Errno socket error] [Errno -2] Name or service not known
My program code is:
import sys
import urllib
import urlparse
from bs4 import BeautifulSoup

def process(url):
    page = urllib.urlopen(url)
    text = page.read()
    page.close()
    soup = BeautifulSoup(text)
    with open('s.txt', 'w') as file:
        for tag in soup.findAll('a', href=True):
            tag['href'] = urlparse.urljoin(url, tag['href'])
            print tag['href']
            file.write('\n')
            file.write(tag['href'])

def main():
    if len(sys.argv) == 1:
        print 'No url !!'
        sys.exit(1)
    for url in sys.argv[1:]:
        process(url)

main()
The code works fine without a proxy involved, so the problem is not your code so much as your other network variables: an untrusted proxy, system settings governing your connection to that proxy, and so on. Eliminate your system and the proxy in question from the equation, and test further.
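As one concrete step, you can point urllib at the university proxy explicitly instead of relying on the environment; a minimal sketch (the proxy host and port are placeholders you would replace with your network's actual values):

import urllib

# Placeholder proxy address -- substitute your university's proxy here
proxies = {'http': 'http://proxy.university.example:3128'}

def process(url):
    # Passing proxies explicitly overrides whatever the environment has set
    page = urllib.urlopen(url, proxies=proxies)
    text = page.read()
    page.close()
    return text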
I am new to Python and trying to run the attached code.
import urllib
url = 'https://www.google.com/'
print(url)
sock = urllib.urlopen(url)
But I get the error below when I run it:
>>> runfile('C:/Users/myname/Documents/Python Scripts/ex00.py', wdir=r'C:/Users/myname/Documents/Python Scripts')
https://www.google.com/
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "C:\Users\myname\AppData\Local\Continuum\Anaconda\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 540, in runfile
    execfile(filename, namespace)
  File "C:/Users/myname/Documents/Python Scripts/ex00.py", line 5, in <module>
    sock = urllib.urlopen(url)
  File "C:\Users\myname\AppData\Local\Continuum\Anaconda\lib\urllib.py", line 87, in urlopen
    return opener.open(url)
  File "C:\Users\myname\AppData\Local\Continuum\Anaconda\lib\urllib.py", line 208, in open
    return getattr(self, name)(url)
  File "C:\Users\myname\AppData\Local\Continuum\Anaconda\lib\urllib.py", line 437, in open_https
    h.endheaders(data)
  File "C:\Users\myname\AppData\Local\Continuum\Anaconda\lib\httplib.py", line 969, in endheaders
    self._send_output(message_body)
  File "C:\Users\myname\AppData\Local\Continuum\Anaconda\lib\httplib.py", line 829, in _send_output
    self.send(msg)
  File "C:\Users\myname\AppData\Local\Continuum\Anaconda\lib\httplib.py", line 791, in send
    self.connect()
  File "C:\Users\myname\AppData\Local\Continuum\Anaconda\lib\httplib.py", line 1172, in connect
    self.timeout, self.source_address)
  File "C:\Users\myname\AppData\Local\Continuum\Anaconda\lib\socket.py", line 571, in create_connection
    raise err
IOError: [Errno socket error] [Errno 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond
Earlier I thought it might be because I was running the script from my office and the VPN was causing issues, so I ran it from my home internet connection instead, but no resolution :(
from urllib import urlopen
from bs4 import BeautifulSoup
import re
# Copy all of the content from the provided web page
webpage = urlopen('http://stats.espncricinfo.com/indian-premier-league-2012/engine/records/averages/batting.html?id=6680;type=tournament').read()
soup = BeautifulSoup(webpage)
commentary = soup.find_all("tr", "data2")
for i in range(10):
    for stat in commentary[i].stripped_strings:
        print stat,
    print ""
I am running this Python program in Eclipse. I have changed my proxy entries in the network connections, but I am getting an IOError as below:
IOError: [Errno socket error] [Errno -2] Name or service not known
Traceback (most recent call last):
  File "/home/sumanth/workspace/python/scraping.py", line 22, in <module>
    webpage = urlopen('http://stats.espncricinfo.com/indian-premier-league-2012/engine/records/averages/batting.html?id=6680;type=tournament').read()
  File "/usr/lib/python2.7/urllib.py", line 86, in urlopen
    return opener.open(url)
  File "/usr/lib/python2.7/urllib.py", line 207, in open
    return getattr(self, name)(url)
  File "/usr/lib/python2.7/urllib.py", line 344, in open_http
    h.endheaders(data)
  File "/usr/lib/python2.7/httplib.py", line 958, in endheaders
    self._send_output(message_body)
  File "/usr/lib/python2.7/httplib.py", line 818, in _send_output
    self.send(msg)
  File "/usr/lib/python2.7/httplib.py", line 780, in send
    self.connect()
  File "/usr/lib/python2.7/httplib.py", line 761, in connect
    self.timeout, self.source_address)
  File "/usr/lib/python2.7/socket.py", line 571, in create_connection
    raise err
IOError: [Errno socket error] [Errno 110] Connection timed out
It looks like you have a flaky internet connection. The "Name or service not known" error means the DNS lookup for the page failed; the "Connection timed out" error means the DNS lookup succeeded but you were unable to contact the remote server.
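Since the failure mode varies between runs, a simple retry wrapper may help; a minimal sketch (the retry count and delay are arbitrary choices, not from the question):

import time
from urllib import urlopen

def fetch_with_retry(url, attempts=3):
    for attempt in range(attempts):
        try:
            return urlopen(url).read()
        except IOError as e:
            # IOError covers both the DNS failure and the connect timeout here
            print 'Attempt %d failed: %s' % (attempt + 1, e)
            time.sleep(2)
    raise IOError('giving up on %s after %d attempts' % (url, attempts))

webpage = fetch_with_retry('http://stats.espncricinfo.com/indian-premier-league-2012/engine/records/averages/batting.html?id=6680;type=tournament')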
I am not able to open a URL for read() using urllib or urllib2, even after using a ProxyHandler (in the case of urllib2) and setting proxies in urllib.
My network connects to the internet through a proxy; the proxy settings (taken from my browser) are:
HTTP Proxy: someproxy.com Port: 1080
I have tried urllib:
import urllib
myproxies = {'http':'http://someproxy.com:1080'}
data = urllib.urlopen('http://www.google.com', proxies = myproxies).read()
but I am receiving this error:
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "C:\Python27\lib\urllib.py", line 84, in urlopen
    return opener.open(url)
  File "C:\Python27\lib\urllib.py", line 200, in open
    return self.open_unknown_proxy(proxy, fullurl, data)
  File "C:\Python27\lib\urllib.py", line 219, in open_unknown_proxy
    raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
IOError: [Errno socket error] [Errno 11001] getaddrinfo failed
and for urllib2:
import urllib2
proxy = urllib2.ProxyHandler({'http':'http://someproxy.com:1080'})
opener1 = urllib2.build_opener(proxy)
urllib2.install_opener(opener1)
urllib2.urlopen('http://www.google.com')
I am getting this error:
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "C:\Python27\lib\urllib2.py", line 126, in urlopen
    return _opener.open(url, data, timeout)
  File "C:\Python27\lib\urllib2.py", line 394, in open
    response = self._open(req, data)
  File "C:\Python27\lib\urllib2.py", line 412, in _open
    '_open', req)
  File "C:\Python27\lib\urllib2.py", line 372, in _call_chain
    result = func(*args)
  File "C:\Python27\lib\urllib2.py", line 1199, in http_open
    return self.do_open(httplib.HTTPConnection, req)
  File "C:\Python27\lib\urllib2.py", line 1174, in do_open
    raise URLError(err)
urllib2.URLError: <urlopen error [Errno 11001] getaddrinfo failed>
Any help will be greatly appreciated.
I think you want the following for urllib:
...
proxies = {'http':'http://someproxy.com:1080/'}
data = urllib.urlopen('http://www.google.com', proxies=proxies).read()
...
or this for urllib2:
...
proxy = urllib2.ProxyHandler({'http':'http://someproxy.com:1080'})
...
Note that the proxy URL includes the protocol part (http://) and, for urllib, a trailing slash. Also make sure the proxy hostname itself resolves on your network: an unresolvable proxy address produces exactly the getaddrinfo errors shown above.
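For completeness, a minimal end-to-end sketch with urllib2 (someproxy.com:1080 is kept from the question; substitute your real proxy address, or the getaddrinfo error will persist):

import urllib2

# Route HTTP traffic through the proxy; the proxy URL includes its scheme
proxy = urllib2.ProxyHandler({'http': 'http://someproxy.com:1080/'})
opener = urllib2.build_opener(proxy)
urllib2.install_opener(opener)

response = urllib2.urlopen('http://www.google.com')
print response.read()[:200]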