Why doesn't this request work? - python

I want to make a simple, stupid Twitter app using the Twitter API.
If I request this page from my browser it works:
http://search.twitter.com/search.atom?q=hello&rpp=10&page=1
but if I request the same page from Python using urllib or urllib2, most of the time it doesn't work:
response = urllib2.urlopen("http://search.twitter.com/search.atom?q=hello&rpp=10&page=1")
and I get this error:
Traceback (most recent call last):
File "twitter.py", line 24, in <module>
response = urllib2.urlopen("http://search.twitter.com/search.atom?q=hello&rpp=10&page=1")
File "/usr/lib/python2.6/urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib/python2.6/urllib2.py", line 391, in open
response = self._open(req, data)
File "/usr/lib/python2.6/urllib2.py", line 409, in _open
'_open', req)
File "/usr/lib/python2.6/urllib2.py", line 369, in _call_chain
result = func(*args)
File "/usr/lib/python2.6/urllib2.py", line 1161, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "/usr/lib/python2.6/urllib2.py", line 1136, in do_open
raise URLError(err)
urllib2.URLError: <urlopen error [Errno 110] Connection timed out>
Why?

The code seems alright. The following worked for me:
>>> import urllib
>>> import urllib2
>>> user_agent = 'curl/7.21.1 (x86_64-apple-darwin10.4.0) libcurl/7.21.1'
>>> url='http://search.twitter.com/search.atom?q=hello&rpp=10&page=1'
>>> headers = { 'User-Agent' : user_agent }
>>> req = urllib2.Request(url, None, headers)
>>> response = urllib2.urlopen(req)
>>> the_page = response.read()
>>> print the_page
The other possibility is that Twitter simply failed to respond. That happens all too often with Twitter.

Did you change the default socket timeout somewhere in your script? Your example code works reliably for me.
It could be your internet connection, or you might try
import socket
socket.setdefaulttimeout(30)
assuming urllib/urllib2 don't override the socket timeout.
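Alternatively, on Python 2.6+ you can pass a per-request timeout straight to urlopen rather than changing the global socket default. A minimal sketch, reusing the search URL from the question:
import urllib2

url = "http://search.twitter.com/search.atom?q=hello&rpp=10&page=1"
try:
    # timeout is in seconds and applies to this request only; on expiry
    # urlopen raises urllib2.URLError, as in the traceback above.
    response = urllib2.urlopen(url, timeout=30)
    print response.read()
except urllib2.URLError as e:
    print "request failed:", e.reason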

Related

how to get raw html text of a given url using python

I'm using html2text in Python to get the raw text (tags included) of an HTML page from an arbitrary URL, but I'm getting an error.
My code:
import html2text
import urllib2
proxy = urllib2.ProxyHandler({'http': 'http://<proxy>:<pass>@<ip>:<port>'})
auth = urllib2.HTTPBasicAuthHandler()
opener = urllib2.build_opener(proxy, auth, urllib2.HTTPHandler)
urllib2.install_opener(opener)
html = urllib2.urlopen("http://www.ndtv.com/india-news/this-stunt-for-a-facebook-like-got-the-hyderabad-youth-arrested-740851").read()
print html2text.html2text(html)
The error:
Traceback (most recent call last):
File "t.py", line 8, in <module>
html = urllib2.urlopen("http://www.ndtv.com/india-news/this-stunt-for-a-facebook-like-got-the-hyderabad-youth-arrested-740851").read()
File "/usr/lib/python2.7/urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 404, in open
response = self._open(req, data)
File "/usr/lib/python2.7/urllib2.py", line 422, in _open
'_open', req)
File "/usr/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 1214, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "/usr/lib/python2.7/urllib2.py", line 1184, in do_open
raise URLError(err)
urllib2.URLError: <urlopen error [Errno 110] Connection timed out>
Can anyone explain what I'm doing wrong?
If you don't require SSL, this script in Python 2.7.x should work:
import urllib
url = "http://stackoverflow.com"
f = urllib.urlopen(url)
print f.read()
In Python 3.x, use urllib.request instead of urllib, because urllib2 is Python 2 only; in Python 3 it was merged into urllib.
Note that the http:// prefix is required.
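For what it's worth, a rough Python 3 equivalent of the snippet above (a sketch, assuming the page decodes as UTF-8):
from urllib.request import urlopen

url = "http://stackoverflow.com"
with urlopen(url) as f:                  # urlopen moved to urllib.request in Python 3
    print(f.read().decode("utf-8"))     # read() returns bytes, so decode for text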
EDIT: In 2020, you should use the third-party requests module instead. It can be installed with pip.
import requests
print(requests.get("http://stackoverflow.com").text)
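Since the original question went through an authenticated proxy, note that requests accepts a proxies dict as well. A sketch using the same placeholder credentials as the question (not real values):
import requests

# Placeholder proxy credentials, as in the question above
proxies = {"http": "http://<user>:<pass>@<ip>:<port>"}
r = requests.get("http://www.ndtv.com/india-news/this-stunt-for-a-facebook-like-got-the-hyderabad-youth-arrested-740851",
                 proxies=proxies)
print(r.text)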

Python handling socket.error: [Errno 104] Connection reset by peer

When using Python 2.7 with urllib2 to retrieve data from an API, I get the error [Errno 104] Connection reset by peer. What's causing the error, and how should it be handled so that the script does not crash?
ticker.py
def urlopen(url):
    response = None
    request = urllib2.Request(url=url)
    try:
        response = urllib2.urlopen(request).read()
    except urllib2.HTTPError as err:
        print "HTTPError: {} ({})".format(url, err.code)
    except urllib2.URLError as err:
        print "URLError: {} ({})".format(url, err.reason)
    except httplib.BadStatusLine as err:
        print "BadStatusLine: {}".format(url)
    return response

def get_rate(from_currency="EUR", to_currency="USD"):
    url = "https://finance.yahoo.com/d/quotes.csv?f=sl1&s=%s%s=X" % (
        from_currency, to_currency)
    data = urlopen(url)
    if "%s%s" % (from_currency, to_currency) in data:
        return float(data.strip().split(",")[1])
    return None

counter = 0
while True:
    counter = counter + 1
    if counter==0 or counter%10:
        rateEurUsd = float(get_rate('EUR', 'USD'))
    # does more stuff here
Traceback
Traceback (most recent call last):
File "/var/www/testApp/python/ticker.py", line 71, in <module>
rateEurUsd = float(get_rate('EUR', 'USD'))
File "/var/www/testApp/python/ticker.py", line 29, in get_exchange_rate
data = urlopen(url)
File "/var/www/testApp/python/ticker.py", line 16, in urlopen
response = urllib2.urlopen(request).read()
File "/usr/lib/python2.7/urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 406, in open
response = meth(req, response)
File "/usr/lib/python2.7/urllib2.py", line 519, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python2.7/urllib2.py", line 438, in error
result = self._call_chain(*args)
File "/usr/lib/python2.7/urllib2.py", line 378, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 625, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "/usr/lib/python2.7/urllib2.py", line 406, in open
response = meth(req, response)
File "/usr/lib/python2.7/urllib2.py", line 519, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python2.7/urllib2.py", line 438, in error
result = self._call_chain(*args)
File "/usr/lib/python2.7/urllib2.py", line 378, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 625, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "/usr/lib/python2.7/urllib2.py", line 400, in open
response = self._open(req, data)
File "/usr/lib/python2.7/urllib2.py", line 418, in _open
'_open', req)
File "/usr/lib/python2.7/urllib2.py", line 378, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 1207, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "/usr/lib/python2.7/urllib2.py", line 1180, in do_open
r = h.getresponse(buffering=True)
File "/usr/lib/python2.7/httplib.py", line 1030, in getresponse
response.begin()
File "/usr/lib/python2.7/httplib.py", line 407, in begin
version, status, reason = self._read_status()
File "/usr/lib/python2.7/httplib.py", line 365, in _read_status
line = self.fp.readline()
File "/usr/lib/python2.7/socket.py", line 447, in readline
data = self._sock.recv(self._rbufsize)
socket.error: [Errno 104] Connection reset by peer
error: Forever detected script exited with code: 1
"Connection reset by peer" is the TCP/IP equivalent of slamming the phone back on the hook. It's more polite than merely not replying, leaving one hanging. But it's not the FIN-ACK expected of the truly polite TCP/IP converseur. (From other SO answer)
So you can't do anything about it, it is the issue of the server.
But you could use try .. except block to handle that exception:
from socket import error as SocketError
import errno

try:
    response = urllib2.urlopen(request).read()
except SocketError as e:
    if e.errno != errno.ECONNRESET:
        raise  # Not the error we are looking for
    pass  # Handle the error here.
You can try adding some time.sleep calls to your code.
It seems like the server side limits the number of requests per time unit (hour, day, second) as a security measure. You need to guess how many (maybe using another script with a counter?) and adjust your script to stay under that limit.
To keep your code from crashing, wrap the urllib2 calls in try .. except, as in the sketch below.
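A minimal retry sketch along those lines; the retry count and delay are illustrative guesses to tune, not values from the question:
import time
import urllib2
from socket import error as SocketError

def urlopen_with_retries(url, retries=3, delay=2):
    # Sleep between attempts so we stay under whatever request-rate
    # limit the server enforces; give up after a few tries.
    for attempt in range(retries):
        try:
            return urllib2.urlopen(url).read()
        except (urllib2.URLError, SocketError) as e:
            print "attempt {} failed: {}".format(attempt + 1, e)
            time.sleep(delay)
    return None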
In Python 3 there is a way to catch the error directly in the except clause with ConnectionResetError, which isolates exactly the right error.
This example also catches the timeout:
from urllib.request import urlopen
from socket import timeout

url = "http://......"
try:
    string = urlopen(url, timeout=5).read()
except ConnectionResetError:
    print("==> ConnectionResetError")
except timeout:
    print("==> Timeout")
There are two solutions you can try.
1. You are requesting too frequently. Try sleeping after each request:
time.sleep(1)
2. The server detects that the requesting client is Python and rejects it. Add a User-Agent header to handle this:
headers = {
    "Content-Type": "application/json;charset=UTF-8",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)"
}
try:
    res = requests.post("url", json=req, headers=headers)
except Exception as e:
    print(e)
The second solution saved me.

How to crawl Twitter pages using Python?

When I try to crawl Twitter using this code:
import urllib2
s = "https://mobile.twitter.com/bing/"
html = urllib2.urlopen(s).read()
print html
... I get the following error:
Traceback (most recent call last):
File "C:\Users\arpit\Downloads\Desktop\Wiki Code\final Crawler_wiki.py", line 14, in <module>
html = urllib2.urlopen(s).read()
File "C:\Python27\lib\urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "C:\Python27\lib\urllib2.py", line 400, in open
response = self._open(req, data)
File "C:\Python27\lib\urllib2.py", line 418, in _open
'_open', req)
File "C:\Python27\lib\urllib2.py", line 378, in _call_chain
result = func(*args)
File "C:\Python27\lib\urllib2.py", line 1215, in https_open
return self.do_open(httplib.HTTPSConnection, req)
File "C:\Python27\lib\urllib2.py", line 1177, in do_open
raise URLError(err)
URLError: <urlopen error [Errno 10061] No connection could be made because the target machine actively refused it>
If I replace mobile.twitter.com with twitter.com then it works, but I want it to work with mobile.twitter.com.
The Twitter site is probably looking for a User-Agent header, which you don't set when you make the request through the urllib API.
You will likely need to use something like mechanize to fake your user agent, as in the sketch below.
But I highly suggest you use the Twitter API, which provides a lot of easy and pleasant ways to play with the data.
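For instance, a minimal mechanize sketch (mechanize is a third-party module; the User-Agent string here is just an example, not one known to be required):
import mechanize

br = mechanize.Browser()
br.set_handle_robots(False)  # the mobile site may disallow bots via robots.txt
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; Linux x86_64)')]
html = br.open("https://mobile.twitter.com/bing/").read()
print html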

Connection refused using urllib2 and Tor in python

I am pretty new to Python. I am trying to write a pretty simple web scraper for a project I am working on. In the process I am trying to use Tor to change my IP address so I don't get disconnected from the service I am scraping. I was trying to test the code specific to getting a new IP before adding it to my project. Here is the code I am testing:
from TorCtl import TorCtl
import urllib2

for i in range(1, 51):
    proxy_support = urllib2.ProxyHandler({"http": "127.0.0.1:8118"})
    opener = urllib2.build_opener(proxy_support)
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    urllib2.install_opener(opener)
    print "IP " + str(i) + ":"
    print urllib2.urlopen('http://ifconfig.me/ip').read()
    conn = TorCtl.connect(controlAddr="127.0.0.1", controlPort=9051, passphrase="torPass")
    conn.sendAndRecv('signal newnym\r\n')
    conn.close()
When I do this I get the following error:
IP 1:
Traceback (most recent call last):
  File "scrapingTools.py", line 86, in <module>
    main()
  File "scrapingTools.py", line 76, in main
    print urllib2.urlopen('http://ifconfig.me/ip').read()
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 126, in urlopen
    return _opener.open(url, data, timeout)
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 394, in open
    response = self._open(req, data)
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 412, in _open
    '_open', req)
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 372, in _call_chain
    result = func(*args)
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 1199, in http_open
    return self.do_open(httplib.HTTPConnection, req)
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 1174, in do_open
    raise URLError(err)
urllib2.URLError:
Any help understanding what is going on here would be greatly appreciated.
There is some problem with your proxy configuration; your code works without the proxy settings.
I don't know anything about TorCtl, but you're not sending an AUTHENTICATE string, and Tor will expect that. It should look something like:
telnet localhost:9051
>> 250 OK
AUTHENTICATE "xxx"
>> 250 OK
signal NEWNYM
>> 250 OK
Note: wait a few seconds for the identity to change.
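The same control-port conversation can be scripted with telnetlib; a sketch assuming the control port and password from the question:
import telnetlib

tn = telnetlib.Telnet("127.0.0.1", 9051)
tn.write('AUTHENTICATE "torPass"\r\n')
print tn.read_until("250 OK\r\n", timeout=5)  # expect 250 OK
tn.write('SIGNAL NEWNYM\r\n')
print tn.read_until("250 OK\r\n", timeout=5)  # expect 250 OK
tn.close()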

Connect to network not working using urllib or urllib2 even after configuring proxy

I am not able to open a URL for read() using urllib or urllib2, even after using proxy handlers (in the case of urllib2) and setting proxies in urllib.
My network connects to the internet through a proxy; the settings (taken from my browser) are:
HTTP Proxy: someproxy.com Port: 1080
I have tried urllib:
import urllib
myproxies = {'http':'http://someproxy.com:1080'}
data = urllib.urlopen('http://www.google.com', proxies = myproxies).read()
but I am receiving this error:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Python27\lib\urllib.py", line 84, in urlopen
return opener.open(url)
File "C:\Python27\lib\urllib.py", line 200, in open
return self.open_unknown_proxy(proxy, fullurl, data)
File "C:\Python27\lib\urllib.py", line 219, in open_unknown_proxy
raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
IOError: [Errno socket error] [Errno 11001] getaddrinfo failed
and for urllib2:
import urllib2
proxy = urllib2.ProxyHandler({'http':'http://someproxy.com:1080'})
opener1 = urllib2.build_opener(proxy)
urllib2.install_opener(opener1)
urllib2.urlopen('http://www.google.com')
I am getting this error:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Python27\lib\urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "C:\Python27\lib\urllib2.py", line 394, in open
response = self._open(req, data)
File "C:\Python27\lib\urllib2.py", line 412, in _open
'_open', req)
File "C:\Python27\lib\urllib2.py", line 372, in _call_chain
result = func(*args)
File "C:\Python27\lib\urllib2.py", line 1199, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "C:\Python27\lib\urllib2.py", line 1174, in do_open
raise URLError(err)
urllib2.URLError: <urlopen error [Errno 11001] getaddrinfo failed>
Any help will be greatly appreciated.
MRick
I think you want the following for urllib:
...
proxies = {'http':'http://someproxy.com:1080/'}
data = urllib.urlopen('http://www.google.com', proxies=proxies).read()
...
or this for urllib2:
...
proxy = urllib2.ProxyHandler({'http':'http://someproxy.com:1080'})
...
Note that the proxy URL includes the protocol part (http://); make sure your code does not omit it.
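Also, since the traceback ends in getaddrinfo failed, the proxy hostname itself is not resolving. A quick sanity check (an assumption on my part, not part of the answer above):
import socket

# "getaddrinfo failed" usually means the DNS lookup of the proxy host failed,
# so check that the proxy hostname resolves before wiring it into urllib2.
try:
    print socket.gethostbyname('someproxy.com')
except socket.gaierror as e:
    print "proxy host does not resolve:", e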
