Python web crawl HTTPError 500 using urllib2 - python

I use urllib, urllib2, and cookielib to scrape a site: get the login page, then post the login data.
import re
import urllib
import urllib2
import cookielib

def getpage():
    codeurl = r"http://www.xxx/sign_in"
    request = urllib2.Request(codeurl)
    response = urllib2.urlopen(request)
    return response

def parsecode(response):
    """
    Parse the login page to get the CSRF token, which changes on every request.
    """
    pattern = re.compile(r"""<meta.*?csrf-token.*?content=(.*?)\s/>""")
    code = re.findall(pattern, response.read())[0]
    return code

def Hand():
    """
    Deal with cookies and headers.
    """
    headers = {
        "Referer": "xxx",
        "User-Agent": "xxx"
    }
    ck = cookielib.MozillaCookieJar()
    handle = urllib2.HTTPCookieProcessor(ck)
    openner = urllib2.build_opener(handle)
    head = []
    for key, value in headers.items():
        tup = (key, value)
        head.append(tup)
    openner.addheaders = head
    return openner

def postdata(code, openner):
    """
    Post the data xxx.com needs.
    """
    logurl = r"http://www.jianshu.com/sessions"
    sign_in = {"name": "xxx", "password": "xxx", "authenticity_token": code}
    data = urllib.urlencode(sign_in).encode("utf-8")
    x = openner.open(logurl, data)
    for item in ck:  # ck is the cookie jar built in Hand(); it must be visible here (e.g. module-level)
        print item
However, I get this error:
Traceback (most recent call last):
File "jianshu.py", line 80, in
postdata(code,op)
File "jianshu.py", line 43, in postdata
x=openner.open(logurl,data)
File "/usr/lib64/python2.7/urllib2.py", line 437, in open
response = meth(req, response)
File "/usr/lib64/python2.7/urllib2.py", line 550, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib64/python2.7/urllib2.py", line 475, in error
return self._call_chain(*args)
File "/usr/lib64/python2.7/urllib2.py", line 409, in _call_chain
result = func(*args)
File "/usr/lib64/python2.7/urllib2.py", line 558, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 500: Internal Server Error

Are you possibly missing a ' between the r and "http://..." on this line:
codeurl = r"http://www.xxx/sign_in"

Related

python urlopen returns error

I am trying to parse some data from 'https://datausa.io/profile/geo/jacksonville-fl/#intro', but I am not sure how to access it from Python. My code is:
import urllib.request

adress, headers = urllib.request.urlretrieve(' https://datausa.io/profile/geo/jacksonville-fl/#intro')
handle = open(adress)
and it returns the error:
Traceback (most recent call last):
File "C:/Users/Jared/AppData/Local/Programs/Python/Python36-32/capstone1.py", line 16, in <module>
adress, headers = urllib.request.urlretrieve(' https://datausa.io/profile/geo/jacksonville-fl/#intro')
File "C:\Users\Jared\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 248, in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
File "C:\Users\Jared\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\Jared\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 532, in open
response = meth(req, response)
File "C:\Users\Jared\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 642, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Users\Jared\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 570, in error
return self._call_chain(*args)
File "C:\Users\Jared\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 504, in _call_chain
result = func(*args)
File "C:\Users\Jared\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 650, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden
Please explain what is wrong or tell me a better way to access the page. Also, does the '.io' suffix affect how Python handles it?
Thanks.
This worked for me:
import requests
url = "https://datausa.io/profile/geo/jacksonville-fl/#intro"
req = requests.request("GET", url)
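The likely reason requests succeeds is its different default User-Agent; many servers answer urllib's default "Python-urllib/x.y" agent with a 403. (The '.io' suffix is just an ordinary top-level domain and does not affect how Python handles the URL.) If you want to stay with the standard library, a minimal sketch that sends a browser-like User-Agent (the header value here is an arbitrary example):
import urllib.request

url = "https://datausa.io/profile/geo/jacksonville-fl/#intro"
# Supplying a browser-like User-Agent is often enough to avoid the 403.
req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
with urllib.request.urlopen(req) as resp:
    html = resp.read().decode("utf-8")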

HTTP ERROR in Python

I seem to be getting this error with urllib.request; it gives me this URL error that I can't seem to fix.
Traceback (most recent call last):
File "C:\Users\Jarvis\Documents\Python Scripts\MultiCheck by Koala.py", line 133, in <module>
Migration()
File "C:\Users\Jarvis\Documents\Python Scripts\MultiCheck by Koala.py", line 116, in Migration
rawdata_uuid = urllib.request.urlopen(url)
File "C:\Python34\lib\urllib\request.py", line 161, in urlopen
return opener.open(url, data, timeout)
File "C:\Python34\lib\urllib\request.py", line 469, in open
response = meth(req, response)
File "C:\Python34\lib\urllib\request.py", line 579, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python34\lib\urllib\request.py", line 507, in error
return self._call_chain(*args)
File "C:\Python34\lib\urllib\request.py", line 441, in _call_chain
result = func(*args)
File "C:\Python34\lib\urllib\request.py", line 587, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 429: 42
The code I'm using is for a migration checker for a game:
import json
import urllib.request

def Migration():
    # einfos is the player name, defined elsewhere in the script
    url = "https://api.mojang.com/users/profiles/minecraft/" + einfos
    rawdata = urllib.request.urlopen(url)
    newrawdata = rawdata.read()
    jsondata = json.loads(newrawdata.decode('utf-8'))
    results = jsondata['id']
    url = "https://sessionserver.mojang.com/session/minecraft/profile/" + results
    rawdata_uuid = urllib.request.urlopen(url)
    newrawdata_uuid = rawdata_uuid.read()
    jsondata_uuid = json.loads(newrawdata_uuid.decode('utf-8'))
    try:
        results = jsondata_uuid['legacy']
        print("Unmigrated")
    except KeyError:  # a missing 'legacy' key means the account has migrated
        print("Migrated")
Error 429 means: too many requests. You seem to have hit a rate limit. The additional number given is the number of seconds you have to wait for the limit to be lifted. So, try again in 42s, or later.
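If the script has to survive the limit, a minimal sketch of a retry helper (fetch_with_retry is a hypothetical name; it prefers the standard Retry-After header when the server sends one, with the 42 seconds from the error message as an assumed fallback):
import time
import urllib.request
import urllib.error

def fetch_with_retry(url, max_tries=3):
    # Hypothetical helper: on a 429, sleep for the server-suggested delay, then retry.
    for attempt in range(max_tries):
        try:
            return urllib.request.urlopen(url).read()
        except urllib.error.HTTPError as err:
            if err.code != 429:
                raise
            # Use the standard Retry-After header if present; otherwise fall
            # back to the 42 seconds reported in the error message above.
            time.sleep(int(err.headers.get("Retry-After", 42)))
    raise RuntimeError("still rate-limited after %d tries" % max_tries)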

urllib2 retrieve an arbitrary file based on URL and save it into a named file

I am writing a Python script to use the urllib2 module as an equivalent to the command line utility wget. The only function I want is that it can be used to retrieve an arbitrary file based on a URL and save it into a named file. I also only need to worry about two command line arguments: the URL from which the file is to be downloaded and the name of the file into which the contents are to be saved.
Example:
python Prog7.py www.python.org pythonHomePage.html
This is my code:
import urllib
import urllib2
#import requests
url = 'http://www.python.org/pythonHomePage.html'
print "downloading with urllib"
urllib.urlretrieve(url, "code.txt")
print "downloading with urllib2"
f = urllib2.urlopen(url)
data = f.read()
with open("code2.txt", "wb") as code:
code.write(data)
urllib works, but urllib2 does not.
Errors received:
File "Problem7.py", line 11, in <module>
f = urllib2.urlopen(url)
File "/usr/lib64/python2.6/urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib64/python2.6/urllib2.py", line 397, in open
response = meth(req, response)
File "/usr/lib64/python2.6/urllib2.py", line 510, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib64/python2.6/urllib2.py", line 429, in error
result = self._call_chain(*args)
File "/usr/lib64/python2.6/urllib2.py", line 369, in _call_chain
result = func(*args)
File "/usr/lib64/python2.6/urllib2.py", line 616, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "/usr/lib64/python2.6/urllib2.py", line 397, in open
response = meth(req, response)
File "/usr/lib64/python2.6/urllib2.py", line 510, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib64/python2.6/urllib2.py", line 435, in error
return self._call_chain(*args)
File "/usr/lib64/python2.6/urllib2.py", line 369, in _call_chain
result = func(*args)
File "/usr/lib64/python2.6/urllib2.py", line 518, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 404: NOT FOUND
The URL simply doesn't exist; https://www.python.org/pythonHomePage.html is indeed a 404 Not Found page.
The difference between urllib and urllib2 then is that the latter automatically raises an exception when a 404 page is returned, while urllib.urlretrieve() just saves the error page for you:
>>> import urllib
>>> urllib.urlopen('https://www.python.org/pythonHomePage.html').getcode()
404
>>> import urllib2
>>> urllib2.urlopen('https://www.python.org/pythonHomePage.html')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Users/mj/Development/Library/buildout.python/parts/opt/lib/python2.7/urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "/Users/mj/Development/Library/buildout.python/parts/opt/lib/python2.7/urllib2.py", line 410, in open
response = meth(req, response)
File "/Users/mj/Development/Library/buildout.python/parts/opt/lib/python2.7/urllib2.py", line 523, in http_response
'http', request, response, code, msg, hdrs)
File "/Users/mj/Development/Library/buildout.python/parts/opt/lib/python2.7/urllib2.py", line 448, in error
return self._call_chain(*args)
File "/Users/mj/Development/Library/buildout.python/parts/opt/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(*args)
File "/Users/mj/Development/Library/buildout.python/parts/opt/lib/python2.7/urllib2.py", line 531, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 404: NOT FOUND
If you wanted to save the error page, you can catch the urllib2.HTTPError exception:
try:
    f = urllib2.urlopen(url)
    data = f.read()
except urllib2.HTTPError as err:
    data = err.read()
It is due to the different behavior of urllib and urllib2.
Since the web page returns a 404 error (web page not found), urllib2 "catches" it, while urllib downloads the HTML of the returned page regardless of the error.
If you want to write the HTML to the text file, you can read it from the error:
import urllib2

try:
    data = urllib2.urlopen('http://www.python.org/pythonHomePage.html').read()
except urllib2.HTTPError, e:
    print e.code
    print e.msg
    print e.headers
    body = e.fp.read()  # read the body once; the file object is exhausted afterwards
    print body
    with open("code2.txt", "wb") as code:
        code.write(body)
req will be a Request object, fp will be a file-like object with the
HTTP error body, code will be the three-digit code of the error, msg
will be the user-visible explanation of the code and hdrs will be a
mapping object with the headers of the error.
More on HTTPError in the urllib2 documentation.
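Putting the pieces together, a minimal sketch of the two-argument wget-style script the question describes (assuming the URL is given with its scheme, since urllib2 will not add http:// for you):
import sys
import urllib2

def main():
    # Usage: python Prog7.py <url> <output-file>
    url, outfile = sys.argv[1], sys.argv[2]
    try:
        data = urllib2.urlopen(url).read()
    except urllib2.HTTPError as err:
        # Save the error page too, mirroring urllib.urlretrieve()'s behaviour.
        data = err.read()
    with open(outfile, "wb") as f:
        f.write(data)

if __name__ == "__main__":
    main()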

urlencode gives HTTP Error 403: FORBIDDEN

callurl = "http://vgintnh116:8001/master_data/"
params = urllib.urlencode({'res': 'arovit', 'qfields': 'prod' })
f = urllib2.urlopen(callurl, params)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/u/vgtools2/python-2.6.5/lib/python2.6/urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "/u/vgtools2/python-2.6.5/lib/python2.6/urllib2.py", line 397, in open
response = meth(req, response)
File "/u/vgtools2/python-2.6.5/lib/python2.6/urllib2.py", line 510, in http_response
'http', request, response, code, msg, hdrs)
File "/u/vgtools2/python-2.6.5/lib/python2.6/urllib2.py", line 435, in error
return self._call_chain(*args)
File "/u/vgtools2/python-2.6.5/lib/python2.6/urllib2.py", line 369, in _call_chain
result = func(*args)
File "/u/vgtools2/python-2.6.5/lib/python2.6/urllib2.py", line 518, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 403: FORBIDDEN
But it works with:
callurl = "http://vgintnh116:8001/master_data/res=arovit&qfields=prod"
f = urllib2.urlopen(callurl)
Please help. I want to use urlencode to avoid handling spaces and extra characters.
If you pass the second argument (data), the request will be a POST instead of a GET.
Also, dictionaries in Python do not have a guaranteed order. To guarantee the order, you should use a sequence of pairs.
callurl = "http://vgintnh116:8001/master_data/"
params = urllib.urlencode([('res', 'arovit'), ('qfields', 'prod')])
f = urllib2.urlopen(callurl + params)
From urllib2 documentation:
the HTTP request will be a POST instead of a GET when the data
parameter is provided
In your working example, you are making a GET request.
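A quick way to see the GET/POST switch for yourself, a sketch using urllib2.Request with the question's placeholder host:
import urllib
import urllib2

params = urllib.urlencode([('res', 'arovit'), ('qfields', 'prod')])

# With no data, the request is a GET...
req_get = urllib2.Request("http://vgintnh116:8001/master_data/")
print req_get.get_method()   # GET

# ...but supplying data flips it to a POST, which this server answers with a 403.
req_post = urllib2.Request("http://vgintnh116:8001/master_data/", params)
print req_post.get_method()  # POST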

Django model not saving when calling save()

So I am trying to save a Django model, and for some reason I am only getting a 500 Internal Server Error. The thing is,
if I comment out the social_auth.save() it works and I can manipulate the
object, but not save it.
Why is this happening? I am using django-tastypie and I am trying to save a django-social-auth instance.
def obj_create(self, bundle, request=None, **kwargs):
    try:
        #this is not supposed to upgrade password
        bundle = super(UserResource, self).obj_create(bundle)
        bundle.obj.save()
        if bundle.data.get('extra_data') != None:
            print bundle.data.get('extra_data')
            fb_id = bundle.data.get('extra_data')['id']
            #social_auth = UserSocialAuth(user_id=bundle.obj, provider=bundle.data.get('provider'), uid=fb_id, extra_data=bundle.data.get('extra_data'))
            social_auth = UserSocialAuth()
            social_auth.user_id = bundle.obj
            social_auth.provider = bundle.data.get('provider')
            social_auth.uid = fb_id
            social_auth.extra_data = bundle.data.get('extra_data')
            print "social: ", social_auth.extra_data
            social_auth.save()
    except IntegrityError:
        raise BadRequest('Username already exists')
    return bundle
Traceback:
Traceback (most recent call last):
File "temp_3.py", line 23, in <module>
post()
File "temp_3.py", line 18, in post
f = urllib2.urlopen(req)
File "/usr/lib/python2.7/urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 406, in open
response = meth(req, response)
File "/usr/lib/python2.7/urllib2.py", line 519, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python2.7/urllib2.py", line 444, in error
return self._call_chain(*args)
File "/usr/lib/python2.7/urllib2.py", line 378, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 527, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 500: INTERNAL SERVER ERROR
If bundle.obj is of type User, then social_auth.user_id = bundle.obj is wrong; it should be social_auth.user = bundle.obj.
Also ensure you are not in this case:
django-social-auth HTTP 500
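A minimal sketch of the fix, assuming bundle.obj is a User instance:
# Assign the related object to the ForeignKey field itself...
social_auth.user = bundle.obj

# ...or, if you use the underlying *_id column, assign the primary key, not the object:
social_auth.user_id = bundle.obj.pk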
