HTTP Error 409: conflict urllib2 python - python

I would like to iterate over a list of url api requests with different ?get variables (x1,....,xn).
from urllib2 import urlopen, Request
import pandas as pd
var=[x[0],...,x[n]]
url_list=['http://BLAH/get?var[0]&fmt=csv','http://BLAH/get?var[1]&fmt=csv',... ...,'http://BLAH/get?var[n]&fmt=csv']
j=0
while j < len(url_list):
req=Request(url_list[j])
response=urlopen(req)
df=pd.read_csv(response)
print df
j=j+1**
I have tried to del req and response.close() but my code still produces a conflict error.
File "C:\Users\Anaconda\lib\urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "C:\Users\Anaconda\lib\urllib2.py", line 410, in open
response = meth(req, response)
File "C:\Users\Anaconda\lib\urllib2.py", line 523, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Users\nfitzsimons\Anaconda\lib\urllib2.py", line 448, in error
return self._call_chain(*args)
File "C:\Users\Anaconda\lib\urllib2.py", line 382, in _call_chain
result = func(*args)
File "C:\Users\Anaconda\lib\urllib2.py", line 531, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
rllib2.HTTPError: HTTP Error 409: Conflict
Has anybody any suggestions?

If you're just trying to iterate over a list of urls and print the contents upon a successful request, why not try this?
#!/usr/bin/env python
try:
import requests
except ImportError as err:
print("Woops, you're missing " + str(err))
urls = []
req = requests
for url in urls:
response = req.get(url)
if response.status_code == 200: #Successful request
print(response.content)

Related

How to use urllib in python?

I was wondering if somebody can help me with getting my code to work.
import urllib.request
import urllib.parse
import re
url = 'https://www.google.com'
values = {'s':'basics',
'submit':'search'}
data = urllib.parse.urlencode(values)
data = data.encode('utf-8')
req = urllib.request.Request(url,data)
resp = urllib.request.urlopen(req)
respData = resp.read()
print(respData)
It is giving me this error message when running the code.
TypeError: POST data should be bytes, an iterable of bytes, or a file object. It cannot be of type str.
I hope someone can help me with my problem. If not thanks anyways.
It is giving me this gigantic error message:
It is giving me this gigantic error
Traceback (most recent call last):
File "C:\Users\user\OneDrive\Desktop\Lotto.py", line 11, in <module>
resp = urllib.request.urlopen(req)
File "C:\Users\user\AppData\Local\Programs\Python\Python37-32\lib\urllib \request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\user\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 531, in open
response = meth(req, response)
File "C:\Users\user\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 641, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Users\user\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 569, in error
return self._call_chain(*args)
File "C:\Users\user\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 503, in _call_chain
result = func(*args)
File "C:\Users\user\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 649, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 405: Method Not Allowed
It's not working because you have a TYPO:
data = urllib.parse.urlencode(values)
date = data.encode('utf-8') # YOUR TYPO IS HERE
req = urllib.request.Request(url,data)
You need to change that line to:
data = data.encode('utf-8') # TYPO FIXED!

Python - Scraping with BeautifulSoup and Urllib

I am trying to read website, but unfortunately something is wrong.
import bs4 as bs
import urllib.request
sauce = urllib.request.urlopen('https://csgoempire.com/withdraw').read()
soup = bs.BeautifulSoup(sauce,'lxml')
print(soup.find_all('p'))
Error:
Traceback (most recent call last):
File "F:/Informatika/Python3X/GamblinSitesBot/GamblingSitesBot.py", line 4, in <module>
sauce = urllib.request.urlopen('https://csgoempire.com/').read()
File "c:\users\edgaras\appdata\local\programs\python\python36\Lib\urllib\request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "c:\users\edgaras\appdata\local\programs\python\python36\Lib\urllib\request.py", line 532, in open
response = meth(req, response)
File "c:\users\edgaras\appdata\local\programs\python\python36\Lib\urllib\request.py", line 642, in http_response
'http', request, response, code, msg, hdrs)
File "c:\users\edgaras\appdata\local\programs\python\python36\Lib\urllib\request.py", line 570, in error
return self._call_chain(*args)
File "c:\users\edgaras\appdata\local\programs\python\python36\Lib\urllib\request.py", line 504, in _call_chain
result = func(*args)
File "c:\users\edgaras\appdata\local\programs\python\python36\Lib\urllib\request.py", line 650, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden
Process finished with exit code 1
Moreover, this code works with other sites such as google.com
you can achieve the same using request library. This works fine
import bs4 as bs
import requests
sauce = requests.get('https://csgoempire.com/withdraw')
soup = bs.BeautifulSoup(sauce.content,'html.parser')
print(soup.find_all('p'))

Python3: HTTP Error 302 while using urllib

I want to read the value of different stocks from websites. Therefore I wrote this tiny script, which reads the page source and then parses out the value:
stock_reader.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from re import search
from urllib import request
def main():
links = [
[
'CSG',
'UBS',
],
[
'http://www.tradegate.de/orderbuch.php?isin=CH0012138530',
'http://www.tradegate.de/orderbuch.php?isin=CH0244767585',
],
]
for i in in range(len(links[0])):
url = links[1][i]
htmltext = request.urlopen(url).read().decode('utf-8')
source = htmltext.splitlines()
for line in source:
if 'id="bid"' in line:
m = search('\d+.\d+', line)
print('{}'.format(m.string[m.start():m.end()]))
if __name__ == '__main__':
main()
sometimes it works but sometimes this error gets raised:
error message
Traceback (most recent call last):
File "./aktien_reader.py", line 39, in <module>
main()
File "./aktien_reader.py", line 30, in main
htmltext = request.urlopen(url).read().decode('utf-8')
File "/usr/lib/python3.3/urllib/request.py", line 160, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.3/urllib/request.py", line 479, in open
response = meth(req, response)
File "/usr/lib/python3.3/urllib/request.py", line 591, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python3.3/urllib/request.py", line 511, in error
result = self._call_chain(*args)
File "/usr/lib/python3.3/urllib/request.py", line 451, in _call_chain
result = func(*args)
File "/usr/lib/python3.3/urllib/request.py", line 696, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "/usr/lib/python3.3/urllib/request.py", line 479, in open
response = meth(req, response)
File "/usr/lib/python3.3/urllib/request.py", line 591, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python3.3/urllib/request.py", line 511, in error
result = self._call_chain(*args)
File "/usr/lib/python3.3/urllib/request.py", line 451, in _call_chain
result = func(*args)
File "/usr/lib/python3.3/urllib/request.py", line 696, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "/usr/lib/python3.3/urllib/request.py", line 479, in open
response = meth(req, response)
File "/usr/lib/python3.3/urllib/request.py", line 591, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python3.3/urllib/request.py", line 511, in error
result = self._call_chain(*args)
File "/usr/lib/python3.3/urllib/request.py", line 451, in _call_chain
result = func(*args)
File "/usr/lib/python3.3/urllib/request.py", line 696, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "/usr/lib/python3.3/urllib/request.py", line 479, in open
response = meth(req, response)
File "/usr/lib/python3.3/urllib/request.py", line 591, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python3.3/urllib/request.py", line 511, in error
result = self._call_chain(*args)
File "/usr/lib/python3.3/urllib/request.py", line 451, in _call_chain
result = func(*args)
File "/usr/lib/python3.3/urllib/request.py", line 696, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "/usr/lib/python3.3/urllib/request.py", line 479, in open
response = meth(req, response)
File "/usr/lib/python3.3/urllib/request.py", line 591, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python3.3/urllib/request.py", line 511, in error
result = self._call_chain(*args)
File "/usr/lib/python3.3/urllib/request.py", line 451, in _call_chain
result = func(*args)
File "/usr/lib/python3.3/urllib/request.py", line 686, in http_error_302
self.inf_msg + msg, headers, fp)
urllib.error.HTTPError: HTTP Error 302: The HTTP server returned a redirect error that would lead to an infinite loop.
The last 30x error message was:
Found
My question is: why is it happening and how can I avoid it?
This happens probably because the destination site uses cookies and redirect you in case you don't send cookies.
What you can use is something like that :
from http.cookiejar import CookieJar
url = "http://www.tradegate.de/orderbuch.php?isin=CH0012138530"
req = urllib.request.Request(url, None, {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8','Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3','Accept-Encoding': 'gzip, deflate, sdch','Accept-Language': 'en-US,en;q=0.8','Connection': 'keep-alive'})
cj = CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
response = opener.open(req)
response.read()
This way, you support Cookies and website will allow you to get the page :-)
Another way would be to use the requests package which is really simplest to use. In your case, it would lead to :
import requests
url = "http://www.tradegate.de/orderbuch.php?isin=CH0012138530"
r = requests.get(url, headers={'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'}, timeout=15)
print(r.content)
This answer is a simplification of the one by Cédric J. You don't really need to import CookieJar or set various Accept headers if you don't want to. You should however generally set a timeout. It is tested with Python 3.7. I would typically remember to use a new opener for each random URL that I want cookies for.
from urllib.request import build_opener, HTTPCookieProcessor, Request
url = 'https://www.cell.com/cell-metabolism/fulltext/S1550-4131(18)30630-2'
opener = build_opener(HTTPCookieProcessor())
Without a Request object:
response = opener.open(url, timeout=30)
content = response.read()
With a Request object:
request = Request(url)
response = opener.open(request, timeout=30)
content = response.read()
HTTP Status code 302 it's a kind of a redirect, it will have a header with a new URL for access (Not necessary a working URL..)
Location: http://www.example.com/x/y/
This is quite often used to block bots who make to many requests in too of a short time frame. So not an coding problem.

urllib HTTP Error: 400 Bad Request

I looked at this answer: Again urllib.error.HTTPError: HTTP Error 400: Bad Request because it was very similar to my question, but that solution did not work. I'm using Python 3.3.2. The lines that are like ..something.. are just values I replaced to protect my privacy. They should be strings
I'm getting an Error 400: Bad request from this code:
import urllib.parse
import urllib.request
url = '..url..'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {'email':'..my email address..',
'github':'..my github account..'}
headers = {'User-Agent':user_agent}
data = urllib.parse.urlencode(values)
data = data.encode('utf-8')
req = urllib.request.Request(url, data, headers)
response = urllib.request.urlopen(req) #this line causes the errors
page = response.read()
This is the particular error message:
Traceback (most recent call last):
File "/Users/.../Documents/Code 2040/File.py", line 23, in <module>
response = urllib.request.urlopen(req)
File "/Library/Frameworks/Python.framework/Versions/3.3/lib/python3.3/urllib/request.py", line 156, in urlopen
return opener.open(url, data, timeout)
File "/Library/Frameworks/Python.framework/Versions/3.3/lib/python3.3/urllib/request.py", line 475, in open
response = meth(req, response)
File "/Library/Frameworks/Python.framework/Versions/3.3/lib/python3.3/urllib/request.py", line 587, in http_response
'http', request, response, code, msg, hdrs)
File "/Library/Frameworks/Python.framework/Versions/3.3/lib/python3.3/urllib/request.py", line 513, in error
return self._call_chain(*args)
File "/Library/Frameworks/Python.framework/Versions/3.3/lib/python3.3/urllib/request.py", line 447, in _call_chain
result = func(*args)
File "/Library/Frameworks/Python.framework/Versions/3.3/lib/python3.3/urllib/request.py", line 595, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 400: Bad Request

urllib2 retrieve an arbitrary file based on URL and save it into a named file

I am writing a python script to use the urllib2 module as an equivalent to the command line utility wget. The only function I want for this is that it can be used to retrieve an arbitrary file based on URL and save it into a named file. I also only need to worry about two command line arguments, the URL from which the file is to be downloaded and the name of the file into which the content are to be saved.
Example:
python Prog7.py www.python.org pythonHomePage.html
This is my code:
import urllib
import urllib2
#import requests
url = 'http://www.python.org/pythonHomePage.html'
print "downloading with urllib"
urllib.urlretrieve(url, "code.txt")
print "downloading with urllib2"
f = urllib2.urlopen(url)
data = f.read()
with open("code2.txt", "wb") as code:
code.write(data)
urllib seems to work but urllib2 does not seem to work.
Errors received:
File "Problem7.py", line 11, in <module>
f = urllib2.urlopen(url)
File "/usr/lib64/python2.6/urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib64/python2.6/urllib2.py", line 397, in open
response = meth(req, response)
File "/usr/lib64/python2.6/urllib2.py", line 510, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib64/python2.6/urllib2.py", line 429, in error
result = self._call_chain(*args)
File "/usr/lib64/python2.6/urllib2.py", line 369, in _call_chain
result = func(*args)
File "/usr/lib64/python2.6/urllib2.py", line 616, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "/usr/lib64/python2.6/urllib2.py", line 397, in open
response = meth(req, response)
File "/usr/lib64/python2.6/urllib2.py", line 510, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib64/python2.6/urllib2.py", line 435, in error
return self._call_chain(*args)
File "/usr/lib64/python2.6/urllib2.py", line 369, in _call_chain
result = func(*args)
File "/usr/lib64/python2.6/urllib2.py", line 518, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 404: NOT FOUND
And the URL is doesn't exist at all; https://www.python.org/pythonHomePage.html is indeed a 404 Not Found page.
The difference between urllib and urllib2 then is that the latter automatically raises an exception when a 404 page is returned, while urllib.urlretrieve() just saves the error page for you:
>>> import urllib
>>> urllib.urlopen('https://www.python.org/pythonHomePage.html').getcode()
404
>>> import urllib2
>>> urllib2.urlopen('https://www.python.org/pythonHomePage.html')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Users/mj/Development/Library/buildout.python/parts/opt/lib/python2.7/urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "/Users/mj/Development/Library/buildout.python/parts/opt/lib/python2.7/urllib2.py", line 410, in open
response = meth(req, response)
File "/Users/mj/Development/Library/buildout.python/parts/opt/lib/python2.7/urllib2.py", line 523, in http_response
'http', request, response, code, msg, hdrs)
File "/Users/mj/Development/Library/buildout.python/parts/opt/lib/python2.7/urllib2.py", line 448, in error
return self._call_chain(*args)
File "/Users/mj/Development/Library/buildout.python/parts/opt/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(*args)
File "/Users/mj/Development/Library/buildout.python/parts/opt/lib/python2.7/urllib2.py", line 531, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 404: NOT FOUND
If you wanted to save the error page, you can catch the urllib2.HTTPError exception:
try:
f = urllib2.urlopen(url)
data = f.read()
except urllib2.HTTPError as err:
data = err.read()
It is due to the different behavior by urllib and urllib2.
Since the web page returns a 404 error (webpage not found) urllib2 "catches" it while urllib downloads the html of the returned page regardless of the error.
If you want to print the html to the text file you can print the error:
import urllib2
try:
data = urllib2.urlopen('http://www.python.org/pythonHomePage.html').read()
except urllib2.HTTPError, e:
print e.code
print e.msg
print e.headers
print e.fp.read()
with open("code2.txt", "wb") as code:
code.write(e.fp.read())
req will be a Request object, fp will be a file-like object with the
HTTP error body, code will be the three-digit code of the error, msg
will be the user-visible explanation of the code and hdrs will be a
mapping object with the headers of the error.
More data about HTTP error: urllib2 documentation

Categories

Resources