Listing urls from a csv file - python

I'm trying to list URLs from a CSV file to see what their HTTP status codes are. This is what I've got so far:
import urllib.request, urllib.error
url = ['http://www.10vibes.info'
'http://www.10vibes.info']
try:
conn = urllib.request.urlopen(url)
except urllib.error.HTTPError as e:
print(e.code)
except urllib.error.URLError as e:
print('URLError')
else:
print('good')

urlopen() expects a single URL string, not a list, so loop over the list and pass each URL as a string, as follows:
import urllib.request, urllib.error
# URLs to probe. The comma between entries matters: without it Python joins
# adjacent string literals into one (invalid) URL.
url = [
    'http://www.10vibes.info',
    'http://www.10vibes.info',
]

for my_url in url:
    try:
        # urlopen() takes one URL string (or Request) at a time, not a list.
        conn = urllib.request.urlopen(my_url)
    except urllib.error.HTTPError as e:
        # The server answered with an error status (e.g. 404, 501, ...).
        print(e.code)
    except urllib.error.URLError as e:
        # Not an HTTP-specific error (e.g. connection refused).
        print('URLError')
    else:
        # Report success only when no exception was raised for this URL.
        print('good')
        conn.close()

Related

Re-attempt to open url with urllib in python on timeout

I am looking to parse data from a large number of webpages (>10k) using Python, and I am finding that the function I have written to do this encounters a timeout error roughly every 500 iterations. I have attempted to fix this with a try/except block, but I would like to improve the function so that it re-attempts to open the URL four or five times before raising the error. Is there an elegant way to do this?
My code below:
def url_open(url):
from urllib.request import Request, urlopen
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
try:
s = urlopen(req,timeout=50).read()
except urllib.request.HTTPError as e:
if e.code == 404:
print(str(e))
else:
print(str(e))
s=urlopen(req,timeout=50).read()
raise
return BeautifulSoup(s, "lxml")
I've used a pattern like this for retrying in the past:
def url_open(url):
    """Fetch *url* and return its body parsed by BeautifulSoup, retrying on failure.

    The request is attempted until it succeeds. HTTP errors are retried only
    when ``canRetry(e.code)`` says so; network-level errors and timeouts are
    always retried. After more than five failed attempts the last exception
    is re-raised to the caller.

    ``canRetry`` and ``BeautifulSoup`` are expected to be provided elsewhere.
    """
    import socket
    import time
    from urllib.error import HTTPError, URLError
    from urllib.request import Request, urlopen

    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    retrycount = 0
    s = None
    while s is None:
        try:
            # The context manager closes the connection even if read() fails.
            with urlopen(req, timeout=50) as response:
                s = response.read()
        except HTTPError as e:
            # Must come before URLError: HTTPError is a subclass of it.
            print(str(e))
            if not canRetry(e.code):
                raise
            retrycount += 1
            if retrycount > 5:
                raise
        except (URLError, socket.timeout) as e:
            # Timeouts surface here, not as HTTPError.
            print(str(e))
            retrycount += 1
            if retrycount > 5:
                raise
        if s is None:
            # Back off a little longer after each failed attempt.
            time.sleep(retrycount)
    return BeautifulSoup(s, "lxml")
You just have to define canRetry somewhere else.

How to check HTTP errors for more than two URLs?

Question: I have 3 URLs - testurl1, testurl2 and testurl3. I'd like to try testurl1 first; if I get a 404 error, then try testurl2; and if that also gets a 404 error, then try testurl3. How can I achieve this? So far I've tried the code below, but it only works for two URLs. How do I add support for the third URL?
from urllib2 import Request, urlopen
from urllib2 import URLError, HTTPError
def checkfiles():
req = Request('http://testurl1')
try:
response = urlopen(req)
url1=('http://testurl1')
except HTTPError, URLError:
url1 = ('http://testurl2')
print url1
finalURL='wget '+url1+'/testfile.tgz'
print finalURL
checkfiles()
Another job for plain old for loop:
# Try each candidate URL in order and stop at the first one that is not a 404.
last_error = None
for url in testurl1, testurl2, testurl3:
    req = Request(url)
    try:
        response = urlopen(req)
    except HTTPError as err:
        if err.code == 404:
            # Keep a reference: on Python 3 ``err`` is unbound once the
            # except block ends, so it cannot be re-raised after the loop.
            last_error = err
            continue
        # Any other HTTP error is unexpected; let it propagate.
        raise
    else:
        # do what you want with successful response here (or outside the loop)
        break
else:
    # They ALL errored out with HTTPError code 404. Handle this?
    raise last_error
Hmmm maybe something like this?
from urllib2 import Request, urlopen
from urllib2 import URLError, HTTPError
def checkfiles():
    """Pick the first reachable test URL and print the matching wget command.

    testurl1 and testurl2 are actually requested, in that order. If neither
    responds successfully, testurl3 is used as the fallback without being
    checked. The chosen URL and the resulting ``wget`` command line are
    printed.
    """
    candidates = ('http://testurl1', 'http://testurl2', 'http://testurl3')
    # Default to the last candidate; it is used when every earlier one fails.
    url1 = candidates[-1]
    for candidate in candidates[:-1]:
        try:
            urlopen(Request(candidate)).close()
        except (HTTPError, URLError):
            # The parenthesised tuple catches both exception types.
            continue
        url1 = candidate
        break
    print(url1)
    finalURL = 'wget ' + url1 + '/testfile.tgz'
    print(finalURL)


checkfiles()

Python 3 urllib.request.urlopen

How can I avoid exceptions from urllib.request.urlopen if response.status_code is not 200? Right now it raises URLError or HTTPError depending on the request status.
Is there any other way to make request with python3 basic libs?
How can I get response headers if status_code != 200 ?
Use try/except, as in the code below:
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError
# A URL must not contain whitespace; the stray space before the path is removed.
req = Request("http://www.111cn.net/")
try:
    response = urlopen(req)
except HTTPError as e:
    # The server responded, but with an error status code.
    # do something
    print('Error code: ', e.code)
except URLError as e:
    # The server could not be reached at all.
    # do something
    print('Reason: ', e.reason)
else:
    # No exception: the request succeeded.
    # do something
    print('good!')
The docs state that the exception type, HTTPError, can also be treated as a HTTPResponse. Thus, you can get the response body from an error response as follows:
import urllib.request
import urllib.error
def open_url(request):
    """Open *request* and return the response, even for HTTP error statuses.

    When urlopen() raises HTTPError, the exception itself is returned instead
    of propagating, because it can be treated as a response object. Other
    exceptions are not caught.
    """
    try:
        response = urllib.request.urlopen(request)
    except urllib.error.HTTPError as http_error:
        # The error can be treated as an http.client.HTTPResponse object.
        response = http_error
    return response
and then use as follows:
result = open_url('http://www.stackoverflow.com/404-file-not-found')
print(result.status) # prints 404
print(result.read()) # prints page contents
print(result.headers.items()) # lists headers
I found a solution from py3 docs
>>> import http.client
>>> conn = http.client.HTTPConnection("www.python.org")
>>> # Example of an invalid request
>>> conn.request("GET", "/parrot.spam")
>>> r2 = conn.getresponse()
>>> print(r2.status, r2.reason)
404 Not Found
>>> data2 = r2.read()
>>> conn.close()
https://docs.python.org/3/library/http.client.html#examples

Still getting an HTTP404 error even with "try and except" clause

I need to access a url and if it gives me an HTTPError I need to wait five minutes and try again (this works for this particular website). It looks like the code doesn't recognize the except clause and it still gives me an HTTPError instantly (without waiting the 5 min).
import urllib2, datetime, re,os, requests
from time import sleep
import time
from dateutil.relativedelta import relativedelta
from requests.exceptions import HTTPError, ConnectionError
from bs4 import BeautifulSoup
try:
resp = requests.get(url)
except HTTPError:
while True:
print "Wait."
time.sleep(305)
resp = requests.get(url)
except ConnectionError:
while True:
print "Wait."
time.sleep(305)
resp = requests.get(url)
You put resp = requests.get(url) inside the try/except block, but then you call the same thing again inside the except clause. If a call throws an error and you repeat it inside the except clause, where nothing catches it, it will simply throw that error again.
# Keep retrying until the request succeeds.
while True:
    try:
        resp = requests.get(url)
        # requests.get() does not raise for 4xx/5xx responses on its own;
        # raise_for_status() turns them into HTTPError so they are retried.
        resp.raise_for_status()
    except (HTTPError, ConnectionError):
        print("Wait.")
        time.sleep(305)
        continue  # skip the code after this block and try again
    else:
        break
Basically until your url responds correctly, it will run the same thing again and again.
Inside your except blocks, you have this:
resp = requests.get(url)
This isn't protected by a try block, so it throws an error. You have to rearrange your code a little:
# Loop until one request completes without raising.
while True:
    try:
        resp = requests.get(url)
        # Without this call an error status (e.g. 404) is returned as a normal
        # response and HTTPError is never raised.
        resp.raise_for_status()
    except (HTTPError, ConnectionError):
        # Both failures are handled the same way: wait, then loop again.
        print("Wait.")
        time.sleep(305)
    else:
        break
It's now an infinite loop. When the connection fails, the loop just continues. When it succeeds, the loop exits.

In Python, how do I use urllib to see if a website is 404 or 200?

How to get the code of the headers through urllib?
The getcode() method (added in Python 2.6) returns the HTTP status code that was sent with the response, or None if the URL is not an HTTP URL.
>>> a=urllib.urlopen('http://www.google.com/asdfsf')
>>> a.getcode()
404
>>> a=urllib.urlopen('http://www.google.com/')
>>> a.getcode()
200
You can use urllib2 as well:
import urllib2
req = urllib2.Request('http://www.python.org/fish.html')
try:
    resp = urllib2.urlopen(req)
except urllib2.HTTPError as e:
    # Must come before URLError: HTTPError is a subclass of it.
    if e.code == 404:
        # do something...
        pass  # placeholder: a branch cannot consist of comments only
    else:
        # ...
        pass
except urllib2.URLError as e:
    # Not an HTTP-specific error (e.g. connection refused)
    # ...
    pass
else:
    # 200
    body = resp.read()
Note that HTTPError is a subclass of URLError which stores the HTTP status code.
For Python 3:
import urllib.request, urllib.error
url = 'http://www.google.com/asdfsf'

try:
    conn = urllib.request.urlopen(url)
except urllib.error.HTTPError as http_err:
    # The server replied with an error status (e.g. 404, 501, ...).
    status = http_err.code
    print('HTTPError: {}'.format(status))
except urllib.error.URLError as url_err:
    # Failure below the HTTP layer (e.g. connection refused).
    cause = url_err.reason
    print('URLError: {}'.format(cause))
else:
    # Reached only when urlopen() raised nothing, i.e. the request succeeded.
    print('good')
import urllib2
try:
    fileHandle = urllib2.urlopen('http://www.python.org/fish.html')
    try:
        data = fileHandle.read()
    finally:
        # Close the handle even if read() fails.
        fileHandle.close()
except urllib2.URLError as e:
    # HTTPError is a subclass of URLError, so HTTP status errors land here too.
    print('you got an error with the code %s' % (e,))

Categories

Resources