Website crawl behind a company proxy in python - python

I am trying to build a website crawler behind a company proxy.
So far I am running the following code:
import requests
url = 'https://www.bundesliga.com/de/bundesliga/spieltag/2022-2023/19/fc-augsburg-vs-bayer-04-leverkusen/lineup'
def get_url_content(url):
return requests.get(url).text
get_url_content(url)
And I am running into the following errors:
PermissionError Traceback (most recent call last)
File C:\Program Files\Anaconda3\envs\Test\lib\site-packages\urllib3\connection.py:174, in HTTPConnection._new_conn(self)
173 try:
--> 174 conn = connection.create_connection(
175 (self._dns_host, self.port), self.timeout, **extra_kw
176 )
178 except SocketTimeout:
File C:\Program Files\Anaconda3\envs\Test\lib\site-packages\urllib3\util\connection.py:95, in create_connection(address, timeout, source_address, socket_options)
94 if err is not None:
---> 95 raise err
97 raise socket.error("getaddrinfo returns an empty list")
File C:\Program Files\Anaconda3\envs\Test\lib\site-packages\urllib3\util\connection.py:85, in create_connection(address, timeout, source_address, socket_options)
84 sock.bind(source_address)
---> 85 sock.connect(sa)
PermissionError: [WinError 10013] Der Zugriff auf einen Socket war aufgrund der Zugriffsrechte des Sockets unzulässig
During handling of the above exception, another exception occurred:
NewConnectionError Traceback (most recent call last)
File C:\Program Files\Anaconda3\envs\Test\lib\site-packages\urllib3\connectionpool.py:703, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
702 # Make the request on the httplib connection object.
--> 703 httplib_response = self._make_request(
704 conn,
705 method,
706 url,
707 timeout=timeout_obj,
708 body=body,
709 headers=headers,
710 chunked=chunked,
It seems like the request is blocked by the company proxy; how can I handle this error? I get the same error for other URLs as well.
There is the option to change the proxy with:
requests.get(url, proxies = { "https" : "https://1.1.0.1:80"})
but I have no idea which proxy I have to use.

Related

Python API simple requests.get throwing error

I'm going through the below tutorial and trying to execute the same commands # 24:45, but getting an error. What would be the issue?
https://www.youtube.com/watch?v=qbLc5a9jdXo
Commands I'm running are as below. I'm running this on Jupyter notebook with Anaconda.
If I enter that link in Chrome I can see the JSON data
import requests
import json
response = requests.get("https://api.stackexchange.com/2.3/questions?order=desc&sort=activity&site=stackoverflow")
**Error as below**
---------------------------------------------------------------------------
TimeoutError Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\connection.py in _new_conn(self)
158 conn = connection.create_connection(
--> 159 (self._dns_host, self.port), self.timeout, **extra_kw)
160
~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\util\connection.py in create_connection(address, timeout, source_address, socket_options)
79 if err is not None:
---> 80 raise err
81
~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\util\connection.py in create_connection(address, timeout, source_address, socket_options)
69 sock.bind(source_address)
---> 70 sock.connect(sa)
71 return sock
TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond
During handling of the above exception, another exception occurred:
NewConnectionError Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
599 body=body, headers=headers,
--> 600 chunked=chunked)
601
~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
342 try:
--> 343 self._validate_conn(conn)
344 except (SocketTimeout, BaseSSLError) as e:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\connectionpool.py in _validate_conn(self, conn)
838 if not getattr(conn, 'sock', None): # AppEngine might not have `.sock`
--> 839 conn.connect()
840
~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\connection.py in connect(self)
300 # Add certificate verification
--> 301 conn = self._new_conn()
302 hostname = self.host
~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\connection.py in _new_conn(self)
167 raise NewConnectionError(
--> 168 self, "Failed to establish a new connection: %s" % e)
169
NewConnectionError: <urllib3.connection.VerifiedHTTPSConnection object at 0x0000023BB5434B48>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond
During handling of the above exception, another exception occurred:
MaxRetryError Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
448 retries=self.max_retries,
--> 449 timeout=timeout
450 )
~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
637 retries = retries.increment(method, url, error=e, _pool=self,
--> 638 _stacktrace=sys.exc_info()[2])
639 retries.sleep()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\util\retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
398 if new_retry.is_exhausted():
--> 399 raise MaxRetryError(_pool, url, error or ResponseError(cause))
400
MaxRetryError: HTTPSConnectionPool(host='api.stackexchange.com', port=443): Max retries exceeded with url: /2.3/questions?order=desc&sort=activity&site=stackoverflow (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x0000023BB5434B48>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))
During handling of the above exception, another exception occurred:
ConnectionError Traceback (most recent call last)
<ipython-input-15-4db0c3b4a994> in <module>
----> 1 response = requests.get("https://api.stackexchange.com/2.3/questions?order=desc&sort=activity&site=stackoverflow")
~\AppData\Local\Continuum\anaconda3\lib\site-packages\requests\api.py in get(url, params, **kwargs)
73
74 kwargs.setdefault('allow_redirects', True)
---> 75 return request('get', url, params=params, **kwargs)
76
77
~\AppData\Local\Continuum\anaconda3\lib\site-packages\requests\api.py in request(method, url, **kwargs)
58 # cases, and look like a memory leak in others.
59 with sessions.Session() as session:
---> 60 return session.request(method=method, url=url, **kwargs)
61
62
~\AppData\Local\Continuum\anaconda3\lib\site-packages\requests\sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
531 }
532 send_kwargs.update(settings)
--> 533 resp = self.send(prep, **send_kwargs)
534
535 return resp
~\AppData\Local\Continuum\anaconda3\lib\site-packages\requests\sessions.py in send(self, request, **kwargs)
644
645 # Send the request
--> 646 r = adapter.send(request, **kwargs)
647
648 # Total elapsed time of the request (approximately)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
514 raise SSLError(e, request=request)
515
--> 516 raise ConnectionError(e, request=request)
517
518 except ClosedPoolError as e:
ConnectionError: HTTPSConnectionPool(host='api.stackexchange.com', port=443): Max retries exceeded with url: /2.3/questions?order=desc&sort=activity&site=stackoverflow (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x0000023BB5434B48>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))
I think adding timeout might help. Try doing:
requests.get("https://api.stackexchange.com/2.3/questions?order=desc&sort=activity&site=stackoverflow", timeout=10)
Note: timeout used here is in seconds
If that doesn't help then refer to this question for more help. Timeout for python requests.get entire response

Spotify Python API call timeout issues

I am getting below error on Spotify API call. is there work around to resolve these timeout issues.
Python Spotify Code
TimeoutError Traceback (most recent call last)
C:\ProgramData\Anaconda3\lib\site-packages\urllib3\util\connection.py in create_connection(address, timeout, source_address, socket_options)
73 sock.bind(source_address)
---> 74 sock.connect(sa)
75 return sock
TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond
During handling of the above exception, another exception occurred:
KeyboardInterrupt Traceback (most recent call last)
<ipython-input-11-eb4cef5ffdbb> in <module>
5 #print(artist)
6 #time.sleep(2)
----> 7 result = spotify.search(artist,search_type='artist')['artists']['items']
8 #print(result)
9 try:
<ipython-input-7-6a2d183a2f3a> in search(self, query, search_type)
80 data = urlencode({"q": query, "type": search_type.lower()})
81 lookup_url = f"{endpoint}?{data}"
---> 82 r = requests.get(lookup_url, headers=headers)#.json()
83 if r.status_code not in range(200, 299):
84 return {}
C:\ProgramData\Anaconda3\lib\site-packages\requests\api.py in get(url, params, **kwargs)
74
75 kwargs.setdefault('allow_redirects', True)
---> 76 return request('get', url, params=params, **kwargs)
77
78
C:\ProgramData\Anaconda3\lib\site-packages\requests\api.py in request(method, url, **kwargs)
59 # cases, and look like a memory leak in others.
60 with sessions.Session() as session:
---> 61 return session.request(method=method, url=url, **kwargs)
62
63
C:\ProgramData\Anaconda3\lib\site-packages\requests\sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
528 }
529 send_kwargs.update(settings)
--> 530 resp = self.send(prep, **send_kwargs)
531
532 return resp
C:\ProgramData\Anaconda3\lib\site-packages\requests\sessions.py in send(self, request, **kwargs)
641
642 # Send the request
--> 643 r = adapter.send(request, **kwargs)
644
645 # Total elapsed time of the request (approximately)
C:\ProgramData\Anaconda3\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
437 try:
438 if not chunked:
--> 439 resp = conn.urlopen(
440 method=request.method,
441 url=url,
C:\ProgramData\Anaconda3\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
668
669 # Make the request on the httplib connection object.
--> 670 httplib_response = self._make_request(
671 conn,
672 method,
C:\ProgramData\Anaconda3\lib\site-packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
379 # Trigger any extra validation we need to do.
380 try:
--> 381 self._validate_conn(conn)
382 except (SocketTimeout, BaseSSLError) as e:
383 # Py2 raises this as a BaseSSLError, Py3 raises it as socket timeout.
C:\ProgramData\Anaconda3\lib\site-packages\urllib3\connectionpool.py in _validate_conn(self, conn)
974 # Force connect early to allow us to validate the connection.
975 if not getattr(conn, "sock", None): # AppEngine might not have `.sock`
--> 976 conn.connect()
977
978 if not conn.is_verified:
C:\ProgramData\Anaconda3\lib\site-packages\urllib3\connection.py in connect(self)
306 def connect(self):
307 # Add certificate verification
--> 308 conn = self._new_conn()
309 hostname = self.host
310
C:\ProgramData\Anaconda3\lib\site-packages\urllib3\connection.py in _new_conn(self)
157
158 try:
--> 159 conn = connection.create_connection(
160 (self._dns_host, self.port), self.timeout, **extra_kw
161 )
C:\ProgramData\Anaconda3\lib\site-packages\urllib3\util\connection.py in create_connection(address, timeout, source_address, socket_options)
72 if source_address:
73 sock.bind(source_address)
---> 74 sock.connect(sa)
75 return sock
76
KeyboardInterrupt:
1
###########Create Data frame with required columns to be extracted
[1]: https://i.stack.imgur.com/lCaz4.png
I really wish spotipy would address this issue in their package, but nonetheless, here's my workaround:
Import the error type
If you don't catch this exact error type, you'll cause problems for yourself by leaving your catch open-ended. For example, if you have a TypeError in your syntax or something, it will error that out and go to your catch block, and you'll have no way of knowing that it errored in the first place.
from requests.exceptions import ReadTimeout
When you init your spotipy client, I recommend adding in a couple of params that might be helpful.
spotify = spotipy.Spotify(auth=spotify_token, requests_timeout=10, retries=10)
requests_timeout | throws the same error you're looking for after a number of seconds. I use this just so that it doesn't hang forever, and it'll just retry the request if nothing comes back after 10 seconds.
retries | the maximum number of retries spotipy will request if it gets an error. I think it defaults to 3, so I just like to increase this a little bit just in case.
Run a try/except type retry system
You can definitely expand this to make it keep retrying (using a while loop or something) if it keeps hanging, but this is the basic idea:
try:
result = spotify.search(artist,search_type='artist')['artists']['items']
except ReadTimeout:
print('Spotify timed out... trying again...')
result = spotify.search(artist,search_type='artist')['artists']['items']

cannot import texthero in python

I am trying to import and use the texthero module in python and keep getting an error message.
code is simply import texthero as hero
there is a really long error message
[nltk_data] Error loading stopwords: <urlopen error [WinError 10060] A
[nltk_data] connection attempt failed because the connected party
[nltk_data] did not properly respond after a period of time, or
[nltk_data] established connection failed because connected host
[nltk_data] has failed to respond>
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
C:\Users\name\AppData\Roaming\Python\Python36\site-packages\texthero\stopwords.py in <module>()
13 # If not present, download 'en_core_web_sm'
---> 14 spacy_model = spacy.load("en_core_web_sm")
15 except OSError:
C:\Users\name\AppData\Roaming\Python\Python36\site-packages\spacy\__init__.py in load(name, **overrides)
29 warnings.warn(Warnings.W001.format(path=depr_path), DeprecationWarning)
---> 30 return util.load_model(name, **overrides)
31
C:\Users\name\AppData\Roaming\Python\Python36\site-packages\spacy\util.py in load_model(name, **overrides)
174 return load_model_from_path(name, **overrides)
--> 175 raise IOError(Errors.E050.format(name=name))
176
OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.
During handling of the above exception, another exception occurred:
TimeoutError Traceback (most recent call last)
C:\Users\name\AppData\Roaming\Python\Python36\site-packages\urllib3\connection.py in _new_conn(self)
159 conn = connection.create_connection(
--> 160 (self._dns_host, self.port), self.timeout, **extra_kw
161 )
C:\Users\name\AppData\Roaming\Python\Python36\site-packages\urllib3\util\connection.py in create_connection(address, timeout, source_address, socket_options)
83 if err is not None:
---> 84 raise err
85
C:\Users\name\AppData\Roaming\Python\Python36\site-packages\urllib3\util\connection.py in create_connection(address, timeout, source_address, socket_options)
73 sock.bind(source_address)
---> 74 sock.connect(sa)
75 return sock
TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond
During handling of the above exception, another exception occurred:
NewConnectionError Traceback (most recent call last)
C:\Users\name\AppData\Roaming\Python\Python36\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
676 headers=headers,
--> 677 chunked=chunked,
678 )
C:\Users\name\AppData\Roaming\Python\Python36\site-packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
380 try:
--> 381 self._validate_conn(conn)
382 except (SocketTimeout, BaseSSLError) as e:
C:\Users\name\AppData\Roaming\Python\Python36\site-packages\urllib3\connectionpool.py in _validate_conn(self, conn)
977 if not getattr(conn, "sock", None): # AppEngine might not have `.sock`
--> 978 conn.connect()
979
C:\Users\name\AppData\Roaming\Python\Python36\site-packages\urllib3\connection.py in connect(self)
308 # Add certificate verification
--> 309 conn = self._new_conn()
310 hostname = self.host
C:\Users\name\AppData\Roaming\Python\Python36\site-packages\urllib3\connection.py in _new_conn(self)
171 raise NewConnectionError(
--> 172 self, "Failed to establish a new connection: %s" % e
173 )
NewConnectionError: <urllib3.connection.HTTPSConnection object at 0x0000000019860160>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond
During handling of the above exception, another exception occurred:
MaxRetryError Traceback (most recent call last)
C:\Users\name\AppData\Roaming\Python\Python36\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
448 retries=self.max_retries,
--> 449 timeout=timeout
450 )
C:\Users\name\AppData\Roaming\Python\Python36\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
726 retries = retries.increment(
--> 727 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
728 )
C:\Users\name\AppData\Roaming\Python\Python36\site-packages\urllib3\util\retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
438 if new_retry.is_exhausted():
--> 439 raise MaxRetryError(_pool, url, error or ResponseError(cause))
440
MaxRetryError: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /explosion/spacy-models/master/shortcuts-v2.json (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000000019860160>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond',))
During handling of the above exception, another exception occurred:
ConnectionError Traceback (most recent call last)
<ipython-input-3-a19be4912cd0> in <module>()
----> 1 import texthero as hero
2
C:\Users\name\AppData\Roaming\Python\Python36\site-packages\texthero\__init__.py in <module>()
4
5 """
----> 6 from . import preprocessing
7 from .preprocessing import *
8
C:\Users\name\AppData\Roaming\Python\Python36\site-packages\texthero\preprocessing.py in <module>()
12 from nltk.stem import PorterStemmer, SnowballStemmer
13
---> 14 from texthero import stopwords as _stopwords
15
16 from typing import List, Callable
C:\Users\name\AppData\Roaming\Python\Python36\site-packages\texthero\stopwords.py in <module>()
16 from spacy.cli.download import download as spacy_download
17
---> 18 spacy_download("en_core_web_sm")
19
20 from spacy.lang.en import stop_words as spacy_en_stopwords
C:\Users\name\AppData\Roaming\Python\Python36\site-packages\spacy\cli\download.py in download(model, direct, *pip_args)
42 dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args)
43 else:
---> 44 shortcuts = get_json(about.__shortcuts__, "available shortcuts")
45 model_name = shortcuts.get(model, model)
46 compatibility = get_compatibility()
C:\Users\name\AppData\Roaming\Python\Python36\site-packages\spacy\cli\download.py in get_json(url, desc)
93
94 def get_json(url, desc):
---> 95 r = requests.get(url)
96 if r.status_code != 200:
97 msg.fail(
C:\Users\name\AppData\Roaming\Python\Python36\site-packages\requests\api.py in get(url, params, **kwargs)
74
75 kwargs.setdefault('allow_redirects', True)
---> 76 return request('get', url, params=params, **kwargs)
77
78
C:\Users\name\AppData\Roaming\Python\Python36\site-packages\requests\api.py in request(method, url, **kwargs)
59 # cases, and look like a memory leak in others.
60 with sessions.Session() as session:
---> 61 return session.request(method=method, url=url, **kwargs)
62
63
C:\Users\name\AppData\Roaming\Python\Python36\site-packages\requests\sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
528 }
529 send_kwargs.update(settings)
--> 530 resp = self.send(prep, **send_kwargs)
531
532 return resp
C:\Users\name\AppData\Roaming\Python\Python36\site-packages\requests\sessions.py in send(self, request, **kwargs)
641
642 # Send the request
--> 643 r = adapter.send(request, **kwargs)
644
645 # Total elapsed time of the request (approximately)
C:\Users\name\AppData\Roaming\Python\Python36\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
514 raise SSLError(e, request=request)
515
--> 516 raise ConnectionError(e, request=request)
517
518 except ClosedPoolError as e:
ConnectionError: HTTPSConnectionPool(host='raw.githubusercontent.com', port=443): Max retries exceeded with url: /explosion/spacy-models/master/shortcuts-v2.json (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000000019860160>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond',))
So it seems not to be able to open the stopwords file. I have checked the data folders and it is present.
when it can't find it it then tries to find the spacy file 'en_core_web_sm'
I cannot find this in the folders although spacy is there.
I then get a time out as it can't find the file.
I have tried uninstalling and reinstalling spacy and texthero and updating both.
I have tried installing en_core_web_sm and the other versions
e.g. using code:
!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
and
!python -m spacy download en_core_web_sm
I have tried reinstalling/updating nltk
I have run in Jupyter notebook and in Python and CMD (as admin). all with the same error.
I've tried both python commands and pip to do the above installs etc.
I really don't know what else to try
any ideas?
thanks
Mizz

How to select links from a list of links which are giving some kind of error?

I have a list of links. Many of them are giving errors like 404: page not found, and many are not getting connected, i.e. timeout errors. They might give all different kinds of errors; I have not checked them all. I want to separate good links which are working from bad links which give any kind of error.
I tried to filter out the connection timed out links like this:
import requests
r=requests.get("http://www.carillionplc.co.uk/sustain/f_con2.htm", timeout=10)
But I got the following error:
---------------------------------------------------------------------------
timeout Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/urllib3/connection.py in _new_conn(self)
158 conn = connection.create_connection(
--> 159 (self._dns_host, self.port), self.timeout, **extra_kw)
160
~/anaconda3/lib/python3.7/site-packages/urllib3/util/connection.py in create_connection(address, timeout, source_address, socket_options)
79 if err is not None:
---> 80 raise err
81
~/anaconda3/lib/python3.7/site-packages/urllib3/util/connection.py in create_connection(address, timeout, source_address, socket_options)
69 sock.bind(source_address)
---> 70 sock.connect(sa)
71 return sock
timeout: timed out
During handling of the above exception, another exception occurred:
ConnectTimeoutError Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
599 body=body, headers=headers,
--> 600 chunked=chunked)
601
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
353 else:
--> 354 conn.request(method, url, **httplib_request_kw)
355
~/anaconda3/lib/python3.7/http/client.py in request(self, method, url, body, headers, encode_chunked)
1228 """Send a complete request to the server."""
-> 1229 self._send_request(method, url, body, headers, encode_chunked)
1230
~/anaconda3/lib/python3.7/http/client.py in _send_request(self, method, url, body, headers, encode_chunked)
1274 body = _encode(body, 'body')
-> 1275 self.endheaders(body, encode_chunked=encode_chunked)
1276
~/anaconda3/lib/python3.7/http/client.py in endheaders(self, message_body, encode_chunked)
1223 raise CannotSendHeader()
-> 1224 self._send_output(message_body, encode_chunked=encode_chunked)
1225
~/anaconda3/lib/python3.7/http/client.py in _send_output(self, message_body, encode_chunked)
1015 del self._buffer[:]
-> 1016 self.send(msg)
1017
~/anaconda3/lib/python3.7/http/client.py in send(self, data)
955 if self.auto_open:
--> 956 self.connect()
957 else:
~/anaconda3/lib/python3.7/site-packages/urllib3/connection.py in connect(self)
180 def connect(self):
--> 181 conn = self._new_conn()
182 self._prepare_conn(conn)
~/anaconda3/lib/python3.7/site-packages/urllib3/connection.py in _new_conn(self)
163 self, "Connection to %s timed out. (connect timeout=%s)" %
--> 164 (self.host, self.timeout))
165
ConnectTimeoutError: (<urllib3.connection.HTTPConnection object at 0x7f7c86c14ba8>, 'Connection to www.carillionplc.co.uk timed out. (connect timeout=10)')
During handling of the above exception, another exception occurred:
MaxRetryError Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
448 retries=self.max_retries,
--> 449 timeout=timeout
450 )
~/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
637 retries = retries.increment(method, url, error=e, _pool=self,
--> 638 _stacktrace=sys.exc_info()[2])
639 retries.sleep()
~/anaconda3/lib/python3.7/site-packages/urllib3/util/retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
398 if new_retry.is_exhausted():
--> 399 raise MaxRetryError(_pool, url, error or ResponseError(cause))
400
MaxRetryError: HTTPConnectionPool(host='www.carillionplc.co.uk', port=80): Max retries exceeded with url: /sustain/f_con2.htm (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7f7c86c14ba8>, 'Connection to www.carillionplc.co.uk timed out. (connect timeout=10)'))
During handling of the above exception, another exception occurred:
ConnectTimeout Traceback (most recent call last)
<ipython-input-6-425ceeca52ad> in <module>
1 import requests
----> 2 r=requests.get("http://www.carillionplc.co.uk/sustain/f_con2.htm", timeout=10)
~/anaconda3/lib/python3.7/site-packages/requests/api.py in get(url, params, **kwargs)
73
74 kwargs.setdefault('allow_redirects', True)
---> 75 return request('get', url, params=params, **kwargs)
76
77
~/anaconda3/lib/python3.7/site-packages/requests/api.py in request(method, url, **kwargs)
58 # cases, and look like a memory leak in others.
59 with sessions.Session() as session:
---> 60 return session.request(method=method, url=url, **kwargs)
61
62
~/anaconda3/lib/python3.7/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
531 }
532 send_kwargs.update(settings)
--> 533 resp = self.send(prep, **send_kwargs)
534
535 return resp
~/anaconda3/lib/python3.7/site-packages/requests/sessions.py in send(self, request, **kwargs)
644
645 # Send the request
--> 646 r = adapter.send(request, **kwargs)
647
648 # Total elapsed time of the request (approximately)
~/anaconda3/lib/python3.7/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
502 # TODO: Remove this in 3.0.0: see #2811
503 if not isinstance(e.reason, NewConnectionError):
--> 504 raise ConnectTimeout(e, request=request)
505
506 if isinstance(e.reason, ResponseError):
ConnectTimeout: HTTPConnectionPool(host='www.carillionplc.co.uk', port=80): Max retries exceeded with url: /sustain/f_con2.htm (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7f7c86c14ba8>, 'Connection to www.carillionplc.co.uk timed out. (connect timeout=10)'))
Is there a simple way to just filter out the working links?
You can't know which links are working unless you try them.
So, if you required to filter broken links you can try this:
import requests
links = [...] #your links are stored here
working_links = [] #empty list
for link in links:
try:
r=requests.get(link, timeout=10)
working_links.append(link)
except Exception as e:
print(f'request to {link} failed with {str(e)}')
I think you can pass the requests into a try/except statement, and remove the ones that reach the except statement from your list.
It would look something like the following:
for link in your_list:
try:
r = requests.get(link, timeout=1)
except:
your_list.remove(link)

Python Requests: IOError: [Errno 22] Invalid argument

I am new to Python Requests and am encountering an IOError:[Errno 22] Invalid argument when I attempt a requests.get(). In short, I am attempting to connect to an internal web application using SSL and so I pass a cert/key combination per the Requests documentation.
I've spent quite a bit of time researching potential issues and saw some mention of potential SNI issues but am not savvy enough to know how I might go about remedying the issue (again, new to Requests). Appreciate any nudge in the right direction/where to dig in further (I'm guessing the urllib3 piece?)
My Code
import requests

# GET an internal endpoint, authenticating with a client certificate.
# requests accepts cert=(certificate, private_key) as a path pair.
CERT_PATH = "/Users/me/Documents/cert.pem"
KEY_PATH = "/Users/me/Documents/key.pem"

url = "https://mydomain/path/to/something"
r = requests.get(url, cert=(CERT_PATH, KEY_PATH))
My Error
IOError Traceback (most recent call last)
<ipython-input-48-1ee4a7f23d00> in <module>()
4 url = "https://mydomain/path/to/something"
5 cert = (cert_file_path, key_file_path)
----> 6 r = requests.get(url, cert=cert)
/Users/me/anaconda/lib/python2.7/site-packages/requests/api.pyc in get(url, **kwargs)
66
67 kwargs.setdefault('allow_redirects', True)
---> 68 return request('get', url, **kwargs)
69
70
/Users/me/anaconda/lib/python2.7/site-packages/requests/api.pyc in request(method, url, **kwargs)
48
49 session = sessions.Session()
---> 50 response = session.request(method=method, url=url, **kwargs)
51 # By explicitly closing the session, we avoid leaving sockets open which
52 # can trigger a ResourceWarning in some cases, and look like a memory leak
/Users/me/anaconda/lib/python2.7/site-packages/requests/sessions.pyc in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
462 }
463 send_kwargs.update(settings)
--> 464 resp = self.send(prep, **send_kwargs)
465
466 return resp
/Users/me/anaconda/lib/python2.7/site-packages/requests/sessions.pyc in send(self, request, **kwargs)
574
575 # Send the request
--> 576 r = adapter.send(request, **kwargs)
577
578 # Total elapsed time of the request (approximately)
/Users/me/anaconda/lib/python2.7/site-packages/requests/adapters.pyc in send(self, request, stream, timeout, verify, cert, proxies)
368 decode_content=False,
369 retries=self.max_retries,
--> 370 timeout=timeout
371 )
372
/Users/me/anaconda/lib/python2.7/site-packages/requests/packages/urllib3/connectionpool.pyc in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, **response_kw)
542 httplib_response = self._make_request(conn, method, url,
543 timeout=timeout_obj,
--> 544 body=body, headers=headers)
545
546 # If we're going to release the connection in ``finally:``, then
/Users/me/anaconda/lib/python2.7/site-packages/requests/packages/urllib3/connectionpool.pyc in _make_request(self, conn, method, url, timeout, **httplib_request_kw)
339 # Trigger any extra validation we need to do.
340 try:
--> 341 self._validate_conn(conn)
342 except (SocketTimeout, BaseSSLError) as e:
343 # Py2 raises this as a BaseSSLError, Py3 raises it as socket timeout.
/Users/me/anaconda/lib/python2.7/site-packages/requests/packages/urllib3/connectionpool.pyc in _validate_conn(self, conn)
760 # Force connect early to allow us to validate the connection.
761 if not getattr(conn, 'sock', None): # AppEngine might not have `.sock`
--> 762 conn.connect()
763
764 if not conn.is_verified:
/Users/me/anaconda/lib/python2.7/site-packages/requests/packages/urllib3/connection.pyc in connect(self)
236 ca_certs=self.ca_certs,
237 server_hostname=hostname,
--> 238 ssl_version=resolved_ssl_version)
239
240 if self.assert_fingerprint:
/Users/me/anaconda/lib/python2.7/site-packages/requests/packages/urllib3/util/ssl_.pyc in ssl_wrap_socket(sock, keyfile, certfile, cert_reqs, ca_certs, server_hostname, ssl_version, ciphers, ssl_context)
261 raise
262 if certfile:
--> 263 context.load_cert_chain(certfile, keyfile)
264 if HAS_SNI: # Platform-specific: OpenSSL with enabled SNI
265 return context.wrap_socket(sock, server_hostname=server_hostname)
IOError: [Errno 22] Invalid argument
Environment
Python: Python 2.7.11 :: Anaconda 2.4.1 (x86_64)
Requests: 2.6.0
Mac OSX: Yosemite (10.10.5)
One thing to check: The client certificate file must include all of the certificates in the certificate chain, up to and including the certificate of root CA that signed the client's certificate.
I may be wrong, but are you sure the Python version you are using supports the arguments given to requests.get()? Consider updating Python to version 3.7 or above.

Categories

Resources