Python requests/urllib3 - Retry on read timeout after 200 header received

I'm using requests to download some large files (100-5000 MB), with a Session and urllib3.Retry to get automatic retries.
It appears such retries only apply before the HTTP header has been received and content has started streaming. After the 200 status has been received, a network dip surfaces as a ReadTimeoutError.
See the following example:
import requests, logging, sys
from requests.adapters import HTTPAdapter
from urllib3 import Retry

def create_session():
    retries = Retry(total=5, backoff_factor=1)
    s = requests.Session()
    s.mount("http://", HTTPAdapter(max_retries=retries))
    s.mount("https://", HTTPAdapter(max_retries=retries))
    return s

logging.basicConfig(level=logging.DEBUG, stream=sys.stderr)

session = create_session()
response = session.get(url, timeout=(120, 10))  # deliberately short read timeout to provoke the failure
This gives the following log output:
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): example:443
DEBUG:urllib3.connectionpool:https://example:443 "GET /example.zip HTTP/1.1" 200 1568141974
< UNPLUG NETWORK CABLE FOR 10-15 sec HERE >
Traceback (most recent call last):
File "urllib3/response.py", line 438, in _error_catcher
yield
File "urllib3/response.py", line 519, in read
data = self._fp.read(amt) if not fp_closed else b""
File "/usr/lib/python3.8/http/client.py", line 458, in read
n = self.readinto(b)
File "/usr/lib/python3.8/http/client.py", line 502, in readinto
n = self.fp.readinto(b)
File "/usr/lib/python3.8/socket.py", line 669, in readinto
return self._sock.recv_into(b)
File "/usr/lib/python3.8/ssl.py", line 1241, in recv_into
return self.read(nbytes, buffer)
File "/usr/lib/python3.8/ssl.py", line 1099, in read
return self._sslobj.read(len, buffer)
socket.timeout: The read operation timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "requests/models.py", line 753, in generate
for chunk in self.raw.stream(chunk_size, decode_content=True):
File "urllib3/response.py", line 576, in stream
data = self.read(amt=amt, decode_content=decode_content)
File "urllib3/response.py", line 541, in read
raise IncompleteRead(self._fp_bytes_read, self.length_remaining)
File "/usr/lib/python3.8/contextlib.py", line 131, in __exit__
self.gen.throw(type, value, traceback)
File "urllib3/response.py", line 443, in _error_catcher
raise ReadTimeoutError(self._pool, None, "Read timed out.")
urllib3.exceptions.ReadTimeoutError: HTTPSConnectionPool(host='example', port=443): Read timed out.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "example.py", line 14, in _download
response = session.get(url, headers=headers, timeout=300)
File "requests/sessions.py", line 555, in get
return self.request('GET', url, **kwargs)
File "requests/sessions.py", line 542, in request
resp = self.send(prep, **send_kwargs)
File "requests/sessions.py", line 697, in send
r.content
File "requests/models.py", line 831, in content
self._content = b''.join(self.iter_content(CONTENT_CHUNK_SIZE)) or b''
File "requests/models.py", line 760, in generate
raise ConnectionError(e)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='example', port=443): Read timed out.
I can sort of understand why this would not work; things get even more obvious when you add the stream=True argument together with response.iter_content().
I assume the rationale is that the read timeout and the TCP layer should handle this (in my example I set the read timeout deliberately low to provoke it). But we have cases where servers restart/crash or where firewalls drop connections in the middle of a stream, and the only option for a client is to retry the entire thing.
Is there any simple solution to this problem, ideally built into requests? One could always wrap the whole thing with tenacity or manual retries, but ideally I want to avoid that, as it means adding another layer, and one needs to distinguish network errors from other, real errors, etc.
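For what it's worth, here is a minimal sketch of the manual-retry route mentioned above (my own code, not something built into requests): stream the body with iter_content() and, on a network error, re-issue the request with a Range header so the download resumes instead of restarting. It assumes the server honours Range requests; the helper name download_with_resume is mine.

import os, time
import requests

def download_with_resume(url, path, attempts=5, chunk_size=1 << 20):
    for attempt in range(attempts):
        done = os.path.getsize(path) if os.path.exists(path) else 0
        headers = {'Range': 'bytes=%d-' % done} if done else {}
        try:
            with requests.get(url, headers=headers, stream=True, timeout=(120, 30)) as r:
                r.raise_for_status()
                mode = 'ab' if r.status_code == 206 else 'wb'  # 206 means the server honoured the Range header
                with open(path, mode) as f:
                    for chunk in r.iter_content(chunk_size):
                        f.write(chunk)
            return
        except (requests.ConnectionError, requests.Timeout, requests.exceptions.ChunkedEncodingError):
            time.sleep(2 ** attempt)  # crude exponential backoff before resuming
    raise RuntimeError('download failed after %d attempts' % attempts)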

Related

POST Multiple Requests concurrent.futures Status = Pending, Error = Timeout

The problem:
I have a large amount of product data that I need to POST to my website via a REST API.
The data:
[{'name': 'TOOL, LINER UPPER',
'description': 'TOOL, LINER UPPER',
'short_description': '<font size="+3">\r\nFits Engines:\r\n</font>NFAD6-(E)KMK\r\n NFAD6-KMK2\r\n NFAD7-KMK\r\n NFAD7-LEIK3\r\n NFAD7-LIK3\r\n NFAD8-(E)KMK\r\n NFD9-(E)KMK\r\n TF50\r\n TF55H-DI\r\n TF55L\r\n TF55R-DI\r\n TF60\r\n TF60B\r\n TF65H-DI\r\n TF65L\r\n TF65R-DI\r\n TF70\r\n TS105C-GBB\r\n TS190R-B\r\n TS60C-GB\r\n TS70C-GB\r\n TS80C-GBB',
'tags': [{'name': '101300-92010'}],
'sku': '98JCDOZE4P',
'categories': [{'id': 307931}],
'slug': 'tool-liner upper-yanmar/'},
{'name': ' SEAL, OIL',
'description': 'SEAL, OIL',
'short_description': '<font size="+3">\r\nFits Engines:\r\n</font>NFD13-MEA\r\n NFD13-MEAS\r\n NFD13-MEP\r\n NFD13-MEPA\r\n NFD13-MEPAS\r\n TF110M(E/H/L\r\n TF110N-L\r\n TF120M(E/H/L\r\n TF120ML-XA\r\n TF120N-L\r\n TF80-M(E/H/L\r\n TF90-M(E/H/L',
'tags': [{'name': '103288-02221'}],
'sku': 'PX8AKH5JDR',
'categories': [{'id': 307931}],
'slug': '-seal-oil-yanmar/'}]
What I have tried:
i. Using a for loop.
from woocommerce import API

wcapi = API(url=url, consumer_key=consumer_key, consumer_secret=consumer_secret, timeout=50)

for product in my_json_list:
    print(wcapi.post("products", product).json())
This works, but it's going to take until next millennium because I have millions of product pages I want to create.
ii. The concurrent.futures module.
from concurrent.futures import ThreadPoolExecutor

def post_data(product):
    return wcapi.post("products", product).json()

# protect the entry point
if __name__ == '__main__':
    # create the thread pool
    with ThreadPoolExecutor() as ex:
        # issue many asynchronous tasks systematically
        futures = [ex.submit(post_data, page) for page in data]
        # enumerate futures and report results
        for future in futures:
            print(future.result())
When I try this I get a timeout error, which seems to be tied to the API's timeout setting (read timeout=50), and the status of the future objects is pending.
Here's the full error output.
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/urllib3/connectionpool.py", line 449, in _make_request
six.raise_from(e, None)
File "<string>", line 3, in raise_from
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/urllib3/connectionpool.py", line 444, in _make_request
httplib_response = conn.getresponse()
^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/http/client.py", line 1374, in getresponse
response.begin()
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/http/client.py", line 318, in begin
version, status, reason = self._read_status()
^^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/http/client.py", line 279, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/socket.py", line 705, in readinto
return self._sock.recv_into(b)
^^^^^^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/ssl.py", line 1278, in recv_into
return self.read(nbytes, buffer)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/ssl.py", line 1134, in read
return self._sslobj.read(len, buffer)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TimeoutError: The read operation timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/requests/adapters.py", line 489, in send
resp = conn.urlopen(
^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/urllib3/connectionpool.py", line 787, in urlopen
retries = retries.increment(
^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/urllib3/util/retry.py", line 550, in increment
raise six.reraise(type(error), error, _stacktrace)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/urllib3/packages/six.py", line 770, in reraise
raise value
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/urllib3/connectionpool.py", line 703, in urlopen
httplib_response = self._make_request(
^^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/urllib3/connectionpool.py", line 451, in _make_request
self._raise_timeout(err=e, url=url, timeout_value=read_timeout)
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/urllib3/connectionpool.py", line 340, in _raise_timeout
raise ReadTimeoutError(
urllib3.exceptions.ReadTimeoutError: HTTPSConnectionPool(host='new.turbo-diesel.co.uk', port=443): Read timed out. (read timeout=50)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/martinhewing/Downloads/Python_Code/Woo_web_pages/create_DEP_pages.py", line 74, in <module>
futures = [ex.submit(wcapi.post("products", page).json()) for page in data]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/martinhewing/Downloads/Python_Code/Woo_web_pages/create_DEP_pages.py", line 74, in <listcomp>
futures = [ex.submit(wcapi.post("products", page).json()) for page in data]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/woocommerce/api.py", line 110, in post
return self.__request("POST", endpoint, data, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/woocommerce/api.py", line 92, in __request
return request(
^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/requests/api.py", line 59, in request
return session.request(method=method, url=url, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/requests/sessions.py", line 587, in request
resp = self.send(prep, **send_kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/requests/sessions.py", line 701, in send
r = adapter.send(request, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/requests/adapters.py", line 578, in send
raise ReadTimeout(e, request=request)
requests.exceptions.ReadTimeout: HTTPSConnectionPool(host='new.turbo-diesel.co.uk', port=443): Read timed out. (read timeout=50)
When I look at the ex.submit() call I can see that the status is pending: <Future at 0x124d1f7d0 state=pending>. Does anyone know what might be the problem here?
The problem is how many concurrent connections the server will accept from a single client IP; modern web servers typically limit this to a small number per client.
When I tried to POST using the plain for loop I got blocked, and the concurrent approach is seen by the server as a DoS attack.
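(Not from the original post.) A minimal sketch of throttling the thread pool so the server never sees more than a handful of simultaneous connections; max_workers=4 is an arbitrary starting point to tune against whatever per-client limit the server enforces, and wcapi and data are the objects from the snippets above.

from concurrent.futures import ThreadPoolExecutor, as_completed

def post_data(product):
    return wcapi.post("products", product).json()

if __name__ == '__main__':
    # a small, fixed pool keeps concurrent connections below the server's per-client limit
    with ThreadPoolExecutor(max_workers=4) as ex:
        futures = [ex.submit(post_data, product) for product in data]
        for future in as_completed(futures):
            print(future.result())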

Google Storage - Read timeout

I am using signed URLs in Google Storage. I have Python code using requests that uploads a file to the signed URL with an HTTP PUT. The code runs in a GKE 1.17 deployment with 18 threads, on Ubuntu 18.04. The code implements retries using the retry Python library. In addition, the code crawls websites.
The Problem
The code sometimes fails with a ReadTimeout exception:
Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/urllib3/connectionpool.py", line 421, in _make_request
six.raise_from(e, None)
File "<string>", line 3, in raise_from
File "/usr/local/lib/python3.6/dist-packages/urllib3/connectionpool.py", line 416, in _make_request
httplib_response = conn.getresponse()
File "/usr/lib/python3.6/http/client.py", line 1346, in getresponse
response.begin()
File "/usr/lib/python3.6/http/client.py", line 307, in begin
version, status, reason = self._read_status()
File "/usr/lib/python3.6/http/client.py", line 268, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "/usr/lib/python3.6/socket.py", line 586, in readinto
return self._sock.recv_into(b)
File "/usr/lib/python3.6/ssl.py", line 1012, in recv_into
return self.read(nbytes, buffer)
File "/usr/lib/python3.6/ssl.py", line 874, in read
return self._sslobj.read(len, buffer)
File "/usr/lib/python3.6/ssl.py", line 631, in read
v = self._sslobj.read(len, buffer)
socket.timeout: The read operation timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/requests/adapters.py", line 449, in send
timeout=timeout
File "/usr/local/lib/python3.6/dist-packages/urllib3/connectionpool.py", line 720, in urlopen
method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
File "/usr/local/lib/python3.6/dist-packages/urllib3/util/retry.py", line 400, in increment
raise six.reraise(type(error), error, _stacktrace)
File "/usr/local/lib/python3.6/dist-packages/urllib3/packages/six.py", line 735, in reraise
raise value
File "/usr/local/lib/python3.6/dist-packages/urllib3/connectionpool.py", line 672, in urlopen
chunked=chunked,
File "/usr/local/lib/python3.6/dist-packages/urllib3/connectionpool.py", line 423, in _make_request
self._raise_timeout(err=e, url=url, timeout_value=read_timeout)
File "/usr/local/lib/python3.6/dist-packages/urllib3/connectionpool.py", line 331, in _raise_timeout
self, url, "Read timed out. (read timeout=%s)" % timeout_value
urllib3.exceptions.ReadTimeoutError: HTTPSConnectionPool(host='storage.googleapis.com', port=443): Read timed out. (read timeout=30)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
...
response = self.__upload(target, file_stream, headers)
File "<decorator-gen-18>", line 2, in __upload
File "/usr/local/lib/python3.6/dist-packages/retry/api.py", line 74, in retry_decorator
logger)
File "/usr/local/lib/python3.6/dist-packages/retry/api.py", line 33, in __retry_internal
return f()
File "/home/crawler/crawler/upload/UploadResults.py", line 179, in __upload
target, zip_file_stream, headers=headers, timeout=TIMEOUT)
File "/usr/local/lib/python3.6/dist-packages/requests/sessions.py", line 593, in put
return self.request('PUT', url, data=data, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/requests/sessions.py", line 533, in request
resp = self.send(prep, **send_kwargs)
File "/usr/local/lib/python3.6/dist-packages/requests/sessions.py", line 646, in send
r = adapter.send(request, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/requests/adapters.py", line 529, in send
raise ReadTimeout(e, request=request)
requests.exceptions.ReadTimeout: HTTPSConnectionPool(host='storage.googleapis.com', port=443): Read timed out. (read timeout=30)
This means that the connection was made successfully but no data was received within the read-timeout window.
My Attempts To Fix It
More retries - up to 10.
A larger delay between retries.
Exponential backoff.
Tested whether the retry decorator retries this specific exception by raising ReadTimeout() inside the function myself - it did.
Allowed more connections for the session and more connections per host.
The Google Storage URLs avoid a naming bottleneck by using a different folder for each file.
A larger read-timeout value - tried up to 60 seconds.
My Code
import requests
from google.api_core.exceptions import GatewayTimeout, GoogleAPICallError, ServiceUnavailable
from requests.exceptions import ConnectionError
from crawler.exceptions.CrawlerException import CrawlerException, eCrawlingStatusCodes
from retry import retry

RETRIES = 10
BACKOFF = 1.5
DELAY = 2
TIMEOUT = (9.05, 30)

class UploadResults:
    GOOGLE_STORAGE = GoogleStorage()
    SESSION = requests.Session()
    SESSION.mount('https://', requests.adapters.HTTPAdapter(
        pool_connections=20, pool_maxsize=30))

    @retry(exceptions=(CrawlerException, requests.exceptions.RequestException, ValueError,
                       requests.exceptions.Timeout, requests.exceptions.ReadTimeout,
                       requests.exceptions.ConnectTimeout),
           tries=RETRIES, delay=DELAY, backoff=BACKOFF)
    def __upload(self, target, zip_file_stream, headers):
        response = UploadResults.SESSION.put(
            target, zip_file_stream, headers=headers, timeout=TIMEOUT)
        if response.status_code >= 300:
            raise CrawlerException(eCrawlingStatusCodes.InternalError,
                                   "Internal error on uploading to path, Server status: %d" % response.status_code)
        return response
Questions
What can cause this exception?
Is there a limit I am missing that can cause this behaviour?
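(An aside, not part of the original post.) One thing worth checking with any retried upload: if zip_file_stream is a file-like object, a retried PUT re-sends only whatever is left in the stream after the failed attempt unless the stream is rewound first. A minimal sketch, assuming the stream is seekable; the helper name upload_with_rewind is mine.

import requests
from retry import retry

RETRIES, DELAY, BACKOFF = 10, 2, 1.5
TIMEOUT = (9.05, 30)

@retry(exceptions=requests.exceptions.RequestException, tries=RETRIES, delay=DELAY, backoff=BACKOFF)
def upload_with_rewind(session, target, zip_file_stream, headers):
    # rewind so a retried PUT re-sends the whole body rather than the remainder
    zip_file_stream.seek(0)
    response = session.put(target, data=zip_file_stream, headers=headers, timeout=TIMEOUT)
    response.raise_for_status()
    return response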

Long chain of exceptions in scrapy splash application

My scrapy application is outputting this long chain of exceptions, and I am failing to see what the issue is; the last one has me especially confused.
Before I explain why, here is the chain:
2020-11-04 17:38:58,394:ERROR:Error while obtaining start requests
Traceback (most recent call last):
File "C:\Users\lguarro\Anaconda3\envs\virtual_workspace\lib\site-packages\urllib3\connectionpool.py", line 670, in urlopen
httplib_response = self._make_request(
File "C:\Users\lguarro\Anaconda3\envs\virtual_workspace\lib\site-packages\urllib3\connectionpool.py", line 426, in _make_request
six.raise_from(e, None)
File "<string>", line 3, in raise_from
File "C:\Users\lguarro\Anaconda3\envs\virtual_workspace\lib\site-packages\urllib3\connectionpool.py", line 421, in _make_request
httplib_response = conn.getresponse()
File "C:\Users\lguarro\Anaconda3\envs\virtual_workspace\lib\http\client.py", line 1347, in getresponse
response.begin()
File "C:\Users\lguarro\Anaconda3\envs\virtual_workspace\lib\http\client.py", line 307, in begin
version, status, reason = self._read_status()
File "C:\Users\lguarro\Anaconda3\envs\virtual_workspace\lib\http\client.py", line 276, in _read_status
raise RemoteDisconnected("Remote end closed connection without"
http.client.RemoteDisconnected: Remote end closed connection without response
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\lguarro\Anaconda3\envs\virtual_workspace\lib\site-packages\requests\adapters.py", line 439, in send
resp = conn.urlopen(
File "C:\Users\lguarro\Anaconda3\envs\virtual_workspace\lib\site-packages\urllib3\connectionpool.py", line 726, in urlopen
retries = retries.increment(
File "C:\Users\lguarro\Anaconda3\envs\virtual_workspace\lib\site-packages\urllib3\util\retry.py", line 403, in increment
raise six.reraise(type(error), error, _stacktrace)
File "C:\Users\lguarro\Anaconda3\envs\virtual_workspace\lib\site-packages\urllib3\packages\six.py", line 734, in reraise
raise value.with_traceback(tb)
File "C:\Users\lguarro\Anaconda3\envs\virtual_workspace\lib\site-packages\urllib3\connectionpool.py", line 670, in urlopen
httplib_response = self._make_request(
File "C:\Users\lguarro\Anaconda3\envs\virtual_workspace\lib\site-packages\urllib3\connectionpool.py", line 426, in _make_request
six.raise_from(e, None)
File "<string>", line 3, in raise_from
File "C:\Users\lguarro\Anaconda3\envs\virtual_workspace\lib\site-packages\urllib3\connectionpool.py", line 421, in _make_request
httplib_response = conn.getresponse()
File "C:\Users\lguarro\Anaconda3\envs\virtual_workspace\lib\http\client.py", line 1347, in getresponse
response.begin()
File "C:\Users\lguarro\Anaconda3\envs\virtual_workspace\lib\http\client.py", line 307, in begin
version, status, reason = self._read_status()
File "C:\Users\lguarro\Anaconda3\envs\virtual_workspace\lib\http\client.py", line 276, in _read_status
raise RemoteDisconnected("Remote end closed connection without"
urllib3.exceptions.ProtocolError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\lguarro\Anaconda3\envs\virtual_workspace\lib\site-packages\shadow_useragent\core.py", line 35, in _update
r = requests.get(url=self.URL)
File "C:\Users\lguarro\Anaconda3\envs\virtual_workspace\lib\site-packages\requests\api.py", line 76, in get
return request('get', url, params=params, **kwargs)
File "C:\Users\lguarro\Anaconda3\envs\virtual_workspace\lib\site-packages\requests\api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Users\lguarro\Anaconda3\envs\virtual_workspace\lib\site-packages\requests\sessions.py", line 530, in request
resp = self.send(prep, **send_kwargs)
File "C:\Users\lguarro\Anaconda3\envs\virtual_workspace\lib\site-packages\requests\sessions.py", line 643, in send
r = adapter.send(request, **kwargs)
File "C:\Users\lguarro\Anaconda3\envs\virtual_workspace\lib\site-packages\requests\adapters.py", line 498, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\lguarro\Anaconda3\envs\virtual_workspace\lib\site-packages\scrapy\core\engine.py", line 129, in _next_request
request = next(slot.start_requests)
File "C:\Users\lguarro\Anaconda3\envs\virtual_workspace\lib\site-packages\scrapy_splash\middleware.py", line 167, in process_start_requests
for req in start_requests:
File "C:\Users\lguarro\Documents\Work\SearchEngine_Pure\SearchEngine_Pure\spiders\SearchEngine.py", line 36, in start_requests
user_agent = self.ua.random_nomobile
File "C:\Users\lguarro\Anaconda3\envs\virtual_workspace\lib\site-packages\shadow_useragent\core.py", line 120, in random_nomobile
return self.pickrandom(exclude_mobile=True)
File "C:\Users\lguarro\Anaconda3\envs\virtual_workspace\lib\site-packages\shadow_useragent\core.py", line 83, in pickrandom
self.update()
File "C:\Users\lguarro\Anaconda3\envs\virtual_workspace\lib\site-packages\shadow_useragent\core.py", line 59, in update
self._update()
File "C:\Users\lguarro\Anaconda3\envs\virtual_workspace\lib\site-packages\shadow_useragent\core.py", line 38, in _update
self.logger.error(r.content.decode('utf-8'))
UnboundLocalError: local variable 'r' referenced before assignment
Now the last exception is complaining about some
UnboundLocalError: local variable 'r' referenced before assignment
The only code which is mine in that trace is the SearchEngine.py file, which doesn't even have a variable 'r', leaving me very confused. Here is the implementation of start_requests from which the error arises:
def start_requests(self):
    user_agent = self.ua.random_nomobile  # Exception raised here
    rec = self.mh.FindIdleOneWithNoURLs()
    if rec:
        self.logger.info("Starting url scrape for company, %s using user agent: %s", rec["Company"], user_agent)
        script = self.template.substitute(useragent=user_agent, searchquery=rec["Company"])
        yield SplashRequest(url=self.url, callback=self.parse, endpoint="execute",
                            args={'lua_source': script},
                            meta={'RecID': rec["_id"], 'Company': rec["Company"]},
                            errback=self.logerror)
It is complaining about the first line in that function, and I see no problem with it.
In case it is relevant, I will also add that my script seemed to be running fine just yesterday, but today I had to reset my Docker configuration (which the Splash container runs in), and since then I haven't been able to run my script smoothly.
I found out what was causing the issue! In fact there was no error on my part; it is a bug inside the shadow-useragent library.
The library periodically makes an API request to fetch a list of the most-used user agents; the server behind that API is down, and the authors of shadow-useragent were not handling the exception properly.
Fortunately, shadow-useragent caches the list of user agents it most recently received. So my solution was to edit the shadow-useragent code to bypass the update function entirely and use the cached list (inside the data.pk file) beyond its scheduled update. If anyone else runs into this issue, this is a temporary workaround you can use until that server is up and running again... hopefully soon!
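To make the bug concrete, the traceback's core.py lines 35 and 38 boil down to something like the following (reconstructed for illustration, not the library's actual source): when requests.get() itself raises, r is never bound, so the except handler that tries to log r.content dies with the UnboundLocalError.

import logging
import requests

logger = logging.getLogger(__name__)
URL = 'https://example.invalid/user-agents'  # stand-in for the library's API endpoint

def update_user_agents():
    try:
        r = requests.get(url=URL)                 # a connection error here means r is never assigned
        return r.json()
    except Exception:
        logger.error(r.content.decode('utf-8'))   # UnboundLocalError hides the real network error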

Python3 Requests Proxy (Socks5)

Edit: I don't know if this is a thing, but this often happens when I've been testing a lot and have a lot of Python scripts running. Usually none are active at the same time, but I'll have run a few of them in a short space of time. I don't know if that's why the error starts happening.
Ok here's my dilemma.
I've got a NordVPN account and I want to randomly loop through the IPs for the requests I'm making to Google. This works fine... sometimes. For some reason, after a few request pings my program starts slowing down, and then I start getting 'Max Connection Exceeded' errors, even though as far as I can tell the IPs are fine.
I started thinking that Google was blocking the IP, but then I make requests to other websites and the same error continues. I started thinking that maybe the proxy server only allows a certain number of requests in a certain time frame, so I tried some free proxies, but I get the same errors again.
Either way, if I leave it alone for a while it seems to work fine, and then a few dozen pings later the errors start again.
The only thing I can think of, and I'm not sure how I can test this or even if it's possible to test remotely (work router), is that maybe some connections remain open and affect my code, and once the backlog of open connections is cleared I can resume as normal. I started looking into whether I needed to "close" my connection after my GET request, but it doesn't seem necessary (although I wasn't even sure what I was looking for).
This is one version of my code; I've tried without sessions and with a different way of writing the sessions, and all behave the same way:
import requests, random

username = 'xxx'
password = 'yyy'

with open('proxies.txt', 'r') as p:
    proxy_list = p.read().splitlines()

with open('data.txt', 'r') as d:
    data = d.read().splitlines()

for i in data:
    result = None
    while result is None:
        try:
            proxy = proxy_list[random.randint(0, len(proxy_list) - 1)] + '.nordvpn.com'
            prox = 'socks5://{}:{}@{}:{}'.format(username, password, proxy, 1080)  # I've also tried socks5h
            with requests.Session() as r:
                r.proxies['http'] = prox
                r.proxies['https'] = prox
                result = r.get('http://icanhazip.com')
                print(result.text.strip())
        except requests.exceptions.RequestException:
            result = None  # pick another random proxy and try again
If anyone has any idea whatsoever, any help is appreciated. I've reached a point where I'm struggling to think of new ideas to try.
This is an example of one of the errors I get during this whole process:
Traceback (most recent call last): File
"C:\Users*\AppData\Local\Programs\Python\Python37\lib\site-packages\socks.py",
line 809, in connect
negotiate(self, dest_addr, dest_port) File "C:\Users*\AppData\Local\Programs\Python\Python37\lib\site-packages\socks.py",
line 444, in _negotiate_SOCKS5
self, CONNECT, dest_addr) File "C:\Users*\AppData\Local\Programs\Python\Python37\lib\site-packages\socks.py",
line 503, in _SOCKS5_request
raise SOCKS5AuthError("SOCKS5 authentication failed") socks.SOCKS5AuthError: SOCKS5 authentication failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last): File
"C:\Users*\AppData\Local\Programs\Python\Python37\lib\site-packages\urllib3\contrib\socks.py",
line 88, in _new_conn
**extra_kw File "C:\Users*\AppData\Local\Programs\Python\Python37\lib\site-packages\socks.py",
line 209, in create_connection
raise err File "C:\Users*\AppData\Local\Programs\Python\Python37\lib\site-packages\socks.py",
line 199, in create_connection
sock.connect((remote_host, remote_port)) File "C:\Users*\AppData\Local\Programs\Python\Python37\lib\site-packages\socks.py",
line 47, in wrapper
return function(*args, **kwargs) File "C:\Users*\AppData\Local\Programs\Python\Python37\lib\site-packages\socks.py",
line 814, in connect
raise GeneralProxyError("Socket error", error) socks.GeneralProxyError: Socket error: SOCKS5 authentication failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last): File
"C:\Users*\AppData\Local\Programs\Python\Python37\lib\site-packages\urllib3\connectionpool.py",
line 600, in urlopen
chunked=chunked) File "C:\Users*\AppData\Local\Programs\Python\Python37\lib\site-packages\urllib3\connectionpool.py",
line 354, in _make_request
conn.request(method, url, **httplib_request_kw) File "C:\Users*\AppData\Local\Programs\Python\Python37\lib\http\client.py",
line 1244, in request
self._send_request(method, url, body, headers, encode_chunked) File
"C:\Users*\AppData\Local\Programs\Python\Python37\lib\http\client.py",
line 1290, in _send_request
self.endheaders(body, encode_chunked=encode_chunked) File "C:\Users*\AppData\Local\Programs\Python\Python37\lib\http\client.py",
line 1239, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked) File
"C:\Users*\AppData\Local\Programs\Python\Python37\lib\http\client.py",
line 1026, in _send_output
self.send(msg) File "C:\Users*\AppData\Local\Programs\Python\Python37\lib\http\client.py",
line 966, in send
self.connect() File "C:\Users*\AppData\Local\Programs\Python\Python37\lib\site-packages\urllib3\connection.py",
line 181, in connect
conn = self._new_conn() File "C:\Users*\AppData\Local\Programs\Python\Python37\lib\site-packages\urllib3\contrib\socks.py",
line 110, in _new_conn
"Failed to establish a new connection: %s" % error urllib3.exceptions.NewConnectionError:
:
Failed to establish a new connection: SOCKS5 authentication failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last): File
"C:\Users*\AppData\Local\Programs\Python\Python37\lib\site-packages\requests\adapters.py",
line 449, in send
timeout=timeout File "C:\Users*\AppData\Local\Programs\Python\Python37\lib\site-packages\urllib3\connectionpool.py",
line 638, in urlopen
_stacktrace=sys.exc_info()[2]) File "C:\Users*\AppData\Local\Programs\Python\Python37\lib\site-packages\urllib3\util\retry.py",
line 399, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause)) urllib3.exceptions.MaxRetryError:
SOCKSHTTPConnectionPool(host='icanhazip.com', port=80): Max retries
exceeded with url: / (Caused by
NewConnectionError(': Failed to establish a new connection: SOCKS5
authentication failed'))
During handling of the above exception, another exception occurred:
Traceback (most recent call last): File
"C:\Users*\Documents_Scripts\Find
Proxies\FindProxies.py", line 23, in
result = requests.get('http://icanhazip.com', proxies=proxies) File
"C:\Users*\AppData\Local\Programs\Python\Python37\lib\site-packages\requests\api.py",
line 75, in get
return request('get', url, params=params, **kwargs) File "C:\Users*\AppData\Local\Programs\Python\Python37\lib\site-packages\requests\api.py",
line 60, in request
return session.request(method=method, url=url, **kwargs) File "C:\Users*\AppData\Local\Programs\Python\Python37\lib\site-packages\requests\sessions.py",
line 533, in request
resp = self.send(prep, **send_kwargs) File "C:\Users*\AppData\Local\Programs\Python\Python37\lib\site-packages\requests\sessions.py",
line 646, in send
r = adapter.send(request, **kwargs) File "C:\Users*\AppData\Local\Programs\Python\Python37\lib\site-packages\requests\adapters.py",
line 516, in send
raise ConnectionError(e, request=request) requests.exceptions.ConnectionError:
SOCKSHTTPConnectionPool(host='icanhazip.com', port=80): Max retries
exceeded with url: / (Caused by
NewConnectionError(': Failed to establish a new connection: SOCKS5
authentication failed'))
Are you also closing the sessions again after use?
Try with:
r = requests.session(config={'keep_alive': False})
Inspired by:
python-requests-close-http-connection
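Note that current versions of requests no longer accept a config argument; a rough present-day equivalent of the suggestion above (my own sketch, not from the answer) is to disable keep-alive with a Connection: close header and let the with block close the session, reusing the prox value built earlier:

import requests

with requests.Session() as s:
    s.headers['Connection'] = 'close'            # ask for the connection not to be kept alive
    s.proxies.update({'http': prox, 'https': prox})
    result = s.get('http://icanhazip.com')
print(result.text.strip())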

Exchangelib: How to iterate over all emails in a folder

I need to iterate over all emails in a folder, eventually exporting them to CSV format. Here are the important bits of code:
credentials = ServiceAccount(user, password)
...
qs = account.inbox.all()
qs.page_size = 100
qs.order_by('-datetime_received')
emails = qs.iterator()
for i, email in enumerate(emails):
    print(i, email.subject, email.sender, email.datetime_received)
When I run it, I get an error every time, but not at the same point in the process.
Sometimes I get one page back (i == 99), sometimes three pages (i == 299) or five pages (i == 499). Five pages is the most I've gotten back. No matter where it occurs, it's always the error below. FWIW, the error does happen at page breaks.
Traceback (most recent call last):
File "outlook.py", line 81, in <module>
for i, email in enumerate(emails):
File "C:\dev\programs\anaconda3\lib\site-packages\exchangelib\queryset.py", line 390, in _as_items
for i in iterable:
File "C:\dev\programs\anaconda3\lib\site-packages\exchangelib\account.py", line 628, in fetch
shape=IdOnly,
File "C:\dev\programs\anaconda3\lib\site-packages\exchangelib\services.py", line 602, in _pool_requests
for elem in r.get():
File "C:\dev\programs\anaconda3\lib\multiprocessing\pool.py", line 644, in get
raise self._value
File "C:\dev\programs\anaconda3\lib\multiprocessing\pool.py", line 119, in worker
result = (True, func(*args, **kwds))
File "C:\dev\programs\anaconda3\lib\site-packages\exchangelib\services.py", line 591, in <lambda>
lambda c: self._get_elements(payload=payload_func(c, **kwargs)),
File "C:\dev\programs\anaconda3\lib\site-packages\exchangelib\services.py", line 90, in _get_elements
response = self._get_response_xml(payload=payload)
File "C:\dev\programs\anaconda3\lib\site-packages\exchangelib\services.py", line 174, in _get_response_xml
res = self._get_soap_payload(soap_response=soap_response_payload)
File "C:\dev\programs\anaconda3\lib\site-packages\exchangelib\services.py", line 229, in _get_soap_payload
raise SOAPError('Unknown SOAP response: %s' % xml_to_str(body))
exchangelib.errors.SOAPError: Unknown SOAP response: <ns0:Body xmlns:ns0="http://schemas.xmlsoap.org/soap/envelope/" />
Is this the correct approach to walking through all emails? What does this error mean and how do I fix it? Thanks!
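(Not from the original post.) Since the goal is a CSV with subject, sender and received time, one option that might shrink each server response is to fetch only those fields via exchangelib's QuerySet.only(). A minimal sketch, assuming the same account object as above; the file name and column choices are mine.

import csv

qs = account.inbox.all().only('subject', 'sender', 'datetime_received')
qs.page_size = 100

with open('inbox.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['subject', 'sender', 'datetime_received'])
    for email in qs.order_by('-datetime_received'):
        # sender is a Mailbox object; its address lives on email_address
        writer.writerow([email.subject, email.sender.email_address, email.datetime_received])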
UPDATE:
Following is the captured output. We don't see anything that looks like a 'backoff' message. Does this help identify any issues?
<< This run had page size set to 10 >>
DEBUG:exchangelib.util:Session 5122 thread 9556: Useful response from https://our.host.name/EWS/Exchange.asmx
DEBUG:exchangelib.protocol:Server our.host.name: Releasing session 5122
DEBUG:exchangelib.services:Trying API version Exchange2010_SP2 for account myaccount#myhost.com
DEBUG:exchangelib.services:FindItem: Got page with next offset 360 (last_page False)
DEBUG:exchangelib.services:Starting GetItem._get_elements worker 36 for 10 items
DEBUG:exchangelib.services:GetItem._get_elements result 30 is ready early
<< 10 of these DEBUG lines come out >>
DEBUG:exchangelib.services:Getting item ('AAMkADJjMDIwNmQwLWQ2NzYtNDk5OC04MjgxLT
MzZTdhMTk4YWRiNQBGAAAAAABGQ9HU4LA/RY8Mwwr5Jp5OBwDOJO2TCOmBSZeM4xH+g3x5AAAAiE7BAA
DOJO2TCOmBSZeM4xH+g3x5AAAe2nAfAAA=', 'CQAAABYAAADOJO2TCOmBSZeM4xH+g3x5AAAe2wDA')
DEBUG:exchangelib.services:Getting item ('AAMkADJjMDIwNmQwLWQ2NzYtNDk5OC04MjgxLT
MzZTdhMTk4YWRiNQBGAAAAAABGQ9HU4LA/RY8Mwwr5Jp5OBwDOJO2TCOmBSZeM4xH+g3x5AAAAiE7BAA
DOJO2TCOmBSZeM4xH+g3x5AAAe2nAeAAA=', 'CQAAABYAAADOJO2TCOmBSZeM4xH+g3x5AAAe2wC+')
...
<<Seemed to process the above page successfully, then starts w/ errors >>
File "%USERPROFILE%\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\urllib3\response.py", line 502, in read_chunked
DEBUG:exchangelib.protocol:Server our.host.name: Waiting for session
DEBUG:exchangelib.protocol:Server our.host.name: Got session 5122
DEBUG:exchangelib.util:Session 5122 thread 6408: retry 0 timeout 120000
POST'ing to https://our.host.name/EWS/Exchange.asmx after 1000s wait self._update_chunk_length()
File "%USERPROFILE%\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\urllib3\response.py", line 448, in _update_chunk_length
line = self._fp.fp.readline()
File "%USERPROFILE%\AppData\Local\Continuum\Anaconda3\lib\socket.py", line 575, in readinto
return self._sock.recv_into(b)
File "%USERPROFILE%\AppData\Local\Continuum\Anaconda3\lib\ssl.py", line 929, in recv_into
return self.read(nbytes, buffer)
File "%USERPROFILE%\AppData\Local\Continuum\Anaconda3\lib\ssl.py", line 791, in read
return self._sslobj.read(len, buffer)
File "%USERPROFILE%\AppData\Local\Continuum\Anaconda3\lib\ssl.py", line 575, in read
v = self._sslobj.read(len, buffer)
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "%USERPROFILE%\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\models.py", line 676, in generate
for chunk in self.raw.stream(chunk_size, decode_content=True):
File "%USERPROFILE%\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\urllib3\response.py", line 353, in stream
for line in self.read_chunked(amt, decode_content=decode_content):
File "%USERPROFILE%\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\urllib3\response.py", line 530, in read_chunked
self._original_response.close()
File "%USERPROFILE%\AppData\Local\Continuum\Anaconda3\lib\contextlib.py", line 77, in __exit__
self.gen.throw(type, value, traceback)
File "%USERPROFILE%\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\urllib3\response.py", line 250, in _error_catcher
raise ProtocolError('Connection broken: %r' % e, e)
requests.packages.urllib3.exceptions.ProtocolError: ("Connection broken:
ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None)",
ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "email_test.py", line 27, in <module>
for i, email in enumerate(emails):
File "%WORKSPACE%\nlp\exchangelib\exchangelib\queryset.py", line 390, in _as_items
for i in iterable:
File "%WORKSPACE%\nlp\exchangelib\exchangelib\account.py", line 628, in fetch
shape=IdOnly,
File "%WORKSPACE%\nlp\exchangelib\exchangelib\services.py", line 596, in _pool_requests
for elem in r.get():
File "%USERPROFILE%\AppData\Local\Continuum\Anaconda3\lib\multiprocessing\pool.py", line 608, in get
raise self._value
File "%USERPROFILE%\AppData\Local\Continuum\Anaconda3\lib\multiprocessing\pool.py", line 119, in worker
result = (True, func(*args, **kwds))
File "%WORKSPACE%\nlp\exchangelib\exchangelib\services.py", line 585, in <lambda>
lambda c: self._get_elements(payload=payload_func(c, **kwargs)),
File "%WORKSPACE%\nlp\exchangelib\exchangelib\services.py", line 89, in _get_elements
response = self._get_response_xml(payload=payload)
File "%WORKSPACE%\nlp\exchangelib\exchangelib\services.py", line 163, in _get_response_xml
allow_redirects=False)
File "%WORKSPACE%\nlp\exchangelib\exchangelib\util.py", line 547, in post_ratelimited
_raise_response_errors(r, protocol, log_msg, log_vals) # Always raises an exception
File "%WORKSPACE%\nlp\exchangelib\exchangelib\util.py", line 603, in _raise_response_errors
raise r.headers['TimeoutException']
File "%WORKSPACE%\nlp\exchangelib\exchangelib\util.py", line 494, in post_ratelimited
r = session.post(url=url, headers=headers, data=data, allow_redirects=False, timeout=protocol.TIMEOUT)
File "%USERPROFILE%\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\sessions.py", line 522, in post
return self.request('POST', url, data=data, json=json, **kwargs)
File "%USERPROFILE%\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\sessions.py", line 475, in request
resp = self.send(prep, **send_kwargs)
File "%USERPROFILE%\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\sessions.py", line 628, in send
r.content
File "%USERPROFILE%\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\models.py", line 755, in content
self._content = bytes().join(self.iter_content(CONTENT_CHUNK_SIZE)) or bytes()
File "%USERPROFILE%\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\models.py", line 679, in generate
raise ChunkedEncodingError(e)
requests.exceptions.ChunkedEncodingError: ("Connection broken:
ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None)",
ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
