Dask Dataframe Distributed Process ID Access Denied - python

I am running a set of pandas-like transformations on a dask dataframe, using the "distributed" set-up on my own machine, so with 8 workers corresponding to my computer's 8 cores.
I have the default set-up of a distributed client:
from dask.distributed import Client
c = Client()
The process runs successfully with a small amount of data (1000 records), but when I scale it up only slightly to 7500 records, I get the following warnings:
tornado.application - ERROR - Exception in callback <bound method Nanny.memory_monitor of <Nanny: tcp://127.0.0.1:58103, threads: 1>>
Traceback (most recent call last):
File "/Users/user1/anaconda3/envs/ldaenv/lib/python3.6/site-packages/psutil/_psosx.py", line 348, in catch_zombie
yield
File "/Users/user1/anaconda3/envs/ldaenv/lib/python3.6/site-packages/psutil/_psosx.py", line 387, in _get_pidtaskinfo
ret = cext.proc_pidtaskinfo_oneshot(self.pid)
ProcessLookupError: [Errno 3] No such process
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/user1/anaconda3/envs/ldaenv/lib/python3.6/site-packages/tornado/ioloop.py", line 1026, in _run
return self.callback()
File "/Users/user1/anaconda3/envs/ldaenv/lib/python3.6/site-packages/distributed/nanny.py", line 245, in memory_monitor
memory = psutil.Process(self.process.pid).memory_info().rss
File "/Users/user1/anaconda3/envs/ldaenv/lib/python3.6/site-packages/psutil/_common.py", line 337, in wrapper
return fun(self)
File "/Users/user1/anaconda3/envs/ldaenv/lib/python3.6/site-packages/psutil/__init__.py", line 1049, in memory_info
return self._proc.memory_info()
File "/Users/user1/anaconda3/envs/ldaenv/lib/python3.6/site-packages/psutil/_psosx.py", line 330, in wrapper
return fun(self, *args, **kwargs)
File "/Users/user1/anaconda3/envs/ldaenv/lib/python3.6/site-packages/psutil/_psosx.py", line 456, in memory_info
rawtuple = self._get_pidtaskinfo()
File "/Users/user1/anaconda3/envs/ldaenv/lib/python3.6/site-packages/psutil/_common.py", line 337, in wrapper
return fun(self)
File "/Users/user1/anaconda3/envs/ldaenv/lib/python3.6/site-packages/psutil/_psosx.py", line 387, in _get_pidtaskinfo
ret = cext.proc_pidtaskinfo_oneshot(self.pid)
File "/Users/user1/anaconda3/envs/ldaenv/lib/python3.6/contextlib.py", line 99, in __exit__
self.gen.throw(type, value, traceback)
File "/Users/user1/anaconda3/envs/ldaenv/lib/python3.6/site-packages/psutil/_psosx.py", line 361, in catch_zombie
raise AccessDenied(proc.pid, proc._name)
psutil._exceptions.AccessDenied: psutil.AccessDenied (pid=17998)
This repeats itself multiple times as dask attempts to start the computation block again. After it has failed the number of times specified in the config file, a KilledWorker error is finally raised, e.g. the one below. I've tried this with different lengths of data, and the KilledWorker is sometimes on a melt task, sometimes on an apply task.
KilledWorker Traceback (most recent call last)
<ipython-input-28-7ba288919b51> in <module>()
1 #Optional checkpoint to view output
2 with ProgressBar():
----> 3 output = aggdf.compute()
4 output.head()
~/anaconda3/envs/ldaenv/lib/python3.6/site-packages/dask/base.py in compute(self, **kwargs)
133 dask.base.compute
134 """
--> 135 (result,) = compute(self, traverse=False, **kwargs)
136 return result
137
~/anaconda3/envs/ldaenv/lib/python3.6/site-packages/dask/base.py in compute(*args, **kwargs)
331 postcomputes = [a.__dask_postcompute__() if is_dask_collection(a)
332 else (None, a) for a in args]
--> 333 results = get(dsk, keys, **kwargs)
334 results_iter = iter(results)
335 return tuple(a if f is None else f(next(results_iter), *a)
~/anaconda3/envs/ldaenv/lib/python3.6/site-packages/distributed/client.py in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, **kwargs)
1997 secede()
1998 try:
-> 1999 results = self.gather(packed, asynchronous=asynchronous)
2000 finally:
2001 for f in futures.values():
~/anaconda3/envs/ldaenv/lib/python3.6/site-packages/distributed/client.py in gather(self, futures, errors, maxsize, direct, asynchronous)
1435 return self.sync(self._gather, futures, errors=errors,
1436 direct=direct, local_worker=local_worker,
-> 1437 asynchronous=asynchronous)
1438
1439 #gen.coroutine
~/anaconda3/envs/ldaenv/lib/python3.6/site-packages/distributed/client.py in sync(self, func, *args, **kwargs)
590 return future
591 else:
--> 592 return sync(self.loop, func, *args, **kwargs)
593
594 def __repr__(self):
~/anaconda3/envs/ldaenv/lib/python3.6/site-packages/distributed/utils.py in sync(loop, func, *args, **kwargs)
252 e.wait(1000000)
253 if error[0]:
--> 254 six.reraise(*error[0])
255 else:
256 return result[0]
~/anaconda3/envs/ldaenv/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
~/anaconda3/envs/ldaenv/lib/python3.6/site-packages/distributed/utils.py in f()
236 yield gen.moment
237 thread_state.asynchronous = True
--> 238 result[0] = yield make_coro()
239 except Exception as exc:
240 logger.exception(exc)
~/anaconda3/envs/ldaenv/lib/python3.6/site-packages/tornado/gen.py in run(self)
1053
1054 try:
-> 1055 value = future.result()
1056 except Exception:
1057 self.had_exception = True
~/anaconda3/envs/ldaenv/lib/python3.6/site-packages/tornado/concurrent.py in result(self, timeout)
236 if self._exc_info is not None:
237 try:
--> 238 raise_exc_info(self._exc_info)
239 finally:
240 self = None
~/anaconda3/envs/ldaenv/lib/python3.6/site-packages/tornado/util.py in raise_exc_info(exc_info)
~/anaconda3/envs/ldaenv/lib/python3.6/site-packages/tornado/gen.py in run(self)
1061 if exc_info is not None:
1062 try:
-> 1063 yielded = self.gen.throw(*exc_info)
1064 finally:
1065 # Break up a reference to itself
~/anaconda3/envs/ldaenv/lib/python3.6/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
1313 six.reraise(type(exception),
1314 exception,
-> 1315 traceback)
1316 if errors == 'skip':
1317 bad_keys.add(key)
~/anaconda3/envs/ldaenv/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
KilledWorker: ("('melt-b85b6b6b1aee5b5aaa8d24db1de65b8a', 0)", 'tcp://127.0.0.1:58108')
I'm not very familiar with the distributed or tornado packages, or the underlying architecture of which processes are being created and killed - is anyone able to point me in the right direction to debug/resolve this?
In the meantime I am switching to the default dask dataframe behaviour of multithreaded computation, which works successfully with a large amount of data.
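For reference, a minimal sketch of that fallback (names are illustrative; aggdf stands in for the dataframe built by the transformations described above): with no distributed Client created, dask.dataframe simply uses its default threaded scheduler.
import dask.dataframe as dd
import pandas as pd
# Small illustrative dask dataframe; in the real workflow this would be the
# result of the pandas-like transformations ("aggdf" in the traceback above).
aggdf = dd.from_pandas(pd.DataFrame({"x": range(10), "y": range(10)}), npartitions=2)
# With no dask.distributed Client registered, .compute() runs on the default
# threaded scheduler rather than on distributed worker processes.
output = aggdf.compute()
print(output.head())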

It looks like your workers are dying for some reason. Unfortunately it's not clear from the client side what the cause is. You might consider setting up the cluster manually to get clearer access to the worker logs:
$ dask-scheduler # run this in one terminal
$ dask-worker tcp://localhost:8786 # run this in another
worker logs will appear here
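Once those two commands are running, you can point the client at that scheduler explicitly instead of letting Client() spawn its own local cluster (a short sketch; the address is the one used in the command above):
from dask.distributed import Client
# Connect to the manually started scheduler; worker stdout/stderr (including
# the reason a worker dies) will then be visible in the dask-worker terminal.
c = Client('tcp://localhost:8786')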

Related

How to fix: Connection Refused Error WinError 10061 - On PySpark/jupyter notebook

I'm trying to read a table from a PostgreSQL database.
Previously I was dealing with several errors, just as mentioned here. My solution was to download the PostgreSQL JDBC Driver and add it manually to the "jars" folders, that is, inside "spark-3.2.0-bin-hadoop2.7\jars" and inside "anaconda3\Lib\site-packages\pyspark\jars". After that I was able to connect to the database using:
df = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://_my_host:5432/my_db_name") \
    .option("dbtable", "my_table_name") \
    .option("user", "_my_name") \
    .option("password", "my_password") \
    .option("driver", "org.postgresql.Driver") \
    .load()
and I can execute some commands, like df.printSchema(). So far so good.
But when I try to read the data with df.head() or df.show(5), the following error is thrown:
ERROR:root:Exception while sending command.
Traceback (most recent call last):
File "C:\Users\danid\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3418, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-6-eb589bae8d4b>", line 1, in <module>
df.show(5)
File "C:\Users\danid\anaconda3\lib\site-packages\pyspark\sql\dataframe.py", line 494, in show
print(self._jdf.showString(n, 20, vertical))
File "C:\Users\danid\anaconda3\lib\site-packages\py4j\java_gateway.py", line 1309, in __call__
return_value = get_return_value(
File "C:\Users\danid\anaconda3\lib\site-packages\pyspark\sql\utils.py", line 111, in deco
return f(*a, **kw)
File "C:\Users\danid\anaconda3\lib\site-packages\py4j\protocol.py", line 326, in get_return_value
raise Py4JJavaError(
py4j.protocol.Py4JJavaError: <unprintable Py4JJavaError object>
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\danid\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2045, in showtraceback
stb = value._render_traceback_()
AttributeError: 'Py4JJavaError' object has no attribute '_render_traceback_'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\danid\anaconda3\lib\site-packages\py4j\clientserver.py", line 475, in send_command
answer = smart_decode(self.stream.readline()[:-1])
File "C:\Users\danid\anaconda3\lib\socket.py", line 669, in readinto
return self._sock.recv_into(b)
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\danid\anaconda3\lib\site-packages\py4j\java_gateway.py", line 1038, in send_command
response = connection.send_command(command)
File "C:\Users\danid\anaconda3\lib\site-packages\py4j\clientserver.py", line 503, in send_command
raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
[... skipping hidden 1 frame]
<ipython-input-6-eb589bae8d4b> in <module>
----> 1 df.show(5)
~\anaconda3\lib\site-packages\pyspark\sql\dataframe.py in show(self, n, truncate, vertical)
493 if isinstance(truncate, bool) and truncate:
--> 494 print(self._jdf.showString(n, 20, vertical))
495 else:
~\anaconda3\lib\site-packages\py4j\java_gateway.py in __call__(self, *args)
1308 answer = self.gateway_client.send_command(command)
-> 1309 return_value = get_return_value(
1310 answer, self.gateway_client, self.target_id, self.name)
~\anaconda3\lib\site-packages\pyspark\sql\utils.py in deco(*a, **kw)
110 try:
--> 111 return f(*a, **kw)
112 except py4j.protocol.Py4JJavaError as e:
~\anaconda3\lib\site-packages\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
<class 'str'>: (<class 'ConnectionRefusedError'>, ConnectionRefusedError(10061, 'No connection could be made because the target machine actively refused it ', None, 10061, None))
During handling of the above exception, another exception occurred:
ConnectionRefusedError Traceback (most recent call last)
[... skipping hidden 1 frame]
~\anaconda3\lib\site-packages\IPython\core\interactiveshell.py in showtraceback(self, exc_tuple, filename, tb_offset, exception_only, running_compiled_code)
2048 value, tb, tb_offset=tb_offset)
2049
-> 2050 self._showtraceback(etype, value, stb)
2051 if self.call_pdb:
2052 # drop into debugger
~\anaconda3\lib\site-packages\ipykernel\zmqshell.py in _showtraceback(self, etype, evalue, stb)
544 u'traceback' : stb,
545 u'ename' : unicode_type(etype.__name__),
--> 546 u'evalue' : py3compat.safe_unicode(evalue),
547 }
548
~\anaconda3\lib\site-packages\ipython_genutils\py3compat.py in safe_unicode(e)
63 """
64 try:
---> 65 return unicode_type(e)
66 except UnicodeError:
67 pass
~\anaconda3\lib\site-packages\py4j\protocol.py in __str__(self)
469 def __str__(self):
470 gateway_client = self.java_exception._gateway_client
--> 471 answer = gateway_client.send_command(self.exception_cmd)
472 return_value = get_return_value(answer, gateway_client, None, None)
473 # Note: technically this should return a bytestring 'str' rather than
~\anaconda3\lib\site-packages\py4j\java_gateway.py in send_command(self, command, retry, binary)
1034 if `binary` is `True`.
1035 """
-> 1036 connection = self._get_connection()
1037 try:
1038 response = connection.send_command(command)
~\anaconda3\lib\site-packages\py4j\clientserver.py in _get_connection(self)
279
280 if connection is None or connection.socket is None:
--> 281 connection = self._create_new_connection()
282 return connection
283
~\anaconda3\lib\site-packages\py4j\clientserver.py in _create_new_connection(self)
286 self.java_parameters, self.python_parameters,
287 self.gateway_property, self)
--> 288 connection.connect_to_java_server()
289 self.set_thread_connection(connection)
290 return connection
~\anaconda3\lib\site-packages\py4j\clientserver.py in connect_to_java_server(self)
400 self.socket = self.ssl_context.wrap_socket(
401 self.socket, server_hostname=self.java_address)
--> 402 self.socket.connect((self.java_address, self.java_port))
403 self.stream = self.socket.makefile("rb")
404 self.is_connected = True
ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it
I don't know which configuration I should change to fix it.
After a lot of research I ended up with this new knowledge: Spark limits your memory usage and throws an error when that limit is exceeded. So you either have to configure it to allow more memory, or you can use a "cursor"-style approach, i.e. read the data in batches.
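As a sketch of those two options (the values below are illustrative, not taken from the original post): raise the driver memory when the session is built, and/or let the JDBC source pull rows in batches via fetchsize.
from pyspark.sql import SparkSession
# Option 1: give the driver more memory. This must be set before the
# underlying JVM starts, i.e. on a fresh session; "8g" is just an example.
spark = (
    SparkSession.builder
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)
# Option 2: have the JDBC reader fetch rows in batches instead of one huge
# result set; 10000 rows per round trip is an illustrative value.
df = (
    spark.read.format("jdbc")
    .option("url", "jdbc:postgresql://_my_host:5432/my_db_name")
    .option("dbtable", "my_table_name")
    .option("user", "_my_name")
    .option("password", "my_password")
    .option("driver", "org.postgresql.Driver")
    .option("fetchsize", "10000")
    .load()
)
df.show(5)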

py4j.protocol.Py4JNetworkError: Answer from Java side is empty while trying to execute df.show(5)

I'm a newbie in PySpark and I got stuck at one point. I'm trying to analyse a Twitter data dump stored in Parquet files through PySpark. Reading a Parquet file in PySpark on Google Colab works fine up until I try to run df.show(5). I think there is some issue with the memory of the driver and the executor, but I'm not sure, and I also don't know how much I should change it to. I'm using Google Colab Pro+. I have included the entire error below. This is after I build a SparkSession and call spark.read.parquet; when I then try to run df.show(5), it gives me this error.
Error along with all the exceptions:
ERROR:root:Exception while sending command.
Traceback (most recent call last):
File "/content/spark-3.2.0-bin-hadoop3.2/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py", line 480, in send_command
raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/content/spark-3.2.0-bin-hadoop3.2/python/lib/py4j-0.10.9.2-src.zip/py4j/java_gateway.py", line 1038, in send_command
response = connection.send_command(command)
File "/content/spark-3.2.0-bin-hadoop3.2/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py", line 504, in send_command
"Error while sending or receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 2882, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-14-eb589bae8d4b>", line 1, in <module>
df.show(5)
File "/content/spark-3.2.0-bin-hadoop3.2/python/pyspark/sql/dataframe.py", line 494, in show
print(self._jdf.showString(n, 20, vertical))
File "/content/spark-3.2.0-bin-hadoop3.2/python/lib/py4j-0.10.9.2-src.zip/py4j/java_gateway.py", line 1310, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/content/spark-3.2.0-bin-hadoop3.2/python/pyspark/sql/utils.py", line 111, in deco
return f(*a, **kw)
File "/content/spark-3.2.0-bin-hadoop3.2/python/lib/py4j-0.10.9.2-src.zip/py4j/protocol.py", line 328, in get_return_value
format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: <unprintable Py4JJavaError object>
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 1823, in showtraceback
stb = value._render_traceback_()
AttributeError: 'Py4JJavaError' object has no attribute '_render_traceback_'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/content/spark-3.2.0-bin-hadoop3.2/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py", line 480, in send_command
raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/content/spark-3.2.0-bin-hadoop3.2/python/lib/py4j-0.10.9.2-src.zip/py4j/java_gateway.py", line 1038, in send_command
response = connection.send_command(command)
File "/content/spark-3.2.0-bin-hadoop3.2/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py", line 504, in send_command
"Error while sending or receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while sending or receiving
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py in run_code(self, code_obj, result)
2881 #rprint('Running code', repr(code_obj)) # dbg
-> 2882 exec(code_obj, self.user_global_ns, self.user_ns)
2883 finally:
13 frames
<ipython-input-14-eb589bae8d4b> in <module>()
----> 1 df.show(5)
/content/spark-3.2.0-bin-hadoop3.2/python/pyspark/sql/dataframe.py in show(self, n, truncate, vertical)
493 if isinstance(truncate, bool) and truncate:
--> 494 print(self._jdf.showString(n, 20, vertical))
495 else:
/content/spark-3.2.0-bin-hadoop3.2/python/lib/py4j-0.10.9.2-src.zip/py4j/java_gateway.py in __call__(self, *args)
1309 return_value = get_return_value(
-> 1310 answer, self.gateway_client, self.target_id, self.name)
1311
/content/spark-3.2.0-bin-hadoop3.2/python/pyspark/sql/utils.py in deco(*a, **kw)
110 try:
--> 111 return f(*a, **kw)
112 except py4j.protocol.Py4JJavaError as e:
/content/spark-3.2.0-bin-hadoop3.2/python/lib/py4j-0.10.9.2-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
<class 'str'>: (<class 'ConnectionRefusedError'>, ConnectionRefusedError(111, 'Connection refused'))
During handling of the above exception, another exception occurred:
ConnectionRefusedError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py in run_code(self, code_obj, result)
2897 if result is not None:
2898 result.error_in_exec = sys.exc_info()[1]
-> 2899 self.showtraceback()
2900 else:
2901 outflag = 0
/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py in showtraceback(self, exc_tuple, filename, tb_offset, exception_only)
1826 value, tb, tb_offset=tb_offset)
1827
-> 1828 self._showtraceback(etype, value, stb)
1829 if self.call_pdb:
1830 # drop into debugger
/usr/local/lib/python3.7/dist-packages/google/colab/_shell.py in _showtraceback(self, etype, evalue, stb)
131 'traceback': stb,
132 'ename': py3compat.unicode_type(etype.__name__),
--> 133 'evalue': py3compat.safe_unicode(evalue),
134 }
135
/usr/local/lib/python3.7/dist-packages/ipython_genutils/py3compat.py in safe_unicode(e)
63 """
64 try:
---> 65 return unicode_type(e)
66 except UnicodeError:
67 pass
/content/spark-3.2.0-bin-hadoop3.2/python/lib/py4j-0.10.9.2-src.zip/py4j/protocol.py in __str__(self)
469 def __str__(self):
470 gateway_client = self.java_exception._gateway_client
--> 471 answer = gateway_client.send_command(self.exception_cmd)
472 return_value = get_return_value(answer, gateway_client, None, None)
473 # Note: technically this should return a bytestring 'str' rather than
/content/spark-3.2.0-bin-hadoop3.2/python/lib/py4j-0.10.9.2-src.zip/py4j/java_gateway.py in send_command(self, command, retry, binary)
1034 if `binary` is `True`.
1035 """
-> 1036 connection = self._get_connection()
1037 try:
1038 response = connection.send_command(command)
/content/spark-3.2.0-bin-hadoop3.2/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py in _get_connection(self)
279
280 if connection is None or connection.socket is None:
--> 281 connection = self._create_new_connection()
282 return connection
283
/content/spark-3.2.0-bin-hadoop3.2/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py in _create_new_connection(self)
286 self.java_parameters, self.python_parameters,
287 self.gateway_property, self)
--> 288 connection.connect_to_java_server()
289 self.set_thread_connection(connection)
290 return connection
/content/spark-3.2.0-bin-hadoop3.2/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py in connect_to_java_server(self)
400 self.socket = self.ssl_context.wrap_socket(
401 self.socket, server_hostname=self.java_address)
--> 402 self.socket.connect((self.java_address, self.java_port))
403 self.stream = self.socket.makefile("rb")
404 self.is_connected = True
ConnectionRefusedError: [Errno 111] Connection refused
I found the answer. I just configured the driver memory to 12 GB and it worked. I think it wasn't working because the driver was receiving a huge amount of data and the default driver memory of 2-4 GB wasn't able to handle it.
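For reference, a minimal sketch of that configuration (the 12g value mirrors what is described above; the app name and parquet path are placeholders):
from pyspark.sql import SparkSession
# Give the driver more memory before reading the parquet files; this has to be
# set when the session (and its JVM) is created.
spark = (
    SparkSession.builder
    .appName("twitter-parquet")
    .config("spark.driver.memory", "12g")
    .getOrCreate()
)
df = spark.read.parquet("/path/to/twitter_dump.parquet")  # placeholder path
df.show(5)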

Plotly Dash in Jupyter notebook gives "gaierror: [Errno -2] Name or service not known"

I am new to the dash library and ran a simple example from this link:
Medium article.
I wanted to run Plotly Dash in a Jupyter notebook, but this simple example gives me an error.
Dash is installed properly, since the code below does not give any error:
import dash_core_components as dcc
import dash_html_components as html
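For context, the example being run follows the usual JupyterDash pattern, roughly like this (a sketch of the linked tutorial's approach, not the exact code; the layout contents are illustrative):
from jupyter_dash import JupyterDash
import dash_core_components as dcc
import dash_html_components as html
# Minimal app in the style of the linked tutorial.
app = JupyterDash(__name__)
app.layout = html.Div([
    html.H1("Hello Dash"),
    dcc.Graph(figure={"data": [{"x": [1, 2, 3], "y": [4, 1, 2], "type": "bar"}]}),
])
app.run_server(mode='jupyterlab', port=9095, dev_tools_ui=True,
               dev_tools_hot_reload=True, threaded=True)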
On running the example, I get:
Exception in thread Thread-20:
Traceback (most recent call last):
File "/root/anaconda3/envs/epp_test_env/lib/python3.6/threading.py", line 916, in _bootstrap_inner
self.run()
File "/root/anaconda3/envs/epp_test_env/lib/python3.6/threading.py", line 864, in run
self._target(*self._args, **self._kwargs)
File "/root/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/retrying.py", line 49, in wrapped_f
return Retrying(*dargs, **dkw).call(f, *args, **kw)
File "/root/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/retrying.py", line 212, in call
raise attempt.get()
File "/root/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/retrying.py", line 247, in get
six.reraise(self.value[0], self.value[1], self.value[2])
File "/root/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/six.py", line 703, in reraise
raise value
File "/root/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/retrying.py", line 200, in call
attempt = Attempt(fn(*args, **kwargs), attempt_number, False)
File "/root/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/jupyter_dash/jupyter_app.py", line 289, in run
super_run_server(**kwargs)
File "/root/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/dash/dash.py", line 1717, in run_server
self.server.run(host=host, port=port, debug=debug, **flask_run_options)
File "/root/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/flask/app.py", line 990, in run
run_simple(host, port, self, **options)
File "/root/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/werkzeug/serving.py", line 1052, in run_simple
inner()
File "/root/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/werkzeug/serving.py", line 1005, in inner
fd=fd,
File "/root/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/werkzeug/serving.py", line 848, in make_server
host, port, app, request_handler, passthrough_errors, ssl_context, fd=fd
File "/root/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/werkzeug/serving.py", line 740, in __init__
HTTPServer.__init__(self, server_address, handler)
File "/root/anaconda3/envs/epp_test_env/lib/python3.6/socketserver.py", line 456, in __init__
self.server_bind()
File "/root/anaconda3/envs/epp_test_env/lib/python3.6/http/server.py", line 136, in server_bind
socketserver.TCPServer.server_bind(self)
File "/root/anaconda3/envs/epp_test_env/lib/python3.6/socketserver.py", line 470, in server_bind
self.socket.bind(self.server_address)
socket.gaierror: [Errno -2] Name or service not known
---------------------------------------------------------------------------
gaierror Traceback (most recent call last)
~/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/requests/packages/urllib3/connection.py in _new_conn(self)
137 conn = connection.create_connection(
--> 138 (self.host, self.port), self.timeout, **extra_kw)
139
~/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/requests/packages/urllib3/util/connection.py in create_connection(address, timeout, source_address, socket_options)
74
---> 75 for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
76 af, socktype, proto, canonname, sa = res
~/anaconda3/envs/epp_test_env/lib/python3.6/socket.py in getaddrinfo(host, port, family, type, proto, flags)
744 addrlist = []
--> 745 for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
746 af, socktype, proto, canonname, sa = res
gaierror: [Errno -2] Name or service not known
During handling of the above exception, another exception occurred:
NewConnectionError Traceback (most recent call last)
~/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/requests/packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, **response_kw)
593 body=body, headers=headers,
--> 594 chunked=chunked)
595
~/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/requests/packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
360 else:
--> 361 conn.request(method, url, **httplib_request_kw)
362
~/anaconda3/envs/epp_test_env/lib/python3.6/http/client.py in request(self, method, url, body, headers, encode_chunked)
1271 """Send a complete request to the server."""
-> 1272 self._send_request(method, url, body, headers, encode_chunked)
1273
~/anaconda3/envs/epp_test_env/lib/python3.6/http/client.py in _send_request(self, method, url, body, headers, encode_chunked)
1317 body = _encode(body, 'body')
-> 1318 self.endheaders(body, encode_chunked=encode_chunked)
1319
~/anaconda3/envs/epp_test_env/lib/python3.6/http/client.py in endheaders(self, message_body, encode_chunked)
1266 raise CannotSendHeader()
-> 1267 self._send_output(message_body, encode_chunked=encode_chunked)
1268
~/anaconda3/envs/epp_test_env/lib/python3.6/http/client.py in _send_output(self, message_body, encode_chunked)
1037 del self._buffer[:]
-> 1038 self.send(msg)
1039
~/anaconda3/envs/epp_test_env/lib/python3.6/http/client.py in send(self, data)
975 if self.auto_open:
--> 976 self.connect()
977 else:
~/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/requests/packages/urllib3/connection.py in connect(self)
162 def connect(self):
--> 163 conn = self._new_conn()
164 self._prepare_conn(conn)
~/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/requests/packages/urllib3/connection.py in _new_conn(self)
146 raise NewConnectionError(
--> 147 self, "Failed to establish a new connection: %s" % e)
148
NewConnectionError: <requests.packages.urllib3.connection.HTTPConnection object at 0x7f46d007e940>: Failed to establish a new connection: [Errno -2] Name or service not known
During handling of the above exception, another exception occurred:
MaxRetryError Traceback (most recent call last)
~/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
422 retries=self.max_retries,
--> 423 timeout=timeout
424 )
~/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/requests/packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, **response_kw)
642 retries = retries.increment(method, url, error=e, _pool=self,
--> 643 _stacktrace=sys.exc_info()[2])
644 retries.sleep()
~/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/requests/packages/urllib3/util/retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
362 if new_retry.is_exhausted():
--> 363 raise MaxRetryError(_pool, url, error or ResponseError(cause))
364
MaxRetryError: HTTPConnectionPool(host='x86_64-conda-linux-gnu', port=9095): Max retries exceeded with url: /_alive_34d5d5d5-1c51-45a2-88f9-7d8f414f2314 (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x7f46d007e940>: Failed to establish a new connection: [Errno -2] Name or service not known',))
During handling of the above exception, another exception occurred:
ConnectionError Traceback (most recent call last)
<ipython-input-2-15c373e4f88e> in <module>
52
53 app.run_server(mode='jupyterlab', port = 9095, dev_tools_ui=True, #debug=True,
---> 54 dev_tools_hot_reload =True, threaded=True)
~/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/jupyter_dash/jupyter_app.py in run_server(self, mode, width, height, inline_exceptions, **kwargs)
317 )
318
--> 319 wait_for_app()
320
321 if JupyterDash._in_colab:
~/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/retrying.py in wrapped_f(*args, **kw)
47 #six.wraps(f)
48 def wrapped_f(*args, **kw):
---> 49 return Retrying(*dargs, **dkw).call(f, *args, **kw)
50
51 return wrapped_f
~/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/retrying.py in call(self, fn, *args, **kwargs)
210 if not self._wrap_exception and attempt.has_exception:
211 # get() on an attempt with an exception should cause it to be raised, but raise just in case
--> 212 raise attempt.get()
213 else:
214 raise RetryError(attempt)
~/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/retrying.py in get(self, wrap_exception)
245 raise RetryError(self)
246 else:
--> 247 six.reraise(self.value[0], self.value[1], self.value[2])
248 else:
249 return self.value
~/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
701 if value.__traceback__ is not tb:
702 raise value.with_traceback(tb)
--> 703 raise value
704 finally:
705 value = None
~/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/retrying.py in call(self, fn, *args, **kwargs)
198 while True:
199 try:
--> 200 attempt = Attempt(fn(*args, **kwargs), attempt_number, False)
201 except:
202 tb = sys.exc_info()
~/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/jupyter_dash/jupyter_app.py in wait_for_app()
305 )
306 def wait_for_app():
--> 307 res = requests.get(alive_url).content.decode()
308 if res != "Alive":
309 url = "http://{host}:{port}".format(
~/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/requests/api.py in get(url, params, **kwargs)
68
69 kwargs.setdefault('allow_redirects', True)
---> 70 return request('get', url, params=params, **kwargs)
71
72
~/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/requests/api.py in request(method, url, **kwargs)
54 # cases, and look like a memory leak in others.
55 with sessions.Session() as session:
---> 56 return session.request(method=method, url=url, **kwargs)
57
58
~/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
486 }
487 send_kwargs.update(settings)
--> 488 resp = self.send(prep, **send_kwargs)
489
490 return resp
~/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/requests/sessions.py in send(self, request, **kwargs)
607
608 # Send the request
--> 609 r = adapter.send(request, **kwargs)
610
611 # Total elapsed time of the request (approximately)
~/anaconda3/envs/epp_test_env/lib/python3.6/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
485 raise ProxyError(e, request=request)
486
--> 487 raise ConnectionError(e, request=request)
488
489 except ClosedPoolError as e:
ConnectionError: HTTPConnectionPool(host='x86_64-conda-linux-gnu', port=9095): Max retries exceeded with url: /_alive_34d5d5d5-1c51-45a2-88f9-7d8f414f2314 (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x7f46d007e940>: Failed to establish a new connection: [Errno -2] Name or service not known',))
I am working on a CentOS Linux machine inside a virtual environment.
Please suggest what I should look into.
Apologies if this is a stupid mistake on my side.
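One thing that might be worth trying (a hypothetical workaround sketch, not from the original post): the traceback shows the alive-check trying to reach host 'x86_64-conda-linux-gnu', which is not resolvable, so binding explicitly to the loopback address avoids the hostname lookup.
# Hypothetical workaround: pass an explicit, resolvable host to run_server so
# neither Flask nor JupyterDash's alive-check has to resolve the machine name.
app.run_server(mode='jupyterlab', host='127.0.0.1', port=9095,
               dev_tools_ui=True, dev_tools_hot_reload=True, threaded=True)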

pandas merge command failing in parallel loop - "ValueError: buffer source array is read-only"

I am writing a bootstrap algorithm using parallel loops and pandas. The problem I experience is that a merge command inside the parallel loop causes a "ValueError: buffer source array is read-only" error, but only if I use the full dataset to merge (120k lines). Any subset with fewer than 12k lines works just fine, so I infer it is not a syntax problem. What can I do?
Current pandas version is 0.24.2 and cython is 0.29.7.
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
File "/home/ubuntu/.local/lib/python3.6/site-packages/joblib/externals/loky/process_executor.py", line 418, in _process_worker
r = call_item()
File "/home/ubuntu/.local/lib/python3.6/site-packages/joblib/externals/loky/process_executor.py", line 272, in __call__
return self.fn(*self.args, **self.kwargs)
File "/home/ubuntu/.local/lib/python3.6/site-packages/joblib/_parallel_backends.py", line 567, in __call__
return self.func(*args, **kwargs)
File "/home/ubuntu/.local/lib/python3.6/site-packages/joblib/parallel.py", line 225, in __call__
for func, args, kwargs in self.items]
File "/home/ubuntu/.local/lib/python3.6/site-packages/joblib/parallel.py", line 225, in <listcomp>
for func, args, kwargs in self.items]
File "<ipython-input-72-cdb83eaf594c>", line 12, in bootstrap
File "/home/ubuntu/.local/lib/python3.6/site-packages/pandas/core/frame.py", line 6868, in merge
copy=copy, indicator=indicator, validate=validate)
File "/home/ubuntu/.local/lib/python3.6/site-packages/pandas/core/reshape/merge.py", line 48, in merge
return op.get_result()
File "/home/ubuntu/.local/lib/python3.6/site-packages/pandas/core/reshape/merge.py", line 546, in get_result
join_index, left_indexer, right_indexer = self._get_join_info()
File "/home/ubuntu/.local/lib/python3.6/site-packages/pandas/core/reshape/merge.py", line 756, in _get_join_info
right_indexer) = self._get_join_indexers()
File "/home/ubuntu/.local/lib/python3.6/site-packages/pandas/core/reshape/merge.py", line 735, in _get_join_indexers
how=self.how)
File "/home/ubuntu/.local/lib/python3.6/site-packages/pandas/core/reshape/merge.py", line 1130, in _get_join_indexers
llab, rlab, shape = map(list, zip(* map(fkeys, left_keys, right_keys)))
File "/home/ubuntu/.local/lib/python3.6/site-packages/pandas/core/reshape/merge.py", line 1662, in _factorize_keys
rlab = rizer.factorize(rk)
File "pandas/_libs/hashtable.pyx", line 111, in pandas._libs.hashtable.Int64Factorizer.factorize
File "stringsource", line 653, in View.MemoryView.memoryview_cwrapper
File "stringsource", line 348, in View.MemoryView.memoryview.__cinit__
ValueError: buffer source array is read-only
"""
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
<ipython-input-73-652c1db5701b> in <module>()
1 num_cores = multiprocessing.cpu_count()
----> 2 results = Parallel(n_jobs=num_cores, prefer='processes', verbose = 5)(delayed(bootstrap)() for i in range(n_trials))
3 #pd.DataFrame(results[0])
~/.local/lib/python3.6/site-packages/joblib/parallel.py in __call__(self, iterable)
932
933 with self._backend.retrieval_context():
--> 934 self.retrieve()
935 # Make sure that we get a last message telling us we are done
936 elapsed_time = time.time() - self._start_time
~/.local/lib/python3.6/site-packages/joblib/parallel.py in retrieve(self)
831 try:
832 if getattr(self._backend, 'supports_timeout', False):
--> 833 self._output.extend(job.get(timeout=self.timeout))
834 else:
835 self._output.extend(job.get())
~/.local/lib/python3.6/site-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout)
519 AsyncResults.get from multiprocessing."""
520 try:
--> 521 return future.result(timeout=timeout)
522 except LokyTimeoutError:
523 raise TimeoutError()
/usr/lib/python3.6/concurrent/futures/_base.py in result(self, timeout)
430 raise CancelledError()
431 elif self._state == FINISHED:
--> 432 return self.__get_result()
433 else:
434 raise TimeoutError()
/usr/lib/python3.6/concurrent/futures/_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
ValueError: buffer source array is read-only
and the code is
def bootstrap():
    df_resample_ids = skl.utils.resample(ob_ids)
    df_resample_ids = pd.DataFrame(df_resample_ids).sort_values(by="0").reset_index(drop=True)
    df_resample_ids.columns = [ob_id_field]
    df_resample = pd.DataFrame(df_resample_ids.merge(df, on=ob_id_field))
    return df_resample

num_cores = multiprocessing.cpu_count()
results = Parallel(n_jobs=num_cores, prefer='processes', verbose=5)(delayed(bootstrap)() for i in range(n_trials))
The algorithm will create resampled/replaced IDs from an ID variable and use the merge command to create a new dataset based on the resampled IDs and the original dataset stored in df. If I cut out a subset of the original dataset (anywhere) leaving fewer than ~12k lines, the parallel loop finishes without an error and does what is expected.
As requested, below is a new snippet to re-create the data structures and mirror the principal approach I am currently working on:
import numpy as np
import pandas as pd
import sklearn as skl
import multiprocessing
from joblib import Parallel, delayed

# Mock data in the same shape as the original dataframe
df = pd.DataFrame(np.random.randn(200000, 24), columns=list('ABCDDEFGHIJKLMNOPQRSTUVW'))
df["ID"] = df.index.drop_duplicates().tolist()
ob_ids = df.index.drop_duplicates().tolist()

def bootstrap2():
    df_resample_ids = skl.utils.resample(ob_ids)
    df_resample_ids = pd.DataFrame(df_resample_ids).sort_values(by=0).reset_index(drop=True)
    df_resample_ids.columns = ['ID']
    df_resample = pd.DataFrame(df.merge(df_resample_ids, on='ID'))
    result = df_resample
    return result

n_trials = 100  # placeholder value; not specified in the original snippet
num_cores = multiprocessing.cpu_count()
results = Parallel(n_jobs=num_cores, prefer='processes', verbose=5)(delayed(bootstrap2)() for i in range(n_trials))
However, I notice that when the data is completely made up of np.random numbers, the loop goes through without an error. The dtypes of the original dataframe are:
start_rtg int64
end_rtg float64
days_diff float64
ultimate_customer_system_id int64
How can I avoid the read-only error?
Posting an answer to my own question, as I found that one of the variables was of int64 dtype. When I converted all variables to float64, the error disappeared, so it is an issue that is restricted to certain dtypes only...
Cheers,
Stephan
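A minimal sketch of that workaround, assuming the merge keys include the int64 columns listed in the question (the column names are reused from the dtypes shown; the cast itself is the fix described above):
# Cast the int64 columns to float64 before the dataframe is handed to the
# parallel workers; after this the merge inside bootstrap() no longer hits the
# read-only buffer error, per the answer above.
df = df.astype({
    "start_rtg": "float64",
    "ultimate_customer_system_id": "float64",
})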

using Dask library to merge two large dataframes

I am very new to dask. I am trying to merge two dataframes (one from a small file that would fit in a pandas dataframe, but I'm using it as a dask dataframe for convenience; the other is really large). I try to save the result to a CSV file since I know it might not fit in a dataframe.
import pandas as pd
import dask.dataframe as dd
AF=dd.read_csv("../data/AuthorFieldOfStudy.csv")
AF.columns=['AID','FID']
#extract subset of Authors to reduce final merge size
AF = AF.loc[AF['FID'] == '0271BC14']
#This is a large file 9 MB
PAA=dd.read_csv("../data/PAA.csv")
PAA.columns=['PID','AID', 'AffID']
result = dd.merge(AF,PAA, on='AID')
result.to_csv("../data/CompSciPaperAuthorAffiliations.csv").compute()
I get the following error, and don't quite understand it:
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-1-6b2f889f44ff> in <module>()
14 result = dd.merge(AF,PAA, on='AID')
15
---> 16 result.to_csv("../data/CompSciPaperAuthorAffiliations.csv").compute()
/usr/local/lib/python2.7/dist-packages/dask/dataframe/core.pyc in to_csv(self, filename, **kwargs)
936 """ See dd.to_csv docstring for more information """
937 from .io import to_csv
--> 938 return to_csv(self, filename, **kwargs)
939
940 def to_delayed(self):
/usr/local/lib/python2.7/dist-packages/dask/dataframe/io/csv.pyc in to_csv(df, filename, name_function, compression, compute, get, **kwargs)
411 if compute:
412 from dask import compute
--> 413 compute(*values, get=get)
414 else:
415 return values
/usr/local/lib/python2.7/dist-packages/dask/base.pyc in compute(*args, **kwargs)
177 dsk = merge(var.dask for var in variables)
178 keys = [var._keys() for var in variables]
--> 179 results = get(dsk, keys, **kwargs)
180
181 results_iter = iter(results)
/usr/local/lib/python2.7/dist-packages/dask/threaded.pyc in get(dsk, result, cache, num_workers, **kwargs)
74 results = get_async(pool.apply_async, len(pool._pool), dsk, result,
75 cache=cache, get_id=_thread_get_id,
---> 76 **kwargs)
77
78 # Cleanup pools associated to dead threads
/usr/local/lib/python2.7/dist-packages/dask/async.pyc in get_async(apply_async, num_workers, dsk, result, cache, get_id, raise_on_exception, rerun_exceptions_locally, callbacks, dumps, loads, **kwargs)
491 _execute_task(task, data) # Re-execute locally
492 else:
--> 493 raise(remote_exception(res, tb))
494 state['cache'][key] = res
495 finish_task(dsk, key, state, results, keyorder.get)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc5 in position 14: ordinal not in range(128)
Traceback
---------
File "/usr/local/lib/python2.7/dist-packages/dask/async.py", line 268, in execute_task
result = _execute_task(task, data)
File "/usr/local/lib/python2.7/dist-packages/dask/async.py", line 249, in _execute_task
return func(*args2)
File "/usr/local/lib/python2.7/dist-packages/dask/dataframe/shuffle.py", line 329, in collect
res = p.get(part)
File "/usr/local/lib/python2.7/dist-packages/partd/core.py", line 73, in get
return self.get([keys], **kwargs)[0]
File "/usr/local/lib/python2.7/dist-packages/partd/core.py", line 79, in get
return self._get(keys, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/partd/encode.py", line 30, in _get
for chunk in raw]
File "/usr/local/lib/python2.7/dist-packages/partd/pandas.py", line 144, in deserialize
for block, dt, shape in zip(b_blocks, dtypes, shapes)]
File "/usr/local/lib/python2.7/dist-packages/partd/numpy.py", line 127, in deserialize
l = decode(l)
File "/usr/local/lib/python2.7/dist-packages/partd/numpy.py", line 114, in decode
return list(map(decode, o))
File "/usr/local/lib/python2.7/dist-packages/partd/numpy.py", line 110, in decode
return [item.decode() for item in o]
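No answer is shown here, but the 'ascii' codec failure during the shuffle suggests non-ASCII bytes in the CSVs being decoded with the default codec under Python 2. One thing that may be worth trying (a sketch under that assumption; "utf-8" is a guess about the files' actual encoding) is to read both files with an explicit encoding:
import dask.dataframe as dd
# Same reads as in the question, but with an explicit encoding so non-ASCII
# bytes are decoded up front rather than during the merge's shuffle step.
AF = dd.read_csv("../data/AuthorFieldOfStudy.csv", encoding="utf-8")
AF.columns = ['AID', 'FID']
PAA = dd.read_csv("../data/PAA.csv", encoding="utf-8")
PAA.columns = ['PID', 'AID', 'AffID']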
