Having problems while using dask map_partitions with a string matching algorithm - python

I'm having some problems applying a text search algorithm with parallelized dask infrastructure.
I'm trying to find the best match for each of 40,000 strings in a Series object against a list of 4,000 strings.
I could have done it using pandas.apply, but it's too time-consuming, so I decided to try parallelization with map_partitions in dask.
I'm using this fuzzy text search library with python-Levenshtein: https://marcobonzanini.com/2015/02/25/fuzzy-string-matching-in-python
As you can see, it works fine on this example from a pandas dataset:
process.extractOne(df['endereco2'][1],choices=choices,scorer=fuzz.token_set_ratio,
score_cutoff=60)
Output: ('R ALVARO DUARTE DE ALMEIDA PROFESSOR', 85)
But it's not working when using dask:
from dask import dataframe as dd
sd = dd.from_pandas(r13_2["endereco2"],npartitions=3).map_partitions(lambda df : df.apply(process.extractOne,choices=choices,scorer=fuzz.token_set_ratio,score_cutoff=60)).compute(scheduler='processes')
Output:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-69-f39ab0d086b5> in <module>
1 from dask import dataframe as dd
----> 2 sd = dd.from_pandas(r13_2["endereco2"],npartitions=3).map_partitions(lambda df : df.apply(process.extractOne,choices=choices,scorer=fuzz.token_set_ratio,score_cutoff=60)).compute(scheduler='processes')
~\Anaconda3\envs\mono\lib\site-packages\dask\base.py in compute(self, **kwargs)
154 dask.base.compute
155 """
--> 156 (result,) = compute(self, traverse=False, **kwargs)
157 return result
158
~\Anaconda3\envs\mono\lib\site-packages\dask\base.py in compute(*args, **kwargs)
396 keys = [x.__dask_keys__() for x in collections]
397 postcomputes = [x.__dask_postcompute__() for x in collections]
--> 398 results = schedule(dsk, keys, **kwargs)
399 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
400
~\Anaconda3\envs\mono\lib\site-packages\dask\multiprocessing.py in get(dsk, keys, num_workers, func_loads, func_dumps, optimize_graph, pool, **kwargs)
190 get_id=_process_get_id, dumps=dumps, loads=loads,
191 pack_exception=pack_exception,
--> 192 raise_exception=reraise, **kwargs)
193 finally:
194 if cleanup:
~\Anaconda3\envs\mono\lib\site-packages\dask\local.py in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)
460 _execute_task(task, data) # Re-execute locally
461 else:
--> 462 raise_exception(exc, tb)
463 res, worker_id = loads(res_info)
464 state['cache'][key] = res
~\Anaconda3\envs\mono\lib\site-packages\dask\compatibility.py in reraise(exc, tb)
109 def reraise(exc, tb=None):
110 if exc.__traceback__ is not tb:
--> 111 raise exc.with_traceback(tb)
112 raise exc
113
~\Anaconda3\envs\mono\lib\site-packages\dask\local.py in execute_task()
228 try:
229 task, data = loads(task_info)
--> 230 result = _execute_task(task, data)
231 id = get_id()
232 result = dumps((result, id))
~\Anaconda3\envs\mono\lib\site-packages\dask\core.py in _execute_task()
117 func, args = arg[0], arg[1:]
118 args2 = [_execute_task(a, cache) for a in args]
--> 119 return func(*args2)
120 elif not ishashable(arg):
121 return arg
~\Anaconda3\envs\mono\lib\site-packages\dask\optimization.py in __call__()
940 % (len(self.inkeys), len(args)))
941 return core.get(self.dsk, self.outkey,
--> 942 dict(zip(self.inkeys, args)))
943
944 def __reduce__(self):
~\Anaconda3\envs\mono\lib\site-packages\dask\core.py in get()
147 for key in toposort(dsk):
148 task = dsk[key]
--> 149 result = _execute_task(task, cache)
150 cache[key] = result
151 result = _execute_task(out, cache)
~\Anaconda3\envs\mono\lib\site-packages\dask\core.py in _execute_task()
117 func, args = arg[0], arg[1:]
118 args2 = [_execute_task(a, cache) for a in args]
--> 119 return func(*args2)
120 elif not ishashable(arg):
121 return arg
~\Anaconda3\envs\mono\lib\site-packages\dask\compatibility.py in apply()
91 def apply(func, args, kwargs=None):
92 if kwargs:
---> 93 return func(*args, **kwargs)
94 else:
95 return func(*args)
~\Anaconda3\envs\mono\lib\site-packages\dask\dataframe\core.py in apply_and_enforce()
3877 func = kwargs.pop('_func')
3878 meta = kwargs.pop('_meta')
-> 3879 df = func(*args, **kwargs)
3880 if is_dataframe_like(df) or is_series_like(df) or is_index_like(df):
3881 if not len(df):
<ipython-input-69-f39ab0d086b5> in <lambda>()
1 from dask import dataframe as dd
----> 2 sd = dd.from_pandas(r13_2["endereco2"],npartitions=3).map_partitions(lambda df : df.apply(process.extractOne,choices=choices,scorer=fuzz.token_set_ratio,score_cutoff=60)).compute(scheduler='processes')
~\Anaconda3\envs\mono\lib\site-packages\pandas\core\series.py in apply()
3589 else:
3590 values = self.astype(object).values
-> 3591 mapped = lib.map_infer(values, f, convert=convert_dtype)
3592
3593 if len(mapped) and isinstance(mapped[0], Series):
pandas/_libs/lib.pyx in pandas._libs.lib.map_infer()
~\Anaconda3\envs\mono\lib\site-packages\pandas\core\series.py in f()
3576 if kwds or args and not isinstance(func, np.ufunc):
3577 def f(x):
-> 3578 return func(x, *args, **kwds)
3579 else:
3580 f = func
~\Anaconda3\envs\mono\lib\site-packages\fuzzywuzzy\process.py in extractOne()
218 best_list = extractWithoutOrder(query, choices, processor, scorer, score_cutoff)
219 try:
--> 220 return max(best_list, key=lambda i: i[1])
221 except ValueError:
222 return None
~\Anaconda3\envs\mono\lib\site-packages\fuzzywuzzy\process.py in extractWithoutOrder()
76
77 # Run the processor on the input query.
---> 78 processed_query = processor(query)
79
80 if len(processed_query) == 0:
~\Anaconda3\envs\mono\lib\site-packages\fuzzywuzzy\utils.py in full_process()
93 s = asciidammit(s)
94 # Keep only Letters and Numbers (see Unicode docs).
---> 95 string_out = StringProcessor.replace_non_letters_non_numbers_with_whitespace(s)
96 # Force into lowercase.
97 string_out = StringProcessor.to_lower_case(string_out)
~\Anaconda3\envs\mono\lib\site-packages\fuzzywuzzy\string_processing.py in replace_non_letters_non_numbers_with_whitespace()
24 numbers with a single white space.
25 """
---> 26 return cls.regex.sub(" ", a_string)
27
28 strip = staticmethod(string.strip)
TypeError: expected string or bytes-like object
What's happening?
Obs: I solved my problem using pool.apply from the multiprocessing lib, but I still want to know what happened with Dask.
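For reference, a minimal sketch of a multiprocessing-based workaround along those lines (this uses pool.map rather than pool.apply for brevity; the best_match helper, the worker count, and the module-level choices / r13_2 objects are illustrative assumptions):

from multiprocessing import Pool
from fuzzywuzzy import fuzz, process

def best_match(s):
    # Same fuzzy lookup as above, applied to one string at a time.
    return process.extractOne(s, choices=choices,
                              scorer=fuzz.token_set_ratio, score_cutoff=60)

if __name__ == '__main__':
    with Pool(processes=4) as pool:
        # Distribute the 40,000 query strings across worker processes.
        results = pool.map(best_match, r13_2['endereco2'].tolist())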

Doing the MCVE I realized that it was a naive syntax problem: I can't use map_partitions on a dask dataframe without specifying the column that I'm using, even if there is only one column. So I should have used sd[0].map_partitions instead of sd.map_partitions.
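A minimal sketch of the corrected call under that reading, wrapping the data as a one-column dask dataframe and selecting the column before map_partitions (the column label and the meta hint are illustrative):

from dask import dataframe as dd
from fuzzywuzzy import fuzz, process

sd = dd.from_pandas(r13_2[["endereco2"]], npartitions=3)

# Select the single column first, then map over its partitions.
matches = sd["endereco2"].map_partitions(
    lambda part: part.apply(process.extractOne, choices=choices,
                            scorer=fuzz.token_set_ratio, score_cutoff=60),
    meta=("endereco2", "object"),
).compute(scheduler="processes")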


Parameter estimation with parmest and pyomo model

I have the enzymatic reactions:
R + L <-> Y
R + I <-> X
described by the following system of coupled differential equations:
dR/dt = k2*Y(t) - k1*R(t)*L(t) + k4*X(t) - k3*R(t)*I(t)
dL/dt = k2*Y(t) - k1*R(t)*L(t)
dI/dt = k4*X(t) - k3*R(t)*I(t)
dY/dt = k1*R(t)*L(t) - k2*Y(t)
dX/dt = k3*R(t)*I(t) - k4*X(t)
The parameters k1 and k2 are known. I want to estimate the values of k3 and k4 using a pyomo model and a set of Y measurements over time. This is my code:
#Modules
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import scipy.stats as stats
import scipy.optimize as optimize
from pyomo.environ import *
from pyomo.dae import *
from pyomo.dae.simulator import Simulator
import pyomo.contrib.parmest.parmest as parmest
from pyomo.contrib.parmest.examples.reactor_design.reactor_design import reactor_design_model
import sys
import os.path
#Experimental data
texp=[0,0.5,0.75,1,1.25,2,2.77,3.55,4.32,5.1,5.87,6.65,7.42,8.2,8.97,13.92,\
18.92,23.92,28.92,33.92,38.92,43.92,48.92,53.92,58.92,63.92,68.92,83.9,\
98.9,113.9,128.9,143.9,158.9,173.9,188.9,203.9,218.9,233.9,248.9]
yexp=[0,21.00084301,-54.20967226,-12.0118567,-25.27198718,-1.764831016,\
10.29814076,-5.340599221,6.988265971,9.56252586,-3.705303123,1.063813346,\
12.32611118,7.763248428,9.074028389,20.60003402,22.1001936,23.13229101,\
27.31536018,25.00455108,31.70315201,35.10288809,38.0816535,35.30253723,\
36.81655545,36.11171691,41.57221204,42.47852506,46.28315167,42.66070948,\
44.73318881,37.36241544,39.69557981,38.71667563,37.49757832,42.35943236,\
41.68017195,44.91883581,47.80088108]
df=pd.DataFrame()
df['t']=texp
df['1Cc']=yexp
#MODEL
def create_model(data):
    m = ConcreteModel()
    # Parameters to estimate
    m.k3 = Var(initialize=1e8, within=PositiveReals)
    m.k3.fixed = True
    m.k4 = Var(initialize=0.01, within=PositiveReals)
    m.k3.fixed = True
    # Known parameters
    k1 = 3.58e6
    k2 = 1.25e-1
    # Data
    m.t = ContinuousSet(initialize=texp)
    # Variables
    m.Y = Var(m.t)
    m.X = Var(m.t)
    m.R = Var(m.t)
    m.L = Var(m.t)
    m.I = Var(m.t)
    m.dYdt = DerivativeVar(m.Y, wrt=m.t)
    m.dXdt = DerivativeVar(m.X, wrt=m.t)
    m.dRdt = DerivativeVar(m.R, wrt=m.t)
    m.dLdt = DerivativeVar(m.L, wrt=m.t)
    m.dIdt = DerivativeVar(m.I, wrt=m.t)
    # Initial conditions
    m.Y[0] = 0.0
    m.X[0] = 0.0
    m.R[0] = 0.5e-9
    m.L[0] = 30e-9
    m.I[0] = 1e-9
    # Constraints
    def DiffX(m, t):
        return m.dXdt[t] == -m.k4*m.X[t] + m.k3*m.R[t]*m.I[t]
    m.XC = Constraint(m.t, rule=DiffX)
    def DiffR(m, t):
        return m.dRdt[t] == k2*m.Y[t] - k1*m.R[t]*m.L[t] + m.k4*m.X[t] - m.k3*m.R[t]*m.I[t]
    m.RC = Constraint(m.t, rule=DiffR)
    def DiffL(m, t):
        return m.dLdt[t] == k2*m.Y[t] - k1*m.R[t]*m.L[t]
    m.LC = Constraint(m.t, rule=DiffL)
    def DiffI(m, t):
        return m.dIdt[t] == m.k4*m.X[t] - m.k3*m.R[t]*m.I[t]
    m.IC = Constraint(m.t, rule=DiffI)
    def DiffY(m, t):
        return m.dYdt[t] == -k2*m.Y[t] + k1*m.R[t]*m.L[t]
    m.YC = Constraint(m.t, rule=DiffY)
    return m

def main():
    # Vars to estimate
    theta_names = ['k3', 'k4']
    # Data
    data = df
    def SSE(model, data):
        expr = (data['1Cc'] - model.Y)**2
        return expr
    # Instance of the parmest estimator
    pest = parmest.Estimator(create_model, data, theta_names, SSE)
    # Parameter estimation
    obj, theta = pest.theta_est()
    # Assert
    k3_expected = 1e8
    k4_expected = 0.01
    relative_error = abs(theta['k3'] - k3_expected)/k3_expected
    assert relative_error < 0.05
    relative_error = abs(theta['k4'] - k4_expected)/k4_expected
    assert relative_error < 0.05

if __name__ == "__main__":
    main()
I get the following error:
ERROR: Rule failed for Expression 'SecondStageCost' with index None:
TypeError: 'float' object cannot be interpreted as an integer
ERROR: Constructing component 'SecondStageCost' from data=None failed:
TypeError: 'float' object cannot be interpreted as an integer
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
File ~\AppData\Roaming\Python\Python39\site-packages\pyomo\contrib\parmest\parmest.py:143, in _experiment_instance_creation_callback(scenario_name, node_names, cb_data)
142 try:
--> 143 instance = callback(experiment_number = exp_num, cb_data = cb_data)
144 except TypeError:
File ~\AppData\Roaming\Python\Python39\site-packages\pyomo\contrib\parmest\parmest.py:391, in Estimator._instance_creation_callback(self, experiment_number, cb_data)
390 raise RuntimeError(f'Unexpected data format for cb_data={cb_data}')
--> 391 model = self._create_parmest_model(exp_data)
393 return model
File ~\AppData\Roaming\Python\Python39\site-packages\pyomo\contrib\parmest\parmest.py:366, in Estimator._create_parmest_model(self, data)
365 model.FirstStageCost = pyo.Expression(rule=FirstStageCost_rule)
--> 366 model.SecondStageCost = pyo.Expression(rule=_SecondStageCostExpr(self.obj_function, data))
368 def TotalCost_rule(model):
File ~\AppData\Roaming\Python\Python39\site-packages\pyomo\core\base\block.py:544, in _BlockData.__setattr__(self, name, val)
540 if isinstance(val, Component):
541 #
542 # Pyomo components are added with the add_component method.
543 #
--> 544 self.add_component(name, val)
545 else:
546 #
547 # Other Python objects are added with the standard __setattr__
548 # method.
549 #
File ~\AppData\Roaming\Python\Python39\site-packages\pyomo\core\base\block.py:1089, in _BlockData.add_component(self, name, val)
1088 try:
-> 1089 val.construct(data)
1090 except:
File ~\AppData\Roaming\Python\Python39\site-packages\pyomo\core\base\expression.py:369, in Expression.construct(self, data)
368 assert data is None
--> 369 self._construct_from_rule_using_setitem()
370 finally:
File ~\AppData\Roaming\Python\Python39\site-packages\pyomo\core\base\indexed_component.py:708, in IndexedComponent._construct_from_rule_using_setitem(self)
705 elif rule.constant():
706 # Slight optimization: if the initializer is known to be
707 # constant, then only call the rule once.
--> 708 val = rule(block, None)
709 for index in self.index_set():
File ~\AppData\Roaming\Python\Python39\site-packages\pyomo\core\base\initializer.py:373, in ScalarCallInitializer.__call__(self, parent, idx)
372 def __call__(self, parent, idx):
--> 373 return self._fcn(parent)
File ~\AppData\Roaming\Python\Python39\site-packages\pyomo\contrib\parmest\parmest.py:270, in _SecondStageCostExpr.__call__(self, model)
269 def __call__(self, model):
--> 270 return self._ssc_function(model, self._data)
Input In [35], in main.<locals>.SSE(model, data)
91 def SSE(model,data):
---> 92 expr=(data['1Cc']-model.Y)**2
93 return expr
File ~\AppData\Roaming\Python\Python39\site-packages\pandas\core\ops\common.py:70, in _unpack_zerodim_and_defer.<locals>.new_method(self, other)
68 other = item_from_zerodim(other)
---> 70 return method(self, other)
File ~\AppData\Roaming\Python\Python39\site-packages\pandas\core\arraylike.py:108, in OpsMixin.__sub__(self, other)
106 #unpack_zerodim_and_defer("__sub__")
107 def __sub__(self, other):
--> 108 return self._arith_method(other, operator.sub)
File ~\AppData\Roaming\Python\Python39\site-packages\pandas\core\series.py:5639, in Series._arith_method(self, other, op)
5638 self, other = ops.align_method_SERIES(self, other)
-> 5639 return base.IndexOpsMixin._arith_method(self, other, op)
File ~\AppData\Roaming\Python\Python39\site-packages\pandas\core\base.py:1295, in IndexOpsMixin._arith_method(self, other, op)
1294 with np.errstate(all="ignore"):
-> 1295 result = ops.arithmetic_op(lvalues, rvalues, op)
1297 return self._construct_result(result, name=res_name)
File ~\AppData\Roaming\Python\Python39\site-packages\pandas\core\ops\array_ops.py:222, in arithmetic_op(left, right, op)
220 _bool_arith_check(op, left, right)
--> 222 res_values = _na_arithmetic_op(left, right, op)
224 return res_values
File ~\AppData\Roaming\Python\Python39\site-packages\pandas\core\ops\array_ops.py:163, in _na_arithmetic_op(left, right, op, is_cmp)
162 try:
--> 163 result = func(left, right)
164 except TypeError:
File ~\AppData\Roaming\Python\Python39\site-packages\pandas\core\computation\expressions.py:239, in evaluate(op, a, b, use_numexpr)
237 if use_numexpr:
238 # error: "None" not callable
--> 239 return _evaluate(op, op_str, a, b) # type: ignore[misc]
240 return _evaluate_standard(op, op_str, a, b)
File ~\AppData\Roaming\Python\Python39\site-packages\pandas\core\computation\expressions.py:128, in _evaluate_numexpr(op, op_str, a, b)
127 if result is None:
--> 128 result = _evaluate_standard(op, op_str, a, b)
130 return result
File ~\AppData\Roaming\Python\Python39\site-packages\pandas\core\computation\expressions.py:69, in _evaluate_standard(op, op_str, a, b)
68 _store_test_result(False)
---> 69 return op(a, b)
File ~\AppData\Roaming\Python\Python39\site-packages\pyomo\core\base\indexed_component.py:1113, in IndexedComponent_NDArrayMixin.__array_ufunc__(self, ufunc, method, *inputs, **kwargs)
1112 def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
-> 1113 return NumericNDArray.__array_ufunc__(
1114 None, ufunc, method, *inputs, **kwargs)
File pyomo\core\expr\numvalue.pyx:997, in pyomo.core.expr.numvalue.NumericNDArray.__array_ufunc__()
File ~\AppData\Roaming\Python\Python39\site-packages\pyomo\core\base\indexed_component.py:1107, in IndexedComponent_NDArrayMixin.__array__(self, dtype)
1106 shape = tuple(b+1 for b in bounds[1])
-> 1107 ans = NumericNDArray(shape=shape, dtype=object)
1108 for k, v in self.items():
TypeError: 'float' object cannot be interpreted as an integer
During handling of the above exception, another exception occurred:
RuntimeError Traceback (most recent call last)
Input In [35], in <cell line: 110>()
107 assert relative_error<0.05
110 if __name__=="__main__":
--> 111 main()
Input In [35], in main()
96 pest=parmest.Estimator(create_model,data,theta_names,SSE)
98 #Parameter estimation
---> 99 obj,theta=pest.theta_est()
101 #Assert
102 k3_expected=1e8
File ~\AppData\Roaming\Python\Python39\site-packages\pyomo\contrib\parmest\parmest.py:687, in Estimator.theta_est(self, solver, return_values, calc_cov, cov_n)
684 assert isinstance(cov_n, int), "The number of datapoints that are used in the objective function is required to calculate the covariance matrix"
685 assert cov_n > len(self.theta_names), "The number of datapoints must be greater than the number of parameters to estimate"
--> 687 return self._Q_opt(solver=solver, return_values=return_values,
688 bootlist=None, calc_cov=calc_cov, cov_n=cov_n)
File ~\AppData\Roaming\Python\Python39\site-packages\pyomo\contrib\parmest\parmest.py:432, in Estimator._Q_opt(self, ThetaVals, solver, return_values, bootlist, calc_cov, cov_n)
426 ef = sputils.create_EF(scen_names,
427 _experiment_instance_creation_callback,
428 EF_name = "_Q_opt",
429 suppress_warnings=True,
430 scenario_creator_kwargs=scenario_creator_options)
431 else:
--> 432 ef = local_ef.create_EF(scen_names,
433 _experiment_instance_creation_callback,
434 EF_name = "_Q_opt",
435 suppress_warnings=True,
436 scenario_creator_kwargs=scenario_creator_options)
437 self.ef_instance = ef
439 # Solve the extensive form with ipopt
File ~\AppData\Roaming\Python\Python39\site-packages\pyomo\contrib\parmest\create_ef.py:88, in create_EF(scenario_names, scenario_creator, scenario_creator_kwargs, EF_name, suppress_warnings, nonant_for_fixed_vars)
86 if scenario_creator_kwargs is None:
87 scenario_creator_kwargs = dict()
---> 88 scen_dict = {
89 name: scenario_creator(name, **scenario_creator_kwargs)
90 for name in scenario_names
91 }
93 if (len(scen_dict) == 0):
94 raise RuntimeError("create_EF() received empty scenario list")
File ~\AppData\Roaming\Python\Python39\site-packages\pyomo\contrib\parmest\create_ef.py:89, in <dictcomp>(.0)
86 if scenario_creator_kwargs is None:
87 scenario_creator_kwargs = dict()
88 scen_dict = {
---> 89 name: scenario_creator(name, **scenario_creator_kwargs)
90 for name in scenario_names
91 }
93 if (len(scen_dict) == 0):
94 raise RuntimeError("create_EF() received empty scenario list")
File ~\AppData\Roaming\Python\Python39\site-packages\pyomo\contrib\parmest\parmest.py:145, in _experiment_instance_creation_callback(scenario_name, node_names, cb_data)
143 instance = callback(experiment_number = exp_num, cb_data = cb_data)
144 except TypeError:
--> 145 raise RuntimeError("Only one callback signature is supported: "
146 "callback(experiment_number, cb_data) ")
147 """
148 try:
149 instance = callback(scenario_tree_model, scen_name, node_names)
(...)
158 raise
159 """
160 if hasattr(instance, "_mpisppy_node_list"):
RuntimeError: Only one callback signature is supported: callback(experiment_number, cb_data)
The model runs OK in simulation, so I don't know where the problem is. I would be very grateful if someone could help me. Thank you!

Error when trying to access dask dataframe - ValueError: Length of passed values is 0, index implies

I'm getting the above error when trying to compute a dask dataframe. Here's what I'm doing (taking a pandas dataframe, converting the year to datetime, then merging it with another dataframe):
from dask import dataframe as dd
#setup variables
df1x = dd.from_pandas(df1, npartitions=4).reset_index() # cudf.DataFrame.from_pandas(FullMerge)
df2x = dd.from_pandas(df2, npartitions=4).reset_index() #cudf.DataFrame.from_pandas(emissions)
# add year
df1x['year'] = dd.to_datetime(df1x.date_x,unit='ns') #pd.to_datetime(df1['date_x'])
df2x['year'] = dd.to_datetime(df2x.year,unit='ns')
#we must rename emissions DF values to match fullMerge so data can merge correctly
df2x = df2x.rename(columns={'reference_name': 'Name'})
# map revenueOut to df1 #set it to value
df1x['value'] = df1x[['year', 'Name']].merge(df2x, how='left').revenueOutput
It seems to work (no errors), but when I want to view the results, I get the above error:
df1x.to_csv('myfiles.csv', single_file = True)
I get this stack trace (if it helps):
ValueError Traceback (most recent call last)
<ipython-input-10-78b6500075c4> in <module>
----> 1 df1x.to_csv('myfiles.csv', single_file = True)
2 # dd.compute(Full_df)
20 frames
/usr/local/lib/python3.7/dist-packages/dask/dataframe/core.py in to_csv(self, filename, **kwargs)
1344 from .io import to_csv
1345
-> 1346 return to_csv(self, filename, **kwargs)
1347
1348 def to_json(self, filename, *args, **kwargs):
/usr/local/lib/python3.7/dist-packages/dask/dataframe/io/csv.py in to_csv(df, filename, single_file, encoding, mode, name_function, compression, compute, scheduler, storage_options, header_first_partition_only, **kwargs)
787 )
788 if compute:
--> 789 delayed(values).compute(scheduler=scheduler)
790 return [f.path for f in files]
791 else:
/usr/local/lib/python3.7/dist-packages/dask/base.py in compute(self, **kwargs)
164 dask.base.compute
165 """
--> 166 (result,) = compute(self, traverse=False, **kwargs)
167 return result
168
/usr/local/lib/python3.7/dist-packages/dask/base.py in compute(*args, **kwargs)
435 keys = [x.__dask_keys__() for x in collections]
436 postcomputes = [x.__dask_postcompute__() for x in collections]
--> 437 results = schedule(dsk, keys, **kwargs)
438 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
439
/usr/local/lib/python3.7/dist-packages/dask/threaded.py in get(dsk, result, cache, num_workers, pool, **kwargs)
82 get_id=_thread_get_id,
83 pack_exception=pack_exception,
---> 84 **kwargs
85 )
86
/usr/local/lib/python3.7/dist-packages/dask/local.py in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)
484 _execute_task(task, data) # Re-execute locally
485 else:
--> 486 raise_exception(exc, tb)
487 res, worker_id = loads(res_info)
488 state["cache"][key] = res
/usr/local/lib/python3.7/dist-packages/dask/local.py in reraise(exc, tb)
314 if exc.__traceback__ is not tb:
315 raise exc.with_traceback(tb)
--> 316 raise exc
317
318
/usr/local/lib/python3.7/dist-packages/dask/local.py in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
220 try:
221 task, data = loads(task_info)
--> 222 result = _execute_task(task, data)
223 id = get_id()
224 result = dumps((result, id))
/usr/local/lib/python3.7/dist-packages/dask/core.py in _execute_task(arg, cache, dsk)
119 # temporaries by their reference count and can execute certain
120 # operations in-place.
--> 121 return func(*(_execute_task(a, cache) for a in args))
122 elif not ishashable(arg):
123 return arg
/usr/local/lib/python3.7/dist-packages/dask/optimization.py in __call__(self, *args)
980 if not len(args) == len(self.inkeys):
981 raise ValueError("Expected %d args, got %d" % (len(self.inkeys), len(args)))
--> 982 return core.get(self.dsk, self.outkey, dict(zip(self.inkeys, args)))
983
984 def __reduce__(self):
/usr/local/lib/python3.7/dist-packages/dask/core.py in get(dsk, out, cache)
149 for key in toposort(dsk):
150 task = dsk[key]
--> 151 result = _execute_task(task, cache)
152 cache[key] = result
153 result = _execute_task(out, cache)
/usr/local/lib/python3.7/dist-packages/dask/core.py in _execute_task(arg, cache, dsk)
119 # temporaries by their reference count and can execute certain
120 # operations in-place.
--> 121 return func(*(_execute_task(a, cache) for a in args))
122 elif not ishashable(arg):
123 return arg
/usr/local/lib/python3.7/dist-packages/dask/core.py in <genexpr>(.0)
119 # temporaries by their reference count and can execute certain
120 # operations in-place.
--> 121 return func(*(_execute_task(a, cache) for a in args))
122 elif not ishashable(arg):
123 return arg
/usr/local/lib/python3.7/dist-packages/dask/core.py in _execute_task(arg, cache, dsk)
119 # temporaries by their reference count and can execute certain
120 # operations in-place.
--> 121 return func(*(_execute_task(a, cache) for a in args))
122 elif not ishashable(arg):
123 return arg
/usr/local/lib/python3.7/dist-packages/dask/utils.py in apply(func, args, kwargs)
28 def apply(func, args, kwargs=None):
29 if kwargs:
---> 30 return func(*args, **kwargs)
31 else:
32 return func(*args)
/usr/local/lib/python3.7/dist-packages/dask/dataframe/core.py in apply_and_enforce(*args, **kwargs)
5072 func = kwargs.pop("_func")
5073 meta = kwargs.pop("_meta")
-> 5074 df = func(*args, **kwargs)
5075 if is_dataframe_like(df) or is_series_like(df) or is_index_like(df):
5076 if not len(df):
/usr/local/lib/python3.7/dist-packages/dask/dataframe/shuffle.py in partitioning_index(df, npartitions)
604 An array of int64 values mapping each record to a partition.
605 """
--> 606 return hash_object_dispatch(df, index=False) % int(npartitions)
607
608
/usr/local/lib/python3.7/dist-packages/dask/utils.py in __call__(self, arg, *args, **kwargs)
504 """
505 meth = self.dispatch(type(arg))
--> 506 return meth(arg, *args, **kwargs)
507
508 #property
/usr/local/lib/python3.7/dist-packages/dask/dataframe/utils.py in hash_object_pandas(obj, index, encoding, hash_key, categorize)
470 ):
471 return pd.util.hash_pandas_object(
--> 472 obj, index=index, encoding=encoding, hash_key=hash_key, categorize=categorize
473 )
474
/usr/local/lib/python3.7/dist-packages/pandas/core/util/hashing.py in hash_pandas_object(obj, index, encoding, hash_key, categorize)
134 h = _combine_hash_arrays(hashes, num_items)
135
--> 136 h = Series(h, index=obj.index, dtype="uint64", copy=False)
137 else:
138 raise TypeError(f"Unexpected type for hashing {type(obj)}")
/usr/local/lib/python3.7/dist-packages/pandas/core/series.py in __init__(self, data, index, dtype, name, copy, fastpath)
312 if len(index) != len(data):
313 raise ValueError(
--> 314 f"Length of passed values is {len(data)}, "
315 f"index implies {len(index)}."
316 )
ValueError: Length of passed values is 0, index implies 41478.
I'm not sure what to do as the pandas version is working.
IIUC, the following line is not something dask will handle well:
df1x['value'] = df1x[['year', 'Name']].merge(df2x, how='left').revenueOutput
The reason is that partitions must be aligned when assigning the variable (df1x['value'] = ...), while merge (in general) does not yield the same alignment (df1x[['year', 'Name']].merge(df2x, how='left')). This is not an issue when all data is in memory.
If df2y, defined below, fits into memory, then one possible option is to do it with .map_partitions:
# make sure this fits into memory
df2y = df2x[['year', 'Name', 'revenueOutput']].compute()
def add_value(df):
    df = df.merge(df2y, how='left')
    df['value'] = df['revenueOutput']
    return df
df1x = df1x.map_partitions(add_value)
If df2y does not fit into memory, then it might be possible to do an explicit dask merge and then use the merged dataframe for further analysis:
merged_df = dd.merge(df1x, df2x, on=['year', 'Name'], how='left')
merged_df['value'] = merged_df['revenueOutput']
# I assume that the line above is needed for some further
# transformation, but if that's not the case, then
# a simple column rename is more efficient
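For completeness, a sketch of the rename alternative mentioned in the comment above (assuming 'value' is simply another name for 'revenueOutput'):

merged_df = dd.merge(df1x, df2x, on=['year', 'Name'], how='left')

# Renaming avoids carrying a duplicate column through the graph.
merged_df = merged_df.rename(columns={'revenueOutput': 'value'})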

TypeError: 'DesignMatrix' object is not callable

I am trying to create B-splines with the patsy package in an ipynb notebook in JupyterLab:
from patsy import dmatrix

_, axes = plt.subplots(2, figsize=(16, 16))
bs = dmatrix("bs(x, df=50, degree=1) - 1", {"x": x})
axes[0].plot(x, bs)
axes[0].set_title("Basis functions")
plt.show()
This works fine the first time I run it, but when I rerun the cell it fails with the following error:
-----------------------------------------------------
TypeError Traceback (most recent call last)
/opt/conda/lib/python3.8/site-packages/patsy/compat.py in call_and_wrap_exc(msg, origin, f, *args, **kwargs)
35 try:
---> 36 return f(*args, **kwargs)
37 except Exception as e:
/opt/conda/lib/python3.8/site-packages/patsy/eval.py in eval(self, expr, source_name, inner_namespace)
164 code = compile(expr, source_name, "eval", self.flags, False)
--> 165 return eval(code, {}, VarLookupDict([inner_namespace]
166 + self._namespaces))
<string> in <module>
TypeError: 'DesignMatrix' object is not callable
The above exception was the direct cause of the following exception:
PatsyError Traceback (most recent call last)
<ipython-input-6-6ed4ba95a384> in <module>
2
3 _, axes = plt.subplots(2, figsize=(16, 16))
----> 4 bs = dmatrix("bs(x, df=50, degree=1) - 1", {"x": x})
5 axes[0].plot(x, bs)
6 axes[0].set_title("Basis functions")
/opt/conda/lib/python3.8/site-packages/patsy/highlevel.py in dmatrix(formula_like, data, eval_env, NA_action, return_type)
288 """
289 eval_env = EvalEnvironment.capture(eval_env, reference=1)
--> 290 (lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env,
291 NA_action, return_type)
292 if lhs.shape[1] != 0:
/opt/conda/lib/python3.8/site-packages/patsy/highlevel.py in _do_highlevel_design(formula_like, data, eval_env, NA_action, return_type)
162 def data_iter_maker():
163 return iter([data])
--> 164 design_infos = _try_incr_builders(formula_like, data_iter_maker, eval_env,
165 NA_action)
166 if design_infos is not None:
/opt/conda/lib/python3.8/site-packages/patsy/highlevel.py in _try_incr_builders(formula_like, data_iter_maker, eval_env, NA_action)
64 if isinstance(formula_like, ModelDesc):
65 assert isinstance(eval_env, EvalEnvironment)
---> 66 return design_matrix_builders([formula_like.lhs_termlist,
67 formula_like.rhs_termlist],
68 data_iter_maker,
/opt/conda/lib/python3.8/site-packages/patsy/build.py in design_matrix_builders(termlists, data_iter_maker, eval_env, NA_action)
691 # on some data to find out what type of data they return.
692 (num_column_counts,
--> 693 cat_levels_contrasts) = _examine_factor_types(all_factors,
694 factor_states,
695 data_iter_maker,
/opt/conda/lib/python3.8/site-packages/patsy/build.py in _examine_factor_types(factors, factor_states, data_iter_maker, NA_action)
441 for data in data_iter_maker():
442 for factor in list(examine_needed):
--> 443 value = factor.eval(factor_states[factor], data)
444 if factor in cat_sniffers or guess_categorical(value):
445 if factor not in cat_sniffers:
/opt/conda/lib/python3.8/site-packages/patsy/eval.py in eval(self, memorize_state, data)
562
563 def eval(self, memorize_state, data):
--> 564 return self._eval(memorize_state["eval_code"],
565 memorize_state,
566 data)
/opt/conda/lib/python3.8/site-packages/patsy/eval.py in _eval(self, code, memorize_state, data)
545 def _eval(self, code, memorize_state, data):
546 inner_namespace = VarLookupDict([data, memorize_state["transforms"]])
--> 547 return call_and_wrap_exc("Error evaluating factor",
548 self,
549 memorize_state["eval_env"].eval,
/opt/conda/lib/python3.8/site-packages/patsy/compat.py in call_and_wrap_exc(msg, origin, f, *args, **kwargs)
41 origin)
42 # Use 'exec' to hide this syntax from the Python 2 parser:
---> 43 exec("raise new_exc from e")
44 else:
45 # In python 2, we just let the original exception escape -- better
/opt/conda/lib/python3.8/site-packages/patsy/compat.py in <module>
PatsyError: Error evaluating factor: TypeError: 'DesignMatrix' object is not callable
bs(x, df=50, degree=1) - 1
^^^^^^^^^^^^^^^^^^^^^^
It turns out it was because I was overwriting the variable bs, which shadowed the bs() function used inside the patsy formula string.
This is why eval is an antipattern, as usual...
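A minimal sketch of the fix, assuming the only problem is the name collision: give the result a name other than bs so patsy can still resolve its bs() spline transform (x, axes and plt come from the surrounding cell):

from patsy import dmatrix

# Use a name that does not shadow patsy's bs() spline transform.
spline_basis = dmatrix("bs(x, df=50, degree=1) - 1", {"x": x})

axes[0].plot(x, spline_basis)
axes[0].set_title("Basis functions")
plt.show()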

ValueError when using .diff() with dask dataframe

I have a large time series data set which I want to process with Dask.
Apart from a few other columns, there is a column 'id' that identifies individuals, a column 'transc_date' with the date, and a column 'transc_time' with the time when an individual made a transaction.
The data is sorted using:
df = df.map_partitions(lambda x: x.sort_values(['id', 'transc_date', 'transc_time'], ascending=[True, True, True]))
transc_time is of type int and transc_date is of type datetime64.
I want to create a new column which gives me for each individual the number of days since the last transaction. For this I created the following function:
def get_diff_since_last_trans(df, plot=True):
    df['diff_last'] = df.map_overlap(lambda x: x.groupby('id')['transc_date'].diff(), before=10, after=10)
    diffs = df[['id', 'diff_last']].groupby(['id']).agg('max')['diff_last'].dt.days.compute()
    if plot:
        sns.distplot(diffs.values, kde=False, rug=False)
    return diffs
When I try this function on a small subset of the data (200k rows) it works as intended, but when I use it on the full data set I get the ValueError below.
I dropped all ids with fewer than 10 occurrences first. transc_date does not contain NaNs; it only contains datetime64 entries.
Any idea what's going wrong?
ValueError Traceback (most recent call last)
<ipython-input-12-551d7256f328> in <module>()
1 a = get_diff_first_last_trans(df, plot=False)
----> 2 b = get_diff_since_last_trans(df, plot=False)
3 plot_trans_diff(a,b)
<ipython-input-10-8f83d4571659> in get_diff_since_last_trans(df, plot)
12 def get_diff_since_last_trans(df, plot=True):
13 df['diff_last'] = df.map_overlap(lambda x: x.groupby('id')['transc_date'].diff(), before=10, after=10)
---> 14 diffs = df[['id', 'diff_last']].groupby(['id']).agg('max')['diff_last'].dt.days.compute()
15 if plot:
16 sns.distplot(diffs.values, kde = False, rug = False)
~/venv/lib/python3.6/site-packages/dask/base.py in compute(self, **kwargs)
133 dask.base.compute
134 """
--> 135 (result,) = compute(self, traverse=False, **kwargs)
136 return result
137
~/venv/lib/python3.6/site-packages/dask/base.py in compute(*args, **kwargs)
331 postcomputes = [a.__dask_postcompute__() if is_dask_collection(a)
332 else (None, a) for a in args]
--> 333 results = get(dsk, keys, **kwargs)
334 results_iter = iter(results)
335 return tuple(a if f is None else f(next(results_iter), *a)
~/venv/lib/python3.6/site-packages/distributed/client.py in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, **kwargs)
1997 secede()
1998 try:
-> 1999 results = self.gather(packed, asynchronous=asynchronous)
2000 finally:
2001 for f in futures.values():
~/venv/lib/python3.6/site-packages/distributed/client.py in gather(self, futures, errors, maxsize, direct, asynchronous)
1435 return self.sync(self._gather, futures, errors=errors,
1436 direct=direct, local_worker=local_worker,
-> 1437 asynchronous=asynchronous)
1438
1439 #gen.coroutine
~/venv/lib/python3.6/site-packages/distributed/client.py in sync(self, func, *args, **kwargs)
590 return future
591 else:
--> 592 return sync(self.loop, func, *args, **kwargs)
593
594 def __repr__(self):
~/venv/lib/python3.6/site-packages/distributed/utils.py in sync(loop, func, *args, **kwargs)
252 e.wait(1000000)
253 if error[0]:
--> 254 six.reraise(*error[0])
255 else:
256 return result[0]
~/venv/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
~/venv/lib/python3.6/site-packages/distributed/utils.py in f()
236 yield gen.moment
237 thread_state.asynchronous = True
--> 238 result[0] = yield make_coro()
239 except Exception as exc:
240 logger.exception(exc)
~/venv/lib/python3.6/site-packages/tornado/gen.py in run(self)
1053
1054 try:
-> 1055 value = future.result()
1056 except Exception:
1057 self.had_exception = True
~/venv/lib/python3.6/site-packages/tornado/concurrent.py in result(self, timeout)
236 if self._exc_info is not None:
237 try:
--> 238 raise_exc_info(self._exc_info)
239 finally:
240 self = None
~/venv/lib/python3.6/site-packages/tornado/util.py in raise_exc_info(exc_info)
~/venv/lib/python3.6/site-packages/tornado/gen.py in run(self)
1061 if exc_info is not None:
1062 try:
-> 1063 yielded = self.gen.throw(*exc_info)
1064 finally:
1065 # Break up a reference to itself
~/venv/lib/python3.6/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
1313 six.reraise(type(exception),
1314 exception,
-> 1315 traceback)
1316 if errors == 'skip':
1317 bad_keys.add(key)
~/venv/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
690 value = tp()
691 if value.__traceback__ is not tb:
--> 692 raise value.with_traceback(tb)
693 raise value
694 finally:
~/venv/lib/python3.6/site-packages/dask/dataframe/rolling.py in overlap_chunk()
30 parts = [p for p in (prev_part, current_part, next_part) if p is not None]
31 combined = pd.concat(parts)
---> 32 out = func(combined, *args, **kwargs)
33 if prev_part is None:
34 before = None
<ipython-input-10-8f83d4571659> in <lambda>()
11
12 def get_diff_since_last_trans(df, plot=True):
---> 13 df['diff_last'] = df.map_overlap(lambda x: x.groupby('id')['transc_date'].diff(), before=10, after=10)
14 diffs = df[['id', 'diff_last']].groupby(['id']).agg('max')['diff_last'].dt.days.compute()
15 if plot:
~/venv/lib/python3.6/site-packages/pandas/core/groupby.py in wrapper()
737 *args, **kwargs)
738 except (AttributeError):
--> 739 raise ValueError
740
741 return wrapper
ValueError:

PatsyError, name error, name is not defined when using smf.ols

I am trying to use multiple linear regression to analyze some time series data and their lags. Basically, the variables are currency rates and their lag-1 and lag-2 values. The code is below.
I tried to check each variable and there is nothing abnormal.
rate = pd.read_csv('P2training.csv', header=0)
#change date format in csv
rate['Date'] = pd.to_datetime(rate['Date'], format='%Y-%m-%d')
rate.set_index('Date', inplace=True, drop=True)
lags = [1,2]
lagdata = rate
for i in lags:
    tmp = rate.shift(i).copy()
    lagdata = lagdata.join(tmp, rsuffix='_lag{}'.format(i))
# fit the linear regression models
collist = list(lagdata.columns);
collist.remove('AUD/USD')
collist.remove('GBP/USD')
collist.remove('CAD/USD')
collist.remove('NLG/USD')
collist.remove('FRF/USD')
collist.remove('DEM/USD')
collist.remove('JPY/USD')
collist.remove('CHF/USD')
form = 'JPY/USD' + '~' + '+'.join(collist);
lagdata.dropna(inplace=True)
model = smf.ols(formula=form, data = lagdata).fit()
The error occurs in the last step when calling smf.ols. A NameError says some variables are not defined.
NameError Traceback (most recent call last)
C:\Users\yaojia\AppData\Local\Continuum\Anaconda3\lib\site-packages\patsy\compat.py in call_and_wrap_exc(msg, origin, f, *args, **kwargs)
116 try:
--> 117 return f(*args, **kwargs)
118 except Exception as e:
C:\Users\yaojia\AppData\Local\Continuum\Anaconda3\lib\site-packages\patsy\eval.py in eval(self, expr, source_name, inner_namespace)
165 return eval(code, {}, VarLookupDict([inner_namespace]
--> 166 + self._namespaces))
167
<string> in <module>()
NameError: name 'USD_lag2' is not defined
The above exception was the direct cause of the following exception:
PatsyError Traceback (most recent call last)
<ipython-input-26-1985b8d39238> in <module>()
51 #print(collist)
52 #print(lagdata)
---> 53 model = smf.ols(formula=form, data = lagdata).fit()
54
55 #print(model.summary())
C:\Users\yaojia\AppData\Local\Continuum\Anaconda3\lib\site-packages\statsmodels\base\model.py in from_formula(cls, formula, data, subset, drop_cols, *args, **kwargs)
153
154 tmp = handle_formula_data(data, None, formula, depth=eval_env,
--> 155 missing=missing)
156 ((endog, exog), missing_idx, design_info) = tmp
157
C:\Users\yaojia\AppData\Local\Continuum\Anaconda3\lib\site-packages\statsmodels\formula\formulatools.py in handle_formula_data(Y, X, formula, depth, missing)
63 if data_util._is_using_pandas(Y, None):
64 result = dmatrices(formula, Y, depth, return_type='dataframe',
---> 65 NA_action=na_action)
66 else:
67 result = dmatrices(formula, Y, depth, return_type='dataframe',
C:\Users\yaojia\AppData\Local\Continuum\Anaconda3\lib\site-packages\patsy\highlevel.py in dmatrices(formula_like, data, eval_env, NA_action, return_type)
308 eval_env = EvalEnvironment.capture(eval_env, reference=1)
309 (lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env,
--> 310 NA_action, return_type)
311 if lhs.shape[1] == 0:
312 raise PatsyError("model is missing required outcome variables")
C:\Users\yaojia\AppData\Local\Continuum\Anaconda3\lib\site-packages\patsy\highlevel.py in _do_highlevel_design(formula_like, data, eval_env, NA_action, return_type)
163 return iter([data])
164 design_infos = _try_incr_builders(formula_like, data_iter_maker, eval_env,
--> 165 NA_action)
166 if design_infos is not None:
167 return build_design_matrices(design_infos, data,
C:\Users\yaojia\AppData\Local\Continuum\Anaconda3\lib\site-packages\patsy\highlevel.py in _try_incr_builders(formula_like, data_iter_maker, eval_env, NA_action)
68 data_iter_maker,
69 eval_env,
---> 70 NA_action)
71 else:
72 return None
C:\Users\yaojia\AppData\Local\Continuum\Anaconda3\lib\site-packages\patsy\build.py in design_matrix_builders(termlists, data_iter_maker, eval_env, NA_action)
694 factor_states,
695 data_iter_maker,
--> 696 NA_action)
697 # Now we need the factor infos, which encapsulate the knowledge of
698 # how to turn any given factor into a chunk of data:
C:\Users\yaojia\AppData\Local\Continuum\Anaconda3\lib\site-packages\patsy\build.py in _examine_factor_types(factors, factor_states, data_iter_maker, NA_action)
441 for data in data_iter_maker():
442 for factor in list(examine_needed):
--> 443 value = factor.eval(factor_states[factor], data)
444 if factor in cat_sniffers or guess_categorical(value):
445 if factor not in cat_sniffers:
C:\Users\yaojia\AppData\Local\Continuum\Anaconda3\lib\site-packages\patsy\eval.py in eval(self, memorize_state, data)
564 return self._eval(memorize_state["eval_code"],
565 memorize_state,
--> 566 data)
567
568 __getstate__ = no_pickling
C:\Users\yaojia\AppData\Local\Continuum\Anaconda3\lib\site-packages\patsy\eval.py in _eval(self, code, memorize_state, data)
549 memorize_state["eval_env"].eval,
550 code,
--> 551 inner_namespace=inner_namespace)
552
553 def memorize_chunk(self, state, which_pass, data):
C:\Users\yaojia\AppData\Local\Continuum\Anaconda3\lib\site-packages\patsy\compat.py in call_and_wrap_exc(msg, origin, f, *args, **kwargs)
122 origin)
123 # Use 'exec' to hide this syntax from the Python 2 parser:
--> 124 exec("raise new_exc from e")
125 else:
126 # In python 2, we just let the original exception escape -- better
C:\Users\yaojia\AppData\Local\Continuum\Anaconda3\lib\site-packages\patsy\compat.py in <module>()
PatsyError: Error evaluating factor: NameError: name 'USD_lag2' is not defined
JPY/USD~AUD/USD_lag1+GBP/USD_lag1+CAD/USD_lag1+NLG/USD_lag1+FRF/USD_lag1+DEM/USD_lag1+JPY/USD_lag1+CHF/USD_lag1+AUD/USD_lag2+GBP/USD_lag2+CAD/USD_lag2+NLG/USD_lag2+FRF/USD_lag2+DEM/USD_lag2+JPY/USD_lag2+CHF/USD_lag2
